In [2]:
__imp

Libraries have been loaded


## Load Additional Libraries

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss

basepath = os.path.expanduser('~/Desktop/src/ml/talking_data/')

## Load Data

In [4]:
%%time

gender_age_train         = pd.read_feather(os.path.join(basepath, 'data/processed/gender_age_train.feather'))
gender_age_test          = pd.read_feather(os.path.join(basepath, 'data/processed/gender_age_test.feather'))
label_categories         = pd.read_feather(os.path.join(basepath, 'data/processed/label_categories.feather'))
app_labels               = pd.read_feather(os.path.join(basepath, 'data/processed/app_labels.feather'))
phone_brand_device_model = pd.read_feather(os.path.join(basepath, 'data/processed/phone_brand_device_model.feather')) 
app_events               = pd.read_feather(os.path.join(basepath, 'data/processed/app_events.feather'))
events                   = pd.read_feather(os.path.join(basepath, 'data/processed/events.feather'))

CPU times: user 208 ms, sys: 166 ms, total: 374 ms
Wall time: 373 ms


## Sample

In [5]:
def get_train_test(gender_age_train):
    dtr, dte, _, _ = train_test_split(gender_age_train, 
                                           gender_age_train.group, 
                                           stratify=gender_age_train.group,
                                           test_size=.2
                                          )
    
    dtr, dval, _, _ = train_test_split(dte, dte.group,
                                       stratify=dte.group,
                                       test_size=.2
                                      )
    
    del dte
    gc.collect()
    
    return dtr, dval

In [6]:
dtr, dval = get_train_test(gender_age_train)

In [7]:
dval.group.value_counts(normalize=True)

M23-26    0.128600
M32-38    0.126926
M39+      0.114869
M22-      0.100469
M29-31    0.097790
F33-42    0.074347
M27-28    0.073007
F23-      0.067649
F29-32    0.061956
F43+      0.056263
F24-26    0.056263
F27-28    0.041862
Name: group, dtype: float64

In [8]:
dtr.group.value_counts(normalize=True)

M23-26    0.128695
M32-38    0.126936
M39+      0.114963
M22-      0.100310
M29-31    0.097965
F33-42    0.074521
M27-28    0.072930
F23-      0.067655
F29-32    0.061961
F43+      0.056184
F24-26    0.056100
F27-28    0.041782
Name: group, dtype: float64

In [9]:
# check number of device with no events in training and test set
_ = dtr.merge(events, on='device_id', how='left')
print('Train: {}'.format(_[_.event_id.isnull()].shape[0] / len(_)))

_ = dval.merge(events, on='device_id', how='left')
print('Validation: {}'.format(_[_.event_id.isnull()].shape[0] / len(_)))

Train: 0.03748520139863991
Validation: 0.03720964234150264


In [10]:
# dictionary for brand names
chinese_to_eng_brands = {
'三星'   : 'samsung',
'天语'   : 'Ktouch',
'海信'   : 'hisense',
'联想'   : 'lenovo',
'欧比'   : 'obi',
'爱派尔' : 'ipair',
'努比亚' : 'nubia',
'优米'   : 'youmi',
'朵唯'   : 'dowe',
'黑米'   : 'heymi',
'锤子'   : 'hammer',
'酷比魔方': 'koobee',
'美图'   : 'meitu',
'尼比鲁' : 'nibilu',
'一加'   : 'oneplus',
'优购': 'yougo',
'诺基亚': 'nokia',
'糖葫芦': 'candy',
'中国移动':'ccmc',
'语信': 'yuxin',
'基伍': 'kiwu',
'青橙': 'greeno',
'华硕': 'asus',
'夏新': 'panosonic',
'维图': 'weitu',
'艾优尼': 'aiyouni',
'摩托罗拉': 'moto',
'乡米': 'xiangmi',
'米奇': 'micky',
'大可乐': 'bigcola',
'沃普丰': 'wpf',
'神舟': 'hasse',
'摩乐': 'mole',
'飞秒': 'fs',
'米歌': 'mige',
'富可视': 'fks',
'德赛': 'desci',
'梦米': 'mengmi',
'乐视': 'lshi',
'小杨树':'smallt',
'纽曼': 'newman',
'邦华' : 'banghua',
'E派' : 'epai',
'易派': 'epai',
'普耐尔': 'pner',
'欧新': 'ouxin',
'西米': 'ximi',
'海尔': 'haier',
'波导': 'bodao',
'糯米': 'nuomi',
'唯米': 'weimi',
'酷珀': 'kupo',
'谷歌': 'google',
'昂达': 'ada',
'聆韵': 'lingyun',
'华为': 'Huawei',
'小米': 'millet',
'魅族': 'Meizu',
'金立': 'Gionee'
}

### Traintest

In [11]:
%%time

# merge with other data frames
tr =  dtr\
        .merge(phone_brand_device_model, on='device_id', how='left')
        
tr.loc[:, 'phone_brand'] = tr.phone_brand.map(lambda x: chinese_to_eng_brands[x] if x in chinese_to_eng_brands else x)

te = dval\
        .merge(phone_brand_device_model, on='device_id', how='left')
        
te.loc[:, 'phone_brand'] = te.phone_brand.map(lambda x: chinese_to_eng_brands[x] if x in chinese_to_eng_brands else x)

traintest = pd.concat((tr, te))
ntrain    = len(tr)

del tr, te
gc.collect();

CPU times: user 144 ms, sys: 10.6 ms, total: 154 ms
Wall time: 153 ms


### Feature Engineering

In [12]:
%%time

traintest.loc[:, 'phone_brand']  = pd.factorize(traintest.phone_brand)[0]
traintest.loc[:, 'device_model'] = pd.factorize(traintest.device_model)[0]

# target encoding
traintest.loc[:, 'group']        = pd.factorize(traintest.group)[0]

CPU times: user 13.3 ms, sys: 3.34 ms, total: 16.7 ms
Wall time: 15.6 ms


In [13]:
%%time

# phone brand (freq)
traintest.loc[:, 'phone_brand_freq']  = traintest.groupby('phone_brand')['phone_brand'].transform(lambda x: len(x))

# device model (freq)
traintest.loc[:, 'device_model_freq'] = traintest.groupby('device_model')['device_model'].transform(lambda x: len(x))

# number of different device models for a particular brand
num_diff_models = traintest.groupby('phone_brand').apply(lambda x: x['device_model'].nunique())
traintest.loc[:, 'num_diff_models'] = traintest.phone_brand.map(num_diff_models)

# most_generated_event
most_generated_event = events.groupby('device_id').apply(lambda x: x['event_id'].value_counts().index.values[0])
traintest.loc[:, 'most_generated_event'] = traintest.device_id.map(most_generated_event).fillna(-1)

# hour with most number of events by device
hour_with_most_events = events.groupby('device_id')\
                              .apply(lambda x: x['timestamp'].dt.hour.value_counts().index.values[0])


traintest.loc[:, 'hour_with_most_events'] = traintest.device_id.map(hour_with_most_events).fillna(0)

# number of different hours at which events were generated
num_diff_hours = events.groupby('device_id').apply(lambda x: x['timestamp'].dt.hour.nunique())
traintest.loc[:, 'num_diff_hours'] = traintest.device_id.map(num_diff_hours).fillna(-1)

# number of events generated by a device
num_events = events.device_id.value_counts()
traintest.loc[:, 'num_events'] = traintest.device_id.map(num_events).fillna(0)

# number of different locations from where events were generated by device.
num_diff_locations = events.groupby('device_id').apply(lambda x: x.loc[:, ['longitude', 'latitude']].drop_duplicates().shape[0])
traintest.loc[:, 'num_diff_locations'] = traintest.device_id.map(num_diff_locations).fillna(-1)

del num_diff_models, most_generated_event, hour_with_most_events, num_events
del num_diff_hours, mean_time_succ_events, num_diff_locations
gc.collect();

CPU times: user 5min 42s, sys: 560 ms, total: 5min 42s
Wall time: 5min 42s


In [19]:
FEATURES = ['phone_brand', 
            'device_model', 
            'phone_brand_freq', 
            'device_model_freq',
            'num_diff_models',
            'most_generated_event',
            'hour_with_most_events',
            'num_events',
            'num_diff_hours',
            'num_diff_locations'
           ]

X_tr = traintest.iloc[:ntrain].loc[:, FEATURES]
y_tr = traintest.iloc[:ntrain].loc[:, 'group']

X_val = traintest.iloc[ntrain:].loc[:, FEATURES]
y_val = traintest.iloc[ntrain:].loc[:, 'group']

In [20]:
print('Shape of feature set (train) : {}'.format(X_tr.shape))
print('Shape of feature set (test) : {}'.format(X_val.shape))

Shape of feature set (train) : (11980, 10)
Shape of feature set (test) : (2996, 10)


In [21]:
m = RandomForestClassifier(n_estimators=125, max_depth=6, min_samples_leaf=1, n_jobs=-1, random_state=SEED)
m.fit(X_tr, y_tr)

val_preds = m.predict_proba(X_val)
print('Log Loss: {}'.format(log_loss(y_val, val_preds)))

Log Loss: 2.3977648699273484


In [22]:
m.feature_importances_

array([0.07535217, 0.11118064, 0.08139042, 0.11180738, 0.14690088,
       0.11487609, 0.07286265, 0.11901604, 0.0888724 , 0.07774132])

### Separate models for devices with no events versus devices with events.

In [47]:
dtr_events   = dtr.merge(events, on='device_id', how='left')\
                  .merge(phone_brand_device_model, on='device_id', how='left')
    
dval_events  = dval.merge(events, on='device_id', how='left')\
                   .merge(phone_brand_device_model, on='device_id', how='left')

tr_no_events = dtr_events.loc[dtr_events.event_id.isnull(), :]
tr_events    = dtr_events.loc[dtr_events.event_id.notnull(), :]

tr_no_events.loc[:, 'phone_brand'] = tr_no_events.phone_brand.map(lambda x: chinese_to_eng_brands[x] if x in chinese_to_eng_brands else x)
tr_events.loc[:, 'phone_brand']    = tr_events.phone_brand.map(lambda x: chinese_to_eng_brands[x] if x in chinese_to_eng_brands else x)

te_no_events = dval_events.loc[dval_events.event_id.isnull(), :]
te_events    = dval_events.loc[dval_events.event_id.notnull(), :]

te_no_events.loc[:, 'phone_brand'] = te_no_events.phone_brand.map(lambda x: chinese_to_eng_brands[x] if x in chinese_to_eng_brands else x)
te_events.loc[:, 'phone_brand']    = te_events.phone_brand.map(lambda x: chinese_to_eng_brands[x] if x in chinese_to_eng_brands else x)

In [48]:
%%time

# no events

traintest = pd.concat((tr_no_events, te_no_events))
ntrain    = len(tr_no_events)

del tr_no_events, te_no_events
gc.collect();

traintest.loc[:, 'phone_brand']  = pd.factorize(traintest.phone_brand)[0]
traintest.loc[:, 'device_model'] = pd.factorize(traintest.device_model)[0]

# target encoding
traintest.loc[:, 'group']        = pd.factorize(traintest.group)[0]

CPU times: user 67.6 ms, sys: 12 ms, total: 79.6 ms
Wall time: 79 ms


In [49]:
%%time

# phone brand (freq)
traintest.loc[:, 'phone_brand_freq']  = traintest.groupby('phone_brand')['phone_brand'].transform(lambda x: len(x))

# device model (freq)
traintest.loc[:, 'device_model_freq'] = traintest.groupby('device_model')['device_model'].transform(lambda x: len(x))

# number of different device models for a particular brand
num_diff_models = traintest.groupby('phone_brand').apply(lambda x: x['device_model'].nunique())
traintest.loc[:, 'num_diff_models'] = traintest.phone_brand.map(num_diff_models)

FEATURES = ['phone_brand', 
            'device_model', 
            'phone_brand_freq', 
            'device_model_freq',
            'num_diff_models'
           ]

X_tr     = traintest.iloc[:ntrain].loc[:, FEATURES]
y_tr_ne  = traintest.iloc[:ntrain].loc[:, 'group']

X_val    = traintest.iloc[ntrain:].loc[:, FEATURES]
y_val_ne = traintest.iloc[ntrain:].loc[:, 'group']

CPU times: user 230 ms, sys: 3 µs, total: 230 ms
Wall time: 229 ms


In [50]:
print('Shape of feature set (train) : {}'.format(X_tr.shape))
print('Shape of feature set (test) : {}'.format(X_val.shape))

Shape of feature set (train) : (8190, 5)
Shape of feature set (test) : (2038, 5)


In [51]:
m = RandomForestClassifier(n_estimators=125, max_depth=3, min_samples_leaf=1, n_jobs=-1, random_state=SEED)
m.fit(X_tr, y_tr_ne)

val_preds_ne = m.predict_proba(X_val)
print('Log Loss without events: {}'.format(log_loss(y_val_ne, val_preds_ne)))

Log Loss without events: 2.4130237681482973


####

In [52]:
%%time

# no events

traintest = pd.concat((tr_events, te_events))
ntrain    = len(tr_events)

del tr_events, te_events
gc.collect();

traintest.loc[:, 'phone_brand']  = pd.factorize(traintest.phone_brand)[0]
traintest.loc[:, 'device_model'] = pd.factorize(traintest.device_model)[0]

# target encoding
traintest.loc[:, 'group']        = pd.factorize(traintest.group)[0]

CPU times: user 145 ms, sys: 3.99 ms, total: 149 ms
Wall time: 147 ms


In [53]:
%%time

# phone brand (freq)
traintest.loc[:, 'phone_brand_freq']  = traintest.groupby('phone_brand')['phone_brand'].transform(lambda x: len(x))

# device model (freq)
traintest.loc[:, 'device_model_freq'] = traintest.groupby('device_model')['device_model'].transform(lambda x: len(x))

# number of different device models for a particular brand
num_diff_models = traintest.groupby('phone_brand').apply(lambda x: x['device_model'].nunique())
traintest.loc[:, 'num_diff_models'] = traintest.phone_brand.map(num_diff_models)

# most_generated_event
most_generated_event = events.groupby('device_id').apply(lambda x: x['event_id'].value_counts().index.values[0])
traintest.loc[:, 'most_generated_event'] = traintest.device_id.map(most_generated_event).fillna(-1)

# hour with most number of events by device
hour_with_most_events = events.groupby('device_id')\
                              .apply(lambda x: x['timestamp'].dt.hour.value_counts().index.values[0])


traintest.loc[:, 'hour_with_most_events'] = traintest.device_id.map(hour_with_most_events).fillna(0)

# number of different hours at which events were generated
num_diff_hours = events.groupby('device_id').apply(lambda x: x['timestamp'].dt.hour.nunique())
traintest.loc[:, 'num_diff_hours'] = traintest.device_id.map(num_diff_hours).fillna(-1)

# number of events generated by a device
num_events = events.device_id.value_counts()
traintest.loc[:, 'num_events'] = traintest.device_id.map(num_events).fillna(0)

# number of different locations from where events were generated by device.
num_diff_locations = events.groupby('device_id').apply(lambda x: x.loc[:, ['longitude', 'latitude']].drop_duplicates().shape[0])
traintest.loc[:, 'num_diff_locations'] = traintest.device_id.map(num_diff_locations).fillna(-1)

del num_diff_models, most_generated_event, hour_with_most_events, num_events
del num_diff_hours, num_diff_locations
gc.collect();

FEATURES = ['phone_brand', 
            'device_model', 
            'phone_brand_freq', 
            'device_model_freq',
            'num_diff_models',
            'most_generated_event',
            'hour_with_most_events',
            'num_events',
            'num_diff_hours',
            'num_diff_locations'
           ]

X_tr   = traintest.iloc[:ntrain].loc[:, FEATURES]
y_tr_e = traintest.iloc[:ntrain].loc[:, 'group']

X_val   = traintest.iloc[ntrain:].loc[:, FEATURES]
y_val_e = traintest.iloc[ntrain:].loc[:, 'group']

CPU times: user 3min 4s, sys: 312 ms, total: 3min 5s
Wall time: 3min 5s


In [56]:
print('Shape of feature set (train) : {}'.format(X_tr.shape))
print('Shape of feature set (test) : {}'.format(X_val.shape))

Shape of feature set (train) : (211737, 10)
Shape of feature set (test) : (53079, 10)


In [55]:
m = RandomForestClassifier(n_estimators=125, max_depth=3, min_samples_leaf=1, n_jobs=-1, random_state=SEED)
m.fit(X_tr, y_tr_e)

val_preds_e = m.predict_proba(X_val)
print('Log Loss without events: {}'.format(log_loss(y_val_e, val_preds_e)))

Log Loss without events: 2.3524177642694517


In [73]:
log_loss(np.hstack((y_val_e.values, y_val_ne.values)), np.vstack((val_preds_e, val_preds_ne)))

2.3546587250602258

** Creating separate models for devices that have generated events vs devices that have generated event leads to lower log loss. **