```
Bag of words features for app ids, phone brand and device models.
```

In [1]:
__imp

Libraries have been loaded


## Load Additional Libraries

In [2]:
import scipy as sp

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import scale

basepath = os.path.expanduser('~/Desktop/src/ml/talking_data/')

## Load Data

In [3]:
%%time

gender_age_train         = pd.read_feather(os.path.join(basepath, 'data/processed/gender_age_train.feather'))
gender_age_test          = pd.read_feather(os.path.join(basepath, 'data/processed/gender_age_test.feather'))
phone_brand_device_model = pd.read_feather(os.path.join(basepath, 'data/processed/phone_brand_device_model.feather')) 
app_events               = pd.read_feather(os.path.join(basepath, 'data/processed/app_events.feather'))
events                   = pd.read_feather(os.path.join(basepath, 'data/processed/events.feather'))

phone_brand_device_model = phone_brand_device_model.drop_duplicates()

CPU times: user 238 ms, sys: 200 ms, total: 438 ms
Wall time: 437 ms


## Sample

In [4]:
def get_train_test(train):
    dtr, dte, _, _ = train_test_split(train, 
                                      train.group, 
                                      stratify=train.group,
                                      test_size=.2
                                     )
    
    dtr, dval, _, _ = train_test_split(dte, dte.group,
                                       stratify=dte.group,
                                       test_size=.2
                                      )
    
    del dte
    gc.collect()
    
    return dtr, dval

In [5]:
train     = gender_age_train.merge(phone_brand_device_model, on='device_id', how='left')
dtr, dval = get_train_test(train)

In [6]:
dval.group.value_counts(normalize=True)

M23-26    0.128600
M32-38    0.126926
M39+      0.114869
M22-      0.100469
M29-31    0.097790
F33-42    0.074347
M27-28    0.073007
F23-      0.067649
F29-32    0.061956
F43+      0.056263
F24-26    0.056263
F27-28    0.041862
Name: group, dtype: float64

In [7]:
dtr.group.value_counts(normalize=True)

M23-26    0.128684
M32-38    0.126926
M39+      0.114953
M22-      0.100301
M29-31    0.097957
F33-42    0.074514
M27-28    0.072924
F23-      0.067649
F29-32    0.062040
F43+      0.056179
F24-26    0.056095
F27-28    0.041778
Name: group, dtype: float64

In [8]:
# check number of device with no events in training and test set
_ = dtr.merge(events, on='device_id', how='left')
print('Train: {}'.format(_[_.event_id.isnull()].shape[0] / len(_)))

_ = dval.merge(events, on='device_id', how='left')
print('Validation: {}'.format(_[_.event_id.isnull()].shape[0] / len(_)))

Train: 0.03775805196249212
Validation: 0.03938986953184958


In [6]:
# dictionary for brand names
chinese_to_eng_brands = {
'三星'   : 'samsung',
'天语'   : 'Ktouch',
'海信'   : 'hisense',
'联想'   : 'lenovo',
'欧比'   : 'obi',
'爱派尔' : 'ipair',
'努比亚' : 'nubia',
'优米'   : 'youmi',
'朵唯'   : 'dowe',
'黑米'   : 'heymi',
'锤子'   : 'hammer',
'酷比魔方': 'koobee',
'美图'   : 'meitu',
'尼比鲁' : 'nibilu',
'一加'   : 'oneplus',
'优购': 'yougo',
'诺基亚': 'nokia',
'糖葫芦': 'candy',
'中国移动':'ccmc',
'语信': 'yuxin',
'基伍': 'kiwu',
'青橙': 'greeno',
'华硕': 'asus',
'夏新': 'panosonic',
'维图': 'weitu',
'艾优尼': 'aiyouni',
'摩托罗拉': 'moto',
'乡米': 'xiangmi',
'米奇': 'micky',
'大可乐': 'bigcola',
'沃普丰': 'wpf',
'神舟': 'hasse',
'摩乐': 'mole',
'飞秒': 'fs',
'米歌': 'mige',
'富可视': 'fks',
'德赛': 'desci',
'梦米': 'mengmi',
'乐视': 'lshi',
'小杨树':'smallt',
'纽曼': 'newman',
'邦华' : 'banghua',
'E派' : 'epai',
'易派': 'epai',
'普耐尔': 'pner',
'欧新': 'ouxin',
'西米': 'ximi',
'海尔': 'haier',
'波导': 'bodao',
'糯米': 'nuomi',
'唯米': 'weimi',
'酷珀': 'kupo',
'谷歌': 'google',
'昂达': 'ada',
'聆韵': 'lingyun',
'华为': 'Huawei',
'小米': 'millet',
'魅族': 'Meizu',
'金立': 'Gionee'
}

### Separate models for devices with no events versus devices with events.

In [41]:
%%time

dtr_events   = dtr.merge(events, on='device_id', how='left')\
                  .merge(phone_brand_device_model, on='device_id', how='left')
    
dval_events  = dval.merge(events, on='device_id', how='left')\
                   .merge(phone_brand_device_model, on='device_id', how='left')

# devices in training set
tr_devices_with_events    = dtr_events.loc[dtr_events.event_id.notnull(), 'device_id']
tr_devices_with_no_events = dtr_events.loc[dtr_events.event_id.isnull(), 'device_id']

# devices in validation set
val_devices_with_events    = dval_events.loc[dval_events.event_id.notnull(), 'device_id']
val_devices_with_no_events = dval_events.loc[dval_events.event_id.isnull(), 'device_id'] 

# separate out training set based on whether events were generated by a device or not
tr_no_events = dtr.loc[dtr.device_id.isin(tr_devices_with_no_events), :]
tr_events    = dtr.loc[dtr.device_id.isin(tr_devices_with_events), :]

tr_no_events.loc[:, 'phone_brand'] = tr_no_events.phone_brand.map(lambda x: chinese_to_eng_brands[x] if x in chinese_to_eng_brands else x)
tr_events.loc[:, 'phone_brand']    = tr_events.phone_brand.map(lambda x: chinese_to_eng_brands[x] if x in chinese_to_eng_brands else x)


te_no_events = dval.loc[dval.device_id.isin(val_devices_with_no_events), :]
te_events    = dval.loc[dval.device_id.isin(val_devices_with_events), :]

te_no_events.loc[:, 'phone_brand'] = te_no_events.phone_brand.map(lambda x: chinese_to_eng_brands[x] if x in chinese_to_eng_brands else x)
te_events.loc[:, 'phone_brand']    = te_events.phone_brand.map(lambda x: chinese_to_eng_brands[x] if x in chinese_to_eng_brands else x)

del dtr_events, dval_events
gc.collect();

CPU times: user 871 ms, sys: 112 ms, total: 983 ms
Wall time: 985 ms


#### No Events

In [42]:
%%time

# no events

traintest = pd.concat((tr_no_events, te_no_events))
ntrain    = len(tr_no_events)

del tr_no_events, te_no_events
gc.collect();

# bag of words for phone brand
phone_brand_bow_vec = CountVectorizer()
phone_brand_bow     = phone_brand_bow_vec.fit_transform(traintest.phone_brand)

# bag of words for device model
device_model_bow_vec = CountVectorizer()
device_model_bow     = device_model_bow_vec.fit_transform(traintest.device_model)

# target encoding
traintest.loc[:, 'group'] = pd.factorize(traintest.group)[0]

CPU times: user 147 ms, sys: 0 ns, total: 147 ms
Wall time: 148 ms


In [43]:
# phone brand (freq)
traintest.loc[:, 'phone_brand_freq']  = traintest.groupby('phone_brand')['phone_brand'].transform(lambda x: len(x))

# device model (freq)
traintest.loc[:, 'device_model_freq'] = traintest.groupby('device_model')['device_model'].transform(lambda x: len(x))

# number of different device models for a particular brand
num_diff_models = traintest.groupby('phone_brand').apply(lambda x: x['device_model'].nunique())
traintest.loc[:, 'num_diff_models'] = traintest.phone_brand.map(num_diff_models)

FEATURES = ['phone_brand_freq', 
            'device_model_freq',
            'num_diff_models'
           ]

y_tr_ne  = traintest.iloc[:ntrain].loc[:, 'group'].values
y_val_ne = traintest.iloc[ntrain:].loc[:, 'group'].values

traintest = scale(traintest.loc[:, FEATURES].values)

X_tr     = sp.sparse.hstack((traintest[:ntrain], 
                      phone_brand_bow[:ntrain], 
                      device_model_bow[:ntrain]))

X_val    = sp.sparse.hstack((traintest[ntrain:], 
                      phone_brand_bow[ntrain:], 
                      device_model_bow[ntrain:]))

del traintest
gc.collect();

In [44]:
print('Shape of feature set (train) : {}'.format(X_tr.shape))
print('Shape of feature set (test) : {}'.format(X_val.shape))

Shape of feature set (train) : (8150, 943)
Shape of feature set (test) : (2053, 943)


In [45]:
m = LogisticRegression(C=.02, solver='lbfgs', multi_class='multinomial', n_jobs=-1, random_state=SEED)
m.fit(X_tr, y_tr_ne)

val_preds_ne = m.predict_proba(X_val)
print('Log Loss without events: {}'.format(log_loss(y_val_ne, val_preds_ne)))

Log Loss without events: 2.4145398583119966


#### Events

In [46]:
%%time

# no events

traintest = pd.concat((tr_events, te_events))
ntrain    = len(tr_events)

del tr_events, te_events
gc.collect();

# bag of words for phone brand
phone_brand_bow_vec = CountVectorizer()
phone_brand_bow     = phone_brand_bow_vec.fit_transform(traintest.phone_brand)

# bag of words for device model
device_model_bow_vec = CountVectorizer()
device_model_bow     = device_model_bow_vec.fit_transform(traintest.device_model)

# encode target
traintest.loc[:, 'group']  = pd.factorize(traintest.group)[0]

CPU times: user 99.3 ms, sys: 12 ms, total: 111 ms
Wall time: 111 ms


In [47]:
%%time

# phone brand (freq)
traintest.loc[:, 'phone_brand_freq']  = traintest.groupby('phone_brand')['phone_brand'].transform(lambda x: len(x))

# device model (freq)
traintest.loc[:, 'device_model_freq'] = traintest.groupby('device_model')['device_model'].transform(lambda x: len(x))

# number of different device models for a particular brand
num_diff_models = traintest.groupby('phone_brand').apply(lambda x: x['device_model'].nunique())
traintest.loc[:, 'num_diff_models'] = traintest.phone_brand.map(num_diff_models)

del num_diff_models
gc.collect();

# most_generated_event
most_generated_event = events.groupby('device_id').apply(lambda x: x['event_id'].value_counts().index.values[0])
traintest.loc[:, 'most_generated_event'] = traintest.device_id.map(most_generated_event).fillna(-1)

del most_generated_event
gc.collect();

# hour with most number of events by device
hour_with_most_events = events.groupby('device_id')\
                              .apply(lambda x: x['timestamp'].dt.hour.value_counts().index.values[0])


traintest.loc[:, 'hour_with_most_events'] = traintest.device_id.map(hour_with_most_events).fillna(0)

del hour_with_most_events
gc.collect();

# number of different hours at which events were generated
num_diff_hours = events.groupby('device_id').apply(lambda x: x['timestamp'].dt.hour.nunique())
traintest.loc[:, 'num_diff_hours'] = traintest.device_id.map(num_diff_hours).fillna(-1)

del num_diff_hours
gc.collect();

# number of events generated by a device
num_events = events.device_id.value_counts()
traintest.loc[:, 'num_events'] = traintest.device_id.map(num_events).fillna(0)

del num_events
gc.collect();

# number of different locations from where events were generated by device.
num_diff_locations = events.groupby('device_id').apply(lambda x: x.loc[:, ['longitude', 'latitude']].drop_duplicates().shape[0])
traintest.loc[:, 'num_diff_locations'] = traintest.device_id.map(num_diff_locations).fillna(-1)

del num_diff_locations
gc.collect();

# number of different applications
apps_with_events = traintest.merge(events, on='device_id', how='left')\
                            .merge(app_events, on='event_id', how='left')
    
num_diff_apps = apps_with_events.groupby('device_id').apply(lambda x: x['app_id'].nunique())
traintest.loc[:, 'num_diff_apps'] = traintest.device_id.map(num_diff_apps).fillna(-1)


del num_diff_apps, apps_with_events
gc.collect();

FEATURES = ['phone_brand_freq', 
            'device_model_freq',
            'num_diff_models',
            'most_generated_event',
            'hour_with_most_events',
            'num_events',
            'num_diff_hours',
            'num_diff_locations',
            'num_diff_apps'
           ]

y_tr_e  = traintest.iloc[:ntrain].loc[:, 'group'].values
y_val_e = traintest.iloc[ntrain:].loc[:, 'group'].values

traintest = scale(traintest.loc[:, FEATURES].values)

X_tr   = sp.sparse.hstack((traintest[:ntrain], 
                      phone_brand_bow[:ntrain], 
                      device_model_bow[:ntrain]))

X_val   = sp.sparse.hstack((traintest[ntrain:], 
                      phone_brand_bow[ntrain:], 
                      device_model_bow[ntrain:]))

del traintest
gc.collect();

CPU times: user 3min 37s, sys: 1.9 s, total: 3min 39s
Wall time: 3min 40s


In [48]:
print('Shape of feature set (train) : {}'.format(X_tr.shape))
print('Shape of feature set (test) : {}'.format(X_val.shape))

Shape of feature set (train) : (3794, 533)
Shape of feature set (test) : (933, 533)


In [49]:
m = LogisticRegression(C=.02, solver='lbfgs', multi_class='multinomial', n_jobs=-1, random_state=SEED)
m.fit(X_tr, y_tr_e)

val_preds_e = m.predict_proba(X_val)
print('Log Loss without events: {}'.format(log_loss(y_val_e, val_preds_e)))

Log Loss without events: 2.3680449371277104


In [51]:
# combine both devices with no events generated and devices that generated events
log_loss(np.hstack((y_val_e, y_val_ne)), np.vstack((val_preds_e, val_preds_ne)))

2.400012141813357

** Creating separate models for devices that have generated events vs devices that have generated event leads to lower log loss. **