```
Methodology:


```

In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import os
import gc
import pickle
import re
import feather

import matplotlib.pyplot as plt
import seaborn as sns

# Suppress every warning category for the rest of the session.
import warnings
warnings.filterwarnings('ignore')

# One seed for NumPy's global RNG; it is also handed to the estimator as
# random_state further down.
SEED = 1982
np.random.seed(SEED)

# Seaborn theme: 'talk' context with the 'ticks' style.
sns.set(style='ticks', context='talk')
print('Libraries have been loaded')

Libraries have been loaded


## Load Additional Libraries

In [14]:
import scipy as sp

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import scale
from sklearn.preprocessing import LabelEncoder

# Root directory of the project; every data path below is joined onto it.
# The original location stays the default, but it can be redirected without
# editing the notebook by exporting TALKINGDATA_BASEPATH (a leading "~" in the
# override is expanded; expanduser leaves the absolute default untouched).
basepath = os.path.expanduser(os.environ.get('TALKINGDATA_BASEPATH', '/home/ubuntu/TalkingData/'))

## Load Data

In [15]:
%%time

# Train / validation fold files, relative to `basepath`.
TRAIN_FILEPATH = 'data/processed/folds/trv1.feather'
TEST_FILEPATH  = 'data/processed/folds/valv1.feather'

# Fold tables, indexed by device_id as soon as they are read.
gatrain = pd.read_feather(os.path.join(basepath, TRAIN_FILEPATH)).set_index('device_id')
gatest  = pd.read_feather(os.path.join(basepath, TEST_FILEPATH)).set_index('device_id')

# Phone table: keep only the first row per device, then index by device_id.
phone = (
    pd.read_feather(os.path.join(basepath, 'data/processed/phone_brand_device_model.feather'))
      .drop_duplicates('device_id', keep='first')
      .set_index('device_id')
)

# App tables are left with the index read_feather gives them.
appevents = pd.read_feather(os.path.join(basepath, 'data/processed/app_events.feather'))
applabels = pd.read_feather(os.path.join(basepath, 'data/processed/app_labels.feather'))

# Event log, indexed by event_id.
events = pd.read_feather(os.path.join(basepath, 'data/processed/events.feather')).set_index('event_id')

CPU times: user 256 ms, sys: 152 ms, total: 408 ms
Wall time: 291 ms


In [16]:
# Positional row number of each device inside its own fold. Later cells use
# these as the row coordinates of the sparse feature matrices.
gatrain['trainrow'] = np.arange(len(gatrain))
gatest['testrow']   = np.arange(len(gatest))

In [17]:
# Integer-encode the phone brand and attach the code to every train/validation
# device (the assignments align on the shared device_id index).
brandencoder   = LabelEncoder().fit(phone.phone_brand)
phone['brand'] = brandencoder.transform(phone['phone_brand'])
nbrands        = len(brandencoder.classes_)

gatrain['brand'] = phone['brand']
gatest['brand']  = phone['brand']

# One-hot brand matrices: one row per device, one column per brand code.
# The shape is given explicitly so both folds always have `nbrands` columns.
# When the shape is inferred from the data, a fold that does not contain the
# highest brand code comes out narrower than the other fold, and the train and
# validation feature matrices stop lining up.
Xtr_brand = sp.sparse.csr_matrix((np.ones(gatrain.shape[0]),
                                  (gatrain.trainrow, gatrain.brand)),
                                 shape=(gatrain.shape[0], nbrands))

Xte_brand = sp.sparse.csr_matrix((np.ones(gatest.shape[0]),
                                  (gatest.testrow, gatest.brand)),
                                 shape=(gatest.shape[0], nbrands))

print('Brand features: train shape {}, test shape {}'.format(Xtr_brand.shape, Xte_brand.shape))

Brand features: train shape (55983, 131), test shape (18662, 131)


In [18]:
# A device model is identified by brand + model name concatenated (no separator),
# then integer-encoded and attached to every train/validation device
# (the assignments align on the shared device_id index).
m            = phone.phone_brand.str.cat(phone.device_model)
modelencoder = LabelEncoder().fit(m)
phone['model']   = modelencoder.transform(m)
nmodels          = len(modelencoder.classes_)

gatrain['model'] = phone['model']
gatest['model']  = phone['model']

# One-hot model matrices: one row per device, one column per model code.
# The shape is given explicitly so both folds always have `nmodels` columns.
# With an inferred shape, a fold that does not contain the highest model code
# would be narrower than the other fold and the two could not share a model.
Xtr_model = sp.sparse.csr_matrix((np.ones(gatrain.shape[0]),
                                  (gatrain.trainrow, gatrain.model)),
                                 shape=(gatrain.shape[0], nmodels))

Xte_model = sp.sparse.csr_matrix((np.ones(gatest.shape[0]),
                                  (gatest.testrow, gatest.model)),
                                 shape=(gatest.shape[0], nmodels))

print('Model features: train shape {}, test shape {}'.format(Xtr_model.shape, Xte_model.shape))

Model features: train shape (55983, 1667), test shape (18662, 1667)


In [23]:
# Integer-encode app ids and remember how many distinct apps there are.
appencoder       = LabelEncoder()
appevents['app'] = appencoder.fit_transform(appevents.app_id)
napps            = len(appencoder.classes_)

# One row per (device, app) pair found in the app-event log:
#   size     - number of app-event rows for that pair
#   trainrow - the device's row number in the train fold (NaN if not in it)
#   testrow  - the device's row number in the validation fold (NaN if not in it)
deviceapps = (
    appevents
    .merge(events[['device_id']], left_on='event_id', right_index=True, how='left')
    .groupby(['device_id', 'app'])['app']
    .agg(['size'])
    .merge(gatrain[['trainrow']], left_index=True, right_index=True, how='left')
    .merge(gatest[['testrow']], left_index=True, right_index=True, how='left')
    .reset_index()
)
deviceapps.head()

Unnamed: 0,device_id,app,size,trainrow,testrow
0,-9222956879900151005,548,18,15257.0,
1,-9222956879900151005,1096,18,15257.0,
2,-9222956879900151005,1248,26,15257.0,
3,-9222956879900151005,1545,12,15257.0,
4,-9222956879900151005,1664,18,15257.0,


In [24]:
# Device x app matrices: a 1.0 is stored for every (device, app) row of
# `deviceapps` that belongs to the fold; all other cells stay empty.

# Train fold: rows of deviceapps that carry a train row number.
d = deviceapps[deviceapps['trainrow'].notna()]
Xtr_app = sp.sparse.csr_matrix(
    (np.ones(len(d)), (d['trainrow'], d['app'])),
    shape=(len(gatrain), napps),
)

# Validation fold: rows of deviceapps that carry a validation row number.
d = deviceapps[deviceapps['testrow'].notna()]
Xte_app = sp.sparse.csr_matrix(
    (np.ones(len(d)), (d['testrow'], d['app'])),
    shape=(len(gatest), napps),
)

print('Apps data: train shape {}, test shape {}'.format(Xtr_app.shape, Xte_app.shape))

Apps data: train shape (55983, 19237), test shape (18662, 19237)


In [25]:
# Keep only the labels of apps that actually occur in the app-event log.
# `.copy()` makes the filtered frame independent of the frame it was sliced
# from, so the column assignments below write to a real frame instead of to a
# slice (the pattern pandas reports as SettingWithCopyWarning, which this
# notebook silences globally via warnings.filterwarnings('ignore')).
applabels        = applabels.loc[applabels.app_id.isin(appevents.app_id.unique())].copy()

# Reuse the app encoder fitted on appevents so `app` codes match deviceapps.
applabels['app'] = appencoder.transform(applabels.app_id)

# Integer-encode the label ids that survived the filter.
labelencoder       = LabelEncoder().fit(applabels.label_id)
applabels['label'] = labelencoder.transform(applabels.label_id)

nlabels = len(labelencoder.classes_)

In [26]:
# Expand every (device, app) pair into the labels of that app, then count per
# (device, label) how many of the device's apps carry the label. The train /
# validation row numbers are attached the same way as for `deviceapps`.
devicelabels = (
    deviceapps[['device_id', 'app']]
    .merge(applabels[['app', 'label']], on='app', how='inner')
    .groupby(['device_id', 'label'])['app']
    .agg(['size'])
    .merge(gatrain[['trainrow']], left_index=True, right_index=True, how='left')
    .merge(gatest[['testrow']], left_index=True, right_index=True, how='left')
    .reset_index()
)

devicelabels.head()

Unnamed: 0,device_id,label,size,trainrow,testrow
0,-9222956879900151005,117,1,15257.0,
1,-9222956879900151005,120,1,15257.0,
2,-9222956879900151005,126,1,15257.0,
3,-9222956879900151005,138,2,15257.0,
4,-9222956879900151005,147,2,15257.0,


In [28]:
# Device x label matrices: a 1.0 is stored for every (device, label) row of
# `devicelabels` that belongs to the fold; all other cells stay empty.

# Train fold: rows of devicelabels that carry a train row number.
d = devicelabels[devicelabels['trainrow'].notna()]
Xtr_label = sp.sparse.csr_matrix(
    (np.ones(len(d)), (d['trainrow'], d['label'])),
    shape=(len(gatrain), nlabels),
)

# Validation fold: rows of devicelabels that carry a validation row number.
d = devicelabels[devicelabels['testrow'].notna()]
Xte_label = sp.sparse.csr_matrix(
    (np.ones(len(d)), (d['testrow'], d['label'])),
    shape=(len(gatest), nlabels),
)

print('Labels data: train shape {}, test shape {}'.format(Xtr_label.shape, Xte_label.shape))

Labels data: train shape (55983, 492), test shape (18662, 492)


In [52]:
# device related features
traintest = pd.concat((gatrain, gatest))
# Fixed once, up front: the final split must not depend on the length of a
# variable that is rebound while the split is being made.
n_train   = len(gatrain)

# Number of devices (train + validation) sharing each device's brand / model.
# 'size' is the built-in equivalent of transform(lambda x: len(x)).
brand_freq      = traintest.groupby('brand')['brand'].transform('size')
model_freq      = traintest.groupby('model')['model'].transform('size')

# Number of distinct models observed for each device's brand.
num_diff_models = traintest.groupby('brand')['model'].nunique()
num_diff_models = traintest.brand.map(num_diff_models)

# All three series are indexed like `traintest` (train rows first, then
# validation rows), so they can be assembled into one frame directly.
other_features = pd.DataFrame({
    'brand_freq':      brand_freq,
    'model_freq':      model_freq,
    'num_diff_models': num_diff_models,
}, index=traintest.index)

# Standardise the columns over both folds together, then split by position.
# `traintest` is rebound to the scaled array, as the original cell did.
traintest = scale(other_features)

Xtr_other = traintest[:n_train]
Xte_other = traintest[n_train:]

In [84]:
%%time

traintest   = pd.concat((gatrain, gatest))
n_train     = len(gatrain)                      # fixed up front for the final split
device_ids  = traintest.reset_index().device_id # train devices first, then validation
events_flat = events.reset_index()              # event_id back as an ordinary column


def _align_to_devices(per_device, fill_value):
    """Look up a per-device statistic for every train/validation device.

    per_device -- Series indexed by device_id.
    fill_value -- value used for devices that have no entry in `per_device`.

    Returns a Series in `traintest` row order, indexed by device_id.
    """
    aligned = device_ids.map(per_device).fillna(fill_value)
    aligned.index = device_ids
    return aligned


# most_generated_event
# NOTE(review): `events` is indexed by event_id; if event ids are unique per row,
# every count below is 1 and this just picks one event id per device. Confirm
# what this feature was meant to capture. Computation left unchanged.
most_generated_event = events_flat.groupby('device_id')\
                                  .apply(lambda x: x['event_id'].value_counts().index.values[0])
most_generated_event = _align_to_devices(most_generated_event, -1)

# hour with most number of events by device (computation left unchanged so that
# ties between hours are resolved exactly as before)
hour_with_most_events = events_flat.groupby('device_id')\
                                   .apply(lambda x: x['timestamp'].dt.hour.value_counts().index.values[0])
hour_with_most_events = _align_to_devices(hour_with_most_events, -1)

# number of different hours at which events were generated
# (vectorised: distinct hours per device without a Python-level lambda per group)
num_diff_hours = events_flat['timestamp'].dt.hour.groupby(events_flat['device_id']).nunique()
num_diff_hours = _align_to_devices(num_diff_hours, 0)

# number of events generated by a device
num_events = events_flat.device_id.value_counts()
num_events = _align_to_devices(num_events, 0)

# number of different locations from where events were generated by device.
# (vectorised: de-duplicate (device, longitude, latitude) once, then count per device)
# NOTE(review): devices without events get -1 here but 0 for the other counts.
num_diff_locations = (events_flat[['device_id', 'longitude', 'latitude']]
                      .drop_duplicates()
                      .groupby('device_id')
                      .size())
num_diff_locations = _align_to_devices(num_diff_locations, -1)

# Every series is in `traintest` row order; column order matches the original cell.
events_features = pd.DataFrame({
    'most_generated_event':  most_generated_event,
    'hour_with_most_events': hour_with_most_events,
    'num_diff_hours':        num_diff_hours,
    'num_events':            num_events,
    'num_diff_locations':    num_diff_locations,
}, index=traintest.index)

# Standardise the columns over both folds together, then split by position.
# `traintest` is rebound to the scaled array, as the original cell did.
traintest = scale(events_features)

Xtr_events = traintest[:n_train]
Xte_events = traintest[n_train:]

CPU times: user 2min 50s, sys: 1.16 s, total: 2min 51s
Wall time: 2min 44s


In [85]:
# Column-wise concatenation of every feature block, in the same order for both
# folds, stored as CSR.
train_blocks = [Xtr_brand, Xtr_model, Xtr_app, Xtr_label, Xtr_other, Xtr_events]
test_blocks  = [Xte_brand, Xte_model, Xte_app, Xte_label, Xte_other, Xte_events]

Xtrain = sp.sparse.hstack(train_blocks, format='csr')
Xtest  = sp.sparse.hstack(test_blocks, format='csr')

print('All features: train shape {}, test shape {}'.format(Xtrain.shape, Xtest.shape))

All features: train shape (55983, 21535), test shape (18662, 21535)


In [86]:
# Encode the target groups as integers. The encoder is fitted on the labels of
# both folds so train and validation share a single group -> code mapping.
all_groups    = gatrain.group.tolist() + gatest.group.tolist()
targetencoder = LabelEncoder()
targetencoder.fit(all_groups)

y_train = targetencoder.transform(gatrain.group)
y_val   = targetencoder.transform(gatest.group)

nclasses = len(targetencoder.classes_)

In [91]:
# Multinomial logistic regression on the stacked feature matrix.
m = LogisticRegression(C=.02, solver='lbfgs', multi_class='multinomial', n_jobs=-1, random_state=SEED)
m.fit(Xtrain, y_train)

# Class probabilities for the validation fold, one column per entry of m.classes_.
val_preds = m.predict_proba(Xtest)

# Pass the model's class list explicitly. Without it log_loss infers the label
# set from y_val alone, which fails (or mis-aligns the probability columns)
# whenever the validation fold happens to be missing one of the classes.
print('Log loss: {}'.format(log_loss(y_val, val_preds, labels=m.classes_)))

Log loss: 2.2693569944989056


**Creating separate models for devices that have generated events vs. devices that have not generated any events leads to lower log loss.**