In [42]:
import os
import re
import string
from IPython.display import display, Image

import numpy as np
import pandas as pd 
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import scipy
from scipy.sparse import csr_matrix, hstack

from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from keras.optimizers import SGD

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc, f1_score
from sklearn.feature_extraction import FeatureHasher
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD, SparsePCA
from sklearn.feature_selection import SelectPercentile, f_classif, chi2
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, scale
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss

import seaborn as sns

import matplotlib.pyplot as plt
import matplotlib.cm as cm
%matplotlib inline

In [125]:
grouptrain = pd.read_csv('db/group.csv').set_index('device_id')
grouptest = pd.read_csv("input/gender_age_test.csv").set_index('device_id')
device = pd.read_csv('input/phone_brand_device_model.csv')
# Get rid of duplicate device ids in phone
device = device.drop_duplicates('device_id',keep='first').set_index('device_id')
events = pd.read_csv("input/events.csv", parse_dates=['timestamp'], index_col='event_id')
appevents = pd.read_csv("input/app_events.csv", usecols=['event_id', 'app_id', 'is_active'], dtype={'is_active':bool})
applabels = pd.read_csv("input/app_labels.csv")
# label_categories = pd.read_csv("input/label_categories.csv")

In [131]:
device = device.rename(columns={'phone_brand' : 'device_brand'})

In [126]:
grouptrain['train_row'] = np.arange(len(grouptrain))
grouptest['test_row'] = np.arange(len(grouptest))

In [132]:
le_brand = LabelEncoder().fit(device.device_brand)
device['brand'] = le_brand.transform(device['device_brand'])
grouptrain["brand"] = device["brand"]
grouptest['brand'] = device['brand']

In [133]:
Xtrain_brand = csr_matrix((np.ones(len(grouptrain)), (grouptrain.train_row, grouptrain.brand)))
Xtest_brand = csr_matrix((np.ones(len(grouptest)), (grouptest.test_row, grouptest.brand)))

In [135]:
print('Brand features: train shape {}, test shape {}'.format(Xtrain_brand.shape, Xtest_brand.shape))

Brand features: train shape (74645, 131), test shape (112071, 131)


In [137]:
mobile = device.device_brand.str.cat(device.device_model)
le_model = LabelEncoder().fit(mobile)
device['model'] = le_model.transform(mobile)
grouptrain['model'] = device['model']
grouptest['model'] = device['model']
Xtrain_model = csr_matrix((np.ones(grouptrain.shape[0]), 
                       (grouptrain.train_row, grouptrain.model)))
Xtest_model = csr_matrix((np.ones(grouptest.shape[0]), 
                       (grouptest.test_row, grouptest.model)))
print('Model features: train shape {}, test shape {}'.format(Xtrain_model.shape, Xtest_model.shape))

Model features: train shape (74645, 1667), test shape (112071, 1667)


In [139]:
le_app = LabelEncoder().fit(appevents.app_id)
appevents['app'] = le_app.transform(appevents.app_id)
n_appclasses = len(le_app.classes_)
deviceapps = (appevents.merge(events[['device_id']], how='left',left_on='event_id',right_index=True)
                       .groupby(['device_id','app'])['app'].agg(['size'])
                       .merge(grouptrain[['train_row']], how='left', left_index=True, right_index=True)
                       .merge(grouptest[['test_row']], how='left', left_index=True, right_index=True)
                       .reset_index())

In [141]:
dapp = deviceapps.dropna(subset=['train_row'])
Xtrain_app = csr_matrix((np.ones(dapp.shape[0]), (dapp.train_row, dapp.app)), 
                      shape=(grouptrain.shape[0],n_appclasses))
dapp = deviceapps.dropna(subset=['test_row'])
Xtest_app = csr_matrix((np.ones(dapp.shape[0]), (dapp.test_row, dapp.app)), 
                      shape=(grouptest.shape[0],n_appclasses))

In [145]:
applabels = applabels.loc[applabels.app_id.isin(appevents.app_id.unique())]
applabels['app'] = le_app.transform(applabels.app_id)
le_labels = LabelEncoder().fit(applabels.label_id)
applabels['label'] = le_labels.transform(applabels.label_id)
n_labelsclasses = len(le_labels.classes_)

In [146]:
devicelabels = (deviceapps[['device_id','app']]
                .merge(applabels[['app','label']])
                .groupby(['device_id','label'])['app'].agg(['size'])
                .merge(grouptrain[['train_row']], how='left', left_index=True, right_index=True)
                .merge(grouptest[['test_row']], how='left', left_index=True, right_index=True)
                .reset_index())
devicelabels.head()

Unnamed: 0,device_id,label,size,train_row,test_row
0,-9222956879900151005,117,1,33721.0,
1,-9222956879900151005,120,1,33721.0,
2,-9222956879900151005,126,1,33721.0,
3,-9222956879900151005,138,2,33721.0,
4,-9222956879900151005,147,2,33721.0,


In [147]:
dapp = devicelabels.dropna(subset=['train_row'])
Xtrain_label = csr_matrix((np.ones(dapp.shape[0]), (dapp.train_row, dapp.label)), 
                      shape=(grouptrain.shape[0],n_labelsclasses))
dapp = devicelabels.dropna(subset=['test_row'])
Xtest_label = csr_matrix((np.ones(dapp.shape[0]), (dapp.test_row, dapp.label)), 
                      shape=(grouptest.shape[0],n_labelsclasses))

In [148]:
Xtrain = hstack((Xtrain_brand, Xtrain_model, Xtrain_app, Xtrain_label), format='csr')
Xtest =  hstack((Xtest_brand, Xtest_model, Xtest_app, Xtest_label), format='csr')
print('All features: train shape {}, test shape {}'.format(Xtrain.shape, Xtest.shape))

All features: train shape (74645, 21527), test shape (112071, 21527)


In [149]:
le = LabelEncoder().fit(grouptrain.group)
y = le.transform(grouptrain.group)
n_classes = len(le.classes_)
y_dummies = np_utils.to_categorical(y)

In [150]:
def batch_generator(X, y, batch_size, shuffle):
    number_of_batches = np.ceil(X.shape[0]/batch_size)
    counter = 0
    sample_index = np.arange(X.shape[0])
    if shuffle:
        np.random.shuffle(sample_index)
    while True:
        batch_index = sample_index[batch_size*counter:batch_size*(counter+1)]
        X_batch = X[batch_index,:].toarray()
        y_batch = y[batch_index]
        counter += 1
        yield X_batch, y_batch
        if (counter == number_of_batches):
            if shuffle:
                np.random.shuffle(sample_index)
            counter = 0

In [151]:
def batch_generatorp(X, batch_size, shuffle):
    number_of_batches = X.shape[0] / np.ceil(X.shape[0]/batch_size)
    counter = 0
    sample_index = np.arange(X.shape[0])
    while True:
        batch_index = sample_index[batch_size * counter:batch_size * (counter + 1)]
        X_batch = X[batch_index, :].toarray()
        counter += 1
        yield X_batch
        if (counter == number_of_batches):
            counter = 0

In [152]:
def baseline_model():
    model = Sequential()
    model.add(Dense(50, input_dim=Xtrain.shape[1], init='normal', activation='tanh'))
    model.add(Dropout(0.5))
    model.add(Dense(12, init='normal', activation='sigmoid'))
    model.compile(loss='categorical_crossentropy', optimizer='adadelta', metrics=['accuracy'])  #logloss
    return model

model=baseline_model()

In [160]:
X_train, X_val, y_train, y_val = train_test_split(Xtrain, y_dummies, test_size=0.2, random_state=42)

fit= model.fit_generator(generator=batch_generator(X_train, y_train, 32, True),
                         nb_epoch=15,
                         samples_per_epoch=70496,
                         validation_data=(X_val.todense(), y_val), verbose=2
                         )

Epoch 1/15




94s - loss: 2.2317 - acc: 0.2124 - val_loss: 2.2164 - val_acc: 0.2167
Epoch 2/15
90s - loss: 2.2300 - acc: 0.2135 - val_loss: 2.2178 - val_acc: 0.2173
Epoch 3/15
89s - loss: 2.2260 - acc: 0.2178 - val_loss: 2.2205 - val_acc: 0.2168
Epoch 4/15
87s - loss: 2.2241 - acc: 0.2156 - val_loss: 2.2221 - val_acc: 0.2148
Epoch 5/15
93s - loss: 2.2224 - acc: 0.2160 - val_loss: 2.2224 - val_acc: 0.2137
Epoch 6/15
95s - loss: 2.2160 - acc: 0.2186 - val_loss: 2.2237 - val_acc: 0.2131
Epoch 7/15
87s - loss: 2.2146 - acc: 0.2189 - val_loss: 2.2251 - val_acc: 0.2131
Epoch 8/15
86s - loss: 2.2163 - acc: 0.2193 - val_loss: 2.2256 - val_acc: 0.2131
Epoch 9/15
86s - loss: 2.2056 - acc: 0.2265 - val_loss: 2.2270 - val_acc: 0.2143
Epoch 10/15
92s - loss: 2.2081 - acc: 0.2240 - val_loss: 2.2284 - val_acc: 0.2104
Epoch 11/15
91s - loss: 2.2035 - acc: 0.2241 - val_loss: 2.2283 - val_acc: 0.2121
Epoch 12/15
89s - loss: 2.2006 - acc: 0.2267 - val_loss: 2.2304 - val_acc: 0.2115
Epoch 13/15
92s - loss: 2.1987 - acc

In [161]:
scores_val = model.predict_generator(generator=batch_generatorp(X_val, 32, False), val_samples=X_val.shape[0])
scores = model.predict_generator(generator=batch_generatorp(Xtest, 32, False), val_samples=Xtest.shape[0])

print('logloss val {}'.format(log_loss(y_val, scores_val)))

logloss val 2.231851933004109


In [162]:
pred = pd.DataFrame(scores, index = grouptest.index, columns=le.classes_)

In [163]:
pred.head()

Unnamed: 0_level_0,F0-22,F23-26,F27-28,F29-32,F33-38,F39+,M0-22,M23-26,M27-28,M29-32,M33-38,M39+
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1002079943728939269,3.564054e-07,1e-06,2e-06,3e-06,8e-06,2e-05,4e-06,2e-05,2.2e-05,8e-05,0.000106,0.000245
-1547860181818787117,9.153597e-07,4e-06,8e-06,1.5e-05,2.2e-05,4.3e-05,3e-06,2.9e-05,4.6e-05,0.000108,0.000139,7.3e-05
7374582448058474277,0.00142664,0.007812,0.009517,0.031844,0.033325,0.021419,0.001914,0.006517,0.010481,0.039864,0.039532,0.031826
-6220210354783429585,2.133572e-06,1e-05,6e-06,6e-06,1e-05,3.7e-05,3.9e-05,0.0002,0.00011,0.000128,0.000113,0.000105
-5893464122623104785,0.004581804,0.013597,0.008271,0.01204,0.007646,0.008446,0.015809,0.037197,0.025206,0.031289,0.023275,0.0185


In [164]:
pred.to_csv('talkdatakeras.csv')