In [1]:
import os
import re
import string
from IPython.display import display, Image

import numpy as np
import pandas as pd 
# Lift pandas' display truncation: with both limits set to None, a displayed
# DataFrame shows every column and every row.
for _display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_option, None)

import scipy
from scipy.sparse import csr_matrix, hstack

from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from keras.optimizers import SGD

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc, f1_score
from sklearn.feature_extraction import FeatureHasher
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD, SparsePCA
from sklearn.feature_selection import SelectPercentile, f_classif, chi2
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, scale
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss

import seaborn as sns

import matplotlib.pyplot as plt
import matplotlib.cm as cm
%matplotlib inline

Using TensorFlow backend.


In [2]:
# Group tables, both keyed by device_id.
grouptrain = (
    pd.read_csv('db/group.csv')
    .set_index('device_id')
)
grouptest = (
    pd.read_csv("input/gender_age_test.csv")
    .set_index('device_id')
)

# Brand / model per device, keyed by device_id.
device = (
    pd.read_csv('input/phone_brand_device_model.csv')
    # The file can list a device_id more than once; keep only its first row.
    .drop_duplicates(subset='device_id', keep='first')
    .set_index('device_id')
)

# Events, indexed by event_id, with the timestamp column parsed as dates.
events = pd.read_csv(
    "input/events.csv",
    index_col='event_id',
    parse_dates=['timestamp'],
)

# App events: only the three columns listed are read; is_active is loaded as bool.
appevents = pd.read_csv(
    "input/app_events.csv",
    usecols=['event_id', 'app_id', 'is_active'],
    dtype={'is_active': bool},
)

applabels = pd.read_csv("input/app_labels.csv")
# Not loaded (no visible cell uses it):
# label_categories = pd.read_csv("input/label_categories.csv")

In [3]:
# The raw file names the column `phone_brand`; the following cells read it as
# `device_brand`.
device = device.rename(columns=dict(phone_brand='device_brand'))

In [4]:
# Number the devices 0..n-1 in their current order. These numbers are used as
# the row indices of the sparse feature matrices built in the following cells.
grouptrain['train_row'] = np.arange(grouptrain.shape[0])
grouptest['test_row'] = np.arange(grouptest.shape[0])

In [5]:
# Integer-encode the brand names; a code is the position of the brand in
# le_brand.classes_.
le_brand = LabelEncoder()
device['brand'] = le_brand.fit_transform(device['device_brand'])

# Column assignment aligns on the device_id index, so every train/test device
# receives the brand code stored for the same device_id in `device`.
grouptrain['brand'] = device['brand']
grouptest['brand'] = device['brand']

In [6]:
# Brand indicator matrices: one row per device, a single 1 in the column of the
# device's brand code.
#
# The shape is passed explicitly. Without it scipy infers the number of columns
# from the largest brand code that happens to occur, so the train and test
# matrices could end up with different widths. Using len(le_brand.classes_)
# pins both to the full brand vocabulary, matching how the app and label
# matrices are built.
n_brandclasses = len(le_brand.classes_)
Xtrain_brand = csr_matrix((np.ones(len(grouptrain)), (grouptrain.train_row, grouptrain.brand)),
                          shape=(len(grouptrain), n_brandclasses))
Xtest_brand = csr_matrix((np.ones(len(grouptest)), (grouptest.test_row, grouptest.brand)),
                         shape=(len(grouptest), n_brandclasses))

In [7]:
# Report the dimensions of the two brand indicator matrices.
brand_shapes = (Xtrain_brand.shape, Xtest_brand.shape)
print('Brand features: train shape {}, test shape {}'.format(*brand_shapes))

Brand features: train shape (74645, 131), test shape (112071, 131)


In [8]:
# Encode every distinct brand+model string as an integer code.
# NOTE(review): brand and model are concatenated without a separator, so two
# different (brand, model) pairs could in principle yield the same string.
# Left unchanged here; confirm this is acceptable.
mobile = device.device_brand.str.cat(device.device_model)
le_model = LabelEncoder().fit(mobile)
device['model'] = le_model.transform(mobile)
n_modelclasses = len(le_model.classes_)

# Index-aligned assignment: each train/test device gets the code stored for the
# same device_id in `device`.
grouptrain['model'] = device['model']
grouptest['model'] = device['model']

# Model indicator matrices. The explicit shape keeps the train and test widths
# equal to the full model vocabulary instead of letting scipy infer the width
# from the largest code present in each set.
Xtrain_model = csr_matrix((np.ones(grouptrain.shape[0]),
                           (grouptrain.train_row, grouptrain.model)),
                          shape=(grouptrain.shape[0], n_modelclasses))
Xtest_model = csr_matrix((np.ones(grouptest.shape[0]),
                          (grouptest.test_row, grouptest.model)),
                         shape=(grouptest.shape[0], n_modelclasses))
print('Model features: train shape {}, test shape {}'.format(Xtrain_model.shape, Xtest_model.shape))

Model features: train shape (74645, 1667), test shape (112071, 1667)


In [9]:
# Integer-encode the app ids; n_appclasses is the number of distinct apps.
le_app = LabelEncoder()
appevents['app'] = le_app.fit_transform(appevents['app_id'])
n_appclasses = len(le_app.classes_)

# One row per (device_id, app) pair, carrying the pair's row count (`size`)
# and the device's train_row / test_row number.
deviceapps = (
    appevents
    # Attach the device_id of each event (events is indexed by event_id).
    .merge(events[['device_id']], how='left', left_on='event_id', right_index=True)
    # Count the rows of every (device_id, app) pair into a column named 'size'.
    .groupby(['device_id', 'app'])['app']
    .agg(['size'])
    # Left merges: a row number is NaN when the device is not in that table.
    .merge(grouptrain[['train_row']], how='left', left_index=True, right_index=True)
    .merge(grouptest[['test_row']], how='left', left_index=True, right_index=True)
    .reset_index()
)

In [10]:
# App indicator matrices: entry (row, app) is 1 when deviceapps contains a row
# for that device and app.
#
# train_row / test_row come out of left merges and hold NaN for devices outside
# the respective set, so they are float columns. Once the NaN rows are dropped
# they are cast to integers explicitly, rather than handing float row indices
# to csr_matrix and relying on implicit coercion.
dapp = deviceapps.dropna(subset=['train_row'])
Xtrain_app = csr_matrix((np.ones(dapp.shape[0]), (dapp.train_row.astype('int64'), dapp.app)),
                        shape=(grouptrain.shape[0], n_appclasses))
dapp = deviceapps.dropna(subset=['test_row'])
Xtest_app = csr_matrix((np.ones(dapp.shape[0]), (dapp.test_row.astype('int64'), dapp.app)),
                       shape=(grouptest.shape[0], n_appclasses))

In [11]:
# Keep only the label rows whose app_id occurs in appevents; le_app was fitted
# on appevents.app_id, so every remaining id can be transformed.
# .copy() makes the filtered frame independent of the original, so the column
# assignments below do not raise SettingWithCopyWarning.
applabels = applabels.loc[applabels.app_id.isin(appevents.app_id.unique())].copy()
applabels['app'] = le_app.transform(applabels.app_id)

# Integer-encode the label ids; n_labelsclasses is the number of distinct labels.
le_labels = LabelEncoder().fit(applabels.label_id)
applabels['label'] = le_labels.transform(applabels.label_id)
n_labelsclasses = len(le_labels.classes_)

In [12]:
# One row per (device_id, label) pair: the number of that device's apps that
# carry the label (`size`), plus the device's train_row / test_row number.
devicelabels = (
    deviceapps[['device_id', 'app']]
    # 'app' is the only column the two frames share, i.e. the join key.
    .merge(applabels[['app', 'label']], on='app')
    .groupby(['device_id', 'label'])['app']
    .agg(['size'])
    # Left merges: a row number is NaN when the device is not in that table.
    .merge(grouptrain[['train_row']], how='left', left_index=True, right_index=True)
    .merge(grouptest[['test_row']], how='left', left_index=True, right_index=True)
    .reset_index()
)
devicelabels.head()

Unnamed: 0,device_id,label,size,train_row,test_row
0,-9222956879900151005,117,1,33721.0,
1,-9222956879900151005,120,1,33721.0,
2,-9222956879900151005,126,1,33721.0,
3,-9222956879900151005,138,2,33721.0,
4,-9222956879900151005,147,2,33721.0,


In [13]:
# Label indicator matrices: entry (row, label) is 1 when devicelabels contains
# a row for that device and label.
#
# train_row / test_row are float columns (NaN marks devices outside the
# respective set). After dropping the NaN rows they are cast to integers
# explicitly, rather than passing float row indices to csr_matrix.
dapp = devicelabels.dropna(subset=['train_row'])
Xtrain_label = csr_matrix((np.ones(dapp.shape[0]), (dapp.train_row.astype('int64'), dapp.label)),
                          shape=(grouptrain.shape[0], n_labelsclasses))
dapp = devicelabels.dropna(subset=['test_row'])
Xtest_label = csr_matrix((np.ones(dapp.shape[0]), (dapp.test_row.astype('int64'), dapp.label)),
                         shape=(grouptest.shape[0], n_labelsclasses))

In [14]:
# Concatenate the four feature groups column-wise, in the order
# brand, model, app, label, and keep the result in CSR format.
Xtrain = hstack([Xtrain_brand, Xtrain_model, Xtrain_app, Xtrain_label], format='csr')
Xtest = hstack([Xtest_brand, Xtest_model, Xtest_app, Xtest_label], format='csr')

all_shapes = (Xtrain.shape, Xtest.shape)
print('All features: train shape {}, test shape {}'.format(*all_shapes))

All features: train shape (74645, 21527), test shape (112071, 21527)


In [15]:
# Integer-encode the target column `group`; le.classes_ maps each code back to
# its group name and n_classes is the number of distinct groups.
le = LabelEncoder()
y = le.fit_transform(grouptrain['group'])
n_classes = len(le.classes_)

In [16]:
# Multinomial logistic regression (lbfgs solver, C=0.02) fitted on the full
# training matrix.
clf = LogisticRegression(solver='lbfgs', multi_class='multinomial', C=0.02)
clf.fit(Xtrain, y)

# Class probabilities for every test device: rows are indexed by device_id,
# columns are named after the entries of le.classes_.
test_proba = clf.predict_proba(Xtest)
pred = pd.DataFrame(test_proba, index=grouptest.index, columns=le.classes_)
pred.head()

Unnamed: 0_level_0,F0-22,F23-26,F27-28,F29-32,F33-38,F39+,M0-22,M23-26,M27-28,M29-32,M33-38,M39+
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1002079943728939269,0.001019,0.006505,0.014032,0.013384,0.019838,0.043656,0.011813,0.034313,0.074756,0.169202,0.206469,0.405014
-1547860181818787117,0.006048,0.014264,0.031302,0.059026,0.047321,0.171971,0.006033,0.093162,0.056459,0.116598,0.181641,0.216174
7374582448058474277,0.015713,0.046021,0.03583,0.159737,0.118812,0.123501,0.013025,0.026272,0.043605,0.152647,0.129625,0.135212
-6220210354783429585,0.00321,0.025001,0.008382,0.012112,0.020735,0.229004,0.045639,0.135253,0.069829,0.122261,0.143592,0.18498
-5893464122623104785,0.035634,0.080179,0.042478,0.062225,0.044948,0.053792,0.091886,0.164689,0.097585,0.133064,0.100781,0.092741
