In [1]:
# based on https://www.kaggle.com/dvasyukova/talkingdata-mobile-user-demographics/a-linear-model-on-apps-and-labels/discussion


import pandas as pd
import numpy as np
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
import os
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import csr_matrix, hstack
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import log_loss
from sklearn.feature_selection import SelectPercentile, f_classif, chi2, SelectKBest
from sklearn import pipeline, grid_search



In [2]:
datadir = './raw_data'
train = pd.read_csv(os.path.join(datadir,'gender_age_train.csv'),index_col='device_id')
test = pd.read_csv(os.path.join(datadir,'gender_age_test.csv'), index_col = 'device_id')
phone = pd.read_csv(os.path.join(datadir,'phone_brand_device_model.csv'))

# Get rid of duplicate device ids in phone
phone = phone.drop_duplicates('device_id',keep='first').set_index('device_id')

events = pd.read_csv(os.path.join(datadir,'events.csv'),parse_dates=['timestamp'], index_col='event_id')
appevents = pd.read_csv(os.path.join(datadir,'app_events.csv'),usecols=['event_id','app_id','is_active'],dtype={'is_active':bool})
applabels = pd.read_csv(os.path.join(datadir,'app_labels.csv'))

In [3]:
train.head()

Unnamed: 0_level_0,gender,age,group
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-8076087639492063270,M,35,M32-38
-2897161552818060146,M,35,M32-38
-8260683887967679142,M,35,M32-38
-4938849341048082022,M,30,M29-31
245133531816851882,M,30,M29-31


In [4]:
test.head()

1002079943728939269
-1547860181818787117
7374582448058474277
-6220210354783429585
-5893464122623104785


In [5]:
phone.head()

Unnamed: 0_level_0,phone_brand,device_model
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1
-8890648629457979026,小米,红米
1277779817574759137,小米,MI 2
5137427614288105724,三星,Galaxy S4
3669464369358936369,SUGAR,时尚手机
-5019277647504317457,三星,Galaxy Note 2


In [6]:
events.head()

Unnamed: 0_level_0,device_id,timestamp,longitude,latitude
event_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,29182687948017175,2016-05-01 00:55:25,121.38,31.24
2,-6401643145415154744,2016-05-01 00:54:12,103.65,30.97
3,-4833982096941402721,2016-05-01 00:08:05,106.6,29.7
4,-6815121365017318426,2016-05-01 00:06:40,104.27,23.28
5,-5373797595892518570,2016-05-01 00:07:18,115.88,28.66


In [7]:
appevents.head()

Unnamed: 0,event_id,app_id,is_active
0,2,5927333115845830913,True
1,2,-5720078949152207372,False
2,2,-1633887856876571208,False
3,2,-653184325010919369,True
4,2,8693964245073640147,True


In [8]:
applabels.head()

Unnamed: 0,app_id,label_id
0,7324884708820027918,251
1,-4494216993218550286,251
2,6058196446775239644,406
3,6058196446775239644,407
4,8694625920731541625,406


In [9]:
train['trainrow'] = np.arange(train.shape[0])
test['testrow'] = np.arange(test.shape[0])

In [10]:
train.head()

Unnamed: 0_level_0,gender,age,group,trainrow
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
-8076087639492063270,M,35,M32-38,0
-2897161552818060146,M,35,M32-38,1
-8260683887967679142,M,35,M32-38,2
-4938849341048082022,M,30,M29-31,3
245133531816851882,M,30,M29-31,4


In [11]:
brandencoder = LabelEncoder().fit(phone.phone_brand)
phone['brand'] = brandencoder.transform(phone['phone_brand'])

train['brand'] = phone['brand']
test['brand'] = phone['brand']

Xtr_brand = csr_matrix((np.ones(train.shape[0]), (train.trainrow, train.brand)))
Xte_brand = csr_matrix((np.ones(test.shape[0]), (test.testrow, test.brand)))

print('Brand features: train shape {}, test shape {}'.format(Xtr_brand.shape, Xte_brand.shape))

Brand features: train shape (74645, 131), test shape (112071, 131)


In [12]:
m = phone.phone_brand.str.cat(phone.device_model)
modelencoder = LabelEncoder().fit(m)
phone['model'] = modelencoder.transform(m)
train['model'] = phone['model']
test['model'] = phone['model']
Xtr_model = csr_matrix((np.ones(train.shape[0]), (train.trainrow, train.model)))
Xte_model = csr_matrix((np.ones(test.shape[0]), (test.testrow, test.model)))
print('Model features: train shape {}, test shape {}'.format(Xtr_model.shape, Xte_model.shape))

Model features: train shape (74645, 1667), test shape (112071, 1667)


merge device_id column from events table to app_events

group the resulting dataframe by device_id and app and aggregate

merge in trainrow and testrow columns to know at which row to put each device in the features matrix

In [13]:
appencoder = LabelEncoder().fit(appevents.app_id)
appevents['app'] = appencoder.transform(appevents.app_id)
napps = len(appencoder.classes_)

deviceapps = (appevents.merge(events[['device_id']], how='left',left_on='event_id',right_index=True)
                       .groupby(['device_id','app'])['app'].agg(['size'])
                       .merge(train[['trainrow']], how='left', left_index=True, right_index=True)
                       .merge(test[['testrow']], how='left', left_index=True, right_index=True)
                       .reset_index())

deviceapps.head()

Unnamed: 0,device_id,app,size,trainrow,testrow
0,-9222956879900151005,548,18,21594.0,
1,-9222956879900151005,1096,18,21594.0,
2,-9222956879900151005,1248,26,21594.0,
3,-9222956879900151005,1545,12,21594.0,
4,-9222956879900151005,1664,18,21594.0,


In [14]:
d = deviceapps.dropna(subset=['trainrow'])
Xtr_app = csr_matrix((np.ones(d.shape[0]), (d.trainrow, d.app)),shape=(train.shape[0],napps))
d = deviceapps.dropna(subset=['testrow'])
Xte_app = csr_matrix((np.ones(d.shape[0]), (d.testrow, d.app)), shape=(test.shape[0],napps))

print('Apps data: train shape {}, test shape {}'.format(Xtr_app.shape, Xte_app.shape))

Apps data: train shape (74645, 19237), test shape (112071, 19237)


In [15]:
applabels = applabels.loc[applabels.app_id.isin(appevents.app_id.unique())]
applabels['app'] = appencoder.transform(applabels.app_id)
labelencoder = LabelEncoder().fit(applabels.label_id)
applabels['label'] = labelencoder.transform(applabels.label_id)
nlabels = len(labelencoder.classes_)

In [16]:
devicelabels = (deviceapps[['device_id','app']]
                .merge(applabels[['app','label']])
                .groupby(['device_id','label'])['app'].agg(['size'])
                .merge(train[['trainrow']], how='left', left_index=True, right_index=True)
                .merge(test[['testrow']], how='left', left_index=True, right_index=True)
                .reset_index())
devicelabels.head()

Unnamed: 0,device_id,label,size,trainrow,testrow
0,-9222956879900151005,117,1,21594.0,
1,-9222956879900151005,120,1,21594.0,
2,-9222956879900151005,126,1,21594.0,
3,-9222956879900151005,138,2,21594.0,
4,-9222956879900151005,147,2,21594.0,


In [17]:
d = devicelabels.dropna(subset=['trainrow'])
Xtr_label = csr_matrix((np.ones(d.shape[0]), (d.trainrow, d.label)), shape=(train.shape[0],nlabels))
d = devicelabels.dropna(subset=['testrow'])
Xte_label = csr_matrix((np.ones(d.shape[0]), (d.testrow, d.label)),shape=(test.shape[0],nlabels))
print('Labels data: train shape {}, test shape {}'.format(Xtr_label.shape, Xte_label.shape))

Labels data: train shape (74645, 492), test shape (112071, 492)


In [28]:
X_train = hstack((Xtr_brand, Xtr_model, Xtr_app, Xtr_label), format='csr')
X_test =  hstack((Xte_brand, Xte_model, Xte_app, Xte_label), format='csr')
print('All features: train shape {}, test shape {}'.format(X_train.shape, X_test.shape))

All features: train shape (74645, 21527), test shape (112071, 21527)


In [29]:
targetencoder = LabelEncoder().fit(train.group)
Y_train = targetencoder.transform(train.group)
nclasses = len(targetencoder.classes_)

In [33]:
for i in (20,22,25,27,30):
    
    X_train = hstack((Xtr_brand, Xtr_model, Xtr_app, Xtr_label), format='csr')
    X_test =  hstack((Xte_brand, Xte_model, Xte_app, Xte_label), format='csr')
    print('All features: train shape {}, test shape {}'.format(X_train.shape, X_test.shape))
    
    targetencoder = LabelEncoder().fit(train.group)
    Y_train = targetencoder.transform(train.group)
    nclasses = len(targetencoder.classes_)
           
    selector = SelectPercentile(f_classif, percentile=i)
    selector.fit(X_train, Y_train)

    X_train = selector.transform(X_train)
    X_test = selector.transform(X_test)

    lg = LogisticRegression(random_state=23, fit_intercept=True, class_weight= None,multi_class='multinomial',solver='lbfgs')
    param_grid = {'C': [0.01,0.005]}
    lg_gs = grid_search.GridSearchCV(estimator = lg, param_grid = param_grid, cv = 5, n_jobs=-1, scoring='log_loss', error_score=0, verbose=2) 
    lg_gs.fit(X_train, Y_train)
    print("Feature sel %",i)
    print("# Num of Features: ", X_train.shape[1])
    print("Best parameters found by grid search:", lg_gs.best_params_)
    print("Best CV score:", lg_gs.best_score_)
    print("Best CV score:", lg_gs.grid_scores_)

All features: train shape (74645, 21527), test shape (112071, 21527)




Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  1.6min finished


Feature sel % 20
# Num of Features:  4305
Best parameters found by grid search: {'C': 0.01}
Best CV score: -2.28324532162
Best CV score: [mean: -2.28325, std: 0.03914, params: {'C': 0.01}, mean: -2.29225, std: 0.03610, params: {'C': 0.005}]
All features: train shape (74645, 21527), test shape (112071, 21527)




Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  1.7min finished


Feature sel % 22
# Num of Features:  4735
Best parameters found by grid search: {'C': 0.01}
Best CV score: -2.28322082012
Best CV score: [mean: -2.28322, std: 0.03910, params: {'C': 0.01}, mean: -2.29223, std: 0.03608, params: {'C': 0.005}]
All features: train shape (74645, 21527), test shape (112071, 21527)




Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  1.4min finished


Feature sel % 25
# Num of Features:  5381
Best parameters found by grid search: {'C': 0.01}
Best CV score: -2.28316906
Best CV score: [mean: -2.28317, std: 0.03912, params: {'C': 0.01}, mean: -2.29220, std: 0.03609, params: {'C': 0.005}]
All features: train shape (74645, 21527), test shape (112071, 21527)




Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  1.5min finished


Feature sel % 27
# Num of Features:  5813
Best parameters found by grid search: {'C': 0.01}
Best CV score: -2.28325653209
Best CV score: [mean: -2.28326, std: 0.03913, params: {'C': 0.01}, mean: -2.29223, std: 0.03609, params: {'C': 0.005}]
All features: train shape (74645, 21527), test shape (112071, 21527)




Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  1.9min finished


Feature sel % 30
# Num of Features:  6458
Best parameters found by grid search: {'C': 0.01}
Best CV score: -2.28325854348
Best CV score: [mean: -2.28326, std: 0.03910, params: {'C': 0.01}, mean: -2.29223, std: 0.03608, params: {'C': 0.005}]


In [34]:
X_train = hstack((Xtr_brand, Xtr_model, Xtr_app, Xtr_label), format='csr')
X_test =  hstack((Xte_brand, Xte_model, Xte_app, Xte_label), format='csr')
print('All features: train shape {}, test shape {}'.format(X_train.shape, X_test.shape))
    
targetencoder = LabelEncoder().fit(train.group)
Y_train = targetencoder.transform(train.group)
nclasses = len(targetencoder.classes_)
           
selector = SelectPercentile(f_classif, percentile=25)
selector.fit(X_train, Y_train)
X_train = selector.transform(X_train)
X_test = selector.transform(X_test)

lg = LogisticRegression(random_state=23, fit_intercept=True, class_weight= None,multi_class='multinomial',solver='lbfgs')
param_grid = {'C': [0.01,0.008],'fit_intercept':[True,False]}
lg_gs = grid_search.GridSearchCV(estimator = lg, param_grid = param_grid, cv = 5, n_jobs=-1, scoring='log_loss', error_score=0, verbose=2) 
lg_gs.fit(X_train, Y_train)
print("Feature sel %",i)
print("# Num of Features: ", X_train.shape[1])
print("Best parameters found by grid search:", lg_gs.best_params_)
print("Best CV score:", lg_gs.best_score_)
print("Best CV score:", lg_gs.grid_scores_)

All features: train shape (74645, 21527), test shape (112071, 21527)




Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  2.8min finished


Feature sel % 30
# Num of Features:  5381
Best parameters found by grid search: {'fit_intercept': True, 'C': 0.01}
Best CV score: -2.28316906
Best CV score: [mean: -2.28317, std: 0.03912, params: {'fit_intercept': True, 'C': 0.01}, mean: -2.28962, std: 0.03957, params: {'fit_intercept': False, 'C': 0.01}, mean: -2.28539, std: 0.03827, params: {'fit_intercept': True, 'C': 0.008}, mean: -2.29275, std: 0.03878, params: {'fit_intercept': False, 'C': 0.008}]


In [37]:
pred = pd.DataFrame(lg_gs.predict_proba(X_test), index = test.index, columns=targetencoder.classes_)
print(pred.shape)
pred.head()

(112071, 12)


Unnamed: 0_level_0,F23-,F24-26,F27-28,F29-32,F33-42,F43+,M22-,M23-26,M27-28,M29-31,M32-38,M39+
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1002079943728939269,0.002033,0.006819,0.014414,0.017255,0.027333,0.047357,0.013426,0.036571,0.073591,0.11885,0.237829,0.404522
-1547860181818787117,0.007087,0.015188,0.027163,0.056455,0.078285,0.142082,0.008127,0.082532,0.050589,0.081477,0.216549,0.234466
7374582448058474277,0.026777,0.040104,0.040142,0.152076,0.168599,0.082911,0.014833,0.029422,0.041585,0.097946,0.171509,0.134096
-6220210354783429585,0.004797,0.027516,0.011455,0.015601,0.059172,0.17018,0.040908,0.118377,0.061111,0.090918,0.183585,0.21638
-5893464122623104785,0.052083,0.062311,0.043251,0.062435,0.059733,0.04622,0.101093,0.159362,0.091255,0.100551,0.128011,0.093695


In [38]:
pred.to_csv('.\\submissions\\sub_lr2.csv',index=False)