This script is based on https://www.kaggle.com/dvasyukova/talkingdata-mobile-user-demographics/a-linear-model-on-apps-and-labels/discussion

The script loads, tranforms, and merger the data. Then it constructs a sparce Matrix and applies Logistic Regression to classify each device_id. Then a Random Forest is applied on these prediction and other feature enginering. 


In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
import os
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import csr_matrix, hstack
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import log_loss
from sklearn.feature_selection import SelectPercentile, f_classif, chi2, SelectKBest
from sklearn import pipeline, grid_search
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from scipy import stats



Load data and check how it looks (More detailed info on the data can be found in EDA_Taking_Data notebook)

In [2]:
datadir = './raw_data'
train = pd.read_csv(os.path.join(datadir,'gender_age_train.csv'),index_col='device_id')
test = pd.read_csv(os.path.join(datadir,'gender_age_test.csv'), index_col = 'device_id')
phone = pd.read_csv(os.path.join(datadir,'phone_brand_device_model.csv'))

# Get rid of duplicate device ids in phone
phone = phone.drop_duplicates('device_id',keep='first').set_index('device_id')

events = pd.read_csv(os.path.join(datadir,'events.csv'),parse_dates=['timestamp'], index_col='event_id')
appevents = pd.read_csv(os.path.join(datadir,'app_events.csv'),usecols=['event_id','app_id','is_active'],dtype={'is_active':bool})
applabels = pd.read_csv(os.path.join(datadir,'app_labels.csv'))

In [3]:
train.head()

Unnamed: 0_level_0,gender,age,group
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-8076087639492063270,M,35,M32-38
-2897161552818060146,M,35,M32-38
-8260683887967679142,M,35,M32-38
-4938849341048082022,M,30,M29-31
245133531816851882,M,30,M29-31


In [4]:
test.head()

1002079943728939269
-1547860181818787117
7374582448058474277
-6220210354783429585
-5893464122623104785


In [5]:
phone.head()

Unnamed: 0_level_0,phone_brand,device_model
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1
-8890648629457979026,小米,红米
1277779817574759137,小米,MI 2
5137427614288105724,三星,Galaxy S4
3669464369358936369,SUGAR,时尚手机
-5019277647504317457,三星,Galaxy Note 2


In [6]:
appevents.head()

Unnamed: 0,event_id,app_id,is_active
0,2,5927333115845830913,True
1,2,-5720078949152207372,False
2,2,-1633887856876571208,False
3,2,-653184325010919369,True
4,2,8693964245073640147,True


In [7]:
applabels.head()

Unnamed: 0,app_id,label_id
0,7324884708820027918,251
1,-4494216993218550286,251
2,6058196446775239644,406
3,6058196446775239644,407
4,8694625920731541625,406


In [8]:
# get size of train and test
train['trainrow'] = np.arange(train.shape[0])
test['testrow'] = np.arange(test.shape[0])

Creation of sparce matrices for Brand, Model, Apps, and Apps Labels

In [9]:
brandencoder = LabelEncoder().fit(phone.phone_brand)
phone['brand'] = brandencoder.transform(phone['phone_brand'])

train['brand'] = phone['brand']
test['brand'] = phone['brand']

Xtr_brand = csr_matrix((np.ones(train.shape[0]), (train.trainrow, train.brand)))
Xte_brand = csr_matrix((np.ones(test.shape[0]), (test.testrow, test.brand)))

print('Brand features: train shape {}, test shape {}'.format(Xtr_brand.shape, Xte_brand.shape))

Brand features: train shape (74645, 131), test shape (112071, 131)


In [10]:
m = phone.phone_brand.str.cat(phone.device_model)
modelencoder = LabelEncoder().fit(m)
phone['model'] = modelencoder.transform(m)
train['model'] = phone['model']
test['model'] = phone['model']
Xtr_model = csr_matrix((np.ones(train.shape[0]), (train.trainrow, train.model)))
Xte_model = csr_matrix((np.ones(test.shape[0]), (test.testrow, test.model)))
print('Model features: train shape {}, test shape {}'.format(Xtr_model.shape, Xte_model.shape))
del phone

Model features: train shape (74645, 1667), test shape (112071, 1667)


In [11]:
appencoder = LabelEncoder().fit(appevents.app_id)
appevents['app'] = appencoder.transform(appevents.app_id)
napps = len(appencoder.classes_)

deviceapps = (appevents.merge(events[['device_id']], how='left',left_on='event_id',right_index=True)
                       .groupby(['device_id','app'])['app'].agg(['size'])
                       .merge(train[['trainrow']], how='left', left_index=True, right_index=True)
                       .merge(test[['testrow']], how='left', left_index=True, right_index=True)
                       .reset_index())

deviceapps.head()

Unnamed: 0,device_id,app,size,trainrow,testrow
0,-9222956879900151005,548,18,21594.0,
1,-9222956879900151005,1096,18,21594.0,
2,-9222956879900151005,1248,26,21594.0,
3,-9222956879900151005,1545,12,21594.0,
4,-9222956879900151005,1664,18,21594.0,


In [12]:
events['timestamp'] = pd.to_datetime(events['timestamp'])
events['hour'] = events['timestamp'].map(lambda x : x.hour)
events['min'] = events['timestamp'].map(lambda x : x.minute)
events['day'] = events['timestamp'].map(lambda x : x.day)


I fix latitude and longitud. There are many 0,0 values. These values are there when there was an error measuring. I'll change  this values to the mode of the device_id if this is not 0:

In [None]:
# fix longitud and latitude 
ids_to_fix = list(events.loc[(events.longitude < 5) & (events.latitude < 5) & (events.latitude > -5) & (events.longitude > -5),'device_id'].unique())
print("total entries to fix: ", len(ids_to_fix))
print("progress ... ")

counter = 0

for i in ids_to_fix:
    
    all_long = events[events.device_id == i].longitude
    all_lat = events[events.device_id == i].latitude
    
    try:
        
        mode_long = float(stats.mode(all_long[(all_long > 2) | (all_long < -2)])[0])
        mode_lat = float(stats.mode(all_lat[(all_lat > 2) | (all_lat < -2)])[0])
    
        events.loc[(events.device_id==i) & (events.longitude < 2) & (events.longitude > -2), 'longitude'] = mode_long
        events.loc[(events.device_id==i) & (events.latitude < 2) & (events.latitude > -2), 'latitude'] = mode_lat
        
    except:
        events.loc[events.device_id == i,'longitude'] = 0
        events.loc[events.device_id == i,'latitude'] = 0
    
    counter =  counter + 1
    if counter % 5000 == 0: 
        print(counter)

#events.to_csv('.\\transformed_data\\events.csv',index=False)

In [14]:
events.head(20)

Unnamed: 0,device_id,timestamp,longitude,latitude,hour,min,day
0,29182687948017175,2016-05-01 00:55:25,121.38,31.24,0,55,1
1,-6401643145415154744,2016-05-01 00:54:12,103.65,30.97,0,54,1
2,-4833982096941402721,2016-05-01 00:08:05,106.6,29.7,0,8,1
3,-6815121365017318426,2016-05-01 00:06:40,104.27,23.28,0,6,1
4,-5373797595892518570,2016-05-01 00:07:18,115.88,28.66,0,7,1
5,1476664663289716375,2016-05-01 00:27:21,0.0,0.0,0,27,1
6,5990807147117726237,2016-05-01 00:15:13,113.73,23.0,0,15,1
7,1782450055857303792,2016-05-01 00:15:35,113.94,34.7,0,15,1
8,-2073340001552902943,2016-05-01 00:15:33,0.0,0.0,0,15,1
9,-8195816569128397698,2016-05-01 00:41:31,119.34,26.04,0,41,1


In [13]:
# Comment out to load events already tranformed

del events
events = pd.read_csv('.\\transformed_data\\events.csv')
events.head()

Unnamed: 0,device_id,timestamp,longitude,latitude,hour,min,day
0,29182687948017175,2016-05-01 00:55:25,121.38,31.24,0,55,1
1,-6401643145415154744,2016-05-01 00:54:12,103.65,30.97,0,54,1
2,-4833982096941402721,2016-05-01 00:08:05,106.6,29.7,0,8,1
3,-6815121365017318426,2016-05-01 00:06:40,104.27,23.28,0,6,1
4,-5373797595892518570,2016-05-01 00:07:18,115.88,28.66,0,7,1


In [14]:
def replace (row):
    if row['amax'] != -1 and row['std']==-1:
        row['std']=0
    return row

gd = events.groupby('device_id')

longitude = gd['longitude'].agg([np.max, np.min, np.mean, np.std, np.median, stats.mode]).reset_index()
longitude['std'].fillna(-1,inplace=True)
longitude.replace(to_replace=0, value=-1, inplace=True)
longitude = longitude.rename(columns={'amax':'long_max','amin':'long_min','mean':'long_mean','median':'long_median','std':'long_std', 'mode':'long_mode'})


latitude = gd['latitude'].agg([np.max, np.min, np.mean, np.std, np.median, stats.mode]).reset_index()
latitude['std'].fillna(-1,inplace=True)
latitude.replace(to_replace=0, value=-1, inplace=True)
latitude = latitude.rename(columns={'amax':'lat_max','amin':'lat_min','mean':'lat_mean','median':'lat_median','std':'lat_std', 'mode':'lat_mode'})

hour = gd['hour'].agg([np.max, np.min, np.mean, np.std, np.median, stats.mode]).reset_index()
hour = hour.rename(columns={'amax':'hour_max','amin':'hour_min','mean':'hour_mean','median':'hour_median','std':'hour_std', 'mode':'hour_mode'})
hour['hour_std'].fillna(0,inplace=True)


minute = gd['min'].agg([np.max, np.min, np.mean, np.std, np.median, stats.mode]).reset_index()
minute = minute.rename(columns={'amax':'min_max','amin':'min_min','mean':'min_mean','median':'min_median','std':'min_std', 'mode':'min_mode'})
minute['min_std'].fillna(0,inplace=True)


day = gd['day'].agg([np.max, np.min, np.mean, np.std, np.median, stats.mode]).reset_index()
day = day.rename(columns={'amax':'day_max','amin':'day_min','mean':'day_mean','median':'day_median','std':'day_std', 'mode':'day_mode'})
day['day_std'].fillna(0,inplace=True)


In [15]:
longitude['long_mode'] = longitude['long_mode'].map(lambda x: float(x[0]) if float(x[0])!=0 else -1)
latitude['lat_mode'] = latitude['lat_mode'].map(lambda x: float(x[0]) if float(x[0])!=0 else -1)
hour['hour_mode'] = hour['hour_mode'].map(lambda x: float(x[0]) if float(x[0])!=0 else -1)
minute['min_mode'] = minute['min_mode'].map(lambda x: float(x[0]) if float(x[0])!=0 else -1)
day['day_mode'] = day['day_mode'].map(lambda x: float(x[0]) if float(x[0])!=0 else -1)

In [16]:
del events

In [17]:
d = deviceapps.dropna(subset=['trainrow'])
Xtr_app = csr_matrix((np.ones(d.shape[0]), (d.trainrow, d.app)),shape=(train.shape[0],napps))
d = deviceapps.dropna(subset=['testrow'])
Xte_app = csr_matrix((np.ones(d.shape[0]), (d.testrow, d.app)), shape=(test.shape[0],napps))

print('Apps data: train shape {}, test shape {}'.format(Xtr_app.shape, Xte_app.shape))

Apps data: train shape (74645, 19237), test shape (112071, 19237)


In [18]:
#applabels = pd.read_csv('./transformed_data/app_labels.csv')
applabels.head()

Unnamed: 0,app_id,label_id
0,7324884708820027918,251
1,-4494216993218550286,251
2,6058196446775239644,406
3,6058196446775239644,407
4,8694625920731541625,406


In [19]:
applabels = applabels.loc[applabels.app_id.isin(appevents.app_id.unique())]
applabels['app'] = appencoder.transform(applabels.app_id)
labelencoder = LabelEncoder().fit(applabels.label_id)
applabels['label'] = labelencoder.transform(applabels.label_id)
nlabels = len(labelencoder.classes_)

In [20]:
nlabels

492

In [21]:
devicelabels = (deviceapps[['device_id','app']]
                .merge(applabels[['app','label']])
                .groupby(['device_id','label'])['app'].agg(['size'])
                .merge(train[['trainrow']], how='left', left_index=True, right_index=True)
                .merge(test[['testrow']], how='left', left_index=True, right_index=True)
                .reset_index())
devicelabels.head()

Unnamed: 0,device_id,label,size,trainrow,testrow
0,-9222956879900151005,117,1,21594.0,
1,-9222956879900151005,120,1,21594.0,
2,-9222956879900151005,126,1,21594.0,
3,-9222956879900151005,138,2,21594.0,
4,-9222956879900151005,147,2,21594.0,


In [22]:
d = devicelabels.dropna(subset=['trainrow'])
Xtr_label = csr_matrix((np.ones(d.shape[0]), (d.trainrow, d.label)), shape=(train.shape[0],nlabels))
d = devicelabels.dropna(subset=['testrow'])
Xte_label = csr_matrix((np.ones(d.shape[0]), (d.testrow, d.label)),shape=(test.shape[0],nlabels))
print('Labels data: train shape {}, test shape {}'.format(Xtr_label.shape, Xte_label.shape))

Labels data: train shape (74645, 492), test shape (112071, 492)


In [23]:
X_train = hstack((Xtr_brand, Xtr_model, Xtr_app, Xtr_label), format='csr')
X_test =  hstack((Xte_brand, Xte_model, Xte_app, Xte_label), format='csr')
print('All features: train shape {}, test shape {}'.format(X_train.shape, X_test.shape))

All features: train shape (74645, 21527), test shape (112071, 21527)


In [24]:
targetencoder = LabelEncoder().fit(train.group)
Y_train = targetencoder.transform(train.group)
nclasses = len(targetencoder.classes_)

Now that I have merge all space matrices I do feature selection (f_classif). I test which % works best with Logistic regression:

In [26]:
for i in (21,22):
    
    X_train = hstack((Xtr_brand, Xtr_model, Xtr_app, Xtr_label), format='csr')
    X_test =  hstack((Xte_brand, Xte_model, Xte_app, Xte_label), format='csr')
    print('All features: train shape {}, test shape {}'.format(X_train.shape, X_test.shape))
    
    targetencoder = LabelEncoder().fit(train.group)
    Y_train = targetencoder.transform(train.group)
    nclasses = len(targetencoder.classes_)
           
    selector = SelectPercentile(f_classif, percentile=i)
    selector.fit(X_train, Y_train)

    X_train = selector.transform(X_train)
    X_test = selector.transform(X_test)

    lg = LogisticRegression(random_state=23, fit_intercept=True, class_weight= None,multi_class='multinomial',solver='lbfgs')
    param_grid = {'C': [0.01,0.02]}
    lg_gs = grid_search.GridSearchCV(estimator = lg, param_grid = param_grid, cv = 5, n_jobs=-1, scoring='log_loss', error_score=0, verbose=2) 
    lg_gs.fit(X_train, Y_train)
    print("Feature sel %",i)
    print("# Num of Features: ", X_train.shape[1])
    print("Best parameters found by grid search:", lg_gs.best_params_)
    print("Best CV score:", lg_gs.best_score_)
    print("Best CV score:", lg_gs.grid_scores_)


All features: train shape (74645, 21527), test shape (112071, 21527)




Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  1.6min finished


Feature sel % 21
# Num of Features:  4520
Best parameters found by grid search: {'C': 0.02}
Best CV score: -2.2808033846
Best CV score: [mean: -2.28323, std: 0.03914, params: {'C': 0.01}, mean: -2.28080, std: 0.04088, params: {'C': 0.02}]
All features: train shape (74645, 21527), test shape (112071, 21527)




Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  1.6min finished


Feature sel % 22
# Num of Features:  4735
Best parameters found by grid search: {'C': 0.02}
Best CV score: -2.28074957969
Best CV score: [mean: -2.28322, std: 0.03910, params: {'C': 0.01}, mean: -2.28075, std: 0.04082, params: {'C': 0.02}]


Now I calculate the probs for every group and join these results with long, lat and time info to run a random forest for final predictions

In [27]:
pred_train = pd.DataFrame(lg_gs.predict_proba(X_train), index = train.index, columns=targetencoder.classes_).reset_index()
print(pred_train.shape)
pred_train.head()

(74645, 13)


Unnamed: 0,device_id,F23-,F24-26,F27-28,F29-32,F33-42,F43+,M22-,M23-26,M27-28,M29-31,M32-38,M39+
0,-8076087639492063270,0.047493,0.065405,0.04282,0.06298,0.056873,0.044005,0.092716,0.162992,0.097672,0.101242,0.133037,0.092765
1,-2897161552818060146,0.047493,0.065405,0.04282,0.06298,0.056873,0.044005,0.092716,0.162992,0.097672,0.101242,0.133037,0.092765
2,-8260683887967679142,0.12913,0.065995,0.110943,0.063984,0.032393,0.035304,0.028364,0.102785,0.039319,0.073993,0.194298,0.123493
3,-4938849341048082022,0.05987,0.060034,0.042982,0.066702,0.077601,0.058478,0.102485,0.121519,0.078207,0.100052,0.125444,0.106626
4,245133531816851882,0.063184,0.071119,0.05015,0.062791,0.071574,0.044631,0.132182,0.141446,0.080956,0.09776,0.09768,0.086528


In [28]:
pred_test = pd.DataFrame(lg_gs.predict_proba(X_test), index = test.index, columns=targetencoder.classes_).reset_index()
print(pred_test.shape)
pred_test.head()

(112071, 13)


Unnamed: 0,device_id,F23-,F24-26,F27-28,F29-32,F33-42,F43+,M22-,M23-26,M27-28,M29-31,M32-38,M39+
0,1002079943728939269,0.001428,0.005957,0.013528,0.013671,0.02465,0.045969,0.011914,0.032744,0.074542,0.117421,0.253848,0.404329
1,-1547860181818787117,0.007579,0.013868,0.030752,0.060238,0.071574,0.151359,0.006105,0.089614,0.053106,0.07489,0.23361,0.207304
2,7374582448058474277,0.024119,0.037797,0.036462,0.158582,0.164146,0.080618,0.013062,0.026213,0.042532,0.103632,0.183762,0.129076
3,-6220210354783429585,0.003143,0.033565,0.009411,0.012331,0.054864,0.18849,0.042114,0.121374,0.067112,0.086888,0.178283,0.202425
4,-5893464122623104785,0.047493,0.065405,0.04282,0.06298,0.056873,0.044005,0.092716,0.162992,0.097672,0.101242,0.133037,0.092765


In [29]:
m1 = pd.merge(pred_train, hour, how='left', on='device_id')
m2 = pd.merge(m1, minute, how='left', on='device_id')
m3 = pd.merge(m2, longitude, how='left', on='device_id')
m4 = pd.merge(m3, day, how='left', on='device_id')
X_train = pd.merge(m4, latitude, how='left', on='device_id')

del m1
del m2
del m3
del m4

m1 = pd.merge(pred_test, hour, how='left', on='device_id')
m2 = pd.merge(m1, minute, how='left', on='device_id')
m3 = pd.merge(m2, longitude, how='left', on='device_id')
m4 = pd.merge(m3, day, how='left', on='device_id')
X_test = pd.merge(m4, latitude, how='left', on='device_id')

del m1
del m2
del m3
del m4



In [None]:
X_test.fillna(-1, inplace=True)
X_train.fillna(-1, inplace=True)

X_train.drop('device_id',1,inplace=True)
X_test.drop('device_id',1,inplace=True)

In [30]:
X_train.replace(to_replace=-1, value=np.NaN, inplace=True)
X_test.replace(to_replace=-1, value=np.NaN, inplace=True)

for col in list(X_train.isnull().sum().index[12:]):
    med_train = np.median(X_train[X_train[col].notnull()][col])
    X_train[col].fillna(med_train, inplace=True)
    
    med_test = np.median(X_test[X_test[col].notnull()][col])
    X_test[col].fillna(med_test, inplace=True)
    
X_train.drop('device_id',1,inplace=True)
X_test.drop('device_id',1,inplace=True)

In [31]:
del hour
del minute
del longitude
del latitude
del pred_train
del pred_test
del day

I test several trees ensambles and choose the one that performs best using CV (log_loss)

In [56]:
# Feature selection to remove noise
rfc = RandomForestClassifier(n_estimators = 400, n_jobs=-1, random_state=23, min_samples_leaf= 35)
rfc.fit(X_train, Y_train)
features_lb = sorted(zip(map(lambda x: round(x, 4), rfc.feature_importances_), X_train.columns), reverse=True, key=lambda pair: pair[0])
features_lb

[(0.12570000000000001, 'M39+'),
 (0.107, 'M32-38'),
 (0.1031, 'M22-'),
 (0.099400000000000002, 'M23-26'),
 (0.0809, 'F23-'),
 (0.074800000000000005, 'M29-31'),
 (0.069500000000000006, 'F33-42'),
 (0.054100000000000002, 'F24-26'),
 (0.052400000000000002, 'F29-32'),
 (0.052400000000000002, 'M27-28'),
 (0.050000000000000003, 'F43+'),
 (0.045199999999999997, 'F27-28'),
 (0.0064999999999999997, 'day_std'),
 (0.0060000000000000001, 'hour_mean'),
 (0.0060000000000000001, 'hour_std'),
 (0.0058999999999999999, 'min_std'),
 (0.0054000000000000003, 'min_mean'),
 (0.0048999999999999998, 'day_mean'),
 (0.0043, 'min_max'),
 (0.0043, 'min_median'),
 (0.0041000000000000003, 'hour_min'),
 (0.0041000000000000003, 'min_mode'),
 (0.0038, 'hour_median'),
 (0.0037000000000000002, 'min_min'),
 (0.0033, 'hour_mode'),
 (0.0025000000000000001, 'hour_max'),
 (0.0020999999999999999, 'day_mode'),
 (0.0016999999999999999, 'day_median'),
 (0.0016000000000000001, 'lat_mean'),
 (0.0015, 'lat_min'),
 (0.0014, 'long_min

In [57]:
print("\nTraining Ensamble Random Forest ....")
rfc = RandomForestClassifier(n_jobs=-1, random_state=23)
param_grid = {'n_estimators':[250],'min_samples_leaf': [20,30,40,45]}
rfc_gs = grid_search.GridSearchCV(estimator = rfc, param_grid = param_grid, cv = 5, n_jobs=-1, scoring='log_loss', error_score=0, verbose=2) 
rfc_gs.fit(X_train, Y_train)
print("Best parameters found by grid search:", rfc_gs.best_params_)
print("Best CV score:", rfc_gs.best_score_)
print("Best CV score:", rfc_gs.grid_scores_)

# 2.25664 [mean: -2.22787, std: 0.04694, params: {'min_samples_leaf': 60, 'n_estimators': 200}]
#-2.20669923957


Training Ensamble Random Forest ....
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  4.1min finished


Best parameters found by grid search: {'n_estimators': 250, 'min_samples_leaf': 30}
Best CV score: -2.20941663257
Best CV score: [mean: -2.20963, std: 0.05173, params: {'n_estimators': 250, 'min_samples_leaf': 20}, mean: -2.20942, std: 0.05100, params: {'n_estimators': 250, 'min_samples_leaf': 30}, mean: -2.20999, std: 0.04991, params: {'n_estimators': 250, 'min_samples_leaf': 40}, mean: -2.21049, std: 0.04975, params: {'n_estimators': 250, 'min_samples_leaf': 45}]


In [58]:
print("\nTraining Ensamble Adaboost ....")
abc = AdaBoostClassifier(n_estimators=20, random_state=3)
param_grid = {'learning_rate':[0.01]}
abc_gs = grid_search.GridSearchCV(estimator = abc, param_grid = param_grid, cv = 5, n_jobs=-1, scoring='log_loss', error_score=0, verbose=2) 
abc_gs.fit(X_train, Y_train)
print("Best parameters found by grid search:", abc_gs.best_params_)
print("Best CV score:", abc_gs.best_score_)
print("Best CV score:", abc_gs.grid_scores_)


Training Ensamble Adaboost ....
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   11.7s finished


Best parameters found by grid search: {'learning_rate': 0.01}
Best CV score: -2.38545306525
Best CV score: [mean: -2.38545, std: 0.01176, params: {'learning_rate': 0.01}]


In [None]:
print("\nTraining Ensamble Gradient Boosting ....")
gbc = GradientBoostingClassifier(random_state=3, n_estimators = 200, max_features ='auto', subsample=0.9)
param_grid = {'min_samples_leaf':[10,50,100],'learning_rate':[0.01]}
gbc_gs = grid_search.GridSearchCV(estimator = gbc, param_grid = param_grid, cv = 5, n_jobs=-1, scoring='log_loss', error_score=0, verbose=2) 
gbc_gs.fit(X_train, Y_train)
print("Best parameters found by grid search:", gbc_gs.best_params_)
print("Best CV score:", gbc_gs.best_score_)
print("Best CV score:", gbc_gs.grid_scores_)

In [32]:
print("\nTraining Extra Trees Classifier ....")
etc = ExtraTreesClassifier(n_estimators=300, criterion='gini', max_features='auto', n_jobs=-1, random_state=23)
param_grid = {'min_samples_leaf':[5,3]}
etc_gs = grid_search.GridSearchCV(estimator = etc, param_grid = param_grid, cv = 5, n_jobs=-1, scoring='log_loss', error_score=0, verbose=2) 
etc_gs.fit(X_train, Y_train)
print("Best parameters found by grid search:", etc_gs.best_params_)
print("Best CV score:", etc_gs.best_score_)
print("Best CV score:", etc_gs.grid_scores_)


Training Extra Trees Classifier ....
Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  2.7min finished


Best parameters found by grid search: {'min_samples_leaf': 5}
Best CV score: -2.20629668295
Best CV score: [mean: -2.20630, std: 0.05141, params: {'min_samples_leaf': 5}, mean: -2.20878, std: 0.05236, params: {'min_samples_leaf': 3}]


In [33]:
pred = pd.DataFrame(etc_gs.predict_proba(X_test), index = test.index, columns=targetencoder.classes_).reset_index()
print(pred.shape)
pred.head()

(112071, 13)


Unnamed: 0,device_id,F23-,F24-26,F27-28,F29-32,F33-42,F43+,M22-,M23-26,M27-28,M29-31,M32-38,M39+
0,1002079943728939269,0.003312,0.007772,0.006843,0.015459,0.038605,0.058415,0.008924,0.043286,0.044197,0.117741,0.238715,0.416732
1,-1547860181818787117,0.012711,0.013363,0.020778,0.06353,0.091204,0.141512,0.021931,0.058956,0.05095,0.082628,0.249958,0.192479
2,7374582448058474277,0.018663,0.036719,0.044474,0.14537,0.226642,0.097471,0.033321,0.029307,0.03631,0.090964,0.137951,0.102808
3,-6220210354783429585,0.010604,0.021142,0.025212,0.040569,0.063317,0.119393,0.031926,0.135205,0.068889,0.117324,0.163626,0.202793
4,-5893464122623104785,0.039893,0.077214,0.045475,0.062758,0.046131,0.037388,0.081623,0.168802,0.107432,0.105616,0.132364,0.095305


In [34]:
pred.to_csv('.\\submissions\\sub_et.csv',index=False)