This script is based on https://www.kaggle.com/dvasyukova/talkingdata-mobile-user-demographics/a-linear-model-on-apps-and-labels/discussion

The script loads, transforms, and merges the data. Then it constructs a sparse matrix and applies Logistic Regression to classify each device_id. Then a Random Forest is applied to these predictions and other engineered features. 


In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
import os
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import csr_matrix, hstack
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import log_loss
from sklearn.feature_selection import SelectPercentile, f_classif, chi2, SelectKBest
from sklearn import pipeline, grid_search
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from scipy import stats



Load data and check how it looks (More detailed info on the data can be found in EDA_Taking_Data notebook)

In [2]:
# All raw CSV files are read from this directory.
datadir = './raw_data'

# Train and test device tables, both indexed by device_id.
train = pd.read_csv(os.path.join(datadir, 'gender_age_train.csv'), index_col='device_id')
test = pd.read_csv(os.path.join(datadir, 'gender_age_test.csv'), index_col='device_id')

# Phone metadata: keep only the first row for each device_id, then index by it.
phone = (
    pd.read_csv(os.path.join(datadir, 'phone_brand_device_model.csv'))
    .drop_duplicates('device_id', keep='first')
    .set_index('device_id')
)

# Events (indexed by event_id, timestamp parsed as datetime), the apps seen in
# each event, and the label ids attached to each app.
events = pd.read_csv(
    os.path.join(datadir, 'events.csv'),
    parse_dates=['timestamp'],
    index_col='event_id',
)
appevents = pd.read_csv(
    os.path.join(datadir, 'app_events.csv'),
    usecols=['event_id', 'app_id', 'is_active'],
    dtype={'is_active': bool},
)
applabels = pd.read_csv(os.path.join(datadir, 'app_labels.csv'))

In [3]:
# Preview the first five train rows.
train.head(n=5)

Unnamed: 0_level_0,gender,age,group
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-8076087639492063270,M,35,M32-38
-2897161552818060146,M,35,M32-38
-8260683887967679142,M,35,M32-38
-4938849341048082022,M,30,M29-31
245133531816851882,M,30,M29-31


In [4]:
# Preview the first five test rows.
test.head(n=5)

1002079943728939269
-1547860181818787117
7374582448058474277
-6220210354783429585
-5893464122623104785


In [5]:
# Preview the first five de-duplicated phone rows.
phone.head(n=5)

Unnamed: 0_level_0,phone_brand,device_model
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1
-8890648629457979026,小米,红米
1277779817574759137,小米,MI 2
5137427614288105724,三星,Galaxy S4
3669464369358936369,SUGAR,时尚手机
-5019277647504317457,三星,Galaxy Note 2


In [7]:
# Preview the first five app-event rows.
appevents.head(n=5)

Unnamed: 0,event_id,app_id,is_active
0,2,5927333115845830913,True
1,2,-5720078949152207372,False
2,2,-1633887856876571208,False
3,2,-653184325010919369,True
4,2,8693964245073640147,True


In [8]:
# Preview the first five app-label rows.
applabels.head(n=5)

Unnamed: 0,app_id,label_id
0,7324884708820027918,251
1,-4494216993218550286,251
2,6058196446775239644,406
3,6058196446775239644,407
4,8694625920731541625,406


In [9]:
# Number the train and test devices 0..n-1; these positions are used as the
# row indices of the sparse feature matrices built in the following cells.
train['trainrow'] = np.arange(len(train))
test['testrow'] = np.arange(len(test))

Creation of sparse matrices for Brand, Model, Apps, and App Labels

In [10]:
# Integer-encode the phone brand.
brandencoder = LabelEncoder()
brandencoder.fit(phone.phone_brand)
phone['brand'] = brandencoder.transform(phone['phone_brand'])

# Copy the encoded brand onto train/test; the assignment aligns on the
# device_id index shared with `phone`.
train['brand'] = phone['brand']
test['brand'] = phone['brand']

# One 1.0 entry per device at (row position, brand code).
Xtr_brand = csr_matrix(
    (np.ones(len(train)), (train.trainrow, train.brand))
)
Xte_brand = csr_matrix(
    (np.ones(len(test)), (test.testrow, test.brand))
)

print('Brand features: train shape {}, test shape {}'.format(Xtr_brand.shape, Xte_brand.shape))

Brand features: train shape (74645, 131), test shape (112071, 131)


In [11]:
# Brand and device model are concatenated into a single string, which is then
# integer-encoded.
m = phone.phone_brand.str.cat(phone.device_model)
modelencoder = LabelEncoder()
modelencoder.fit(m)
phone['model'] = modelencoder.transform(m)

# Index-aligned copy of the encoded model onto train/test.
train['model'] = phone['model']
test['model'] = phone['model']

# One 1.0 entry per device at (row position, model code).
Xtr_model = csr_matrix(
    (np.ones(len(train)), (train.trainrow, train.model))
)
Xte_model = csr_matrix(
    (np.ones(len(test)), (test.testrow, test.model))
)
print('Model features: train shape {}, test shape {}'.format(Xtr_model.shape, Xte_model.shape))

# `phone` is not referenced by the later cells shown in this notebook.
del phone

Model features: train shape (74645, 1667), test shape (112071, 1667)


In [13]:
# Integer-encode app ids; `napps` is the number of distinct apps seen.
appencoder = LabelEncoder()
appencoder.fit(appevents.app_id)
appevents['app'] = appencoder.transform(appevents.app_id)
napps = len(appencoder.classes_)

# Step 1: attach the device_id of the owning event to every app-event row.
appevents_with_device = appevents.merge(
    events[['device_id']], how='left', left_on='event_id', right_index=True
)

# Step 2: `size` = number of app-event rows per (device_id, app) pair.
app_counts = appevents_with_device.groupby(['device_id', 'app'])['app'].agg(['size'])
del appevents_with_device

# Step 3: look up each device's train/test row position (NaN when the device
# is absent from that table), then flatten the index back into columns.
deviceapps = app_counts.merge(train[['trainrow']], how='left', left_index=True, right_index=True)
deviceapps = deviceapps.merge(test[['testrow']], how='left', left_index=True, right_index=True)
deviceapps = deviceapps.reset_index()
del app_counts

deviceapps.head()

Unnamed: 0,device_id,app,size,trainrow,testrow
0,-9222956879900151005,548,18,21594.0,
1,-9222956879900151005,1096,18,21594.0,
2,-9222956879900151005,1248,26,21594.0,
3,-9222956879900151005,1545,12,21594.0,
4,-9222956879900151005,1664,18,21594.0,


In [14]:
# Make sure the timestamp column is datetime typed, then derive the calendar
# parts with the vectorized `.dt` accessor instead of a Python-level lambda
# called once per row.
events['timestamp'] = pd.to_datetime(events['timestamp'])
events['hour'] = events['timestamp'].dt.hour
events['min'] = events['timestamp'].dt.minute
events['day'] = events['timestamp'].dt.day


I fix latitude and longitude. There are many (0, 0) values, which appear when there was a measurement error. I'll replace these values with the mode of that device_id's coordinates, provided the mode is not 0:

In [None]:
# Fix longitude and latitude.
# Candidate devices: those with at least one event whose longitude AND latitude
# both lie strictly inside (-5, 5).
ids_to_fix = list(events.loc[(events.longitude < 5) & (events.latitude < 5) & (events.latitude > -5) & (events.longitude > -5),'device_id'].unique())
print("total entries to fix: ", len(ids_to_fix))
print("progress ... ")

counter = 0

for i in ids_to_fix:

    # Compute the device mask once per device instead of once per statement.
    device_rows = events.device_id == i
    all_long = events.loc[device_rows, 'longitude']
    all_lat = events.loc[device_rows, 'latitude']

    # Coordinates strictly outside [-2, 2] are treated as real measurements.
    valid_long = all_long[(all_long > 2) | (all_long < -2)]
    valid_lat = all_lat[(all_lat > 2) | (all_lat < -2)]

    if len(valid_long) > 0 and len(valid_lat) > 0:
        # Series.mode() is sorted, so .iloc[0] is the smallest of the most
        # frequent values (the same tie-break as scipy.stats.mode).
        mode_long = float(valid_long.mode().iloc[0])
        mode_lat = float(valid_lat.mode().iloc[0])

        # Overwrite only the near-zero readings (strictly inside (-2, 2)).
        events.loc[device_rows & (events.longitude < 2) & (events.longitude > -2), 'longitude'] = mode_long
        events.loc[device_rows & (events.latitude < 2) & (events.latitude > -2), 'latitude'] = mode_lat
    else:
        # No usable reading on at least one axis: zero out every event of the
        # device. This explicit branch replaces a bare `except:` that was
        # being used as control flow and also swallowed unrelated errors.
        events.loc[device_rows, 'longitude'] = 0
        events.loc[device_rows, 'latitude'] = 0

    counter = counter + 1
    if counter % 5000 == 0:
        print(counter)

#events.to_csv('.\\transformed_data\\events.csv',index=False)

In [18]:
# Preview the first five event rows, including the derived time columns.
events.head(n=5)

Unnamed: 0_level_0,device_id,timestamp,longitude,latitude,hour,min,day
event_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,29182687948017175,2016-05-01 00:55:25,121.38,31.24,0,55,1
2,-6401643145415154744,2016-05-01 00:54:12,103.65,30.97,0,54,1
3,-4833982096941402721,2016-05-01 00:08:05,106.6,29.7,0,8,1
4,-6815121365017318426,2016-05-01 00:06:40,104.27,23.28,0,6,1
5,-5373797595892518570,2016-05-01 00:07:18,115.88,28.66,0,7,1


In [20]:
# Uncomment the lines below to load a previously saved, already-transformed
# copy of the events table instead of recomputing it.

#del events
#events = pd.read_csv('.\\transformed_data\\events.csv')
#events.head()

Unnamed: 0,device_id,timestamp,longitude,latitude,hour,min,day
0,29182687948017175,2016-05-01 00:55:25,121.38,31.24,0,55,1
1,-6401643145415154744,2016-05-01 00:54:12,103.65,30.97,0,54,1
2,-4833982096941402721,2016-05-01 00:08:05,106.6,29.7,0,8,1
3,-6815121365017318426,2016-05-01 00:06:40,104.27,23.28,0,6,1
4,-5373797595892518570,2016-05-01 00:07:18,115.88,28.66,0,7,1


In [21]:
def replace (row):
    """Set ``row['std']`` to 0 when ``amax`` is not -1 but ``std`` is -1.

    Expects a mapping/row with 'amax' and 'std' entries and returns the same
    (mutated) row. NOTE(review): not called by any cell shown in this
    notebook; kept so existing references keep working.
    """
    if row['amax'] != -1 and row['std']==-1:
        row['std']=0
    return row


def _device_stats(grouped, column, prefix, std_fill, zero_to_missing=False):
    """Aggregate one column per device and prefix the resulting column names.

    Parameters
    ----------
    grouped : the events frame grouped by device_id.
    column : name of the events column to aggregate.
    prefix : prefix for the output columns (``<prefix>_max``, ``<prefix>_std``, ...).
    std_fill : value that replaces a NaN standard deviation.
    zero_to_missing : when True, every exact 0 anywhere in the aggregated frame
        is replaced by -1 (applied after the std fill).

    Returns
    -------
    DataFrame with device_id plus max, min, mean, std, median and mode columns.
    The mode column still holds the raw ``scipy.stats.mode`` result.
    """
    summary = grouped[column].agg([np.max, np.min, np.mean, np.std, np.median, stats.mode]).reset_index()
    # Assign the result back instead of calling fillna(inplace=True) on a
    # column selection, which is chained in-place mutation.
    summary['std'] = summary['std'].fillna(std_fill)
    if zero_to_missing:
        summary = summary.replace(to_replace=0, value=-1)
    # np.max/np.min are labelled 'amax'/'amin' or 'max'/'min' depending on the
    # installed library versions; keys that are absent are ignored by rename.
    return summary.rename(columns={
        'amax': prefix + '_max', 'max': prefix + '_max',
        'amin': prefix + '_min', 'min': prefix + '_min',
        'mean': prefix + '_mean',
        'median': prefix + '_median',
        'std': prefix + '_std',
        'mode': prefix + '_mode',
    })


gd = events.groupby('device_id')

# Coordinates: NaN std becomes -1 and exact zeros become -1.
longitude = _device_stats(gd, 'longitude', 'long', std_fill=-1, zero_to_missing=True)
latitude = _device_stats(gd, 'latitude', 'lat', std_fill=-1, zero_to_missing=True)

# Time parts: NaN std becomes 0; zeros are left as they are.
hour = _device_stats(gd, 'hour', 'hour', std_fill=0)
minute = _device_stats(gd, 'min', 'min', std_fill=0)
day = _device_stats(gd, 'day', 'day', std_fill=0)

# Disabled: no 'month' column is derived in the cells above.
#month = gd['month'].agg([np.max, np.min, np.mean, np.std, np.median]).reset_index()
#month = month.rename(columns={'amax':'month_max','amin':'month_min','mean':'month_mean','median':'month_median','std':'month_std'})
#month['month_std'].fillna(0,inplace=True)

In [22]:
def _mode_value(mode_result):
    """Return the first element of a stats.mode result as a float, or -1 if it is 0."""
    value = float(mode_result[0])
    if value != 0:
        return value
    return -1


# Replace each raw stats.mode result with a plain number.
longitude['long_mode'] = longitude['long_mode'].map(_mode_value)
latitude['lat_mode'] = latitude['lat_mode'].map(_mode_value)
hour['hour_mode'] = hour['hour_mode'].map(_mode_value)
minute['min_mode'] = minute['min_mode'].map(_mode_value)
day['day_mode'] = day['day_mode'].map(_mode_value)

In [23]:
# Drop the raw events frame; the later cells shown here only use the
# per-device aggregates derived from it.
del events

In [24]:
# Show a bounded preview rather than dumping the whole per-device frame.
latitude.head(10)

Unnamed: 0,device_id,lat_max,lat_min,lat_mean,lat_std,lat_median,lat_mode
0,-9222956879900151005,23.19,23.19,23.190000,-1.000000,23.190,23.19
1,-9222661944218806987,-1.00,-1.00,-1.000000,-1.000000,-1.000,-1.00
2,-9222399302879214035,-1.00,-1.00,-1.000000,-1.000000,-1.000,-1.00
3,-9221825537663503111,34.92,33.46,34.211818,0.652633,33.630,34.92
4,-9221767098072603291,-1.00,-1.00,-1.000000,-1.000000,-1.000,-1.00
5,-9221079146476055829,-1.00,-1.00,-1.000000,-1.000000,-1.000,-1.00
6,-9221026417907250887,30.89,30.87,30.871361,0.003993,30.870,30.87
7,-9220830859283101130,-1.00,-1.00,-1.000000,-1.000000,-1.000,-1.00
8,-9220452176650064280,-1.00,-1.00,-1.000000,-1.000000,-1.000,-1.00
9,-9220329415676028483,-1.00,-1.00,-1.000000,-1.000000,-1.000,-1.00


In [25]:
# (device, app) pairs that belong to a train device.
d = deviceapps.dropna(subset=['trainrow'])
Xtr_app = csr_matrix(
    (np.ones(len(d)), (d['trainrow'], d['app'])),
    shape=(len(train), napps),
)

# (device, app) pairs that belong to a test device.
d = deviceapps.dropna(subset=['testrow'])
Xte_app = csr_matrix(
    (np.ones(len(d)), (d['testrow'], d['app'])),
    shape=(len(test), napps),
)

print('Apps data: train shape {}, test shape {}'.format(Xtr_app.shape, Xte_app.shape))

Apps data: train shape (74645, 19237), test shape (112071, 19237)


In [26]:
# Optional: reload app labels from a transformed copy instead.
#applabels = pd.read_csv('./transformed_data/app_labels.csv')
applabels.head(n=5)

Unnamed: 0,app_id,label_id
0,7324884708820027918,251
1,-4494216993218550286,251
2,6058196446775239644,406
3,6058196446775239644,407
4,8694625920731541625,406


In [28]:
# Keep only labels of apps that occur in app_events. The explicit .copy()
# makes the filtered frame independent, so the column assignments below do not
# write to a slice of the original (SettingWithCopyWarning).
applabels = applabels.loc[applabels.app_id.isin(appevents.app_id.unique())].copy()

# Reuse the app encoder so `app` codes match the ones in appevents/deviceapps.
applabels['app'] = appencoder.transform(applabels.app_id)

# Integer-encode the label ids; `nlabels` is the number of distinct labels kept.
labelencoder = LabelEncoder().fit(applabels.label_id)
applabels['label'] = labelencoder.transform(applabels.label_id)
nlabels = len(labelencoder.classes_)

In [29]:
# Number of distinct label ids among the apps that occur in app_events.
nlabels

492

In [30]:
# Step 1: expand every (device, app) pair into one row per label of that app
# (default merge: inner join on the shared 'app' column).
device_app_labels = deviceapps[['device_id', 'app']].merge(applabels[['app', 'label']])

# Step 2: `size` = number of such rows per (device_id, label) pair.
label_counts = device_app_labels.groupby(['device_id', 'label'])['app'].agg(['size'])
del device_app_labels

# Step 3: attach train/test row positions (NaN when the device is absent from
# that table) and flatten the index back into columns.
devicelabels = label_counts.merge(train[['trainrow']], how='left', left_index=True, right_index=True)
devicelabels = devicelabels.merge(test[['testrow']], how='left', left_index=True, right_index=True)
devicelabels = devicelabels.reset_index()
del label_counts

devicelabels.head()

Unnamed: 0,device_id,label,size,trainrow,testrow
0,-9222956879900151005,117,1,21594.0,
1,-9222956879900151005,120,1,21594.0,
2,-9222956879900151005,126,1,21594.0,
3,-9222956879900151005,138,2,21594.0,
4,-9222956879900151005,147,2,21594.0,


In [31]:
# (device, label) pairs that belong to a train device.
d = devicelabels.dropna(subset=['trainrow'])
Xtr_label = csr_matrix(
    (np.ones(len(d)), (d['trainrow'], d['label'])),
    shape=(len(train), nlabels),
)

# (device, label) pairs that belong to a test device.
d = devicelabels.dropna(subset=['testrow'])
Xte_label = csr_matrix(
    (np.ones(len(d)), (d['testrow'], d['label'])),
    shape=(len(test), nlabels),
)
print('Labels data: train shape {}, test shape {}'.format(Xtr_label.shape, Xte_label.shape))

Labels data: train shape (74645, 492), test shape (112071, 492)


In [32]:
# Concatenate the four sparse feature groups column-wise, in the same order
# for train and test.
train_blocks = (Xtr_brand, Xtr_model, Xtr_app, Xtr_label)
test_blocks = (Xte_brand, Xte_model, Xte_app, Xte_label)
X_train = hstack(train_blocks, format='csr')
X_test = hstack(test_blocks, format='csr')
print('All features: train shape {}, test shape {}'.format(X_train.shape, X_test.shape))

All features: train shape (74645, 21527), test shape (112071, 21527)


In [33]:
# Integer-encode the demographic group column as the classification target.
targetencoder = LabelEncoder()
targetencoder.fit(train.group)
Y_train = targetencoder.transform(train.group)
nclasses = len(targetencoder.classes_)

Now that I have merged all sparse matrices, I do feature selection (f_classif). I test which percentile works best with Logistic Regression:

In [34]:
# Loop-invariant inputs: the full stacked matrices and the encoded target are
# built once instead of once per percentile.
X_train_all = hstack((Xtr_brand, Xtr_model, Xtr_app, Xtr_label), format='csr')
X_test_all = hstack((Xte_brand, Xte_model, Xte_app, Xte_label), format='csr')
print('All features: train shape {}, test shape {}'.format(X_train_all.shape, X_test_all.shape))

targetencoder = LabelEncoder().fit(train.group)
Y_train = targetencoder.transform(train.group)
nclasses = len(targetencoder.classes_)

best_search = None
best_selector = None
best_percentile = None

for i in (20,21,22):

    # Keep the top i% of features ranked by the ANOVA F-score.
    candidate_selector = SelectPercentile(f_classif, percentile=i)
    candidate_selector.fit(X_train_all, Y_train)
    X_train_selected = candidate_selector.transform(X_train_all)

    lg = LogisticRegression(random_state=23, fit_intercept=True, class_weight= None,multi_class='multinomial',solver='lbfgs')
    param_grid = {'C': [0.01,0.02]}
    # error_score='raise': the log-loss scores are negative (greater is
    # better), so scoring a failed fit as 0 would make it look like the best
    # candidate. Fail loudly instead.
    candidate_search = grid_search.GridSearchCV(estimator = lg, param_grid = param_grid, cv = 5, n_jobs=-1, scoring='log_loss', error_score='raise', verbose=2)
    candidate_search.fit(X_train_selected, Y_train)
    print("Feature sel %",i)
    print("# Num of Features: ", X_train_selected.shape[1])
    print("Best parameters found by grid search:", candidate_search.best_params_)
    print("Best CV score:", candidate_search.best_score_)
    print("All CV scores:", candidate_search.grid_scores_)

    # Remember the best-scoring percentile; on a tie the first one wins.
    if best_search is None or candidate_search.best_score_ > best_search.best_score_:
        best_search = candidate_search
        best_selector = candidate_selector
        best_percentile = i

# Expose the BEST percentile (not merely the last one tried) under the names
# the following cells use: selector, lg_gs, X_train and X_test.
selector = best_selector
lg_gs = best_search
X_train = selector.transform(X_train_all)
X_test = selector.transform(X_test_all)
print("Selected feature sel %", best_percentile)


All features: train shape (74645, 21527), test shape (112071, 21527)




Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  2.9min finished


Feature sel % 23
# Num of Features:  4951
Best parameters found by grid search: {'C': 0.02}
Best CV score: -2.28075868789
Best CV score: [mean: -2.28317, std: 0.03912, params: {'C': 0.01}, mean: -2.28076, std: 0.04088, params: {'C': 0.02}, mean: -2.28296, std: 0.04120, params: {'C': 0.03}, mean: -2.29025, std: 0.04074, params: {'C': 0.05}]
All features: train shape (74645, 21527), test shape (112071, 21527)




Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  2.9min finished


Feature sel % 22
# Num of Features:  4735
Best parameters found by grid search: {'C': 0.02}
Best CV score: -2.28074957969
Best CV score: [mean: -2.28322, std: 0.03910, params: {'C': 0.01}, mean: -2.28075, std: 0.04082, params: {'C': 0.02}, mean: -2.28286, std: 0.04135, params: {'C': 0.03}, mean: -2.28961, std: 0.04070, params: {'C': 0.05}]
All features: train shape (74645, 21527), test shape (112071, 21527)




Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  2.8min finished


Feature sel % 24
# Num of Features:  5166
Best parameters found by grid search: {'C': 0.02}
Best CV score: -2.28077968275
Best CV score: [mean: -2.28317, std: 0.03913, params: {'C': 0.01}, mean: -2.28078, std: 0.04086, params: {'C': 0.02}, mean: -2.28290, std: 0.04129, params: {'C': 0.03}, mean: -2.29048, std: 0.04156, params: {'C': 0.05}]
All features: train shape (74645, 21527), test shape (112071, 21527)




Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  3.0min finished


Feature sel % 25
# Num of Features:  5381
Best parameters found by grid search: {'C': 0.02}
Best CV score: -2.28075333319
Best CV score: [mean: -2.28317, std: 0.03912, params: {'C': 0.01}, mean: -2.28075, std: 0.04084, params: {'C': 0.02}, mean: -2.28283, std: 0.04125, params: {'C': 0.03}, mean: -2.29042, std: 0.04089, params: {'C': 0.05}]


Now I calculate the probs for every group and join these results with long, lat and time info to run a random forest for final predictions

In [35]:
# Class probabilities from the fitted grid search for the train devices: one
# column per group, with device_id moved from the index into a column.
train_probabilities = lg_gs.predict_proba(X_train)
pred_train = pd.DataFrame(
    train_probabilities,
    index=train.index,
    columns=targetencoder.classes_,
).reset_index()
print(pred_train.shape)
pred_train.head()

(74645, 13)


Unnamed: 0,device_id,F23-,F24-26,F27-28,F29-32,F33-42,F43+,M22-,M23-26,M27-28,M29-31,M32-38,M39+
0,-8076087639492063270,0.046878,0.065165,0.042776,0.062275,0.056507,0.043436,0.091346,0.165146,0.098118,0.102053,0.132843,0.093459
1,-2897161552818060146,0.046878,0.065165,0.042776,0.062275,0.056507,0.043436,0.091346,0.165146,0.098118,0.102053,0.132843,0.093459
2,-8260683887967679142,0.129003,0.065807,0.114101,0.065997,0.036978,0.03479,0.027287,0.107192,0.036948,0.067977,0.189286,0.124635
3,-4938849341048082022,0.059445,0.059776,0.042987,0.066272,0.077406,0.058205,0.102712,0.122706,0.078157,0.100491,0.124623,0.10722
4,245133531816851882,0.062773,0.070972,0.050278,0.062266,0.071374,0.044189,0.132311,0.142678,0.080909,0.098209,0.097001,0.087039


In [36]:
# Same as for train: class probabilities for the test devices, one column per
# group, with device_id moved from the index into a column.
test_probabilities = lg_gs.predict_proba(X_test)
pred_test = pd.DataFrame(
    test_probabilities,
    index=test.index,
    columns=targetencoder.classes_,
).reset_index()
print(pred_test.shape)
pred_test.head()

(112071, 13)


Unnamed: 0,device_id,F23-,F24-26,F27-28,F29-32,F33-42,F43+,M22-,M23-26,M27-28,M29-31,M32-38,M39+
0,1002079943728939269,0.001406,0.005902,0.013673,0.01322,0.024521,0.044835,0.011796,0.032044,0.074043,0.120894,0.257728,0.399938
1,-1547860181818787117,0.007748,0.013682,0.030964,0.059669,0.071888,0.150966,0.005917,0.090234,0.053776,0.076563,0.230768,0.207826
2,7374582448058474277,0.023906,0.037819,0.036153,0.15827,0.165869,0.080492,0.01334,0.026618,0.043251,0.104368,0.180009,0.129906
3,-6220210354783429585,0.003138,0.034009,0.009404,0.012093,0.055049,0.185755,0.041355,0.12172,0.067024,0.089099,0.175942,0.205411
4,-5893464122623104785,0.046878,0.065165,0.042776,0.062275,0.056507,0.043436,0.091346,0.165146,0.098118,0.102053,0.132843,0.093459


In [37]:
def _attach_device_stats(predictions, stat_frames):
    """Left-join per-device statistics onto a prediction frame.

    Parameters
    ----------
    predictions : frame with a device_id column plus one probability column per class.
    stat_frames : per-device frames, each with a device_id column, merged in
        the order given.

    Returns
    -------
    Feature frame with the same number and order of rows as `predictions`,
    missing values filled with -1 and the device_id column removed.
    """
    merged = predictions
    for stat_frame in stat_frames:
        merged = pd.merge(merged, stat_frame, how='left', on='device_id')
    # The rows must stay aligned one-to-one with the input (Y_train is matched
    # to X_train by position), so a duplicated device_id must not add rows.
    assert len(merged) == len(predictions), "merge changed the number of rows"
    # Keyword `axis=1`: the positional form drop(label, 1) is rejected by pandas 2.x.
    return merged.fillna(-1).drop('device_id', axis=1)


# Same merge order as before: hour, minute, longitude, day, latitude.
device_stat_frames = (hour, minute, longitude, day, latitude)

X_train = _attach_device_stats(pred_train, device_stat_frames)
X_test = _attach_device_stats(pred_test, device_stat_frames)

del device_stat_frames

In [38]:
# List the columns of the merged test feature frame.
X_test.columns

Index(['F23-', 'F24-26', 'F27-28', 'F29-32', 'F33-42', 'F43+', 'M22-',
       'M23-26', 'M27-28', 'M29-31', 'M32-38', 'M39+', 'hour_max', 'hour_min',
       'hour_mean', 'hour_std', 'hour_median', 'hour_mode', 'min_max',
       'min_min', 'min_mean', 'min_std', 'min_median', 'min_mode', 'long_max',
       'long_min', 'long_mean', 'long_std', 'long_median', 'long_mode',
       'day_max', 'day_min', 'day_mean', 'day_std', 'day_median', 'day_mode',
       'lat_max', 'lat_min', 'lat_mean', 'lat_std', 'lat_median', 'lat_mode'],
      dtype='object')

In [39]:
# Release the per-device aggregates and the probability frames in one statement.
del hour, minute, longitude, latitude, pred_train, pred_test, day

I test several tree ensembles and choose the one that performs best using CV (log_loss)

In [53]:
# Feature selection to remove noise: fit a forest, rank the columns by
# importance, and mark everything after the first 28 for removal.
rfc = RandomForestClassifier(n_estimators=400, n_jobs=-1, random_state=23, min_samples_leaf=35)
rfc.fit(X_train, Y_train)

# Importances are rounded to 4 decimals before ranking; the stable sort keeps
# the original column order among equal rounded values.
rounded_importances = [round(importance, 4) for importance in rfc.feature_importances_]
features_lb = sorted(
    zip(rounded_importances, X_train.columns),
    key=lambda pair: pair[0],
    reverse=True,
)
feature = [column for _, column in features_lb]
get_rid = feature[28:]
get_rid

['lat_min',
 'long_max',
 'long_mean',
 'long_median',
 'long_mode',
 'lat_mode',
 'long_min',
 'lat_max',
 'lat_mean',
 'lat_median',
 'day_max',
 'day_min',
 'long_std',
 'lat_std']

In [56]:
# Remove the low-importance columns. Keyword `axis=1` is used because the
# positional form drop(labels, 1) is rejected by pandas 2.x.
X_train = X_train.drop(get_rid, axis=1)
X_test = X_test.drop(get_rid, axis=1)

print("\nTraining Ensamble Random Forest ....")
rfc = RandomForestClassifier(n_jobs=-1, random_state=23)
param_grid = {'n_estimators':[250],'min_samples_leaf': [40,45]}
# error_score='raise': the log-loss scores are negative (greater is better),
# so scoring a failed fit as 0 would make it look like the best candidate.
rfc_gs = grid_search.GridSearchCV(estimator = rfc, param_grid = param_grid, cv = 5, n_jobs=-1, scoring='log_loss', error_score='raise', verbose=2)
rfc_gs.fit(X_train, Y_train)
print("Best parameters found by grid search:", rfc_gs.best_params_)
print("Best CV score:", rfc_gs.best_score_)
print("All CV scores:", rfc_gs.grid_scores_)

# Scores noted from earlier runs:
# 2.25664 [mean: -2.22787, std: 0.04694, params: {'min_samples_leaf': 60, 'n_estimators': 200}]
#-2.20669923957


Training Ensamble Random Forest ....
Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  2.1min finished


Best parameters found by grid search: {'min_samples_leaf': 45, 'n_estimators': 250}
Best CV score: -2.20609545925
Best CV score: [mean: -2.20617, std: 0.05231, params: {'min_samples_leaf': 40, 'n_estimators': 250}, mean: -2.20610, std: 0.05186, params: {'min_samples_leaf': 45, 'n_estimators': 250}]


In [None]:
print("\nTraining Ensamble Adaboost ....")
abc = AdaBoostClassifier(n_estimators=50, random_state=3)
param_grid = {'learning_rate':[0.01]}
# error_score='raise': the log-loss scores are negative (greater is better),
# so scoring a failed fit as 0 would make it look like the best candidate.
abc_gs = grid_search.GridSearchCV(estimator = abc, param_grid = param_grid, cv = 5, n_jobs=-1, scoring='log_loss', error_score='raise', verbose=2)
abc_gs.fit(X_train, Y_train)
print("Best parameters found by grid search:", abc_gs.best_params_)
print("Best CV score:", abc_gs.best_score_)
print("All CV scores:", abc_gs.grid_scores_)

In [None]:
print("\nTraining Ensamble Gradient Boosting ....")
gbc = GradientBoostingClassifier(random_state=3, n_estimators = 200, max_features ='auto', subsample=0.9)
param_grid = {'min_samples_leaf':[10,50,100],'learning_rate':[0.01]}
# error_score='raise': the log-loss scores are negative (greater is better),
# so scoring a failed fit as 0 would make it look like the best candidate.
gbc_gs = grid_search.GridSearchCV(estimator = gbc, param_grid = param_grid, cv = 5, n_jobs=-1, scoring='log_loss', error_score='raise', verbose=2)
gbc_gs.fit(X_train, Y_train)
print("Best parameters found by grid search:", gbc_gs.best_params_)
print("Best CV score:", gbc_gs.best_score_)
print("All CV scores:", gbc_gs.grid_scores_)


Training Ensamble Gradient Boosting ....
Fitting 5 folds for each of 3 candidates, totalling 15 fits


In [None]:
print("\nTraining Extra Trees Classifier ....")
etc = ExtraTreesClassifier(n_estimators=200, criterion='gini', max_features='auto', n_jobs=-1, random_state=23)
param_grid = {'min_samples_leaf':[2,10,30,45]}
# error_score='raise': the log-loss scores are negative (greater is better),
# so scoring a failed fit as 0 would make it look like the best candidate.
etc_gs = grid_search.GridSearchCV(estimator = etc, param_grid = param_grid, cv = 5, n_jobs=-1, scoring='log_loss', error_score='raise', verbose=2)
etc_gs.fit(X_train, Y_train)
print("Best parameters found by grid search:", etc_gs.best_params_)
print("Best CV score:", etc_gs.best_score_)
print("All CV scores:", etc_gs.grid_scores_)

In [57]:
# Final class probabilities from the random-forest grid search for the test
# devices: one column per group, with device_id as a regular column.
final_probabilities = rfc_gs.predict_proba(X_test)
pred = pd.DataFrame(
    final_probabilities,
    index=test.index,
    columns=targetencoder.classes_,
).reset_index()
print(pred.shape)
pred.head()

(112071, 13)


Unnamed: 0,device_id,F23-,F24-26,F27-28,F29-32,F33-42,F43+,M22-,M23-26,M27-28,M29-31,M32-38,M39+
0,1002079943728939269,0.003436,0.005227,0.008564,0.015283,0.040488,0.037504,0.010398,0.031193,0.046525,0.100406,0.231091,0.469885
1,-1547860181818787117,0.011409,0.019589,0.024976,0.052607,0.078563,0.146865,0.02511,0.056067,0.051568,0.092691,0.28099,0.159564
2,7374582448058474277,0.017278,0.037337,0.045107,0.174493,0.224904,0.09784,0.021488,0.031194,0.036185,0.078063,0.128999,0.107113
3,-6220210354783429585,0.01637,0.022319,0.021027,0.042148,0.06501,0.127933,0.049979,0.11864,0.074297,0.11369,0.174256,0.174331
4,-5893464122623104785,0.029765,0.088116,0.043785,0.062717,0.035126,0.032274,0.066686,0.180191,0.119594,0.106891,0.140035,0.094821


In [58]:
# Build the output path with os.path.join, as the input paths are; the former
# '.\\submissions\\...' literal is only a directory path on Windows.
submission_dir = os.path.join('.', 'submissions')
# Create the output directory if it does not exist yet.
os.makedirs(submission_dir, exist_ok=True)
pred.to_csv(os.path.join(submission_dir, 'sub_rf.csv'), index=False)