In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import scipy as sp

from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.cross_validation import train_test_split, cross_val_score, StratifiedKFold
from sklearn.grid_search import GridSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import log_loss

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# Install a catch-all "ignore" filter so no warning is shown for the rest of the session.
warnings.filterwarnings(action='ignore')

# Seed NumPy's global random number generator.
SEED = 44
np.random.seed(SEED)



In [2]:
# Read the pre-processed training and test tables (train first, then test).
train, test = map(
    pd.read_csv,
    ('../processed/train_processed.csv', '../processed/test_processed.csv'),
)

In [4]:
## Replace the categorical columns with integer codes

lbl = LabelEncoder()

# The target column is encoded from the training table alone.
train['group'] = lbl.fit(train.group).transform(train.group)

# For the columns present in both tables, fit on the union of train and test
# values so a value seen in only one of them still receives a code.
# The same encoder object is re-fitted each time, so after this loop `lbl`
# holds the phone_brand mapping (not the group mapping).
for column in ('device_model', 'phone_brand'):
    lbl.fit(pd.concat([train[column], test[column]], axis=0))
    train[column] = lbl.transform(train[column])
    test[column] = lbl.transform(test[column])

In [5]:
# Every column of the training table except the target is a model input.
# NOTE(review): the recorded column listing further down shows `device_id`
# among these inputs; it looks like a row identifier -- confirm it is meant
# to be a feature.
features = train.drop('group', axis=1).columns

X = train.loc[:, features]
y = train['group']

## Split into training and test set

In [6]:
# Hold out a stratified 25% of the labelled rows for local evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=0
)

In [7]:
# Show the shape of each piece of the split on one space-separated line.
print(' '.join(str(part.shape) for part in (X_train, X_test, y_train, y_test)))

(55983, 6) (18662, 6) (55983L,) (18662L,)


In [8]:
# Display the names of the columns that are fed to the model.
X_train.columns

Index([u'device_id', u'phone_brand', u'device_model', u'brand_popularity',
       u'model_popularity', u'event_generated'],
      dtype='object')

## Train the model

In [49]:
# Model under evaluation: a depth-limited random forest.
# Commented-out alternatives for this slot:
#   tree = DecisionTreeClassifier(criterion='gini', max_depth=3, max_features='auto')
#   tree = GradientBoostingClassifier()
#   tree = XGBClassifier()
# (each followed by tree.fit(X_train, y_train))
tree = RandomForestClassifier(
    n_estimators=100,
    max_depth=7,
    max_features='auto',
    n_jobs=-1
)
tree.fit(X_train, y_train)

# Multi-class log loss on the rows the model was fitted on and on the held-out rows.
train_loss = log_loss(y_train, tree.predict_proba(X_train))
print('Log Loss on the training set %f ' % train_loss)

test_loss = log_loss(y_test, tree.predict_proba(X_test))
print('Log Loss on the test set %f ' % test_loss)

Log Loss on the training set 2.362101 
Log Loss on the test set 2.393004 


### Tuning Parameters

In [None]:
# Hyper-parameter grid for the random forest: the number of trees is held at
# 100 while the depth limit and the feature-subsampling rule are varied.
# NOTE(review): this cell has no execution count, and the grid_scores_ output
# recorded further down lists n_estimators of 10/50/100 and several
# min_samples_leaf values that are not in this grid -- that output appears to
# come from an earlier version of the grid; re-run before trusting it.
tuned_parameters = {
        'n_estimators': [100],
        'max_depth': [5, 6, 7, 8, 9, 10],
        'max_features': ['auto', 'sqrt']
    }

# shuffle=True is needed for random_state to have any effect: without it the
# stratified folds are cut in row order and the seed is silently ignored.
skf = StratifiedKFold(y=y_train, shuffle=True, random_state=1)

# Exhaustive search over the grid, scored by log loss on the stratified folds.
clf = GridSearchCV(estimator=RandomForestClassifier(n_jobs=-1), param_grid=tuned_parameters, cv=skf, scoring='log_loss')
clf.fit(X_train, y_train)

In [36]:
# Per-candidate scores and the winning parameters, each followed by a blank
# line, then the best cross-validated score.
for report in (clf.grid_scores_, clf.best_params_):
    print(report)
    print('')
print(clf.best_score_)

[mean: -2.40871, std: 0.00042, params: {'max_features': 'auto', 'n_estimators': 10, 'max_depth': 5, 'min_samples_leaf': 1}, mean: -2.40757, std: 0.00061, params: {'max_features': 'auto', 'n_estimators': 50, 'max_depth': 5, 'min_samples_leaf': 1}, mean: -2.40801, std: 0.00046, params: {'max_features': 'auto', 'n_estimators': 100, 'max_depth': 5, 'min_samples_leaf': 1}, mean: -2.40744, std: 0.00194, params: {'max_features': 'auto', 'n_estimators': 10, 'max_depth': 5, 'min_samples_leaf': 3}, mean: -2.40788, std: 0.00047, params: {'max_features': 'auto', 'n_estimators': 50, 'max_depth': 5, 'min_samples_leaf': 3}, mean: -2.40734, std: 0.00025, params: {'max_features': 'auto', 'n_estimators': 100, 'max_depth': 5, 'min_samples_leaf': 3}, mean: -2.41019, std: 0.00138, params: {'max_features': 'auto', 'n_estimators': 10, 'max_depth': 5, 'min_samples_leaf': 5}, mean: -2.40764, std: 0.00053, params: {'max_features': 'auto', 'n_estimators': 50, 'max_depth': 5, 'min_samples_leaf': 5}, mean: -2.4073

In [37]:
# Score the best estimator found by the grid search on the held-out split.
best_model = clf.best_estimator_
holdout_loss = log_loss(y_test, best_model.predict_proba(X_test))
print('Log loss on unseen data %f ' % holdout_loss)

Log loss on unseen data 2.395194 


In [47]:
# Refit on every labelled row before scoring the competition test table.
# This overwrites the model fitted on X_train, so the log-loss cells above
# must not be re-run after this one without refitting first.
# Commented-out alternative: clf.best_estimator_.fit(X, y)
tree.fit(X, y)

# Select the training columns explicitly: the forest consumes its input
# positionally, so the test matrix must contain exactly the columns of X, in
# the same order. Indexing with `features` guarantees that and raises a
# KeyError if the test table lacks one of them.
predictions = tree.predict_proba(test[features])

In [48]:
# Load the sample submission to use as a template, then preview its first rows.
submission = pd.read_csv('../data/sample_submission.csv')
submission.head()

Unnamed: 0,device_id,F23-,F24-26,F27-28,F29-32,F33-42,F43+,M22-,M23-26,M27-28,M29-31,M32-38,M39+
0,1002079943728939269,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833
1,-1547860181818787117,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833
2,7374582448058474277,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833
3,-6220210354783429585,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833
4,-5893464122623104785,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833


In [49]:
# Overwrite every column after the first with the predicted probabilities.
# NOTE(review): this relies on the template's rows and class columns being in
# the same order as the rows of `test` and the columns of `predictions` --
# nothing here checks that; confirm.
class_columns = submission.columns[1:]
submission[class_columns] = predictions

In [50]:
# Preview the first rows after the probabilities have been filled in.
submission.head()

Unnamed: 0,device_id,F23-,F24-26,F27-28,F29-32,F33-42,F43+,M22-,M23-26,M27-28,M29-31,M32-38,M39+
0,1002079943728939269,0.058821,0.049063,0.04251,0.06212,0.068697,0.054834,0.100907,0.131264,0.076163,0.108989,0.126944,0.119686
1,-1547860181818787117,0.05224,0.04975,0.043025,0.060598,0.070117,0.059207,0.097065,0.128428,0.075037,0.107221,0.132059,0.125253
2,7374582448058474277,0.068733,0.049186,0.034094,0.05679,0.093136,0.058529,0.067624,0.110315,0.060424,0.110032,0.128593,0.162544
3,-6220210354783429585,0.049885,0.052674,0.033831,0.069916,0.082186,0.058712,0.056424,0.13036,0.073977,0.107748,0.144906,0.139381
4,-5893464122623104785,0.059065,0.056753,0.038754,0.059861,0.06359,0.049354,0.105775,0.160211,0.087838,0.097975,0.122886,0.097939


In [51]:
# Write the filled-in submission without the row index.
# NOTE(review): the score embedded in the file name (2.394721) matches neither
# of the log-loss values printed above (2.393004, 2.395194) -- confirm which
# run produced this file.
submission_path = '../submissions/random_forest_classifier_2.394721.csv'
submission.to_csv(submission_path, index=False)