In [80]:
%matplotlib inline

import numpy as np
import pandas as pd
import scipy as sp

from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import log_loss

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# Silence every warning for the rest of the session (this also hides
# deprecation notices emitted by the libraries imported above).
warnings.filterwarnings(action='ignore')

# Seed NumPy's global random generator.
SEED = 44
np.random.seed(SEED)

In [72]:
# Read the pre-processed train / test tables from disk.
train_csv = '../processed/train_processed.csv'
test_csv = '../processed/test_processed.csv'

train = pd.read_csv(train_csv)
test = pd.read_csv(test_csv)

In [73]:
# Encode the target with its own encoder so the fitted class order stays
# available (group_lbl.classes_ / group_lbl.inverse_transform) after the
# device-model encoder below is fitted. Previously a single encoder was
# refitted, which discarded the group mapping.
group_lbl = LabelEncoder()
train['group'] = group_lbl.fit_transform(train.group)

# Fit the device-model encoder on train and test together so that every
# model string present in either frame receives a code. `lbl` keeps its
# original name and, as before, ends up holding the device_model mapping.
lbl = LabelEncoder()
lbl.fit(pd.concat([train.device_model, test.device_model], axis=0))
train['device_model'] = lbl.transform(train.device_model)
test['device_model'] = lbl.transform(test.device_model)

In [74]:
# Columns fed to the model, in the order the estimator will see them.
features = [
    'count_device_model',
    'device_id',
    'device_model',
    'log_count_events',
]

# Feature matrix and (already label-encoded) target taken from the training frame.
X = train[features]
y = train['group']

In [75]:
# Hold out 25% of the labelled rows for validation; stratifying on y keeps
# the class proportions the same in both parts, and the fixed random_state
# makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.25,
    stratify=y,
    random_state=0,
)

In [76]:
# Sanity-check the sizes of the four pieces produced by the split.
# A single pre-formatted string prints the same text as the comma-separated
# print statement while also being valid under Python 3.
print('%s %s %s %s' % (X_train.shape, X_test.shape, y_train.shape, y_test.shape))

(55983, 4) (18662, 4) (55983L,) (18662L,)


In [77]:
# Display the column index of the training feature matrix.
X_train.columns

Index([u'count_device_model', u'device_id', u'device_model',
       u'log_count_events'],
      dtype='object')

## Train the model

In [86]:
# Fit a random forest on the training split. An earlier baseline used
# DecisionTreeClassifier(max_depth=4); the variable keeps the name `tree`
# because later cells refer to it.
tree = RandomForestClassifier(
    n_estimators=25,
    max_depth=7,
    n_jobs=-1,
)
tree.fit(X_train, y_train)

# Report multi-class log loss on the data the model was fitted on and on the
# held-out split.
train_loss = log_loss(y_train, tree.predict_proba(X_train))
print('Log Loss on the training set %f ' % train_loss)

test_loss = log_loss(y_test, tree.predict_proba(X_test))
print('Log Loss on the test set %f ' % test_loss)

Log Loss on the training set 2.367293 
Log Loss on the test set 2.397258 


### Tuning Parameters

In [None]:
# Hyper-parameter grid to search: 3 * 3 * 2 * 2 = 36 combinations.
tuned_parameters = dict(
    n_estimators=[10, 50, 100],
    max_depth=[3, 10, 15],
    min_samples_split=[2, 4],
    min_samples_leaf=[1, 3],
)

# Exhaustive search with 3-fold cross-validation on the training split,
# scored with the 'log_loss' scorer.
base_forest = RandomForestClassifier(n_jobs=-1)
clf = GridSearchCV(
    estimator=base_forest,
    param_grid=tuned_parameters,
    scoring='log_loss',
    cv=3,
)
clf.fit(X_train, y_train)

In [65]:
# Refit on every labelled row (X / y), not only the 75% training split, before
# producing the final predictions.
tree.fit(X, y)

# Predict with exactly the columns, in exactly the order, the model was fitted
# on. Passing the whole `test` frame only works if it happens to contain those
# columns and nothing else; otherwise it raises or silently mis-aligns features.
predictions = tree.predict_proba(test[features])

In [67]:
# Load the sample submission to use as a template, then preview it.
submission = pd.read_csv('../data/sample_submission.csv')
submission.head()

Unnamed: 0,device_id,F23-,F24-26,F27-28,F29-32,F33-42,F43+,M22-,M23-26,M27-28,M29-31,M32-38,M39+
0,1002079943728939269,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833
1,-1547860181818787117,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833
2,7374582448058474277,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833
3,-6220210354783429585,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833
4,-5893464122623104785,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833


In [69]:
# Overwrite every column except the first with the predicted probabilities.
# NOTE(review): assumes the prediction columns are in the same order as the
# template's class columns and that rows line up with `test` - confirm.
class_columns = submission.columns[1:]
submission[class_columns] = predictions

In [32]:
# Preview the first rows of the submission frame.
submission.head()

Unnamed: 0,device_id,F23-,F24-26,F27-28,F29-32,F33-42,F43+,M22-,M23-26,M27-28,M29-31,M32-38,M39+
0,1002079943728939269,0.060833,0.051025,0.044302,0.055654,0.059621,0.048821,0.133899,0.147895,0.08089,0.097862,0.121997,0.097201
1,-1547860181818787117,0.060833,0.051025,0.044302,0.055654,0.059621,0.048821,0.133899,0.147895,0.08089,0.097862,0.121997,0.097201
2,7374582448058474277,0.072006,0.05258,0.038608,0.055031,0.075316,0.062079,0.093823,0.119561,0.067839,0.097377,0.133472,0.132308
3,-6220210354783429585,0.052087,0.05235,0.04174,0.074097,0.077867,0.064977,0.084882,0.123816,0.072694,0.101456,0.139512,0.114521
4,-5893464122623104785,0.056184,0.054568,0.036378,0.047898,0.080234,0.063864,0.090744,0.128537,0.069321,0.103678,0.13945,0.129143


In [71]:
# Write the submission without the row index.
# NOTE(review): the file name says "decision_tree_classifier" although the
# estimator fitted in this notebook is a RandomForestClassifier - confirm the
# intended name before re-running.
submission_path = '../submissions/decision_tree_classifier_2.403822.csv'
submission.to_csv(submission_path, index=False)