In [34]:
%matplotlib inline

import numpy as np
import pandas as pd
import scipy as sp

from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split, cross_val_score, StratifiedKFold
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import log_loss

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

np.random.seed(44)

In [2]:
# load files
train = pd.read_csv('../processed/train_processed.csv')
test = pd.read_csv('../processed/test_processed.csv')

In [4]:
lbl = LabelEncoder()

lbl.fit(train.group)
train['group'] = lbl.transform(train.group)

lbl.fit(pd.concat([train.device_model, test.device_model], axis=0))
train['device_model'] = lbl.transform(train.device_model)
test['device_model'] = lbl.transform(test.device_model)

In [6]:
features = train.columns.drop('group')

X =  train[features]
y = train.group

## Split into training and test set

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.25, random_state=0)

In [9]:
print X_train.shape, X_test.shape, y_train.shape, y_test.shape

(55983, 136) (18662, 136) (55983L,) (18662L,)


In [10]:
X_train.columns

Index([u'device_id', u'device_model', u'E人E本', u'E派', u'HTC', u'LG', u'LOGO',
       u'Lovme', u'MIL', u'OPPO',
       ...
       u'青葱', u'飞利浦', u'飞秒', u'首云', u'魅族', u'鲜米', u'黑米', u'count_device_model',
       u'count_events', u'log_count_events'],
      dtype='object', length=136)

## Train the model

In [33]:
# tree = DecisionTreeClassifier(max_depth=4)
# tree.fit(X_train, y_train)

tree = RandomForestClassifier(n_estimators=100, max_depth=8, criterion='entropy', max_features='sqrt', n_jobs=-1)
tree.fit(X_train, y_train)

print 'Log Loss on the training set %f ' %(log_loss(y_train, tree.predict_proba(X_train)))
print 'Log Loss on the test set %f ' %(log_loss(y_test, tree.predict_proba(X_test)))

Log Loss on the training set 2.358141 
Log Loss on the test set 2.398738 


### Tuning Parameters

In [35]:
tuned_parameters = {
        'n_estimators': [10, 50, 100],
        'max_depth': [5, 10, 15],
        'min_samples_leaf': [1, 3, 5],
        'max_features': ['auto', 'sqrt', 'log2']
    }

skf = StratifiedKFold(y=y_train, random_state=1)
clf = GridSearchCV(estimator=RandomForestClassifier(n_jobs=-1), param_grid=tuned_parameters, cv=skf, scoring='log_loss')
clf.fit(X_train, y_train)

GridSearchCV(cv=sklearn.cross_validation.StratifiedKFold(labels=[ 9  7 ..., 11 10], n_folds=3, shuffle=False, random_state=1),
       error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [10, 50, 100], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [5, 10, 15], 'min_samples_leaf': [1, 3, 5]},
       pre_dispatch='2*n_jobs', refit=True, scoring='log_loss', verbose=0)

In [36]:
print clf.grid_scores_
print 
print clf.best_params_
print 
print clf.best_score_

[mean: -2.40871, std: 0.00042, params: {'max_features': 'auto', 'n_estimators': 10, 'max_depth': 5, 'min_samples_leaf': 1}, mean: -2.40757, std: 0.00061, params: {'max_features': 'auto', 'n_estimators': 50, 'max_depth': 5, 'min_samples_leaf': 1}, mean: -2.40801, std: 0.00046, params: {'max_features': 'auto', 'n_estimators': 100, 'max_depth': 5, 'min_samples_leaf': 1}, mean: -2.40744, std: 0.00194, params: {'max_features': 'auto', 'n_estimators': 10, 'max_depth': 5, 'min_samples_leaf': 3}, mean: -2.40788, std: 0.00047, params: {'max_features': 'auto', 'n_estimators': 50, 'max_depth': 5, 'min_samples_leaf': 3}, mean: -2.40734, std: 0.00025, params: {'max_features': 'auto', 'n_estimators': 100, 'max_depth': 5, 'min_samples_leaf': 3}, mean: -2.41019, std: 0.00138, params: {'max_features': 'auto', 'n_estimators': 10, 'max_depth': 5, 'min_samples_leaf': 5}, mean: -2.40764, std: 0.00053, params: {'max_features': 'auto', 'n_estimators': 50, 'max_depth': 5, 'min_samples_leaf': 5}, mean: -2.4073

In [37]:
print 'Log loss on unseen data %f '%(log_loss(y_test, clf.best_estimator_.predict_proba(X_test)))

Log loss on unseen data 2.395194 


In [94]:
clf.best_estimator_.fit(X, y)

predictions = clf.best_estimator_.predict_proba(test)

In [95]:
submission = pd.read_csv('../data/sample_submission.csv'); submission.head()

Unnamed: 0,device_id,F23-,F24-26,F27-28,F29-32,F33-42,F43+,M22-,M23-26,M27-28,M29-31,M32-38,M39+
0,1002079943728939269,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833
1,-1547860181818787117,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833
2,7374582448058474277,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833
3,-6220210354783429585,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833
4,-5893464122623104785,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833


In [96]:
submission[submission.columns[1:]] = predictions

In [97]:
submission.head()

Unnamed: 0,device_id,F23-,F24-26,F27-28,F29-32,F33-42,F43+,M22-,M23-26,M27-28,M29-31,M32-38,M39+
0,1002079943728939269,0.04818,0.049462,0.03971,0.070945,0.077589,0.057783,0.081051,0.12996,0.072653,0.109312,0.139821,0.123534
1,-1547860181818787117,0.052818,0.049582,0.034941,0.061876,0.072086,0.073516,0.076712,0.127446,0.0696,0.107155,0.14202,0.13225
2,7374582448058474277,0.088844,0.063656,0.063119,0.057051,0.06699,0.056965,0.110054,0.094622,0.065683,0.085074,0.127268,0.120674
3,-6220210354783429585,0.041442,0.043341,0.02882,0.057614,0.066906,0.066377,0.065847,0.119215,0.076738,0.119894,0.163777,0.15003
4,-5893464122623104785,0.042922,0.077138,0.044311,0.068691,0.03826,0.03556,0.080273,0.172094,0.112059,0.113597,0.12761,0.087485


In [98]:
submission.to_csv('../submissions/decision_tree_classifier_2.395680.csv', index=False)