## Model Duration Using Features

In [264]:
from __future__ import division
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
import sklearn.metrics
import scipy
from collections import Counter
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import CountVectorizer
import warnings

warnings.filterwarnings('ignore') #do not display warnings (SettingWithCopyWarning etc.)

%matplotlib inline

In [301]:
df = pd.read_pickle('../data/df.pkl')

In [280]:
df_fixed = df[df['resolution_final'] == 'fixed']
df_fixed.drop(['resolution_final'], axis=1, inplace=True)

In [281]:
df_fixed['duration_bin'] = pd.qcut(df_fixed['duration_days'], 5, labels=[0,1,2,3,4])
df_fixed.drop(['resolution_final'], axis=1, inplace=True)
df_fixed['duration_bin'].value_counts()


0    25791
4    21991
3    21872
2    20664
1    19825
Name: duration_bin, dtype: int64

In [283]:
#df_fixed.columns

In [284]:
df_fixed.head(1).T
df_fixed.shape

(110143, 42)

In [285]:
component_value_cnts = df_fixed['component_init'].value_counts()
df_fixed['component_init'] = df_fixed['component_init'].map(lambda x: x if component_value_cnts[x] > 100 else 'unimportant')

Train test split

In [286]:
y_all = df_fixed['duration_bin'] #classifier
X_all = df_fixed
X_all.drop(['duration_bin'], axis=1, inplace=True)

X, X_test, y, y_test = train_test_split(X_all, y_all, test_size=0.25, random_state=42)

In [287]:
print 'Majority is {}% cases in train'.format(y.value_counts().max() * 100 / y.value_counts().sum())
print 'Majority is {}% cases in test'.format(y_test.value_counts().max() * 100 / y_test.value_counts().sum())

Majority is 23.3200576222% cases in train
Majority is 23.703515398% cases in test


In [288]:
cnt_vectorizer = CountVectorizer()

opsys = cnt_vectorizer.fit_transform(X.pop('op_sys_init'))
X = pd.concat([X.reset_index(drop=True), pd.DataFrame(opsys.toarray()).reset_index(drop=True)], axis=1, join='inner')

opsys_test = cnt_vectorizer.transform(X_test.pop('op_sys_init'))
X_test = pd.concat([X_test.reset_index(drop=True), 
                    pd.DataFrame(opsys_test.toarray()).reset_index(drop=True)], axis=1, join='inner')

In [289]:
component = cnt_vectorizer.fit_transform(X.pop('component_init'))
X = pd.concat([X.reset_index(drop=True), 
               pd.DataFrame(component.toarray(),
                            columns=['component_' + str(x) for x in xrange(component.shape[1])]).reset_index(drop=True)], 
              axis=1, join='inner')

component_test = cnt_vectorizer.transform(X_test.pop('component_init'))
X_test = pd.concat([X_test.reset_index(drop=True), 
                   pd.DataFrame(component_test.toarray(), 
                                columns=['component_' + str(x) for x in xrange(component_test.shape[1])]).reset_index(drop=True)], 
                    axis=1, join='inner')

In [290]:
print X.shape, opsys.shape, component.shape

(82607, 285) (82607, 47) (82607, 199)


### Multinomial NB Classifier

In [291]:
nb_model = MultinomialNB()

In [292]:
nb_model.fit(X, y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [293]:
y_pred = nb_model.predict(X_test)

print 'accuracy: {}'.format(sklearn.metrics.accuracy_score(y_test, y_pred))
print 'precision: {}'.format(sklearn.metrics.precision_score(y_test, y_pred))
print 'recall: {}'.format(sklearn.metrics.recall_score(y_test, y_pred))
print ''
print 'confusion matrix: \n {}'.format(sklearn.metrics.confusion_matrix(y_test, y_pred))

accuracy: 0.263509587449
precision: 0.235297981457
recall: 0.263509587449

confusion matrix: 
 [[2955   78  168  326 3000]
 [1911   65  147  234 2557]
 [1660   72  152  239 2982]
 [1478   74  150  240 3542]
 [1323   51  121  167 3844]]


### Random Forest Classifier

In [294]:
rf_model = RandomForestClassifier(n_estimators=20, criterion='gini', 
                               max_depth=3, max_features='auto', 
                               bootstrap=True, oob_score=True,
                               random_state=None, warm_start=False)

In [295]:
rf_model.fit(X,y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=3, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=1,
            oob_score=True, random_state=None, verbose=0, warm_start=False)

In [296]:
y_pred = rf_model.predict(X_test)

#print 'feature importances: {}'.format(rf_model.feature_importances_)
print 'oob score: {}'.format(rf_model.oob_score_)
print ''
print 'accuracy: {}'.format(sklearn.metrics.accuracy_score(y_test, y_pred))
print 'precision: {}'.format(sklearn.metrics.precision_score(y_test, y_pred))
print 'recall: {}'.format(sklearn.metrics.recall_score(y_test, y_pred))
print ''
print 'confusion matrix: \n {}'.format(sklearn.metrics.confusion_matrix(y_test, y_pred))

oob score: 0.272301378818

accuracy: 0.277200755375
precision: 0.213893651662
recall: 0.277200755375

confusion matrix: 
 [[5659    0    2    1  865]
 [4097    0    0    2  815]
 [4072    0    1    4 1028]
 [4124    0    0    3 1357]
 [3530    0    1    5 1970]]


### Gradient Boost Classifier

In [297]:
gb_model = GradientBoostingClassifier(loss='deviance', learning_rate=0.1, 
                                   n_estimators=100, subsample=1.0,
                                   max_depth=3, init=None, 
                                   random_state=None, max_features=None, 
                                   verbose=0, max_leaf_nodes=None, warm_start=False)

In [298]:
gb_model.fit(X,y)

GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=3, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [299]:
y_pred = gb_model.predict(X_test)

#print 'feature importances: {}'.format(gb_model.feature_importances_)
print ''
print 'accuracy: {}'.format(sklearn.metrics.accuracy_score(y_test, y_pred))
print 'precision: {}'.format(sklearn.metrics.precision_score(y_test, y_pred))
print 'recall: {}'.format(sklearn.metrics.recall_score(y_test, y_pred))
print ''
print 'confusion matrix: \n {}'.format(sklearn.metrics.confusion_matrix(y_test, y_pred))


accuracy: 0.31188262638
precision: 0.278834942821
recall: 0.31188262638

confusion matrix: 
 [[4140  225  131  879 1152]
 [2535  223  151  896 1109]
 [2253  214  159 1113 1366]
 [2011  180  186 1351 1756]
 [1490  107  102 1092 2715]]
