## Model Duration Using NLP

In [1]:
from __future__ import division
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
import sklearn.metrics
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestRegressor
import warnings

warnings.filterwarnings('ignore') #do not display warnings (SettingWithCopyWarning etc.)

In [2]:
df = pd.read_pickle('../data/df_nlp.pkl')
df['duration_days'] = (df['closing'] - df['opening']).apply(lambda x: float(x.days))
df.drop(['closing', 'opening'], axis=1, inplace=True)

#word_list_reduced = pickle.load(open('../data/vocabulary.pkl', 'rb'))
vectorizer = pickle.load(open('../data/tfidf_vectorizer.pkl', 'rb'))

In [3]:
from collections import Counter
Counter([len(x.split()) for x in vectorizer.get_feature_names()])

Counter({1: 7583, 2: 8203, 3: 4214})

Only look at fixed

In [4]:
df_fixed = df[df['resolution_final'] == 'fixed']
df_fixed.drop(['resolution_final'], axis=1, inplace=True)

In [5]:
df_fixed['duration_bin'] = pd.qcut(df_fixed['duration_days'], 5, labels=[0,1,2,3,4])
df_fixed.drop(['duration_days'], axis=1, inplace=True)
df_fixed['duration_bin'].value_counts()

0    26351
4    22875
3    22438
2    22418
1    20317
Name: duration_bin, dtype: int64

Train test split

In [6]:
X_all = df_fixed['desc_init']
#X_all = df['short_desc_init']
y_all = df_fixed['duration_bin']

X, X_test, y, y_test = train_test_split(X_all, y_all, test_size=0.25, random_state=42)

Class imbalance

In [7]:
print 'Majority is {}% cases in train'.format(y.value_counts().max() * 100 / y.value_counts().sum())
print 'Majority is {}% cases in test'.format(y_test.value_counts().max() * 100 / y_test.value_counts().sum())

Majority is 22.9140199769% cases in train
Majority is 23.3951048951% cases in test


Vectorize stuff

In [8]:
tfidf = vectorizer.transform(X)
print tfidf.shape
tfidf_test = vectorizer.transform(X_test)
print tfidf_test.shape

(85799, 20000)
(28600, 20000)


### Multinomial NB - desc_init

In [9]:
nb_model = MultinomialNB()

In [10]:
nb_model.fit(tfidf, y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [11]:
y_pred = nb_model.predict(tfidf_test)

print 'accuracy: {}'.format(sklearn.metrics.accuracy_score(y_test, y_pred))
print 'precision: {}'.format(sklearn.metrics.precision_score(y_test, y_pred))
print 'recall: {}'.format(sklearn.metrics.recall_score(y_test, y_pred))
print ''
print 'confusion matrix: \n {}'.format(sklearn.metrics.confusion_matrix(y_test, y_pred))

accuracy: 0.30451048951
precision: 0.278014303767
recall: 0.30451048951

confusion matrix: 
 [[3973  224  495  629 1370]
 [2514  263  487  633 1195]
 [2339  243  573  812 1558]
 [1898  213  546  958 1988]
 [1528  171  351  697 2942]]


### Random Forest - desc_init

In [12]:
rf_model = RandomForestClassifier(n_estimators=20, criterion='gini', 
                               max_depth=3, max_features='auto', 
                               bootstrap=True, oob_score=True,
                               random_state=None, warm_start=False)

In [13]:
rf_model.fit(tfidf,y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=3, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=1,
            oob_score=True, random_state=None, verbose=0, warm_start=False)

In [14]:
y_pred = rf_model.predict(tfidf_test)

print 'feature importances: {}'.format(rf_model.feature_importances_)
print 'oob score: {}'.format(rf_model.oob_score_)
print ''
print 'accuracy: {}'.format(sklearn.metrics.accuracy_score(y_test, y_pred))
print 'precision: {}'.format(sklearn.metrics.precision_score(y_test, y_pred))
print 'recall: {}'.format(sklearn.metrics.recall_score(y_test, y_pred))
print ''
print 'confusion matrix: \n {}'.format(sklearn.metrics.confusion_matrix(y_test, y_pred))

feature importances: [ 0.  0.  0. ...,  0.  0.  0.]
oob score: 0.256191797107

accuracy: 0.262552447552
precision: 0.292449598084
recall: 0.262552447552

confusion matrix: 
 [[6029    0    0    1  661]
 [4513    0    0    1  578]
 [4759    0    1    3  762]
 [4644    0    1    4  954]
 [4212    0    0    2 1475]]


### Gradient Boost - desc_init

In [15]:
gb_model = GradientBoostingClassifier(loss='deviance', learning_rate=0.1, 
                                   n_estimators=100, subsample=1.0,
                                   max_depth=3, init=None, 
                                   random_state=None, max_features=None, 
                                   verbose=0, max_leaf_nodes=None, warm_start=False)

In [16]:
gb_model.fit(tfidf,y)

GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=3, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [17]:
y_pred = gb_model.predict(tfidf_test.toarray())

print 'feature importances: {}'.format(gb_model.feature_importances_)
print ''
print 'accuracy: {}'.format(sklearn.metrics.accuracy_score(y_test, y_pred))
print 'precision: {}'.format(sklearn.metrics.precision_score(y_test, y_pred))
print 'recall: {}'.format(sklearn.metrics.recall_score(y_test, y_pred))
print ''
print 'confusion matrix: \n {}'.format(sklearn.metrics.confusion_matrix(y_test, y_pred))

feature importances: [ 0.          0.00066981  0.         ...,  0.          0.          0.        ]

accuracy: 0.298146853147
precision: 0.279535725834
recall: 0.298146853147

confusion matrix: 
 [[4915   96  273  495  912]
 [3331  120  276  549  816]
 [3228  107  331  686 1173]
 [2827   86  315  872 1503]
 [2391   57  200  752 2289]]
