## Model Severity Using Features

In [1]:
from __future__ import division
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
import sklearn.metrics
import scipy
from collections import Counter
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import CountVectorizer
import warnings

warnings.filterwarnings('ignore') #do not display warnings (SettingWithCopyWarning etc.)

%matplotlib inline

In [13]:
df = pd.read_pickle('../data/df_severity.pkl')

In [44]:
df_fixed = df[df['severity_final'] != 'enhancement']
df_fixed = df_fixed[df_fixed['severity_final'] != 'normal']

In [45]:
df_fixed['severity_final'].value_counts()

critical    35128
major       35034
minor       17764
trivial      7397
blocker      3728
Name: severity_final, dtype: int64

In [46]:
df_fixed.drop(['severity_init_other', 'severity_init_trivial', 'severity_init_minor', 'severity_init_normal', 'severity_init_major', 'severity_init_critical', 'severity_init_blocker'], axis=1, inplace=True)

In [None]:
df_fixed.drop(['component_init', 'op_sys_init'], axis=1, inplace=True)

In [48]:
component_value_cnts = df_fixed['component_init'].value_counts()
df_fixed['component_init'] = df_fixed['component_init'].map(lambda x: x if component_value_cnts[x] > 100 else 'unimportant')

In [47]:
df_fixed.columns

Index([             u'severity_final',              u'component_init',
                       u'op_sys_init',            u'reporter_bug_cnt',
                     u'duration_days',       u'assigned_to_init_bool',
               u'bug_status_init_new', u'bug_status_init_unconfirmed',
          u'bug_status_init_assigned',    u'bug_status_init_resolved',
          u'bug_status_init_verified',      u'bug_status_init_closed',
          u'bug_status_init_reopened',                 u'cc_init_cnt',
               u'priority_init_other',            u'priority_init_p1',
                  u'priority_init_p2',            u'priority_init_p3',
                  u'priority_init_p4',            u'priority_init_p5',
                u'product_init_other',           u'product_init_core',
              u'product_init_firefox',    u'product_init_thunderbird',
             u'product_init_bugzilla',        u'product_init_browser',
             u'product_init_webtools',            u'product_init_psm',
      

Train test split

In [49]:
y_all = df_fixed['severity_final'] #classifier
X_all = df_fixed
X_all.drop(['severity_final'], axis=1, inplace=True)

X, X_test, y, y_test = train_test_split(X_all, y_all, test_size=0.25, random_state=42)

In [50]:
print 'Majority is {}% cases in train'.format(y.value_counts().max() * 100 / y.value_counts().sum())
print 'Majority is {}% cases in test'.format(y_test.value_counts().max() * 100 / y_test.value_counts().sum())

Majority is 35.565636442% cases in train
Majority is 35.6297702217% cases in test


Vectorize op_sys and component

In [51]:
cnt_vectorizer = CountVectorizer()

opsys = cnt_vectorizer.fit_transform(X.pop('op_sys_init'))
X = pd.concat([X.reset_index(drop=True), pd.DataFrame(opsys.toarray()).reset_index(drop=True)], axis=1, join='inner')

opsys_test = cnt_vectorizer.transform(X_test.pop('op_sys_init'))
X_test = pd.concat([X_test.reset_index(drop=True), 
                    pd.DataFrame(opsys_test.toarray()).reset_index(drop=True)], axis=1, join='inner')

In [52]:
component = cnt_vectorizer.fit_transform(X.pop('component_init'))
X = pd.concat([X.reset_index(drop=True), 
               pd.DataFrame(component.toarray(),
                            columns=['component_' + str(x) for x in xrange(component.shape[1])]).reset_index(drop=True)], 
              axis=1, join='inner')

component_test = cnt_vectorizer.transform(X_test.pop('component_init'))
X_test = pd.concat([X_test.reset_index(drop=True), 
                   pd.DataFrame(component_test.toarray(), 
                                columns=['component_' + str(x) for x in xrange(component_test.shape[1])]).reset_index(drop=True)], 
                    axis=1, join='inner')

In [53]:
print X.shape, opsys.shape, component.shape

(74288, 232) (74288, 45) (74288, 154)


### Multinomial NB Classifier

In [54]:
nb_model = MultinomialNB()

In [55]:
nb_model.fit(X, y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [56]:
y_pred = nb_model.predict(X_test)

print 'accuracy: {}'.format(sklearn.metrics.accuracy_score(y_test, y_pred))
print 'precision: {}'.format(sklearn.metrics.precision_score(y_test, y_pred))
print 'recall: {}'.format(sklearn.metrics.recall_score(y_test, y_pred))
print ''
print 'confusion matrix: \n {}'.format(sklearn.metrics.confusion_matrix(y_test, y_pred))

accuracy: 0.28926220571
precision: 0.35837878667
recall: 0.28926220571

confusion matrix: 
 [[ 625   13  242   19   25]
 [3262  205 4374  714  152]
 [2073  120 5605  914  111]
 [ 929   88 2599  667  124]
 [ 624   42  892  283   61]]


### Random Forest Classifier

In [57]:
rf_model = RandomForestClassifier(n_estimators=20, criterion='gini', 
                               max_depth=3, max_features='auto', 
                               bootstrap=True, oob_score=True,
                               random_state=None, warm_start=False)

In [58]:
rf_model.fit(X,y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=3, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=1,
            oob_score=True, random_state=None, verbose=0, warm_start=False)

In [59]:
y_pred = rf_model.predict(X_test)

#print 'feature importances: {}'.format(rf_model.feature_importances_)
print 'oob score: {}'.format(rf_model.oob_score_)
print ''
print 'accuracy: {}'.format(sklearn.metrics.accuracy_score(y_test, y_pred))
print 'precision: {}'.format(sklearn.metrics.precision_score(y_test, y_pred))
print 'recall: {}'.format(sklearn.metrics.recall_score(y_test, y_pred))
print ''
print 'confusion matrix: \n {}'.format(sklearn.metrics.confusion_matrix(y_test, y_pred))

oob score: 0.417294852466

accuracy: 0.416468117756
precision: 0.29794221512
recall: 0.416468117756

confusion matrix: 
 [[   0  667  257    0    0]
 [   0 3975 4732    0    0]
 [   0 2485 6338    0    0]
 [   0 1261 3146    0    0]
 [   0  689 1213    0    0]]


In [60]:
print 'classification report: \n {}'.format(sklearn.metrics.classification_report(y_test, y_pred))

classification report: 
              precision    recall  f1-score   support

    blocker       0.00      0.00      0.00       924
   critical       0.44      0.46      0.45      8707
      major       0.40      0.72      0.52      8823
      minor       0.00      0.00      0.00      4407
    trivial       0.00      0.00      0.00      1902

avg / total       0.30      0.42      0.34     24763



### Gradient Boost Classifier

In [61]:
gb_model = GradientBoostingClassifier(loss='deviance', learning_rate=0.1, 
                                   n_estimators=100, subsample=1.0,
                                   max_depth=3, init=None, 
                                   random_state=None, max_features=None, 
                                   verbose=0, max_leaf_nodes=None, warm_start=False)

In [62]:
gb_model.fit(X,y)

GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=3, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [63]:
y_pred = gb_model.predict(X_test)

#print 'feature importances: {}'.format(gb_model.feature_importances_)
print ''
print 'accuracy: {}'.format(sklearn.metrics.accuracy_score(y_test, y_pred))
print 'precision: {}'.format(sklearn.metrics.precision_score(y_test, y_pred))
print 'recall: {}'.format(sklearn.metrics.recall_score(y_test, y_pred))
print ''
print 'confusion matrix: \n {}'.format(sklearn.metrics.confusion_matrix(y_test, y_pred))


accuracy: 0.477244275734
precision: 0.483423050563
recall: 0.477244275734

confusion matrix: 
 [[ 113  406  368   25   12]
 [  38 4962 3576  108   23]
 [  38 2435 6050  253   47]
 [  22  922 2878  483  102]
 [  28  426 1019  219  210]]


In [64]:
print 'classification report: \n {}'.format(sklearn.metrics.classification_report(y_test, y_pred))

classification report: 
              precision    recall  f1-score   support

    blocker       0.47      0.12      0.19       924
   critical       0.54      0.57      0.56      8707
      major       0.44      0.69      0.53      8823
      minor       0.44      0.11      0.18      4407
    trivial       0.53      0.11      0.18      1902

avg / total       0.48      0.48      0.44     24763

