## Model Severity Using NLP

In [2]:
from __future__ import division
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
import sklearn.metrics
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestRegressor
import warnings

warnings.filterwarnings('ignore') #do not display warnings (SettingWithCopyWarning etc.)

In [3]:
df = pd.read_pickle('../data/df_nlp_severity.pkl')
#word_list_reduced = pickle.load(open('../data/vocabulary.pkl', 'rb'))
vectorizer = pickle.load(open('../data/tfidf_vectorizer.pkl', 'rb'))

Only look at not normal (default)

*"Following the work of Lamkanfi et al. [15, 16], we do not consider the severity label normal as this is the default option and “many reports just did not bother to consciously assess the bug severity” [15, 16]. Thus we treat this data as unlabeled data and do not use it for our testing." Information Retrieval Based Nearest Neighbor Classification for Fine-Grained Bug Severity Prediction: Tian, Lo, Sun, 2011*

In [13]:
df_fixed = df[df['severity_final'] != 'enhancement']
df_fixed = df_fixed[df_fixed['severity_final'] != 'normal']

In [14]:
df_fixed['severity_final'].value_counts()

critical    35149
major       35078
minor       17830
trivial      7429
blocker      3740
Name: severity_final, dtype: int64

Train test split

In [15]:
X_all = df_fixed['desc_init']
#X_all = df['short_desc_init']
y_all = df_fixed['severity_final']

X, X_test, y, y_test = train_test_split(X_all, y_all, test_size=0.25, random_state=42)

Class imbalance

In [16]:
print 'Majority is {}% cases in train'.format(y.value_counts().max() * 100 / y.value_counts().sum())
print 'Majority is {}% cases in test'.format(y_test.value_counts().max() * 100 / y_test.value_counts().sum())

Majority is 35.4546553971% cases in train
Majority is 35.340831217% cases in test


Vectorize stuff

In [17]:
tfidf = vectorizer.transform(X)
print tfidf.shape
tfidf_test = vectorizer.transform(X_test)
print tfidf_test.shape

(74419, 20000)
(24807, 20000)


### Multinomial NB - desc_init

In [18]:
nb_model = MultinomialNB()

In [19]:
nb_model.fit(tfidf, y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [33]:
y_pred = nb_model.predict(tfidf_test)

print 'accuracy: {}'.format(sklearn.metrics.accuracy_score(y_test, y_pred))
print 'precision: {}'.format(sklearn.metrics.precision_score(y_test, y_pred))
print 'recall: {}'.format(sklearn.metrics.recall_score(y_test, y_pred))
print ''
print 'confusion matrix: \n {}'.format(sklearn.metrics.confusion_matrix(y_test, y_pred))

accuracy: 0.540855403717
precision: 0.564328830269
recall: 0.540855403717

confusion matrix: 
 [[  82  298  501   43    7]
 [  95 4819 3670  169   11]
 [ 113  820 7113  706   15]
 [  28  292 2848 1324   48]
 [  10  181 1002  533   79]]


**Results to beat:**
```
Severity  Precision  Recall  F-Measure
blocker   53.4%       8.0%   13.9%
critical  69.6%      61.6%   65.3%
major     52.0%      60.2%   55.8%
minor     42.0%      15.7%   22.9%
trivial   60.5%      12.4%   20.6%
```

In [34]:
print 'classification report: \n {}'.format(sklearn.metrics.classification_report(y_test, y_pred))

classification report: 
              precision    recall  f1-score   support

    blocker       0.25      0.09      0.13       931
   critical       0.75      0.55      0.64      8764
      major       0.47      0.81      0.60      8767
      minor       0.48      0.29      0.36      4540
    trivial       0.49      0.04      0.08      1805

avg / total       0.56      0.54      0.51     24807



### Random Forest - desc_init

In [21]:
rf_model = RandomForestClassifier(n_estimators=20, criterion='gini', 
                               max_depth=3, max_features='auto', 
                               bootstrap=True, oob_score=True,
                               random_state=None, warm_start=False)

In [22]:
rf_model.fit(tfidf,y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=3, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=1,
            oob_score=True, random_state=None, verbose=0, warm_start=False)

In [23]:
y_pred = rf_model.predict(tfidf_test)

print 'feature importances: {}'.format(rf_model.feature_importances_)
print 'oob score: {}'.format(rf_model.oob_score_)
print ''
print 'accuracy: {}'.format(sklearn.metrics.accuracy_score(y_test, y_pred))
print 'precision: {}'.format(sklearn.metrics.precision_score(y_test, y_pred))
print 'recall: {}'.format(sklearn.metrics.recall_score(y_test, y_pred))
print ''
print 'confusion matrix: \n {}'.format(sklearn.metrics.confusion_matrix(y_test, y_pred))

feature importances: [ 0.  0.  0. ...,  0.  0.  0.]
oob score: 0.449858235128

accuracy: 0.459789575523
precision: 0.325742214826
recall: 0.459789575523

confusion matrix: 
 [[   0  728  203    0    0]
 [   0 5586 3178    0    0]
 [   0 2947 5820    0    0]
 [   0 1617 2923    0    0]
 [   0  766 1039    0    0]]


### Gradient Boost - desc_init

In [24]:
gb_model = GradientBoostingClassifier(loss='deviance', learning_rate=0.1, 
                                   n_estimators=100, subsample=1.0,
                                   max_depth=3, init=None, 
                                   random_state=None, max_features=None, 
                                   verbose=0, max_leaf_nodes=None, warm_start=False)

In [25]:
gb_model.fit(tfidf,y)

GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=3, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [17]:
y_pred = gb_model.predict(tfidf_test.toarray())

print 'feature importances: {}'.format(gb_model.feature_importances_)
print ''
print 'accuracy: {}'.format(sklearn.metrics.accuracy_score(y_test, y_pred))
print 'precision: {}'.format(sklearn.metrics.precision_score(y_test, y_pred))
print 'recall: {}'.format(sklearn.metrics.recall_score(y_test, y_pred))
print ''
print 'confusion matrix: \n {}'.format(sklearn.metrics.confusion_matrix(y_test, y_pred))

feature importances: [ 0.          0.00066981  0.         ...,  0.          0.          0.        ]

accuracy: 0.298146853147
precision: 0.279535725834
recall: 0.298146853147

confusion matrix: 
 [[4915   96  273  495  912]
 [3331  120  276  549  816]
 [3228  107  331  686 1173]
 [2827   86  315  872 1503]
 [2391   57  200  752 2289]]
