## Model Severity

In [2]:
from __future__ import division
import pymongo
import json
from sqlalchemy import create_engine
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cross_validation import train_test_split
import sklearn.metrics

%matplotlib inline

### Data Preparation

In [3]:
engine = create_engine('postgresql://lucka@localhost:5432/bugs')

In [4]:
df = pd.read_sql_query('select * from final',con=engine)

In [5]:
# get rid of enhancements
df = df[df['severity_init'] != 'enhancement']

# calc resolution time (duration)
df['duration'] = df['closing'] - df['opening']
df['duration_days'] = df['duration'].apply(lambda x: float(x.days))

# is there assignee
df['assigned_to_init_bool'] = df['assigned_to_init'].map(lambda x: 0 if x == '' else 1)

# bug_status to int
bug_status_map = dict({'new':1, 'unconfirmed':2, 'assigned':3, 'resolved':4, 'verified':5, 'closed':6, 'reopened':7})
df['bug_status_init'] = df['bug_status_init'].map(lambda x: bug_status_map[x] if x in bug_status_map.keys() else 0)
df['bug_status_final'] = df['bug_status_final'].map(lambda x: bug_status_map[x] if x in bug_status_map.keys() else 0)

# count number of initially cced
df['cc_init_cnt'] = df['cc_init'].map(lambda x: x.count('@'))

# priority to int
priority_map = dict({'p1':1, 'p2':2, 'p3':3, 'p4':4, 'p5':5})
df['priority_init'] = df['priority_init'].map(lambda x: priority_map[x] if x in priority_map.keys() else 0)
df['priority_final'] = df['priority_final'].map(lambda x: priority_map[x] if x in priority_map.keys() else 0)

# only keep top products
product_map = dict({'core':1, 'firefox':2, 'thunderbird':3, 'bugzilla':4, 'browser':5, 'webtools':6, 'psm':7})
df['top_product_init'] = df['product_init'].map(lambda x: product_map[x] if x in product_map.keys() else 0)
df['top_product_final'] = df['product_final'].map(lambda x: product_map[x] if x in product_map.keys() else 0)

# severity to int
severity_map = dict({'trivial':1, 'minor':2, 'normal':3, 'major':4, 'critical':5, 'blocker':6})
df['severity_init'] = df['severity_init'].map(lambda x: severity_map[x] if x in severity_map.keys() else 0)
df['severity_final'] = df['severity_final'].map(lambda x: severity_map[x] if x in severity_map.keys() else 0)

# version to int
version_map = dict({'trunk':1, 'unspecified':2, 'other':3, 'other branch':4, '2.0 branch':5, '1.0 branch':6})
df['version_init'] = df['version_init'].map(lambda x: version_map[x] if x in version_map.keys() else 0)
df['version_final'] = df['version_final'].map(lambda x: version_map[x] if x in version_map.keys() else 0)

In [6]:
# short_desc_init_wordcnt
df['short_desc_init_wordcnt'] = df['short_desc_init'].map(lambda x: len(x.split()))

# desc_wordcnt
df['desc_init_wordcnt'] = df['desc_init'].map(lambda x: len(x.split()))

In [136]:
df_init = df[[
    #'reporter', #not really useful, is it?
    'assigned_to_init_bool',
    'bug_status_init',
    'cc_init_cnt',
    #'component_init', #need to vectorize
    #'op_sys_init', #need to vetorize
    #'priority_init', #almost always empty
    'top_product_init',
    #'severity_init',
    'short_desc_init_wordcnt',
    #'short_desc_init', #need to vectorize
    'version_init',
    #'desc_init',
    'desc_init_wordcnt',
]]

In [137]:
df_init.head(1).T

Unnamed: 0,0
assigned_to_init_bool,0
bug_status_init,1
cc_init_cnt,0
top_product_init,1
short_desc_init_wordcnt,3
version_init,1
desc_init_wordcnt,0


Train test split

In [138]:
X_all = df_init.values
y_all = df['severity_final'].values

X, X_test, y, y_test = train_test_split(X_all, y_all, test_size=0.25, random_state=42)

### Random Forest

In [127]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(n_estimators=20, criterion='gini', 
                               max_depth=3, max_features='auto', 
                               bootstrap=True, oob_score=True,
                               random_state=None, warm_start=False)

In [128]:
rf_model.fit(X,y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=3, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=1,
            oob_score=True, random_state=None, verbose=0, warm_start=False)

In [129]:
y_pred = rf_model.predict(X_test)

print 'feature importances: {}'.format(rf_model.feature_importances_)
print 'oob score: {}'.format(rf_model.oob_score_)
print ''
print 'accuracy: {}'.format(sklearn.metrics.accuracy_score(y_test, y_pred))
print 'precision: {}'.format(sklearn.metrics.precision_score(y_test, y_pred))
print 'recall: {}'.format(sklearn.metrics.recall_score(y_test, y_pred))
print ''
print 'confusion matrix: \n {}'.format(sklearn.metrics.confusion_matrix(y_test, y_pred))

feature importances: [  2.12418238e-03   3.90746602e-02   1.75534628e-03   3.24667723e-02
   8.25498661e-01   4.73162562e-04   2.29645755e-02   7.56426401e-02]
oob score: 0.825847948201

accuracy: 0.842957664947
precision: 0.756063791704
recall: 0.842957664947

confusion matrix: 
 [[    0     0     0   835    55    24     0]
 [    0     0     0  1756    43    22     0]
 [    0     0     0  4320    66    22     0]
 [    0     0     0 51483   334   299     0]
 [    0     0     0  1120  7328   301     0]
 [    0     0     0  1727   335  6718     0]
 [    0     0     0   205    46   698     0]]


  sample_weight=sample_weight)
  sample_weight=sample_weight)


In [120]:
print y_pred.mean()

0.0


### Gradient Boost

In [132]:
from sklearn.ensemble import GradientBoostingClassifier

gb_model = GradientBoostingClassifier(loss='deviance', learning_rate=0.1, 
                                   n_estimators=100, subsample=1.0,
                                   max_depth=3, init=None, 
                                   random_state=None, max_features=None, 
                                   verbose=0, max_leaf_nodes=None, warm_start=False)

In [139]:
gb_model.fit(X,y)

GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=3, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [141]:
y_pred = gb_model.predict(X_test)

print 'feature importances: {}'.format(gb_model.feature_importances_)
print ''
print 'accuracy: {}'.format(sklearn.metrics.accuracy_score(y_test, y_pred))
print 'precision: {}'.format(sklearn.metrics.precision_score(y_test, y_pred))
print 'recall: {}'.format(sklearn.metrics.recall_score(y_test, y_pred))
print ''
print 'confusion matrix: \n {}'.format(sklearn.metrics.confusion_matrix(y_test, y_pred))

feature importances: [ 0.03907605  0.13797198  0.0186802   0.24428588  0.14290109  0.12872565
  0.28835914]

accuracy: 0.670967492957
precision: 0.582189220394
recall: 0.670967492957

confusion matrix: 
 [[    0     1     0   911     0     0     2]
 [    1     3     0  1815     0     0     2]
 [    1     4     0  4398     0     5     0]
 [    0     7     0 51882     2   220     5]
 [    0     1     0  8704     3    40     1]
 [    2     0     0  8519     0   256     3]
 [    0     6     0   914     0    14    15]]


  sample_weight=sample_weight)
  sample_weight=sample_weight)


### NLP (TFIDF) on short_desc

Deal with class imbalance (undersample normal (3) severity)

In [195]:
df_nlp = df[df['severity_final'] != 3][['short_desc_init','severity_final']]
print len(df_nlp)
df_nlp = df_nlp.append(df[df['severity_final'] == 3][['short_desc_init','severity_final']][:35000])
print len(df_nlp)

# shuffle by sampling the whole thing
df_nlp = df_nlp.sample(frac=1)

102534
137534


In [184]:
df_nlp['severity_final'].value_counts()

5    35128
4    35034
3    35000
2    17764
1     7397
6     3728
0     3483
Name: severity_final, dtype: int64

In [185]:
X, X_test, y, y_test = train_test_split(df_nlp['short_desc_init'], df_nlp['severity_final'], test_size=0.25, random_state=42)

Oversample normal (3) severity in test to make it more real

In [220]:
X_test = X_test.append(df[df['severity_final'] == 3]['short_desc_init'][-10000:])
y_test = y_test.append(df[df['severity_final'] == 3]['severity_final'][-10000:])

In [221]:
print X_test.head(1)
print y_test.head(1)

10033    unable to resolve url.
Name: short_desc_init, dtype: object
10033    3
Name: severity_final, dtype: int64


In [222]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(input='content', lowercase=True, tokenizer=None, 
                        stop_words='english', use_idf=True)
tfidf = vectorizer.fit_transform(X)

In [223]:
print tfidf.shape
tfidf_test = vectorizer.transform(X_test)
print tfidf_test.shape

(103150, 41279)

#### Multinomial Naive Bayes

In [217]:
from sklearn.naive_bayes import MultinomialNB

nb_model = MultinomialNB()

In [218]:
nb_model.fit(tfidf,y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [226]:
y_pred = nb_model.predict(tfidf_test)

print 'accuracy: {}'.format(sklearn.metrics.accuracy_score(y_test, y_pred))
print 'precision: {}'.format(sklearn.metrics.precision_score(y_test, y_pred))
print 'recall: {}'.format(sklearn.metrics.recall_score(y_test, y_pred))
print ''
print 'confusion matrix: \n {}'.format(sklearn.metrics.confusion_matrix(y_test, y_pred))

accuracy: 0.457091365624
precision: 0.538161635446
recall: 0.457091365624

confusion matrix: 
 [[    1     0    21   410   397    28     0]
 [    0    13    59  1040   662   121     1]
 [    0     3   110  1814  2304   253     0]
 [    0     2   217 10952 10169  2384     2]
 [    0     0    34  2026  5433  1248     0]
 [    0     0     8   778  1927  6019     0]
 [    0     0     8   248   403   244    45]]


  sample_weight=sample_weight)
  sample_weight=sample_weight)


#### Gradient Boost

In [198]:
from sklearn.ensemble import GradientBoostingClassifier

gb_model = GradientBoostingClassifier(loss='deviance', learning_rate=0.1, 
                                   n_estimators=100, subsample=1.0,
                                   max_depth=3, init=None, 
                                   random_state=None, max_features=None, 
                                   verbose=0, max_leaf_nodes=None, warm_start=False)

In [199]:
# runs about 5 mins for tfidf.shape (103150, 41279)
gb_model.fit(tfidf,y)

GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=3, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [201]:
y_pred = gb_model.predict(tfidf_test.toarray())

print 'feature importances: {}'.format(gb_model.feature_importances_)
print ''
print 'accuracy: {}'.format(sklearn.metrics.accuracy_score(y_test, y_pred))
print 'precision: {}'.format(sklearn.metrics.precision_score(y_test, y_pred))
print 'recall: {}'.format(sklearn.metrics.recall_score(y_test, y_pred))
print ''
print 'confusion matrix: \n {}'.format(sklearn.metrics.confusion_matrix(y_test, y_pred))

feature importances: [ 0.  0.  0. ...,  0.  0.  0.]

accuracy: 0.465798045603
precision: 0.508639836125
recall: 0.465798045603

confusion matrix: 
 [[  77    2   15  534  223    5    1]
 [   6   90   54 1314  410   20    2]
 [  10   45  222 2731 1377   92    7]
 [  38   42  110 5653 2372  489   22]
 [  19   12   76 4012 3922  688   12]
 [   6    3   13 1555 1193 5946   16]
 [   3    0    2  375  304  158  106]]


  sample_weight=sample_weight)
  sample_weight=sample_weight)


### NLP (TFIDF) on description

In [7]:
df_third = df[['desc_init', 'severity_final']].sample(frac=0.3)

In [43]:
df_third['desc_init']

118228    To reproduce:Type username and password and hi...
123121    User-Agent:       Mozilla/5.0 (Windows; U; Win...
62873     User-Agent:       Mozilla/5.0 (X11; U; Linux i...
180568    User-Agent:       Mozilla/5.0 (X11; U; Linux i...
268739    User-Agent:       Mozilla/4.0 (compatible; MSI...
42303     User-Agent:       Mozilla/5.0 (X11; U; Linux i...
236045    Using 3/19 build on Win 95, Win 98, Win NT, Ma...
254036    User-Agent:       Mozilla/5.0 (Windows; U; Win...
35525     Class CNewlineToken allocates an static variab...
37281     Taken from bug 54194\n\nApplets do not print t...
256194    Hovering over an active tab calls grab cursor ...
102473    Build ID: 2005-06-09-06, Windows XP Seamonkey ...
67634     User-Agent:       Mozilla/4.0 (compatible; MSI...
320885    Using build 1999102008 on Win 95.\n\nOpen atta...
163358    The |getElementByTagName| method of DOM |Docum...
214512         port Altss fixes from Aviary Branch to Trunk
11610     User-Agent:       Mozilla/5.0 

In [8]:
X, X_test, y, y_test = train_test_split(df_third['desc_init'], df_third['severity_final'], test_size=0.25, random_state=42)

In [44]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(input='content', lowercase=True, tokenizer=None, 
                        stop_words='english', use_idf=True, 
                        token_pattern='[a-zA-Z]+',) #default token_pattern='(?u)\b\w\w+\b'
tfidf = vectorizer.fit_transform(X)

In [45]:
print 'train shape: {}'.format(tfidf.shape)
tfidf_test = vectorizer.transform(X_test)
print 'test shape: {}'.format(tfidf_test.shape)

train shape: (69963, 141415)
test shape: (23321, 141415)


In [46]:
X.head(2)

101678    User-Agent:       Mozilla/5.0 (Macintosh; U; P...
8125      We need to be able to configure at runtime whi...
Name: desc_init, dtype: object

In [51]:
vectorizer.get_feature_names()[:10]

[u'aa',
 u'aaa',
 u'aaaa',
 u'aaaaa',
 u'aaaaaa',
 u'aaaaaaa',
 u'aaaaaaaa',
 u'aaaaaaaaa',
 u'aaaaaaaaaa',
 u'aaaaaaaaaaa']

#### Dimensionality Reduction - PCA

In [None]:
from sklearn.decomposition import PCA

pca_model = PCA(n_components=10) #number of dimensions/topics required
pca_model.fit(tfidf.toarray())

In [None]:
pca_train = pca_model.transform(tfidf.toarray())

sum(pca_model.explained_variance_ratio_)

In [None]:
pca_test = pca_model.transform(tfidf_test.toarray())

#### Multinomial Naive Bayes

In [231]:
from sklearn.naive_bayes import MultinomialNB

nb_model = MultinomialNB()

In [None]:
nb_model.fit(tfidf,y)

In [None]:
y_pred = nb_model.predict(tfidf_test)

print 'accuracy: {}'.format(sklearn.metrics.accuracy_score(y_test, y_pred))
print 'precision: {}'.format(sklearn.metrics.precision_score(y_test, y_pred))
print 'recall: {}'.format(sklearn.metrics.recall_score(y_test, y_pred))
print ''
print 'confusion matrix: \n {}'.format(sklearn.metrics.confusion_matrix(y_test, y_pred))