In [2]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import confusion_matrix


import pandas as pd
from pandas import Series
import numpy as np
import csv
import matplotlib.pyplot as plt


In [3]:
# read file into pandas from the working directory
df= pd.read_csv('mozilla.csv', header=None, names=['Bug ID','Severity','Summary'])

In [4]:
# examine the shape
df.shape

(2301, 3)

In [5]:
# examine the first 10 rows
df.head(10)

Unnamed: 0,Bug ID,Severity,Summary
0,413749,severe,Missing GenerateJava.emtl file in modisco java...
1,467000,severe,[Popup Menu] Too many refreshes when building ...
2,280999,severe,"Symbolic Folder Links, Editor uses absolute ca..."
3,192802,severe,Resolving Proxies using the Transactional API
4,515596,severe,download.eclipse.org timeouts
5,334881,non-severe,Stackoverflow when auto-completing an exception.
6,127835,non-severe,ECore sample: Represent EReference with specif...
7,269347,non-severe,No Code Assist launched after a dot operator.
8,272089,non-severe,[theme] improve the usability of focusing
9,220870,non-severe,Provide a JFace viewer for MonthCalendar


In [6]:
df.head(5)

Unnamed: 0,Bug ID,Severity,Summary
0,413749,severe,Missing GenerateJava.emtl file in modisco java...
1,467000,severe,[Popup Menu] Too many refreshes when building ...
2,280999,severe,"Symbolic Folder Links, Editor uses absolute ca..."
3,192802,severe,Resolving Proxies using the Transactional API
4,515596,severe,download.eclipse.org timeouts


In [7]:
# examine the class distribution
df.Severity.value_counts()

non-severe     2076
severe          224
enhancement       1
Name: Severity, dtype: int64

In [8]:
# convert label to a numerical variable
df['severity_num'] = df.Severity.map({'non-severe':0,'severe':1})

In [9]:
df.head(5)

Unnamed: 0,Bug ID,Severity,Summary,severity_num
0,413749,severe,Missing GenerateJava.emtl file in modisco java...,1.0
1,467000,severe,[Popup Menu] Too many refreshes when building ...,1.0
2,280999,severe,"Symbolic Folder Links, Editor uses absolute ca...",1.0
3,192802,severe,Resolving Proxies using the Transactional API,1.0
4,515596,severe,download.eclipse.org timeouts,1.0


In [10]:
# how to define X and y (from the eclipse data) for use with COUNTVECTORIZER
X= df.Summary
y = df.severity_num
print(X.shape)
print(y.shape)

(2301L,)
(2301L,)


In [11]:
# split X and y into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(1725L,)
(576L,)
(1725L,)
(576L,)


In [12]:
# instantiate the vectorizer
vect = CountVectorizer()

In [13]:
# learn training data vocabulary, then use it to create a document-term matrix
vect.fit(X_train)
X_train_dtm = vect.transform(X_train)

In [14]:
# equivalently: combine fit and transform into a single step
X_train_dtm = vect.fit_transform(X_train)

In [15]:
# examine the document-term matrix
X_train_dtm

<1725x3650 sparse matrix of type '<type 'numpy.int64'>'
	with 15010 stored elements in Compressed Sparse Row format>

In [16]:
# transform testing data (using fitted vocabulary) into a document-term matrix
X_test_dtm = vect.transform(X_test)
X_test_dtm

<576x3650 sparse matrix of type '<type 'numpy.int64'>'
	with 4437 stored elements in Compressed Sparse Row format>

In [None]:
print(__doc__)
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn.datasets import make_classification

df= pd.read_csv('eclipse.csv', header=None, names=['Bug ID','severity_num','Summary'])

# Build a classification task using 100 informative features
X, y = make_classification(n_samples=2000, n_features=1000, n_informative=100,
                           n_redundant=2, n_repeated=0, n_classes=8,
                           n_clusters_per_class=1, random_state=0)

# Create the RFE object and compute a cross-validated score.
svc = SVC(kernel="linear")
# The "accuracy" scoring is proportional to the number of correct
# classifications
rfecv = RFECV(estimator=svc, step=1, cv=StratifiedKFold(2),
              scoring='accuracy')
rfecv.fit(X, y)

print("Optimal number of features : %d" % rfecv.n_features_)

# Plot number of features VS. cross-validation scores
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (nb of correct classifications)")
plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
plt.show()

Automatically created module for IPython interactive environment


In [None]:
# import and instantiate a Multinomial Naive Bayes model
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()

In [None]:
# train the model using X_train_dtm (timing it with an IPython "magic command")
%time nb.fit(X_train_dtm, y_train)

In [None]:
# make class predictions for X_test_dtm
y_pred_class = nb.predict(X_test_dtm)

In [None]:
# calculate accuracy of class predictions
from sklearn import metrics
metrics.accuracy_score(y_test, y_pred_class)

In [None]:
# print the confusion matrix
from sklearn import metrics
metrics.confusion_matrix(y_test, y_pred_class)

In [None]:
# print message text for the false positives (non-severe incorrectly classified as severe)
X_test[y_test < y_pred_class]

In [None]:
# calculate predicted probabilities for X_test_dtm (poorly calibrated)
y_pred_prob = nb.predict_proba(X_test_dtm)[:, 1]
y_pred_prob

In [None]:
# calculate AUC
metrics.roc_auc_score(y_test, y_pred_prob)

In [None]:
# import and instantiate a logistic regression model
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()

In [None]:
# train the model using X_train_dtm
%time logreg.fit(X_train_dtm, y_train)

In [None]:
# make class predictions for X_test_dtm
y_pred_class = logreg.predict(X_test_dtm)

In [None]:
# calculate predicted probabilities for X_test_dtm (well calibrated)
y_pred_prob = logreg.predict_proba(X_test_dtm)[:, 1]
y_pred_prob

In [None]:
# calculate accuracy
metrics.accuracy_score(y_test, y_pred_class)

In [None]:
# calculate AUC
metrics.roc_auc_score(y_test, y_pred_prob)