In [116]:
import json
import matplotlib.pyplot as plt
import numpy as np
import operator
import pandas as pd
import re
import seaborn as sns

from classifier import *
from heapq import nlargest
from issues import get_num_code_lines
from nltk.stem import PorterStemmer
from scipy.sparse import hstack
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, r2_score
from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Pre-Processing

In [117]:
# Load the raw issue records from the JSON dump; the file handle is closed
# as soon as parsing finishes.
with open('../data/eclipse/eclipse_issues.json') as issue_file:
    issues = json.load(issue_file)

In [118]:
# An issue counts as "labeled" when its 'completed_by' field is truthy,
# i.e. a resolver is recorded for it.
labeled_issues = [record for record in issues if record['completed_by']]
print("Number of issues: " + str(len(issues)))
print("Number of labeled issues: " + str(len(labeled_issues)))

Number of issues: 440743
Number of labeled issues: 216364


In [119]:
def _issue_to_row(issue):
    """Flatten one raw issue record into the column dict used for the frame.

    Newlines and tabs are stripped from the title and body; the two
    timestamps are truncated to their first 10 characters before parsing.
    """
    return {
        'title': issue['short_desc'].replace("\n", "").replace("\t", ""),
        'body': issue['long_desc'].replace("\n", "").replace("\t", ""),
        'closed_date': pd.to_datetime(issue['completed_at'][:10]),
        'created_date': pd.to_datetime(issue['created_at'][:10]),
        'completed_by': issue['completed_by'],
        'product': issue['product'],
        'component': issue['component'],
    }


# One row per labeled issue, ordered by the date the issue was closed.
df_list = [_issue_to_row(issue) for issue in labeled_issues]
df = pd.DataFrame(df_list).sort_values('closed_date')
df.tail(2)

Unnamed: 0,body,closed_date,completed_by,component,created_date,product,title
214155,The new page is not offering versioning report...,2018-07-05,nboldt,releng\n\n (show other bugs)\n,2014-08-19,WTP Releng\n\n,Versioning Report for new builds
164418,I received this NPE when comparing two project...,2018-07-09,loskutov,Compare\n\n (show other bugs)\n,2010-09-23,Platform\n\n,NPE when comparing two projects


In [120]:
# One-hot encode the component and product columns; each indicator column
# is named "<source column>_<value>".
categorical_cols = ["component", "product"]
df = pd.get_dummies(df, columns=categorical_cols, prefix=categorical_cols)
df.head(2)

Unnamed: 0,body,closed_date,completed_by,created_date,title,component_AGF  (show other bugs),component_AGF Chart  (show other bugs),component_AI  (show other bugs),component_AJBrowser  (show other bugs),component_AJDoc  (show other bugs),...,product_Web Tools,product_WindowBuilder,product_Woolsey,product_Working Groups,product_XWT,product_Xtend,product_e4,product_eTrice,product_m2e,product_z_Archived
1240,- new java project- project properties / Java ...,2001-10-11,Claude_Knaus,2001-10-10,Classpath variable selection dialog has wrong ...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1022,To reproduce:Perform a refactoring on a Java r...,2001-10-11,akiezun,2001-10-10,Missing up/down arrows in the refactoring sour...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [121]:
size = 210000
# Keep only issues resolved by "active" developers: those who closed at least
# 3 issues after the cut-off date, where the cut-off is 90 days before the
# closing date of the row at position `size`.
# NOTE(review): the window has no upper bound, so issues closed after row
# `size` are counted as well — confirm this is intended.
filter_date = df['closed_date'].iloc[size] - pd.to_timedelta(90, unit='d')

# Vectorised count of resolved issues per developer inside the window
# (replaces a row-by-row iterrows() loop over the very wide one-hot frame).
recent_solvers = df.loc[df['closed_date'] > filter_date, 'completed_by']
dev_counts = recent_solvers.value_counts().to_dict()
active_devs = {dev for dev, solved in dev_counts.items() if solved >= 3}

# remove all issues not solved by an active dev
df = df[df['completed_by'].isin(active_devs)].reset_index(drop=True)
print(len(df))

82093


In [122]:
def preprocess(text):
    """Normalise free text into a space-separated string of stemmed words.

    Steps:
      1. Split on runs of non-word characters and keep only purely
         alphabetic tokens (tokens containing digits or underscores are
         dropped).
      2. Insert a space before every capitalised run (an upper-case letter
         followed by lower-case letters) that is not at the very start of
         the string, which breaks camelCase identifiers apart.
      3. Apply the Porter stemmer to every resulting word.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The stemmed words joined by single spaces.
    """
    stemmer = PorterStemmer()
    # Raw strings keep the regex escapes (\W) from being treated as invalid
    # string escape sequences.
    alpha_words = [word for word in re.split(r'\W+', text) if word.isalpha()]
    split1 = ' '.join(alpha_words)
    # split camel case words apart (necessary for embedded code)
    spaced = re.sub(r'(?!^)([A-Z][a-z]+)', r' \1', split1)
    split2 = ' '.join(stemmer.stem(word) for word in spaced.split())
    return split2

In [123]:
# Build TF-IDF features from the issue body and title. Text is first cleaned
# by preprocess(); the vectorizer then removes English stop words.
# stop_words='english' selects scikit-learn's built-in English list (the same
# words as ENGLISH_STOP_WORDS) and is accepted by versions whose parameter
# validation rejects a frozenset.
# NOTE(review): stop words are matched after stemming, so stemmed forms of
# stop words may slip through — confirm whether that matters here.
df['body'] = df['body'].apply(preprocess)
vectorizer_body = TfidfVectorizer(stop_words='english')
# fit_transform learns the vocabulary and encodes the same text in one pass
vector_body = vectorizer_body.fit_transform(df['body'])
# summarize encoded vector
print(vector_body.shape)

df['title'] = df['title'].apply(preprocess)
vectorizer_title = TfidfVectorizer(stop_words='english')
vector_title = vectorizer_title.fit_transform(df['title'])
# summarize encoded vector
print(vector_title.shape)

(82093, 59589)
(82093, 10429)


In [124]:
# Show the container type of each TF-IDF block before stacking them.
for tfidf_block in (vector_title, vector_body):
    print(type(tfidf_block))

# Title and body features side by side.
sparse_data = hstack((vector_title, vector_body))
print(sparse_data.shape)

# TODO: these should be numerical features before combining
# Store the resolver as a categorical column plus its integer code; the code
# is not appended to the feature matrix, so the shape printed next is unchanged.
df['completed_by'] = df['completed_by'].astype('category')
df['completed_by_encode'] = df['completed_by'].cat.codes
print(sparse_data.shape)

# Append every one-hot indicator column (names starting with "product" or
# "component") and convert to CSR so rows can be sliced later.
filter_cols = [col for col in df if col.startswith(('product', 'component'))]
one_hot_block = df[filter_cols].to_numpy()
sparse_data = hstack((sparse_data, one_hot_block)).tocsr()
print(sparse_data.shape)

<class 'scipy.sparse.csr.csr_matrix'>
<class 'scipy.sparse.csr.csr_matrix'>
(82093, 70018)
(82093, 70018)
(82093, 71023)


# Machine Learning

In [125]:
# Positional hold-out split: the first `size` rows train the model and the
# remaining rows are the test set. Rows keep the closed_date ordering set
# when the frame was built, so the test set holds the latest-closed issues.
size = 80000
X_train = sparse_data[:size]
X_test = sparse_data[size:]
y_train = df['completed_by'][:size]
y_test = df['completed_by'][size:]
# Note: an earlier variant built X_train / X_test from the DataFrame itself by
# dropping the text, date and label columns; it was disabled in favour of the
# sparse feature matrix used above.

In [126]:
def is_correct_top_k(pred_prob, k, actual, labels):
    """Return True when `actual` is among the k most probable labels.

    `pred_prob[i]` is the score assigned to `labels[i]`. The k positions with
    the highest scores are selected and `actual` is checked against the
    labels found at those positions.
    """
    best_positions = nlargest(k, range(len(pred_prob)), key=lambda pos: pred_prob[pos])
    return actual in {labels[pos] for pos in best_positions}

In [None]:
# Train a multi-layer perceptron with two hidden layers of 25 units and
# report top-1 through top-5 accuracy on the held-out rows.
classifier = MLPClassifier(hidden_layer_sizes=(25,25))
classifier.fit(X_train,y_train)
classes = classifier.classes_
print("Top 1 Accuracy: " + str(classifier.score(X_test, y_test)))
pred_probs = classifier.predict_proba(X_test)

# Pair each probability row with its true label by position, so the check
# does not depend on y_test's index labels starting exactly at `size`.
actual_labels = y_test.tolist()
for k in range(2, 6):
    correct = sum(
        is_correct_top_k(pred, k, label, classes)
        for pred, label in zip(pred_probs, actual_labels)
    )
    print ("Top " + str(k) + " Accuracy: " + str((correct/len(y_test))))

In [None]:
# Disabled experiment: Gaussian naive Bayes with top-1..top-5 accuracy.
# The code is wrapped in a bare triple-quoted string, so nothing below runs;
# the cell only evaluates (and echoes) the string itself.
# NOTE(review): presumably disabled because GaussianNB needs a dense array
# while X_train is a sparse matrix — confirm before re-enabling.
'''
classifier = GaussianNB()
classifier.fit(X_train,y_train)
classes = classifier.classes_
print("Top 1 Accuracy: " + str(classifier.score(X_test, y_test)))
pred_probs = classifier.predict_proba(X_test)
correct = 0
for idx, pred in enumerate(pred_probs):
    if is_correct_top_k(pred, 2, y_test[size + idx], classes):
        correct += 1
print ("Top 2 Accuracy: " + str((correct/len(y_test))))
correct = 0
for idx, pred in enumerate(pred_probs):
    if is_correct_top_k(pred, 3, y_test[size + idx], classes):
        correct += 1
print ("Top 3 Accuracy: " + str((correct/len(y_test))))
correct = 0
for idx, pred in enumerate(pred_probs):
    if is_correct_top_k(pred, 4, y_test[size + idx], classes):
        correct += 1
print ("Top 4 Accuracy: " + str((correct/len(y_test))))
correct = 0
for idx, pred in enumerate(pred_probs):
    if is_correct_top_k(pred, 5, y_test[size + idx], classes):
        correct += 1
print ("Top 5 Accuracy: " + str((correct/len(y_test))))
'''

In [None]:
# Disabled experiment: Bernoulli naive Bayes with top-1..top-5 accuracy.
# The code is wrapped in a bare triple-quoted string, so nothing below runs;
# the cell only evaluates (and echoes) the string itself.
# NOTE(review): the reason this model was switched off is not recorded here.
'''
classifier = BernoulliNB()
classifier.fit(X_train,y_train)
classes = classifier.classes_
print("Top 1 Accuracy: " + str(classifier.score(X_test, y_test)))
pred_probs = classifier.predict_proba(X_test)
correct = 0
for idx, pred in enumerate(pred_probs):
    if is_correct_top_k(pred, 2, y_test[size + idx], classes):
        correct += 1
print ("Top 2 Accuracy: " + str((correct/len(y_test))))
correct = 0
for idx, pred in enumerate(pred_probs):
    if is_correct_top_k(pred, 3, y_test[size + idx], classes):
        correct += 1
print ("Top 3 Accuracy: " + str((correct/len(y_test))))
correct = 0
for idx, pred in enumerate(pred_probs):
    if is_correct_top_k(pred, 4, y_test[size + idx], classes):
        correct += 1
print ("Top 4 Accuracy: " + str((correct/len(y_test))))
correct = 0
for idx, pred in enumerate(pred_probs):
    if is_correct_top_k(pred, 5, y_test[size + idx], classes):
        correct += 1
print ("Top 5 Accuracy: " + str((correct/len(y_test))))
'''

In [None]:
# Train a 3-nearest-neighbours classifier and report top-1 through top-5
# accuracy on the held-out rows.
classifier = KNeighborsClassifier(n_neighbors=3)
classifier.fit(X_train, y_train)
classes = classifier.classes_
print("Top 1 Accuracy: " + str(classifier.score(X_test, y_test)))
pred_probs = classifier.predict_proba(X_test)

# Pair each probability row with its true label by position, so the check
# does not depend on y_test's index labels starting exactly at `size`.
actual_labels = y_test.tolist()
for k in range(2, 6):
    correct = sum(
        is_correct_top_k(pred, k, label, classes)
        for pred, label in zip(pred_probs, actual_labels)
    )
    print ("Top " + str(k) + " Accuracy: " + str((correct/len(y_test))))

In [None]:
# Disabled experiment: random forest (1000 trees) with top-1..top-5 accuracy.
# The code is wrapped in a bare triple-quoted string, so nothing below runs;
# the cell only evaluates (and echoes) the string itself.
# NOTE(review): presumably switched off because of training cost — confirm.
'''
classifier = RandomForestClassifier(n_estimators=1000)
classifier.fit(X_train, y_train)
classes = classifier.classes_
print("Top 1 Accuracy: " + str(classifier.score(X_test, y_test)))
pred_probs = classifier.predict_proba(X_test)
correct = 0
for idx, pred in enumerate(pred_probs):
    if is_correct_top_k(pred, 2, y_test[size + idx], classes):
        correct += 1
print ("Top 2 Accuracy: " + str((correct/len(y_test))))
correct = 0
for idx, pred in enumerate(pred_probs):
    if is_correct_top_k(pred, 3, y_test[size + idx], classes):
        correct += 1
print ("Top 3 Accuracy: " + str((correct/len(y_test))))
correct = 0
for idx, pred in enumerate(pred_probs):
    if is_correct_top_k(pred, 4, y_test[size + idx], classes):
        correct += 1
print ("Top 4 Accuracy: " + str((correct/len(y_test))))
correct = 0
for idx, pred in enumerate(pred_probs):
    if is_correct_top_k(pred, 5, y_test[size + idx], classes):
        correct += 1
print ("Top 5 Accuracy: " + str((correct/len(y_test))))
'''

In [None]:
#TODO: try SVC (might need to do something else)
# Disabled experiment: support vector classifier with probability estimates
# enabled, evaluated for top-1..top-5 accuracy. The code is wrapped in a bare
# triple-quoted string, so nothing below runs; the cell only evaluates (and
# echoes) the string itself.
'''
classifier = SVC(probability=True)
classifier.fit(X_train, y_train)
classes = classifier.classes_
print("Top 1 Accuracy: " + str(classifier.score(X_test, y_test)))
pred_probs = classifier.predict_proba(X_test)
correct = 0
for idx, pred in enumerate(pred_probs):
    if is_correct_top_k(pred, 2, y_test[size + idx], classes):
        correct += 1
print ("Top 2 Accuracy: " + str((correct/len(y_test))))
correct = 0
for idx, pred in enumerate(pred_probs):
    if is_correct_top_k(pred, 3, y_test[size + idx], classes):
        correct += 1
print ("Top 3 Accuracy: " + str((correct/len(y_test))))
correct = 0
for idx, pred in enumerate(pred_probs):
    if is_correct_top_k(pred, 4, y_test[size + idx], classes):
        correct += 1
print ("Top 4 Accuracy: " + str((correct/len(y_test))))
correct = 0
for idx, pred in enumerate(pred_probs):
    if is_correct_top_k(pred, 5, y_test[size + idx], classes):
        correct += 1
print ("Top 5 Accuracy: " + str((correct/len(y_test))))
'''

# Analysis

In [None]:
# Refit the two-hidden-layer MLP for the analysis below and keep its hard
# (top-1) predictions for the held-out rows.
classifier = MLPClassifier(hidden_layer_sizes=(25,25)).fit(X_train, y_train)
preds = classifier.predict(X_test)
print(preds)

In [None]:
# Distribution of the true resolvers in the test set.
# Series.value_counts replaces the deprecated top-level pd.value_counts.
actual = y_test
actual.value_counts().plot.bar(title="Test-set issues per developer (actual)")

In [None]:
# Distribution of resolvers over every issue kept in df.
# Series.value_counts replaces the deprecated top-level pd.value_counts.
df['completed_by'].value_counts().plot.bar(title="Issues per developer (all data)")

In [None]:
# Distribution of the model's predicted resolvers for the test set.
# preds is wrapped in a Series because the top-level pd.value_counts helper
# is deprecated.
pd.Series(preds).value_counts().plot.bar(title="Test-set issues per developer (predicted)")

In [None]:
# For every developer, compare how often the model predicted them with how
# often they actually resolved a test issue, as a percentage of the actual
# count. Developers the model never predicted are collected in `missing`.
overall_counts = df['completed_by'].value_counts()
# list of devs sorted in order of highest contribution
sorted_dev_list = overall_counts.index.values
pred_counts = pd.Series(preds).value_counts()
actual_counts = pd.Series(actual).value_counts()

percent_diff = {}
vals = []
counts = []
missing = set()
for dev in sorted_dev_list:
    if dev not in pred_counts:
        missing.add(dev)
        continue
    n_actual = actual_counts.get(dev, 0)
    if n_actual == 0:
        # Predicted at least once but never the true resolver: the relative
        # difference is unbounded. Use an explicit inf instead of relying on
        # a numpy divide-by-zero (or a KeyError when the dev is absent).
        val = float("inf")
    else:
        val = 100 * (pred_counts[dev] - n_actual) / n_actual
    percent_diff[dev] = val
    counts.append(overall_counts[dev])
    vals.append(val)
print("devs not included in predictions: ")
print(list(missing))

In [None]:
# One bar per developer showing the percent difference between predicted and
# actual assignment counts.
dev_names = list(percent_diff.keys())
dev_diffs = list(percent_diff.values())
plt.bar(dev_names, dev_diffs)
plt.xticks(rotation=90)
plt.title("Percent Difference Prediction Rate vs. Actual Rate")
plt.show()

In [None]:
# Show the raw points, then drop outliers: infinite percent differences and
# anything above 1100 are excluded from the fit.
print(counts)
print(vals)
kept_points = [
    (count, val)
    for count, val in zip(counts, vals)
    if not (val == float("inf") or val > 1100)
]
counts2 = [count for count, _ in kept_points]
vals2 = [val for _, val in kept_points]
print(counts2)
print(vals2)
counts = counts2
vals = vals2

# Degree-1 least-squares fit of percent difference against issue count.
z = np.polyfit(x=counts, y=vals, deg=1)
p = np.poly1d(z)
trend_line = p(counts)

# Goodness of fit: regression sum of squares over total sum of squares.
yhat = trend_line
ybar = np.sum(vals)/len(vals)
ssreg = np.sum((yhat-ybar)**2)
sstot = np.sum((vals - ybar)**2)
print("R^2: " + str(ssreg / sstot))

# Scatter of the retained points with the fitted line drawn on top.
plt.scatter(counts, vals)
plt.plot(counts, trend_line)
plt.title("Percent Difference Between Prediction Rate and Actual Rate vs. Issue Count")
plt.xlabel("Number of Issues Solved")
plt.ylabel("% diff. pred rate and actual rate")
plt.xticks(rotation=90)
plt.show()

In [None]:
# TODO: try assigning the open issues and see what the overspecialization problem would look like.

In [None]:
# First rows of the held-out portion of df. The slice is taken directly from
# df because no executed cell in this notebook assigns `test_df`.
df.iloc[size:].head(5)

In [None]:
# Last rows of the held-out portion of df. The slice is taken directly from
# df because no executed cell in this notebook assigns `test_df`.
df.iloc[size:].tail(5)