In [92]:
import json
import matplotlib.pyplot as plt
import numpy as np
import operator
import pandas as pd
import re
import seaborn as sns

from classifier import *
from heapq import nlargest
from issues import get_num_code_lines
from nltk.stem import PorterStemmer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, r2_score
from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Pre-Processing

In [93]:
# Load the labeled issue dump. The encoding is pinned to UTF-8 (the JSON
# interchange encoding) rather than left to the platform default; label names
# shown later in this notebook contain non-ASCII symbols.
with open('../data/flutter/flutter_issues_labeled.json', encoding='utf-8') as json_data:
    issues = json.load(json_data)

In [94]:
# Report the size of the raw dump, then keep only the issues that record at
# least one completer and report how many remain.
print("Number of issues: " + str(len(issues)))
labeled_issues = list(filter(lambda issue: len(issue['completed_by']) > 0, issues))
print("Number of labeled issues: " + str(len(labeled_issues)))

Number of issues: 7170
Number of labeled issues: 2504


In [95]:
# create a data frame from the list of issues, one row per labeled issue,
# ordered by close time
# NOTE(review): the first 25 labeled issues are skipped; the reason is not
# visible in this notebook — confirm the offset is still wanted.
df_list = []
for issue in labeled_issues[25:]:
    df_list.append({
        'comments': issue['comments'],
        'title': issue['title'],
        # a missing/empty body becomes "" without rewriting the loaded record
        'body': issue['body'] or "",
        'closed_date': pd.to_datetime(issue['closed_at']),
        'created_date': pd.to_datetime(issue['created_at']),
        'completed_by': issue['completed_by'],
        'labels': [label['name'] for label in issue['labels']],
        'assignees': [assignee['login'] for assignee in issue['assignees']],
    })
df = pd.DataFrame(df_list).sort_values('closed_date')
df.head(2)

Unnamed: 0,assignees,body,closed_date,comments,completed_by,created_date,labels,title
1,[abarth],When I try to `flutter start` any of the examp...,2015-11-09 20:13:32,3,"[abarth, DanTup]",2015-11-08 20:33:37,"[easy fix, tool]",`flutter start` doesn't give good error messag...
6,[],"<a href=""https://github.com/Hixie""><img src=""h...",2015-11-09 20:18:24,1,[yjbanov],2015-11-09 20:16:52,"[framework, severe: new feature]",Swipe to change between Tabs


In [96]:
# Compare the size of the full frame with the subset of issues that have at
# least one assignee, then preview that subset.
print(len(df))
has_assignee = df['assignees'].map(len) != 0
df_temp = df[has_assignee].reset_index(drop=True)
print(len(df_temp))
df_temp.head(10)

2479
1540
1540


Unnamed: 0,assignees,body,closed_date,comments,completed_by,created_date,labels,title
0,[abarth],When I try to `flutter start` any of the examp...,2015-11-09 20:13:32,3,"[abarth, DanTup]",2015-11-08 20:33:37,"[easy fix, tool]",`flutter start` doesn't give good error messag...
1,[abarth],flutter start --debug\nsevere: To copy files t...,2015-11-09 21:43:40,1,[abarth],2015-11-09 20:50:45,[tool],Exception running source build of engine on Mac
2,[collinjackson],cc @tvolkert \n,2015-11-10 00:40:38,1,[collinjackson],2015-11-09 21:07:13,[],Implement Dart's HTTP package in Flutter with ...
3,[abarth],When the widget is created with an initial str...,2015-11-11 17:56:29,3,[abarth],2015-11-11 00:57:05,"[a: text input, framework, severe: regression]",Cursor in TextInput is displayed incorrectly
4,[abarth],In components that take a `List<T>` and an `It...,2015-11-12 20:25:12,0,[abarth],2015-11-12 06:10:10,"[easy fix, framework]",ItemBuilder should also get the index of the i...
5,[HansMuller],cc @HansMuller \n,2015-11-13 00:40:47,0,[HansMuller],2015-11-12 20:27:39,"[framework, severe: regression, ⚠ TODAY]",closing the menu in stocks throws an exception
6,[abarth],This is similar to an old issue from the old i...,2015-11-13 00:58:57,4,[abarth],2015-11-12 22:25:34,"[framework, severe: regression]",Updated Widget is sized wrong (but placed corr...
7,[abarth],RenderEditableParagraph width looks as if it w...,2015-11-13 19:19:02,7,[abarth],2015-11-11 11:50:37,"[a: text input, framework]","When there's no text in an input widget, the c..."
8,[HansMuller],Currently if you tap on a stock row (to bring ...,2015-11-16 20:00:10,0,[HansMuller],2015-11-16 19:28:51,[severe: regression],Stocks demo tap fails if BottomSheet is visible
9,[collinjackson],,2015-11-16 22:35:59,0,"[devoncarew, collinjackson]",2015-11-13 23:19:50,"[easy fix, tool]",flutter logs --c should be a synonym for flutt...


In [60]:
# count number of issues with multiple completers
counts = {}
for completers in df['completed_by']:
    n_completers = len(completers)
    counts[n_completers] = counts.get(n_completers, 0) + 1
print(counts)
#TODO: confirm that filtering "noise" is best strategy here
print("Number of total issues: " + str(len(df)))
# keep issues with exactly one completer and unwrap that name from its list
single_solver = df['completed_by'].map(len) == 1
df = df[single_solver].reset_index(drop=True)
df['completed_by'] = df['completed_by'].map(lambda solvers: solvers[0])
print("Number of issues with single solver: " + str(len(df)))

{2: 197, 1: 2268, 3: 14}
Number of total issues: 2479
Number of issues with single solver: 2268


In [61]:
# one hot encode the label column: the list-valued 'labels' column is removed
# from the frame and replaced by one 0/1 column per label seen during fitting
mlb = MultiLabelBinarizer()
label_lists = df.pop('labels')
label_matrix = mlb.fit_transform(label_lists)
label_df = pd.DataFrame(label_matrix, columns=mlb.classes_, index=df.index)
df = df.join(label_df)
df.head(2)

Unnamed: 0,assignees,body,closed_date,comments,completed_by,created_date,title,a: accessibility,a: animation,a: china,...,team: gallery,tool,waiting for PR to land (fixed),waiting for customer response,⌘‬ platform-mac,⌺‬ platform-ios,▣ platform-android,○ platform-fuchsia,⚠ TODAY,❖ platform-windows
0,[],"<a href=""https://github.com/Hixie""><img src=""h...",2015-11-09 20:18:24,1,yjbanov,2015-11-09 20:16:52,Swipe to change between Tabs,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,[abarth],flutter start --debug\nsevere: To copy files t...,2015-11-09 21:43:40,1,abarth,2015-11-09 20:50:45,Exception running source build of engine on Mac,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [62]:
def preprocess(text, stemmer=None):
    """Normalize free text into a space-separated string of stemmed words.

    The text is split on runs of non-word characters, only purely alphabetic
    tokens are kept (tokens containing digits or underscores are dropped),
    camel-case tokens are split apart, and every resulting word is stemmed.

    Parameters
    ----------
    text : str
        Raw text to normalize. ``None`` or an empty string yields ``""``.
    stemmer : object, optional
        Any object exposing ``stem(word)``. When omitted a new
        ``PorterStemmer`` is created, as before.

    Returns
    -------
    str
        The stemmed words joined by single spaces.
    """
    if not text:
        return ""
    if stemmer is None:
        stemmer = PorterStemmer()
    # raw string: '\W' inside a plain literal is an invalid escape sequence
    alpha_only = ' '.join(word for word in re.split(r'\W+', text) if word.isalpha())
    # split camel case words apart (necessary for embedded code); the (?!^)
    # lookahead keeps a capitalized word at the very start from gaining a space
    spaced = re.sub(r'(?!^)([A-Z][a-z]+)', r' \1', alpha_only)
    # apply stemmer to all words
    return ' '.join(stemmer.stem(word) for word in spaced.split())

In [63]:
# use tf-idf w/ stemming, stop-word removal, and non-alphabetic word removal to generate features
# body text: normalize, then learn the vocabulary and encode in one step
df['body'] = df['body'].map(preprocess)
vectorizer_body = TfidfVectorizer(stop_words=ENGLISH_STOP_WORDS)
vector_body = vectorizer_body.fit_transform(df['body'])
# summarize encoded vector
print(vector_body.shape)
# title text: same treatment with its own vectorizer
df['title'] = df['title'].map(preprocess)
vectorizer_title = TfidfVectorizer(stop_words=ENGLISH_STOP_WORDS)
vector_title = vectorizer_title.fit_transform(df['title'])
# summarize encoded vector
print(vector_title.shape)

(2268, 4960)
(2268, 1804)


In [64]:
# Densify both tf-idf matrices and append them column-wise to the issue frame
# (title features first, then body features).
title_df, body_df = (pd.DataFrame(matrix.toarray()) for matrix in (vector_title, vector_body))
df = pd.concat([df, title_df, body_df], axis=1)
df.head(2)

Unnamed: 0,assignees,body,closed_date,comments,completed_by,created_date,title,a: accessibility,a: animation,a: china,...,4950,4951,4952,4953,4954,4955,4956,4957,4958,4959
0,[],a href http github com hixi img src http avata...,2015-11-09 20:18:24,1,yjbanov,2015-11-09 20:16:52,swipe to chang between tab,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,[abarth],flutter start debug sever To copi file to io d...,2015-11-09 21:43:40,1,abarth,2015-11-09 20:50:45,except run sourc build of engin on mac,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [65]:
# get list of devs who solve an issue in last three months (from end of train set)
# NOTE(review): the window only has a lower bound (90 days before row 2000), so
# issues closed after row 2000 are counted as well — confirm this is intended.
filter_date = df.iloc[2000]['closed_date'] - pd.to_timedelta(90, unit='d')
recent_solvers = df.loc[df['closed_date'] > filter_date, 'completed_by']
dev_counts = {}
for dev in recent_solvers:
    dev_counts[dev] = dev_counts.get(dev, 0) + 1
print(dev_counts)
# a dev is "active" with at least three solved issues inside the window
active_devs = {name for name, n_solved in dev_counts.items() if n_solved >= 3}
print(active_devs)
# remove all issues not solved by an active dev
solved_by_active = df['completed_by'].isin(active_devs)
df = df[solved_by_active].reset_index(drop=True)
print(len(df))

{'jason-simmons': 21, 'aam': 27, 'yjbanov': 16, 'cbracken': 34, 'Hixie': 33, 'gspencergoog': 10, 'xster': 50, 'goderbauer': 41, 'a-siva': 1, 'chinmaygarde': 1, 'mravn-google': 21, 'HansMuller': 35, 'abarth': 1, 'tvolkert': 17, 'amirh': 4, 'Skylled': 1, 'leobispo': 1, 'sigurdm': 1, 'mehmetf': 1, 'fredriks': 1, 'jakobr-google': 1, 'crelier': 1, 'alibitek': 1, 'mit-mit': 3, 'jcollins-g': 11, 'dnfield': 2, 'DanTup': 5, 'mraleph': 4, 'rmacnak-google': 1, 'DaveShuckerow': 2, 'szakarias': 3, 'xqwzts': 5, 'jonahwilliams': 27, 'danrubel': 1, 'konifar': 2, 'sroddy': 1, 'matanlurey': 1, 'srawlins': 1, 'sbaranov': 1, 'OhadRau': 1, 'scheglov': 2, 'filleduchaos': 1, 'blasten': 2, 'mattsarett': 1, 'devoncarew': 6, 'liyuqian': 1, 'slightfoot': 2, 'ng1905': 1, 'fmatosqg': 2, 'paulcbetts': 1}
{'mraleph', 'xster', 'jason-simmons', 'mravn-google', 'yjbanov', 'szakarias', 'jonahwilliams', 'gspencergoog', 'Hixie', 'tvolkert', 'amirh', 'HansMuller', 'mit-mit', 'goderbauer', 'jcollins-g', 'DanTup', 'devoncare

# Machine Learning

In [66]:
# Positional split: the first 1100 rows train the models, the rest are held out.
# Everything except the one-hot label columns and tf-idf columns is dropped
# from the feature matrices; 'completed_by' is the target.
non_feature_cols = ['body', 'closed_date', 'completed_by', 'created_date', 'title', 'comments', 'assignees']
train_df, test_df = df.iloc[:1100], df.iloc[1100:]
X_train, y_train = train_df.drop(non_feature_cols, axis=1), train_df['completed_by']
X_test, y_test = test_df.drop(non_feature_cols, axis=1), test_df['completed_by']
print(len(X_train))
print(len(X_test))

1100
337


In [67]:
def is_correct_top_k(pred_prob, k, actual, labels):
    """Return True when `actual` is one of the `k` highest-scored labels.

    `pred_prob[i]` is the score for `labels[i]`. Positions are ranked by
    descending score; equal scores keep their original relative order.
    """
    ranked = sorted(range(len(pred_prob)), key=lambda pos: pred_prob[pos], reverse=True)
    return actual in {labels[pos] for pos in ranked[:k]}

In [68]:
# Train an MLP and report top-1 through top-5 accuracy on the held-out rows.
classifier = MLPClassifier(hidden_layer_sizes=(25,25))
classifier.fit(X_train, y_train)
classes = classifier.classes_
print("Top 1 Accuracy: " + str(classifier.score(X_test, y_test)))
pred_probs = classifier.predict_proba(X_test)
for k in range(2, 6):
    correct = 0
    for idx, pred in enumerate(pred_probs):
        # positional lookup: row idx of pred_probs corresponds to row idx of
        # y_test, independent of where the train/test split was made
        if is_correct_top_k(pred, k, y_test.iloc[idx], classes):
            correct += 1
    print("Top " + str(k) + " Accuracy: " + str(correct / len(y_test)))

Top 1 Accuracy: 0.3264094955489614
Top 2 Accuracy: 0.4421364985163205
Top 3 Accuracy: 0.5074183976261127
Top 4 Accuracy: 0.5667655786350149
Top 5 Accuracy: 0.6142433234421365


In [69]:
# Train Gaussian naive Bayes and report top-1 through top-5 accuracy on the held-out rows.
classifier = GaussianNB()
classifier.fit(X_train, y_train)
classes = classifier.classes_
print("Top 1 Accuracy: " + str(classifier.score(X_test, y_test)))
pred_probs = classifier.predict_proba(X_test)
for k in range(2, 6):
    correct = 0
    for idx, pred in enumerate(pred_probs):
        # positional lookup: row idx of pred_probs corresponds to row idx of
        # y_test, independent of where the train/test split was made
        if is_correct_top_k(pred, k, y_test.iloc[idx], classes):
            correct += 1
    print("Top " + str(k) + " Accuracy: " + str(correct / len(y_test)))

Top 1 Accuracy: 0.21364985163204747
Top 2 Accuracy: 0.228486646884273
Top 3 Accuracy: 0.2818991097922849
Top 4 Accuracy: 0.314540059347181
Top 5 Accuracy: 0.3649851632047478


In [70]:
# Train Bernoulli naive Bayes and report top-1 through top-5 accuracy on the held-out rows.
classifier = BernoulliNB()
classifier.fit(X_train, y_train)
classes = classifier.classes_
print("Top 1 Accuracy: " + str(classifier.score(X_test, y_test)))
pred_probs = classifier.predict_proba(X_test)
for k in range(2, 6):
    correct = 0
    for idx, pred in enumerate(pred_probs):
        # positional lookup: row idx of pred_probs corresponds to row idx of
        # y_test, independent of where the train/test split was made
        if is_correct_top_k(pred, k, y_test.iloc[idx], classes):
            correct += 1
    print("Top " + str(k) + " Accuracy: " + str(correct / len(y_test)))

Top 1 Accuracy: 0.14540059347181009
Top 2 Accuracy: 0.22551928783382788
Top 3 Accuracy: 0.26409495548961426
Top 4 Accuracy: 0.34718100890207715
Top 5 Accuracy: 0.4391691394658754


In [71]:
# Train a 3-nearest-neighbours classifier and report top-1 through top-5 accuracy on the held-out rows.
classifier = KNeighborsClassifier(n_neighbors=3)
classifier.fit(X_train, y_train)
classes = classifier.classes_
print("Top 1 Accuracy: " + str(classifier.score(X_test, y_test)))
pred_probs = classifier.predict_proba(X_test)
for k in range(2, 6):
    correct = 0
    for idx, pred in enumerate(pred_probs):
        # positional lookup: row idx of pred_probs corresponds to row idx of
        # y_test, independent of where the train/test split was made
        if is_correct_top_k(pred, k, y_test.iloc[idx], classes):
            correct += 1
    print("Top " + str(k) + " Accuracy: " + str(correct / len(y_test)))

Top 1 Accuracy: 0.26409495548961426
Top 2 Accuracy: 0.34718100890207715
Top 3 Accuracy: 0.3827893175074184
Top 4 Accuracy: 0.41839762611275966
Top 5 Accuracy: 0.45103857566765576


In [72]:
# Train a 1000-tree random forest and report top-1 through top-5 accuracy on the held-out rows.
# n_jobs=-1 builds the trees on all available cores; the recorded run of this
# cell ended in a KeyboardInterrupt.
classifier = RandomForestClassifier(n_estimators=1000, n_jobs=-1)
classifier.fit(X_train, y_train)
classes = classifier.classes_
print("Top 1 Accuracy: " + str(classifier.score(X_test, y_test)))
pred_probs = classifier.predict_proba(X_test)
for k in range(2, 6):
    correct = 0
    for idx, pred in enumerate(pred_probs):
        # positional lookup: row idx of pred_probs corresponds to row idx of
        # y_test, independent of where the train/test split was made
        if is_correct_top_k(pred, k, y_test.iloc[idx], classes):
            correct += 1
    print("Top " + str(k) + " Accuracy: " + str(correct / len(y_test)))

KeyboardInterrupt: 

In [None]:
# Disabled SVC experiment: everything below is wrapped in a triple-quoted
# string, so none of it executes; the cell merely evaluates to that string.
'''
#TODO: try SVC (might need to do something else)
classifier = SVC(probability=True)
classifier.fit(X_train, y_train)
classes = classifier.classes_
print("Top 1 Accuracy: " + str(classifier.score(X_test, y_test)))
pred_probs = classifier.predict_proba(X_test)
correct = 0
for idx, pred in enumerate(pred_probs):
    if is_correct_top_k(pred, 2, y_test[1100 + idx], classes):
        correct += 1
print ("Top 2 Accuracy: " + str((correct/len(y_test))))
correct = 0
for idx, pred in enumerate(pred_probs):
    if is_correct_top_k(pred, 3, y_test[1100 + idx], classes):
        correct += 1
print ("Top 3 Accuracy: " + str((correct/len(y_test))))
correct = 0
for idx, pred in enumerate(pred_probs):
    if is_correct_top_k(pred, 4, y_test[1100 + idx], classes):
        correct += 1
print ("Top 4 Accuracy: " + str((correct/len(y_test))))
correct = 0
for idx, pred in enumerate(pred_probs):
    if is_correct_top_k(pred, 5, y_test[1100 + idx], classes):
        correct += 1
print ("Top 5 Accuracy: " + str((correct/len(y_test))))
'''

# Analysis

In [None]:
# Refit the MLP on the training rows and keep its hard predictions for the
# held-out rows; `classifier` and `preds` are reused by later cells.
classifier = MLPClassifier(hidden_layer_sizes=(25,25)).fit(X_train, y_train)
preds = classifier.predict(X_test)
print(preds)

In [None]:
# Distribution of the true solvers in the held-out rows.
actual = test_df['completed_by']
# Series.value_counts replaces the deprecated top-level pd.value_counts
actual.value_counts().plot.bar()

In [None]:
# Distribution of solvers over the whole filtered frame.
# Series.value_counts replaces the deprecated top-level pd.value_counts
df['completed_by'].value_counts().plot.bar()

In [None]:
# Distribution of the predicted solvers. `preds` is wrapped in a Series so the
# non-deprecated Series.value_counts can be used instead of pd.value_counts.
pd.Series(preds).value_counts().plot.bar()

In [None]:
# Per developer: percent difference between how often they were predicted and
# how often they actually solved an issue in the held-out rows.
# Series.value_counts replaces the deprecated top-level pd.value_counts.
overall_counts = df['completed_by'].value_counts()
# list of devs sorted in order of highest contribution
sorted_dev_list = overall_counts.index.values
pred_counts = pd.Series(preds).value_counts()
actual_counts = actual.value_counts()

percent_diff = {}
vals = []
counts = []
missing = set()
no_actual = set()
for dev in sorted_dev_list:
    if dev not in pred_counts:
        missing.add(dev)
        continue
    n_actual = actual_counts.get(dev, 0)
    if n_actual == 0:
        # predicted at least once but never the true solver in the held-out
        # rows: the percent difference is undefined, so skip instead of
        # raising KeyError on the missing count
        no_actual.add(dev)
        continue
    val = 100 * (pred_counts[dev] - n_actual) / n_actual
    percent_diff[dev] = val
    counts.append(overall_counts[dev])
    vals.append(val)
print("devs not included in predictions: ")
print(list(missing))
if no_actual:
    print("devs predicted but absent from the test labels: ")
    print(list(no_actual))

In [None]:
# Bar chart of the per-developer percent difference, one bar per developer.
dev_names = percent_diff.keys()
dev_diffs = percent_diff.values()
plt.bar(dev_names, dev_diffs)
plt.title("Percent Difference Prediction Rate vs. Actual Rate")
plt.xticks(rotation=90)
plt.show()

In [None]:
# remove the outlier
# NOTE(review): position 2 is singled out by hand — confirm it still points at
# the intended developer whenever the upstream data changes.
outlier_pos = 2
# filter into new sequences instead of `del`-ing from `vals`/`counts` in place,
# so re-running this cell does not discard one more point each time
fit_counts = [c for pos, c in enumerate(counts) if pos != outlier_pos]
fit_vals = np.array([v for pos, v in enumerate(vals) if pos != outlier_pos], dtype=float)
# create best fit line
z = np.polyfit(x=fit_counts, y=fit_vals, deg=1)
p = np.poly1d(z)
trend_line = p(fit_counts)
# test best fit: ratio of the regression sum of squares to the total sum of squares
yhat = trend_line
ybar = np.sum(fit_vals)/len(fit_vals)
ssreg = np.sum((yhat-ybar)**2)
sstot = np.sum((fit_vals - ybar)**2)
print("R^2: " + str(ssreg / sstot))
# create plots
plt.scatter(fit_counts, fit_vals)
plt.title("Percent Difference Between Prediction Rate and Actual Rate vs. Issue Count")
plt.xticks(rotation=90)
plt.xlabel("Number of Issues Solved")
plt.ylabel("% diff. pred rate and actual rate")
plt.plot(fit_counts, trend_line)
plt.show()

In [None]:
#TODO: look at assigning open issues and seeing what overspecialization problem would look like.

In [None]:
# first five held-out rows (five is the default row count)
test_df.head()

In [None]:
# last five held-out rows (five is the default row count)
test_df.tail()

## Open Issues

In [None]:
# Load the open-issue dump. The encoding is pinned to UTF-8 (the JSON
# interchange encoding) rather than left to the platform default.
with open('../data/flutter/flutter_issues_open.json', encoding='utf-8') as json_data:
    issues = json.load(json_data)
print("Number of open issues: " + str(len(issues)))

In [None]:
# create a data frame from the list of issues
# (no 'closed_date' or 'completed_by' columns are built for the open issues)
# Only labels the binarizer was fitted on are kept. The membership test must
# use the label's name: each `label` is a dict and would never match a class
# name, which silently emptied every label list.
known_labels = set(mlb.classes_)
df_list = []
for issue in issues:
    df_list.append({
        'comments': issue['comments'],
        'title': issue['title'],
        # a missing/empty body becomes "" without rewriting the loaded record
        'body': issue['body'] or "",
        'created_date': pd.to_datetime(issue['created_at']),
        'labels': [label['name'] for label in issue['labels'] if label['name'] in known_labels],
    })
# reset the index after sorting so later column-wise concatenation with
# freshly built frames lines up row by row
df = pd.DataFrame(df_list).sort_values('created_date').reset_index(drop=True)
df.head(2)

In [None]:
# one hot encode the label column with the binarizer fitted earlier, so the
# open issues get the same label columns as the training frame
label_lists = df.pop('labels')
label_df = pd.DataFrame(mlb.transform(label_lists), columns=mlb.classes_, index=df.index)
df = df.join(label_df)
df.head(2)

In [None]:
# use tf-idf w/ stemming, stop-word removal, and non-alphabetic word removal to generate features
# the vectorizers fitted earlier are reused (transform only, no refit)
df['body'] = df['body'].map(preprocess)
vector_body = vectorizer_body.transform(df['body'])
# summarize encoded vector
print(vector_body.shape)
df['title'] = df['title'].map(preprocess)
vector_title = vectorizer_title.transform(df['title'])
# summarize encoded vector
print(vector_title.shape)

In [None]:
# Append the dense tf-idf features to the open-issue frame.
# The feature frames are given df's own index: concat(axis=1) aligns on index
# labels, and df was sorted by 'created_date', so a default 0..n-1 index on the
# feature frames could pair features with the wrong issue.
title_df = pd.DataFrame(vector_title.todense(), index=df.index)
body_df = pd.DataFrame(vector_body.todense(), index=df.index)
df = pd.concat([df, title_df, body_df], axis=1)
df.head(2)

In [None]:
# Build the feature matrix for the open issues and predict a solver for each.
# errors='ignore' is required because this frame was built without
# 'closed_date' and 'completed_by'; a plain drop raises KeyError on them.
non_feature_cols = ['body', 'closed_date', 'completed_by', 'created_date', 'title', 'comments']
X = df.drop(non_feature_cols, axis=1, errors='ignore')
print(X.shape)
preds = classifier.predict(X)
print(preds)

In [None]:
# Distribution of predicted solvers for the open issues. `preds` is wrapped in
# a Series so the non-deprecated Series.value_counts can be used.
pd.Series(preds).value_counts().plot.bar()