In [1536]:
import json
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from classifier import *
from issues import get_num_code_lines
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, r2_score
from sklearn.model_selection import train_test_split

# Pre-Processing

In [1537]:
# Load every labeled-issue shard and concatenate them, in shard order, into
# one list of issues.
# To merge new files, bump NUM_LABELED_FILES (shards are numbered from 1).
NUM_LABELED_FILES = 8
LABELED_FILE_TEMPLATE = '../data/flutter/flutter_issues_labeled_{}.json'

data = []
for file_number in range(1, NUM_LABELED_FILES + 1):
    with open(LABELED_FILE_TEMPLATE.format(file_number)) as json_data:
        # assumes each shard is a JSON array of issue dicts — TODO confirm
        data.extend(json.load(json_data))

In [1538]:
#with open('../data/flutter/flutter_issues_labeled.json') as json_data:
#    data = json.load(json_data)

In [1539]:
def get_label(issue, assignee):
    """Return the training label for one (issue, assignee) pair.

    Returns:
        2 if the issue has neither matching PRs nor matching commits,
        1 if the issue has linked work and the assignee's training label is 1,
        0 if the issue has linked work and the assignee's training label is
        anything else.
    """
    has_linked_work = len(issue['matching_prs']) != 0 or len(issue['matching_commits']) != 0
    if not has_linked_work:
        return 2
    return 1 if issue['training_labels'][assignee] == 1 else 0

In [1540]:
# Build one training row per (issue, assignee) pair.
df_list = []
for issue in data:
    issue = preprocess_issue(issue)
    # Issue-level features are the same for every assignee, so they are
    # computed lazily, once, and only for issues that are actually kept.
    issue_features = None
    for assignee in issue['training_labels']:
        #TODO: figure out how to one-hot-encode labels!!!!!!!!
        label = get_label(issue, assignee)
        # only consider issues that ultimately have a PR by someone
        # (label 2 is decided by the issue alone, so the remaining assignees
        # of this issue can be skipped as well)
        if label == 2:
            break
        if issue_features is None:
            if (not issue['body']):
                issue['body'] = ""
            issue_features = {
                'len_body': len(issue['body']),
                'len_code_in_body': get_num_code_lines(issue),
                'is_doc_change': int(classify_issue(issue)),
                'closed_date': pd.to_datetime(issue['closed_at']),
            }
        df_dict = {}
        df_dict['comments'] = issue['comments']
        df_dict['assignee'] = assignee
        df_dict['len_body'] = issue_features['len_body']
        df_dict['len_code_in_body'] = issue_features['len_code_in_body']
        df_dict['is_doc_change'] = issue_features['is_doc_change']
        df_dict['closed_date'] = issue_features['closed_date']
        df_dict['label'] = label
        df_list.append(df_dict)
# Stable sort: rows sharing a closed_date keep their load order, so the
# order-dependent per-user history features computed from this frame are
# reproducible from run to run.
df = pd.DataFrame(df_list).sort_values('closed_date', kind='mergesort')

In [1541]:
# Create data for each user
# Every per-user statistic describes only the rows *before* the current one
# in the frame's current row order, so a row never sees its own label.
grouped_df = df.groupby('assignee')
# Number of earlier rows for the same assignee (0 for their first issue).
prior_issue_count = grouped_df.cumcount()
# Running sum of labels minus the current label = successes on earlier rows.
prior_success_count = (grouped_df['label'].cumsum() - df['label']).astype(int)
df['num_issues_for_user'] = prior_issue_count
df['num_successful_issues_for_user'] = prior_success_count
# 0 / 0 yields NaN for an assignee's first issue; it is back-filled below.
df['percent_successful_for_user'] = (df['num_successful_issues_for_user'] / df['num_issues_for_user'])
# Running mean (NaN rows skipped) of the per-user success rates on all earlier
# rows; 0 until at least one earlier row has a defined rate.
df['mean_success_percent'] = df['percent_successful_for_user'].expanding().mean().shift().fillna(0)
# First-time assignees fall back to that running mean.
df['percent_successful_for_user'] = df['percent_successful_for_user'].fillna(df['mean_success_percent'])
display(df.head(25))

Unnamed: 0,assignee,closed_date,comments,is_doc_change,label,len_body,len_code_in_body,num_issues_for_user,num_successful_issues_for_user,percent_successful_for_user,mean_success_percent
2,abarth,2015-05-19 22:53:58,2,0,0,435,0,0,0,0.0,0.0
1,eseidelGoogle,2015-11-07 07:40:51,6,0,0,953,0,0,0,0.0,0.0
0,abarth,2015-11-07 07:41:08,3,0,0,26,0,1,0,0.0,0.0
6,abarth,2015-11-09 20:13:32,3,1,1,439,0,2,0,0.0,0.0
18,abarth,2015-11-09 21:43:40,1,0,1,2877,0,3,1,0.333333,0.0
20,collinjackson,2015-11-10 00:40:38,1,0,1,13,0,0,0,0.111111,0.111111
102,abarth,2015-11-11 17:56:29,3,0,1,185,0,4,2,0.5,0.111111
111,abarth,2015-11-12 20:25:12,0,1,1,193,0,5,3,0.6,0.208333
113,HansMuller,2015-11-13 00:40:47,0,0,1,15,0,0,0,0.286667,0.286667
116,abarth,2015-11-13 00:58:57,4,0,1,4034,3206,6,4,0.666667,0.286667


# Data Analysis

In [1542]:
# Tally every (issue, assignee) training label, with no filtering applied.
all_label_values = [
    issue['training_labels'][assignee]
    for issue in data
    for assignee in issue['training_labels']
]
num_positive_labels = sum(1 for value in all_label_values if value == 1)
total_num_labels = len(all_label_values)
print("num positive events: " + str(num_positive_labels))
print("num total events: " + str(total_num_labels))

num positive events: 1434
num total events: 3095


Now filter out any issues that are closed without ever getting a matching commit or PR. The idea is that these are not 'real' issues.

In [1543]:
# Same tally as before, restricted to issues that have at least one matching
# PR or matching commit.
linked_label_values = [
    issue['training_labels'][assignee]
    for issue in data
    for assignee in issue['training_labels']
    if len(issue['matching_prs']) != 0 or len(issue['matching_commits']) != 0
]
num_positive_labels = sum(1 for value in linked_label_values if value == 1)
total_num_labels = len(linked_label_values)
print("num positive events: " + str(num_positive_labels))
print("num total events: " + str(total_num_labels))

num positive events: 1434
num total events: 1756


In [1544]:
# Per-assignee tallies over issues that have a matching PR or commit.
# person_counts only gains a key once that person has a positive label;
# person_totals gains a key for every person seen on a linked issue.
person_counts = {}
person_totals = {}
for issue in data:
    for assignee, outcome in issue['training_labels'].items():
        if len(issue['matching_prs']) == 0 and len(issue['matching_commits']) == 0:
            continue
        if outcome == 1:
            person_counts[assignee] = person_counts.get(assignee, 0) + 1
        person_totals[assignee] = person_totals.get(assignee, 0) + 1

# Share of each person's linked issues that carry a positive label.
person_percents = {}
for person, total in person_totals.items():
    person_percents[person] = person_counts.get(person, 0) / total

print("Number of successful tasks: ")
print(person_counts)
print("Number of total tasks: ")
print(person_totals)
print("Percentage of tasks completed: ")
print(person_percents)

Number of successful tasks: 
{'Hixie': 240, 'abarth': 443, 'collinjackson': 15, 'goderbauer': 15, 'HansMuller': 178, 'devoncarew': 26, 'dragostis': 6, 'chinmaygarde': 17, 'sethladd': 20, 'yjbanov': 40, 'mpcomplete': 22, 'jason-simmons': 23, 'johnmccutchan': 20, 'apwilson': 2, 'krisgiesing': 4, 'danrubel': 18, 'qchong': 1, 'xster': 76, 'jimbeveridge': 1, 'tvolkert': 19, 'pq': 6, 'aghassemi': 3, 'jakobr-google': 28, 'cbracken': 69, 'gspencergoog': 15, 'dvdwasibi': 1, 'mravn-google': 39, 'lequem': 1, 'mehmetf': 4, 'mit-mit': 13, 'mattsarett': 1, 'szakarias': 6, 'sivachandra': 1, 'jcollins-g': 15, 'B3rn475': 3, 'jonahwilliams': 20, 'amirh': 3, 'rmacnak-google': 1, 'sigurdm': 1, 'aam': 8, 'DanTup': 2, 'mraleph': 3, 'sbaranov': 1, 'DaveShuckerow': 1, 'scheglov': 2, 'blasten': 1}
Number of total tasks: 
{'abarth': 522, 'eseidelGoogle': 5, 'jason-simmons': 37, 'Hixie': 298, 'collinjackson': 31, 'mpcomplete': 27, 'sethladd': 33, 'goderbauer': 18, 'krisgiesing': 6, 'HansMuller': 189, 'devoncarew

In [1545]:
# Fraction of people whose completion percentage is exactly zero.
total_count = len(person_percents)
zero_count = sum(1 for percent in person_percents.values() if percent == 0.0)
print(float(zero_count) / total_count)

0.14814814814814814


In [1546]:
# Rows where the assignee has no earlier issue in the frame.
first_time_rows = df[df['num_issues_for_user'] == 0]
successful_first_time_rows = first_time_rows[first_time_rows['label'] == 1]
display(first_time_rows.head(5))
print("num first time issues " + str(len(first_time_rows)))
print("num successful first time issues " + str(len(successful_first_time_rows)))

Unnamed: 0,assignee,closed_date,comments,is_doc_change,label,len_body,len_code_in_body,num_issues_for_user,num_successful_issues_for_user,percent_successful_for_user,mean_success_percent
2,abarth,2015-05-19 22:53:58,2,0,0,435,0,0,0,0.0,0.0
1,eseidelGoogle,2015-11-07 07:40:51,6,0,0,953,0,0,0,0.0,0.0
20,collinjackson,2015-11-10 00:40:38,1,0,1,13,0,0,0,0.111111,0.111111
113,HansMuller,2015-11-13 00:40:47,0,0,1,15,0,0,0,0.286667,0.286667
106,Hixie,2015-11-17 21:46:08,2,1,0,146,0,0,0,0.611839,0.611839


num first time issues 54
num successful first time issues 34


In [1547]:
#TODO: compare characteristics of issues that are successfully completed.

# Classifier

In [1548]:
# Think of Features - past history for dev, etc.
# Columns fed to the classifier: four issue-level fields plus two per-user
# history fields.
features = [
    'comments',
    'is_doc_change',
    'len_body',
    'len_code_in_body',
    'percent_successful_for_user',
    'num_issues_for_user',
]
# Replace the index with 0..n-1 while keeping the current row order.
df = df.reset_index(drop=True)
print(df[features].iloc[:5])

   comments  is_doc_change  len_body  len_code_in_body  \
0         2              0       435                 0   
1         6              0       953                 0   
2         3              0        26                 0   
3         3              1       439                 0   
4         1              0      2877                 0   

   percent_successful_for_user  num_issues_for_user  
0                     0.000000                    0  
1                     0.000000                    0  
2                     0.000000                    1  
3                     0.000000                    2  
4                     0.333333                    3  


In [1549]:
#TODOs:
# Compare issue title/body to issues that user previously tackled
# Get features for user from other repos
# Normalize columns
# One Hot Encode UserName, Labels, etc.
# Sliding windows for previous events for user.
# deal with String fields (e.g. assignee)
# Use doc classifier as a feature
# Use other features
# Compare performance to naive model (just take previous % of completion for user...)
# Move to 3 class classifier (two binary combined: can predict if they will get a PR up, will that PR be accepted)

In [1550]:
# Positional hold-out instead of a shuffled train_test_split: the first 85% of
# rows (in the frame's current order) train the model, the remainder tests it.
length = int(len(df) * .85)
feature_frame = df[features]
label_series = df['label']
x_train, x_test = feature_frame.iloc[:length], feature_frame.iloc[length:]
y_train, y_test = label_series.iloc[:length], label_series.iloc[length:]

# Sizes of each split, in the order x_train, y_train, x_test, y_test.
for split in (x_train, y_train, x_test, y_test):
    print(len(split))

# Number of positive labels in each split.
print(sum(y_train))
print(sum(y_test))

1492
1492
264
264
1223
211


In [1551]:
# Fit a logistic-regression classifier on the training slice and score it on
# the held-out slice.
reg_classifier = LogisticRegression(C=1).fit(x_train, y_train)
preds = reg_classifier.predict(x_test)
# NOTE(review): r2_score is a regression metric; applied to hard class
# predictions it is hard to interpret. Read the confusion matrix and accuracy
# against the share of positive labels in y_test instead.
model_r2 = r2_score(y_test, preds)
print("R^2 for model: " + str(model_r2))
print(confusion_matrix(y_test, preds))
print(accuracy_score(y_test, preds))

R^2 for model: -0.2747920951444154
[[  2  51]
 [  3 208]]
0.7954545454545454


In [1552]:
# Pair each feature name with its fitted weight, using named columns so the
# table is self-describing and each column can be selected unambiguously.
# coef_ has one row per fitted decision function; this model is binary
# (labels 0/1), so row 0 holds the weights.
coefficients = pd.DataFrame(
    {
        'feature': list(features),
        'coefficient': reg_classifier.coef_[0],
    },
    columns=['feature', 'coefficient'],
)
print(coefficients)

                             0         0
0                     comments -0.075595
1                is_doc_change  0.364381
2                     len_body -0.000041
3             len_code_in_body  0.000107
4  percent_successful_for_user  2.526940
5          num_issues_for_user -0.000200
