In [823]:
import json
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from classifier import *
from issues import get_num_code_lines
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, r2_score
from sklearn.model_selection import train_test_split

# Pre-Processing

In [824]:
with open('../data/flutter/flutter_issues_labeled_2.json') as json_data:
    data = json.load(json_data)

In [825]:
df_list = []
for issue in data:
    issue = preprocess_issue(issue)
    for assignee in issue['training_labels']:
        df_dict = {}
        df_dict['comments'] = issue['comments']
        df_dict['assignee'] = assignee
        if (not issue['body']):
            issue['body'] = ""
        df_dict['len_body'] = len(issue['body'])
        df_dict['len_code_in_body'] = get_num_code_lines(issue)
        df_dict['is_doc_change'] = int(classify_issue(issue))
        df_dict['closed_date'] = pd.to_datetime(issue['closed_at'])
        #TODO: figure out how to one-hot-encode labels!!!!!!!!
        #TODO: sliding window for user statistics!!
        df_dict['label'] = issue['training_labels'][assignee]
        df_list.append(df_dict)
df = pd.DataFrame(df_list).sort_values('closed_date')

In [826]:
# Create data for each user
grouped_df = df.groupby('assignee')
df['num_issues_for_user'] = grouped_df['label'].apply(lambda x: x.expanding().count().shift()).fillna(0).astype(int)
df['num_successful_issues_for_user'] = grouped_df['label'].apply(lambda x: x.expanding().sum().shift()).fillna(0).astype(int)
df['percent_successful_for_user'] = (df['num_successful_issues_for_user'] / df['num_issues_for_user']).fillna(0)
display(df.head(15))

Unnamed: 0,assignee,closed_date,comments,is_doc_change,label,len_body,len_code_in_body,num_issues_for_user,num_successful_issues_for_user,percent_successful_for_user
2,abarth,2015-05-19 22:53:58,2,0,0,435,0,0,0,0.0
1,eseidelGoogle,2015-11-07 07:40:51,6,0,0,953,0,0,0,0.0
0,abarth,2015-11-07 07:41:08,3,0,0,26,0,1,0,0.0
6,abarth,2015-11-09 20:13:32,3,1,1,439,0,2,0,0.0
18,abarth,2015-11-09 21:43:40,1,0,1,2877,0,3,1,0.333333
20,collinjackson,2015-11-10 00:40:38,1,0,1,13,0,0,0,0.0
102,abarth,2015-11-11 17:56:29,3,0,1,185,0,4,2,0.5
114,abarth,2015-11-12 19:16:30,7,1,0,1490,114,5,3,0.6
113,abarth,2015-11-12 20:25:12,0,1,1,193,0,6,3,0.5
118,HansMuller,2015-11-13 00:40:47,0,0,1,15,0,0,0,0.0


# Data Analysis

In [827]:
num_positive_labels = 0
total_num_labels = 0
for issue in data:
    for assignee in issue['training_labels']:
        if issue['training_labels'][assignee] == 1:
            num_positive_labels += 1
        total_num_labels += 1
print("num positive events: " + str(num_positive_labels))
print("num total events: " + str(total_num_labels))

num positive events: 819
num total events: 2990


Now filter out any issues that never end up getting a commit/PR before they are closed. Idea being that these are not 'real' issues.

In [828]:
num_positive_labels = 0
total_num_labels = 0
for issue in data:
    for assignee in issue['training_labels']:
        if len(issue['matching_prs']) != 0 or len(issue['matching_commits']) != 0:
            if issue['training_labels'][assignee] == 1:
                num_positive_labels += 1
            total_num_labels += 1
print("num positive events: " + str(num_positive_labels))
print("num total events: " + str(total_num_labels))

num positive events: 819
num total events: 937


In [829]:
person_counts = {}
person_totals = {}
for issue in data:
    for assignee in issue['training_labels']:
        if len(issue['matching_prs']) != 0 or len(issue['matching_commits']) != 0:
            if issue['training_labels'][assignee] == 1:
                person_counts[assignee] = person_counts.get(assignee, 0) + 1
            person_totals[assignee] = person_totals.get(assignee, 0) + 1
person_percents = {}
for person in person_totals:
    person_percents[person] = person_counts.get(person, 0) / person_totals[person]
    
print("Number of successful tasks: ")
print(person_counts)
print("Number of total tasks: ")
print(person_totals)
print("Percentage of tasks completed: ")
print(person_percents)

Number of successful tasks: 
{'Hixie': 96, 'abarth': 422, 'collinjackson': 7, 'HansMuller': 66, 'devoncarew': 1, 'mpcomplete': 20, 'yjbanov': 14, 'krisgiesing': 4, 'danrubel': 8, 'xster': 6, 'sethladd': 4, 'jason-simmons': 18, 'dragostis': 5, 'jimbeveridge': 1, 'tvolkert': 19, 'chinmaygarde': 2, 'pq': 3, 'aghassemi': 3, 'jakobr-google': 11, 'cbracken': 33, 'gspencergoog': 20, 'dvdwasibi': 1, 'johnmccutchan': 4, 'goderbauer': 13, 'lequem': 1, 'mehmetf': 3, 'mravn-google': 4, 'B3rn475': 2, 'jcollins-g': 11, 'amirh': 3, 'szakarias': 4, 'jonahwilliams': 1, 'aam': 3, 'DanTup': 2, 'mraleph': 3, 'blasten': 1}
Number of total tasks: 
{'abarth': 444, 'eseidelGoogle': 2, 'Hixie': 121, 'collinjackson': 11, 'HansMuller': 75, 'devoncarew': 2, 'vlidholt': 2, 'mpcomplete': 22, 'yjbanov': 20, 'krisgiesing': 5, 'danrubel': 9, 'xster': 17, 'sethladd': 7, 'jason-simmons': 19, 'aghassemi': 7, 'dragostis': 7, 'jimbeveridge': 1, 'tvolkert': 19, 'chinmaygarde': 7, 'pq': 3, 'jakobr-google': 11, 'johnmccutchan

In [830]:
zero_count = 0
total_count = 0
for person in person_percents:
    if person_percents[person] == 0.0:
        zero_count += 1
    total_count += 1
print(float(zero_count) / total_count)

0.18181818181818182


In [831]:
#TODO: compare characteristics of issues that are successfully completed.

# Classifier

In [832]:
# Think of Features - past history for dev, etc.
features = ['comments', 'is_doc_change', 'len_body', 'len_code_in_body']
df = df.reset_index(drop=True)
print(df[features].head(5))

   comments  is_doc_change  len_body  len_code_in_body
0         2              0       435                 0
1         6              0       953                 0
2         3              0        26                 0
3         3              1       439                 0
4         1              0      2877                 0


In [833]:
#TODOs:
# Normalize columns
# One Hot Encode UserName, Labels, etc.
# Sliding windows for previous events for user.
# deal with String fields (e.g. assignee)
# Use doc classifier as a feature
# Use other features
# Compare performance to naive model (just take previous % of completion for user...)
# Move to 3 class classifier (two binary combined: can predict if they will get a PR up, will that PR be accepted)

In [834]:
x_train, x_test, y_train, y_test = train_test_split(df[features], df['label'], test_size=0.2, random_state=1)
print(len(x_train))
print(len(y_train))
print(len(x_test))
print(len(y_test))

2392
2392
598
598


In [835]:
reg_classifier = LogisticRegression(C=1, class_weight='balanced')
reg_classifier.fit(x_train, y_train)
preds = reg_classifier.predict(x_test)
print("R^2 for model: " + str(r2_score(y_test, preds)))
print(confusion_matrix(y_test, preds))

R^2 for model: -0.9898997807704353
[[200 212]
 [ 43 143]]


In [836]:
coefficients = pd.concat([pd.DataFrame(df[features].columns),pd.DataFrame(np.transpose(reg_classifier.coef_))], axis = 1)
print(coefficients)

                  0         0
0          comments -0.101578
1     is_doc_change -0.042902
2          len_body -0.000012
3  len_code_in_body  0.000007
