In [869]:
import json
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
import seaborn as sns

from classifier import *
from issues import get_num_code_lines
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

# Pre-Processing

In [870]:
with open('../data/flutter/flutter_issues_labeled.json') as json_data:
    issues = json.load(json_data)

In [871]:
print("Number of issues: " + str(len(issues)))
labeled_issues = [issue for issue in issues if len(issue['completed_by']) > 0]
print("Number of labeled issues: " + str(len(labeled_issues)))

Number of issues: 7170
Number of labeled issues: 2504


In [872]:
# create a data frame from the list of issues
df_list = []
for issue in labeled_issues[25:]:
        df_dict = {}
        df_dict['comments'] = issue['comments']
        if (not issue['body']):
            issue['body'] = ""
        df_dict['title'] = issue['title']
        df_dict['body'] = issue['body']
        df_dict['closed_date'] = pd.to_datetime(issue['closed_at'])
        df_dict['created_date'] = pd.to_datetime(issue['created_at'])
        df_dict['completed_by'] = issue['completed_by']
        #TODO: figure out how to one-hot-encode labels!!!!!!!!
        df_dict['labels'] = [label['name'] for label in issue['labels']]
        df_list.append(df_dict)
df = pd.DataFrame(df_list).sort_values('closed_date')
df.head(2)

Unnamed: 0,body,closed_date,comments,completed_by,created_date,labels,title
1,When I try to `flutter start` any of the examp...,2015-11-09 20:13:32,3,"[abarth, DanTup]",2015-11-08 20:33:37,"[easy fix, tool]",`flutter start` doesn't give good error messag...
6,"<a href=""https://github.com/Hixie""><img src=""h...",2015-11-09 20:18:24,1,[yjbanov],2015-11-09 20:16:52,"[framework, severe: new feature]",Swipe to change between Tabs


In [873]:
# count number of issues with multiple completers
counts = {}
for index, row in df.iterrows():
    count = len(row['completed_by'])
    counts[count] = counts.get(count, 0) + 1
print(counts)
#TODO: confirm that filtering "noise" is best strategy here
print("Number of total issues: " + str(len(df)))
df = df[df.apply(lambda x: len(x['completed_by']) == 1, axis=1)].reset_index(drop=True)
df['completed_by'] = df['completed_by'].apply(lambda  x : x[0])
print("Number of issues with single solver: " + str(len(df)))

{2: 197, 1: 2268, 3: 14}
Number of total issues: 2479
Number of issues with single solver: 2268


In [874]:
# one hot encode the label column
mlb = MultiLabelBinarizer()
df = df.join(pd.DataFrame(mlb.fit_transform(df.pop('labels')),
                          columns=mlb.classes_,
                          index=df.index))
df.head(2)

Unnamed: 0,body,closed_date,comments,completed_by,created_date,title,a: accessibility,a: animation,a: china,a: fidelity,...,team: gallery,tool,waiting for PR to land (fixed),waiting for customer response,⌘‬ platform-mac,⌺‬ platform-ios,▣ platform-android,○ platform-fuchsia,⚠ TODAY,❖ platform-windows
0,"<a href=""https://github.com/Hixie""><img src=""h...",2015-11-09 20:18:24,1,yjbanov,2015-11-09 20:16:52,Swipe to change between Tabs,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,flutter start --debug\nsevere: To copy files t...,2015-11-09 21:43:40,1,abarth,2015-11-09 20:50:45,Exception running source build of engine on Mac,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [875]:
def preprocess(text):
    stemmer = PorterStemmer()
    # remove punctuation and non-alpha numeric characters
    split1 = ' '.join([word for word in re.split('\W+', text) if word.isalpha()])
    # split camel case words apart (necessary for embedded code) and apply stemmer to all words
    split2 = ' '.join([stemmer.stem(word) for word in re.sub('(?!^)([A-Z][a-z]+)', r' \1', split1).split()])
    return split2

In [876]:
# use tf-idf w/ stemming, stop-word removal, and non-alphabetic word removal to generate features
df['body'] = df['body'].apply(preprocess)
vectorizer_body = TfidfVectorizer(stop_words=ENGLISH_STOP_WORDS)
vectorizer_body.fit(df['body'])
vector_body = vectorizer_body.transform(df['body'])
# summarize encoded vector
print(vector_body.shape)
df['title'] = df['title'].apply(preprocess)
vectorizer_title = TfidfVectorizer(stop_words=ENGLISH_STOP_WORDS)
vectorizer_title.fit(df['title'])
vector_title = vectorizer_title.transform(df['title'])
# summarize encoded vector
print(vector_title.shape)

(2268, 4960)
(2268, 1804)


In [877]:
title_df = pd.DataFrame(vector_title.todense())
body_df = pd.DataFrame(vector_body.todense())
df = pd.concat([df, title_df, body_df], axis=1)
df.head(2)

Unnamed: 0,body,closed_date,comments,completed_by,created_date,title,a: accessibility,a: animation,a: china,a: fidelity,...,4950,4951,4952,4953,4954,4955,4956,4957,4958,4959
0,a href http github com hixi img src http avata...,2015-11-09 20:18:24,1,yjbanov,2015-11-09 20:16:52,swipe to chang between tab,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,flutter start debug sever To copi file to io d...,2015-11-09 21:43:40,1,abarth,2015-11-09 20:50:45,except run sourc build of engin on mac,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Machine Learning

In [878]:
X = df.drop(['body', 'closed_date', 'completed_by', 'created_date', 'title'], axis=1)
y = df['completed_by']

0    yjbanov
1     abarth
Name: completed_by, dtype: object