In [1161]:
import json
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
import seaborn as sns

from classifier import *
from issues import get_num_code_lines
from nltk.stem import PorterStemmer
from sklearn.ensemble import importRandomForestClassifier
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.tree import DecisionTreeClassifier

SyntaxError: invalid syntax (<ipython-input-1161-ff2f345e6fc8>, line 11)

# Pre-Processing

In [None]:
with open('../data/flutter/flutter_issues_labeled.json') as json_data:
    issues = json.load(json_data)

In [None]:
print("Number of issues: " + str(len(issues)))
labeled_issues = [issue for issue in issues if len(issue['completed_by']) > 0]
print("Number of labeled issues: " + str(len(labeled_issues)))

In [None]:
# create a data frame from the list of issues
df_list = []
for issue in labeled_issues[25:]:
        df_dict = {}
        df_dict['comments'] = issue['comments']
        if (not issue['body']):
            issue['body'] = ""
        df_dict['title'] = issue['title']
        df_dict['body'] = issue['body']
        df_dict['closed_date'] = pd.to_datetime(issue['closed_at'])
        df_dict['created_date'] = pd.to_datetime(issue['created_at'])
        df_dict['completed_by'] = issue['completed_by']
        #TODO: figure out how to one-hot-encode labels!!!!!!!!
        df_dict['labels'] = [label['name'] for label in issue['labels']]
        df_list.append(df_dict)
df = pd.DataFrame(df_list).sort_values('closed_date')
df.head(2)

In [None]:
# count number of issues with multiple completers
counts = {}
for index, row in df.iterrows():
    count = len(row['completed_by'])
    counts[count] = counts.get(count, 0) + 1
print(counts)
#TODO: confirm that filtering "noise" is best strategy here
print("Number of total issues: " + str(len(df)))
df = df[df.apply(lambda x: len(x['completed_by']) == 1, axis=1)].reset_index(drop=True)
df['completed_by'] = df['completed_by'].apply(lambda  x : x[0])
print("Number of issues with single solver: " + str(len(df)))

In [None]:
# one hot encode the label column
mlb = MultiLabelBinarizer()
df = df.join(pd.DataFrame(mlb.fit_transform(df.pop('labels')),
                          columns=mlb.classes_,
                          index=df.index))
df.head(2)

In [None]:
def preprocess(text):
    stemmer = PorterStemmer()
    # remove punctuation and non-alpha numeric characters
    split1 = ' '.join([word for word in re.split('\W+', text) if word.isalpha()])
    # split camel case words apart (necessary for embedded code) and apply stemmer to all words
    split2 = ' '.join([stemmer.stem(word) for word in re.sub('(?!^)([A-Z][a-z]+)', r' \1', split1).split()])
    return split2

In [None]:
# use tf-idf w/ stemming, stop-word removal, and non-alphabetic word removal to generate features
df['body'] = df['body'].apply(preprocess)
vectorizer_body = TfidfVectorizer(stop_words=ENGLISH_STOP_WORDS)
vectorizer_body.fit(df['body'])
vector_body = vectorizer_body.transform(df['body'])
# summarize encoded vector
print(vector_body.shape)
df['title'] = df['title'].apply(preprocess)
vectorizer_title = TfidfVectorizer(stop_words=ENGLISH_STOP_WORDS)
vectorizer_title.fit(df['title'])
vector_title = vectorizer_title.transform(df['title'])
# summarize encoded vector
print(vector_title.shape)

In [None]:
title_df = pd.DataFrame(vector_title.todense())
body_df = pd.DataFrame(vector_body.todense())
df = pd.concat([df, title_df, body_df], axis=1)
df.head(2)

In [None]:
# get list of devs who solve an issue in last three months (from end of train set)
active_devs = set()
dev_counts = {}
filter_date = df.iloc[2000]['closed_date'] - pd.to_timedelta(90, unit='d')
for _, row in df.iterrows():
    if (row['closed_date'] > filter_date):
        dev_counts[row['completed_by']] = dev_counts.get(row['completed_by'], 0) + 1
print(dev_counts)
for dev in dev_counts:
    if dev_counts[dev] >= 4:
        active_devs.add(dev)
print(active_devs)
# remove all issues not solved by an active dev
df = df[df['completed_by'].isin(active_devs)].reset_index(drop=True)
print(len(df))

# Machine Learning

In [None]:
train_df = df[:1200]
test_df = df[1200:]
X_train = train_df.drop(['body', 'closed_date', 'completed_by', 'created_date', 'title'], axis=1)
y_train = train_df['completed_by']
X_test = test_df.drop(['body', 'closed_date', 'completed_by', 'created_date', 'title'], axis=1)
y_test = test_df['completed_by']
print(len(X_train))
print(len(X_test))

In [None]:
classifier = RandomForestClassifier()
classifier.fit(X_train,y_train)
preds = classifier.predict(X_test)
correct = 0
for idx, pred in enumerate(preds):
    if pred == y_test[1200 + idx]:
        correct += 1
print (correct/len(y_test))