In [454]:
import json
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
import seaborn as sns

from classifier import *
from issues import get_num_code_lines
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

# Pre-Processing

In [455]:
# merge new files here
with open('../data/flutter/flutter_issues_labeled_1.json') as json_data:
    issues1 = json.load(json_data)
with open('../data/flutter/flutter_issues_labeled_2.json') as json_data:
    issues2 = json.load(json_data)
with open('../data/flutter/flutter_issues_labeled_3.json') as json_data:
    issues3 = json.load(json_data)
with open('../data/flutter/flutter_issues_labeled_4.json') as json_data:
    issues4 = json.load(json_data)
with open('../data/flutter/flutter_issues_labeled_5.json') as json_data:
    issues5 = json.load(json_data)
with open('../data/flutter/flutter_issues_labeled_6.json') as json_data:
    issues6 = json.load(json_data)
'''
with open('../data/flutter/flutter_issues_labeled_7.json') as json_data:
    data7 = json.load(json_data)
with open('../data/flutter/flutter_issues_labeled_8.json') as json_data:
    data8 = json.load(json_data)
'''
issues = issues1 + issues2 + issues3 + issues4 + issues5 + issues6

In [456]:
print("Number of issues: " + str(len(issues)))
labeled_issues = [issue for issue in issues if len(issue['completed_by']) > 0]
print("Number of labeled issues: " + str(len(labeled_issues)))

Number of issues: 6000
Number of labeled issues: 2265


In [457]:
# create a data frame from the list of issues
df_list = []
for issue in labeled_issues[25:]:
        df_dict = {}
        df_dict['comments'] = issue['comments']
        if (not issue['body']):
            issue['body'] = ""
        df_dict['title'] = issue['title']
        df_dict['body'] = issue['body']
        df_dict['closed_date'] = pd.to_datetime(issue['closed_at'])
        df_dict['created_date'] = pd.to_datetime(issue['created_at'])
        df_dict['completed_by'] = issue['completed_by']
        #TODO: figure out how to one-hot-encode labels!!!!!!!!
        df_dict['labels'] = [label['name'] for label in issue['labels']]
        df_list.append(df_dict)
df.head(5)

Unnamed: 0,body,closed_date,comments,completed_by,created_date,title,a: accessibility,a: animation,a: fidelity,a: first hour,...,team: gallery,tool,waiting for PR to land (fixed),waiting for customer response,⌘‬ platform-mac,⌺‬ platform-ios,▣ platform-android,○ platform-fuchsia,⚠ TODAY,❖ platform-windows
1,when I tri to flutter start ani of the exampl ...,2015-11-09 20:13:32,3,"[abarth, DanTup]",2015-11-08 20:33:37,flutter start doesn t give good error messag w...,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
6,a href http github com hixi img src http avata...,2015-11-09 20:18:24,1,[yjbanov],2015-11-09 20:16:52,swipe to chang between tab,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15,flutter start debug sever To copi file to io d...,2015-11-09 21:43:40,1,[abarth],2015-11-09 20:50:45,except run sourc build of engin on mac,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
18,cc tvolkert,2015-11-10 00:40:38,1,[collinjackson],2015-11-09 21:07:13,implement dart s http packag in flutter with h...,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16,a href http github com hixi img src http avata...,2015-11-10 22:17:26,1,[abarth],2015-11-09 21:06:41,menu item remain interact after menu ha start ...,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [458]:
# one hot encode the label column
df = pd.DataFrame(df_list).sort_values('closed_date')
mlb = MultiLabelBinarizer()
df = df.join(pd.DataFrame(mlb.fit_transform(df.pop('labels')),
                          columns=mlb.classes_,
                          index=df.index))
df.head(5)

Unnamed: 0,body,closed_date,comments,completed_by,created_date,title,a: accessibility,a: animation,a: fidelity,a: first hour,...,team: gallery,tool,waiting for PR to land (fixed),waiting for customer response,⌘‬ platform-mac,⌺‬ platform-ios,▣ platform-android,○ platform-fuchsia,⚠ TODAY,❖ platform-windows
1,When I try to `flutter start` any of the examp...,2015-11-09 20:13:32,3,"[abarth, DanTup]",2015-11-08 20:33:37,`flutter start` doesn't give good error messag...,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
6,"<a href=""https://github.com/Hixie""><img src=""h...",2015-11-09 20:18:24,1,[yjbanov],2015-11-09 20:16:52,Swipe to change between Tabs,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15,flutter start --debug\nsevere: To copy files t...,2015-11-09 21:43:40,1,[abarth],2015-11-09 20:50:45,Exception running source build of engine on Mac,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
18,cc @tvolkert \n,2015-11-10 00:40:38,1,[collinjackson],2015-11-09 21:07:13,Implement Dart's HTTP package in Flutter with ...,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16,"<a href=""https://github.com/Hixie""><img src=""h...",2015-11-10 22:17:26,1,[abarth],2015-11-09 21:06:41,Menu items remain interactive after menu has s...,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [459]:
def preprocess(text):
    stemmer = PorterStemmer()
    # remove punctuation and non-alpha numeric characters
    split1 = ' '.join([word for word in re.split('\W+', text) if word.isalpha()])
    # split camel case words apart (necessary for embedded code) and apply stemmer to all words
    split2 = ' '.join([stemmer.stem(word) for word in re.sub('(?!^)([A-Z][a-z]+)', r' \1', split1).split()])
    return split2

In [460]:
# use tf-idf w/ stemming, stop-word removal, and non-alphabetic word removal to generate features
df['body'] = df['body'].apply(preprocess)
vectorizer_body = TfidfVectorizer(stop_words=ENGLISH_STOP_WORDS)
vectorizer_body.fit(df['body'])
vector_body = vectorizer_body.transform(df['body'])
# summarize encoded vector
print(vector_body.shape)
print(vector_body.toarray())
df['title'] = df['title'].apply(preprocess)
vectorizer_title = TfidfVectorizer(stop_words=ENGLISH_STOP_WORDS)
vectorizer_title.fit(df['title'])
vector_title = vectorizer_title.transform(df['title'])
# summarize encoded vector
print(vector_title.shape)
print(vector_title.toarray())

(2240, 4740)
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
(2240, 1782)
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
