# Exploration of Issue Data from React Repository

In [None]:
import operator
import json
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from classifier import *
from issues import *

## Issue Data Summary

In [None]:
# Load the closed and open React issue dumps from disk and report how many
# entries each file holds.
# JSON text is UTF-8, so the encoding is stated explicitly instead of relying
# on the platform's locale default (which is not UTF-8 everywhere).
with open('data/react/react_issues_closed.json', encoding='utf-8') as f:
    closed_issues = json.load(f)
print(f'{len(closed_issues)} closed issues')
with open('data/react/react_issues_open.json', encoding='utf-8') as f:
    open_issues = json.load(f)
print(f'{len(open_issues)} open issues')

In [None]:
# Spot-check the title lookup on one closed issue. Being the last expression
# in the cell, its result is what the notebook displays.
lookup_title = "Extract react prop from ref"
get_issue_by_title(closed_issues, lookup_title)

### Issues that are classified as trivial or documentation changes are labeled 'True'; all other issues are labeled 'False'.

In [None]:
# Run the classifier over both issue sets. The result is indexed per issue
# further down, so it is presumably one label per issue, in order.
closed_labels = classify_issues(closed_issues)
open_labels = classify_issues(open_issues)

# Two count plots side by side; the shared y-axis makes the class balance of
# closed and open issues directly comparable.
fig, (ax1, ax2) = plt.subplots(ncols=2, sharey=True)
# Pass the labels as `x=` explicitly: current seaborn takes `data` as the
# first positional parameter and `x` as keyword-only, so a bare positional
# list is not guaranteed to be read as the x variable.
sns.countplot(x=closed_labels, ax=ax1).set_title("closed issue labels")
sns.countplot(x=open_labels, ax=ax2).set_title("open issue labels")

In [None]:
def _split_by_label(issues, labels):
    """Return (positives, negatives), splitting issues by the label at the same index.

    An issue goes into the first list when its label is truthy and into the
    second otherwise; the original order is kept within each list.
    """
    positives, negatives = [], []
    for idx, issue in enumerate(issues):
        bucket = positives if labels[idx] else negatives
        bucket.append(issue)
    return positives, negatives


pos_issues_closed, neg_issues_closed = _split_by_label(closed_issues, closed_labels)
pos_issues_open, neg_issues_open = _split_by_label(open_issues, open_labels)

## Exploration of Positive Labeled Issues

In [None]:
# Show the titles of the first 20 positively labeled closed issues, one per
# line, tab-indented and wrapped in double quotes.
print("positive label titles: ")
for title in (issue['title'] for issue in pos_issues_closed[:20]):
    print(f'\t"{title}"')

### The classifier flags some issues as documentation/trivial that turn out to be more complex upon manual inspection. The example below is an issue that was flagged despite being a significant code change with a lot of discussion around it.

In [None]:
# Pull up one positively labeled issue by title and print its title and body,
# separated by a blank line. The lookup result is indexed, so the first match
# is the one shown.
misclass_issue = get_issue_by_title(
    pos_issues_closed,
    "null props considered differently in getdefaultprops vs. isrequired",
)
first_match = misclass_issue[0]
print('title: ' + first_match['title'])
print()
print('body: ' + first_match['body'])

## Exploration of Negative Labeled Issues

In [None]:
# Show the titles of the first 20 negatively labeled closed issues, one per
# line, tab-indented and wrapped in double quotes.
print("negative label titles: ")
for title in (issue['title'] for issue in neg_issues_closed[:20]):
    print(f'\t"{title}"')

### The classifier misses some issues that, upon further inspection, are clearly simple changes. The example below is an issue that was resolved by simply changing a single URL in a markdown file.

In [None]:
# Pull up one negatively labeled issue by title and print its title and body,
# separated by a blank line. The lookup result is indexed, so the first match
# is the one shown.
misclass_issue = get_issue_by_title(neg_issues_closed, 'small update to bower command')
first_match = misclass_issue[0]
print('title: ' + first_match['title'])
print()
print('body: ' + first_match['body'])

## Comparison of Issue Classes

In [None]:
# Word frequencies over issue titles, per class, as (word, count) pairs
# ordered from most to least frequent.
by_count = operator.itemgetter(1)
neg_issues_freqs = sorted(
    get_word_freq_title(neg_issues_closed).items(), key=by_count, reverse=True
)
pos_issues_freqs = sorted(
    get_word_freq_title(pos_issues_closed).items(), key=by_count, reverse=True
)

# The 30 most frequent title words among negative issues.
print("Top words in Negative Issue Titles: ")
for word, freq in neg_issues_freqs[:30]:
    print(f"\t{word}: {freq}")
print()
# The 30 most frequent title words among positive issues.
print("Top words in Positive Issue Titles: ")
for word, freq in pos_issues_freqs[:30]:
    print(f"\t{word}: {freq}")

In [None]:
#TODO: list words with biggest differences between the two classes

In [None]:
# Word frequencies over issue bodies, per class, as (word, count) pairs
# ordered from most to least frequent.
by_count = operator.itemgetter(1)
neg_issues_freqs = sorted(
    get_word_freq_body(neg_issues_closed).items(), key=by_count, reverse=True
)
pos_issues_freqs = sorted(
    get_word_freq_body(pos_issues_closed).items(), key=by_count, reverse=True
)

# The 30 most frequent body words among negative issues.
print("Top words in Negative Issue Bodies: ")
for word, freq in neg_issues_freqs[:30]:
    print(f"\t{word}: {freq}")
print()
# The 30 most frequent body words among positive issues.
print("Top words in Positive Issue Bodies: ")
for word, freq in pos_issues_freqs[:30]:
    print(f"\t{word}: {freq}")

In [None]:
#TODO: list words with biggest differences between the two classes
neg_freqs = get_word_freq_body(neg_issues_closed)
pos_freqs = get_word_freq_body(pos_issues_closed)
freq_diffs = {}
for word in neg_freqs:
    freq_diffs[word] = neg_freqs[word] - pos_freqs.get(word, 0)
for word in pos_freqs:
    if word not in neg_freqs:
        freq_diffs[word] = - pos_freqs[word]
freq_diffs = sorted(freq_diffs.items(), key=lambda x: x[1], reverse=True)
# print top 20 words for negative issues
print("Top words in Negative Issue Bodies compared to Positive Issues: ")
for (word, freq) in freq_diffs[:20]:
    print("\t" + word + ": " + str(freq))
print("")
# print top 20 words for positive issues
print("Top words in Positive Issue Bodies compared to Negative Issues: ")
for (word, freq) in freq_diffs[:-20:-1]:
    print("\t" + word + ": " + str(freq))

In [None]:
#TODO: list words with biggest differences between the two classes
# Rank title words by how much more often they occur in negative issues than
# in positive ones (negative count minus positive count).
neg_freqs = get_word_freq_title(neg_issues_closed)
pos_freqs = get_word_freq_title(pos_issues_closed)
# Words seen in negative issues come first; words that occur only in positive
# issues are added afterwards with a negated count.
diff_by_word = {
    word: count - pos_freqs.get(word, 0) for word, count in neg_freqs.items()
}
diff_by_word.update(
    (word, -count) for word, count in pos_freqs.items() if word not in neg_freqs
)
# Largest (most negative-skewed) difference first.
freq_diffs = sorted(diff_by_word.items(), key=lambda x: x[1], reverse=True)

# The 20 words most over-represented in negative issue titles.
print("Top words in Negative Issue Titles compared to Positive Issues: ")
for word, freq in freq_diffs[:20]:
    print(f"\t{word}: {freq}")
print("")
# The 20 words most over-represented in positive issue titles: the last 20
# entries of the ranking, walked from the end. Taking `[-20:]` and reversing
# yields exactly 20 entries; a `[:-20:-1]` slice stops one short at 19.
print("Top words in Positive Issue Titles compared to Negative Issues: ")
for word, freq in reversed(freq_diffs[-20:]):
    print(f"\t{word}: {freq}")

In [None]:
# Summary statistics for positively labeled closed issues: body length in
# characters, the issue's 'comments' field, and the code-line count reported
# by get_num_code_lines.
print("Positive issue data summary ")
pos_df = pd.DataFrame({
    # The GitHub API allows an issue body to be null; count a missing body
    # as length 0 instead of failing on len(None).
    'body length': [len(issue['body'] or '') for issue in pos_issues_closed],
    'comments': [issue['comments'] for issue in pos_issues_closed],
    'lines of code': [get_num_code_lines(issue) for issue in pos_issues_closed],
})
# Last expression in the cell, so the notebook renders the describe() table.
pos_df.describe()

In [None]:
# Summary statistics for negatively labeled closed issues: body length in
# characters, the issue's 'comments' field, and the code-line count reported
# by get_num_code_lines.
print("Negative issue data summary: ")
neg_df = pd.DataFrame({
    # The GitHub API allows an issue body to be null; count a missing body
    # as length 0 instead of failing on len(None).
    'body length': [len(issue['body'] or '') for issue in neg_issues_closed],
    'comments': [issue['comments'] for issue in neg_issues_closed],
    'lines of code': [get_num_code_lines(issue) for issue in neg_issues_closed],
})
# Last expression in the cell, so the notebook renders the describe() table.
neg_df.describe()

In [None]:
#TODO: Manually look at recent issues in React to determine normal use case.

In [None]:
#TODO: other statistics that may be meaningful?????