## Imports

In [None]:
import os
import math
import jsonpickle

In [None]:
from nltk.stem.snowball import SnowballStemmer

In [None]:
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt

## Constants

In [None]:
# Directory (relative path) that is scanned for labeled-issue files.
DATA_FOLDER = "../data/"

In [None]:
# Substring that marks a file in DATA_FOLDER as a labeled-issues file; what is
# left of the file name after removing it (and ".json") is used as the repo id.
LABELED_ISSUES_FILE_KEY = "labeled_issues_of_interest_"

## Load data

In [None]:
# Maps each repo key found in a labeled-issues file to the id derived from
# that file's name; filled while the data is loaded.
REPO_TO_ID = {}

In [None]:
# Load labeled issues.
# Every file in DATA_FOLDER whose name contains LABELED_ISSUES_FILE_KEY is
# decoded line by line; when a file holds several lines, the last decoded
# line is the one that is kept.
# NOTE(security): jsonpickle.decode can instantiate arbitrary Python objects,
# so only point DATA_FOLDER at files from a trusted source.
labeled_issues_of_interest_data = {}
for file in os.listdir(DATA_FOLDER):
    if LABELED_ISSUES_FILE_KEY not in file:
        continue
    repoId = file.replace(LABELED_ISSUES_FILE_KEY, "").replace(".json", "")
    # Reset per file so that an empty file can neither raise a NameError nor
    # silently re-register the previous file's repos under this file's id.
    repo_labeled_issues = None
    with open(os.path.join(DATA_FOLDER, file), "r", encoding="utf-8") as f_in:
        for line in f_in:
            if line.strip():
                repo_labeled_issues = jsonpickle.decode(line)
    if repo_labeled_issues is None:
        continue
    for repo in repo_labeled_issues:
        REPO_TO_ID[repo] = repoId
    labeled_issues_of_interest_data.update(repo_labeled_issues)
#===
# Normalise the layout: a repo stored as a list of issues becomes a dict keyed
# by the issue number (as a string); None placeholders in the list are dropped.
for repo, issues in labeled_issues_of_interest_data.items():
    if isinstance(issues, list):
        labeled_issues_of_interest_data[repo] = {
            str(issue["number"]): issue for issue in issues if issue is not None
        }

In [None]:
# Number of repos for which labeled issues were loaded.
print("Repo cnt:", len(labeled_issues_of_interest_data))

In [None]:
# Total number of labeled issues, summed over every loaded repo.
issue_cnt = sum(
    len(repo_issues) for repo_issues in labeled_issues_of_interest_data.values()
)
print("Final issue cnt:", issue_cnt)

## Discover important words

In [None]:
# English Snowball stemmer used to reduce words to their stems.
stemmer = SnowballStemmer(language='english')

In [None]:
# Per-class word statistics: stem -> (term count, document count).
#   term count     = total occurrences of the stem over all issues of the class
#   document count = number of issues of the class containing the stem
bug_word_cnts = {}
other_word_cnts = {}
# Number of typed issues seen per class.
bug_cnt = 0
other_cnt = 0


def _stem_frequencies(text, stem):
    """Return ``{stem: occurrences}`` for the usable words of one issue text.

    Tokens are split on whitespace, stripped of "." and ",", and lowercased.
    Tokens shorter than three characters or containing anything but letters
    and apostrophes are ignored. Occurrences of different surface forms that
    share a stem are added together, so every stem appears exactly once in
    the result.

    Args:
        text: Title and body of one issue joined into a single string.
        stem: Callable mapping a normalised word to its stem.
    """
    word_freqs = {}
    for token in text.split():
        word = token.replace(".", "").replace(",", "").lower()
        if len(word) < 3 or not word.replace("'", "").isalpha():
            continue
        word_freqs[word] = word_freqs.get(word, 0) + 1
    # Stem each distinct word only once, then merge words sharing a stem.
    stem_freqs = {}
    for word, freq in word_freqs.items():
        key = stem(word)
        stem_freqs[key] = stem_freqs.get(key, 0) + freq
    return stem_freqs


for repo in labeled_issues_of_interest_data:
    for issueId in labeled_issues_of_interest_data[repo]:
        issue = labeled_issues_of_interest_data[repo][issueId]
        # Issues without a usable type label cannot be assigned to a class.
        if issue is None or "type" not in issue or issue["type"] is None:
            continue
        #
        title = issue["title"]
        description = issue["body"]
        text = (title if title is not None else "") + " "
        text = text + (description if description is not None else "")
        #
        is_bug = issue["type"] == "Bug"
        class_word_cnts = bug_word_cnts if is_bug else other_word_cnts
        # Each stem is visited once per issue, so the document count grows by
        # exactly one per issue and the term count by the real number of
        # occurrences (counted on whole normalised tokens, not substrings).
        for word, cnt in _stem_frequencies(text, stemmer.stem).items():
            tc, dc = class_word_cnts.get(word, (0, 0))
            class_word_cnts[word] = (tc + cnt, dc + 1)
        #
        if is_bug:
            bug_cnt = bug_cnt + 1
        else:
            other_cnt = other_cnt + 1

In [None]:
# Replace each (term count, document count) pair by a single score:
# log(average occurrences per containing issue) weighted by the share of the
# class's issues that contain the word.
bug_word_cnts = {
    term: math.log(total / docs) * (docs / bug_cnt)
    for term, (total, docs) in bug_word_cnts.items()
}
#
other_word_cnts = {
    term: math.log(total / docs) * (docs / other_cnt)
    for term, (total, docs) in other_word_cnts.items()
}

In [None]:
# Turn the bug scores into (bug score - other score) differences.
# Words that occur only in non-bug issues are included with a bug score of 0;
# otherwise the most non-bug-specific words would be missing from the ranking
# and could never be picked from its bottom end.
for word, other_score in other_word_cnts.items():
    bug_word_cnts[word] = bug_word_cnts.get(word, 0) - other_score

In [None]:
# Re-shape the score dict into (score, word) pairs so it can be ordered by score.
bug_word_cnts = [(score, term) for term, score in bug_word_cnts.items()]

In [None]:
# Order the (score, word) pairs from highest to lowest.
bug_word_cnts.sort(reverse=True)

In [None]:
# Print the full ranking, one "<position> ==> (score, word)" line per entry.
for rank, scored_word in enumerate(bug_word_cnts):
    print(f"{rank} ==> {scored_word}")

In [None]:
# How many words are taken from each end of the ranking.
WORDS_TO_TAKE = 75
# Words from the front of the (score, word) list ...
important_bug_words = {term for _, term in bug_word_cnts[:WORDS_TO_TAKE]}
# ... and words from the back of it (walked in reverse order).
important_other_words = {term for _, term in bug_word_cnts[::-1][:WORDS_TO_TAKE]}

In [None]:
# Show the words selected from the front of the ranking.
print(important_bug_words)

In [None]:
# Show the words selected from the back of the ranking.
print(important_other_words)

## Visualize words

In [None]:
# Build one text per class for the word clouds: for every typed issue, keep
# only the tokens that belong to the class's important-word set and emit them
# as one line. Lines are collected in lists and joined once at the end instead
# of growing a string with "+" inside the loop.
bug_lines = []
other_lines = []
for repo in labeled_issues_of_interest_data:
    for issueId in labeled_issues_of_interest_data[repo]:
        issue = labeled_issues_of_interest_data[repo][issueId]
        # Issues without a usable type label belong to neither class.
        if issue is None or "type" not in issue or issue["type"] is None:
            continue
        #
        title = issue["title"]
        description = issue["body"]
        text = (title if title is not None else "") + " "
        text = text + (description if description is not None else "")
        #
        if issue["type"] == "Bug":
            vocabulary, lines = important_bug_words, bug_lines
        else:
            vocabulary, lines = important_other_words, other_lines
        # NOTE(review): tokens are compared verbatim, while the important-word
        # sets appear to hold lowercased stems; capitalised, punctuated or
        # inflected tokens therefore never match. Confirm whether tokens
        # should be normalised and stemmed before this membership test.
        lines.append(" ".join(word for word in text.split() if word in vocabulary))
bug_texts = "".join(line + "\n" for line in bug_lines)
other_texts = "".join(line + "\n" for line in other_lines)

In [None]:
# Private copy of the stop-word collection imported from wordcloud; it is
# handed to the WordCloud constructors.
stopwords = set(STOPWORDS)

In [None]:
# Word cloud of the bug-issue texts, drawn in red tones on white.
wordcloud = WordCloud(
    stopwords=stopwords,
    background_color="white",
    collocations=False,
    colormap="Reds",
)
wordcloud = wordcloud.generate(bug_texts)

# Render the cloud without axes, save it as a PDF and show it.
fig, ax = plt.subplots(figsize=(12, 9))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis("off")
fig.savefig("negative_words.pdf")
plt.show()

In [None]:
# Word cloud of the non-bug issue texts, default colours on white.
wordcloud = WordCloud(
    stopwords=stopwords,
    background_color="white",
    collocations=False,
)
wordcloud = wordcloud.generate(other_texts)

# Render the cloud without axes, save it as a PDF and show it.
fig, ax = plt.subplots(figsize=(12, 9))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis("off")
fig.savefig("positive_words.pdf")
plt.show()