## Imports

In [None]:
import os
import jsonpickle
from ast import literal_eval as make_tuple

In [None]:
import re

In [None]:
import numpy as np
import pandas as pd

In [None]:
from sklearn.model_selection import train_test_split
#
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
#
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import confusion_matrix

In [None]:
from datasets import Dataset
#
import torch
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
#
from torch.utils.data import DataLoader
from transformers import TrainingArguments
from transformers import Trainer
import evaluate

In [None]:
import fasttext

In [None]:
from sklearn.manifold import TSNE
from matplotlib.pyplot import figure
import matplotlib.pyplot as plt

In [None]:
from tabulate import tabulate

In [None]:
import warnings
warnings.simplefilter("ignore")

## Constants

In [None]:
# Relative path of the folder that holds all input and output JSON files.
DATA_FOLDER = "./../data/"
#
# Names (or name fragments) of the files looked up inside DATA_FOLDER.
FILE_OF_INTEREST = "files_of_interest.json"
FILE_OF_INTEREST_SOURCE = "files_of_interest_source_lookup.json"
# Files whose name contains this key are loaded as labeled issues.
LABELED_FILE_KEY = "labeled_issues_of_interest_"
# Files whose name contains this key are loaded as embedding lookups.
EMBEDDING_FILE_KEY = "file_of_interest_embedding_lookup_"

## Utils

In [None]:
def find_file(commit, file_name):
    """Return the entry of ``commit["files"]`` whose ``"name"`` is *file_name*.

    The first matching entry wins; ``None`` is returned when nothing matches.
    """
    matches = (candidate for candidate in commit["files"] if candidate["name"] == file_name)
    return next(matches, None)

## Load data

In [None]:
# Read the files-of-interest lookup. Every line of the file is decoded and
# the result of the last line is the one that is kept.
files_of_interest_path = os.path.join(DATA_FOLDER, FILE_OF_INTEREST)
with open(files_of_interest_path, "r") as f_in:
    for line in f_in:
        file_of_interest_data = jsonpickle.decode(line)

In [None]:
# Lookup repo name -> repo id; filled while the labeled issue files are loaded.
REPO_TO_ID = {}

In [None]:
# Load labeled issues.
# Every file in DATA_FOLDER whose name contains LABELED_FILE_KEY is decoded;
# what remains of the file name (without the key and ".json") is the repo id
# recorded in REPO_TO_ID for each repo found in that file.
labeled_issues_of_interest_data = {}
for file in os.listdir(DATA_FOLDER):
    if LABELED_FILE_KEY not in file:
        continue
    repoId = file.replace(LABELED_FILE_KEY, "").replace(".json", "")
    # Reset per file so an empty file cannot silently re-use (or, for the
    # first file, fail on) the content decoded from a previous file.
    repo_labeled_issues = {}
    with open(os.path.join(DATA_FOLDER, file), "r") as f_in:
        for line in f_in:
            # The content of the last line wins.
            repo_labeled_issues = jsonpickle.decode(line)
    for repo in repo_labeled_issues:
        REPO_TO_ID[repo] = repoId
    labeled_issues_of_interest_data.update(repo_labeled_issues)
#===
# Normalize the shape: a repo stored as a list of issues becomes a dict keyed
# by the issue number (as string); None entries of the list are dropped.
for repo, issues in labeled_issues_of_interest_data.items():
    if isinstance(issues, list):
        # Only values of existing keys are replaced, so iterating is safe.
        labeled_issues_of_interest_data[repo] = {
            str(issue["number"]): issue for issue in issues if issue is not None
        }

In [None]:
# Load files of interest source (the decoded content of the last line is kept).
source_lookup_path = os.path.join(DATA_FOLDER, FILE_OF_INTEREST_SOURCE)
with open(source_lookup_path, "r") as f_in:
    for line in f_in:
        file_states = jsonpickle.decode(line)
#
# The keys are strings; literal_eval (make_tuple) parses each of them back
# into the Python value it spells out before it is used as the new key.
adjusted_file_states = {make_tuple(entry): state for entry, state in file_states.items()}
#
file_states = adjusted_file_states

In [None]:
# Load file embeddings: merge every embedding lookup file found in DATA_FOLDER
# into one dictionary (for each file the content of its last line is used).
file_embeddings_data = {}
embedding_file_names = [name for name in os.listdir(DATA_FOLDER) if EMBEDDING_FILE_KEY in name]
for file in embedding_file_names:
    embedding_path = os.path.join(DATA_FOLDER, file)
    with open(embedding_path, "r") as f_in:
        for line in f_in:
            repo_file_embeddings = jsonpickle.decode(line)
    file_embeddings_data.update(repo_file_embeddings)

## Adjust embeddings - missing file and commit id

In [None]:
# Counter incremented by the embedding back-fill loop.
adjuted_cnt = 0

In [None]:
# Back-fill missing embeddings: for every file state without an "embedding",
# look for an embedding entry of the same repo whose "text" equals the
# state's "source" and copy its embedding over.
for key, state in file_states.items():
    if "embedding" in state:
        continue
    #
    repo, _, _ = key
    #
    found = False
    # Repos without any embedding data are skipped instead of raising KeyError.
    for entry in file_embeddings_data.get(repo, []):
        if state["source"] == entry["text"] and "embedding" in entry:
            # No break on purpose: if several entries match, the last one is kept.
            state["embedding"] = entry["embedding"]
            found = True
    if not found:
        continue
    # Count every adjusted file state exactly once and report progress only
    # when the counter actually moved (avoids repeated identical prints).
    adjuted_cnt = adjuted_cnt + 1
    if adjuted_cnt % 50 == 0:
        print(adjuted_cnt)
print(adjuted_cnt)

In [None]:
# Serialize the file states first, then write them to DATA_FOLDER.
encoding = jsonpickle.encode(file_states)
output_path = os.path.join(DATA_FOLDER, "files_of_interest_with_embeddings.json")
with open(output_path, "w") as f_out:
    f_out.write(encoding)

## Load adjusted data

In [None]:
# Name of the file (inside DATA_FOLDER) holding the file states with embeddings.
FILE_OF_INTEREST_WITH_EMBEDDINGS = "files_of_interest_with_embeddings.json"

In [None]:
# Load files of interest with embeddings (the content of the last line wins).
embeddings_path = os.path.join(DATA_FOLDER, FILE_OF_INTEREST_WITH_EMBEDDINGS)
with open(embeddings_path, "r") as f_in:
    for line in f_in:
        file_states = jsonpickle.decode(line)

In [None]:
# Re-key the lookup: each string key is parsed with literal_eval (make_tuple)
# and the parsed value is used as the key of the same state.
adjusted_file_states = {make_tuple(entry): state for entry, state in file_states.items()}
#
file_states = adjusted_file_states

In [None]:
# Empty placeholder cell; it does nothing.
pass

In [None]:
# Replace the repo -> id lookup with the persisted one (last line of the file wins).
repo_to_id_path = os.path.join(DATA_FOLDER, "repo_to_id.json")
with open(repo_to_id_path, "r") as f_in:
    for line in f_in:
        REPO_TO_ID = jsonpickle.decode(line)

## Construct datasets

In [None]:
# Build dataset[repo][file_name] -> list of version tuples
#   (source, lines_of_code, embedding, commit_cnt, all_refs_cnt, refs_cnt, refs, bug)
# Only repos with labeled issues are considered.
dataset = {}
for repo in file_of_interest_data:
    if repo not in labeled_issues_of_interest_data:
        continue
    repo_issues = labeled_issues_of_interest_data[repo]
    dataset[repo] = {}
    for file_name in file_of_interest_data[repo]:
        commits = sorted(file_of_interest_data[repo][file_name], key=lambda c: c["date"])
        # Pair every commit with its entry for this file; commits that do not
        # contain the file are dropped.
        touching = [(commit, find_file(commit, file_name)) for commit in commits]
        touching = [(commit, file) for commit, file in touching if file is not None]
        # Total number of issue references over all commits touching the file.
        all_refs_cnt = sum(len(commit["refs"]) for commit, _ in touching)
        #
        for commit, file in touching:
            key = (repo, file["sha"], commit["sha"])
            # A version is usable only with both a source and a non-None embedding.
            if key not in file_states:
                continue
            state = file_states[key]
            if "source" not in state or "embedding" not in state or state["embedding"] is None:
                continue
            #
            bug_cnt = 0
            undefined_cnt = 0
            for ref in commit["refs"]:
                issue = repo_issues.get(ref)
                if issue and "type" in issue:
                    if issue["type"] == "Bug":
                        bug_cnt = bug_cnt + 1
                else:
                    # unknown issue, empty issue or issue without a type
                    undefined_cnt = undefined_cnt + 1
            # Keep the version when it is known to be buggy, or when every
            # referenced issue has a known type.
            if bug_cnt == 0 and undefined_cnt > 0:
                continue
            #
            source = state["source"]
            lines_of_code = sum(1 for line in source.split("\n") if line.strip())
            embedding = state["embedding"]
            bug = 1 if bug_cnt > 0 else 0
            #
            dataset[repo].setdefault(file_name, []).append(
                (source, lines_of_code, embedding, len(commits), all_refs_cnt, len(commit["refs"]), commit["refs"], bug)
            )

In [None]:
# Per-repo summary table:
#   Cnt    - number of files in the dataset
#   BugCnt - number of files with at least one version flagged as bug
#   Share  - BugCnt / Cnt
#   MaxF1  - 2r/(r+1) with r = BugCnt / Cnt, i.e. the F1 obtained with
#            precision r and recall 1
print_data = []
for repo in dataset:
    cnt = len(dataset[repo])
    bug_cnt = 0
    for file in dataset[repo]:
        for version in dataset[repo][file]:
            source, lines_of_code, embedding, commit_cnt, all_refs_cnt, refs_cnt, refs, bug = version
            #
            if bug > 0:
                # one buggy version is enough to count the file
                bug_cnt = bug_cnt + 1
                break
    if cnt == 0:
        # A repo can end up without any usable file; report it instead of
        # dividing by zero.
        print_data.append([repo, bug_cnt, cnt, "n/a", "n/a"])
        continue
    # The ratio only depends on the final counts, so compute it once per repo.
    r = bug_cnt/cnt
    random_f1 = 2*r/(r+1)
    print_data.append([repo, bug_cnt, cnt, f"{round(100*bug_cnt/cnt, 2)}%", f"{round(100*random_f1, 2)}%"])
print(tabulate(print_data, headers=["Repo", "BugCnt", "Cnt", "Share", "MaxF1"]))

## Classification experiment

In [None]:
def get_prop_from_version(prop, version):
    """Return one named field of a version tuple.

    *version* must unpack into exactly eight values, in this order:
    source, loc, embedding, commit_cnt, all_refs_cnt, refs_cnt, refs, bug.
    *prop* is one of those names; any other value yields ``None``.
    """
    field_names = ("source", "loc", "embedding", "commit_cnt",
                   "all_refs_cnt", "refs_cnt", "refs", "bug")
    # Unpack explicitly so a tuple of the wrong length is still rejected.
    source, lines_of_code, embedding, commit_cnt, all_refs_cnt, refs_cnt, refs, bug = version
    field_values = (source, lines_of_code, embedding, commit_cnt,
                    all_refs_cnt, refs_cnt, refs, bug)
    for name, value in zip(field_names, field_values):
        if prop == name:
            return value
    return None

In [None]:
def calc_diff_vec(embeddings):
    """Return the sum of the differences between consecutive embeddings.

    For embeddings e0, e1, ..., en the result is
    (e0 - e1) + (e1 - e2) + ... + (e(n-1) - en).
    With a single embedding a zero vector of the same shape is returned.

    Each embedding may be a NumPy array or a plain sequence of numbers
    (e.g. a list decoded from JSON); sequences are converted to arrays first.

    Raises:
        IndexError: if *embeddings* is empty.
    """
    # Convert up front so that plain lists support `.shape` and `-`.
    vectors = [np.asarray(embedding) for embedding in embeddings]
    if len(vectors) < 2:
        return np.zeros(vectors[0].shape)
    diffs = [current - following for current, following in zip(vectors, vectors[1:])]
    return np.asarray(sum(diffs))

In [None]:
def calc_x(versions):
    """Build the feature vector of one file from all of its versions.

    The returned 1-D array is the concatenation of, in this order:
    average lines of code, mean embedding, summed consecutive embedding
    differences (see ``calc_diff_vec``), commit count and total reference
    count (both taken from the first version), and the average number of
    references per version.
    """
    def column(prop):
        # one value of `prop` per version, in version order
        return [get_prop_from_version(prop, v) for v in versions]

    locs = column("loc")
    embeddings = column("embedding")
    ref_cnts = column("refs_cnt")
    #
    features = (
        np.asarray([sum(locs) / len(locs)]),
        np.asarray(sum(embeddings) / len(versions)),
        calc_diff_vec(embeddings),
        np.asarray([get_prop_from_version("commit_cnt", versions[0])]),
        np.asarray([get_prop_from_version("all_refs_cnt", versions[0])]),
        np.asarray([sum(ref_cnts) / len(ref_cnts)]),
    )
    return np.concatenate(features)

In [None]:
def calc_y(versions):
    """Return 1 when the summed "bug" values of *versions* are positive, else 0."""
    total_bugs = 0
    for version in versions:
        total_bugs += get_prop_from_version("bug", version)
    if total_bugs > 0:
        return 1
    return 0

In [None]:
# Classifiers to evaluate, as (display name, zero-argument factory) pairs.
# Each factory call returns a new, unfitted estimator.
# KNeighborsClassifier(1) uses a single neighbor; the SVM entry is disabled.
MODELS = [("LogisticRegression", lambda: LogisticRegression()), 
          ("KNeighborsClassifier", lambda: KNeighborsClassifier(1)), 
          ("GaussianNB", lambda: GaussianNB()), 
          ("DecisionTreeClassifier", lambda: DecisionTreeClassifier()),
          #("SVM", lambda: SVC())
         ]

In [None]:
# Column names of the result rows: [model name, precision, recall, F1, MCC].
columns=["Model", "Precision", "Recall", "F1", "MCC"]

### Experiment gold

In [None]:
# One (initially empty) list of result rows per repo.
gold_experimental_results = {repo: [] for repo in dataset}

In [None]:
# Gold experiment: train and evaluate every model on the true labels,
# REP_CNT times per repo with a fresh random 80/20 split each time.
REP_CNT = 30
for repo in dataset:
    print(repo)
    #======
    # One feature vector and one label per file.
    X, Y = [], []
    for versions in dataset[repo].values():
        X.append(calc_x(versions))
        Y.append(calc_y(versions))
    X = np.asarray(X)
    y = np.asarray(Y)
    #======
    for rep in range(REP_CNT):
        if (rep+1) % 5 == 0:
            print(f"\t {rep+1}/{REP_CNT}")
        #==
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
        #
        for model_name, model_provider in MODELS:
            classifier = model_provider()
            classifier.fit(X_train, y_train)
            yp = classifier.predict(X_test)
            # precision, recall, F1 and MCC - in the order of `columns`
            scores = [score_fn(y_test, yp)
                      for score_fn in (precision_score, recall_score, f1_score, matthews_corrcoef)]
            gold_experimental_results[repo].append([model_name, *scores])

In [None]:
# Print, per repo, the mean of every metric grouped by model.
for repo, rows in gold_experimental_results.items():
    df = pd.DataFrame(rows, columns=columns)
    print(repo)
    mean_per_model = df.groupby(["Model"]).mean()
    print(mean_per_model)
    #boxplot = df.boxplot(column=columns) 
    #plt.show()
    print()

### Experiment heuristic

In [None]:
# Collect, per repo, the labeled issues as {"text", "label"} samples.
# label is 1 for issues of type 'Bug' and 0 for every other type.
nlp_datasets = {}
for repo in dataset:
    if repo not in labeled_issues_of_interest_data:
        continue
    #
    samples = []
    for issue in labeled_issues_of_interest_data[repo].values():
        # usable = has a non-None type and a prepared text
        usable = (issue is not None
                  and "type" in issue
                  and issue["type"] is not None
                  and "text" in issue)
        if usable:
            samples.append({"text": issue["text"], "label": 1 if issue["type"] == 'Bug' else 0})
    nlp_datasets[repo] = samples

In [None]:
# Quality of the plain keyword heuristic on the issues themselves:
# an issue is predicted to be a bug when its text contains "bug" or "fix".
# Result per repo: [precision, recall, F1, MCC].
heuristic_model_results = {}
for repo in nlp_datasets:
    p = []
    y = []
    for issue in nlp_datasets[repo]:
        is_bug = any(key in issue["text"] for key in ["bug", "fix"])
        p.append(1 if is_bug else 0)
        y.append(issue["label"])
    # Repos without issues get no entry (metrics are undefined on empty input).
    if not y:
        continue
    # The metrics are computed once on the complete prediction list; scoring
    # every prefix inside the loop only produced values that were overwritten.
    classifier_precision = precision_score(y, p)
    classifier_recall = recall_score(y, p)
    classifier_f1 = f1_score(y, p)
    classifier_mcc = matthews_corrcoef(y, p)
    #
    heuristic_model_results[repo] = [classifier_precision, classifier_recall, classifier_f1, classifier_mcc]

In [None]:
def calc_y_keyword_heuristic(versions, repo, keywords=("bug", "fix"), locations=("title", "body")):
    """Label a file as buggy (1) or not (0) with a keyword heuristic.

    Returns 1 as soon as any issue referenced by any of *versions* contains
    one of *keywords* (case-insensitive substring match) in one of its
    *locations*; otherwise 0.

    Args:
        versions: version tuples of one file (see ``get_prop_from_version``).
        repo: key of the repo in ``labeled_issues_of_interest_data``.
        keywords: lower-case keywords to search for.
        locations: issue fields to search; missing or None fields are skipped.
    """
    for version in versions:
        for ref in get_prop_from_version("refs", version):
            issue = labeled_issues_of_interest_data[repo].get(ref)
            if not issue:
                # unknown reference or empty issue
                continue
            for location in locations:
                text = issue.get(location)
                if text is None:
                    continue
                # lower-case once per field instead of once per keyword
                text = text.lower()
                if any(word in text for word in keywords):
                    return 1
    return 0

In [None]:
# One (initially empty) list of result rows per repo.
heuristic_experimental_results = {repo: [] for repo in dataset}
#
# Filled per repo with the true ("Y") and heuristic ("YE") labels.
heuristics_labels = {}

In [None]:
# Heuristic experiment: models are trained on the keyword-heuristic labels
# and evaluated against the true labels, REP_CNT times per repo.
REP_CNT = 30
for repo in dataset:
    print(repo)
    #======
    X, YT, YH = [], [], []
    for versions in dataset[repo].values():
        X.append(calc_x(versions))
        YT.append(calc_y(versions))
        YH.append(calc_y_keyword_heuristic(versions, repo))
    X = np.asarray(X)
    y = np.asarray(YT)
    yh = np.asarray(YH)
    #======
    heuristics_labels[repo] = {"Y": YT, "YE": YH}
    #======
    for rep in range(REP_CNT):
        if (rep+1) % 5 == 0:
            print(f"\t {rep+1}/{REP_CNT}")
        #==
        # train_test_split returns a train/test pair per input array:
        # [X_train, X_test, yh_train, yh_test, y_train, y_test]
        split = train_test_split(X, yh, y, test_size=0.2)
        X_train, X_test = split[0], split[1]
        y_train = split[2]  # heuristic labels of the training part
        y_test = split[5]   # true labels of the test part
        #
        for model_name, model_provider in MODELS:
            classifier = model_provider()
            classifier.fit(X_train, y_train)
            yp = classifier.predict(X_test)
            # precision, recall, F1 and MCC - in the order of `columns`
            scores = [score_fn(y_test, yp)
                      for score_fn in (precision_score, recall_score, f1_score, matthews_corrcoef)]
            heuristic_experimental_results[repo].append([model_name, *scores])

In [None]:
# Print, per repo, the mean of every metric grouped by model.
for repo, rows in heuristic_experimental_results.items():
    df = pd.DataFrame(rows, columns=columns)
    print(repo)
    mean_per_model = df.groupby(["Model"]).mean()
    print(mean_per_model)
    #boxplot = df.boxplot(column=columns) 
    #plt.show()
    print()

### Experiment improved Heuristic

In [None]:
# Attach two normalized strings to every issue that has a type:
#   issue["text"]        - title + " " + body
#   issue["description"] - body only
# Both are lower-cased and reduced to alphabetic characters only, so the
# result contains no spaces, digits or punctuation.
for repo in labeled_issues_of_interest_data:
    for issue in labeled_issues_of_interest_data[repo].values():
        if issue is None or "type" not in issue or issue["type"] is None:
            continue
        # None titles / bodies are treated as empty strings.
        issue_title = issue["title"] if issue["title"] is not None else ""
        issue_description = issue["body"] if issue["body"] is not None else ""
        #
        raw_by_field = (
            ("text", issue_title + " " + issue_description),
            ("description", issue_description),
        )
        for field, raw in raw_by_field:
            lowered = raw.lower().replace("\n", " ").replace("\r", " ").replace("  ", " ")
            issue[field] = ''.join(filter(str.isalpha, lowered))

In [None]:
# Collect, per repo, the labeled issues as samples with the normalized text,
# the lower-cased title, the normalized description and the label
# (1 for issues of type 'Bug', 0 otherwise).
nlp_datasets = {}
for repo in dataset:
    if repo not in labeled_issues_of_interest_data:
        continue
    #
    nlp_datasets[repo] = []
    #
    for ref in labeled_issues_of_interest_data[repo]:
        issue = labeled_issues_of_interest_data[repo][ref]
        if issue is None or "type" not in issue or issue["type"] is None or "text" not in issue:
            continue
        label = 1 if issue["type"] == 'Bug' else 0
        # An issue title can be None; treat it as empty instead of failing on
        # None.lower().
        title = issue["title"] if issue["title"] is not None else ""
        nlp_datasets[repo].append({"text": issue["text"], "title": title.lower(), "description": issue["description"], "label": label})

In [None]:
# Candidate keywords; every non-empty subset of them is one strategy.
starting_words = ['bug', 'fix', 'issue', 'out', 'error',
                  'not', 'line', 'test']
#
# Subsets are enumerated as bit masks 1 .. 2^n - 1: bit k of the mask decides
# whether starting_words[k] belongs to the subset.
combination_cnt = 2 ** len(starting_words)
strategies = [
    [word for bit, word in enumerate(starting_words) if (mask >> bit) & 1]
    for mask in range(1, combination_cnt)
]

In [None]:
# Exhaustive search for the best keyword strategy per repo.
# A strategy is a pair (description_keywords, title_keywords); an issue is
# predicted to be a bug when its title contains one of the title keywords or
# its description contains one of the description keywords. The pair with the
# highest F1 on the repo's labeled issues wins (first one on ties).
#
# Every (description, title) pair is scored. Scoring outside of the title
# loop would only ever evaluate the last title strategy.
best_strategy_for_repo = {}
for repo in nlp_datasets:
    issues = nlp_datasets[repo]
    best_strategy = None
    best_strategy_f1 = 0
    if issues:
        y = [issue["label"] for issue in issues]
        # Whether a keyword set hits an issue depends on that set only, so the
        # hits are computed once per strategy and merely combined below.
        title_hits = [
            np.array([any(key in issue["title"] for key in keywords) for issue in issues], dtype=bool)
            for keywords in strategies
        ]
        description_hits = [
            np.array([any(key in issue["description"] for key in keywords) for issue in issues], dtype=bool)
            for keywords in strategies
        ]
        for description_keywords, d_hits in zip(strategies, description_hits):
            for title_keywords, t_hits in zip(strategies, title_hits):
                p = np.logical_or(t_hits, d_hits).astype(int)
                #
                f1 = f1_score(y, p)
                if f1>best_strategy_f1:
                    best_strategy_f1 = f1
                    best_strategy = (description_keywords, title_keywords)
                    print("============================")
                    print(f"{repo} => BEST: {best_strategy}")
                    print("============================")
    # Stays None when no strategy reaches an F1 above 0.
    best_strategy_for_repo[repo] = best_strategy
    print()

In [None]:
# Quality of the best keyword strategy of each repo on the issues themselves.
# Result per repo: [precision, recall, F1, MCC].
improved_heuristic_model_results = {}
for repo in nlp_datasets:
    description_keywords, title_keywords = best_strategy_for_repo[repo]
    p = []
    y = []
    for issue in nlp_datasets[repo]:
        # title keywords are checked first, description keywords only if needed
        is_bug = (any(key in issue["title"] for key in title_keywords)
                  or any(key in issue["description"] for key in description_keywords))
        p.append(1 if is_bug else 0)
        y.append(issue["label"])
    # Repos without issues get no entry (metrics are undefined on empty input).
    if not y:
        continue
    # The metrics are computed once on the complete prediction list; scoring
    # every prefix inside the loop only produced values that were overwritten.
    classifier_precision = precision_score(y, p)
    classifier_recall = recall_score(y, p)
    classifier_f1 = f1_score(y, p)
    classifier_mcc = matthews_corrcoef(y, p)
    #
    improved_heuristic_model_results[repo] = [classifier_precision, classifier_recall, classifier_f1, classifier_mcc]

In [None]:
def calc_y_improved_keyword_heuristic(versions, repo, keywords):
    """Label a file as buggy (1) or not (0) with a per-field keyword strategy.

    Returns 1 as soon as any issue referenced by any of *versions* has a
    title keyword in its lower-cased title, or a description keyword in its
    normalized body (lower-cased, alphabetic characters only); otherwise 0.

    Args:
        versions: version tuples of one file (see ``get_prop_from_version``).
        repo: key of the repo in ``labeled_issues_of_interest_data``.
        keywords: pair ``(description_keywords, title_keywords)``.
    """
    description_keywords, title_keywords = keywords
    for version in versions:
        for ref in get_prop_from_version("refs", version):
            repo_issues = labeled_issues_of_interest_data[repo]
            if not (ref in repo_issues and repo_issues[ref]):
                # unknown reference or empty issue
                continue
            issue = repo_issues[ref]
            #
            if title_keywords and issue["title"] is not None:
                title = issue["title"].lower()
                if any(word in title for word in title_keywords):
                    return 1
            #
            if description_keywords and issue["body"] is not None:
                # Normalize the body once per issue, not once per keyword.
                text = issue["body"].lower().replace("\n", " ").replace("\r", " ").replace("  ", " ")
                text = ''.join([c for c in text  if c.isalpha()])
                if any(word in text for word in description_keywords):
                    return 1
    return 0

In [None]:
# One (initially empty) list of result rows per repo.
improved_heuristic_experimental_results = {repo: [] for repo in dataset}
#
# Filled per repo with the true ("Y") and heuristic ("YE") labels.
improved_heuristics_labels = {}

In [None]:
# Improved-heuristic experiment: models are trained on the labels produced by
# the best keyword strategy of the repo and evaluated against the true labels.
REP_CNT = 30
for repo in dataset:
    print(repo)
    #======
    repo_strategy = best_strategy_for_repo[repo]
    X, YT, YH = [], [], []
    for versions in dataset[repo].values():
        X.append(calc_x(versions))
        YT.append(calc_y(versions))
        YH.append(calc_y_improved_keyword_heuristic(versions, repo, repo_strategy))
    X = np.asarray(X)
    y = np.asarray(YT)
    yh = np.asarray(YH)
    #======
    improved_heuristics_labels[repo] = {"Y": YT, "YE": YH}
    #======
    for rep in range(REP_CNT):
        if (rep+1) % 5 == 0:
            print(f"\t {rep+1}/{REP_CNT}")
        #==
        # train_test_split returns a train/test pair per input array:
        # [X_train, X_test, yh_train, yh_test, y_train, y_test]
        split = train_test_split(X, yh, y, test_size=0.2)
        X_train, X_test = split[0], split[1]
        y_train = split[2]  # heuristic labels of the training part
        y_test = split[5]   # true labels of the test part
        #
        for model_name, model_provider in MODELS:
            classifier = model_provider()
            classifier.fit(X_train, y_train)
            yp = classifier.predict(X_test)
            # precision, recall, F1 and MCC - in the order of `columns`
            scores = [score_fn(y_test, yp)
                      for score_fn in (precision_score, recall_score, f1_score, matthews_corrcoef)]
            improved_heuristic_experimental_results[repo].append([model_name, *scores])

In [None]:
# Print, per repo, the mean of every metric grouped by model.
for repo, rows in improved_heuristic_experimental_results.items():
    df = pd.DataFrame(rows, columns=columns)
    print(repo)
    mean_per_model = df.groupby(["Model"]).mean()
    print(mean_per_model)
    #boxplot = df.boxplot(column=columns) 
    #plt.show()
    print()

### Experiment NLP - BERT

In [None]:
def calc_y_bert(versions, lookup):
    """Label a file from per-issue predictions.

    *lookup* maps an issue number (int) to its predicted label. Returns 1 as
    soon as any issue referenced by any of *versions* has a label greater
    than 0; references missing from *lookup* are ignored. Otherwise returns 0.
    """
    for version in versions:
        # references are converted to int lazily, in their original order
        issue_numbers = (int(ref) for ref in get_prop_from_version("refs", version))
        if any(number in lookup and lookup[number] > 0 for number in issue_numbers):
            return 1
    return 0

In [None]:
# Build issue["text"] for every issue that has a type:
# body without tag-like "<...>" spans, prefixed by the title, with line
# breaks / tabs turned into spaces, runs of spaces collapsed and
# space-delimited "http..." tokens replaced by "[link]".
CLEANR = re.compile('<.*?>')
for repo in labeled_issues_of_interest_data:
    for issue in labeled_issues_of_interest_data[repo].values():
        if issue is None or "type" not in issue or issue["type"] is None:
            continue
        # None titles / bodies are treated as empty strings.
        issue_title = issue["title"] if issue["title"] is not None else ""
        issue_description = issue["body"] if issue["body"] is not None else ""
        #
        text = issue_title + " " + CLEANR.sub(' ', issue_description)
        for whitespace in ('\n', '\r', '\t'):
            text = text.replace(whitespace, ' ')
        text = re.sub(' +', ' ', text)
        text = re.sub(' http.*? ', ' [link] ', text)
        #
        issue["text"] = text

In [None]:
# Leave-one-repo-out issue datasets: for every repo, "train" holds the labeled
# issues of all other repos of the dataset and "test" the repo's own issues.
# label is 1 for issues of type 'Bug', 0 otherwise. A count table is printed.
print_data = []
nlp_datasets = {}
for repo in dataset:
    if repo not in labeled_issues_of_interest_data:
        continue
    #
    train_samples = []
    train_bug_cnt = 0
    for other_repo in dataset:
        if other_repo == repo or other_repo not in labeled_issues_of_interest_data:
            continue
        for issue in labeled_issues_of_interest_data[other_repo].values():
            if issue is None or "type" not in issue or issue["type"] is None or "text" not in issue:
                continue
            label = 1 if issue["type"] == 'Bug' else 0
            train_bug_cnt = train_bug_cnt + label
            train_samples.append({"text": issue["text"], "label": label})
    #
    test_samples = []
    test_bug_cnt = 0
    for issue in labeled_issues_of_interest_data[repo].values():
        if issue is None or "type" not in issue or issue["type"] is None or "text" not in issue:
            continue
        label = 1 if issue["type"] == 'Bug' else 0
        test_bug_cnt = test_bug_cnt  + label
        # test samples keep the issue number so predictions can be mapped back
        test_samples.append({"issueId": issue["number"], "text": issue["text"], "label": label})
    #
    nlp_datasets[repo] = {"train": train_samples, "test": test_samples}
    print_data.append([repo, train_bug_cnt, len(train_samples), test_bug_cnt, len(test_samples)])
print(tabulate(print_data, headers=["Repo", "TrainBugCnt", "TrainCnt", "TestBugCnt", "TestCnt"]))

In [None]:
# Tokenizer matching the "roberta-base" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

In [None]:
def tokenize(entry):
    """Tokenize ``entry["text"]`` with the module-level tokenizer.

    The text is truncated and padded with the "max_length" strategy.
    """
    texts = entry["text"]
    return tokenizer(texts, truncation=True, padding="max_length")

In [None]:
# F1 metric used to score the model during evaluation.
metric = evaluate.load("f1")
#
def compute_metrics(eval_pred):
    """Compute F1 for an ``(outputs, labels)`` evaluation pair.

    The predicted class is the argmax of the outputs over the last axis.
    """
    outputs, references = eval_pred
    predicted_classes = np.argmax(outputs, axis=-1)
    #
    return metric.compute(references=references, predictions=predicted_classes)

In [None]:
# One (initially empty) list of result rows per repo.
bert_experimental_results = {repo: [] for repo in dataset}
#
# Per repo: issue-classifier scores, and true ("Y") / derived ("YE") file labels.
bert_model_results = {}
bert_labels = {}

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
# BERT experiment, per repo:
#   1. fine-tune roberta-base as issue classifier on nlp_datasets[repo]["train"]
#   2. predict a label for every issue in nlp_datasets[repo]["test"]
#   3. derive file labels from the predicted issue labels (calc_y_bert)
#   4. train the classic models on those derived labels and evaluate them
#      against the true labels, REP_CNT times
# Fraction of the "train" issues used for training; the rest is validation.
TRAIN_SIZE = 0.8
REP_CNT = 30
for repo in dataset:
    print(repo)
    #===============================
    # Train BERT
    train_nlp = Dataset.from_pandas(pd.DataFrame(nlp_datasets[repo]["train"]))
    #
    # Random train / validation split via a permutation of the row indices.
    permutation = torch.randperm(len(train_nlp)).tolist()
    train_cnt = int(len(train_nlp) * TRAIN_SIZE)
    train_indices = permutation[:train_cnt]
    val_indices = permutation[train_cnt:]
    # val_nlp must be selected first: train_nlp is re-bound on the next line.
    val_nlp = train_nlp.select(val_indices)
    train_nlp = train_nlp.select(train_indices)
    #
    train_nlp = train_nlp.map(tokenize, batched=True)
    val_nlp = val_nlp.map(tokenize, batched=True)
    #
    # A fresh pretrained model with a 2-class head is created for every repo.
    model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=2)
    if device == "cuda":
        model.cuda()
    #
    # Evaluation, logging and checkpointing all happen once per epoch; the
    # output directory is named after the repo id.
    training_args = TrainingArguments(
                         overwrite_output_dir=True,
                         output_dir=f"roberta-issue-classifier-{REPO_TO_ID[repo]}",
                         evaluation_strategy="epoch",
                         learning_rate=2e-5,
                         logging_strategy='epoch',
                         per_device_train_batch_size=4,
                         per_device_eval_batch_size=4,
                         save_total_limit=3,
                         num_train_epochs=6, 
                         gradient_accumulation_steps=4,
                         gradient_checkpointing=True,
                         weight_decay=1e-3,
                         save_strategy='epoch',
                         load_best_model_at_end=True)
    #
    trainer = Trainer(
                model=model,
                args=training_args,
                train_dataset=train_nlp,
                eval_dataset=val_nlp,
                compute_metrics=compute_metrics)
    #
    trainer.train()
    #
    model.save_pretrained(f"roberta-issue-classifier-{REPO_TO_ID[repo]}/evaluated_model")
    #
    # Test set: the issues of the current repo.
    test_nlp = Dataset.from_pandas(pd.DataFrame(nlp_datasets[repo]["test"]))
    test_nlp = test_nlp.map(tokenize, batched=True)
    #
    # Torch-formatted copy without the raw text; "label" is renamed to "labels".
    copy_test_nlp = test_nlp.select([i for i in range(len(test_nlp))])
    copy_test_nlp = copy_test_nlp.remove_columns(["text"]).rename_column("label", "labels")
    copy_test_nlp.set_format("torch")
    #
    # Drop the training data before inference.
    del train_nlp
    del val_nlp
    if device == "cuda":
        torch.cuda.empty_cache()
    #
    eval_dataloader = DataLoader(copy_test_nlp, batch_size=16)
    #
    model.eval()
    #
    all_labels = []
    all_preds = []
    all_issueId = []
    for batch in eval_dataloader:
        all_labels.append(batch['labels'].detach().cpu())
        all_issueId.append(batch['issueId'].detach().cpu())
        # issueId is removed from the batch before it is passed to the model.
        del batch["issueId"]
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            hs = model(**batch, output_hidden_states=True)
            logits = hs.logits
            predictions = torch.argmax(logits, dim=-1)
            # NOTE(review): last_hiddens is computed but never used in this cell.
            last_hiddens = hs.hidden_states[-1][:,0,:]
            all_preds.append(predictions)
    #
    all_labels = torch.cat(all_labels, 0).detach().cpu().numpy()
    all_preds = torch.cat(all_preds, 0).detach().cpu().numpy()                          
    all_issueId = torch.cat(all_issueId, 0).detach().cpu().numpy()
    #
    # Quality of the issue classifier itself on the repo's issues.
    bert_precision = precision_score(all_labels, all_preds)
    bert_recall = recall_score(all_labels, all_preds)
    bert_f1 = f1_score(all_labels, all_preds)
    bert_mcc = matthews_corrcoef(all_labels, all_preds)   
    #
    bert_model_results[repo] = [bert_precision, bert_recall, bert_f1, bert_mcc]
    #
    # Lookup issue number -> predicted label, used to label the files.
    issue_lbl_lookup = {}
    for i in range(len(test_nlp)):
        issue_bert_lbl = all_preds[i]
        issue_id = all_issueId[i]
        issue_lbl_lookup[issue_id] = issue_bert_lbl
    #===============================
    # One feature vector, one true label (YT) and one derived label (YH) per file.
    X = []
    YT = []
    YH = []
    for file in dataset[repo]:
        versions = dataset[repo][file]
        x = calc_x(versions)
        yt = calc_y(versions)
        yh = calc_y_bert(versions, issue_lbl_lookup)
        #
        X.append(x)
        YT.append(yt)
        YH.append(yh)
    X = np.asarray(X)
    y = np.asarray(YT)
    yh = np.asarray(YH)
    #======
    bert_labels[repo] = {
        "Y": YT,
        "YE": YH
    }
    #======
    for rep in range(REP_CNT):
        if (rep+1) % 5 == 0:
            print(f"\t {rep+1}/{REP_CNT}")
        #==
        # train_test_split returns a train/test pair per input array, so
        # y_train holds the derived labels (yh) of the training part and
        # y_test the true labels (y) of the test part.
        X_train, X_test, y_train, _, _, y_test  = train_test_split(X, yh, y, test_size=0.2)
        #
        for model_name, model_provider in MODELS:
            classifier = model_provider()
            #
            classifier.fit(X_train, y_train)
            #
            yp = classifier.predict(X_test)
            #
            classifier_precision = precision_score(y_test, yp)
            classifier_recall = recall_score(y_test, yp)
            classifier_f1 = f1_score(y_test, yp)
            classifier_mcc = matthews_corrcoef(y_test, yp)
            #
            bert_experimental_results[repo].append([model_name, classifier_precision, classifier_recall, classifier_f1, classifier_mcc])

In [None]:
# Table with the scores of the issue classifier, one row per repo.
print_data = [[repo] + scores for repo, scores in bert_model_results.items()]
print(tabulate(print_data, headers=["Repo", "Precision", "Recall", "F1", "MCC"]))

In [None]:
# Print, per repo, the mean of every metric grouped by model.
for repo, rows in bert_experimental_results.items():
    df = pd.DataFrame(rows, columns=columns)
    print(repo)
    mean_per_model = df.groupby(["Model"]).mean()
    print(mean_per_model)
    #boxplot = df.boxplot(column=columns) 
    #plt.show()
    print()

### Experiment NLP - Fasttext

In [None]:
def calc_y_fasttext(versions, lookup):
    """Derive a binary file label from the predicted labels of referenced issues.

    Args:
        versions: iterable of version records; the "refs" property of each one
            is read through get_prop_from_version (presumably the issue
            numbers the version refers to -- confirm against that helper).
        lookup: mapping from integer issue id to its predicted label.

    Returns:
        1 as soon as one referenced issue is present in ``lookup`` with a
        label greater than zero, otherwise 0.
    """
    referenced_ids = (
        int(ref)
        for version in versions
        for ref in get_prop_from_version("refs", version)
    )
    # Ids missing from the lookup count as "not a bug" (label 0).
    has_positive_ref = any(lookup.get(issue_id, 0) > 0 for issue_id in referenced_ids)
    return 1 if has_positive_ref else 0

In [None]:
# Patterns used to normalise issue text; compiled once here instead of on
# every issue inside the loop.
CLEANR = re.compile('<.*?>')          # anything that looks like an HTML/XML tag
_MULTI_SPACE = re.compile(' +')       # runs of spaces
_SPACED_URL = re.compile(' http.*? ') # space-delimited token starting with "http"

# Attach a cleaned "text" field (title + description) to every labeled issue.
# Issues without a usable "type" are skipped and get no "text" key.
for repo in labeled_issues_of_interest_data:
    for ref in labeled_issues_of_interest_data[repo]:
        issue = labeled_issues_of_interest_data[repo][ref]
        if issue is None or "type" not in issue or issue["type"] is None:
            continue
        issue_title = issue["title"]
        issue_description = issue["body"]
        issue_title = "" if issue_title is None else issue_title
        issue_description = "" if issue_description is None else issue_description
        #
        # Strip tags from the description only, then prepend the title.
        text = CLEANR.sub(' ', issue_description)
        text = issue_title + " " + text
        # Flatten to a single line and collapse repeated spaces.
        text = text.replace('\n', ' ').replace('\r', ' ').replace('\t', ' ')
        text = _MULTI_SPACE.sub(' ', text)
        # Replace links by a placeholder token. Only links surrounded by
        # spaces are matched, so one at the very start/end of the text stays.
        text = _SPACED_URL.sub(' [link] ', text)
        #
        issue["text"] = text

In [None]:
# Build a leave-one-repository-out text dataset: for each labeled repository
# the training split holds the issues of all *other* labeled repositories and
# the test split holds its own issues. Label is 1 for type 'Bug', else 0.
print_data = []
nlp_datasets = {}
for repo in dataset:
    if repo not in labeled_issues_of_interest_data:
        continue
    #
    train_rows = []
    for other_repo in dataset:
        if other_repo == repo or other_repo not in labeled_issues_of_interest_data:
            continue
        for issue in labeled_issues_of_interest_data[other_repo].values():
            # Only issues that have a type and a cleaned text are usable.
            if issue is None or "type" not in issue or issue["type"] is None or "text" not in issue:
                continue
            train_rows.append({"text": issue["text"], "label": int(issue["type"] == 'Bug')})
    #
    test_rows = []
    for issue in labeled_issues_of_interest_data[repo].values():
        if issue is None or "type" not in issue or issue["type"] is None or "text" not in issue:
            continue
        # Test rows additionally carry the issue number so predictions can be
        # looked up per issue later.
        test_rows.append({"issueId": issue["number"], "text": issue["text"], "label": int(issue["type"] == 'Bug')})
    #
    nlp_datasets[repo] = {"train": train_rows, "test": test_rows}
    #
    train_bug_cnt = sum(row["label"] for row in train_rows)
    test_bug_cnt = sum(row["label"] for row in test_rows)
    print_data.append([repo, train_bug_cnt, len(train_rows), test_bug_cnt, len(test_rows)])
print(tabulate(print_data, headers=["Repo", "TrainBugCnt", "TrainCnt", "TestBugCnt", "TestCnt"]))

In [None]:
def fasttext_adjust_text(text):
    """Normalise text into a single lowercase line of alphanumerics and whitespace.

    Blank (whitespace-only) lines are dropped, the remaining lines are joined
    with a single space, the result is lowercased and every character that is
    neither alphanumeric nor whitespace is removed. Lines themselves are not
    stripped, so their leading/trailing whitespace is kept.
    """
    kept_lines = []
    for line in text.split("\n"):
        if line.strip():
            kept_lines.append(line)
    lowered = " ".join(kept_lines).lower()
    return "".join(filter(lambda ch: ch.isalnum() or ch.isspace(), lowered))

In [None]:
def write_fasttext_file(filename, data):
    """Write entries as one ``__label__<label> <text>`` line each.

    Args:
        filename: path of the file to create (an existing file is overwritten).
        data: iterable of dicts with a "text" and a "label" entry; the text is
            normalised with fasttext_adjust_text before it is written.
    """
    # Issue text may contain non-ASCII characters and fastText reads UTF-8,
    # so do not depend on the platform's default encoding.
    with open(filename, "w", encoding="utf-8") as f_out:
        for entry in data:
            text = fasttext_adjust_text(entry["text"])
            label = entry["label"]
            f_out.write(f"__label__{label} {text}\n")

In [None]:
# Containers filled by the FastText experiment:
#   fasttext_experimental_results: per repository, a list of result rows
#   fasttext_model_results:        per repository, scores of the issue classifier
#   fasttext_labels:               per repository, file labels ("Y" / "YE")
fasttext_experimental_results = {repo: [] for repo in dataset}
#
fasttext_model_results = dict()
fasttext_labels = dict()

In [None]:
# FastText experiment, run once per repository:
#   1. train a FastText issue classifier on the other repositories' issues,
#   2. score it on this repository's issues,
#   3. derive file labels from the predicted issue labels,
#   4. repeatedly train every model in MODELS on those derived labels and
#      evaluate it against the labels produced by calc_y.
TRAIN_SIZE = 0.8  # NOTE(review): not referenced in this cell; the splits below hard-code test_size=0.2
REP_CNT = 30
for repo in dataset:
    print(repo)
    # nlp_datasets only has an entry for repositories with labeled issues;
    # skip the others instead of failing with a KeyError.
    if repo not in nlp_datasets:
        print("\t no labeled issues - skipped")
        continue
    #===============================
    # Train FastText
    train_nlp = nlp_datasets[repo]["train"]
    train_nlp, val_nlp  = train_test_split(train_nlp, test_size=0.2)
    #
    write_fasttext_file("data.train", train_nlp)
    write_fasttext_file("data.valid", val_nlp)
    #
    # Hyper-parameters are autotuned on the validation file for 15 minutes.
    model = fasttext.train_supervised(input='data.train', autotuneValidationFile='data.valid', autotuneDuration=15*60)
    #
    model.save_model(f"fasttext-issue-classifier-{REPO_TO_ID[repo]}.bin")
    #
    test_nlp = nlp_datasets[repo]["test"]
    #
    all_issueId = []
    all_labels = []
    all_preds = []
    for entry in test_nlp:
        text = fasttext_adjust_text(entry["text"])
        predicted_labels = model.predict(text)[0]
        # Entries for which the model returns no label are left out entirely.
        if len(predicted_labels) > 0:
            # The label string contains the class digit ("0" or "1").
            p = 0 if "0" in predicted_labels[0] else 1
            #
            all_issueId.append(entry["issueId"])
            all_labels.append(entry["label"])
            all_preds.append(p)
    #
    fasttext_precision = precision_score(all_labels, all_preds)
    fasttext_recall = recall_score(all_labels, all_preds)
    fasttext_f1 = f1_score(all_labels, all_preds)
    fasttext_mcc = matthews_corrcoef(all_labels, all_preds)
    #
    fasttext_model_results[repo] = [fasttext_precision, fasttext_recall, fasttext_f1, fasttext_mcc]
    #
    # issue id -> label predicted by FastText
    issue_lbl_lookup = dict(zip(all_issueId, all_preds))
    #===============================
    X = []
    YT = []
    YH = []
    for file in dataset[repo]:
        versions = dataset[repo][file]
        x = calc_x(versions)
        yt = calc_y(versions)
        yh = calc_y_fasttext(versions, issue_lbl_lookup)
        #
        X.append(x)
        YT.append(yt)
        YH.append(yh)
    X = np.asarray(X)
    y = np.asarray(YT)
    yh = np.asarray(YH)
    #======
    fasttext_labels[repo] = {
        "Y": YT,
        "YE": YH
    }
    #======
    for rep in range(REP_CNT):
        if (rep+1) % 5 == 0:
            print(f"\t {rep+1}/{REP_CNT}")
        #==
        # Train on the FastText-derived labels (yh), evaluate on the calc_y
        # labels (y); the unused halves of the split are discarded.
        X_train, X_test, y_train, _, _, y_test  = train_test_split(X, yh, y, test_size=0.2)
        #
        for model_name, model_provider in MODELS:
            classifier = model_provider()
            #
            classifier.fit(X_train, y_train)
            #
            yp = classifier.predict(X_test)
            #
            classifier_precision = precision_score(y_test, yp)
            classifier_recall = recall_score(y_test, yp)
            classifier_f1 = f1_score(y_test, yp)
            classifier_mcc = matthews_corrcoef(y_test, yp)
            #
            fasttext_experimental_results[repo].append([model_name, classifier_precision, classifier_recall, classifier_f1, classifier_mcc])

In [None]:
# One table row per repository: its name followed by the FastText
# issue-classifier scores (precision, recall, F1, MCC).
print_data = [[repo_name] + scores for repo_name, scores in fasttext_model_results.items()]
metric_headers = ["Repo", "Precision", "Recall", "F1", "MCC"]
print(tabulate(print_data, headers=metric_headers))

In [None]:
# For each repository, average every metric column over all recorded rows of
# the same model and print the resulting table.
for repo, result_rows in fasttext_experimental_results.items():
    df = pd.DataFrame(result_rows, columns=columns)
    print(repo)
    per_model_mean = df.groupby(["Model"]).mean()
    print(per_model_mean)
    print()

### Compare results

In [None]:
# Flatten the issue-level scores of every labeling method into rows of
# [method, repo, precision, recall, f1, mcc].
issue_label_data = []
method_results = (
    ("Heuristic", heuristic_model_results),
    ("IHeuristic", improved_heuristic_model_results),
    ("BERT", bert_model_results),
    ("FastText", fasttext_model_results),
)
for method, results in method_results:
    for repo, scores in results.items():
        # Each entry must hold exactly the four scores.
        classifier_precision, classifier_recall, classifier_f1, classifier_mcc = scores
        row = [method, repo, classifier_precision, classifier_recall, classifier_f1, classifier_mcc]
        issue_label_data.append(row)

In [None]:
# Issue-level scores indexed by (repository, method).
issue_metric_columns = ["Method", "Repo", "Precision", "Recall", "F1", "MCC"]
df = pd.DataFrame(issue_label_data, columns=issue_metric_columns)
mean_res = df.groupby(["Repo", "Method"]).mean()
print(mean_res)

In [None]:
# Compare, per labeling method and repository, the derived file labels ("YE")
# with the reference file labels ("Y") through a confusion matrix.
file_label_data = []
for entry in [["Heuristic", heuristics_labels], ["IHeuristic", improved_heuristics_labels],
              ["BERT", bert_labels], ["FastText", fasttext_labels]]:
    method = entry[0]
    labels = entry[1]
    for repo in labels:
        y = labels[repo]["Y"]
        p = labels[repo]["YE"]
        #
        # Fix the label set so the matrix is always 2x2; without it a
        # repository where only one class occurs yields a 1x1 matrix and the
        # four-way unpacking below fails.
        tn, fp, fn, tp = confusion_matrix(y, p, labels=[0, 1]).ravel()
        cnt = tn+fp+fn+tp
        # NOTE: the last two values are FP and FN as a percentage of *all*
        # files (divided by cnt), not of the actual negatives/positives.
        file_label_data.append([method, repo, tp, tn, fp, fn, round(100*fp/cnt, 4), round(100*fn/cnt, 4)])

In [None]:
# Show only the two percentage columns, indexed by (method, repository).
confusion_columns = ["Method", "Repo", "TP", "TN", "FP", "FN", "FPR", "FNR"]
df = pd.DataFrame(file_label_data, columns=confusion_columns)
# Alternative view (raw counts instead of percentages):
#mean_res = df.drop(columns=["FPR", "FNR"]).groupby(["Repo", "Method"]).mean()
rates_only = df.drop(columns=["TP", "TN", "FP", "FN"])
mean_res = rates_only.groupby(["Method", "Repo"]).mean()
print(mean_res)

In [None]:
# Score the derived file labels ("YE") of every labeling method against the
# reference file labels ("Y"), one row per (method, repository).
file_label_data = []
method_labels = (
    ("Heuristic", heuristics_labels),
    ("IHeuristic", improved_heuristics_labels),
    ("BERT", bert_labels),
    ("FastText", fasttext_labels),
)
for method, labels in method_labels:
    for repo, repo_labels in labels.items():
        y = repo_labels["Y"]
        p = repo_labels["YE"]
        #
        file_label_data.append([
            method,
            repo,
            precision_score(y, p),
            recall_score(y, p),
            f1_score(y, p),
            matthews_corrcoef(y, p),
        ])

In [None]:
# File-level label quality indexed by (repository, method).
file_metric_columns = ["Method", "Repo", "Precision", "Recall", "F1", "MCC"]
df = pd.DataFrame(file_label_data, columns=file_metric_columns)
mean_res = df.groupby(["Repo", "Method"]).mean()
print(mean_res)

In [None]:
# For every repository, stack the result rows obtained with each labeling
# source (tagged in a "Type" column) and print the mean of each metric per
# (model, labeling source).
for repo in REPO_TO_ID:
    frames = []
    for type_name, experiment in (
        ("Gold", gold_experimental_results),
        ("Heuristic", heuristic_experimental_results),
        ("IHeuristic", improved_heuristic_experimental_results),
        ("BERT", bert_experimental_results),
        ("FastText", fasttext_experimental_results),
    ):
        frame = pd.DataFrame(experiment[repo], columns=columns)
        frame["Type"] = [type_name] * len(frame)
        frames.append(frame)
    #
    df = pd.concat(frames)
    #
    mean_res = df.groupby(["Model", "Type"]).mean()
    print(repo)
    print(mean_res)
    print("\n")

In [None]:
#Statistics
from scipy.stats import normaltest

In [None]:
def is_normal(values):
    """Return whether normaltest fails to reject normality for ``values``.

    The p-value of scipy's ``normaltest`` is compared with the module-level
    significance level ``alpha``; the sample is treated as normal when the
    p-value is at least ``alpha``.
    """
    p_value = normaltest(values).pvalue
    return p_value >= alpha

In [None]:
# Count for how many (repository, labeling source, model) combinations the MCC
# sample passes the normality check. The "SVM" model is excluded.
alpha = 0.05
#
cnt = 0
normal_cnt = 0
for repo in REPO_TO_ID:
    frames = []
    for type_name, experiment in (
        ("Gold", gold_experimental_results),
        ("Heuristic", heuristic_experimental_results),
        ("IHeuristic", improved_heuristic_experimental_results),
        ("BERT", bert_experimental_results),
        ("FastText", fasttext_experimental_results),
    ):
        frame = pd.DataFrame(experiment[repo], columns=columns)
        frame["Type"] = [type_name] * len(frame)
        frames.append(frame)
    df = pd.concat(frames)
    #
    types = set(df["Type"].values)
    models = set(df["Model"].values)
    models.remove("SVM")
    #
    for t in types:
        rows_of_type = df[df["Type"] == t]
        for m in models:
            cnt += 1
            mcc_sample = rows_of_type[rows_of_type["Model"] == m]["MCC"].values
            if is_normal(mcc_sample):
                normal_cnt += 1
print(f"{normal_cnt}/{cnt}")

In [None]:
#Statistics
from scipy.stats import mannwhitneyu
from scipy.stats import ttest_ind

In [None]:
# Compare the MCC obtained with BERT-derived labels against every other
# non-gold labeling source, per repository and model ("SVM" excluded).
#   cnt            - number of comparisons
#   repo_better    - comparisons where the BERT mean MCC is higher
#   repo_important - of those, comparisons where p is not above alpha
alpha = 0.05
#
repo_better = 0
repo_important = 0
cnt = 0
for repo in REPO_TO_ID:
    frames = []
    for type_name, experiment in (
        ("Gold", gold_experimental_results),
        ("Heuristic", heuristic_experimental_results),
        ("IHeuristic", improved_heuristic_experimental_results),
        ("BERT", bert_experimental_results),
        ("FastText", fasttext_experimental_results),
    ):
        frame = pd.DataFrame(experiment[repo], columns=columns)
        frame["Type"] = [type_name] * len(frame)
        frames.append(frame)
    df = pd.concat(frames)
    #
    types = set(df["Type"].values)
    models = set(df["Model"].values)
    models.remove("SVM")
    #
    for t in types:
        if t in ("BERT", "Gold"):
            continue
        for m in models:
            cnt += 1
            bert_mcc = df[(df["Type"]=="BERT") & (df["Model"]==m)]["MCC"]
            other_mcc = df[(df["Type"]==t) & (df["Model"]==m)]["MCC"]
            # t-test when both samples look normal, Mann-Whitney U otherwise.
            both_normal = is_normal(bert_mcc.values) and is_normal(other_mcc.values)
            significance_test = ttest_ind if both_normal else mannwhitneyu
            stat, p = significance_test(bert_mcc.values, other_mcc.values)
            b_mean = bert_mcc.mean()
            o_mean = other_mcc.mean()
            #
            if b_mean > o_mean:
                repo_better += 1
                # "p > alpha" means same distribution (fail to reject H0);
                # everything else counts as a different distribution.
                if not p > alpha:
                    repo_important += 1
print(f"{repo_better} {repo_important}/{cnt}")
# 65 53/84  (presumably the output of an earlier run)

In [None]:
# Compare the MCC obtained with BERT-derived labels against the MCC obtained
# with gold labels, per repository and model ("SVM" excluded), and count the
# comparisons where the test does NOT reject H0 (p > alpha).
alpha = 0.05
#
repo_better = 0
repo_important = 0
cnt = 0
for repo in REPO_TO_ID:
    frames = []
    for type_name, experiment in (
        ("Gold", gold_experimental_results),
        ("Heuristic", heuristic_experimental_results),
        ("IHeuristic", improved_heuristic_experimental_results),
        ("BERT", bert_experimental_results),
        ("FastText", fasttext_experimental_results),
    ):
        frame = pd.DataFrame(experiment[repo], columns=columns)
        frame["Type"] = [type_name] * len(frame)
        frames.append(frame)
    df = pd.concat(frames)
    #
    types = set(df["Type"].values)
    models = set(df["Model"].values)
    models.remove("SVM")
    #
    for m in models:
        cnt += 1
        bert_mcc = df[(df["Type"]=="BERT") & (df["Model"]==m)]["MCC"].values
        gold_mcc = df[(df["Type"]=="Gold") & (df["Model"]==m)]["MCC"].values
        # t-test when both samples look normal, Mann-Whitney U otherwise.
        both_normal = is_normal(bert_mcc) and is_normal(gold_mcc)
        significance_test = ttest_ind if both_normal else mannwhitneyu
        stat, p = significance_test(bert_mcc, gold_mcc)
        #
        # Same distribution (fail to reject H0) is the case being counted.
        if p > alpha:
            repo_important += 1
print(f"{repo_important}/{cnt}")

### BERT visualize TSNE

In [None]:
# For every repository: load its saved issue classifier, take the last hidden
# layer at the first token position for each test issue, project those vectors
# to 2-D with t-SNE and save a scatter plot coloured by the issue label.
for repo in dataset:
    print(repo)
    #===============================
    model = AutoModelForSequenceClassification.from_pretrained(f"./roberta-issue-classifier-{REPO_TO_ID[repo]}/evaluated_model", num_labels=2)
    if device == "cuda":
        model.cuda()
    #
    test_nlp = Dataset.from_pandas(pd.DataFrame(nlp_datasets[repo]["test"]))
    test_nlp = test_nlp.map(tokenize, batched=True)
    #
    # Work on a copy so test_nlp keeps its "text" and "label" columns.
    copy_test_nlp = test_nlp.select(range(len(test_nlp)))
    copy_test_nlp = copy_test_nlp.remove_columns(["text"]).rename_column("label", "labels")
    copy_test_nlp.set_format("torch")
    #
    if device == "cuda":
        torch.cuda.empty_cache()
    #
    eval_dataloader = DataLoader(copy_test_nlp, batch_size=16)
    #
    model.eval()
    #
    all_last_hiddens = []
    all_labels = []
    for batch in eval_dataloader:
        all_labels.append(batch['labels'].detach().cpu())
        # issueId is not a model input.
        del batch["issueId"]
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            hs = model(**batch, output_hidden_states=True)
            last_hiddens = hs.hidden_states[-1][:,0,:]
            # Move each batch to the CPU right away so the hidden states of
            # the whole test set do not pile up in GPU memory.
            all_last_hiddens.append(last_hiddens.detach().cpu())
    #
    all_labels = torch.cat(all_labels, 0).detach().cpu().numpy()
    all_last_hiddens = torch.cat(all_last_hiddens, 0).numpy()
    X_embedded = TSNE(n_components=2).fit_transform(all_last_hiddens)
    #
    figure(figsize=(8, 6), dpi=80)
    plt.scatter(X_embedded[:,0], X_embedded[:,1], c=all_labels, cmap='bwr', s=2)
    plt.axis('off')
    plt.savefig(f'tsne-{REPO_TO_ID[repo]}.pdf', bbox_inches='tight')
    plt.show()

In [None]:
# For every repository: embed the file feature vectors (calc_x) in 2-D with
# t-SNE and save a scatter plot coloured by the calc_y file label.
for repo in dataset:
    print(repo)
    #======
    # One (features, label) pair per file, computed in file order.
    samples = [(calc_x(versions), calc_y(versions)) for versions in dataset[repo].values()]
    X = np.asarray([features for features, _ in samples])
    y = np.asarray([label for _, label in samples])
    #======
    X_embedded = TSNE(n_components=2).fit_transform(X)
    #
    figure(figsize=(8, 6), dpi=80)
    xs, ys = X_embedded[:,0], X_embedded[:,1]
    plt.scatter(xs, ys, c=y, cmap='bwr', s=2)
    plt.axis('off')
    plt.savefig(f'file-tsne-{REPO_TO_ID[repo]}.pdf', bbox_inches='tight')
    plt.show()

In [None]:
# For every repository: predict issue labels with the saved classifier, derive
# file labels from them (calc_y_bert), embed the file feature vectors in 2-D
# with t-SNE and save a scatter plot coloured by the derived labels.
for repo in dataset:
    print(repo)
    #===============================
    # Load the saved classifier (no training happens in this cell).
    model = AutoModelForSequenceClassification.from_pretrained(f"./roberta-issue-classifier-{REPO_TO_ID[repo]}/evaluated_model", num_labels=2)
    if device == "cuda":
        model.cuda()
    #
    test_nlp = Dataset.from_pandas(pd.DataFrame(nlp_datasets[repo]["test"]))
    test_nlp = test_nlp.map(tokenize, batched=True)
    #
    # Work on a copy so test_nlp keeps its "text" and "label" columns.
    copy_test_nlp = test_nlp.select(range(len(test_nlp)))
    copy_test_nlp = copy_test_nlp.remove_columns(["text"]).rename_column("label", "labels")
    copy_test_nlp.set_format("torch")
    #
    if device == "cuda":
        torch.cuda.empty_cache()
    #
    eval_dataloader = DataLoader(copy_test_nlp, batch_size=16)
    #
    model.eval()
    #
    all_preds = []
    all_issueId = []
    for batch in eval_dataloader:
        all_issueId.append(batch['issueId'].detach().cpu())
        # issueId is not a model input.
        del batch["issueId"]
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            # Only the logits are needed here, so hidden states are not requested.
            logits = model(**batch).logits
            predictions = torch.argmax(logits, dim=-1)
            all_preds.append(predictions)
    #
    all_preds = torch.cat(all_preds, 0).detach().cpu().numpy()
    all_issueId = torch.cat(all_issueId, 0).detach().cpu().numpy()
    #
    # issue id -> label predicted by the classifier
    issue_lbl_lookup = dict(zip(all_issueId, all_preds))
    #===============================
    X = []
    YH = []
    for file in dataset[repo]:
        versions = dataset[repo][file]
        x = calc_x(versions)
        yh = calc_y_bert(versions, issue_lbl_lookup)
        #
        X.append(x)
        YH.append(yh)
    X = np.asarray(X)
    yh = np.asarray(YH)
    #===============================
    X_embedded = TSNE(n_components=2).fit_transform(X)
    #
    figure(figsize=(8, 6), dpi=80)
    plt.scatter(X_embedded[:,0], X_embedded[:,1], c=yh, cmap='bwr', s=2)
    plt.axis('off')
    plt.savefig(f'file-bert-tsne-{REPO_TO_ID[repo]}.pdf', bbox_inches='tight')
    plt.show()

In [None]:
# List every repository next to the id it is mapped to in REPO_TO_ID.
for repo, repo_id in REPO_TO_ID.items():
    print(repo, repo_id)

## Save results to disk

In [None]:
# Bundle all result containers, grouped by kind, as [method name, data] pairs
# and store them as a single jsonpickle-encoded string.
results = {}
# Result rows of the experiments, per labeling source.
results["sdp"] = [
    ["Gold", gold_experimental_results],
    ["Heuristic", heuristic_experimental_results],
    ["IHeuristic", improved_heuristic_experimental_results],
    ["BERT", bert_experimental_results],
    ["FastText", fasttext_experimental_results],
]
# Issue-level scores of each labeling method.
results["nlp"] = [
    ["Heuristic", heuristic_model_results],
    ["IHeuristic", improved_heuristic_model_results],
    ["BERT", bert_model_results],
    ["FastText", fasttext_model_results],
]
# File labels produced by each labeling method.
results["pred"] = [
    ["Heuristic", heuristics_labels],
    ["IHeuristic", improved_heuristics_labels],
    ["BERT", bert_labels],
    ["FastText", fasttext_labels],
]
results["strategy"] = [
    ["IHeuristic", best_strategy_for_repo],
]

results_path = os.path.join(DATA_FOLDER, "issue-article-results-multiple-models_v2.json")
# Encode before opening so a failing encode does not truncate an existing file.
encoded = jsonpickle.encode(results)
with open(results_path, "w") as f_out:
    f_out.write(encoded)

## Load results 

In [None]:
# Restore the result containers from the stored results file. Every section
# is optional; a variable is only (re)assigned when its entry is present.
with open(os.path.join(DATA_FOLDER, "issue-article-results-multiple-models_v2.json"), "r") as f_in:
    # Decode the whole file at once: decoding line by line breaks on
    # multi-line JSON and silently leaves `results` untouched for an empty file.
    # NOTE: jsonpickle can instantiate arbitrary objects while decoding, so
    # only load files from a trusted source.
    results = jsonpickle.decode(f_in.read())

# Each section is a list of [method name, data] pairs.
sdp_by_method = {entry[0]: entry[1] for entry in results.get("sdp", [])}
if "Gold" in sdp_by_method:
    gold_experimental_results = sdp_by_method["Gold"]
if "Heuristic" in sdp_by_method:
    heuristic_experimental_results = sdp_by_method["Heuristic"]
if "IHeuristic" in sdp_by_method:
    improved_heuristic_experimental_results = sdp_by_method["IHeuristic"]
if "BERT" in sdp_by_method:
    bert_experimental_results = sdp_by_method["BERT"]
if "FastText" in sdp_by_method:
    fasttext_experimental_results = sdp_by_method["FastText"]

nlp_by_method = {entry[0]: entry[1] for entry in results.get("nlp", [])}
if "BERT" in nlp_by_method:
    bert_model_results = nlp_by_method["BERT"]
if "FastText" in nlp_by_method:
    fasttext_model_results = nlp_by_method["FastText"]
if "Heuristic" in nlp_by_method:
    heuristic_model_results = nlp_by_method["Heuristic"]
if "IHeuristic" in nlp_by_method:
    improved_heuristic_model_results = nlp_by_method["IHeuristic"]

pred_by_method = {entry[0]: entry[1] for entry in results.get("pred", [])}
if "BERT" in pred_by_method:
    bert_labels = pred_by_method["BERT"]
if "FastText" in pred_by_method:
    fasttext_labels = pred_by_method["FastText"]
if "Heuristic" in pred_by_method:
    heuristics_labels = pred_by_method["Heuristic"]
if "IHeuristic" in pred_by_method:
    improved_heuristics_labels = pred_by_method["IHeuristic"]

strategy_by_method = {entry[0]: entry[1] for entry in results.get("strategy", [])}
if "IHeuristic" in strategy_by_method:
    best_strategy_for_repo = strategy_by_method["IHeuristic"]