In [67]:
import torch
from rich import print
from rich.table import Table
import pandas as pd
import numpy as np
from tqdm import tqdm
from git import Repo
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    classification_report,
)
from optimum.bettertransformer import BetterTransformer

In [17]:
df = pd.read_csv("data/linux-bugfixes-dataset.csv")
df = df[df["labels"] != -1]

In [19]:
df[df["labels"] == 1].value_counts("sha").to_frame().reset_index()

# remove duplicates based on sha
len(df), len(df.drop_duplicates(subset=["sha"]))

(2027, 1945)

In [None]:
sample_size = 200
# take sample_size from labels = 1 and labels = 0, totalling 400
sample_df = pd.concat(
    [
        df[df["labels"] == 1].sample(sample_size),
        df[df["labels"] == 0].sample(sample_size),
    ]
)
# shuffle
sample_df = sample_df.sample(frac=1).reset_index(drop=True)
sample_df

In [60]:
# sample_df.to_csv("200_samples_linux_bugfixes_labelled.csv", index=False)

In [114]:
checkpoint = "neuralsentry/starencoder-finetuned-class" # 500 commits
# checkpoint = "neuralsentry/starencoder-git-commit-bugfix-classification" # 3000 commits
# checkpoint = "neuralsentry/distilbert-git-commit-bugfix-classification" # 3000 commits

model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
model = BetterTransformer.transform(model)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
device = torch.device("cuda")
model.to(device)
...

Ellipsis

In [107]:
commits = sample_df["commit_msg"].tolist()
bugfix_threshold = 0.5
batch_size = 32

batch = []
probabilities = []
predictions = []
labels = sample_df["labels"].tolist()
for i, commit in enumerate(tqdm(commits)):
    batch.append(commit)

    if len(batch) == batch_size:
        inputs = tokenizer(
            batch,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=256,
        )
        inputs.to(device)
        outputs = model(**inputs)
        logits = outputs.logits
        probs = torch.softmax(logits, dim=1)
        probabilities.extend(probs.tolist())
        batch = []

# handle last batch if sample_size not divisible by batch_size
if len(batch) > 0:
    inputs = tokenizer(
        batch,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=256,
    )
    inputs.to(device)
    outputs = model(**inputs)
    logits = outputs.logits
    probs = torch.softmax(logits, dim=1)
    probabilities.extend(probs.tolist())

predictions = [1 if p[1] > bugfix_threshold else 0 for p in probabilities]

100%|██████████| 400/400 [00:00<00:00, 514.64it/s]


In [98]:
# calculate metrics
report = classification_report(
    labels,
    predictions,
    target_names=["non-bugfix", "bugfix"],
)

confusion_matrix_metric = confusion_matrix(
    labels,
    predictions,
)

accuracy_metric = accuracy_score(
    labels,
    predictions,
)

In [100]:
# StarEncoder (500 commits)

print(
    ">>> Confusion Matrix:",
    f"\nTP: {confusion_matrix_metric[1][1]}, FP: {confusion_matrix_metric[0][1]}",
    f"\nFN: {confusion_matrix_metric[1][0]}, TN: {confusion_matrix_metric[0][0]}",
    "\n\n>>> Accuracy: ",
    accuracy_metric,
    "\n\n>>>: Classification Report:\n",
    report
)

In [83]:
# StarEncoder (3000 commits)

print(
    ">>> Confusion Matrix:",
    f"\nTP: {confusion_matrix_metric[1][1]}, FP: {confusion_matrix_metric[0][1]}",
    f"\nFN: {confusion_matrix_metric[1][0]}, TN: {confusion_matrix_metric[0][0]}",
    "\n\n>>> Accuracy: ",
    accuracy_metric,
    "\n\n>>>: Classification Report:\n",
    report
)

In [89]:
# DistilBERT (3000 commits)

print(
    ">>> Confusion Matrix:",
    f"\nTP: {confusion_matrix_metric[1][1]}, FP: {confusion_matrix_metric[0][1]}",
    f"\nFN: {confusion_matrix_metric[1][0]}, TN: {confusion_matrix_metric[0][0]}",
    "\n\n>>> Accuracy: ",
    accuracy_metric,
    "\n\n>>>: Classification Report:\n",
    report
)