In [1]:
import os
import glob
import pandas as pd
from datasets import load_dataset, load_from_disk, Dataset, ClassLabel

## Pushed to HuggingFace


In [None]:
# Raw Dataset

# Parse the commits CSV with the generic "csv" builder and persist the result
# to disk (Apache Arrow format).
RAW_ARROW_DIR = "./data/raw-git-commits"
raw_git_commits = load_dataset("csv", data_files="./data/commits.csv")
raw_git_commits.save_to_disk(RAW_ARROW_DIR)

# Upload to HuggingFace: reload the on-disk copy and push it to the Hub.
raw_git_commits = load_from_disk(RAW_ARROW_DIR)
raw_git_commits.push_to_hub("neuralsentry/raw-git-commits")

In [None]:
# Split Dataset

# Fetch the raw commits from the Hub and divide their "train" split into
# train/test parts (10% held out for test), using a fixed seed.
raw_git_commits = load_dataset("neuralsentry/raw-git-commits")
train_split = raw_git_commits["train"]
git_commits = train_split.train_test_split(test_size=0.1, seed=420)

# Upload to HuggingFace
git_commits.push_to_hub("neuralsentry/git-commits")

In [None]:
# Labelled Dataset

# Convert every labelled Excel workbook to a CSV written next to it.
# sorted(): glob returns matches in arbitrary, filesystem-dependent order.
labelled_excels = sorted(glob.glob("./data/labelled/*.xlsx"))
for excel in labelled_excels:
    df = pd.read_excel(excel)
    # splitext swaps only the trailing extension; str.replace would also
    # rewrite any earlier ".xlsx" occurrence in the path.
    df.to_csv(os.path.splitext(excel)[0] + ".csv", index=False)

# Combine all csvs into one dataset.
# The sorted file order makes the row order of the combined (and pushed)
# dataset reproducible across machines and runs.
labelled_csvs = sorted(glob.glob("./data/labelled/*.csv"))
df = pd.concat([pd.read_csv(csv) for csv in labelled_csvs], ignore_index=True)
Dataset.from_pandas(df).save_to_disk("./data/git-commits-labelled")
git_commits_labelled = load_from_disk("./data/git-commits-labelled")

# Change the type of the "labels" column to a two-class ClassLabel
# (index 0 = "non-bug", index 1 = "bug").
features = git_commits_labelled.features.copy()
features["labels"] = ClassLabel(num_classes=2, names=["non-bug", "bug"])
git_commits_labelled = git_commits_labelled.cast(features)

# Push
git_commits_labelled.push_to_hub("neuralsentry/git-commits-labelled")

## Data Labelling


In [None]:
# Load the "train" split of the raw commits, then shuffle it with a fixed seed.
raw_datasets = load_dataset("neuralsentry/raw-git-commits", split="train")
raw_datasets = raw_datasets.shuffle(seed=420)

In [None]:
# Extract commits of each repo

# One repo name per distinct remote URL: the last path component of the URL.
remote_urls = raw_datasets.unique("remote_url")
repo_names = [os.path.basename(url) for url in remote_urls]

# Write one CSV per repo containing only that repo's commits.
for repo_name in repo_names:
    # Match on the URL's basename (the same rule used to derive repo_names).
    # A substring test would also pull in commits from any other URL that
    # merely contains the name, e.g. "git" inside "github.com".
    raw_datasets.filter(
        lambda example: os.path.basename(example["remote_url"]) == repo_name
    ).to_csv(f"./data/{repo_name}.csv")

In [None]:
# Remove already labelled commits

for repo_name in repo_names:
    path = f"./data/{repo_name}.csv"
    labelled_path = f"./data/labelled/{repo_name}.csv"

    # Only repos that have a labelled CSV need pruning.
    if os.path.exists(labelled_path):
        unlabelled_df = pd.read_csv(path)
        labelled_df = pd.read_csv(labelled_path)

        # Keep the rows whose sha does not appear among the labelled commits,
        # then overwrite the per-repo CSV with what is left.
        already_labelled = unlabelled_df["sha"].isin(labelled_df["sha"])
        df = unlabelled_df.loc[~already_labelled]
        df.to_csv(path, index=False)

## Data Preprocessing

In [100]:
# Reload the "train" split of the raw commits, shuffled with a fixed seed.
raw_datasets = load_dataset("neuralsentry/raw-git-commits", split="train")
raw_datasets = raw_datasets.shuffle(seed=420)

# Switch the dataset's output format to pandas, then materialise the whole
# dataset as a DataFrame.
raw_datasets.set_format("pandas")
df = raw_datasets.to_pandas()

Found cached dataset parquet (E:/.cache/huggingface/datasets/neuralsentry___parquet/neuralsentry--raw-git-commits-ed0e6dd27470031f/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Loading cached shuffled indices for dataset at E:\.cache\huggingface\datasets\neuralsentry___parquet\neuralsentry--raw-git-commits-ed0e6dd27470031f\0.0.0\2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec\cache-ae6c39276cb7522a.arrow


### Limit Duplicate Commits

In [135]:
# Tally how often each commit message occurs, with the message and its count
# as regular columns.
msg_counts = df["commit_msg"].value_counts()
freq_df = msg_counts.to_frame().reset_index()

# Display the messages that occur more than 5 times.
freq_df.loc[freq_df["count"] > 5]

Unnamed: 0,commit_msg,count
0,\n,8425
1,...\n,3315
2,Update TODO list.\n,551
3,pending release\n,187
4,Translation updates\n,164
...,...,...
156,Fix incorrect format placeholder\n,6
157,Fixed memory leak.\nCredit to OSS-Fuzz\n,6
158,Oops...\n,6
159,Buildfix.\n,6


In [138]:
def limit_duplicate_rows(limit: int = 5, random_state=None):
    """Build a per-group sampler that keeps at most ``limit`` rows.

    Args:
        limit: Maximum number of rows to keep from each group.
        random_state: Seed (or any other value accepted by
            ``DataFrame.sample``) for the row sampling. ``None`` keeps
            pandas' unseeded default, so results differ between runs.

    Returns:
        A function that maps a DataFrame to a random sample of
        ``min(len(df), limit)`` of its rows.
    """

    def apply(df: pd.DataFrame):
        # Groups at or under the limit are returned in full (in sampled order).
        return df.sample(min(len(df), limit), random_state=random_state)

    return apply


# Cap every distinct commit message at 5 rows.
# - group_keys=False keeps the original row index; the default would add a
#   "commit_msg" index level alongside the existing "commit_msg" column, which
#   makes a later groupby("commit_msg") / reset_index() ambiguous or failing.
# - random_state=420 reuses the seed the notebook applies to shuffle/split, so
#   the same rows survive on every run.
# NOTE(review): groupby drops rows whose commit_msg is NaN by default —
# confirm that is intended.
df = df.groupby("commit_msg", group_keys=False).apply(
    limit_duplicate_rows(5, random_state=420)
)

In [148]:
# Recount the occurrences of each commit message in the current frame.
msg_counts = df["commit_msg"].value_counts()
freq_df = msg_counts.to_frame().reset_index()

# Display any message that still occurs more than 5 times.
freq_df.loc[freq_df["count"] > 5]

Unnamed: 0,commit_msg,count
