In [28]:
import os
import re
import glob
import pandas as pd
from datetime import datetime
from datasets import load_dataset, load_from_disk, Dataset, ClassLabel

## Pushed to HuggingFace


### Raw Dataset

https://huggingface.co/datasets/neuralsentry/raw-git-commits


In [None]:
# Read the raw CSV export, shuffle it with a fixed seed, and persist the
# result on disk (Apache Arrow format).
raw_git_commits = load_dataset(
    "csv",
    data_files="./data/commits.csv",
).shuffle(seed=420)
raw_git_commits.save_to_disk("./data/raw-git-commits")

# Reload the on-disk copy and publish it to the Hugging Face Hub.
raw_git_commits = load_from_disk("./data/raw-git-commits")
raw_git_commits.push_to_hub("neuralsentry/raw-git-commits")

### Cleaned Dataset

https://huggingface.co/datasets/neuralsentry/git-commits


In [None]:
# Fetch the published raw commits (train split only) and convert them to a
# pandas DataFrame so they can be filtered with regular pandas operations.
raw_git_commits = load_dataset(
    "neuralsentry/raw-git-commits",
    split="train",
)
df = raw_git_commits.to_pandas()


# Filtering
def remove_commits(df: pd.DataFrame, regex_str: str) -> pd.DataFrame:
    """Return the rows of ``df`` whose ``commit_msg`` does not match ``regex_str``.

    Args:
        df: Frame containing a ``commit_msg`` column of strings.
        regex_str: Regular expression searched for in each commit message.

    Returns:
        A filtered frame that keeps the original index labels. ``df`` itself
        is not modified. Rows with a missing ``commit_msg`` are kept.
    """
    # na=False treats a missing message as "no match", which keeps the mask
    # strictly boolean. Without it the mask can contain NaN and the `~`
    # inversion below raises TypeError.
    matches = df["commit_msg"].str.contains(regex_str, regex=True, na=False)
    return df[~matches]


# Reduces the number of duplicate rows
def limit_duplicate_rows(limit: int = 5, random_state=None):
    """Build a function that down-samples a frame to at most ``limit`` rows.

    The returned callable is meant to be passed to
    ``DataFrame.groupby(...).apply(...)`` so that each group keeps no more
    than ``limit`` of its rows.

    Args:
        limit: Maximum number of rows to keep per frame.
        random_state: Forwarded to ``DataFrame.sample``. The default ``None``
            draws a different sample on every call; pass an int to make the
            sampling reproducible.

    Returns:
        A callable that takes a DataFrame and returns a random sample of it
        (without replacement) containing ``min(len(df), limit)`` rows.
    """

    def apply(df: pd.DataFrame) -> pd.DataFrame:
        # Never request more rows than exist: sampling without replacement
        # would raise. Frames already within the limit are returned whole.
        keep = min(len(df), limit)
        return df.sample(n=keep, random_state=random_state)

    return apply


# Strip placeholder messages (a lone newline, or three / four characters
# followed by a newline), then keep at most five rows per distinct message.
filtered_df = (
    df.pipe(remove_commits, r"^\n$")
    .pipe(remove_commits, r"^...\n$")
    .pipe(remove_commits, r"^....\n$")
    .groupby("commit_msg")
    .apply(limit_duplicate_rows(5))
)

# Publish the cleaned commits; the group-by index is not kept as a column.
git_commits = Dataset.from_pandas(filtered_df, preserve_index=False)
git_commits.push_to_hub("neuralsentry/git-commits")

In [102]:
# Before filtering: how often each distinct commit message occurs
freq_before_df = df["commit_msg"].value_counts().to_frame().reset_index()

# Show messages that are duplicated, at most two words long, and URL-free
freq_before_df.loc[
    lambda freq: (
        freq["count"].gt(1)
        & freq["commit_msg"].str.split().str.len().le(2)
        & ~freq["commit_msg"].str.contains("http", regex=False)
    )
]

Unnamed: 0,commit_msg,count
0,...\n,3315
1,\n,544
2,pending release\n,187
4,cosmetic\n,117
6,release\n,63
...,...,...
949,Reverted fix.\n,2
972,Correct comment\n,2
1008,Whitespace changes.\n,2
1031,correct soversion\n,2


In [103]:
# After filtering: how often each distinct commit message still occurs
freq_filtered_df = filtered_df["commit_msg"].value_counts().to_frame().reset_index()

# Show messages that are duplicated, at most two words long, and URL-free
freq_filtered_df.loc[
    lambda freq: (
        freq["count"].gt(1)
        & freq["commit_msg"].str.split().str.len().le(2)
        & ~freq["commit_msg"].str.contains("http", regex=False)
    )
]

Unnamed: 0,commit_msg,count
10,Whitespace cleanup.\n,5
17,Update README.md,5
20,beta release\n,5
22,Fix typo\n,5
24,Whitespace\n,5
...,...,...
913,Corrected version.\n,2
939,automake/autoconf update\n,2
953,makedepend\n,2
999,allow flavors\n,2


### Labelled Dataset

https://huggingface.co/datasets/neuralsentry/git-commits-labelled

In [None]:
# Convert every labelled Excel workbook into a CSV file next to it.
# glob returns paths in arbitrary order, so sort for reproducible runs.
labelled_excels = sorted(glob.glob("./data/labelled/*.xlsx"))
for excel in labelled_excels:
    # Swap only the file extension; str.replace would also rewrite any
    # ".xlsx" that happens to appear earlier in the path.
    csv_path = os.path.splitext(excel)[0] + ".csv"
    # Convert without rebinding the notebook-wide `df` on every iteration.
    pd.read_excel(excel).to_csv(csv_path, index=False)

# Combine all CSVs in the folder into one dataset. Sorting the paths keeps
# the concatenated row order stable across machines and runs.
# NOTE: from here on `df` holds the labelled data, not the raw commits.
labelled_csvs = sorted(glob.glob("./data/labelled/*.csv"))
df = pd.concat([pd.read_csv(csv) for csv in labelled_csvs], ignore_index=True)
Dataset.from_pandas(df).save_to_disk("./data/git-commits-labelled")
git_commits_labelled = load_from_disk("./data/git-commits-labelled")

# Cast the `labels` column to a two-class ClassLabel
# (names are listed in class-id order: "non-bug", then "bug").
features = git_commits_labelled.features.copy()
features["labels"] = ClassLabel(num_classes=2, names=["non-bug", "bug"])
git_commits_labelled = git_commits_labelled.cast(features)

# Push
git_commits_labelled.push_to_hub("neuralsentry/git-commits-labelled")