In [372]:
import re
import glob
from pprint import pprint

import pandas as pd
from datasets import load_dataset, load_from_disk, Dataset, ClassLabel

## Pushed to HuggingFace


### Raw Dataset

https://huggingface.co/datasets/neuralsentry/raw-git-commits


In [None]:
# Convert csv to Apache Arrow format: load, shuffle with a fixed seed, persist.
raw_dataset_dir = "./data/raw-git-commits"
raw_git_commits = load_dataset("csv", data_files="./data/commits.csv").shuffle(
    seed=420
)
raw_git_commits.save_to_disk(raw_dataset_dir)

# Upload to HuggingFace (reloaded from the copy that was just written to disk)
raw_git_commits = load_from_disk(raw_dataset_dir)
raw_git_commits.push_to_hub("neuralsentry/raw-git-commits")

### Cleaned Dataset

https://huggingface.co/datasets/neuralsentry/git-commits


In [None]:
# Fetch the "train" split of the published raw dataset and expose it as a
# pandas DataFrame (`df`) for the cleaning cells that follow.
raw_git_commits = load_dataset(
    "neuralsentry/raw-git-commits",
    split="train",
)
df = raw_git_commits.to_pandas()

#### Remove Uninformative Commits


In [721]:
# Flag uninformative commit messages.
# `na=` is passed explicitly so both masks are strictly boolean: without it a
# missing commit_msg yields NaN in the mask and the boolean indexing below
# raises "Cannot mask with non-boolean array containing NA / NaN values".
commit_msgs = df["commit_msg"]

# The whole message is a single http(s) URL.
is_only_url = commit_msgs.str.contains(
    r"^https?://\S+$", flags=re.IGNORECASE, na=False
)
# The message consists solely of whitespace and/or dots; a missing message is
# treated as empty as well.
is_empty = commit_msgs.str.contains(r"^[\s\.]*$", na=True)

# Keep the removed rows around for inspection / export.
only_urls = df[is_only_url]
empty_commits = df[is_empty]

# Drop both groups in one pass.
df = df[~(is_only_url | is_empty)]

# # Export removed commits
# only_urls.to_excel("./data/only_urls.xlsx", index=False)
# empty_commits["commit_msg"].str.replace(
#     "\n", "\\n"
# ).value_counts().to_frame().reset_index().to_excel(
#     "./data/empty_commits.xlsx", index=False
# )

In [None]:
# Filtering
def remove_commits(df: pd.DataFrame, regex_str: str) -> pd.DataFrame:
    """Return the rows of ``df`` whose ``commit_msg`` does not match ``regex_str``.

    Args:
        df: Frame containing a ``commit_msg`` column of strings.
        regex_str: Regular expression searched for within each message
            (``Series.str.contains`` semantics, i.e. a search, not a full match).

    Returns:
        A filtered frame; the input frame is left unmodified. Rows whose
        ``commit_msg`` is missing cannot match the pattern and are kept.
    """
    # na=False guarantees a boolean mask. Without it, a missing message puts a
    # NaN in an object-dtype result and the `~` negation below raises TypeError.
    matches = df["commit_msg"].str.contains(regex_str, regex=True, na=False)
    return df[~matches]


# Before filtering: frequency table of every distinct commit message.
freq_before_df = df["commit_msg"].value_counts().to_frame().reset_index()

# Duplicate commit messages
occurs_more_than_once = freq_before_df["count"] > 1
# Two words or less
word_counts = freq_before_df["commit_msg"].str.split().apply(len)
has_at_most_two_words = word_counts <= 2

freq_before_df[occurs_more_than_once & has_at_most_two_words]

#### Stripping Commit Metadata


In [723]:
# Keys of the "<key>: ..." lines that are removed from commit messages
# (matched case-insensitively below).
metadata = [
    r"git-svn-id",
    r"signed-off-by",
    r"reviewed-by",
    r"reported-by",
    r"submitted by",
    r"github",
    r"pr",
    r"discussion",
    r"author",
    r"helped-by",
    r"merged-by",
    r"openbsd-commit-id",
]
# One or more newlines, one of the keys, ": ", then the rest of that line.
metadata_regex = r"\n+(" + "|".join(metadata) + r"): .*"

# Work on a copy so the unstripped frame stays available for comparison.
stripped = df.copy()
without_metadata = stripped["commit_msg"].str.replace(
    metadata_regex, "", regex=True, flags=re.IGNORECASE
)
stripped["commit_msg"] = without_metadata.str.strip()

# Print one known commit before and after stripping as a sanity check.
sample_sha = "c80affe0f192db9f851b5ed0617586783a02a82d"
for frame in (df, stripped):
    pprint(frame.loc[frame["sha"] == sample_sha, "commit_msg"].values[0])

('Add a WHATSNEW entry indicating libgpo py deprecation\n'
 '\n'
 'BUG: https://bugzilla.samba.org/show_bug.cgi?id=15225\n'
 '\n'
 'Signed-off-by: David Mulder <dmulder@samba.org>\n'
 'Reviewed-by: Andrew Bartlett <abartlet@samba.org>\n')
('Add a WHATSNEW entry indicating libgpo py deprecation\n'
 '\n'
 'BUG: https://bugzilla.samba.org/show_bug.cgi?id=15225')


In [None]:
# Frequency of each "Key:" line prefix (word characters / hyphens following a
# newline) in the messages before stripping.
trailer_keys = df["commit_msg"].str.extractall(r"\n([\w\-]+):.*")
metadata_freq = trailer_keys.value_counts().to_frame().reset_index()

# Only show keys that occur more than 50 times.
is_frequent = metadata_freq["count"] > 50
metadata_freq[is_frequent]

In [None]:
# Same frequency table, computed on the stripped messages, to see which
# "Key:" line prefixes are still present after stripping.
remaining_keys = stripped["commit_msg"].str.extractall(r"\n([\w\-]+):.*")
metadata_freq = remaining_keys.value_counts().to_frame().reset_index()

# Only show keys that occur more than 50 times.
still_frequent = metadata_freq["count"] > 50
metadata_freq[still_frequent]

In [None]:
# Wrap the stripped frame as a HuggingFace Dataset; the pandas index is not
# kept as a column.
git_commits = Dataset.from_pandas(
    stripped,
    preserve_index=False,
)
# Publish the cleaned dataset.
git_commits.push_to_hub(
    "neuralsentry/git-commits",
)

### Labelled Dataset

https://huggingface.co/datasets/neuralsentry/git-commits-labelled


In [None]:
# Use this code block if any modifications are made to `neuralsentry/git-commits`
# This will merge the updated dataset with the labelled dataset at `neuralsentry/git-commits-labelled`

In [None]:
# Convert all excels to csv
# glob's result order depends on the file system, so both file lists are
# sorted to keep the row order of the combined dataset reproducible.
labelled_excels = sorted(glob.glob("./data/labelled/*.xlsx"))
for excel in labelled_excels:
    df = pd.read_excel(excel)
    df.to_csv(excel.replace(".xlsx", ".csv"), index=False)

# Combine all csvs into one dataset
labelled_csvs = sorted(glob.glob("./data/labelled/*.csv"))
df = pd.concat(
    [pd.read_csv(csv_path) for csv_path in labelled_csvs], ignore_index=True
)
# Round-trip through disk: write the combined frame as a dataset, then reload it.
Dataset.from_pandas(df).save_to_disk("./data/git-commits-labelled")
git_commits_labelled = load_from_disk("./data/git-commits-labelled")

# Change the type of the `labels` column to ClassLabel (0 = "non-bug", 1 = "bug")
features = git_commits_labelled.features.copy()
features["labels"] = ClassLabel(num_classes=2, names=["non-bug", "bug"])
git_commits_labelled = git_commits_labelled.cast(features)

# Push
git_commits_labelled.push_to_hub("neuralsentry/git-commits-labelled")