# Data Extraction


In [1]:
import os
import concurrent.futures

import pandas as pd
from git import Repo, Commit
from tqdm import tqdm
from datasets import load_dataset

In [33]:
# Text file that lists the remote repository URLs to process, one URL per line.
REPO_LIST = "./data/repos.txt"
# CSV file that the extracted commit rows are appended to.
DESTINATION = "./data/commits.csv"
# Lower date bound handed to `iter_commits(since=...)` when walking each repository.
SINCE = "2015-01-01"

In [None]:
# Clone/Pull Repos
def clone_or_pull_repo(remote_url: str):
    """Clone the repository at ``remote_url``, or pull it if a local clone exists.

    The local checkout lives at ``data/repos/<owner>/<repo>``, where owner and
    repo are the last two ``/``-separated components of the URL.

    Args:
        remote_url: Remote URL of the repository. Surrounding whitespace and
            trailing slashes are ignored.

    Returns:
        The ``Repo`` object for the local clone.

    Raises:
        ValueError: If the URL does not contain both an owner and a repository
            component.
    """
    # Normalise first: a trailing "/" would otherwise leave the last component
    # empty and shift the repository name into the owner slot.
    url = remote_url.strip().rstrip("/")
    parts = url.split("/")
    if len(parts) < 2 or not parts[-1] or not parts[-2]:
        raise ValueError(f"Cannot derive owner/repo from URL: {remote_url!r}")
    owner_name, repo_name = parts[-2], parts[-1]
    destination = f"data/repos/{owner_name}/{repo_name}"

    if os.path.exists(destination):
        # A checkout is already on disk: reuse it and bring it up to date.
        repo = Repo(destination)
        print(f"Pulling {repo_name}")
        repo.remotes.origin.pull()
        print(f"[DONE] Pulling {repo_name}")
    else:
        print(f"Cloning {repo_name}")
        repo = Repo.clone_from(url, destination)
        print(f"[DONE] Cloning {repo_name}")

    return repo


# Read the remote URLs, one per line. Blank lines are skipped so a stray empty
# line in the list never reaches clone_or_pull_repo as an empty URL.
with open(REPO_LIST) as f:
    repo_list = [line.strip() for line in f if line.strip()]

# Clone/pull the repositories concurrently. The results are materialised while
# the executor is still open, so an exception raised in a worker surfaces here;
# `repos` keeps the same order as `repo_list`.
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    repos = list(executor.map(clone_or_pull_repo, repo_list))

In [24]:
# Batched Export
def add_commits_to_csv(
    commits: "list[Commit]",
    destination: str,
    write_header: bool = False,
    batch_size: int = 20,
    pbar: "tqdm | None" = None,
):
    """Append one CSV row per commit to ``destination``, writing in batches.

    Commits without a parent (root commits) are skipped. Each row holds the
    commit message, hex SHA, the URL of the repository's ``origin`` remote,
    the authored datetime, and a ``labels`` column that is always ``-1``
    (presumably a placeholder for later labelling -- confirm).

    Args:
        commits: Commits to export.
        destination: Path of the CSV file; it is opened in append mode.
        write_header: Whether the column header still has to be written. It is
            emitted together with the first batch that contains at least one
            row, and never again after that.
        batch_size: Number of commits handled per CSV write.
        pbar: Optional progress bar; it is advanced by the size of each batch,
            skipped commits included.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    columns = ["commit_msg", "sha", "remote_url", "date", "labels"]
    # Resolving the origin URL is repeated work for every commit of the same
    # repository, so look it up once per repository object.
    remote_urls = {}

    for start in range(0, len(commits), batch_size):
        batch = commits[start : start + batch_size]
        rows = []
        for commit in batch:
            if not commit.parents:  # Skip first commit
                continue

            repo = commit.repo
            if id(repo) not in remote_urls:
                remote_urls[id(repo)] = repo.remotes.origin.url

            rows.append(
                {
                    "commit_msg": commit.message,
                    "sha": commit.hexsha,
                    "remote_url": remote_urls[id(repo)],
                    "date": commit.authored_datetime,
                    "labels": -1,
                }
            )

        # A batch made up only of skipped commits must not write anything:
        # an empty frame would emit a blank header line and use up the header.
        if rows:
            pd.DataFrame(rows, columns=columns).to_csv(
                destination, mode="a", index=False, header=write_header
            )
            write_header = False

        # Compare against None explicitly; a tqdm bar's truth value depends on
        # its total/iterable and is not a reliable "was a bar passed" check.
        if pbar is not None:
            pbar.update(len(batch))


# Walk each repository's history once and keep the commits, so the progress-bar
# total and the export loop share a single traversal instead of two.
commits_by_repo = [list(repo.iter_commits(since=SINCE)) for repo in repos]
commit_count = sum(len(repo_commits) for repo_commits in commits_by_repo)

# NOTE: rows are appended to DESTINATION, so re-running this cell against an
# existing file adds the same commits again.
with tqdm(total=commit_count, desc=f"Exporting commits to {DESTINATION}") as pbar:
    for commits in commits_by_repo:
        # Re-check before every repository: the header is still needed for as
        # long as the file is missing or empty, e.g. when earlier repositories
        # contributed no commits.
        write_header = (
            not os.path.isfile(DESTINATION) or os.path.getsize(DESTINATION) == 0
        )
        add_commits_to_csv(
            commits,
            destination=DESTINATION,
            write_header=write_header,
            pbar=pbar,
            batch_size=50,
        )

Exporting commits to ./data/commits.csv: 100%|██████████| 183108/183108 [2:35:51<00:00, 19.58it/s]  


In [42]:
# Load the exported CSV through the `datasets` CSV loader, requesting the "train" split.
raw_datasets = load_dataset("csv", data_files=DESTINATION, split="train")
# Switch the dataset's output format to pandas.
raw_datasets.set_format("pandas")
# Convert to pandas (presumably a single DataFrame -- confirm) and preview the first rows.
df = raw_datasets.to_pandas()
df.head()

Found cached dataset csv (E:/.cache/huggingface/datasets/csv/default-9f95ed729e084b78/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


Unnamed: 0,commit_msg,sha,remote_url,date,labels
0,Update runner OS version for hardenedmalloc te...,8a6cd08850f576e7527c52a1b086cae82fab290e,https://github.com/openssh/openssh-portable,2023-06-23 09:49:02+10:00,-1
1,handle sysconf(SC_OPEN_MAX) returning > INT_MA...,cfca6f17e64baed6822bb927ed9f372ce64d9c5b,https://github.com/openssh/openssh-portable,2023-06-22 15:04:03+10:00,-1
2,upstream: better validate CASignatureAlgorithm...,c1c2ca1365b3f7b626683690bd2c68265f6d8ffd,https://github.com/openssh/openssh-portable,2023-06-21 05:10:26+00:00,-1
3,upstream: make `ssh -Q CASignatureAlgorithms` ...,4e73cd0f4ab3e5b576c56cac9732da62c8fc0565,https://github.com/openssh/openssh-portable,2023-06-21 05:08:32+00:00,-1
4,upstream: handle rlimits > INT_MAX (rlim_t is ...,a69062f1695ac9c3c3dea29d3044c72aaa6af0ea,https://github.com/openssh/openssh-portable,2023-06-21 05:06:04+00:00,-1
