# Data Extraction


In [None]:
# Enable IPython's autoreload extension in mode 2: imported modules are reloaded
# before each cell executes, so edits to local helper code (e.g. utils) are picked
# up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
import os
import concurrent.futures

import pandas as pd
from tqdm import tqdm
from git import Commit

from utils import clone_or_pull_repo, add_commit_to_csv, add_commits_to_csv

In [4]:
# Text file read line by line by the clone/pull cell; one repository entry per line.
REPO_LIST = "./data/repos.txt"
# CSV file the export cells write commits to and the final cell reads back.
DESTINATION = "./data/commits.csv"
# Number of failed write attempts the unbatched export tolerates before re-raising.
RETRY_LIMIT = 5
# Forwarded as `include_files` to the CSV helpers; presumably toggles exporting
# per-file details for each commit — TODO confirm against utils.
INCLUDE_FILES = False

In [None]:
# Clone/Pull Repos

# Read one repository entry per line. Blank / whitespace-only lines are skipped so
# that an empty string is never handed to clone_or_pull_repo.
with open(REPO_LIST) as f:
    repo_list = [entry for entry in (line.strip() for line in f) if entry]

# Process the repositories concurrently in a thread pool. executor.map is lazy, so
# list() drains it here; any exception raised inside a worker surfaces at this point.
# Results keep the order of repo_list.
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    repos = list(executor.map(clone_or_pull_repo, repo_list))

In [9]:
# Batched Export

# Total number of commits across all repositories; used only to size the progress bar.
commit_count = sum(1 for repo in repos for _ in repo.iter_commits())

with tqdm(total=commit_count, desc=f"Exporting commits to {DESTINATION}") as pbar:
    # Write the CSV header only when the destination is missing or empty.
    # The `or` short-circuits, so getsize() is never called on a non-existent file
    # (calling it unconditionally raised FileNotFoundError on a first run).
    write_header = not os.path.isfile(DESTINATION) or os.path.getsize(DESTINATION) == 0

    for repo in repos:
        # Materialise one repository's commits at a time before handing them over.
        commits = list(repo.iter_commits())
        add_commits_to_csv(
            commits,
            destination=DESTINATION,
            write_header=write_header,
            include_files=INCLUDE_FILES,
            pbar=pbar,
            batch_size=50,
        )
        # Only the first call may emit the header.
        write_header = False

Exporting commits to ./data/commits.csv: 100%|██████████| 156604/156604 [1:54:17<00:00, 22.84it/s]


In [None]:
# Unbatched Export
#
# Alternative to the batched export: writes one commit per helper call and retries
# failed writes. NOTE(review): both export cells target DESTINATION and presumably
# append to it, so running both would likely duplicate rows — confirm against utils.

# Computed here so this cell does not depend on the batched-export cell having run.
commit_count = sum(1 for repo in repos for _ in repo.iter_commits())

with tqdm(total=commit_count, desc="Exporting commits to CSV") as pbar:
    # Write the CSV header only when the destination is missing or empty.
    write_header = not os.path.isfile(DESTINATION) or os.path.getsize(DESTINATION) == 0
    for repo in repos:
        for commit in repo.iter_commits():
            # The retry budget is per commit: earlier, unrelated failures must not
            # consume the retries available to later commits.
            retry_count = 0
            while True:
                try:
                    add_commit_to_csv(
                        commit,
                        destination=DESTINATION,
                        write_header=write_header,
                        include_files=INCLUDE_FILES,
                    )
                except Exception:
                    retry_count += 1
                    if retry_count > RETRY_LIMIT:
                        print(f"Giving up on commit {commit.hexsha} after {RETRY_LIMIT} retries.")
                        # Bare raise keeps the original traceback intact.
                        raise
                    print(f"Failed to add commit {commit.hexsha} to CSV, retrying...")
                else:
                    # Bookkeeping runs only after a successful write and sits outside
                    # the try, so an error here can never trigger a duplicate write.
                    pbar.update()
                    write_header = False
                    break

In [12]:
# Load the exported CSV back into a DataFrame and preview the first rows as a sanity check.
# NOTE(review): the preview shows an "Unnamed: 0" column, which suggests the writer also
# saved the DataFrame index. Consider writing with index=False (or reading with
# index_col=0) — confirm against the CSV helpers in utils before changing.
df = pd.read_csv(DESTINATION)
df.head()

Unnamed: 0,commit_msg,sha,remote_url,date,labels
0,"fixup! if -s & -p specified, mention 'sftp -P'...",2709809fd616a0991dc18e3a58dea10fb383c3f0,https://github.com/openssh/openssh-portable,2023-05-24 19:41:14+02:00,-1
1,Make ssh-copy-id(1) consistent with OpenSSH.\n...,204e0bf05161b7641500d7ab266c21217412379f,https://github.com/openssh/openssh-portable,2021-08-03 21:25:48+10:00,-1
2,"if -s & -p specified, mention 'sftp -P' on suc...",9de79df66d1430d290fab670bb4b18612875e518,https://github.com/openssh/openssh-portable,2023-05-24 11:45:43+02:00,-1
3,drop whitespace\n\nSSH-Copy-ID-Upstream: e604f...,801cda54c00e0f4e7d89345a90874c8d05dc233a,https://github.com/openssh/openssh-portable,2023-05-23 23:07:11+02:00,-1
4,make -x also apply to the target script\n\nSSH...,288482f53613f3e74544eb92deeb24f7c7f1f371,https://github.com/openssh/openssh-portable,2023-05-23 20:52:13+02:00,-1
