# Data Extraction


In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (such as the local `utils` helpers) are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [None]:
import os
import concurrent.futures

import pandas as pd
from tqdm import tqdm
from git import Commit

from utils import clone_or_pull_repo, add_commit_to_csv, add_commits_to_csv

## 1. Clone or Pull GitHub Repositories
- Reads the repository sources, one per line, from `data/repos.txt` (relative to the notebook's working directory).

In [None]:
# Read the repository sources, one per line. Blank lines are skipped so that an
# empty string is never handed to clone_or_pull_repo.
with open("data/repos.txt") as f:
    repo_sources = [line.strip() for line in f if line.strip()]

# Clone (or update) the repositories concurrently. executor.map returns a lazy
# iterator, so it is materialized inside the `with` block; any exception raised
# by a worker surfaces here, and result order matches `repo_sources`.
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    repos = list(executor.map(clone_or_pull_repo, repo_sources))

## 2. Extraction

### To CSV

In [None]:
# CSV file the export writes to: passed as `destination` to the CSV helper and
# checked (exists / non-empty) to decide whether a header row is needed.
DESTINATION = "data/commits.csv"
# The unbatched export re-raises the underlying error once its failure counter
# exceeds this value.
RETRY_LIMIT = 5
# Forwarded as `include_files` to the CSV helpers.
# NOTE(review): presumably toggles exporting per-commit file details — confirm in utils.
INCLUDE_FILES = False

In [None]:
# Batched
# Total number of commits across all repositories; used as the progress-bar total.
commit_count = sum(len(list(repo.iter_commits())) for repo in repos)

with tqdm(desc=f"Exporting commits to {DESTINATION}", total=commit_count) as pbar:
    # A header is only requested when the destination is missing or still empty.
    write_header = not (
        os.path.isfile(DESTINATION) and os.path.getsize(DESTINATION) > 0
    )
    for repo in repos:
        # Hand each repository's full history to the helper in one call.
        commits = list(repo.iter_commits())
        export_options = {
            "destination": DESTINATION,
            "write_header": write_header,
            "include_files": INCLUDE_FILES,
            "pbar": pbar,
            "batch_size": 50,
        }
        add_commits_to_csv(commits, **export_options)
        # Only the first call may be asked to write the header.
        write_header = False

In [None]:
# Unbatched
# Writes one commit per call to add_commit_to_csv, retrying a failing commit up
# to RETRY_LIMIT times before re-raising the underlying error.
# NOTE: relies on `commit_count` computed by the batched export cell.
with tqdm(total=commit_count, desc="Exporting commits to CSV") as pbar:
    # A header is only requested when the destination is missing or still empty.
    write_header = not os.path.isfile(DESTINATION) or os.path.getsize(DESTINATION) == 0
    for repo in repos:
        for commit in repo.iter_commits():
            # Reset per commit: the limit bounds retries of a single commit, not
            # the total number of failures across the whole run.
            retry_count = 0
            while True:
                try:
                    add_commit_to_csv(
                        commit,
                        # Same file the header check above inspects.
                        destination=DESTINATION,
                        write_header=write_header,
                        include_files=INCLUDE_FILES,
                    )
                except Exception:
                    retry_count += 1
                    if retry_count > RETRY_LIMIT:
                        # Bare raise keeps the original traceback intact.
                        raise
                    print(f"Failed to add commit {commit.hexsha} to CSV, retrying...")
                else:
                    # Bookkeeping lives outside the try so that a failure here
                    # can never cause an already-written commit to be re-added.
                    pbar.update()
                    write_header = False
                    break

In [77]:
# Load the exported commits from the same file the export cells target, so a
# change to DESTINATION cannot leave this preview reading a stale file.
# NOTE(review): passing `names=` makes pandas treat the first row as data; if the
# export wrote a header row, add `header=0` so it is not loaded as a record — confirm.
df = pd.read_csv(
    DESTINATION, names=["message", "sha", "remote_url", "date", "label"]
)
df.head()

Unnamed: 0,message,sha,remote_url,date,label
0,"fixup! if -s & -p specified, mention 'sftp -P'...",2709809fd616a0991dc18e3a58dea10fb383c3f0,https://github.com/openssh/openssh-portable,2023-05-24 19:41:14+02:00,
1,Make ssh-copy-id(1) consistent with OpenSSH.\n...,204e0bf05161b7641500d7ab266c21217412379f,https://github.com/openssh/openssh-portable,2021-08-03 21:25:48+10:00,
2,"if -s & -p specified, mention 'sftp -P' on suc...",9de79df66d1430d290fab670bb4b18612875e518,https://github.com/openssh/openssh-portable,2023-05-24 11:45:43+02:00,
3,drop whitespace\n\nSSH-Copy-ID-Upstream: e604f...,801cda54c00e0f4e7d89345a90874c8d05dc233a,https://github.com/openssh/openssh-portable,2023-05-23 23:07:11+02:00,
4,make -x also apply to the target script\n\nSSH...,288482f53613f3e74544eb92deeb24f7c7f1f371,https://github.com/openssh/openssh-portable,2023-05-23 20:52:13+02:00,
