# Data Extraction


In [1]:
# Enable IPython's autoreload extension so edits to imported modules are
# picked up without restarting the kernel.
%load_ext autoreload
# Mode 2: reload all modules before each cell execution.
%autoreload 2

In [2]:
import concurrent.futures

import pandas as pd
from tqdm import tqdm

from util import clone_or_pull_repo, export_commit_to_csv

## 1. Clone or Pull GitHub Repositories
- Reads the list of repository sources from `data/repos.txt` (relative to the notebook's working directory), one source per line.

In [4]:
# Load the repository sources, one per line. Surrounding whitespace is
# stripped and blank lines are skipped so that an empty string is never
# handed to clone_or_pull_repo (e.g. from a stray empty line in the file).
with open("data/repos.txt") as f:
    repo_sources = [source for source in map(str.strip, f) if source]

# Clone or pull the repositories concurrently on up to 10 worker threads.
# executor.map yields results in the same order as repo_sources; wrapping it
# in list() consumes the iterator, so any exception raised by a worker is
# re-raised here rather than being silently deferred.
# NOTE(review): the returned objects are later used via .iter_commits();
# presumably GitPython Repo instances - confirm against util.clone_or_pull_repo.
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    repos = list(executor.map(clone_or_pull_repo, repo_sources))

Pulling openssh-portable
[DONE] Pulling openssh-portable


## 2. Extraction

### To CSV

In [5]:
# First pass: walk every repository's commits only to size the progress bar.
# Generator expressions are used, so the commits are counted without being
# collected into a list.
commit_count = sum(
    sum(1 for _ in repo.iter_commits())
    for repo in repos
)

# Second pass: hand each commit to the export helper, targeting the shared
# CSV file, and advance the progress bar by one per commit.
# NOTE(review): whether export_commit_to_csv appends or overwrites is not
# visible here - confirm before re-running this cell against an existing file.
with tqdm(desc="Exporting commits to CSV", total=commit_count) as pbar:
    for repo in repos:
        for commit in repo.iter_commits():
            export_commit_to_csv(commit, "data/commits.csv")
            pbar.update(1)

Exporting commits to CSV: 100%|██████████| 12209/12209 [00:14<00:00, 841.48it/s]


In [37]:
# Read the exported commits back in. The file is parsed as header-less and
# the column names are supplied explicitly.
df = pd.read_csv(
    "data/commits.csv",
    header=None,
    names=["message", "sha", "remote_url", "label"],
)
# Preview the first five rows as the cell's output.
df.head(5)

Unnamed: 0,message,sha,remote_url,label
0,"fixup! if -s & -p specified, mention 'sftp -P'...",2709809fd616a0991dc18e3a58dea10fb383c3f0,https://github.com/openssh/openssh-portable,
1,Make ssh-copy-id(1) consistent with OpenSSH.\n...,204e0bf05161b7641500d7ab266c21217412379f,https://github.com/openssh/openssh-portable,
2,"if -s & -p specified, mention 'sftp -P' on suc...",9de79df66d1430d290fab670bb4b18612875e518,https://github.com/openssh/openssh-portable,
3,drop whitespace\n\nSSH-Copy-ID-Upstream: e604f...,801cda54c00e0f4e7d89345a90874c8d05dc233a,https://github.com/openssh/openssh-portable,
4,make -x also apply to the target script\n\nSSH...,288482f53613f3e74544eb92deeb24f7c7f1f371,https://github.com/openssh/openssh-portable,
