In [None]:
from contextlib import suppress
from datetime import datetime, UTC
import math
import os
from pathlib import Path

from dotenv import load_dotenv
from github import Auth, Github
from github.GithubException import UnknownObjectException
from requests.exceptions import RetryError
from tqdm.auto import tqdm

In [None]:
# Load variables from a .env file into the process environment so the GitHub token
# can be picked up from there by the authentication cell.
load_dotenv()

## Github Token

This notebook expects a GitHub token, either as an environment variable named `GITHUB_API_TOKEN` or in a `.env` file in the root directory of this repo.

In [None]:
# Fail fast with an actionable message when the token is missing. Without this check a missing
# token is passed to Auth.Token as None, which PyGithub rejects with a bare assertion error
# that does not say what is wrong.
github_api_token = os.getenv("GITHUB_API_TOKEN")
if not github_api_token:
    raise RuntimeError(
        "GITHUB_API_TOKEN is not set; export it as an environment variable or add it to a .env file "
        "in the root directory of this repo."
    )
auth = Auth.Token(github_api_token)
g = Github(auth=auth)

## Description

The following logic searches Python repos in descending order of stars and filters out repos that have not been pushed to since the _min_created_at_ date or that do not contain a .pre-commit-config.yaml file.

## Warnings

1. Due to the Github REST API rate limits the following cell takes a long time to run, with the default arguments this took 8+ hours to scrape the .pre-commit-config.yaml files for 1000 repos.
2. The Github REST API has a nasty gotcha: if you try to search too many repos at once (i.e. the filter criteria are too broad) it will time out and return only some results - and those results will be time-sensitive and non-deterministic (i.e. when I tried searching for all repos at once in descending order of stars it would hit this timeout issue and randomly omit different repos that should have been included in the results). That was the reason for adding the bounds_scaling variable below, so that it only searches for repos between certain min and max star count values, and those bounds reduce each iteration. bounds_scaling should be less than 1 and greater than zero but probably needs to remain quite close to 1, i.e. 0.85, 0.9, or 0.95, to prevent the timeout issues that cause non-deterministic results.

## Potential Improvements

There are a number of potential improvements that could be made to the efficiency and coverage of this data collection, namely:

1. Further tweak the Github repo query strings to exclude archived repos or filter out repos that have no changes after the min_created_at date to filter out useless repos earlier in the pipeline.
2. This script aggregates commits to just take the most recent commit per month. You could instead add logic to inspect which commits actually touched the .pre-commit-config.yaml and specifically keep all of them to get the highest time fidelity change data.
3. There's no way at present (that I'm aware of) with the Github REST API search function to filter repos by the presence of a file (i.e. the .pre-commit-config.yaml), so this logic spends a lot of time manually checking the contents of repos in the search results. If it becomes possible in the future to just search repos on the presence of a particular file this code would probably run much faster (or maybe the filename could be searched first and then the repos that contain it iterated instead).

In [None]:
# Accumulators for the scraping cell: the full names of repos already recorded, and the repo
# objects in the order they were saved. Kept in their own cell so that re-running the scraping
# cell does not reset them.
processed_repos_names, precommit_repo_list = set(), []

In [None]:
# If this cell fails re-run and it should resume (it won't double count recorded repos)
%%time
pre_commits_path = Path("pre-commits")
min_created_at = datetime(year=2022, month=6, day=1, tzinfo=UTC)
saved_repo_count = sum(1 for subdirectory in pre_commits_path.iterdir() if subdirectory.is_dir()) if pre_commits_path.exists() else 0
required_repos = 1_000 - saved_repo_count
max_stars = 100000
bounds_scaling = 0.95
query = f"language:python pushed:>{min_created_at:%Y-%m-%d} stars:>{math.floor(max_stars * bounds_scaling)} archived:false"
with tqdm(total=required_repos) as pbar:
    while len(precommit_repo_list) < required_repos:
        result = g.search_repositories(query, sort="stars", order="desc")
        repo_list = list(result)

        print(f"{query} results={len(repo_list)}")

        # This checked_repo_details logic was just for debugging the repo search was progressing as expected
        checked_repos_log = Path("checked_repo_details.txt")
        if checked_repos_log.is_file():
            with checked_repos_log.open("a") as file:
                for repo in repo_list:
                    file.write(f"{repo.full_name} {repo.stargazers_count} {repo.created_at}\n")
        else:
            with checked_repos_log.open("w") as file:
                for repo in repo_list:
                    file.write(f"{repo.full_name} {repo.stargazers_count} {repo.created_at}\n")

        for repo in repo_list:
            with suppress(UnknownObjectException, AssertionError, RetryError):
                pre_commit_config = repo.get_contents(path=".pre-commit-config.yaml")
                if repo.full_name in processed_repos_names:
                    continue
                successful_save = False
                branch = repo.get_branch(repo.default_branch)
                commits = repo.get_commits(sha=branch.commit.sha, since=min_created_at)
                commit_list = list(commits)

                monthly_commits = {}
                for commit in commit_list:
                    commit_date = commit.commit.author.date
                    key = commit_date.strftime("%Y-%m")
                    if key not in monthly_commits or commit_date > monthly_commits[key].commit.author.date:
                        monthly_commits[key] = commit

                monthly_commit_list = sorted(monthly_commits.values(), key=lambda commit: commit.commit.author.date)
                monthly_commit_list = [commit for commit in monthly_commit_list if commit.commit.author.date >= min_created_at]

                for commit in monthly_commit_list:
                    with suppress(UnknownObjectException, AssertionError):
                        pre_commit_config = repo.get_contents(".pre-commit-config.yaml", ref=commit.sha)
                        save_path = Path(
                            f"""pre-commits/{repo.full_name.replace("/", "_")}/"""
                            f"""{commit.commit.author.date:%Y-%m}/.pre-commit-config.yaml"""
                        )
                        save_path.parent.mkdir(parents=True, exist_ok=True)
                        save_path.write_bytes(pre_commit_config.decoded_content)
                        successful_save = True

                if successful_save:
                    precommit_repo_list.append(repo)
                    processed_repos_names.add(repo.full_name)
                    pbar.update(1)

                if len(precommit_repo_list) >= required_repos:
                    break
        max_stars = math.floor(max_stars * bounds_scaling)
        if len(repo_list) > 0:
            max_stars = min(repo_list[-1].stargazers_count, max_stars)
        query = (
            f"language:python pushed:>{min_created_at:%Y-%m-%d} stars:{math.floor(max_stars * bounds_scaling)}..{max_stars} archived:false"
        )

# Save ordered list of repos used (just used for debugging)
with open("repo_names.txt", "w") as file:
    for repo in precommit_repo_list:
        file.write(repo.full_name + "\n")

In [None]:
# Number of repos recorded as successfully saved by the scraping cell in this kernel session.
len(processed_repos_names)