In [None]:
from collections import Counter, defaultdict
from pathlib import Path

import pandas as pd
from ruamel.yaml import YAML
from tqdm.auto import tqdm

In [None]:
# Calendar months covered by the study, formatted "YYYY-MM" to match the
# per-month snapshot directory names: 2022-06 through 2023-10 inclusive.
months = [
    f"{year}-{month_number:02d}"
    for year, first_month, last_month in ((2022, 6, 12), (2023, 1, 10))
    for month_number in range(first_month, last_month + 1)
]

# Safe-mode YAML parser; duplicate mapping keys are allowed rather than
# rejected.
yaml = YAML(typ="safe")
yaml.allow_duplicate_keys = True

# One entry per item found directly under the snapshot root directory.
pre_commit_root = Path("pre-commits")
repo_dir_list = list(pre_commit_root.iterdir())

## Explanation

The following fairly dense logic iterates through each repo's directory and each month's .pre-commit-config.yaml file. Not all repos have a commit every calendar month; when a month has no data but a preceding month does, the preceding month's pre-commit tool data is filled forward into the missing month, and that chaining repeats until a month with its own data is found.

Some repos didn't have .pre-commit-config.yaml files for several months at the beginning of the requested timespan, so they have no tools to count for those initial missing months (which is why I skip a few months of collected data when plotting the output). Ruff wasn't published until late Aug 2022, and I think the Ruff pre-commit tool didn't get published until a couple of months or so after that.

Also note there are quite a few hacky-looking conditional rules in the center of these loops that try to identify tool names. pre-commit can run any sort of tool: the easy case is GitHub repo links, but there are also a lot of repos where pre-commit runs local scripts and Python modules, so the conditional rules try to handle the majority of the cases I observed.

In [None]:
# Command prefixes that wrap the real tool in a local hook's ``entry``.
# Order matters: the first matching prefix wins, so the more specific
# prefixes must come before the shorter ones they extend.
_ENTRY_WRAPPER_PREFIXES = (
    "poetry run ",
    "python -m ",
    "python ",
    "python3 ",
    "./activated.py python ",
    "./activated.py ",
)


def _local_hook_tool_name(hook):
    """Guess the tool name for a hook that is not backed by a repo URL.

    Args:
        hook: A single hook mapping from a ``.pre-commit-config.yaml``.

    Returns:
        The guessed tool name, or ``None`` when the hook has no ``entry``.
    """
    if "entry" not in hook:
        return None

    tool_name = hook["entry"]
    for prefix in _ENTRY_WRAPPER_PREFIXES:
        if tool_name.startswith(prefix):
            # The first word after the wrapper command is taken as the tool.
            tool_name = tool_name[len(prefix):].split(" ")[0]
            break
    else:
        # ``entry: poetry`` with ``args: [run, <tool>, ...]``.
        args = hook.get("args") or []
        if tool_name == "poetry" and len(args) >= 2 and args[0] == "run":
            tool_name = args[1]

    # NOTE(review): this prefix is only stripped for non-URL hooks; URL-based
    # names such as "mirrors-mypy" are kept verbatim. Confirm this is intended.
    if tool_name.startswith("mirrors-"):
        tool_name = tool_name[len("mirrors-"):]
    return tool_name


def _tool_names_from_config(yaml_data):
    """Extract one tool name per recognisable repo entry of a parsed config.

    Args:
        yaml_data: Parsed ``.pre-commit-config.yaml`` content. Either a
            mapping with a ``repos`` key or a bare list of repo entries;
            ``None`` or empty input (e.g. an empty file) yields no tools.

    Returns:
        A list of tool names, in config order. Duplicates are kept.
    """
    if not yaml_data:
        return []

    repos = yaml_data["repos"] if "repos" in yaml_data else yaml_data
    tool_names = []
    for repo in repos:
        repo_url = repo["repo"]
        if "http" in repo_url:
            # Last path segment of the URL, tolerating one trailing slash.
            segments = repo_url.split("/")
            tool_names.append(segments[-1] if repo_url[-1] != "/" else segments[-2])
        elif repo.get("hooks"):
            # Only the first hook of a non-URL repo entry is inspected.
            tool_name = _local_hook_tool_name(repo["hooks"][0])
            if tool_name is not None:
                tool_names.append(tool_name)
    return tool_names


# month ("YYYY-MM") -> Counter mapping tool name -> number of occurrences
# across all repos for that month.
# NOTE(review): a tool listed twice in one repo's config is counted twice.
month_repo_counters = defaultdict(Counter)
for repo_dir in tqdm(repo_dir_list):
    # Tools from the most recent month that had a config file. This is only
    # replaced when a newer config exists, so months without a snapshot reuse
    # (fill forward) the previous month's tools. Months before the first
    # config contribute nothing.
    tool_names = []
    for month in months:
        config_path = repo_dir / month / ".pre-commit-config.yaml"
        if config_path.is_file():
            tool_names = _tool_names_from_config(yaml.load(config_path))
        month_repo_counters[month].update(tool_names)

In [None]:
# Line the per-month counters up in the same order as `months`.
ordered_counters = list(map(month_repo_counters.__getitem__, months))

In [None]:
# One row per month, one column per tool name. A tool that is absent from a
# month's Counter was seen zero times that month, so fill the NaN gaps the
# constructor leaves with 0 and keep the counts as integers.
df = pd.DataFrame(data=ordered_counters, index=months).fillna(0).astype(int)

In [None]:
# Top 20 tools by total count summed over all months, as {tool: total}.
(
    df.sum()
    .sort_values(ascending=False)
    .head(20)
    .to_dict()
)

In [None]:
# Tool columns of `df` to compare in the plot.
columns = [
    "black",
    "flake8",
    "isort",
    "pyupgrade",
    "ruff-pre-commit",
]

In [None]:
# Plot only the rows from 2022-10 onward (label-based slice on the month
# index), restricted to the selected tool columns.
ax = df.loc["2022-10":, columns].plot(
    title="Python Github Repo Pre-commit Tools",
    figsize=(10.76, 7.68),
)
ax.set_xlabel("Date")
ax.set_ylabel("Repo Count")

In [None]:
# Write the chart to disk with a tight bounding box and a small margin.
ax.get_figure().savefig(
    "ruff_growth.png",
    bbox_inches="tight",
    pad_inches=0.1,
)