## Imports

In [None]:
import re
import os
import jsonpickle
import matplotlib.pyplot as plt

## Constants

In [None]:
# Directory the notebook lists/reads its input files from and writes its
# result files to.
DATA_FOLDER = "./../data/"

In [None]:
# Marker looked for in the file names under DATA_FOLDER to pick the
# commit-data files; removing it and ".json" from the file name must leave
# the integer repository id.
COMMIT_DATA_FILE_NAME = "commit_data_"

## Load data

In [None]:
# Maps repository name -> integer id parsed from the commit-data file name.
# Filled while loading; used later to name the per-repository output files.
REPO_NAME_TO_ID = {}

In [None]:
# Load every commit-data file found in DATA_FOLDER into `data`.
# Each non-blank line is decoded with jsonpickle into a dict whose keys are
# used as repository names.
# NOTE(security): jsonpickle.decode can instantiate arbitrary Python objects,
# so DATA_FOLDER must only contain files from a trusted source.
data = {}
for file in os.listdir(DATA_FOLDER):
    if COMMIT_DATA_FILE_NAME not in file:
        continue
    # The id depends only on the file name, so parse it once per file
    # instead of once per line.
    repo_id = int(file.replace(COMMIT_DATA_FILE_NAME, "").replace(".json", ""))
    with open(os.path.join(DATA_FOLDER, file), "r") as f_in:
        for line in f_in:
            if not line.strip():
                # Tolerate blank / trailing lines instead of failing in decode.
                continue
            repo_data = jsonpickle.decode(line)
            data.update(repo_data)
            # Register every repository on the line (not only the first key),
            # so each entry added to `data` also has an id.
            for repo_name in repo_data:
                REPO_NAME_TO_ID[repo_name] = repo_id
                #
                print(repo_name, "==>", file)

## Utils

In [None]:
def plot_distribution(data, title, buckets=100, x_min=None, x_max=None):
    """Show a histogram of ``data`` under ``title``.

    Values below ``x_min`` or above ``x_max`` are dropped before plotting
    (each bound is applied only when it is not ``None``). ``buckets`` is
    passed to ``plt.hist`` as its second argument. The x-axis limits are
    fixed to ``(x_min, x_max)`` only when both bounds are given.
    """
    has_lower = x_min is not None
    has_upper = x_max is not None
    if has_lower or has_upper:
        # Single pass; an unset bound never rejects a value.
        data = [
            value
            for value in data
            if (not has_lower or value >= x_min)
            and (not has_upper or value <= x_max)
        ]
    plt.hist(data, buckets)
    plt.title(title)
    if has_lower and has_upper:
        plt.xlim(x_min, x_max)
    plt.show()

## Basic data analysis

In [None]:
# Report the commit count per repository and the grand total; the
# repositories listed here are left out of both the listing and the total.
excluded_repos = ["gatsbyjs/gatsby", "scikit-learn/scikit-learn", "elastic/elasticsearch"]
total_commit_cnt = 0
for repo, repo_entry in data.items():
    if repo not in excluded_repos:
        commit_cnt = len(repo_entry['commits'])
        print(f"{repo} number of commits: {commit_cnt}")
        total_commit_cnt += commit_cnt
print("Total commit cnt:", total_commit_cnt)

In [None]:
# Per repository: how many files a commit touches on average, and how many
# commits touch no file at all.
for repo in data:
    repo_commits = data[repo]["commits"]
    print(f"Repository: {repo} \t==> {len(repo_commits)}")
    if not repo_commits:
        # Nothing to average; the divisions below would raise
        # ZeroDivisionError for a repository without commits.
        print("\t No commits")
        print()
        continue
    #
    files_per_commit = [len(commit["files"]) for commit in repo_commits]
    avg_files_per_commit = sum(files_per_commit) / len(files_per_commit)
    #
    # Same average restricted to commits that touch at least one file;
    # reported as 0.0 when no such commit exists.
    fc_plus = [fc for fc in files_per_commit if fc > 0]
    avg_files_per_commit_with_file = sum(fc_plus) / len(fc_plus) if fc_plus else 0.0
    #
    # Counts are never negative, so "== 0" is exactly the complement of "> 0".
    cnt_commit_without_files = len(files_per_commit) - len(fc_plus)
    #
    print(f"\t Avg files per commit: {avg_files_per_commit}")
    print(f"\t Avg files per commit with files: {avg_files_per_commit_with_file}")
    print(f"\t Cnt commit without file: {cnt_commit_without_files} => {round(100 * cnt_commit_without_files/len(repo_commits), 2)}%")
    print()

In [None]:
# Store, per repository, the commits that touch at least one file under
# the "fc_plus_commits" key.
for repo_entry in data.values():
    repo_entry["fc_plus_commits"] = [
        commit for commit in repo_entry["commits"] if len(commit["files"]) > 0
    ]

In [None]:
# Histogram x-axis window shared by every plot below (loop-invariant, so
# defined once instead of on every iteration).
MIN_VALUE = 0
MAX_VALUE = 250
# (statistic key, per-file field) pairs: which file field feeds which list.
STAT_FIELDS = (("changes", "change_cnt"), ("add", "add_cnt"), ("del", "del_cnt"))

# Per repository: average and distribution of changed / added / deleted
# counts, both summed per commit and per individual file.
for repo in data:
    fc_plus_commits = data[repo]["fc_plus_commits"]
    print(f"Repository: {repo} \t==> {len(fc_plus_commits)}")
    if not fc_plus_commits:
        # No commit with files: every average below would divide by zero.
        print("\t No commits with files")
        print()
        continue
    #
    # One entry per commit (sum over the commit's files).
    commit_stat_data = {key: [] for key, _ in STAT_FIELDS}
    # One entry per touched file.
    commit_file_stat_data = {key: [] for key, _ in STAT_FIELDS}
    #
    for commit in fc_plus_commits:
        for key, field in STAT_FIELDS:
            per_file_values = [file[field] for file in commit["files"]]
            commit_file_stat_data[key].extend(per_file_values)
            commit_stat_data[key].append(sum(per_file_values))
    #
    # Every commit in "fc_plus_commits" has at least one file, so the
    # per-file lists are non-empty whenever the per-commit lists are.
    for stats, unit in ((commit_stat_data, "commit"), (commit_file_stat_data, "file")):
        print(f"\t Avg changes per {unit}: {sum(stats['changes']) / len(stats['changes'])}")
        print(f"\t Avg adds per {unit}: {sum(stats['add']) / len(stats['add'])}")
        print(f"\t Avg dels per {unit}: {sum(stats['del']) / len(stats['del'])}")
        print()
    for stats, unit in ((commit_stat_data, "commit"), (commit_file_stat_data, "file")):
        plot_distribution(stats['changes'], f"Changes per {unit}", x_min=MIN_VALUE, x_max=MAX_VALUE)
        plot_distribution(stats['add'], f"Add per {unit}", x_min=MIN_VALUE, x_max=MAX_VALUE)
        plot_distribution(stats['del'], f"Del per {unit}", x_min=MIN_VALUE, x_max=MAX_VALUE)
    print()


In [None]:
# Index the commits of each repository by the files they touch:
# data[repo]["per_file"][file name] -> list of commits listing that file.
for repo, repo_entry in data.items():
    print(f"Repository: {repo} \t==> {len(repo_entry['fc_plus_commits'])}")
    #
    commits_by_file = {}
    for commit in repo_entry["fc_plus_commits"]:
        for touched in commit["files"]:
            commits_by_file.setdefault(touched["name"], []).append(commit)
    #
    repo_entry["per_file"] = commits_by_file

In [None]:
# Per repository: number of distinct files and the average number of
# commits recorded per file.
for repo in data:
    per_file = data[repo]['per_file']
    print(f"Repository: {repo} \t==> {len(per_file)} unique files")
    if per_file:
        avg_commits_per_file = sum(len(commits) for commits in per_file.values()) / len(per_file)
    else:
        # No files recorded: report 0.0 instead of dividing by zero.
        avg_commits_per_file = 0.0
    print(f"\t Avg commits per file: {avg_commits_per_file}")
    print()
    print()

In [None]:
# Captures the number of an issue / pull-request reference: the digits that
# directly follow "#", "/issues/" or "/pull/".
ISSUE_REF_REGEX = r"(?:#|\/issues\/|\/pull\/)(\d+)"

In [None]:
# Quick manual check of ISSUE_REF_REGEX: print the captured numbers for a
# few hand-written sample messages.
dummy_strings = [
    "This is a issue: #123 , #124 and https://github.com/yt-dlp/yt-dlp/issues/4635#issuecomment-1235384480",
    "cb0a719f67136e31b68d6f6e794fee10b256bf21",
    "PR-URL: https://github.com/nodejs/node/pull/42796",
]
for dummy in dummy_strings:
    print(re.findall(ISSUE_REF_REGEX, dummy))

In [None]:
# Annotate every commit that has files with the issue/PR numbers found in
# its message: "has_refs" (bool) and "refs" (set of number strings).
issue_ref_pattern = re.compile(ISSUE_REF_REGEX)  # compiled once for all loops

for repo in data:
    fc_plus_commits = data[repo]["fc_plus_commits"]
    print(f"Repository: {repo} \t==> {len(fc_plus_commits)}")
    #
    cnt = 0
    for commit in fc_plus_commits:
        refs = issue_ref_pattern.findall(commit["msg"])
        commit["has_refs"] = bool(refs)
        commit["refs"] = set(refs)
        if refs:
            cnt += 1
    print(f"\tCommit with ref: {cnt}")
    print()

In [None]:
# A file is "of interest" when every commit recorded for it carries at least
# one issue/PR reference. Collect those files per repository, write them to
# files_of_interest_<repo id>.json and print summary counts.
files_of_interest = {}

for repo in data:
    per_file = data[repo]['per_file']
    print(f"Repository: {repo} \t==> {len(per_file)} unique files")
    files_of_interest[repo] = {}
    files_with_all_refs = 0
    files_with_all_refs_multiple_commits = 0
    issues_of_interest = set()
    for file, file_commits in per_file.items():
        if not all(commit["has_refs"] for commit in file_commits):
            continue
        files_with_all_refs += 1
        #
        # All commits have refs here, so "more than one referencing commit"
        # is the same as "more than one commit".
        if len(file_commits) > 1:
            files_with_all_refs_multiple_commits += 1
        #
        for commit in file_commits:
            issues_of_interest.update(commit["refs"])
        #
        files_of_interest[repo][file] = file_commits
    #
    # Per-repository dump, keyed by repository name like the combined data.
    repo_files_of_interest = {repo: files_of_interest[repo]}
    #
    encoded = jsonpickle.encode(repo_files_of_interest)
    repo_id = REPO_NAME_TO_ID[repo]
    with open(os.path.join(DATA_FOLDER, f"files_of_interest_{repo_id}.json"), "w") as f_out:
        f_out.write(encoded)
    # "\F" was an invalid escape sequence that printed a literal backslash;
    # use a tab like the other summary lines in this notebook.
    print(f"\tFiles with all ref: {files_with_all_refs}")
    print(f"\tFiles with all ref and multiple commits: {files_with_all_refs_multiple_commits}")
    print(f"\tIssues of interest: {len(issues_of_interest)}")
    print()
    print()

In [None]:
# Persist the files of interest of all repositories in one combined file.
# Encoding happens before the file is opened, so a failing encode does not
# truncate an existing file.
encoded = jsonpickle.encode(files_of_interest)
combined_files_path = os.path.join(DATA_FOLDER, "files_of_interest.json")
with open(combined_files_path, "w") as f_out:
    f_out.write(encoded)

## Separate issues of interest

In [None]:
# Name of the issue-data file inside DATA_FOLDER.
ISSUE_DATA_FILE_NAME = "issue-data.json"

In [None]:
# Decode the issue data with jsonpickle.
# NOTE(review): `issue_data` is reassigned for every line, so only the value
# decoded from the LAST line is kept (and it stays undefined for an empty
# file). Presumably the file holds a single line — confirm.
# NOTE(security): jsonpickle.decode can instantiate arbitrary Python objects;
# only load trusted files.
with open(os.path.join(DATA_FOLDER, ISSUE_DATA_FILE_NAME), "r") as f_in:
    for line in f_in:
        issue_data = jsonpickle.decode(line)

In [None]:
# Print the issue count of every repository; the repositories listed here
# are still printed but do not contribute to the total.
skipped_repos = ["gatsbyjs/gatsby", "scikit-learn/scikit-learn", "elastic/elasticsearch"]
total_issue_cnt = 0
for repo in issue_data:
    name = repo["name"]
    issue_cnt = len(repo["issues"])
    print(name, "\t==>", issue_cnt)
    if name not in skipped_repos:
        total_issue_cnt += issue_cnt
print(f"Total issue count: {total_issue_cnt}")

In [None]:
# Step 1: per repository, register every issue number referenced by a commit
# of a file of interest, with None as a placeholder.
issues_of_interest = {}
for repo, repo_files in files_of_interest.items():
    referenced = {}
    issues_of_interest[repo] = referenced
    for file_commits in repo_files.values():
        for commit in file_commits:
            for ref in commit["refs"]:
                # Refs are number strings; the lookup below uses issue["number"].
                referenced.setdefault(int(ref), None)
#
# Step 2: replace the placeholder with the issue record wherever the issue
# data contains an issue with that number.
for repo in issue_data:
    wanted = issues_of_interest.get(repo["name"])
    if wanted is None:
        continue
    for issue in repo["issues"]:
        nmr = issue["number"]
        if nmr in wanted:
            wanted[nmr] = issue

In [None]:
# A repository's issues of interest are written to their own file only when
# the fraction of referenced issues that were found exceeds this value.
ISSUE_OF_INTEREST_THRESHOLD = 0.9

In [None]:
# Per repository: report how many referenced issues could not be matched,
# and export the repository's issues when enough of them were found.
for repo in issues_of_interest:
    repo_issues = issues_of_interest[repo]
    # Entries still holding the None placeholder were never matched.
    not_found_cnt = sum(1 for issue in repo_issues.values() if issue is None)
    #
    print(repo, "\t==>", f"Number of issues not found: {not_found_cnt}/{len(repo_issues)}")
    #
    if not repo_issues:
        # No referenced issues: the ratio is undefined (division by zero)
        # and there is nothing to export.
        continue
    #
    found_ratio = 1.0 - not_found_cnt / len(repo_issues)
    #
    if found_ratio > ISSUE_OF_INTEREST_THRESHOLD:
        repo_issues_of_interest = {repo: repo_issues}
        #
        encoded = jsonpickle.encode(repo_issues_of_interest)
        #
        repo_id = REPO_NAME_TO_ID[repo]
        with open(os.path.join(DATA_FOLDER, f"issues_of_interest_{repo_id}.json"), "w") as f_out:
            f_out.write(encoded)

In [None]:
# Persist the issues of interest of all repositories in one combined file.
# Encoding happens before the file is opened, so a failing encode does not
# truncate an existing file.
encoded = jsonpickle.encode(issues_of_interest)
combined_issues_path = os.path.join(DATA_FOLDER, "issues_of_interest.json")
with open(combined_issues_path, "w") as f_out:
    f_out.write(encoded)