## Imports

In [None]:
import time
import base64
import jsonpickle
from github import Github

## Constants

In [None]:
# Path to the issue-data file; the "Basic analysis" cell opens it and decodes
# its lines with jsonpickle.
ISSUE_DATA_FILE = "../data/issue-data.json"

In [None]:
# Repository languages (compared against each repo's "language" field) that
# are included in the analysis and the crawl.
LANGUAGES = {"Python", "Java", "JavaScript", "PHP", "Ruby", "Go"}
# File-name suffixes a changed file must end with for it to be recorded.
FILE_EXTENSIONS = {".py", ".java", ".js", ".php", ".rb", ".go"}

In [None]:
# A progress line is printed once every COMMIT_PRINT_FREQ commits while
# crawling a repository.
COMMIT_PRINT_FREQ = 10

## Utility

In [None]:
def note_time():
    """Overwrite ``time.txt`` with the current time as ``YYYYMMDD-HHMMSS``.

    The file is created in the current working directory and only ever
    holds the most recent stamp, since it is reopened in write mode on
    every call.
    """
    stamp = time.strftime("%Y%m%d-%H%M%S")
    with open("time.txt", "w") as stamp_file:
        stamp_file.write(stamp)

def check_remaining(github, MIN_REMAINING = 10):
    """Pause the crawl when the GitHub core request quota is nearly used up.

    Every call first records the current time via ``note_time()``. The
    remaining "core" quota is then read from ``github.get_rate_limit()``;
    if it is below ``MIN_REMAINING`` the function sleeps for 3650 seconds
    (presumably chosen to outlast an hourly rate-limit window -- confirm).

    Args:
        github: Client object exposing ``get_rate_limit()``.
        MIN_REMAINING: Lowest remaining-request count that is still
            considered safe to continue with.
    """
    note_time()
    core_quota = github.get_rate_limit().raw_data["core"]
    if core_quota["remaining"] >= MIN_REMAINING:
        return
    time.sleep(3650)

## Basic analysis

In [None]:
# Load the issue data: each line of ISSUE_DATA_FILE is decoded with jsonpickle.
# NOTE(review): `data` is rebound on every iteration, so only the value decoded
# from the last line survives -- presumably the file holds a single line; confirm.
# If the file is empty, `data` is never defined and later cells will fail.
# NOTE(review): jsonpickle.decode can reconstruct arbitrary Python objects, so
# this should only be run on a trusted data file.
with open(ISSUE_DATA_FILE, "r") as f_in:
    for line in f_in:
        data = jsonpickle.decode(line)

In [None]:
# Report how many entries were loaded into `data`.
print(f"Number of repos: {len(data)}")

## Filter repos by language

In [None]:
# List the name of every loaded repository whose language is in LANGUAGES.
for repo in data:
    if repo["language"] not in LANGUAGES:
        continue
    print(repo['name'])

## GitHub

In [None]:
# Access token passed to Github(...) in the next cell. It is blank in the
# source -- presumably filled in locally before running; do not commit a real
# token.
GITHUB_ACCESS_TOKEN = ""

In [None]:
# GitHub client built from the access token; used for the rate-limit checks
# and repository lookups in the crawl below.
github = Github(GITHUB_ACCESS_TOKEN)

In [None]:
# Crawl every loaded repository whose language is in LANGUAGES and write its
# commit history to commit_data_<repo_id>.json (one file per repository).
# For each commit the sha, message and committer date are stored, plus -- for
# every changed file whose name ends with one of FILE_EXTENSIONS -- the file's
# change counts and blob content.
#
# Errors are handled best-effort: a failure on a file, commit or repository is
# printed and remembered in LAST_ERROR, and the crawl moves on.
for repo_id, repo in enumerate(data):
    # Reset per repository so each output file contains exactly one repo.
    repos_commit_data = {}
    # Most recent exception seen for this repository (kept for inspection).
    LAST_ERROR = None
    try:
        print(f"{repo['name']} -- {repo_id + 1}/{len(data)}")
        #
        if repo["language"] not in LANGUAGES:
            continue
        check_remaining(github)
        #
        gh_repo_access = github.get_repo(repo["name"])
        #
        repos_commit_data[repo["name"]] = {"commits": []}
        #
        commits = gh_repo_access.get_commits()
        #
        for commit_id, commit in enumerate(commits):
            try:
                if (commit_id+1) % COMMIT_PRINT_FREQ == 0:
                    timestr = time.strftime("%Y-%m-%d %H:%M:%S")
                    print(f"\t{timestr}: {commit_id+1}/{commits.totalCount}")
                #
                commit_data = {
                    "sha": commit.sha,
                    "msg": commit.commit.message,
                    "date": commit.commit.committer.date,
                    "files": [],
                }
                #
                # Reading commit.files makes PyGithub fetch the full commit
                # (the commit listing does not carry the file list), so check
                # the quota once per commit. Without this, long runs of
                # commits that touch no accepted file would never be
                # rate-checked.
                check_remaining(github)
                #
                for file in commit.files:
                    try:
                        # str.endswith accepts a tuple of candidate suffixes.
                        if not file.filename.endswith(tuple(FILE_EXTENSIONS)):
                            continue
                        #
                        file_data = {
                            "sha": file.sha,
                            "name": file.filename,
                            "change_cnt": file.changes,
                            "add_cnt": file.additions,
                            "del_cnt": file.deletions,
                            # Stays None when the blob download below fails.
                            "content": None,
                        }
                        #
                        try:
                            # Single quota check right before the blob request.
                            check_remaining(github)
                            #
                            file_content_package = gh_repo_access.get_git_blob(file.sha)
                            file_data["content"] = file_content_package.content
                            file_data["content_encoding"] = file_content_package.encoding
                        except Exception as e:
                            # Keep the file's metadata even without content.
                            LAST_ERROR = e
                            print(LAST_ERROR)
                        #
                        commit_data["files"].append(file_data)
                    except Exception as e:
                        LAST_ERROR = e
                        print(LAST_ERROR)
                #
                repos_commit_data[repo["name"]]["commits"].append(commit_data)
            except Exception as e:
                LAST_ERROR = e
                print(LAST_ERROR)
        #
        data_json = jsonpickle.encode(repos_commit_data)
        with open(f"commit_data_{repo_id}.json", "w") as out:
            out.write(data_json)
        #
    except Exception as e:
        LAST_ERROR = e
        print(LAST_ERROR)
