## Imports

In [None]:
import os
import time

import jsonpickle
from github import Github

## Utility

In [None]:
def note_time():
    """Overwrite ``time.txt`` with the current local time.

    The stamp has the form ``YYYYMMDD-HHMMSS``.
    """
    now = time.localtime()
    stamp = time.strftime("%Y%m%d-%H%M%S", now)
    with open("time.txt", "w") as stamp_file:
        stamp_file.write(stamp)

def check_remaining(github, MIN_REMAINING = 10):
    """Pause when the GitHub core rate limit is almost used up.

    Every call first records a timestamp via ``note_time()``. When fewer than
    ``MIN_REMAINING`` core requests are left, the function sleeps until the
    rate-limit window is reported to reset, instead of always sleeping a fixed
    3650 seconds.

    Args:
        github: Client exposing ``get_rate_limit()`` whose
            ``raw_data["core"]`` holds the ``remaining`` counter.
        MIN_REMAINING: Threshold below which the function sleeps.
    """
    note_time()
    core = github.get_rate_limit().raw_data["core"]
    if core["remaining"] >= MIN_REMAINING:
        return

    # Previous fixed wait (one hour plus slack); kept as fallback and ceiling.
    max_wait = 3650
    # GitHub reports the window reset as UTC epoch seconds in "reset".
    reset_at = core.get("reset")
    if isinstance(reset_at, (int, float)):
        # Sleep only until the reset plus a small safety margin. If the local
        # clock is off and we wake too early, the next call sleeps again.
        wait = min(max(reset_at - time.time(), 0) + 50, max_wait)
    else:
        wait = max_wait
    time.sleep(wait)

## Connect to GitHub

In [None]:
# Access token for the GitHub API. It is read from the GITHUB_ACCESS_TOKEN
# environment variable so the secret does not have to be pasted into (and
# accidentally committed with) the notebook. The empty-string fallback keeps
# the previous default when the variable is not set.
GITHUB_ACCESS_TOKEN = os.environ.get("GITHUB_ACCESS_TOKEN", "")

In [None]:
# Build the PyGithub client from the access token; later cells reuse this object.
github = Github(GITHUB_ACCESS_TOKEN)

In [None]:
# Record a timestamp and wait if the core rate limit is nearly exhausted.
check_remaining(github)

## Search for popular repositories with good first issues

In [None]:
# A repository must have more than this many good-first-issues to match the
# search query built in the next cell.
FIRST_GOOD_CNT = 5

In [None]:
# Search for repositories whose good-first-issues count exceeds FIRST_GOOD_CNT.
repositories = github.search_repositories(
    query=f"good-first-issues:>{FIRST_GOOD_CNT}"
)

In [None]:
# Record a timestamp and wait if the core rate limit is nearly exhausted.
check_remaining(github)

In [None]:
# Limits how many search results the filtering loop examines.
MAX_REPO_CNT = 100
# Thresholds a repository has to reach to be kept by the filtering loop:
# minimum star count, and minimum number of issues returned by get_issues().
MIN_STAR_CNT = 50
MIN_ISSUE_CNT = 100

In [None]:
# Keep the searched repositories that have at least MIN_STAR_CNT stars and at
# least MIN_ISSUE_CNT issues returned by get_issues() (open issues by default).
repos_of_interest = []
for i, repo in enumerate(repositories):
    # Check the cap before doing any work. The previous check ran after the
    # repository had been processed, so MAX_REPO_CNT + 1 results were examined.
    if i >= MAX_REPO_CNT:
        break
    # One rate-limit check per repository is enough here; the former second
    # check at the end of the loop body ran back-to-back with this one.
    check_remaining(github)
    if repo.stargazers_count < MIN_STAR_CNT:
        continue
    issue_cnt = 0
    for issue in repo.get_issues():
        check_remaining(github)
        issue_cnt += 1
        # Stop paging as soon as the threshold is reached; the exact total
        # is not needed.
        if issue_cnt >= MIN_ISSUE_CNT:
            repos_of_interest.append(repo)
            break

In [None]:
# Report how many repositories passed the star and issue thresholds.
print("Number of collected repos: ", len(repos_of_interest))

## Manually set repos of interest

In [None]:
"""
repo_names = ["ytdl-org/youtube-dl", 
              "facebook/react-native",
              "kubernetes/kubernetes",
              "vercel/next.js",
              "nodejs/node",
              "mui/material-ui",
              "huggingface/transformers",
              "elastic/elasticsearch",
              "gatsbyjs/gatsby",
              "scikit-learn/scikit-learn"]
"""

In [None]:
# Replace the searched repositories with manually listed ones, but only when a
# non-empty `repo_names` list has actually been defined. Previously this cell
# first emptied `repos_of_interest` and then raised NameError whenever
# `repo_names` was missing, throwing away the search results.
try:
    manual_repo_names = repo_names
except NameError:
    manual_repo_names = []

if manual_repo_names:
    repos_of_interest = []
    for repo_name in manual_repo_names:
        # Same rate-limit guard as the other loops that call the API.
        check_remaining(github)
        repos_of_interest.append(github.get_repo(repo_name))

## Collect and store data

In [None]:
# Sanity check: number of repositories about to be processed.
print(len(repos_of_interest))

In [None]:
# Accumulates one dict per repository; filled by the collection loop and
# written to disk at the end.
data = []

In [None]:
# Holds the most recent exception caught by the collection loop (None if none).
LAST_ERROR = None

In [None]:
def _issue_record(issue):
    """Return the stored fields of one issue as a plain dict."""
    return {
        "id": issue.id,
        "title": issue.title,
        "number": issue.number,
        "html_url": issue.html_url,
        "body": issue.body,
        "labels": [lbl.name for lbl in issue.labels],
        "comment cnt": issue.comments,
        "comments": [],
        "has_pull_request": issue.pull_request is not None,
    }


# Collect name, stars, language and all issues for every repository of
# interest. Collection is best effort: failures are reported and remembered in
# LAST_ERROR, and whatever was gathered for a repository up to that point is
# still kept rather than being dropped.
for repo in repos_of_interest:
    repo_data = None
    try:
        print(repo.full_name, "--->", repo.stargazers_count, "--->", repo.language)
        check_remaining(github)
        repo_data = {
            "name": repo.full_name,
            "star": repo.stargazers_count,
            "language": repo.language,
            "issues": [],
        }
        # Very important to set state: the default returns open issues only.
        for issue in repo.get_issues(state="all"):
            try:
                check_remaining(github)
                repo_data["issues"].append(_issue_record(issue))
            except Exception as e:
                # Skip just this issue, but make the failure visible.
                LAST_ERROR = e
                print("Skipped one issue --->", repr(e))
        check_remaining(github)
    except Exception as e:
        # Typically raised while paging through issues; the repository name
        # was printed at the start of this iteration.
        LAST_ERROR = e
        print("Collection for this repository stopped early --->", repr(e))
    finally:
        # Keep partial results instead of losing every issue already fetched.
        if repo_data is not None:
            data.append(repo_data)

In [None]:
# Show the most recent exception caught during collection (None if there was none).
print(LAST_ERROR)

In [None]:
# Serialise everything collected above and write it to issue-data.json. The
# explicit UTF-8 encoding makes the output independent of the platform's
# default text encoding.
data_json = jsonpickle.encode(data)
with open("issue-data.json", "w", encoding="utf-8") as out:
    out.write(data_json)