In [None]:
import csv
import getpass
import time
from datetime import datetime

import langdetect
from dateutil.relativedelta import relativedelta
from github import Github

# Shared GitHub API client used by the cells below. The access token is typed in
# interactively through getpass (input is not echoed), so it is never hard-coded in
# the notebook. per_page=100 is passed to the client as the page size for list requests.
g = Github(getpass.getpass("Enter access token:"), per_page=100)

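Searches GitHub for repositories that match a given language, a minimum star count, and a keyword in the description. The matching repositories are saved to one CSV file per language (`candidate_repos_<language>.csv`).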

In [None]:
# --- Search configuration ---------------------------------------------------
languages = ["Java", "Python"]
min_stars = 100
keywords = "library"

# A single GitHub search query exposes at most this many results, so any date
# window reporting this many matches (or more) has to be split before collecting.
MAX_SEARCH_RESULTS = 1000


def _detect_description_language(description):
    """Return the language code langdetect assigns to `description`.

    Returns an empty string when the description is missing/blank or when
    langdetect cannot find any usable features in it, so that a single odd
    repository does not abort the whole collection run.
    """
    if not description or not description.strip():
        return ""
    try:
        return langdetect.detect(description)
    except langdetect.lang_detect_exception.LangDetectException:
        return ""


for language in languages:
    print(f"Searching for repositories in {language}...")
    candidates_file = f"candidate_repos_{language.lower()}.csv"
    readmes_file = f"repo_readmes_{language.lower()}.csv"  # not used in this cell
    query = f"stars:>={min_stars} language:{language} in:description {keywords}"

    # The first window is 2010-01-01..2015-01-01; afterwards the window size adapts
    # to the number of matches (halved when too many, doubled when comfortably few).
    start_date = datetime.strptime("2010-01-01", "%Y-%m-%d")
    end_date = datetime.strptime("2015-01-01", "%Y-%m-%d")
    window_days = (end_date - start_date).days

    collected = 0
    with open(candidates_file, "w", newline="", encoding="UTF-8") as csv_file:
        csv_writer = csv.writer(csv_file, delimiter=";", quotechar='"', quoting=csv.QUOTE_MINIMAL)
        csv_writer.writerow(["ID", "OWNER", "NAME", "URL", "DESC", "DESC_LANG", "CREATED", "LAST_UPDATED", "TOPICS", "STARS", "SEARCH_LANGUAGE", "REPO_LANGUAGES"])
        while start_date < datetime.now():
            # Both ends of a `created:A..B` range are inclusive, so window_days == 0
            # denotes a single-day window.
            end_date = start_date + relativedelta(days=window_days)
            start_date_str = start_date.strftime("%Y-%m-%d")
            end_date_str = end_date.strftime("%Y-%m-%d")
            interval_repos = g.search_repositories(f"{query} created:{start_date_str}..{end_date_str}")
            interval_repos_count = interval_repos.totalCount
            print(f"{interval_repos_count} repositories are found between {start_date_str} and {end_date_str}.")

            if interval_repos_count >= MAX_SEARCH_RESULTS and window_days > 0:
                # Too many results: halve the window and retry from the same start date.
                # Halving in days (rather than stepping back whole years) guarantees the
                # window keeps shrinking instead of collapsing and ending the loop early.
                window_days //= 2
                print("Narrowing down the search interval...")
                continue

            if interval_repos_count >= MAX_SEARCH_RESULTS:
                # A single day cannot be split further by creation date.
                print(f"Warning: {start_date_str} alone matches {interval_repos_count} repositories; "
                      f"results beyond the first {MAX_SEARCH_RESULTS} cannot be retrieved.")

            print(f"Collecting repositories between {start_date_str} and {end_date_str}...")
            for repo in interval_repos:
                owner, name = repo.full_name.split("/", 1)
                desc_lang = _detect_description_language(repo.description)
                topics = ",".join(repo.get_topics())
                csv_writer.writerow([repo.id, owner, name, repo.html_url, repo.description, desc_lang, repo.created_at,
                                     repo.updated_at, topics, repo.stargazers_count, language, repo.get_languages()])
                collected += 1  # count rows actually written, not the reported total
                time.sleep(1) # an alternative would be checking the remaining requests count and adapt accordingly

            # Continue with the day after the window that was just collected.
            start_date = end_date + relativedelta(days=1)
            if interval_repos_count < MAX_SEARCH_RESULTS // 2:
                # Plenty of headroom: try a window twice as long next time.
                window_days = max(1, window_days * 2)

    print(f"{collected} repositories were collected for {language}.")
    time.sleep(60)

In [None]:
# Optional: uncomment the next line to export this notebook as a plain .py script.
# !jupyter nbconvert --to script 1_candidate_repo_collector.ipynb