In [8]:
import requests
import os
import json
import time
from datetime import datetime, timedelta

# GitHub account and personal access token (PAT) used to authenticate API calls.
# Generate a PAT with "repo" scope (or finer-grained scopes if possible).
# The token is read from the GITHUB_TOKEN environment variable so a real secret
# never has to be pasted into (and accidentally shared with) the notebook.
# It falls back to an empty string when the variable is not set.
GITHUB_USERNAME = "turbomam"
GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN", "")

# Replace with the repository you want to analyze
REPO_OWNER = "microbiomedata"  # e.g., "facebook"
REPO_NAME = "nmdc-metadata"  # e.g., "react"

In [10]:


# Output directory for the downloaded JSON files (relative to the working directory)
OUTPUT_DIR = "github_data"

# exist_ok=True keeps the cell safe to re-run and avoids the race between a
# separate existence check and the directory creation.
os.makedirs(OUTPUT_DIR, exist_ok=True)

def _rate_limit_wait_seconds(response, buffer_seconds=10):
    """Return how many seconds to wait before retrying a rate-limited response.

    Returns None when the response headers do not indicate a rate limit, so the
    caller can treat the response as an ordinary error. The returned value may
    be zero or negative when the advertised reset time has already passed.
    """
    response_headers = response.headers

    # Secondary rate limits advertise the delay directly via Retry-After.
    retry_after = response_headers.get("Retry-After")
    if retry_after is not None:
        try:
            return float(retry_after) + buffer_seconds
        except ValueError:
            pass  # not a plain number of seconds; fall back to the reset header

    # Primary rate limit: quota is exhausted until the X-RateLimit-Reset epoch.
    if response_headers.get("X-RateLimit-Remaining") == "0":
        try:
            reset_epoch = int(response_headers.get("X-RateLimit-Reset"))
        except (TypeError, ValueError):
            return None
        return reset_epoch - time.time() + buffer_seconds

    return None


def download_data(api_url, data_type, filename):
    """Download every page of a GitHub list endpoint, with status messages.

    Relies on two module-level names that must be defined before the first
    call: ``headers`` (HTTP headers sent with every request) and
    ``total_items`` (expected item count, used only for the time-remaining
    estimate).

    Args:
        api_url: URL of a paginated endpoint that returns a JSON list.
        data_type: Label used in messages (e.g. "issues", "pull requests").
            The value "comments" suppresses progress output and the nested
            comment download.
        filename: Unused; kept so existing callers keep working.

    Returns:
        The list of downloaded items. For non-comment downloads, every item
        that has a ``comments_url`` and whose comments were fetched
        successfully gets them stored under its ``"comments"`` key.
        Returns None if a request fails or returns an unexpected status.

    Raises:
        RuntimeError: If a rate-limited response reports a reset time that has
            already passed (including the safety buffer).
    """

    all_data = []
    page = 1
    start_time = datetime.now()

    while True:
        # Let requests build the query string so a URL that already carries
        # parameters is merged with ours instead of getting a second "?".
        query = {"page": page, "per_page": 100, "state": "all"}
        try:
            # A timeout keeps a stalled connection from hanging the notebook.
            response = requests.get(api_url, headers=headers, params=query, timeout=30)
        except requests.RequestException as exc:
            print(f"Error downloading {data_type}: {exc}")
            return None

        if response.status_code == 200:
            data = response.json()
            if not data:
                break  # an empty page marks the end of the listing

            all_data.extend(data)

            if data_type != "comments":
                elapsed_time = datetime.now() - start_time
                elapsed_seconds = elapsed_time.total_seconds()
                items_downloaded = len(all_data)
                rate = items_downloaded / elapsed_seconds if elapsed_seconds > 0 else 0
                if rate > 0 and total_items > items_downloaded:
                    time_remaining = timedelta(seconds=(total_items - items_downloaded) / rate)
                else:
                    time_remaining = "Unknown"
                print(
                    f"Downloaded page {page} of {data_type} ({items_downloaded} {data_type} so far). "
                    f"Elapsed: {elapsed_time}, Rate: {rate:.2f} {data_type}/sec, Time Remaining: {time_remaining}."
                )
            page += 1

        elif response.status_code in (403, 429):
            wait_time = _rate_limit_wait_seconds(response)
            if wait_time is None:
                # A 403/429 without rate-limit headers is a real error (for
                # example missing permissions); retrying would loop forever.
                print(f"Error downloading {data_type}: {response.status_code} - {response.text}")
                return None

            print("Rate limit hit. Waiting...")
            if wait_time <= 0:
                raise RuntimeError("Rate limit reset time in the past")
            print(f"Waiting for {wait_time:.1f} seconds...")
            time.sleep(wait_time)
            continue  # retry the same page

        else:
            print(f"Error downloading {data_type}: {response.status_code} - {response.text}")
            return None

    if data_type != "comments":
        # Fetch each item's comment thread and store it on the item itself.
        for item in all_data:
            comments_url = item.get("comments_url")
            if comments_url:
                comments = download_data(comments_url, "comments", None)
                if comments is not None:
                    item["comments"] = comments
                else:
                    print(f"Failed to download comments for {data_type} #{item.get('number') or item.get('id')}")

    return all_data


def _download_and_save(api_url, data_type, filename):
    """Download one kind of item and write it as JSON into OUTPUT_DIR.

    Returns the downloaded list, or None when the download failed. An empty
    list is a valid result (the repository has no such items) and is saved.
    """
    items = download_data(api_url, data_type, filename)
    if items is None:
        print(f"Failed to download {data_type} or comments.")
        return None

    with open(os.path.join(OUTPUT_DIR, filename), "w", encoding="utf-8") as f:
        json.dump(items, f, indent=4)
    print(f"{data_type.capitalize()} and comments saved.")
    return items


if __name__ == "__main__":
    # NOTE: `headers` and `total_items` must stay module-level names here;
    # download_data() reads them as globals.
    headers = {
        "Accept": "application/vnd.github.v3+json",
    }
    if GITHUB_TOKEN:
        headers["Authorization"] = f"token {GITHUB_TOKEN}"
    else:
        # Do not send a malformed "token " header when no token is configured.
        print("GITHUB_TOKEN is empty; sending unauthenticated requests.")

    API_URL = f"https://api.github.com/repos/{REPO_OWNER}/{REPO_NAME}/issues"

    # Use the search API to get the total count
    search_url = f"https://api.github.com/search/issues?q=repo:{REPO_OWNER}/{REPO_NAME}"
    total_items_response = requests.get(search_url, headers=headers)
    if total_items_response.status_code == 200:
        total_items = total_items_response.json()["total_count"]
    else:
        total_items = 0
        print("Could not retrieve total number of items. Progress estimation will not work.")

    # Download Issues
    issues = _download_and_save(API_URL, "issues", f"{REPO_OWNER}_{REPO_NAME}_issues.json")

    # Download Pull Requests
    pulls_url = f"https://api.github.com/repos/{REPO_OWNER}/{REPO_NAME}/pulls"
    pulls = _download_and_save(pulls_url, "pull requests", f"{REPO_OWNER}_{REPO_NAME}_pulls.json")

Could not retrieve total number of items. Progress estimation will not work.
Downloaded page 1 of issues (100 issues so far). Elapsed: 0:00:00.635497, Rate: 157.36 issues/sec, Time Remaining: Unknown.
Downloaded page 2 of issues (200 issues so far). Elapsed: 0:00:01.253279, Rate: 159.58 issues/sec, Time Remaining: Unknown.
Downloaded page 3 of issues (300 issues so far). Elapsed: 0:00:01.804811, Rate: 166.22 issues/sec, Time Remaining: Unknown.
Downloaded page 4 of issues (400 issues so far). Elapsed: 0:00:02.391352, Rate: 167.27 issues/sec, Time Remaining: Unknown.
Downloaded page 5 of issues (433 issues so far). Elapsed: 0:00:02.736660, Rate: 158.22 issues/sec, Time Remaining: Unknown.
Issues and comments saved.
Downloaded page 1 of pull requests (100 pull requests so far). Elapsed: 0:00:00.752086, Rate: 132.96 pull requests/sec, Time Remaining: Unknown.
Downloaded page 2 of pull requests (138 pull requests so far). Elapsed: 0:00:01.170873, Rate: 117.86 pull requests/sec, Time Remain