# GitHub Repository Data Collector

This notebook collects GitHub repository statistics and saves them to `github_repository_stats.csv`, along with a small summary in `github_stats_metadata.json`.

## Setup
1. Create a `.env` file with your GitHub token: `GITHUB_TOKEN=your_token_here` (optional, but without a token GitHub limits you to 60 requests/hour)
2. Run all cells to collect fresh data
3. Use `github_stats_viewer.ipynb` to view the results

In [None]:
# Import required libraries
import requests
import pandas as pd
from datetime import datetime
import time
import os
from dotenv import load_dotenv
import json

# Read variables (e.g. GITHUB_TOKEN) from a .env file into the environment
load_dotenv()

print("Libraries imported successfully!")
# Only report whether a token is set; never echo the token itself
print("GitHub token available: " + ("Yes" if os.getenv("GITHUB_TOKEN") else "No"))

In [None]:
# Repositories to analyze, grouped by category.
# Keys become the "Field" column of the CSV; values are "owner/repo" paths.
repositories = {
    # --- AI / machine learning ---
    "AI/ML": [
        "huggingface/trl",
        "Significant-Gravitas/AutoGPT",
        "open-webui/open-webui",
        "comfyanonymous/ComfyUI",
        "langchain-ai/langchain",
        "huggingface/transformers",
        "ollama/ollama",
        "vllm-project/vllm",
        "n8n-io/n8n",
        "iterative/dvc",
        "langgenius/dify",
        "HumanSignal/label-studio",
        "microsoft/ML-For-Beginners",
        "cleanlab/cleanlab",
        "voxel51/fiftyone",
        "fastai/fastbook",
        "pytorch/pytorch",
        "tensorflow/tensorflow",
        "scikit-learn/scikit-learn",
        "pandas-dev/pandas",
    ],
    # --- TypeScript / Node.js ecosystem ---
    "TypeScript": [
        "nestjs/nest",
        "prisma/prisma",
        "adonisjs/core",
        "typeorm/typeorm",
        "sequelize/sequelize",
        "trpc/trpc",
        "fastify/fastify",
        "taskforcesh/bullmq",
        "mikro-orm/mikro-orm",
        "drizzle-team/drizzle-orm",
        "kysely-org/kysely",
        "colinhacks/zod",
        "winstonjs/winston",
        "pinojs/pino",
        "helmetjs/helmet",
        "moleculerjs/moleculer",
        "typestack/routing-controllers",
        "lukeautry/tsoa",
        "tsedio/tsed",
        "lobehub/lobe-chat",
        "vercel/ai",
    ],
    # --- C# / ASP.NET ---
    "C# ASP.NET": [
        "dotnet/aspnetcore",
        "aspnetrun/run-aspnetcore-microservices",
        "DapperLib/Dapper",
        "jasontaylordev/CleanArchitecture",
        "dotnet/efcore",
        "App-vNext/Polly",
        "dotnet/orleans",
        "HangfireIO/Hangfire",
        "nopSolutions/nopCommerce",
        "ThreeMammals/Ocelot",
        "OrchardCMS/OrchardCore",
        "dotnetcore/CAP",
        "gothinkster/aspnetcore-realworld-example-app",
    ],
    # --- Everything else ---
    "Other": [
        "karpathy/nn-zero-to-hero",
    ],
}

# Number of repositories across all categories
total_repos = sum(map(len, repositories.values()))
print(f"Total repositories to analyze: {total_repos}")

In [None]:
# GitHub API functions
def get_repo_stats(owner, repo, token=None, max_retries=3, timeout=30):
    """Fetch repository statistics from the GitHub REST API.

    Args:
        owner: Account or organization that owns the repository.
        repo: Repository name.
        token: Optional GitHub token; sent in the ``Authorization`` header
            when provided.
        max_retries: How many times to wait and retry after a rate-limit
            response that asks the client to back off.
        timeout: Seconds to wait for the repository request before giving up.

    Returns:
        A dict with the keys ``repo_path``, ``stars``, ``forks``,
        ``contributors``, ``open_issues``, ``open_prs``, ``created_at`` and
        ``pushed_at`` (dates as ``YYYY-MM-DD``), or ``None`` when the
        repository could not be fetched. ``contributors`` and ``open_prs``
        come from the helper functions and are negative when unavailable.
    """
    headers = {}
    if token:
        headers['Authorization'] = f'token {token}'

    repo_url = f'https://api.github.com/repos/{owner}/{repo}'

    try:
        retries = 0
        while True:
            # Without a timeout a stalled connection would hang the whole run.
            response = requests.get(repo_url, headers=headers, timeout=timeout)
            status = response.status_code

            if status == 404:
                print(f"⚠️  Repository {owner}/{repo} not found (404)")
                return None

            if status not in (403, 429):
                break

            # GitHub answers rate-limit violations with either 403 or 429, so
            # the headers (not the status code) tell the cases apart.
            retry_after = response.headers.get('Retry-After')
            remaining = response.headers.get('X-RateLimit-Remaining')

            if retry_after is None and remaining == '0':
                # Quota exhausted: retrying before the reset time cannot
                # succeed, so report when it resets instead of sleeping.
                reset = response.headers.get('X-RateLimit-Reset', '')
                if reset.isdigit():
                    reset_text = time.strftime('%H:%M:%S', time.localtime(int(reset)))
                else:
                    reset_text = 'unknown'
                print(f"⚠️  Rate limit exhausted while fetching {owner}/{repo} "
                      f"({status} - resets at {reset_text})")
                return None

            if status == 403 and retry_after is None:
                print(f"⚠️  Access denied to {owner}/{repo} (403 - private repo?)")
                return None

            # Temporary throttling: back off, but only a bounded number of
            # times so a persistent limit cannot loop (or recurse) forever.
            if retries >= max_retries:
                print(f"⚠️  Rate limit still exceeded for {owner}/{repo} "
                      f"after {max_retries} retries")
                return None
            retries += 1
            wait_seconds = int(retry_after) if retry_after and retry_after.isdigit() else 60
            print(f"⚠️  Rate limit exceeded. Waiting {wait_seconds} seconds...")
            time.sleep(wait_seconds)

        response.raise_for_status()
        repo_data = response.json()

        # Get contributors count
        contributors_count = get_contributors_count(owner, repo, headers)

        # Get open pull requests count
        open_prs_count = get_open_prs_count(owner, repo, headers)

        created_on = repo_data['created_at'][:10]
        pushed_at = repo_data['pushed_at']

        return {
            'repo_path': f'{owner}/{repo}',
            'stars': repo_data['stargazers_count'],
            'forks': repo_data['forks_count'],
            'contributors': contributors_count,
            'open_issues': repo_data['open_issues_count'],
            'open_prs': open_prs_count,
            'created_at': created_on,
            # Fall back to the creation date when there is no push timestamp
            'pushed_at': pushed_at[:10] if pushed_at else created_on
        }

    except Exception as e:
        # Deliberately broad: one failing repository must not abort the batch.
        print(f"❌ Error fetching {owner}/{repo}: {str(e)}")
        return None

def get_contributors_count(owner, repo, headers, timeout=30):
    """Get the number of contributors for a repository.

    Requests the first page of the contributors listing (100 per page,
    anonymous contributors included). When the response advertises further
    pages, only the last page is fetched and the total is computed as
    ``(last_page - 1) * 100 + len(last_page)``.

    Args:
        owner: Account or organization that owns the repository.
        repo: Repository name.
        headers: HTTP headers (e.g. authorization) sent with each request.
        timeout: Seconds to wait for each request before giving up.

    Returns:
        The contributor count, ``0`` when GitHub answers 204 (which its docs
        describe as an empty repository), or ``-1`` when the count cannot be
        determined.
    """
    import re  # function-scope import, as in the original block

    page_size = 100
    contributors_url = (
        f'https://api.github.com/repos/{owner}/{repo}/contributors'
        f'?per_page={page_size}&anon=true'
    )

    try:
        response = requests.get(contributors_url, headers=headers, timeout=timeout)

        if response.status_code == 204:
            return 0
        if response.status_code != 200:
            # Includes 403 (e.g. the contributor list is unavailable for this
            # repository) and any other error status.
            return -1

        first_page_count = len(response.json())

        # requests parses the Link header into a dict keyed by "rel".
        links = response.links
        last_url = links.get('last', {}).get('url')
        if not last_url:
            # A "next" link without a "last" link means more pages exist but
            # their number is unknown; reporting one page would undercount.
            return -1 if 'next' in links else first_page_count

        # Match "page" wherever it sits in the query string ("per_page" does
        # not match because it is not preceded by "?" or "&").
        page_match = re.search(r'[?&]page=(\d+)', last_url)
        if not page_match:
            return -1
        last_page = int(page_match.group(1))

        # Follow the URL GitHub supplied rather than rebuilding it.
        last_response = requests.get(last_url, headers=headers, timeout=timeout)
        if last_response.status_code != 200:
            # Without the last page the total is unknown; the first-page size
            # alone would be a silent undercount.
            return -1

        # Every page before the last one is full.
        return (last_page - 1) * page_size + len(last_response.json())
    except Exception as e:
        print(f"Error counting contributors for {owner}/{repo}: {str(e)}")
        return -1

def get_open_prs_count(owner, repo, headers, timeout=30):
    """Get the number of open pull requests.

    Tries the search API first, because its response carries ``total_count``
    directly. If that request does not return 200, falls back to paging the
    pulls endpoint: the first page (100 per page) plus, when more pages are
    advertised, the last page, giving
    ``(last_page - 1) * 100 + len(last_page)``.

    Args:
        owner: Account or organization that owns the repository.
        repo: Repository name.
        headers: HTTP headers (e.g. authorization) sent with each request.
        timeout: Seconds to wait for each request before giving up.

    Returns:
        The number of open pull requests, or ``-1`` when it cannot be
        determined.
    """
    import re  # function-scope import, as in the original block

    page_size = 100

    try:
        # Only total_count is read, so ask for a single item to keep the
        # response small.
        search_url = (
            f'https://api.github.com/search/issues'
            f'?q=is:pr+is:open+repo:{owner}/{repo}&per_page=1'
        )
        response = requests.get(search_url, headers=headers, timeout=timeout)

        if response.status_code == 200:
            return response.json().get('total_count', 0)

        # Fallback to pulls endpoint if search fails
        prs_url = (
            f'https://api.github.com/repos/{owner}/{repo}/pulls'
            f'?state=open&per_page={page_size}'
        )
        response = requests.get(prs_url, headers=headers, timeout=timeout)

        if response.status_code != 200:
            return -1

        first_page_count = len(response.json())

        # requests parses the Link header into a dict keyed by "rel".
        links = response.links
        last_url = links.get('last', {}).get('url')
        if not last_url:
            # A "next" link without a "last" link means more pages exist but
            # their number is unknown; reporting one page would undercount.
            return -1 if 'next' in links else first_page_count

        # Match "page" wherever it sits in the query string ("per_page" does
        # not match because it is not preceded by "?" or "&").
        page_match = re.search(r'[?&]page=(\d+)', last_url)
        if not page_match:
            return -1
        last_page = int(page_match.group(1))

        # Follow the URL GitHub supplied rather than rebuilding it.
        last_response = requests.get(last_url, headers=headers, timeout=timeout)
        if last_response.status_code != 200:
            # Without the last page the total is unknown; the first-page size
            # alone would be a silent undercount.
            return -1

        # Every page before the last one is full.
        return (last_page - 1) * page_size + len(last_response.json())
    except Exception as e:
        print(f"Error counting PRs for {owner}/{repo}: {str(e)}")
        return -1

def fetch_all_repositories(repositories_dict, token=None):
    """Collect statistics for every repository in *repositories_dict*.

    Args:
        repositories_dict: Mapping of category name to a list of
            ``"owner/repo"`` paths.
        token: Optional GitHub token passed through to ``get_repo_stats``.

    Returns:
        A list of the stats dicts returned by ``get_repo_stats``, each with
        an added ``'field'`` key holding its category. Repositories whose
        fetch failed are left out.
    """
    collected = []
    repo_total = sum(map(len, repositories_dict.values()))
    position = 0  # running index across all categories, for progress output

    for field, repo_list in repositories_dict.items():
        print(f"\n🔍 Processing {field} repositories...")

        for repo_path in repo_list:
            position += 1
            owner, repo = repo_path.split('/')

            # No newline: the result is appended to the same line below
            print(f"[{position}/{repo_total}] Fetching {repo_path}... ", end="")

            stats = get_repo_stats(owner, repo, token)
            if not stats:
                print("❌ Failed")
            else:
                stats['field'] = field
                collected.append(stats)

                # Negative counts mean the helper could not determine a value
                contributors = stats['contributors']
                open_prs = stats['open_prs']
                contributors_text = "N/A" if contributors < 0 else str(contributors)
                prs_text = "N/A" if open_prs < 0 else str(open_prs)
                print(f"✅ {stats['stars']} stars, {contributors_text} contributors, {prs_text} PRs")

            # Short pause between repositories to go easy on the API
            time.sleep(0.1)

    return collected


print("✅ GitHub API functions defined successfully!")

In [None]:
# Run the collection for every configured repository
github_token = os.getenv('GITHUB_TOKEN')

print("🚀 Starting data collection...")
if github_token:
    print("Authentication: Enabled")
else:
    print("Authentication: Disabled (60 requests/hour limit)")
print("=" * 50)

# Wall-clock timing around the whole fetch
start_time = time.time()
results = fetch_all_repositories(repositories, token=github_token)
end_time = time.time()

print()
print("=" * 50)
print("✅ Data collection completed!")
print("📊 Successfully collected data for {} repositories".format(len(results)))
print("⏱️  Total time: {:.2f} seconds".format(end_time - start_time))

In [None]:
# Write the collected stats to CSV plus a small JSON metadata file
if not results:
    print("❌ No data collected. Please check your internet connection and GitHub token.")
else:
    # Final column layout of the CSV
    column_order = [
        'Name',
        'Field',
        'Stars',
        'Forks',
        'Contributors',
        'Open Issues',
        'Open Pull Requests',
        'Date Created',
        'Last Active',
    ]

    # Map the internal keys to display names, then apply the layout above
    df = pd.DataFrame(results).rename(columns={
        'repo_path': 'Name',
        'field': 'Field',
        'stars': 'Stars',
        'forks': 'Forks',
        'contributors': 'Contributors',
        'open_issues': 'Open Issues',
        'open_prs': 'Open Pull Requests',
        'created_at': 'Date Created',
        'pushed_at': 'Last Active',
    })
    df = df[column_order]

    # Fixed filename: each run overwrites the previous CSV
    csv_filename = 'github_repository_stats.csv'
    df.to_csv(csv_filename, index=False)

    # Summary of this run; the timestamp lives here, not in the filename
    metadata = {
        'last_updated': datetime.now().isoformat(),
        'total_repositories': len(df),
        # Cast to a plain int so the value is JSON-serializable
        'total_stars': int(df['Stars'].sum()),
        'collection_time_seconds': round(end_time - start_time, 2),
    }

    with open('github_stats_metadata.json', 'w') as f:
        f.write(json.dumps(metadata, indent=2))

    print(f"\n💾 Data saved to {csv_filename}")
    print("📋 Metadata saved to github_stats_metadata.json")
    print("\n✨ Now open github_stats_viewer.ipynb to see the results!")