# GitHub Repository Statistics Analysis

This notebook extracts GitHub repository statistics and displays them in grouped tables.

## Authentication Setup

To avoid rate limits, set up a GitHub Personal Access Token:
1. Go to GitHub Settings > Developer settings > Personal access tokens
2. Generate a new token with 'public_repo' scope
3. Set environment variable: `export GITHUB_TOKEN=your_token_here`

Without authentication: 60 requests/hour  
With authentication: 5000 requests/hour

In [1]:
# Cell 1: Setup and Imports
import requests
import pandas as pd
from datetime import datetime
import time
import os
from IPython.display import HTML, display
import json

print("Libraries imported successfully!")
print(f"GitHub token available: {'Yes' if os.environ.get('GITHUB_TOKEN') else 'No (using unauthenticated requests)'}")

Libraries imported successfully!
GitHub token available: Yes


In [2]:
# Cell 2: Repository List with Categories
repositories = {
    "AI/ML": [
        "huggingface/trl",
        "Significant-Gravitas/AutoGPT",
        "open-webui/open-webui",
        "comfyanonymous/ComfyUI",
        "langchain-ai/langchain",
        "huggingface/transformers",
        "ollama/ollama",
        "vllm-project/vllm",
        "n8n-io/n8n",
        "iterative/dvc",
        "langgenius/dify",
        "HumanSignal/label-studio",
        "microsoft/ML-For-Beginners",
        "cleanlab/cleanlab",
        "voxel51/fiftyone",
        "fastai/fastbook",
        "pytorch/pytorch",
        "tensorflow/tensorflow",
        "scikit-learn/scikit-learn",
        "pandas-dev/pandas"
    ],
    "TypeScript": [
        "nestjs/nest",
        "prisma/prisma",
        "adonisjs/core",
        "typeorm/typeorm",
        "sequelize/sequelize",
        "trpc/trpc",
        "fastify/fastify",
        "taskforcesh/bullmq",
        "mikro-orm/mikro-orm",
        "drizzle-team/drizzle-orm",
        "kysely-org/kysely",
        "colinhacks/zod",
        "winstonjs/winston",
        "pinojs/pino",
        "helmetjs/helmet",
        "moleculerjs/moleculer",
        "typestack/routing-controllers",
        "lukeautry/tsoa",
        "tsedio/tsed",
        "lobehub/lobe-chat",
        "vercel/ai"
    ],
    "C# ASP.NET": [
        "dotnet/aspnetcore",
        "aspnetrun/run-aspnetcore-microservices",
        "DapperLib/Dapper",
        "jasontaylordev/CleanArchitecture",
        "dotnet/efcore",
        "App-vNext/Polly",
        "dotnet/orleans",
        "HangfireIO/Hangfire",
        "nopSolutions/nopCommerce",
        "ThreeMammals/Ocelot",
        "OrchardCMS/OrchardCore",
        "dotnetcore/CAP",
        "gothinkster/aspnetcore-realworld-example-app"
    ],
    "Other": [
        "karpathy/nn-zero-to-hero"
    ]
}

total_repos = sum(len(repos) for repos in repositories.values())
print(f"Total repositories to analyze: {total_repos}")
for field, repos in repositories.items():
    print(f"{field}: {len(repos)} repositories")

Total repositories to analyze: 55
AI/ML: 20 repositories
TypeScript: 21 repositories
C# ASP.NET: 13 repositories
Other: 1 repositories


In [3]:
# Cell 3: GitHub API Functions

def get_repo_stats(owner, repo, token=None):
    """
    Fetch repository statistics from GitHub API
    """
    headers = {}
    if token:
        headers['Authorization'] = f'token {token}'
    
    # Main repository info
    repo_url = f'https://api.github.com/repos/{owner}/{repo}'
    
    try:
        response = requests.get(repo_url, headers=headers)
        
        if response.status_code == 404:
            print(f"⚠️  Repository {owner}/{repo} not found (404)")
            return None
        elif response.status_code == 403:
            print(f"⚠️  Access denied to {owner}/{repo} (403 - private repo?)")
            return None
        elif response.status_code == 429:
            print(f"⚠️  Rate limit exceeded. Waiting 60 seconds...")
            time.sleep(60)
            return get_repo_stats(owner, repo, token)  # Retry
        
        response.raise_for_status()
        repo_data = response.json()
        
        # Get contributors count
        contributors_count = get_contributors_count(owner, repo, headers)
        
        # Get open pull requests count
        open_prs_count = get_open_prs_count(owner, repo, headers)
        
        return {
            'repo_path': f'{owner}/{repo}',
            'stars': repo_data['stargazers_count'],
            'forks': repo_data['forks_count'],
            'contributors': contributors_count,
            'open_issues': repo_data['open_issues_count'],
            'open_prs': open_prs_count,
            'created_at': repo_data['created_at'][:10],  # YYYY-MM-DD
            'pushed_at': repo_data['pushed_at'][:10] if repo_data['pushed_at'] else repo_data['created_at'][:10]
        }
        
    except Exception as e:
        print(f"❌ Error fetching {owner}/{repo}: {str(e)}")
        return None

def get_contributors_count(owner, repo, headers):
    """
    Get the number of contributors for a repository
    """
    try:
        contributors_url = f'https://api.github.com/repos/{owner}/{repo}/contributors?per_page=1'
        response = requests.get(contributors_url, headers=headers)
        
        if response.status_code == 200:
            # Check if there's a 'Link' header for pagination
            if 'Link' in response.headers:
                link_header = response.headers['Link']
                # Parse the last page number from the Link header
                if 'rel="last"' in link_header:
                    last_page = link_header.split('page=')[1].split('&')[0].split('>')[0]
                    return int(last_page)
            
            # If no pagination, count the contributors directly
            contributors_response = requests.get(f'https://api.github.com/repos/{owner}/{repo}/contributors', headers=headers)
            if contributors_response.status_code == 200:
                return len(contributors_response.json())
        
        return 0
    except:
        return 0

def get_open_prs_count(owner, repo, headers):
    """
    Get the number of open pull requests
    """
    try:
        prs_url = f'https://api.github.com/repos/{owner}/{repo}/pulls?state=open&per_page=1'
        response = requests.get(prs_url, headers=headers)
        
        if response.status_code == 200:
            # Check if there's a 'Link' header for pagination
            if 'Link' in response.headers:
                link_header = response.headers['Link']
                if 'rel="last"' in link_header:
                    last_page = link_header.split('page=')[1].split('&')[0].split('>')[0]
                    return int(last_page)
            
            # If no pagination, count directly
            prs_response = requests.get(f'https://api.github.com/repos/{owner}/{repo}/pulls?state=open', headers=headers)
            if prs_response.status_code == 200:
                return len(prs_response.json())
        
        return 0
    except:
        return 0

def fetch_all_repositories(repositories_dict, token=None):
    """
    Fetch statistics for all repositories
    """
    results = []
    total_repos = sum(len(repos) for repos in repositories_dict.values())
    current_repo = 0
    
    for field, repo_list in repositories_dict.items():
        print(f"\n🔍 Processing {field} repositories...")
        
        for repo_path in repo_list:
            current_repo += 1
            owner, repo = repo_path.split('/')
            
            print(f"[{current_repo}/{total_repos}] Fetching {repo_path}... ", end="")
            
            stats = get_repo_stats(owner, repo, token)
            if stats:
                stats['field'] = field
                results.append(stats)
                print(f"✅ {stats['stars']} stars")
            else:
                print("❌ Failed")
            
            # Rate limiting - be respectful to GitHub API
            time.sleep(0.1)
    
    return results

print("✅ GitHub API functions defined successfully!")

✅ GitHub API functions defined successfully!


In [4]:
# Cell 4: Data Collection

# Get GitHub token from environment
github_token = os.environ.get('GITHUB_TOKEN')

print("🚀 Starting data collection...")
print(f"Authentication: {'Enabled' if github_token else 'Disabled (60 requests/hour limit)'}")
print("="*50)

# Collect all repository data
start_time = time.time()
results = fetch_all_repositories(repositories, github_token)
end_time = time.time()

print("\n" + "="*50)
print(f"✅ Data collection completed!")
print(f"📊 Successfully collected data for {len(results)} repositories")
print(f"⏱️  Total time: {end_time - start_time:.2f} seconds")

if len(results) == 0:
    print("❌ No data collected. Please check your internet connection and GitHub token.")

🚀 Starting data collection...
Authentication: Enabled

🔍 Processing AI/ML repositories...
[1/55] Fetching huggingface/trl... 

✅ 14768 stars
[2/55] Fetching Significant-Gravitas/AutoGPT... 

✅ 177287 stars
[3/55] Fetching open-webui/open-webui... 

✅ 104200 stars
[4/55] Fetching comfyanonymous/ComfyUI... 

✅ 83714 stars
[5/55] Fetching langchain-ai/langchain... 

✅ 112412 stars
[6/55] Fetching huggingface/transformers... 

✅ 147573 stars
[7/55] Fetching ollama/ollama... 

✅ 147790 stars
[8/55] Fetching vllm-project/vllm... 

✅ 53392 stars
[9/55] Fetching n8n-io/n8n... 

✅ 124448 stars
[10/55] Fetching iterative/dvc... 

✅ 14705 stars
[11/55] Fetching langgenius/dify... 

✅ 108749 stars
[12/55] Fetching HumanSignal/label-studio... 

✅ 23920 stars
[13/55] Fetching microsoft/ML-For-Beginners... 

✅ 75759 stars
[14/55] Fetching cleanlab/cleanlab... 

✅ 10742 stars
[15/55] Fetching voxel51/fiftyone... 

✅ 9745 stars
[16/55] Fetching fastai/fastbook... 

✅ 23466 stars
[17/55] Fetching pytorch/pytorch... 

✅ 91835 stars
[18/55] Fetching tensorflow/tensorflow... 

✅ 190933 stars
[19/55] Fetching scikit-learn/scikit-learn... 

✅ 62799 stars
[20/55] Fetching pandas-dev/pandas... 

✅ 46115 stars

🔍 Processing TypeScript repositories...
[21/55] Fetching nestjs/nest... 

✅ 71980 stars
[22/55] Fetching prisma/prisma... 

✅ 42966 stars
[23/55] Fetching adonisjs/core... 

✅ 18068 stars
[24/55] Fetching typeorm/typeorm... 

✅ 35593 stars
[25/55] Fetching sequelize/sequelize... 

✅ 30098 stars
[26/55] Fetching trpc/trpc... 

✅ 38001 stars
[27/55] Fetching fastify/fastify... 

✅ 34231 stars
[28/55] Fetching taskforcesh/bullmq... 

✅ 7288 stars
[29/55] Fetching mikro-orm/mikro-orm... 

✅ 8405 stars
[30/55] Fetching drizzle-team/drizzle-orm... 

✅ 29458 stars
[31/55] Fetching kysely-org/kysely... 

✅ 12395 stars
[32/55] Fetching colinhacks/zod... 

✅ 39247 stars
[33/55] Fetching winstonjs/winston... 

✅ 23836 stars
[34/55] Fetching pinojs/pino... 

✅ 15840 stars
[35/55] Fetching helmetjs/helmet... 

✅ 10465 stars
[36/55] Fetching moleculerjs/moleculer... 

✅ 6301 stars
[37/55] Fetching typestack/routing-controllers... 

✅ 4479 stars
[38/55] Fetching lukeautry/tsoa... 

✅ 3841 stars
[39/55] Fetching tsedio/tsed... 

✅ 2997 stars
[40/55] Fetching lobehub/lobe-chat... 

✅ 63875 stars
[41/55] Fetching vercel/ai... 

✅ 16080 stars

🔍 Processing C# ASP.NET repositories...
[42/55] Fetching dotnet/aspnetcore... 

✅ 36917 stars
[43/55] Fetching aspnetrun/run-aspnetcore-microservices... 

✅ 3052 stars
[44/55] Fetching DapperLib/Dapper... 

✅ 17996 stars
[45/55] Fetching jasontaylordev/CleanArchitecture... 

✅ 18569 stars
[46/55] Fetching dotnet/efcore... 

✅ 14229 stars
[47/55] Fetching App-vNext/Polly... 

✅ 13877 stars
[48/55] Fetching dotnet/orleans... 

✅ 10441 stars
[49/55] Fetching HangfireIO/Hangfire... 

✅ 9792 stars
[50/55] Fetching nopSolutions/nopCommerce... 

✅ 9720 stars
[51/55] Fetching ThreeMammals/Ocelot... 

✅ 8587 stars
[52/55] Fetching OrchardCMS/OrchardCore... 

✅ 7774 stars
[53/55] Fetching dotnetcore/CAP... 

✅ 6930 stars
[54/55] Fetching gothinkster/aspnetcore-realworld-example-app... 

✅ 2022 stars

🔍 Processing Other repositories...
[55/55] Fetching karpathy/nn-zero-to-hero... 

✅ 15105 stars

✅ Data collection completed!
📊 Successfully collected data for 55 repositories
⏱️  Total time: 118.97 seconds


In [5]:
# Cell 5: DataFrame Creation and Formatting

if len(results) > 0:
    # Create DataFrame
    df = pd.DataFrame(results)
    
    # Rename columns to match requirements
    df = df.rename(columns={
        'field': 'Field',
        'stars': 'Stars',
        'forks': 'Forks',
        'contributors': 'Contributors',
        'open_issues': 'Open Issues',
        'open_prs': 'Open Pull Requests',
        'created_at': 'Date Created',
        'pushed_at': 'Last Active'
    })
    
    # Create clickable GitHub links
    df['Name'] = df.apply(lambda row: f'<a href="https://github.com/{row["repo_path"]}" target="_blank">{row["repo_path"]}</a>', axis=1)
    
    # Drop the repo_path column
    df = df.drop('repo_path', axis=1)
    
    # Reorder columns
    column_order = ['Name', 'Field', 'Stars', 'Forks', 'Contributors', 'Open Issues', 'Open Pull Requests', 'Date Created', 'Last Active']
    df = df[column_order]
    
    print(f"📊 DataFrame created with {len(df)} repositories")
    print(f"📋 Columns: {list(df.columns)}")
    
    # Display basic info
    print("\n📈 Quick Stats:")
    print(f"Total Stars: {df['Stars'].sum():,}")
    print(f"Total Forks: {df['Forks'].sum():,}")
    print(f"Average Stars: {df['Stars'].mean():.0f}")
    print(f"Average Contributors: {df['Contributors'].mean():.0f}")
    
else:
    print("❌ Cannot create DataFrame - no data available")

📊 DataFrame created with 55 repositories
📋 Columns: ['Name', 'Field', 'Stars', 'Forks', 'Contributors', 'Open Issues', 'Open Pull Requests', 'Date Created', 'Last Active']

📈 Quick Stats:
Total Stars: 2,314,807
Total Forks: 462,626
Average Stars: 42087
Average Contributors: 1


In [6]:
# Cell 6: Display Grouped Tables

if 'df' in locals() and len(df) > 0:
    print("📊 GitHub Repository Statistics by Field\n")
    
    # Define field order for consistent display
    field_order = ['AI/ML', 'TypeScript', 'C# ASP.NET', 'Other']
    
    for field in field_order:
        if field in df['Field'].values:
            field_df = df[df['Field'] == field].sort_values('Stars', ascending=False)
            
            print(f"\n## {field} Repositories ({len(field_df)} total)\n")
            
            # Display stats for this field
            field_stats = f"""**Field Summary:**
- Total Stars: {field_df['Stars'].sum():,}
- Average Stars: {field_df['Stars'].mean():.0f}
- Total Contributors: {field_df['Contributors'].sum():,}
- Most Starred: {field_df.iloc[0]['Name'].split('>')[-2].split('<')[0] if len(field_df) > 0 else 'N/A'}
"""
            display(HTML(f"<div style='margin-bottom: 10px;'>{field_stats}</div>"))
            
            # Create a copy for display (remove Field column since it's redundant in grouped view)
            display_df = field_df.drop('Field', axis=1).reset_index(drop=True)
            
            # Style the table
            styled_html = display_df.to_html(escape=False, index=False, classes='table table-striped', table_id=f'table-{field.lower().replace(" ", "-").replace("#", "sharp").replace("/", "-")}')
            
            # Add some custom styling
            styled_html = f"""
            <style>
                .table {{
                    border-collapse: collapse;
                    margin: 10px 0;
                    font-size: 0.9em;
                    width: 100%;
                }}
                .table th {{
                    background-color: #f8f9fa;
                    padding: 8px;
                    text-align: left;
                    border-bottom: 2px solid #dee2e6;
                }}
                .table td {{
                    padding: 8px;
                    border-bottom: 1px solid #dee2e6;
                }}
                .table a {{
                    color: #0066cc;
                    text-decoration: none;
                }}
                .table a:hover {{
                    text-decoration: underline;
                }}
            </style>
            {styled_html}
            """
            
            display(HTML(styled_html))
            print("\n" + "-"*80)
            
else:
    print("❌ No data available to display")

📊 GitHub Repository Statistics by Field


## AI/ML Repositories (20 total)



Name,Stars,Forks,Contributors,Open Issues,Open Pull Requests,Date Created,Last Active
tensorflow/tensorflow,190933,74769,1,1510,1,2015-11-07,2025-07-28
Significant-Gravitas/AutoGPT,177287,45904,1,212,1,2023-03-16,2025-07-28
ollama/ollama,147790,12552,1,1957,1,2023-06-26,2025-07-27
huggingface/transformers,147573,29808,1,1897,1,2018-10-29,2025-07-28
n8n-io/n8n,124448,37748,1,983,1,2019-06-22,2025-07-28
langchain-ai/langchain,112412,18351,1,277,1,2022-10-17,2025-07-28
langgenius/dify,108749,16549,1,761,1,2023-04-12,2025-07-28
open-webui/open-webui,104200,14012,1,286,1,2023-10-06,2025-07-28
pytorch/pytorch,91835,24787,1,16649,1,2016-08-13,2025-07-28
comfyanonymous/ComfyUI,83714,9284,1,2603,1,2023-01-17,2025-07-28



--------------------------------------------------------------------------------

## TypeScript Repositories (21 total)



Name,Stars,Forks,Contributors,Open Issues,Open Pull Requests,Date Created,Last Active
nestjs/nest,71980,7959,1,64,1,2017-02-04,2025-07-28
lobehub/lobe-chat,63875,13276,1,946,1,2023-05-21,2025-07-28
prisma/prisma,42966,1805,1,2334,1,2019-06-20,2025-07-28
colinhacks/zod,39247,1549,1,318,1,2020-03-07,2025-07-25
trpc/trpc,38001,1395,1,185,1,2020-07-18,2025-07-22
typeorm/typeorm,35593,6423,1,2468,1,2016-02-29,2025-07-28
fastify/fastify,34231,2461,1,117,1,2016-09-28,2025-07-28
sequelize/sequelize,30098,4297,1,985,1,2010-07-22,2025-07-27
drizzle-team/drizzle-orm,29458,947,1,1657,1,2021-06-24,2025-07-28
winstonjs/winston,23836,1833,1,514,1,2010-12-29,2025-07-14



--------------------------------------------------------------------------------

## C# ASP.NET Repositories (13 total)



Name,Stars,Forks,Contributors,Open Issues,Open Pull Requests,Date Created,Last Active
dotnet/aspnetcore,36917,10415,1,3803,1,2014-03-11,2025-07-28
jasontaylordev/CleanArchitecture,18569,3923,1,23,1,2019-10-07,2025-07-25
DapperLib/Dapper,17996,3681,1,527,1,2011-04-14,2025-06-01
dotnet/efcore,14229,3284,1,2435,1,2014-01-23,2025-07-28
App-vNext/Polly,13877,1260,1,12,1,2013-05-05,2025-07-24
dotnet/orleans,10441,2092,1,636,1,2014-12-19,2025-07-24
HangfireIO/Hangfire,9792,1731,1,939,1,2013-08-06,2025-07-25
nopSolutions/nopCommerce,9720,5616,1,163,1,2014-04-09,2025-07-28
ThreeMammals/Ocelot,8587,1666,1,70,1,2016-06-29,2025-07-26
OrchardCMS/OrchardCore,7774,2481,1,940,1,2014-11-19,2025-07-28



--------------------------------------------------------------------------------

## Other Repositories (1 total)



Name,Stars,Forks,Contributors,Open Issues,Open Pull Requests,Date Created,Last Active
karpathy/nn-zero-to-hero,15105,2120,1,43,1,2022-09-08,2024-08-18



--------------------------------------------------------------------------------


In [7]:
# Cell 7: Export and Summary

if 'df' in locals() and len(df) > 0:
    # Create export version (remove HTML tags from Name column)
    df_export = df.copy()
    df_export['Name'] = df_export['Name'].str.replace('<[^<]+?>', '', regex=True)
    
    # Export to CSV
    csv_filename = 'github_repository_stats.csv'
    df_export.to_csv(csv_filename, index=False)
    print(f"💾 Results exported to {csv_filename}")
    
    # Display final summary
    print("\n" + "="*60)
    print("📈 FINAL SUMMARY")
    print("="*60)
    
    print(f"\n📊 Total repositories analyzed: {len(df)}")
    
    print("\n📋 Repositories by field:")
    for field in df['Field'].unique():
        count = len(df[df['Field'] == field])
        total_stars = df[df['Field'] == field]['Stars'].sum()
        print(f"  • {field}: {count} repositories ({total_stars:,} total stars)")
    
    print("\n🏆 Top 5 most starred repositories:")
    top_5 = df.nlargest(5, 'Stars')[['Name', 'Field', 'Stars']].copy()
    top_5['Name'] = top_5['Name'].str.replace('<[^<]+?>', '', regex=True)
    for idx, row in top_5.iterrows():
        print(f"  {row['Stars']:,} ⭐ {row['Name']} ({row['Field']})")
    
    print("\n📊 Overall statistics:")
    print(f"  • Total stars across all repos: {df['Stars'].sum():,}")
    print(f"  • Total forks across all repos: {df['Forks'].sum():,}")
    print(f"  • Total contributors: {df['Contributors'].sum():,}")
    print(f"  • Average stars per repo: {df['Stars'].mean():.0f}")
    print(f"  • Average contributors per repo: {df['Contributors'].mean():.0f}")
    
    print("\n✅ Analysis complete!")
    
else:
    print("❌ No data available for export and summary")

💾 Results exported to github_repository_stats.csv

📈 FINAL SUMMARY

📊 Total repositories analyzed: 55

📋 Repositories by field:
  • AI/ML: 20 repositories (1,624,352 total stars)
  • TypeScript: 21 repositories (515,444 total stars)
  • C# ASP.NET: 13 repositories (159,906 total stars)
  • Other: 1 repositories (15,105 total stars)

🏆 Top 5 most starred repositories:
  190,933 ⭐ tensorflow/tensorflow (AI/ML)
  177,287 ⭐ Significant-Gravitas/AutoGPT (AI/ML)
  147,790 ⭐ ollama/ollama (AI/ML)
  147,573 ⭐ huggingface/transformers (AI/ML)
  124,448 ⭐ n8n-io/n8n (AI/ML)

📊 Overall statistics:
  • Total stars across all repos: 2,314,807
  • Total forks across all repos: 462,626
  • Total contributors: 55
  • Average stars per repo: 42087
  • Average contributors per repo: 1

✅ Analysis complete!
