# Data Collection by Bucket

In [1]:
import requests
import json
import time
import os
from datetime import datetime
from tqdm import tqdm

In [2]:
# --- Credentials --------------------------------------------------------------
# Optional Lichess API token taken from the environment; empty string when unset.
LICHESS_TOKEN = os.getenv('LICHESS_TOKEN', '')

# --- Input / output locations ---------------------------------------------------
PLAYER_LIST_FILE = "player_list_by_rating_v2.json"  # JSON mapping bucket -> players
OUTPUT_DIR = "bucket_data"                          # per-bucket game files go here

# --- What to download -----------------------------------------------------------
GAME_TYPE = "blitz"       # sent as the `perfType` query parameter
GAMES_PER_PLAYER = 200    # sent as the `max` query parameter
MIN_GAMES = 10            # players with fewer fetched games are not kept

# Pause between consecutive requests: shorter when a token is configured.
if LICHESS_TOKEN:
    SLEEP_TIME = 0.5
else:
    SLEEP_TIME = 1.5

# Create the output directory up front; a pre-existing directory is fine.
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [3]:
# Per-bucket switch: flip a bucket to True to download it in this run.
# Insertion order is preserved, so the resulting list keeps this ordering.
_BUCKET_ENABLED = {
    "800-1000": False,
    "1000-1200": False,
    "1200-1400": False,
    "1400-1600": False,
    "1600-1800": False,
    "1800-2000": False,
    "2000-2200": False,
    "2200-2400": True,
    "2400+": True,
}
BUCKETS_TO_FETCH = [name for name, enabled in _BUCKET_ENABLED.items() if enabled]

print("Bucket count:", len(BUCKETS_TO_FETCH))
print("Buckets:", BUCKETS_TO_FETCH)

Bucket count: 2
Buckets: ['2200-2400', '2400+']


In [4]:
# Load the bucket -> players mapping from PLAYER_LIST_FILE.
with open(PLAYER_LIST_FILE, 'r') as player_file:
    player_list_by_rating = json.load(player_file)

print("Player list loaded:")
print("\nBucket          | Players | Status")
print("-" * 45)

# One table row per bucket. An existing output file takes precedence over the
# BUCKETS_TO_FETCH selection when deciding the status shown.
for bucket, players in player_list_by_rating.items():
    file_name = "bucket_" + bucket.replace('+', '_plus') + "_games.json"
    output_file = os.path.join(OUTPUT_DIR, file_name)

    if os.path.exists(output_file):
        status = "✓ Exists"
    elif bucket in BUCKETS_TO_FETCH:
        status = "○ To fetch"
    else:
        status = "- Skip"

    print(f"{bucket:15} | {len(players):7} | {status}")

Player list loaded:

Bucket          | Players | Status
---------------------------------------------
800-1000        |     350 | ✓ Exists
1000-1200       |     350 | ✓ Exists
1200-1400       |     350 | ✓ Exists
1400-1600       |     350 | ✓ Exists
1600-1800       |     350 | ✓ Exists
1800-2000       |     350 | ✓ Exists
2000-2200       |     350 | ✓ Exists
2200-2400       |     350 | ○ To fetch
2400+           |     350 | ○ To fetch


In [5]:
def get_headers():
    """Build the HTTP headers used for Lichess API requests.

    The ``Accept`` header always asks for NDJSON. A Bearer ``Authorization``
    header is added only when ``LICHESS_TOKEN`` is non-empty.
    """
    auth = {"Authorization": f"Bearer {LICHESS_TOKEN}"} if LICHESS_TOKEN else {}
    return {"Accept": "application/x-ndjson", **auth}


def fetch_user_games(username, max_games=200, perf_type="blitz"):
    """Download a user's rated games from the Lichess games-export endpoint.

    Args:
        username: Lichess username; inserted into the request URL.
        max_games: Value sent as the ``max`` query parameter.
        perf_type: Value sent as the ``perfType`` query parameter.

    Returns:
        A ``(games, error)`` tuple. On success ``games`` is a list holding one
        parsed JSON object per non-empty NDJSON line of the response (possibly
        empty) and ``error`` is ``None``. On failure ``games`` is ``None`` and
        ``error`` is a non-empty string: ``"User not found"`` (HTTP 404),
        ``"Rate limited"`` (HTTP 429), ``"Timeout"``, or a description of any
        other exception.
    """
    url = f"https://lichess.org/api/games/user/{username}"
    params = {
        "max": max_games,
        "rated": "true",
        "perfType": perf_type,
        "clocks": "true",
        "opening": "true"
    }

    try:
        # With stream=True the connection is only released once the body has
        # been fully read or the response is closed. The context manager closes
        # it on every exit path, including the 404/429 early returns and any
        # exception raised while parsing the stream.
        with requests.get(url, headers=get_headers(), params=params,
                          stream=True, timeout=60) as response:
            if response.status_code == 404:
                return None, "User not found"
            if response.status_code == 429:
                return None, "Rate limited"

            response.raise_for_status()

            # iter_lines() can yield empty keep-alive lines; skip those.
            games = [
                json.loads(line.decode('utf-8'))
                for line in response.iter_lines()
                if line
            ]
            return games, None

    except requests.exceptions.Timeout:
        return None, "Timeout"
    except Exception as e:
        # Deliberately broad: one bad player must not abort the whole bucket.
        # Fall back to the exception class name so the error string is never
        # empty (an empty string would look like "no error" to a truthiness
        # check in the caller).
        return None, str(e) or type(e).__name__


def collect_bucket_games(bucket_name, players, rate_limit_retries=2):
    """Fetch games for every player of one rating bucket.

    Each player is requested through ``fetch_user_games`` using the
    module-level ``GAMES_PER_PLAYER`` and ``GAME_TYPE`` settings. Players whose
    result has fewer than ``MIN_GAMES`` games are not kept.

    Args:
        bucket_name: Label used in progress output only.
        players: Sized iterable of usernames to fetch.
        rate_limit_retries: How many times a player is re-requested after a
            "Rate limited" answer before being recorded as failed. Every
            rate-limited answer is followed by a 60 second pause.

    Returns:
        ``(all_games, failed)`` where ``all_games`` maps player -> list of
        games and ``failed`` is a list of ``(player, error)`` tuples.
    """
    rate_limit_wait = 60  # seconds to back off after a "Rate limited" answer

    print(f"BUCKET: {bucket_name}")
    print(f"Players: {len(players)}")
    # Only the fixed pauses between requests are known in advance; the download
    # time of each request comes on top, so this figure is a lower bound.
    print(f"Estimated time: at least {len(players) * SLEEP_TIME / 60:.1f} minutes (pauses between requests only)")
    print()

    all_games = {}
    failed = []
    total_games = 0

    for i, player in enumerate(tqdm(players, desc=f"Fetching {bucket_name}")):
        # Retry the same player after a rate-limit pause instead of giving up
        # on them: the request itself was fine, only its timing was not.
        for _attempt in range(1 + max(0, rate_limit_retries)):
            games, error = fetch_user_games(player, GAMES_PER_PLAYER, GAME_TYPE)
            if error != "Rate limited":
                break
            print(f"\n⚠️ Rate limited! Waiting {rate_limit_wait} seconds...")
            time.sleep(rate_limit_wait)

        # Compare against None so an empty error string still counts as a failure.
        if error is not None:
            failed.append((player, error))
        elif games and len(games) >= MIN_GAMES:
            all_games[player] = games
            total_games += len(games)

        # Rate limiting
        time.sleep(SLEEP_TIME)

        if (i + 1) % 50 == 0:
            print(f"\n  [{i+1}/{len(players)}] Games collected: {total_games:,}")

    print(f"\n✓ Bucket {bucket_name} complete!")
    print(f"  Players with data: {len(all_games)}")
    print(f"  Total games: {total_games:,}")
    print(f"  Failed: {len(failed)}")

    return all_games, failed

In [6]:
# Download every selected bucket that has no output file yet and record a
# per-bucket summary. An existing output file marks the bucket as done, so the
# file must only ever appear once it is complete.
results_summary = []

for bucket in BUCKETS_TO_FETCH:
    safe_name = bucket.replace('+', '_plus')
    output_file = os.path.join(OUTPUT_DIR, f"bucket_{safe_name}_games.json")

    if os.path.exists(output_file):
        print(f"\n Skipping {bucket} - already exists: {output_file}")
        continue

    if bucket not in player_list_by_rating:
        print(f"\n⚠️ Bucket {bucket} not found in player list!")
        continue

    players = player_list_by_rating[bucket]

    games_data, failed = collect_bucket_games(bucket, players)

    if not games_data:
        # Saving an empty result would make the exists-check above skip this
        # bucket on every later run, so leave it unsaved to be retried.
        print(f"\n⚠️ Bucket {bucket}: no games collected ({len(failed)} failed) - not saving")
        continue

    # Write to a temporary file and rename it into place. An interrupted dump
    # then never leaves a truncated file under the final name.
    tmp_file = output_file + ".tmp"
    with open(tmp_file, 'w') as f:
        json.dump(games_data, f)
    os.replace(tmp_file, output_file)

    file_size = os.path.getsize(output_file) / (1024*1024)
    print(f"\n Saved: {output_file} ({file_size:.1f} MB)")

    total_games = sum(len(g) for g in games_data.values())
    results_summary.append({
        'bucket': bucket,
        'players': len(games_data),
        'games': total_games,
        'failed': len(failed),
        'file_size_mb': file_size
    })

print("DATA COLLECTION COMPLETE")

BUCKET: 2200-2400
Players: 350
Estimated time: 2.9 minutes



Fetching 2200-2400:  14%|█▍        | 50/350 [06:41<44:36,  8.92s/it]


  [50/350] Games collected: 9,240


Fetching 2200-2400:  29%|██▊       | 100/350 [13:20<33:42,  8.09s/it]


  [100/350] Games collected: 18,477


Fetching 2200-2400:  43%|████▎     | 150/350 [19:46<29:08,  8.74s/it]


  [150/350] Games collected: 27,507


Fetching 2200-2400:  57%|█████▋    | 200/350 [26:32<21:05,  8.44s/it]


  [200/350] Games collected: 37,114


Fetching 2200-2400:  71%|███████▏  | 250/350 [33:48<15:59,  9.60s/it]


  [250/350] Games collected: 47,402


Fetching 2200-2400:  86%|████████▌ | 300/350 [40:32<05:58,  7.17s/it]


  [300/350] Games collected: 56,642


Fetching 2200-2400: 100%|██████████| 350/350 [47:39<00:00,  8.17s/it]


  [350/350] Games collected: 66,512

✓ Bucket 2200-2400 complete!
  Players with data: 317
  Total games: 66,512
  Failed: 33






 Saved: bucket_data\bucket_2200-2400_games.json (90.9 MB)
BUCKET: 2400+
Players: 350
Estimated time: 2.9 minutes



Fetching 2400+:  14%|█▍        | 50/350 [06:19<31:06,  6.22s/it]


  [50/350] Games collected: 8,870


Fetching 2400+:  29%|██▊       | 100/350 [12:42<30:48,  7.40s/it]


  [100/350] Games collected: 17,743


Fetching 2400+:  43%|████▎     | 150/350 [19:29<29:21,  8.81s/it]


  [150/350] Games collected: 27,401


Fetching 2400+:  57%|█████▋    | 200/350 [26:17<21:01,  8.41s/it]


  [200/350] Games collected: 37,096


Fetching 2400+:  71%|███████▏  | 250/350 [33:06<14:46,  8.87s/it]


  [250/350] Games collected: 46,756


Fetching 2400+:  86%|████████▌ | 300/350 [39:34<06:27,  7.75s/it]


  [300/350] Games collected: 55,994


Fetching 2400+: 100%|██████████| 350/350 [46:04<00:00,  7.90s/it]


  [350/350] Games collected: 64,895

✓ Bucket 2400+ complete!
  Players with data: 312
  Total games: 64,895
  Failed: 38






 Saved: bucket_data\bucket_2400_plus_games.json (93.5 MB)
DATA COLLECTION COMPLETE
