In [5]:
import requests
import time
import json
import os
from pathlib import Path
from typing import Optional
import pprint
import pandas as pd
import time
import os
from pathlib import Path
from tqdm import tqdm
import requests


In [6]:
#### beatmap count info
stats = requests.get("https://api.chimu.moe/v1/stats").json()
print(stats)


ConnectionError: HTTPSConnectionPool(host='api.chimu.moe', port=443): Max retries exceeded with url: /v1/stats (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x000002BB828AB110>: Failed to resolve 'api.chimu.moe' ([Errno 11001] getaddrinfo failed)"))

### API ACCESS SETUP

In [7]:
OSU_CLIENT_ID = 40895
OSU_CLIENT_SECRET = "79LCCASPyRhlj2zhAfG08FcpV4uQy1a0O8O1GOn1"

def get_access_token(client_id: str, client_secret: str) -> str:
    url = "https://osu.ppy.sh/oauth/token"
    payload = {
        "client_id": client_id,
        "client_secret": client_secret,
        "grant_type": "client_credentials",
        "scope": "public"
    }
    response = requests.post(url, json=payload)
    response.raise_for_status()
    return response.json()["access_token"]

osu_token: Optional[str] = None

def get_cached_token() -> str:
    global osu_token
    if osu_token is None:
        osu_token = get_access_token(OSU_CLIENT_ID, OSU_CLIENT_SECRET)
    return osu_token


def osu_api_get(endpoint: str, params: dict = None) -> dict:
    headers = {
        "Authorization": f"Bearer {get_cached_token()}"
    }
    url = f"https://osu.ppy.sh/api/v2/{endpoint}"
    response = requests.get(url, headers=headers, params=params)
    if response.status_code == 401:
        # Token expired, refresh
        global osu_token
        osu_token = get_access_token(OSU_CLIENT_ID, OSU_CLIENT_SECRET)
        return osu_api_get(endpoint, params)
    response.raise_for_status()
    return response.json()



##### TEST - get User Info
user_data = osu_api_get("users/35188368", params={"mode": "osu"})  # replace with your user ID
for key in user_data:
    print(f"{key}: {type(user_data[key])}")


avatar_url: <class 'str'>
country_code: <class 'str'>
default_group: <class 'str'>
id: <class 'int'>
is_active: <class 'bool'>
is_bot: <class 'bool'>
is_deleted: <class 'bool'>
is_online: <class 'bool'>
is_supporter: <class 'bool'>
last_visit: <class 'str'>
pm_friends_only: <class 'bool'>
profile_colour: <class 'NoneType'>
username: <class 'str'>
cover_url: <class 'str'>
discord: <class 'str'>
has_supported: <class 'bool'>
interests: <class 'NoneType'>
join_date: <class 'str'>
location: <class 'NoneType'>
max_blocks: <class 'int'>
max_friends: <class 'int'>
occupation: <class 'str'>
playmode: <class 'str'>
playstyle: <class 'list'>
post_count: <class 'int'>
profile_hue: <class 'NoneType'>
profile_order: <class 'list'>
title: <class 'NoneType'>
title_url: <class 'NoneType'>
twitter: <class 'NoneType'>
website: <class 'str'>
country: <class 'dict'>
cover: <class 'dict'>
kudosu: <class 'dict'>
account_history: <class 'list'>
active_tournament_banner: <class 'NoneType'>
active_tournament_b

### Extracting User Data
literally just trying all user ids sequentially

In [8]:
def extract_user_row(user_data: dict) -> dict:
    row = {
        "id": user_data["id"],
        "username": user_data["username"],
        "is_active": user_data["is_active"],
        "join_date": user_data["join_date"],
        "playmode": user_data["playmode"],
        "playstyle": user_data.get("playstyle", []),
        "beatmap_playcounts_count": user_data["beatmap_playcounts_count"],
        "favourite_beatmapset_count": user_data["favourite_beatmapset_count"],
        "scores_recent_count": user_data["scores_recent_count"],
    }

    stats = user_data.get("statistics", {})
    row.update({
        "statistics_pp": stats.get("pp"),
        "statistics_accuracy": stats.get("accuracy"),
        "statistics_play_count": stats.get("play_count"),
        "statistics_play_time": stats.get("play_time"),
        "statistics_global_rank": stats.get("global_rank"),
        "statistics_country_rank": stats.get("country_rank"),
        "statistics_hit_accuracy": stats.get("hit_accuracy"),
        "statistics_maximum_combo": stats.get("maximum_combo"),
        "statistics_level_current": stats.get("level", {}).get("current"),
    })

    return row

def try_get_user(user_id: int) -> dict:
    try:
        user_data = osu_api_get(f"users/{user_id}", params={"mode": "osu"})
        return extract_user_row(user_data)
    except requests.exceptions.HTTPError as e:
        if e.response.status_code == 404:
            return None
        raise  # re-raise on other errors


def ensure_dir(path: Path):
    if not path.exists():
        path.mkdir(parents=True, exist_ok=True)

def scrape_users_range_batched_to_csv(
    start_id: int,
    end_id: int,
    batch_size: int,
    max_skips: int = 500,
    sleep_time: float = 0.0,
    output_file: Path = Path("Data/users/users.csv")
):
    output_dir = output_file.parent
    ensure_dir(output_dir)

    user_id = start_id
    not_found_streak = 0
    batch = []

    write_header = not output_file.exists()
    total = end_id - start_id + 1
    bar = tqdm(total=total, desc="Scraping users", unit="user", initial=0)

    while user_id <= end_id and not_found_streak < max_skips:
        try:
            user_data = osu_api_get(f"users/{user_id}", params={"mode": "osu"})
            user_row = extract_user_row(user_data)
            batch.append(user_row)
            not_found_streak = 0
        except requests.exceptions.HTTPError as e:
            if e.response.status_code == 404:
                not_found_streak += 1
            else:
                print(f"\n⚠️ Unexpected HTTP error at ID {user_id}: {e}")
                break
        except Exception as e:
            print(f"\n💥 Unexpected error at ID {user_id}: {e}")
            break

        user_id += 1
        bar.update(1)
        time.sleep(sleep_time)

        if len(batch) >= batch_size:
            df = pd.DataFrame(batch)
            df.to_csv(output_file, mode='a', header=write_header, index=False)
            print(f"💾 Appended {len(batch)} users to {output_file}")
            batch.clear()
            write_header = False

    bar.close()

    if not_found_streak >= max_skips:
        print(f"\n🛑 Stopped early due to {not_found_streak} consecutive 404s at user ID {user_id}.")

    if batch:
        df = pd.DataFrame(batch)
        df.to_csv(output_file, mode='a', header=write_header, index=False)
        print(f"💾 Appended final {len(batch)} users to {output_file}")

In [7]:
scrape_users_range_batched_to_csv(
    start_id=0,
    end_id=999,
    batch_size=100,
    max_skips=200
)



Scraping users:   4%|▍         | 41/1000 [00:24<10:02,  1.59user/s]

KeyboardInterrupt: 

In [9]:
def osu_api_get(endpoint: str, params: dict = None) -> dict:
    headers = {
        "Authorization": f"Bearer {get_cached_token()}",
        "Content-Type": "application/json",
        "Accept": "application/json"
    }
    url = f"https://osu.ppy.sh/api/v2/{endpoint}"
    response = requests.get(url, headers=headers, params=params)

    print(f"URL: {response.url}")
    print(f"Status: {response.status_code}")
    print(f"Response:\n{response.text}")  # <--- this will show the full response

    if response.status_code == 401:
        # Token expired or invalid — refresh
        global osu_token
        osu_token = get_access_token(OSU_CLIENT_ID, OSU_CLIENT_SECRET)
        return osu_api_get(endpoint, params)

    response.raise_for_status()  # still raise error if needed
    return response.json()


In [24]:
for i in range(1):
    scores = osu_api_get("users/35188368/scores/recent", params={"mode": "osu", "limit": 100, "offset": 1})
    print(f"Total best scores fetched: {len(scores)}")
    for score in scores[:3]:  # show first 3 as preview
        print(score["score"], score.get("beatmap", {}).get("title", "unknown"))


URL: https://osu.ppy.sh/api/v2/users/35188368/scores/recent?mode=osu&limit=100&offset=1
Status: 200
Response:
[{"accuracy":0.8521276595744681,"best_id":null,"created_at":"2025-05-22T17:12:34Z","id":4883839110,"max_combo":356,"mode":"osu","mode_int":0,"mods":["DT"],"passed":true,"perfect":false,"pp":79.1396,"rank":"C","replay":true,"score":0,"statistics":{"count_100":94,"count_300":369,"count_50":1,"count_geki":null,"count_katu":null,"count_miss":6},"type":"solo_score","user_id":35188368,"current_user_attributes":{"pin":null},"beatmap":{"beatmapset_id":2269665,"difficulty_rating":3.87,"id":4838050,"mode":"osu","status":"ranked","total_length":167,"user_id":5875419,"version":"newton's Insane","accuracy":8,"ar":8,"bpm":149,"convert":false,"count_circles":318,"count_sliders":152,"count_spinners":0,"cs":3.5,"deleted_at":null,"drain":5,"hit_length":155,"is_scoreable":true,"last_updated":"2024-10-29T21:55:54Z","mode_int":0,"passcount":9101,"playcount":36352,"ranked":1,"url":"https:\/\/osu.ppy