### Top Coins

In [None]:
from curl_cffi import requests
from typing import List
from pydantic import BaseModel
import pandas as pd

# Query parameters sent to the CoinMarketCap listing endpoint.
# NOTE(review): the range filters below presumably set lower bounds
# (market cap, 24h volume, age) -- confirm units against the API.
params = {
    "start": 1,
    "limit": 100,
    "sortBy": "rank",
    "sortType": "desc",
    "convert": "USD,BTC,ETH",
    "cryptoType": "all",
    "tagType": "all",
    "audited": "false",
    "aux": "cmc_rank,date_added",
    "marketCapRange": "100000000~",
    "volume24hRange": "1000000~",
    "ageRange": "4204800~",
}
r = requests.get(
    "https://api.coinmarketcap.com/data-api/v3/cryptocurrency/listing",
    params=params,
    impersonate="chrome",
)
# Fail fast on an HTTP error (4xx/5xx) instead of trying to decode an error
# page as JSON and hitting a confusing failure further down.
r.raise_for_status()
data = r.json()


class CryptoCurrency(BaseModel):
    """One coin entry from the listing response's ``cryptoCurrencyList``."""

    # Only ``name`` and ``symbol`` are read by the code below; ``id`` and
    # ``dateAdded`` are declared (and therefore required) but not used here.
    id: int
    name: str
    symbol: str
    dateAdded: str


class CryptoData(BaseModel):
    """The ``data`` object of the listing response, holding the coin list."""

    cryptoCurrencyList: List[CryptoCurrency]


class ApiResponse(BaseModel):
    """Top-level shape of the listing response: a single ``data`` object."""

    data: CryptoData


# Validate the decoded JSON against the models, then tabulate one row per
# coin (lower-cased name, upper-cased ticker) indexed by the coin name.
parsed = ApiResponse(**data)
records = [
    dict(name=entry.name.lower(), symbol=entry.symbol.upper())
    for entry in parsed.data.cryptoCurrencyList
]
df = pd.DataFrame(records).set_index("name")

In [None]:
import calendar
import time
import threading
from datetime import datetime, timedelta, timezone
from pathlib import Path
from binance.client import Client
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed

# Configuration
BASE_PATH = Path("./data/spot/hourly/klines/")
# One "<TICKER>USDT" pair per listed coin, skipping USDT itself.
# Tickers are not guaranteed to be unique across coins; a duplicate would make
# two workers append to the same <symbol>/data.csv at once, so de-duplicate
# while keeping the original order.
SYMBOLS = list(
    dict.fromkeys(f"{sym}USDT" for sym in df["symbol"] if sym.upper() != "USDT")
)
DATE_START_GLOBAL = "1 Jan, 2017"
DATE_END_GLOBAL = "now UTC"
MAX_WORKERS = 8
# Seconds to pause after each klines request.
SLEEP_PER_REQUEST = 0.5
# Per-thread storage so every worker thread gets its own client instance.
thread_local = threading.local()


def get_client():
    """Return the calling thread's ``Client``, creating it on first use."""
    try:
        return thread_local.client
    except AttributeError:
        # First call on this thread: nothing cached yet.
        pass
    thread_local.client = Client()
    return thread_local.client


def ts_to_dt(timestamp_ms):
    """Convert a Unix timestamp in milliseconds to a UTC-aware ``datetime``."""
    seconds = timestamp_ms / 1000
    return datetime.fromtimestamp(seconds, tz=timezone.utc)


def dt_to_binance_str(dt):
    """Format *dt* as ``"DD Mon, YYYY HH:MM:SS"`` expressed in UTC.

    A naive datetime is taken to already be in UTC; an aware one is
    converted to UTC before formatting.
    """
    aware = dt if dt.tzinfo is not None else dt.replace(tzinfo=timezone.utc)
    return aware.astimezone(timezone.utc).strftime("%d %b, %Y %H:%M:%S")


def month_range(start_input, end_input):
    """Yield ``(window_start, window_end)`` pairs, one per calendar month.

    The windows cover the span from *start_input* to *end_input*: the first
    window starts at the start time, the last one ends at the end time, and
    every window in between runs from the 1st at 00:00:00 to the last day of
    the month at 23:59:59.

    Args:
        start_input: A ``datetime`` (naive values are treated as UTC) or a
            string that is either ``"now UTC"`` (case-insensitive) or in the
            form ``"1 Jan, 2017"`` / ``"1 Jan, 2017 00:00:00"``.
        end_input: Same accepted forms as *start_input*.

    Yields:
        Tuples of two timezone-aware datetimes with ``start <= end``. Nothing
        is yielded when the start lies after the end.

    Raises:
        ValueError: If a string input matches none of the accepted formats.
        TypeError: If an input is neither a ``datetime`` nor a ``str``.
    """

    def parse_input(inp):
        if isinstance(inp, datetime):
            if inp.tzinfo is None:
                inp = inp.replace(tzinfo=timezone.utc)
            return inp
        if isinstance(inp, str):
            if inp.lower() == "now utc":
                return datetime.now(timezone.utc)
            for fmt in ["%d %b, %Y %H:%M:%S", "%d %b, %Y"]:
                try:
                    return datetime.strptime(inp, fmt).replace(tzinfo=timezone.utc)
                except ValueError:
                    continue
            raise ValueError(f"Can't parse date string: {inp}")
        raise TypeError(f"Unsupported input type: {type(inp)}")

    start_dt = parse_input(start_input)
    end_dt = parse_input(end_input)
    # Walk month by month, beginning at the first instant of the start month.
    current = start_dt.replace(day=1, hour=0, minute=0, second=0, microsecond=0)
    while current <= end_dt:
        year, month = current.year, current.month
        last_day = calendar.monthrange(year, month)[1]
        month_end = current.replace(day=last_day, hour=23, minute=59, second=59)
        window_start = max(current, start_dt)
        window_end = min(month_end, end_dt)
        # A start later than the end inside the same month would otherwise
        # produce an inverted window; skip it instead of handing it out.
        if window_start <= window_end:
            yield window_start, window_end
        if month == 12:
            current = current.replace(year=year + 1, month=1)
        else:
            current = current.replace(month=month + 1)


# File helpers


def get_last_timestamp(data_file):
    """Return the first-column value of the last row of *data_file* as an int.

    Args:
        data_file: ``pathlib.Path`` of a header-less CSV file.

    Returns:
        The integer in column 0 of the final row, or ``None`` when the file is
        missing, empty, or cannot be read (the read error is printed).
    """
    if not data_file.exists() or data_file.stat().st_size == 0:
        return None
    try:
        # Only column 0 is needed, so don't parse the remaining columns.
        first_col = pd.read_csv(data_file, header=None, usecols=[0])
        return int(first_col.iloc[-1, 0]) if not first_col.empty else None
    except Exception as e:
        # Best effort: report and let the caller treat the file as unusable.
        print(f"[{data_file.parent.name}] Could not read file: {e}")
        return None


# Main symbol processing


def process_symbol(symbol):
    """Download hourly klines for *symbol* and append them to its CSV file.

    Data is stored in ``BASE_PATH/<symbol>/data.csv`` without a header. When
    the file already holds rows, fetching resumes one hour after the last
    stored timestamp; otherwise it starts at ``DATE_START_GLOBAL``. The span
    up to ``DATE_END_GLOBAL`` is requested one calendar month at a time.

    Args:
        symbol: Trading pair name passed to the client, e.g. ``"BTCUSDT"``.

    Returns:
        A ``(symbol, new_row_count, status_message)`` tuple. The status is
        ``"Success"`` when every month was requested without an error.
    """
    client = get_client()
    symbol_dir = BASE_PATH / symbol
    symbol_dir.mkdir(exist_ok=True, parents=True)
    data_file = symbol_dir / "data.csv"

    last_ts = get_last_timestamp(data_file)
    if last_ts is None:
        effective_start = DATE_START_GLOBAL
    else:
        # Resume with the hour following the last stored row.
        effective_start = ts_to_dt(last_ts) + timedelta(hours=1)
        if effective_start > datetime.now(timezone.utc):
            return (
                symbol,
                0,
                f"Resume time is in the future ({effective_start}). Skipping.",
            )

    try:
        months = list(month_range(effective_start, DATE_END_GLOBAL))
    except Exception as e:
        return symbol, 0, f"Date parse error: {e}"
    if not months:
        return symbol, 0, "No months to fetch"

    total_new_rows = 0
    current_last_ts = last_ts
    for month_start, month_end in months:
        start_str = dt_to_binance_str(month_start)
        end_str = dt_to_binance_str(month_end)
        try:
            klines = client.get_historical_klines(
                symbol, Client.KLINE_INTERVAL_1HOUR, start_str, end_str
            )
        except Exception as e:
            # Stop here rather than moving on to later months: appending newer
            # data after a failed month would leave a gap that is never
            # refilled, because the next run resumes from the last stored
            # timestamp. Stopping also avoids one failing request per month
            # for a symbol the API rejects.
            return (
                symbol,
                total_new_rows,
                f"Stopped at {start_str} after fetch error: {e}",
            )

        # Drop rows at or before the last stored timestamp (no duplicates).
        if klines and current_last_ts is not None:
            klines = [k for k in klines if k[0] > current_last_ts]

        if klines:
            has_data = data_file.exists() and data_file.stat().st_size > 0
            pd.DataFrame(klines).to_csv(
                data_file, mode="a" if has_data else "w", header=False, index=False
            )
            total_new_rows += len(klines)
            current_last_ts = klines[-1][0]

        # Pause after every request, whether or not it produced rows.
        time.sleep(SLEEP_PER_REQUEST)
    return symbol, total_new_rows, "Success"


# Run parallel fetching

print(
    f"Starting parallel fetch for {len(SYMBOLS)} symbols with {MAX_WORKERS} workers...\n"
)

with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    future_to_symbol = {executor.submit(process_symbol, sym): sym for sym in SYMBOLS}
    for future in as_completed(future_to_symbol):
        try:
            symbol, rows, status = future.result()
        except Exception as exc:
            # An exception raised inside a worker is re-raised by result();
            # report it for that symbol and keep collecting the others
            # instead of aborting the whole run.
            print(f"{future_to_symbol[future]}: failed with unhandled error: {exc}")
            continue
        if rows > 0:
            print(f"{symbol}: {rows} new rows saved ({status})")
        else:
            print(f"{symbol}: {status}")

print("All symbols processed.")