# Cryptocurrency Data Download
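Downloads historical OHLCV (open, high, low, close, volume) data from Binance and saves it as Parquet files under `./crypto_data/` (point `save_path` at a mounted Google Drive folder to store the files there instead).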

# Libraries

In [None]:
# !pip install ccxt -q

In [6]:
import ccxt
import pandas as pd
from datetime import datetime, timedelta
import time
import os

# Binance data collection

In [3]:
# Directory where every downloaded Parquet file is written
save_path = './crypto_data/'
# Pure-Python equivalent of `mkdir -p`: works on any OS and is safe to re-run
os.makedirs(save_path, exist_ok=True)

In [4]:
# Shared Binance client used by every download below.
# 'enableRateLimit' switches on ccxt's own request throttling.
exchange = ccxt.binance({'enableRateLimit': True})

In [5]:
def download_ohlcv(symbol, timeframe, start_date, end_date, max_retries=5, retry_delay=5):
    """
    Download OHLCV candles from Binance for a date range.

    Pages through the module-level `exchange.fetch_ohlcv` in chunks of up to
    1000 candles, starting at `start_date`, until the cursor passes `end_date`
    or the exchange returns nothing more.

    Args:
        symbol: Trading pair, e.g. 'BTC/USDT'
        timeframe: '1d' for daily, '1m' for minute
        start_date: 'YYYY-MM-DD' string, taken as midnight UTC (inclusive)
        end_date: 'YYYY-MM-DD' string, taken as midnight UTC (inclusive)
        max_retries: number of consecutive failed requests tolerated before
            the last error is re-raised
        retry_delay: seconds to wait before retrying a failed request

    Returns:
        DataFrame with columns timestamp, open, high, low, close, volume,
        restricted to start_date <= timestamp <= end_date.

    Raises:
        Exception: whatever `fetch_ohlcv` raised, once `max_retries`
            consecutive attempts have failed.
    """
    # Local import: the notebook's import cell only brings in datetime/timedelta
    from datetime import timezone

    print(f"Downloading {symbol} {timeframe} data from {start_date} to {end_date}...")

    def _to_utc_ms(date_str):
        # Candle timestamps are epoch milliseconds and are decoded below without
        # any timezone shift, so the request window must be pinned to UTC too.
        # A naive datetime.timestamp() would use the machine's local timezone.
        parsed = datetime.strptime(date_str, '%Y-%m-%d').replace(tzinfo=timezone.utc)
        return int(parsed.timestamp() * 1000)

    start_ts = _to_utc_ms(start_date)
    end_ts = _to_utc_ms(end_date)

    all_data = []
    current_ts = start_ts
    failures = 0  # consecutive failed requests

    # Binance limit: 1000 candles per request
    while current_ts < end_ts:
        try:
            ohlcv = exchange.fetch_ohlcv(symbol, timeframe, current_ts, limit=1000)
        except Exception as e:
            failures += 1
            print(f"\nError: {e}")
            if failures >= max_retries:
                # A permanent error (e.g. unknown symbol) would otherwise loop forever
                raise
            time.sleep(retry_delay)
            continue

        failures = 0

        if not ohlcv:
            break

        last_ts = ohlcv[-1][0]
        if last_ts < current_ts:
            # No forward progress: stop instead of requesting the same page again
            break

        all_data.extend(ohlcv)
        # +1 ms so the next page starts right after the last candle received
        current_ts = last_ts + 1

        progress_pct = min(((current_ts - start_ts) / (end_ts - start_ts)) * 100, 100.0)
        print(f"Progress: {progress_pct:.1f}%", end='\r')

        time.sleep(exchange.rateLimit / 1000)  # rate limits

    # To DataFrame
    df = pd.DataFrame(all_data, columns=['timestamp', 'open', 'high', 'low', 'close', 'volume'])
    df['timestamp'] = pd.to_datetime(df['timestamp'], unit='ms')
    # The last page may overshoot end_date; trim to the requested window (both ends inclusive)
    df = df[(df['timestamp'] >= start_date) & (df['timestamp'] <= end_date)]

    print(f"\nDownloaded {len(df)} candles")
    return df

## Download Daily Data (10 years)

In [8]:
end_date = datetime.now().strftime('%Y-%m-%d')
# 10 x 365 days: roughly ten years (leap days are not accounted for)
start_date = (datetime.now() - timedelta(days=365*10)).strftime('%Y-%m-%d')

# Pairs to download (the "top 10" list, all quoted in USDT)
top_pairs = [
    'BTC/USDT',
    'ETH/USDT',
    'BNB/USDT',
    'SOL/USDT',
    'XRP/USDT',
    'ADA/USDT',
    'DOGE/USDT',
    'AVAX/USDT',
    'DOT/USDT',
    'MATIC/USDT'
]

# symbol -> daily OHLCV DataFrame, for every pair that was loaded or downloaded
all_data = {}

for symbol in top_pairs:
    filename = f'{symbol.replace("/", "_")}_daily_10y.parquet'
    filepath = os.path.join(save_path, filename)

    if os.path.exists(filepath):
        # Cached from an earlier run: reuse it instead of hitting the API again
        print(f"Loading {symbol} from file...")
        daily_data = pd.read_parquet(filepath)
        print(f"{symbol} - Loaded, shape: {daily_data.shape}")
    else:
        print(f"Downloading {symbol}...")
        try:
            daily_data = download_ohlcv(
                symbol=symbol,
                timeframe='1d',
                start_date=start_date,
                end_date=end_date
            )
            if daily_data.empty:
                # Never cache an empty file: it would be loaded as valid data on every later run
                print(f"{symbol} - No data returned, skipping")
                continue
            daily_data.to_parquet(filepath, index=False)
            print(f"{symbol} - Downloaded and saved, shape: {daily_data.shape}")
        except Exception as e:
            # One failing pair must not abort the remaining downloads
            print(f"Error downloading {symbol}: {e}")
            continue

    all_data[symbol] = daily_data

first_pair = top_pairs[0]
if first_pair in all_data:
    print(f"\n{first_pair} sample:")
    print(all_data[first_pair].head())
else:
    # The first pair may have been skipped above; avoid a KeyError here
    print(f"\n{first_pair} was not loaded; no sample to show")
print(f"\nAll data saved to {save_path}")

Loading BTC/USDT from file...
BTC/USDT - Loaded, shape: (3102, 6)
Downloading ETH/USDT...
Downloading ETH/USDT 1d data from 2016-02-16 to 2026-02-13...
Progress: 100.0%
Downloaded 3103 candles
ETH/USDT - Downloaded and saved, shape: (3103, 6)
Downloading BNB/USDT...
Downloading BNB/USDT 1d data from 2016-02-16 to 2026-02-13...
Progress: 100.0%
Downloaded 3022 candles
BNB/USDT - Downloaded and saved, shape: (3022, 6)
Downloading SOL/USDT...
Downloading SOL/USDT 1d data from 2016-02-16 to 2026-02-13...
Progress: 100.0%
Downloaded 2013 candles
SOL/USDT - Downloaded and saved, shape: (2013, 6)
Downloading XRP/USDT...
Downloading XRP/USDT 1d data from 2016-02-16 to 2026-02-13...
Progress: 100.0%
Downloaded 2843 candles
XRP/USDT - Downloaded and saved, shape: (2843, 6)
Downloading ADA/USDT...
Downloading ADA/USDT 1d data from 2016-02-16 to 2026-02-13...
Progress: 100.0%
Downloaded 2860 candles
ADA/USDT - Downloaded and saved, shape: (2860, 6)
Downloading DOGE/USDT...
Downloading DOGE/USDT 1d

## Download Minute Data (1 year)

In [None]:
end_date_min = datetime.now()
start_date_min = datetime.now() - timedelta(days=365)

minute_symbol = 'BTC/USDT'  # Add more pairs
minute_prefix = minute_symbol.replace("/", "_")  # file-name-safe form, same scheme as the daily files

batch_size = 30  # days covered by each request window / output file
current_date = start_date_min
batch_num = 0
all_batches = []  # file names of every batch available on disk after this cell

print(f"Downloading minute data in {batch_size}-day batches...")

while current_date < end_date_min:
    batch_num += 1
    batch_end = min(current_date + timedelta(days=batch_size), end_date_min)

    print(f"\n[Batch {batch_num}] {current_date.strftime('%Y-%m-%d')} to {batch_end.strftime('%Y-%m-%d')}")

    batch_filename = f'{minute_prefix}_1m_{current_date.strftime("%Y%m%d")}_{batch_end.strftime("%Y%m%d")}.parquet'
    batch_path = os.path.join(save_path, batch_filename)

    # Reuse completed batches so an interrupted run can resume. The most recent
    # batch is always re-downloaded because its window may still be filling up.
    if os.path.exists(batch_path) and batch_end < end_date_min:
        print(f"Already downloaded → {batch_filename}")
        all_batches.append(batch_filename)
    else:
        try:
            batch_data = download_ohlcv(
                symbol=minute_symbol,
                timeframe='1m',
                start_date=current_date.strftime('%Y-%m-%d'),
                end_date=batch_end.strftime('%Y-%m-%d')
            )

            if len(batch_data) > 0:
                batch_data.to_parquet(batch_path, index=False)
                print(f"Saved {len(batch_data)} rows → {batch_filename}")
                all_batches.append(batch_filename)
            else:
                print("No data")

        except Exception as e:
            # A failed batch is reported and skipped so the remaining batches still run
            print(f"Error: {e}")

        time.sleep(1)

    current_date = batch_end

print(f"\nDownloaded {len(all_batches)} batches")


Downloading minute data in 30-day batches...

[Batch 1] 2025-02-12 to 2025-03-14
Downloading BTC/USDT 1m data from 2025-02-12 to 2025-03-14...
Progress: 101.8%
Downloaded 43201 candles
Saved 43201 rows → BTC_USDT_1m_20250212_20250314.parquet

[Batch 2] 2025-03-14 to 2025-04-13
Downloading BTC/USDT 1m data from 2025-03-14 to 2025-04-13...
Progress: 102.0%
Downloaded 43201 candles
Saved 43201 rows → BTC_USDT_1m_20250314_20250413.parquet

[Batch 3] 2025-04-13 to 2025-05-13
Downloading BTC/USDT 1m data from 2025-04-13 to 2025-05-13...
Progress: 101.8%
Downloaded 43201 candles
Saved 43201 rows → BTC_USDT_1m_20250413_20250513.parquet

[Batch 4] 2025-05-13 to 2025-06-12
Downloading BTC/USDT 1m data from 2025-05-13 to 2025-06-12...
Progress: 101.8%
Downloaded 43201 candles
Saved 43201 rows → BTC_USDT_1m_20250513_20250612.parquet

[Batch 5] 2025-06-12 to 2025-07-12
Downloading BTC/USDT 1m data from 2025-06-12 to 2025-07-12...
Progress: 101.8%
Downloaded 43201 candles
Saved 43201 rows → BTC_USDT

In [15]:
# Load every batch file written by the minute-download cell
dfs = [
    pd.read_parquet(f'{save_path}{batch}')
    for batch in all_batches
]

# Stitch the batches into one frame: drop repeated timestamps (first occurrence
# wins), order chronologically, and renumber the rows.
minute_data = (
    pd.concat(dfs, ignore_index=True)
    .drop_duplicates(subset='timestamp')
    .sort_values('timestamp')
    .reset_index(drop=True)
)

# Persist the combined series next to the per-batch files
minute_data.to_parquet(f'{save_path}BTC_USDT_minute_1y.parquet', index=False)