In [12]:
import requests
import pandas as pd
from datetime import datetime, timedelta
from dotenv import load_dotenv
import os
import time
from tqdm import tqdm

In [None]:
# Load environment and setup
load_dotenv()
COINAPI_KEY = os.getenv('COIN_API_KEY')
SYMBOL_ID = "COINBASE_SPOT_ETH_USD"

headers = {
    'Accept': 'application/json',
    'X-CoinAPI-Key': COINAPI_KEY
}

9634350c-028f-4851-9fe9-3f8b7651e88b


In [14]:
# Load existing CSV
df = pd.read_csv('./5s_data/eth_orderbook_coinbase_5s.csv')
df['timestamp'] = pd.to_datetime(df['timestamp'])

print(f"Loaded {len(df)} rows")
print(f"Date range: {df['timestamp'].min()} to {df['timestamp'].max()}")
print(f"Columns: {list(df.columns[:10])}...")

Loaded 197800 rows
Date range: 2025-12-01 02:57:15.727819 to 2025-12-13 19:06:24.371968
Columns: ['timestamp', 'bid_price_1', 'bid_vol_1', 'ask_price_1', 'ask_vol_1', 'bid_price_2', 'bid_vol_2', 'ask_price_2', 'ask_vol_2', 'bid_price_3']...


In [15]:
def get_ohlcv_data(symbol_id, start_time, end_time, period='1MIN'):
    """
    Fetch OHLCV (candlestick) data which includes volume.
    """
    url = f"https://rest.coinapi.io/v1/ohlcv/{symbol_id}/history"
    
    # Format timestamps properly (remove microseconds)
    start_str = start_time.replace(microsecond=0).isoformat() + 'Z'
    end_str = end_time.replace(microsecond=0).isoformat() + 'Z'
    
    params = {
        'period_id': period,
        'time_start': start_str,
        'time_end': end_str,
        'limit': 10000
    }
    
    print(f"  Requesting: {url}")
    print(f"  Period: {period}, Start: {start_str}, End: {end_str}")
    
    for attempt in range(3):
        try:
            r = requests.get(url, headers=headers, params=params)
            print(f"  Status: {r.status_code}")
            
            if r.status_code == 400:
                print(f"  Response: {r.text[:200]}")
                return []
            elif r.status_code == 429:
                print("  Rate limit, waiting...")
                time.sleep(60)
                continue
            elif r.status_code == 500:
                time.sleep(2 ** attempt)
                continue
            
            r.raise_for_status()
            return r.json()
        except requests.exceptions.RequestException as e:
            print(f"  Error: {e}")
            time.sleep(2 ** attempt)
    return []

In [16]:
# Fetch OHLCV data in chunks for the full date range
start_time = df['timestamp'].min()
end_time = df['timestamp'].max()
chunk_size = timedelta(hours=24)  # Fetch 24 hours at a time

print(f"Fetching OHLCV data from {start_time} to {end_time}")
print(f"Date range: {(end_time - start_time).days} days")

all_ohlcv = []
current_start = start_time

while current_start < end_time:
    current_end = min(current_start + chunk_size, end_time)
    print(f"\nFetching chunk: {current_start} to {current_end}")
    
    chunk_ohlcv = get_ohlcv_data(SYMBOL_ID, current_start, current_end, period='5SEC')
    if chunk_ohlcv:
        all_ohlcv.extend(chunk_ohlcv)
        print(f"  Got {len(chunk_ohlcv)} candles")
    else:
        print(f"  No data returned")
    
    current_start = current_end
    time.sleep(1)  # Rate limiting

ohlcv_df = pd.DataFrame(all_ohlcv)
print(f"\nTotal fetched: {len(ohlcv_df)} candles")
if len(ohlcv_df) > 0:
    print(f"Sample data:")
    print(ohlcv_df.head())

Fetching OHLCV data from 2025-12-01 02:57:15.727819 to 2025-12-13 19:06:24.371968
Date range: 12 days

Fetching chunk: 2025-12-01 02:57:15.727819 to 2025-12-02 02:57:15.727819
  Requesting: https://rest.coinapi.io/v1/ohlcv/COINBASE_SPOT_ETH_USD/history
  Period: 5SEC, Start: 2025-12-01T02:57:15Z, End: 2025-12-02T02:57:15Z
  Status: 200
  Got 9936 candles
  Status: 200
  Got 9936 candles

Fetching chunk: 2025-12-02 02:57:15.727819 to 2025-12-03 02:57:15.727819
  Requesting: https://rest.coinapi.io/v1/ohlcv/COINBASE_SPOT_ETH_USD/history
  Period: 5SEC, Start: 2025-12-02T02:57:15Z, End: 2025-12-03T02:57:15Z

Fetching chunk: 2025-12-02 02:57:15.727819 to 2025-12-03 02:57:15.727819
  Requesting: https://rest.coinapi.io/v1/ohlcv/COINBASE_SPOT_ETH_USD/history
  Period: 5SEC, Start: 2025-12-02T02:57:15Z, End: 2025-12-03T02:57:15Z
  Status: 200
  Got 9922 candles
  Status: 200
  Got 9922 candles

Fetching chunk: 2025-12-03 02:57:15.727819 to 2025-12-04 02:57:15.727819
  Requesting: https://rest

In [21]:
# Debug: Check what columns we actually got
if len(ohlcv_df) > 0:
    print("\nActual columns in API response:")
    print(ohlcv_df.columns.tolist())
    print("\nFirst row sample:")
    print(ohlcv_df.iloc[0].to_dict())


Actual columns in API response:
['timestamp', 'price', 'volume']

First row sample:
{'timestamp': Timestamp('2025-12-01 02:57:15+0000', tz='UTC'), 'price': 2841.165, 'volume': 0.59791517}


In [22]:
# Process OHLCV data for merging
if len(ohlcv_df) > 0:
    # Convert timezone-aware timestamps to naive (remove UTC timezone)
    ohlcv_df['timestamp'] = pd.to_datetime(ohlcv_df['timestamp']).dt.tz_localize(None)
    
    # Merge with nearest timestamp (within 5 seconds)
    df = pd.merge_asof(
        df.sort_values('timestamp'),
        ohlcv_df[['timestamp', 'price', 'volume']].sort_values('timestamp'),
        on='timestamp',
        direction='nearest',
        tolerance=pd.Timedelta(seconds=5)
    )
    
    print(f"\nMerged {len(df)} rows")
    print(f"Missing price: {df['price'].isna().sum()}")
    print(f"Missing volume: {df['volume'].isna().sum()}")
else:
    print("\nNo OHLCV data returned - using fallback (mid-price)")
    df['price'] = (df['bid_price_1'] + df['ask_price_1']) / 2.0
    df['volume'] = 0.0


Merged 197800 rows
Missing price: 81701
Missing volume: 81701


In [23]:
# Save updated CSV
output_path = './5s_data/eth_orderbook_coinbase_5s_with_price_volume.csv'
df.to_csv(output_path, index=False)
print(f"\nSaved to: {output_path}")
print(f"Final shape: {df.shape}")

# Preview
print("\nPreview:")
print(df[['timestamp', 'bid_price_1', 'ask_price_1', 'price', 'volume']].head(10))

# Statistics
print("\nStatistics:")
print(f"  Price range: ${df['price'].min():.2f} - ${df['price'].max():.2f}")
print(f"  Mean volume: {df['volume'].mean():.6f}")
print(f"  Total volume: {df['volume'].sum():.2f}")


Saved to: ./5s_data/eth_orderbook_coinbase_5s_with_price_volume.csv
Final shape: (197800, 43)

Preview:
                   timestamp  bid_price_1  ask_price_1     price    volume
0 2025-12-01 02:57:15.727819      2841.16      2841.17  2841.165  0.597915
1 2025-12-01 02:57:21.073133      2841.10      2841.11  2841.085  0.777114
2 2025-12-01 02:57:26.375175      2840.82      2841.00  2840.990  1.177497
3 2025-12-01 02:57:31.660663      2841.00      2841.01  2841.005  0.025164
4 2025-12-01 02:57:37.007031      2841.00      2841.01  2841.005  0.003338
5 2025-12-01 02:57:42.671388      2841.11      2841.12  2842.230  2.630444
6 2025-12-01 02:57:47.996316      2841.53      2841.54  2841.730  2.091136
7 2025-12-01 02:57:53.320210      2841.72      2841.73  2841.500  0.349916
8 2025-12-01 02:57:58.597944      2841.32      2841.33  2841.490  0.043777
9 2025-12-01 02:58:03.907542      2841.32      2841.33  2841.505  0.065284

Statistics:
  Price range: $2719.95 - $3372.61
  Mean volume: 5.33038