In [1]:
import requests
import pandas as pd
from datetime import datetime, timedelta
from dotenv import load_dotenv
import os
import time
from tqdm import tqdm
from coinbase import jwt_generator


In [2]:
# Coinbase API setup
load_dotenv()

# Coinbase Cloud Trading API credentials
KEY_NAME = os.getenv('KEY_NAME')
KEY_SECRET = os.getenv('KEY_SECRET')  # EC private key in PEM format
REQUEST_HOST = os.getenv('REQUEST_HOST', 'api.coinbase.com')

COINBASE_API_BASE = f"https://{REQUEST_HOST}/api/v3/brokerage"
ETH_PRODUCT_ID = "ETH-USD"
BTC_PRODUCT_ID = "BTC-USD"

In [3]:

def generate_jwt_token(request_method="GET", request_path="/api/v3/brokerage/products"):
    """
    Generate JWT token using official Coinbase SDK method.
    """
    if not KEY_NAME or not KEY_SECRET:
        raise ValueError("KEY_NAME and KEY_SECRET must be set in environment variables")
    
    # Fix newlines in private key if they're escaped
    api_secret = KEY_SECRET.replace('\\r\\n', '\n').replace('\\n', '\n')
    
    # Use official Coinbase JWT generator
    jwt_uri = jwt_generator.format_jwt_uri(request_method, request_path)
    jwt_token = jwt_generator.build_rest_jwt(jwt_uri, KEY_NAME, api_secret)
    
    return jwt_token

In [4]:
# Test JWT token generation
print("Testing Coinbase JWT token generation...")
print(f"KEY_NAME: {KEY_NAME}")
print(f"Private key length: {len(KEY_SECRET) if KEY_SECRET else 0} characters")

try:
    test_token = generate_jwt_token("GET", "/api/v3/brokerage/accounts")
    print(f"✓ Token generated successfully")
    print(f"Token (first 50 chars): {test_token[:50]}...")
except Exception as e:
    print(f"✗ Failed to generate token: {e}")

Testing Coinbase JWT token generation...
KEY_NAME: organizations/dbe231cd-d524-4255-bc63-4db7ac53a191/apiKeys/6b1cb020-b8d2-475d-849a-2d164e5e9811
Private key length: 232 characters
✓ Token generated successfully
Token (first 50 chars): eyJhbGciOiJFUzI1NiIsImtpZCI6Im9yZ2FuaXphdGlvbnMvZG...


In [5]:
# Load existing CSVs
import os

base_csv = './5s_data/eth_orderbook_coinbase_5s.csv'
output_csv = './5s_data/eth_orderbook_coinbase_5s_with_price_volume.csv'

# Check if output file already exists
if os.path.exists(output_csv):
    print(f"Loading existing output file: {output_csv}")
    df = pd.read_csv(output_csv)
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    
    # Check what columns exist
    has_eth_price = 'eth_price' in df.columns
    has_eth_volume = 'eth_volume' in df.columns
    has_btc_price = 'btc_price' in df.columns
    has_btc_volume = 'btc_volume' in df.columns
    
    print(f"Existing columns: eth_price={has_eth_price}, eth_volume={has_eth_volume}, btc_price={has_btc_price}, btc_volume={has_btc_volume}")
    
    # Load base orderbook file to check for new rows
    df_base = pd.read_csv(base_csv)
    df_base['timestamp'] = pd.to_datetime(df_base['timestamp'])
    
    # Find new rows that aren't in the output yet
    new_rows = df_base[~df_base['timestamp'].isin(df['timestamp'])]
    if len(new_rows) > 0:
        print(f"Found {len(new_rows)} new rows from base CSV")
        # Add new rows with NaN for price/volume columns
        for col in ['eth_price', 'eth_volume', 'btc_price', 'btc_volume', 'price', 'volume']:
            if col not in new_rows.columns:
                new_rows[col] = float('nan')
        df = pd.concat([df, new_rows], ignore_index=True).sort_values('timestamp').reset_index(drop=True)
        print(f"Total rows after merge: {len(df)}")
else:
    print(f"Output file doesn't exist. Loading base: {base_csv}")
    df = pd.read_csv(base_csv)
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    # Initialize columns
    df['eth_price'] = float('nan')
    df['eth_volume'] = float('nan')
    df['btc_price'] = float('nan')
    df['btc_volume'] = float('nan')
    df['price'] = float('nan')
    df['volume'] = float('nan')

print(f"\nLoaded {len(df)} rows")
print(f"Date range: {df['timestamp'].min()} to {df['timestamp'].max()}")
print(f"Columns: {list(df.columns[:15])}...")

# Check missing data
print(f"\nMissing data:")
print(f"  ETH price: {df['eth_price'].isna().sum()} rows")
print(f"  ETH volume: {df['eth_volume'].isna().sum()} rows")
print(f"  BTC price: {df['btc_price'].isna().sum()} rows")
print(f"  BTC volume: {df['btc_volume'].isna().sum()} rows")

Output file doesn't exist. Loading base: ./5s_data/eth_orderbook_coinbase_5s.csv

Loaded 276300 rows
Date range: 2025-12-01 02:57:15.727819 to 2025-12-18 22:05:26.176962
Columns: ['timestamp', 'bid_price_1', 'bid_vol_1', 'ask_price_1', 'ask_vol_1', 'bid_price_2', 'bid_vol_2', 'ask_price_2', 'ask_vol_2', 'bid_price_3', 'bid_vol_3', 'ask_price_3', 'ask_vol_3', 'bid_price_4', 'bid_vol_4']...

Missing data:
  ETH price: 276300 rows
  ETH volume: 276300 rows
  BTC price: 276300 rows
  BTC volume: 276300 rows


In [6]:
# Check which rows were forward-filled by detecting duplicates
print("\n=== FORWARD-FILLED DATA DETECTION ===")

# Sort by timestamp
df_sorted = df.sort_values('timestamp').reset_index(drop=True)

# Find rows where BTC price equals the previous row (forward-filled)
btc_price_ffilled = (df_sorted['btc_price'] == df_sorted['btc_price'].shift(1))
btc_volume_ffilled = (df_sorted['btc_volume'] == df_sorted['btc_volume'].shift(1))

# Find rows where ETH price equals the previous row
eth_price_ffilled = (df_sorted['eth_price'] == df_sorted['eth_price'].shift(1))
eth_volume_ffilled = (df_sorted['eth_volume'] == df_sorted['eth_volume'].shift(1))

print(f"\nBTC forward-filled rows (duplicate values):")
print(f"  BTC price duplicates: {btc_price_ffilled.sum()} rows ({100*btc_price_ffilled.sum()/len(df):.2f}%)")
print(f"  BTC volume duplicates: {btc_volume_ffilled.sum()} rows ({100*btc_volume_ffilled.sum()/len(df):.2f}%)")

print(f"\nETH forward-filled rows (duplicate values):")
print(f"  ETH price duplicates: {eth_price_ffilled.sum()} rows ({100*eth_price_ffilled.sum()/len(df):.2f}%)")
print(f"  ETH volume duplicates: {eth_volume_ffilled.sum()} rows ({100*eth_volume_ffilled.sum()/len(df):.2f}%)")

# Show runs of consecutive duplicates (likely forward-filled blocks)
def find_duplicate_runs(series):
    """Find consecutive runs of duplicate values"""
    is_dup = (series == series.shift(1))
    run_ids = (~is_dup).cumsum()
    runs = is_dup.groupby(run_ids).sum()
    long_runs = runs[runs > 10]  # Runs longer than 10
    return len(long_runs), long_runs.sum()

btc_price_runs, btc_price_run_total = find_duplicate_runs(df_sorted['btc_price'])
eth_price_runs, eth_price_run_total = find_duplicate_runs(df_sorted['eth_price'])

print(f"\nLong duplicate runs (>10 consecutive):")
print(f"  BTC price: {btc_price_runs} runs totaling {btc_price_run_total} rows")
print(f"  ETH price: {eth_price_runs} runs totaling {eth_price_run_total} rows")


=== FORWARD-FILLED DATA DETECTION ===

BTC forward-filled rows (duplicate values):
  BTC price duplicates: 0 rows (0.00%)
  BTC volume duplicates: 0 rows (0.00%)

ETH forward-filled rows (duplicate values):
  ETH price duplicates: 0 rows (0.00%)
  ETH volume duplicates: 0 rows (0.00%)

Long duplicate runs (>10 consecutive):
  BTC price: 0 runs totaling 0 rows
  ETH price: 0 runs totaling 0 rows


In [9]:
def get_coinbase_candles(product_id, start_time, end_time, granularity='ONE_MINUTE'):
    """
    Fetch historical candles from Coinbase API using official JWT generator.
    
    Args:
        product_id: Product identifier (e.g., 'ETH-USD', 'BTC-USD')
        start_time: datetime object
        end_time: datetime object
        granularity: 'ONE_MINUTE', 'FIVE_MINUTE', 'FIFTEEN_MINUTE', 'ONE_HOUR', etc.
    
    Returns:
        List of candles with [timestamp, low, high, open, close, volume]
    """
    start_unix = int(start_time.timestamp())
    end_unix = int(end_time.timestamp())
    
    request_path = f"/api/v3/brokerage/products/{product_id}/candles"
    url = f"{COINBASE_API_BASE}/products/{product_id}/candles"
    
    params = {
        'start': start_unix,
        'end': end_unix,
        'granularity': granularity
    }
    
    print(f"  Requesting: {product_id} from {start_time} to {end_time}")
    
    for attempt in range(3):
        try:
            # Generate JWT token using official Coinbase method
            jwt_token = generate_jwt_token("GET", request_path)
            
            headers = {
                'Authorization': f'Bearer {jwt_token}',
            }
            
            r = requests.get(url, headers=headers, params=params, timeout=10)
            print(f"  Status: {r.status_code}")
            
            if r.status_code == 429:
                print("  Rate limit, waiting...")
                time.sleep(2)
                continue
            elif r.status_code >= 500:
                print(f"  Server error, retrying...")
                time.sleep(2 ** attempt)
                continue
            elif r.status_code == 400:
                print(f"  Bad request: {r.text[:500]}")
                return []
            elif r.status_code == 401:
                print(f"  Auth error: {r.text[:200]}")
                return []
            
            r.raise_for_status()
            data = r.json()
            
            candles = data.get('candles', [])
            print(f"  Got {len(candles)} candles")
            return candles
            
        except Exception as e:
            print(f"  Error: {e}")
            if attempt < 2:
                time.sleep(2 ** attempt)
            else:
                return []
    
    return []            

In [None]:
# Only fetch data for missing timestamps
missing_eth = df[df['eth_price'].isna() | df['eth_volume'].isna()]
missing_btc = df[df['btc_price'].isna() | df['btc_volume'].isna()]

print(f"\nData to fetch:")
print(f"  ETH: {len(missing_eth)} rows need data")
print(f"  BTC: {len(missing_btc)} rows need data")

# Check data completeness - skip fetching if >= 95% complete
eth_completeness = 100 * (1 - len(missing_eth) / len(df))
btc_completeness = 100 * (1 - len(missing_btc) / len(df))

print(f"\nData completeness:")
print(f"  ETH: {eth_completeness:.2f}%")
print(f"  BTC: {btc_completeness:.2f}%")

COMPLETENESS_THRESHOLD = 95.0
skip_eth_fetch = eth_completeness >= COMPLETENESS_THRESHOLD
skip_btc_fetch = btc_completeness >= COMPLETENESS_THRESHOLD

if skip_eth_fetch:
    print(f"\n✓ ETH data is {eth_completeness:.2f}% complete (>= {COMPLETENESS_THRESHOLD}%) - skipping fetch, will forward-fill")
if skip_btc_fetch:
    print(f"✓ BTC data is {btc_completeness:.2f}% complete (>= {COMPLETENESS_THRESHOLD}%) - skipping fetch, will forward-fill")

# Coinbase API limit: max 350 candles per request
# For 1-minute candles: 350 minutes = ~5.8 hours, use 5 hours to be safe
chunk_size = timedelta(hours=5)

# Fetch ETH data using Coinbase API
if len(missing_eth) > 0 and not skip_eth_fetch:
    print("\n" + "="*60)
    print("FETCHING ETH DATA from Coinbase API")
    print("="*60)
    
    start_time = missing_eth['timestamp'].min()
    end_time = missing_eth['timestamp'].max()
    
    all_eth_candles = []
    current_start = start_time
    chunks_fetched = 0
    chunks_skipped = 0
    
    while current_start < end_time:
        current_end = min(current_start + chunk_size, end_time)
        
        # Check if this chunk has missing data
        chunk_missing = missing_eth[(missing_eth['timestamp'] >= current_start) & 
                                     (missing_eth['timestamp'] < current_end)]
        
        if len(chunk_missing) > 0:
            print(current_end)
            print(current_start)
            candles = get_coinbase_candles(ETH_PRODUCT_ID, current_start, current_end, granularity='ONE_MINUTE')
            if candles:
                all_eth_candles.extend(candles)
            chunks_fetched += 1
            time.sleep(0.2)  # Rate limiting: 10 req/sec = 0.1s, using 0.2s to be safe
        else:
            chunks_skipped += 1
        
        current_start = current_end
    
    # Convert Coinbase candles to DataFrame
    if all_eth_candles:
        eth_ohlcv_df = pd.DataFrame(all_eth_candles, columns=['start', 'low', 'high', 'open', 'close', 'volume'])
        eth_ohlcv_df['timestamp'] = pd.to_datetime(eth_ohlcv_df['start'], unit='s')
        eth_ohlcv_df['eth_price'] = pd.to_numeric(eth_ohlcv_df['close'])
        eth_ohlcv_df['eth_volume'] = pd.to_numeric(eth_ohlcv_df['volume'])
        print(f"\nTotal ETH: {len(eth_ohlcv_df)} candles from {chunks_fetched} chunks ({chunks_skipped} skipped)")
    else:
        eth_ohlcv_df = pd.DataFrame()
else:
    print("\nSkipping ETH fetch")
    eth_ohlcv_df = pd.DataFrame()

# Fetch BTC data using Coinbase API
if len(missing_btc) > 0 and not skip_btc_fetch:
    print("\n" + "="*60)
    print("FETCHING BTC DATA from Coinbase API")
    print("="*60)
    
    start_time = missing_btc['timestamp'].min()
    end_time = missing_btc['timestamp'].max()
    
    all_btc_candles = []
    current_start = start_time
    chunks_fetched = 0
    chunks_skipped = 0
    
    while current_start < end_time:
        current_end = min(current_start + chunk_size, end_time)
        
        # Check if this chunk has missing data
        chunk_missing = missing_btc[(missing_btc['timestamp'] >= current_start) & 
                                     (missing_btc['timestamp'] < current_end)]
        
        if len(chunk_missing) > 0:
            candles = get_coinbase_candles(BTC_PRODUCT_ID, current_start, current_end, granularity='ONE_MINUTE')
            if candles:
                all_btc_candles.extend(candles)
            chunks_fetched += 1
            time.sleep(0.2)  # Rate limiting
        else:
            chunks_skipped += 1
        
        current_start = current_end
    
    # Convert Coinbase candles to DataFrame
    if all_btc_candles:
        btc_ohlcv_df = pd.DataFrame(all_btc_candles, columns=['start', 'low', 'high', 'open', 'close', 'volume'])
        btc_ohlcv_df['timestamp'] = pd.to_datetime(btc_ohlcv_df['start'], unit='s')
        btc_ohlcv_df['btc_price'] = pd.to_numeric(btc_ohlcv_df['close'])
        btc_ohlcv_df['btc_volume'] = pd.to_numeric(btc_ohlcv_df['volume'])
        print(f"\nTotal BTC: {len(btc_ohlcv_df)} candles from {chunks_fetched} chunks ({chunks_skipped} skipped)")
    else:
        btc_ohlcv_df = pd.DataFrame()
else:
    print("\nSkipping BTC fetch")
    btc_ohlcv_df = pd.DataFrame()


Data to fetch:
  ETH: 276300 rows need data
  BTC: 276300 rows need data

Data completeness:
  ETH: 0.00%
  BTC: 0.00%

FETCHING ETH DATA from Coinbase API
2025-12-02 02:57:15.727819
2025-12-01 02:57:15.727819
  Requesting: ETH-USD from 2025-12-01 02:57:15.727819 to 2025-12-02 02:57:15.727819
  Status: 400
  Bad request: {"error":"INVALID_ARGUMENT","error_details":"start and end argument is invalid - number of candles requested should be less than 350 ","message":"start and end argument is invalid - number of candles requested should be less than 350 "}
2025-12-03 02:57:15.727819
2025-12-02 02:57:15.727819
  Requesting: ETH-USD from 2025-12-02 02:57:15.727819 to 2025-12-03 02:57:15.727819
  Status: 400
  Bad request: {"error":"INVALID_ARGUMENT","error_details":"start and end argument is invalid - number of candles requested should be less than 350 ","message":"start and end argument is invalid - number of candles requested should be less than 350 "}
2025-12-04 02:57:15.727819
2025-12-

KeyboardInterrupt: 

In [None]:
# Merge fetched data with orderbook snapshots
if len(eth_ohlcv_df) > 0:
    print(f"\nMerging ETH data:")
    print(f"  Timestamp range: {eth_ohlcv_df['timestamp'].min()} to {eth_ohlcv_df['timestamp'].max()}")
    
    # Merge with 60-second tolerance for 1-minute candles
    df_temp = pd.merge_asof(
        df.sort_values('timestamp'),
        eth_ohlcv_df[['timestamp', 'eth_price', 'eth_volume']].sort_values('timestamp'),
        on='timestamp',
        direction='nearest',
        tolerance=pd.Timedelta(seconds=60),
        suffixes=('', '_new')
    )
    
    # Only update rows that were missing
    eth_price_missing = df['eth_price'].isna()
    eth_volume_missing = df['eth_volume'].isna()
    
    df.loc[eth_price_missing, 'eth_price'] = df_temp.loc[eth_price_missing, 'eth_price_new']
    df.loc[eth_volume_missing, 'eth_volume'] = df_temp.loc[eth_volume_missing, 'eth_volume_new']
    
    filled_price = eth_price_missing.sum() - df['eth_price'].isna().sum()
    filled_volume = eth_volume_missing.sum() - df['eth_volume'].isna().sum()
    
    print(f"  Filled {filled_price} eth_price, {filled_volume} eth_volume")
    print(f"  Still missing: {df['eth_price'].isna().sum()} rows")

if len(btc_ohlcv_df) > 0:
    print(f"\nMerging BTC data:")
    print(f"  Timestamp range: {btc_ohlcv_df['timestamp'].min()} to {btc_ohlcv_df['timestamp'].max()}")
    
    # Merge with 60-second tolerance for 1-minute candles
    df_temp = pd.merge_asof(
        df.sort_values('timestamp'),
        btc_ohlcv_df[['timestamp', 'btc_price', 'btc_volume']].sort_values('timestamp'),
        on='timestamp',
        direction='nearest',
        tolerance=pd.Timedelta(seconds=60),
        suffixes=('', '_new')
    )
    
    # Only update rows that were missing
    btc_price_missing = df['btc_price'].isna()
    btc_volume_missing = df['btc_volume'].isna()
    
    df.loc[btc_price_missing, 'btc_price'] = df_temp.loc[btc_price_missing, 'btc_price_new']
    df.loc[btc_volume_missing, 'btc_volume'] = df_temp.loc[btc_volume_missing, 'btc_volume_new']
    
    filled_price = btc_price_missing.sum() - df['btc_price'].isna().sum()
    filled_volume = btc_volume_missing.sum() - df['btc_volume'].isna().sum()
    
    print(f"  Filled {filled_price} btc_price, {filled_volume} btc_volume")
    print(f"  Still missing: {df['btc_price'].isna().sum()} rows")

# Update backward compatibility columns
price_missing = df['price'].isna() if 'price' in df.columns else pd.Series([True] * len(df))
volume_missing = df['volume'].isna() if 'volume' in df.columns else pd.Series([True] * len(df))

df.loc[price_missing, 'price'] = df.loc[price_missing, 'eth_price']
df.loc[volume_missing, 'volume'] = df.loc[volume_missing, 'eth_volume']

In [None]:
# Forward-fill any remaining missing values (for BTC especially)
print("\n=== FORWARD-FILLING MISSING VALUES ===")

# Sort by timestamp to ensure proper forward-fill
df = df.sort_values('timestamp').reset_index(drop=True)

# Forward-fill BTC data
if df['btc_price'].isna().sum() > 0:
    btc_missing_before = df['btc_price'].isna().sum()
    df['btc_price'] = df['btc_price'].fillna(method='ffill')
    df['btc_volume'] = df['btc_volume'].fillna(method='ffill')
    btc_filled = btc_missing_before - df['btc_price'].isna().sum()
    print(f"Forward-filled {btc_filled} BTC rows")
    print(f"  BTC still missing: {df['btc_price'].isna().sum()} (likely at start of dataset)")
else:
    print("No BTC data to forward-fill")

# Forward-fill ETH data
if df['eth_price'].isna().sum() > 0:
    eth_missing_before = df['eth_price'].isna().sum()
    df['eth_price'] = df['eth_price'].fillna(method='ffill')
    df['eth_volume'] = df['eth_volume'].fillna(method='ffill')
    eth_filled = eth_missing_before - df['eth_price'].isna().sum()
    print(f"Forward-filled {eth_filled} ETH rows")
    print(f"  ETH still missing: {df['eth_price'].isna().sum()} (likely at start of dataset)")
else:
    print("No ETH data to forward-fill")

# Update backward compatibility columns
df['price'] = df['eth_price']
df['volume'] = df['eth_volume']


=== FORWARD-FILLING MISSING VALUES ===
No BTC data to forward-fill
No ETH data to forward-fill


In [None]:
# Save updated CSV
output_path = './5s_data/eth_orderbook_coinbase_5s_with_price_volume.csv'
df.to_csv(output_path, index=False)
print(f"\nSaved to: {output_path}")
print(f"Final shape: {df.shape}")

# Preview
print("\nPreview:")
display_cols = ['timestamp', 'bid_price_1', 'ask_price_1', 'eth_price', 'eth_volume', 'btc_price', 'btc_volume']
print(df[display_cols].head(10))

# Statistics
print("\n=== ETH STATISTICS ===")
print(f"  Price range: ${df['eth_price'].min():.2f} - ${df['eth_price'].max():.2f}")
print(f"  Mean volume: {df['eth_volume'].mean():.6f}")
print(f"  Total volume: {df['eth_volume'].sum():.2f}")

print("\n=== BTC STATISTICS ===")
print(f"  Price range: ${df['btc_price'].min():.2f} - ${df['btc_price'].max():.2f}")
print(f"  Mean volume: {df['btc_volume'].mean():.6f}")
print(f"  Total volume: {df['btc_volume'].sum():.2f}")

print("\n=== DATA COMPLETENESS ===")
print(f"  Total rows: {len(df)}")
print(f"  ETH price missing: {df['eth_price'].isna().sum()} ({100*df['eth_price'].isna().sum()/len(df):.2f}%)")
print(f"  BTC price missing: {df['btc_price'].isna().sum()} ({100*df['btc_price'].isna().sum()/len(df):.2f}%)")


Saved to: ./5s_data/eth_orderbook_coinbase_5s_with_price_volume.csv
Final shape: (276300, 47)

Preview:
                   timestamp  bid_price_1  ask_price_1  eth_price  eth_volume   
0 2025-12-01 02:57:15.727819      2841.16      2841.17   2841.165    0.597915  \
1 2025-12-01 02:57:21.073133      2841.10      2841.11   2841.085    0.777114   
2 2025-12-01 02:57:26.375175      2840.82      2841.00   2840.990    1.177497   
3 2025-12-01 02:57:31.660663      2841.00      2841.01   2841.005    0.025164   
4 2025-12-01 02:57:37.007031      2841.00      2841.01   2841.005    0.003338   
5 2025-12-01 02:57:42.671388      2841.11      2841.12   2842.230    2.630444   
6 2025-12-01 02:57:47.996316      2841.53      2841.54   2841.730    2.091136   
7 2025-12-01 02:57:53.320210      2841.72      2841.73   2841.500    0.349916   
8 2025-12-01 02:57:58.597944      2841.32      2841.33   2841.490    0.043777   
9 2025-12-01 02:58:03.907542      2841.32      2841.33   2841.505    0.065284   

   

In [None]:
# Deep dive: Compare fetched BTC data vs actual coverage
print("\n=== BTC DATA FETCH ANALYSIS ===")

# Check how much BTC data was actually fetched in the last run
if len(btc_ohlcv_df) > 0:
    print(f"\nBTC candles fetched from API: {len(btc_ohlcv_df)}")
    print(f"BTC timestamp range: {btc_ohlcv_df['timestamp'].min()} to {btc_ohlcv_df['timestamp'].max()}")
    print(f"BTC time span: {btc_ohlcv_df['timestamp'].max() - btc_ohlcv_df['timestamp'].min()}")
    
    # Calculate expected number of 5-second candles
    total_seconds = (btc_ohlcv_df['timestamp'].max() - btc_ohlcv_df['timestamp'].min()).total_seconds()
    expected_candles = int(total_seconds / 5)
    print(f"\nExpected 5-sec candles: {expected_candles}")
    print(f"Actual candles: {len(btc_ohlcv_df)}")
    print(f"API Coverage: {100 * len(btc_ohlcv_df) / expected_candles:.2f}%")
    
    # Check for gaps in fetched BTC data
    btc_ohlcv_df_sorted = btc_ohlcv_df.sort_values('timestamp').copy()
    btc_ohlcv_df_sorted['time_diff'] = btc_ohlcv_df_sorted['timestamp'].diff()
    large_gaps = btc_ohlcv_df_sorted[btc_ohlcv_df_sorted['time_diff'] > pd.Timedelta(seconds=10)]
    
    print(f"\nGaps in fetched BTC data (>10 sec): {len(large_gaps)}")
    if len(large_gaps) > 0:
        print(f"\nLargest 5 gaps in BTC API data:")
        print(large_gaps.nlargest(5, 'time_diff')[['timestamp', 'time_diff']])
else:
    print("\nNo BTC data was fetched (btc_ohlcv_df is empty)")

print("\n=== ORDERBOOK TIMESTAMP ANALYSIS ===")
print(f"Orderbook rows: {len(df)}")
print(f"Orderbook timestamp range: {df['timestamp'].min()} to {df['timestamp'].max()}")
print(f"Orderbook time span: {df['timestamp'].max() - df['timestamp'].min()}")

# Average time between orderbook snapshots
df_sorted = df.sort_values('timestamp').copy()
df_sorted['snapshot_diff'] = df_sorted['timestamp'].diff()
avg_snapshot_interval = df_sorted['snapshot_diff'].median()
print(f"Median snapshot interval: {avg_snapshot_interval}")
print(f"This suggests orderbook snapshots are ~{avg_snapshot_interval.total_seconds():.1f} seconds apart")


=== BTC DATA FETCH ANALYSIS ===

No BTC data was fetched (btc_ohlcv_df is empty)

=== ORDERBOOK TIMESTAMP ANALYSIS ===
Orderbook rows: 276300
Orderbook timestamp range: 2025-12-01 02:57:15.727819 to 2025-12-18 22:05:26.176962
Orderbook time span: 17 days 19:08:10.449143
Median snapshot interval: 0 days 00:00:05.353050
This suggests orderbook snapshots are ~5.4 seconds apart
