In [1]:
import os
import pandas as pd
import requests
import time
from datetime import datetime, timedelta
from dotenv import load_dotenv

load_dotenv()

True

In [2]:
# Assets needed (from eth_xgboost_golden_trio)
assets = [
    "BTC", "AAVE", "SOL", "STRK", "ETH", "OP", "UNI", "LINK", "MKR",
    "ARB", "AVAX", "BNB", "NEAR", "ADA", "POL"
]

# Symbol mapping (from specific_assets_data_grabber)
ASSET_SYMBOLS = {
    'BTC': 'COINBASE_SPOT_BTC_USD',
    'ETH': 'COINBASE_SPOT_ETH_USD', 
    'SOL': 'COINBASE_SPOT_SOL_USD',
    'ADA': 'COINBASE_SPOT_ADA_USD',
    'AVAX': 'COINBASE_SPOT_AVAX_USD',
    'LINK': 'COINBASE_SPOT_LINK_USD',
    'UNI': 'COINBASE_SPOT_UNI_USD',
    'AAVE': 'COINBASE_SPOT_AAVE_USD',
    'ARB': 'COINBASE_SPOT_ARB_USD',
    'MKR': 'COINBASE_SPOT_MKR_USD',
    'NEAR': 'KRAKEN_SPOT_NEAR_USD',
    'BNB': 'KRAKEN_SPOT_BNB_USD',
    'STRK': 'KRAKEN_SPOT_STRK_USD',
    'POL': 'KRAKEN_SPOT_POL_USD',
    'OP': 'BITSTAMP_SPOT_OP_USD'
}

print(f"üìä Assets to fetch: {len(assets)}")
print(f"   {', '.join(assets)}")

üìä Assets to fetch: 15
   BTC, AAVE, SOL, STRK, ETH, OP, UNI, LINK, MKR, ARB, AVAX, BNB, NEAR, ADA, POL


In [3]:
# ---- Config ----
base_path = "./data"
ref_file = os.path.join(base_path, "ETHUSD_5s.csv")
output_file = os.path.join(base_path, "combined_assets_5s.csv")

COINAPI_KEY = os.getenv('COIN_API_KEY')
GRANULARITY_5S = 5  # 5 seconds
MAX_RETRIES = 3
NO_DATA_THRESHOLD = 3

headers = {
    'Accept': 'application/json',
    'X-CoinAPI-Key': COINAPI_KEY
}

print(f"üîë API Key loaded: {'‚úÖ Yes' if COINAPI_KEY else '‚ùå No'}")
print(f"‚è±Ô∏è  Granularity: {GRANULARITY_5S} seconds")

üîë API Key loaded: ‚úÖ Yes
‚è±Ô∏è  Granularity: 5 seconds


In [4]:
# ---- Load reference times from ETHUSD_5s.csv ----
print("üìÇ Loading reference timeline from ETHUSD_5s.csv...")
ref = pd.read_csv(ref_file)

# Find timestamp column
time_col = None
for possible_col in ['timestamp', 'time', 'datetime', 'Date']:
    if possible_col in ref.columns:
        time_col = possible_col
        break

if not time_col:
    raise ValueError(f"Could not find timestamp column in {ref_file}. Columns: {ref.columns.tolist()}")

ref[time_col] = pd.to_datetime(ref[time_col], utc=True)
ref = ref.rename(columns={time_col: 'timestamp'})
ref = ref[['timestamp']].sort_values('timestamp').drop_duplicates()

print(f"‚úÖ Reference timeline loaded")
print(f"   Time range: {ref['timestamp'].min()} to {ref['timestamp'].max()}")
print(f"   Total timestamps: {len(ref):,}")
print(f"   Duration: {(ref['timestamp'].max() - ref['timestamp'].min()).days} days")

# Get time range for queries
start_time = ref['timestamp'].min().to_pydatetime().replace(tzinfo=None)
end_time = ref['timestamp'].max().to_pydatetime().replace(tzinfo=None)

üìÇ Loading reference timeline from ETHUSD_5s.csv...
‚úÖ Reference timeline loaded
   Time range: 2025-12-21 18:16:07.574055+00:00 to 2026-02-07 02:47:11.595901+00:00
   Total timestamps: 767,250
   Duration: 47 days


In [5]:
# ---- Helper: Fetch ETH Funding Rates ----
def fetch_eth_funding_rates(start_time, end_time):
    """
    Fetch ETH perpetual futures funding rates.
    Note: Funding rates are typically available for perpetual futures markets.
    """
    # Try common perpetual symbols
    perp_symbols = [
        'BINANCE_PERP_ETH_USDT',
        'DERIBIT_FUT_ETH_PERPETUAL',
        'BYBIT_PERP_ETH_USDT'
    ]
    
    for symbol in perp_symbols:
        try:
            print(f"üîÑ Trying funding rates for {symbol}...")
            
            # CoinAPI doesn't have a dedicated funding rates endpoint in free tier
            # Alternative: Use quotes or metrics endpoint if available
            # For now, we'll return None and document this limitation
            print(f"   ‚ö†Ô∏è  Funding rates require specialized CoinAPI subscription")
            print(f"   ‚ÑπÔ∏è  Consider using exchange-specific APIs (Binance, Bybit, etc.)")
            return None
            
        except Exception as e:
            print(f"   ‚ùå Failed for {symbol}: {e}")
            continue
    
    return None

In [6]:
# ---- Helper: Fetch ETH Trades Data ----
def fetch_eth_trades(start_time, end_time, symbol_id='COINBASE_SPOT_ETH_USD'):
    """
    Fetch ETH trade data from CoinAPI and aggregate to 5s intervals.
    Note: CoinAPI trades endpoint requires date-by-date queries for historical data.
    """
    url = f"https://rest.coinapi.io/v1/trades/{symbol_id}/history"
    
    all_trades = []
    current_date = start_time.date()
    end_date = end_time.date()
    
    print(f"üîÑ Fetching ETH trades from {current_date} to {end_date}...")
    
    while current_date <= end_date:
        params = {
            'time_start': datetime.combine(current_date, datetime.min.time()).isoformat() + 'Z',
            'time_end': datetime.combine(current_date, datetime.max.time()).isoformat() + 'Z',
            'limit': 100000
        }
        
        for attempt in range(MAX_RETRIES):
            try:
                r = requests.get(url, headers=headers, params=params, timeout=30)
                
                if r.status_code == 429:
                    print(f"   ‚è≥ Rate limit hit. Waiting 60 seconds...")
                    time.sleep(60)
                    continue
                    
                r.raise_for_status()
                data = r.json()
                
                if data:
                    all_trades.extend(data)
                    print(f"   ‚úÖ {current_date}: {len(data)} trades")
                else:
                    print(f"   ‚ö†Ô∏è  {current_date}: No trades")
                    
                break
                
            except Exception as e:
                if attempt == MAX_RETRIES - 1:
                    print(f"   ‚ùå Failed for {current_date}: {e}")
                print(f"   ‚ö†Ô∏è  Attempt {attempt + 1} failed. Retrying...")
                time.sleep(5)
        
        current_date += timedelta(days=1)
        time.sleep(1)  # Polite delay between days
    
    if not all_trades:
        print(f"   ‚ùå No trade data retrieved")
        return None
    
    # Process trades
    df = pd.DataFrame(all_trades)
    df['time_coinapi'] = pd.to_datetime(df['time_coinapi'])
    df = df.rename(columns={'time_coinapi': 'timestamp'})
    
    # Aggregate to 5s intervals
    df = df.set_index('timestamp')
    agg = df.resample('5s').agg({
        'price': 'mean',
        'size': 'sum'
    }).reset_index()
    
    agg = agg.rename(columns={
        'price': 'ETH_trade_price_mean',
        'size': 'ETH_trade_volume_sum'
    })
    
    print(f"   ‚úÖ Aggregated to {len(agg)} 5s intervals")
    return agg

In [7]:
# ---- Helper: Fetch OHLCV Data from CoinAPI ----
def fetch_ohlcv_data(asset_name, symbol_id, start_time, end_time, period='5SEC'):
    """
    Fetch OHLCV data from CoinAPI for the given time range.
    Returns DataFrame with [timestamp, price, volume].
    """
    url = f"https://rest.coinapi.io/v1/ohlcv/{symbol_id}/history"
    params = {
        'period_id': period,
        'time_start': start_time.isoformat() + 'Z',
        'time_end': end_time.isoformat() + 'Z',
        'limit': 10000  # CoinAPI max
    }
    
    all_data = []
    empty_count = 0
    
    # Fetch in chunks if needed
    current_start = start_time
    while current_start < end_time:
        params['time_start'] = current_start.isoformat() + 'Z'
        
        for attempt in range(MAX_RETRIES):
            try:
                r = requests.get(url, headers=headers, params=params, timeout=30)
                
                if r.status_code == 429:
                    print(f"   ‚è≥ Rate limit hit. Waiting 60 seconds...")
                    time.sleep(60)
                    continue
                    
                r.raise_for_status()
                data = r.json()
                
                if len(data) == 0:
                    empty_count += 1
                    if empty_count >= NO_DATA_THRESHOLD:
                        print(f"   ‚ö†Ô∏è  No more data available for {asset_name}")
                        return pd.DataFrame(all_data) if all_data else pd.DataFrame()
                    break
                else:
                    empty_count = 0
                    all_data.extend(data)
                    print(f"   ‚úÖ Fetched {len(data)} rows for {asset_name} | Total: {len(all_data)}")
                    
                    # Update start time for next chunk
                    if len(data) > 0:
                        last_time = pd.to_datetime(data[-1]['time_period_end'])
                        current_start = last_time
                    else:
                        break
                        
                    if len(data) < 10000:  # Less than max, we're done
                        return pd.DataFrame(all_data)
                    break
                    
            except Exception as e:
                if attempt == MAX_RETRIES - 1:
                    print(f"   ‚ùå Failed after {MAX_RETRIES} attempts: {e}")
                    return pd.DataFrame(all_data) if all_data else pd.DataFrame()
                print(f"   ‚ö†Ô∏è  Attempt {attempt + 1} failed: {e}. Retrying...")
                time.sleep(5)
        
        time.sleep(0.5)  # Polite delay
    
    return pd.DataFrame(all_data)

In [8]:
# ---- Helper: Process Asset Data ----
def fetch_asset_data(asset_name, start_time, end_time):
    """Fetch and process asset data"""
    symbol_id = ASSET_SYMBOLS.get(asset_name)
    if not symbol_id:
        print(f"‚ùå No symbol mapping for {asset_name}")
        return None
    
    print(f"üîÑ Fetching {asset_name} ({symbol_id})...")
    df = fetch_ohlcv_data(asset_name, symbol_id, start_time, end_time)
    
    if df.empty:
        print(f"   ‚ùå No data returned for {asset_name}")
        return None
    
    # Process data
    df = df.rename(columns={
        'time_period_start': 'timestamp',
        'price_close': f'{asset_name}_price',
        'volume_traded': f'{asset_name}_volume'
    })
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df = df[['timestamp', f'{asset_name}_price', f'{asset_name}_volume']]
    df = df.sort_values('timestamp').drop_duplicates('timestamp')
    
    print(f"   ‚úÖ {asset_name}: {len(df)} rows")
    return df

In [9]:
# ---- Fetch ETH Funding Rates ----
print("\n" + "="*70)
print("üéØ FETCHING ETH FUNDING RATES")
print("="*70)

try:
    funding_df = fetch_eth_funding_rates(start_time, end_time)
    
    if funding_df is not None and not funding_df.empty:
        print("üîÑ Merging funding rates with timeline...")
        merged = pd.merge_asof(
            merged.sort_values('timestamp'),
            funding_df.sort_values('timestamp'),
            on='timestamp',
            direction='backward'
        )
        print(f"‚úÖ Funding rates merged: {funding_df.shape}")
    else:
        print("‚ö†Ô∏è  No funding rates data available (requires specialized subscription)")
        print("‚ÑπÔ∏è  Consider fetching directly from Binance/Bybit APIs if needed")
        
except Exception as e:
    print(f"‚ùå Funding rates fetch failed: {e}")


üéØ FETCHING ETH FUNDING RATES
üîÑ Trying funding rates for BINANCE_PERP_ETH_USDT...
   ‚ö†Ô∏è  Funding rates require specialized CoinAPI subscription
   ‚ÑπÔ∏è  Consider using exchange-specific APIs (Binance, Bybit, etc.)
‚ö†Ô∏è  No funding rates data available (requires specialized subscription)
‚ÑπÔ∏è  Consider fetching directly from Binance/Bybit APIs if needed


In [10]:
# ---- Fetch ETH Trades Data ----
print("\n" + "="*70)
print("üéØ FETCHING ETH TRADES DATA")
print("="*70)

try:
    trades_df = fetch_eth_trades(start_time, end_time)
    
    if trades_df is not None and not trades_df.empty:
        print("üîÑ Merging ETH trades with timeline...")
        merged = pd.merge_asof(
            merged.sort_values('timestamp'),
            trades_df.sort_values('timestamp'),
            on='timestamp',
            direction='backward',
            tolerance=pd.Timedelta('5s')
        )
        print(f"‚úÖ ETH trades merged: {trades_df.shape}")
    else:
        print("‚ö†Ô∏è  No ETH trades data available")
        
except Exception as e:
    print(f"‚ùå ETH trades fetch failed: {e}")


üéØ FETCHING ETH TRADES DATA
üîÑ Fetching ETH trades from 2025-12-21 to 2026-02-07...
   ‚ö†Ô∏è  Attempt 1 failed. Retrying...
   ‚ö†Ô∏è  Attempt 2 failed. Retrying...
   ‚ùå Failed for 2025-12-21: 400 Client Error: Bad Request for url: https://rest.coinapi.io/v1/trades/COINBASE_SPOT_ETH_USD/history?time_start=2025-12-21T00%3A00%3A00Z&time_end=2025-12-21T23%3A59%3A59.999999Z&limit=100000
   ‚ö†Ô∏è  Attempt 3 failed. Retrying...
   ‚ö†Ô∏è  Attempt 1 failed. Retrying...
   ‚ö†Ô∏è  Attempt 2 failed. Retrying...
   ‚ùå Failed for 2025-12-22: 400 Client Error: Bad Request for url: https://rest.coinapi.io/v1/trades/COINBASE_SPOT_ETH_USD/history?time_start=2025-12-22T00%3A00%3A00Z&time_end=2025-12-22T23%3A59%3A59.999999Z&limit=100000
   ‚ö†Ô∏è  Attempt 3 failed. Retrying...


KeyboardInterrupt: 

In [None]:
# ---- Fetch all asset data ----
print("\n" + "="*70)
print("üéØ FETCHING ASSET DATA FROM COINAPI")
print("="*70)

merged = ref.copy()
successful_assets = []
failed_assets = []

for asset in assets:
    try:
        asset_df = fetch_asset_data(asset, start_time, end_time)
        
        if asset_df is not None and not asset_df.empty:
            # Merge with reference timeline
            merged = pd.merge(merged, asset_df, on='timestamp', how='left')
            successful_assets.append(asset)
            print(f"   ‚úÖ {asset} merged successfully")
        else:
            failed_assets.append(asset)
            print(f"   ‚ùå {asset} failed - no data")
            
        time.sleep(2)  # Polite delay between assets
        
    except Exception as e:
        failed_assets.append(asset)
        print(f"   ‚ùå {asset} failed: {e}")

print(f"\nüìä Asset Fetch Results:")
print(f"   ‚úÖ Successful: {len(successful_assets)}/{len(assets)}")
print(f"   ‚ùå Failed: {len(failed_assets)}/{len(assets)}")
if failed_assets:
    print(f"   Failed assets: {', '.join(failed_assets)}")

print(f"\nüìä Merged shape: {merged.shape}")
merged.head()

In [None]:
# ---- Load reference times from ETHUSD_5s.csv ----
print("üìÇ Loading reference timeline from ETHUSD_5s.csv...")
ref = pd.read_csv(ref_file)

# Find timestamp column
time_col = None
for possible_col in ['timestamp', 'time', 'datetime', 'Date']:
    if possible_col in ref.columns:
        time_col = possible_col
        break

if not time_col:
    raise ValueError(f"Could not find timestamp column in {ref_file}. Columns: {ref.columns.tolist()}")

ref[time_col] = pd.to_datetime(ref[time_col], utc=True)
ref = ref.rename(columns={time_col: 'timestamp'})
ref = ref[['timestamp']].sort_values('timestamp').drop_duplicates()

print(f"‚úÖ Reference timeline loaded")
print(f"   Time range: {ref['timestamp'].min()} to {ref['timestamp'].max()}")
print(f"   Total timestamps: {len(ref):,}")
print(f"   Duration: {(ref['timestamp'].max() - ref['timestamp'].min()).days} days")

# Get time range for queries
start_time = ref['timestamp'].min().to_pydatetime().replace(tzinfo=None)
end_time = ref['timestamp'].max().to_pydatetime().replace(tzinfo=None)

In [None]:
# ---- Save final dataset ----
print("\n" + "="*70)
print("üíæ SAVING FINAL DATASET")
print("="*70)

# Create save directory if needed
os.makedirs(save_path, exist_ok=True)

merged.to_csv(output_file, index=False)

print(f"‚úÖ Dataset saved: {output_file}")
print(f"üìä Final shape: {merged.shape}")
print(f"üìÖ Date range: {merged['timestamp'].min()} to {merged['timestamp'].max()}")
print(f"üìè Duration: {(merged['timestamp'].max() - merged['timestamp'].min()).days} days")

# Data quality report
print(f"\nüìà DATA COVERAGE REPORT:")
print(f"{'Column':<30} {'Non-Null':<12} {'Coverage %':<12}")
print("-" * 54)

for col in merged.columns:
    if col != 'timestamp':
        non_null = merged[col].notna().sum()
        coverage = (non_null / len(merged)) * 100
        print(f"{col:<30} {non_null:<12,} {coverage:<11.2f}%")

file_size_mb = os.path.getsize(output_file) / (1024*1024)
print(f"\nüíæ File size: {file_size_mb:.2f} MB")
print(f"\nüéâ Dataset ready for backtesting!")

merged.info()