# Specific Assets Data Grabber

This notebook fetches historical price and volume data for a specific list of cryptocurrencies:
- OP, ARB, STRK, POL/MATIC
- UNI, AAVE, MKR, LINK
- SOL, AVAX, NEAR, ADA
- BNB, BTC, ETH

Based on the original data_grabber_with_micro_altcoin.ipynb

In [1]:
import requests
import pandas as pd
import time
from dotenv import load_dotenv
import os
from datetime import datetime, timedelta
from functools import reduce

load_dotenv()

# Constants
COINAPI_KEY = os.getenv('COIN_API_KEY')

GRANULARITY = 3600  # 1 hour in seconds
TOTAL_HOURS = 100000  # total number of hours to fetch
CHUNK_SIZE = 100   # CoinAPI limit per request

headers = {
    'Accept': 'application/json',
    'X-CoinAPI-Key': COINAPI_KEY
}

In [None]:
# Specific assets you requested with their CoinAPI symbol IDs
# Using multiple exchanges to ensure we find all assets
SPECIFIC_ASSETS = {
    'BTC': 'COINBASE_SPOT_BTC_USD',
    'ETH': 'COINBASE_SPOT_ETH_USD', 
    'BNB': 'BINANCE_SPOT_BNB_USDT',
    'SOL': 'COINBASE_SPOT_SOL_USD',
    'ADA': 'COINBASE_SPOT_ADA_USD',
    'AVAX': 'COINBASE_SPOT_AVAX_USD',
    'LINK': 'COINBASE_SPOT_LINK_USD',
    'UNI': 'COINBASE_SPOT_UNI_USD',
    'AAVE': 'COINBASE_SPOT_AAVE_USD',
    'MKR': 'COINBASE_SPOT_MKR_USD',
    'NEAR': 'COINBASE_SPOT_NEAR_USD',
    'OP': 'COINBASE_SPOT_OP_USD',
    'ARB': 'COINBASE_SPOT_ARB_USD',
    'STRK': 'BINANCE_SPOT_STRK_USDT',  # Newer token, try Binance
    'MATIC': 'COINBASE_SPOT_MATIC_USD'  # Use MATIC instead of POL for now
}

print(f"Will fetch data for {len(SPECIFIC_ASSETS)} assets:")
for name, symbol_id in SPECIFIC_ASSETS.items():
    print(f"  {name}: {symbol_id}")

Will fetch data for 15 assets:
  BTC: BINANCE_SPOT_BTC_USDT
  ETH: BINANCE_SPOT_ETH_USDT
  BNB: BINANCE_SPOT_BNB_USDT
  SOL: BINANCE_SPOT_SOL_USDT
  ADA: BINANCE_SPOT_ADA_USDT
  AVAX: BINANCE_SPOT_AVAX_USDT
  LINK: BINANCE_SPOT_LINK_USDT
  UNI: BINANCE_SPOT_UNI_USDT
  AAVE: BINANCE_SPOT_AAVE_USDT
  MKR: BINANCE_SPOT_MKR_USDT
  NEAR: BINANCE_SPOT_NEAR_USDT
  OP: BINANCE_SPOT_OP_USDT
  ARB: BINANCE_SPOT_ARB_USDT
  STRK: BINANCE_SPOT_STRK_USDT
  POL: BINANCE_SPOT_POL_USDT


In [3]:
def get_historic_candles(name, symbol_id, granularity, total_hours, chunk_size, now):
    all_data = []
    for i in range(0, total_hours, chunk_size):
        end_time = (now - timedelta(seconds=granularity * i)).replace(microsecond=0)
        start_time = (end_time - timedelta(seconds=granularity * chunk_size)).replace(microsecond=0)

        url = f"https://rest.coinapi.io/v1/ohlcv/{symbol_id}/history"
        params = {
            'period_id': '1HRS',
            'time_start': start_time.isoformat() + 'Z',
            'time_end': end_time.isoformat() + 'Z',
            'limit': chunk_size
        }

        for attempt in range(3):
            r = requests.get(url, headers=headers, params=params)
            if r.status_code == 429:
                print("Rate limit hit. Waiting 60 seconds...")
                time.sleep(60)
                continue
            r.raise_for_status()
            data = r.json()
            if len(data) == 0:  # Return immediately if no rows are fetched
                print(f"No data returned for {start_time} to {end_time}. Exiting.")
                return pd.DataFrame(all_data)  # Return what we have so far
            all_data.extend(data)
            print(f"Fetched {len(data)} rows from of {name} {start_time} to {end_time}")
            break
        else:
            raise Exception("Failed to fetch after multiple attempts.")

        time.sleep(1)  # Polite delay to avoid spamming API

    # Create DataFrame and rename columns
    df = pd.DataFrame(all_data)
    df = df.rename(columns={
        'time_period_start': 'time',
        'price_close': f'{name}_close',
        'volume_traded': f'{name}_volume'
    })
    df['time'] = pd.to_datetime(df['time'])

    # Drop all other columns and keep only time, close, and volume
    df = df[['time', f'{name}_close', f'{name}_volume']]
    df = df.sort_values('time').reset_index(drop=True)
    return df

In [4]:
# Create data directory if it doesn't exist
os.makedirs('./data', exist_ok=True)

# Fetch data for each specific asset
nearest_hour_rounded_down = datetime.utcnow().replace(minute=0, second=0, microsecond=0)  # Round down to the nearest hour

successful_fetches = []
failed_fetches = []

for name, symbol_id in SPECIFIC_ASSETS.items():
    print(f"\n🔄 Fetching data for {name} ({symbol_id})...")
    try:
        df = get_historic_candles(name, symbol_id, GRANULARITY, TOTAL_HOURS, CHUNK_SIZE, nearest_hour_rounded_down)
        if df is not None and not df.empty:  # Null and empty check
            df.to_csv(f"./data/{name}_data.csv", index=False)  # Save to a CSV file
            print(f"✅ Saved {len(df)} rows for {name} to {name}_data.csv")
            successful_fetches.append(name)
        else:
            print(f"❌ No data fetched for {name} ({symbol_id}). Skipping.")
            failed_fetches.append(name)
    except Exception as e:
        print(f"❌ Error fetching {name}: {e}")
        failed_fetches.append(name)
    
    time.sleep(2)  # Polite delay between assets

print(f"\n📊 Summary:")
print(f"✅ Successfully fetched: {len(successful_fetches)} assets")
print(f"❌ Failed to fetch: {len(failed_fetches)} assets")
if failed_fetches:
    print(f"Failed assets: {', '.join(failed_fetches)}")


🔄 Fetching data for BTC (BINANCE_SPOT_BTC_USDT)...


  nearest_hour_rounded_down = datetime.utcnow().replace(minute=0, second=0, microsecond=0)  # Round down to the nearest hour


Fetched 100 rows from of BTC 2025-10-16 12:00:00 to 2025-10-20 16:00:00
Fetched 100 rows from of BTC 2025-10-12 08:00:00 to 2025-10-16 12:00:00
Fetched 100 rows from of BTC 2025-10-12 08:00:00 to 2025-10-16 12:00:00
Fetched 100 rows from of BTC 2025-10-08 04:00:00 to 2025-10-12 08:00:00
Fetched 100 rows from of BTC 2025-10-08 04:00:00 to 2025-10-12 08:00:00
Fetched 100 rows from of BTC 2025-10-04 00:00:00 to 2025-10-08 04:00:00
Fetched 100 rows from of BTC 2025-10-04 00:00:00 to 2025-10-08 04:00:00
Fetched 100 rows from of BTC 2025-09-29 20:00:00 to 2025-10-04 00:00:00
Fetched 100 rows from of BTC 2025-09-29 20:00:00 to 2025-10-04 00:00:00
Fetched 100 rows from of BTC 2025-09-25 16:00:00 to 2025-09-29 20:00:00
Fetched 100 rows from of BTC 2025-09-25 16:00:00 to 2025-09-29 20:00:00
Fetched 100 rows from of BTC 2025-09-21 12:00:00 to 2025-09-25 16:00:00
Fetched 100 rows from of BTC 2025-09-21 12:00:00 to 2025-09-25 16:00:00
Fetched 100 rows from of BTC 2025-09-17 08:00:00 to 2025-09-21 1

In [5]:
def prep(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    # normalize *_close -> *_price (optional)
    rename_map = {c: f"{c.rsplit('_',1)[0]}_price" for c in df.columns if c.endswith('_close')}
    if rename_map:
        df = df.rename(columns=rename_map)

    # parse & sort
    df['time'] = pd.to_datetime(df['time'], format='ISO8601', utc=True)
    df = df.drop_duplicates('time').sort_values('time').set_index('time')

    # optional guards
    for c in df.columns:
        if c.endswith('_price'):
            df[c] = df[c].mask(df[c] <= 0)  # avoid bad logs later
    return df

def merge_asof_many(dfs, tolerance='5min', direction='nearest'):
    dfs = [prep(d) for d in dfs]
    # choose a calendar to control the output timeline:
    # here: use the intersection of timestamps across all dfs (robust),
    # or pick the densest df as the first/left.
    # Simple: start with the densest df as left:
    left = max(dfs, key=lambda d: len(d))
    others = [d for d in dfs if d is not left]

    tol = pd.Timedelta(tolerance)

    def merge_two(left, right):
        # Avoid column collisions: add suffixes if needed
        intersect = set(left.columns).intersection(right.columns)
        if intersect:
            right = right.rename(columns={c: f"{c}_r" for c in intersect})
        return pd.merge_asof(
            left.sort_index(),
            right.sort_index(),
            left_index=True,
            right_index=True,
            direction=direction,
            tolerance=tol
        )

    merged = reduce(merge_two, others, left)
    return merged.reset_index().rename(columns={'index': 'time'})

In [6]:
# Combine all individual CSV files into one dataset
csv_directory = "./data"  
output_file = "specific_assets_dataset.csv"

# List all CSV files in the directory
csv_files = [f for f in os.listdir(csv_directory) if f.endswith('.csv')]
print(f"Found {len(csv_files)} CSV files: {csv_files}")

# Read all CSV files into a list of DataFrames
dataframes = []
for csv_file in csv_files:
    file_path = os.path.join(csv_directory, csv_file)
    print(f"📂 Processing {file_path}...")
    df = pd.read_csv(file_path)
    
    # Check if the CSV has the specific column format
    if list(df.columns) == [
        "time_period_start", "time_period_end", "time_open", "time_close",
        "price_open", "price_high", "price_low", "price_close",
        "volume_traded", "trades_count"
    ]:
        # Rename columns to the correct format
        asset_name = csv_file.split('_')[0]
        df = df.rename(columns={
            "time_period_start": "time",
            "price_close": f"{asset_name}_close",
            "volume_traded": f"{asset_name}_volume"
        })
        # Keep only the relevant columns
        df = df[["time", f"{asset_name}_close", f"{asset_name}_volume"]]
    
    print(f"  📊 Loaded {len(df)} rows for {csv_file}")
    dataframes.append(df)

print(f"\n🔄 Merging {len(dataframes)} DataFrames...")

# Merge all DataFrames on the 'time' column
merged = merge_asof_many(dataframes, tolerance='5min', direction='nearest')

print(f"📊 Merged dataset shape: {merged.shape}")
print(f"📊 Columns: {list(merged.columns)}")

# Save the combined DataFrame to a new CSV file
merged.to_csv(output_file, index=False)
print(f"✅ Combined CSV saved to {output_file}")

Found 27 CSV files: ['ENA_data.csv', 'TAO_data.csv', 'MNT_data.csv', 'AAVE_data.csv', 'SOL_data.csv', 'SHIB_data.csv', 'XLM_data.csv', 'ETC_data.csv', 'ETH_data.csv', 'PEPE_data.csv', 'UNI_data.csv', 'BTC_data.csv', 'SUI_data.csv', 'XMR_data.csv', 'LINK_data.csv', 'DOGE_data.csv', 'WLFI_data.csv', 'AVAX_data.csv', 'LTC_data.csv', 'XRP_data.csv', 'TRX_data.csv', 'BCH_data.csv', 'DOT_data.csv', 'TON_data.csv', 'ADA_data.csv', 'HYPE_data.csv', 'HBAR_data.csv']
📂 Processing ./data/ENA_data.csv...
  📊 Loaded 2156 rows for ENA_data.csv
📂 Processing ./data/TAO_data.csv...
  📊 Loaded 4636 rows for TAO_data.csv
📂 Processing ./data/MNT_data.csv...
  📊 Loaded 8078 rows for MNT_data.csv
📂 Processing ./data/AAVE_data.csv...
  📊 Loaded 40979 rows for AAVE_data.csv
📂 Processing ./data/SOL_data.csv...
  📊 Loaded 45440 rows for SOL_data.csv
📂 Processing ./data/SHIB_data.csv...
  📊 Loaded 45440 rows for SOL_data.csv
📂 Processing ./data/SHIB_data.csv...
  📊 Loaded 34701 rows for SHIB_data.csv
📂 Processin

In [7]:
# Preview the final dataset
df = pd.read_csv("./specific_assets_dataset.csv")
print(f"📊 Final dataset shape: {df.shape}")
print(f"📊 Date range: {df['time'].min()} to {df['time'].max()}")
print(f"📊 Columns: {list(df.columns)}")
print("\n📊 First few rows:")
df.head()

📊 Final dataset shape: (74683, 55)
📊 Date range: 2017-01-27 14:00:00+00:00 to 2025-09-03 15:00:00+00:00
📊 Columns: ['time', 'LTC_price', 'LTC_volume', 'ENA_price', 'ENA_volume', 'TAO_price', 'TAO_volume', 'MNT_price', 'MNT_volume', 'AAVE_price', 'AAVE_volume', 'SOL_price', 'SOL_volume', 'SHIB_price', 'SHIB_volume', 'XLM_price', 'XLM_volume', 'ETC_price', 'ETC_volume', 'ETH_price', 'ETH_volume', 'PEPE_price', 'PEPE_volume', 'UNI_price', 'UNI_volume', 'BTC_price', 'BTC_volume', 'SUI_price', 'SUI_volume', 'XMR_price', 'XMR_volume', 'LINK_price', 'LINK_volume', 'DOGE_price', 'DOGE_volume', 'WLFI_price', 'WLFI_volume', 'AVAX_price', 'AVAX_volume', 'XRP_price', 'XRP_volume', 'TRX_price', 'TRX_volume', 'BCH_price', 'BCH_volume', 'DOT_price', 'DOT_volume', 'TON_price', 'TON_volume', 'ADA_price', 'ADA_volume', 'HYPE_price', 'HYPE_volume', 'HBAR_price', 'HBAR_volume']

📊 First few rows:


Unnamed: 0,time,LTC_price,LTC_volume,ENA_price,ENA_volume,TAO_price,TAO_volume,MNT_price,MNT_volume,AAVE_price,...,DOT_price,DOT_volume,TON_price,TON_volume,ADA_price,ADA_volume,HYPE_price,HYPE_volume,HBAR_price,HBAR_volume
0,2017-01-27 14:00:00+00:00,3.83,0.280952,,,,,,,,...,,,,,,,,,,
1,2017-01-30 18:00:00+00:00,3.96,0.06165,,,,,,,,...,,,,,,,,,,
2,2017-01-30 23:00:00+00:00,4.06,1.0,,,,,,,,...,,,,,,,,,,
3,2017-01-31 12:00:00+00:00,4.03,0.125934,,,,,,,,...,,,,,,,,,,
4,2017-02-04 12:00:00+00:00,3.97,0.079648,,,,,,,,...,,,,,,,,,,


In [8]:
# Data quality check
print("📊 Data Quality Summary:")
print(f"Total rows: {len(df)}")
print(f"Date range: {df['time'].min()} to {df['time'].max()}")
print("\nMissing data by asset:")

# Check missing data for each asset
assets = [col.split('_')[0] for col in df.columns if '_price' in col]
for asset in assets:
    price_col = f"{asset}_price"
    volume_col = f"{asset}_volume"
    
    if price_col in df.columns:
        missing_price = df[price_col].isna().sum()
        missing_volume = df[volume_col].isna().sum() if volume_col in df.columns else "N/A"
        coverage = (1 - missing_price / len(df)) * 100
        print(f"  {asset:6s}: {missing_price:6d} missing price ({coverage:5.1f}% coverage)")

print(f"\n✅ Dataset ready for training! Saved as 'specific_assets_dataset.csv'")

📊 Data Quality Summary:
Total rows: 74683
Date range: 2017-01-27 14:00:00+00:00 to 2025-09-03 15:00:00+00:00

Missing data by asset:
  LTC   :      0 missing price (100.0% coverage)
  ENA   :  72527 missing price (  2.9% coverage)
  TAO   :  70047 missing price (  6.2% coverage)
  MNT   :  66609 missing price ( 10.8% coverage)
  AAVE  :  33716 missing price ( 54.9% coverage)
  SOL   :  30719 missing price ( 58.9% coverage)
  SHIB  :  39982 missing price ( 46.5% coverage)
  XLM   :  18393 missing price ( 75.4% coverage)
  ETC   :  13220 missing price ( 82.3% coverage)
  ETH   :   4681 missing price ( 93.7% coverage)
  PEPE  :  67631 missing price (  9.4% coverage)
  UNI   :  31609 missing price ( 57.7% coverage)
  BTC   :   4680 missing price ( 93.7% coverage)
  SUI   :  54748 missing price ( 26.7% coverage)
  XMR   :   1523 missing price ( 98.0% coverage)
  LINK  :  20874 missing price ( 72.0% coverage)
  DOGE  :  37617 missing price ( 49.6% coverage)
  WLFI  :  74633 missing price (  