# Specific Assets Data Grabber (Minute-Level Data)

This notebook fetches **minute-level** historical price and volume data for 15 specific cryptocurrencies:
- **Major:** BTC, ETH, BNB, SOL, ADA, AVAX, LINK
- **DeFi:** UNI, AAVE, MKR
- **Layer 2:** OP, ARB, STRK, POL/MATIC
- **Emerging:** NEAR

**Key Features:**
- **Minute-level granularity** (60-second candles)
- **Reverse chronological fetching** - starts from NOW and works backwards
- **Smart early stopping** - stops fetching when no more data is available for an asset
- Consolidated, non-redundant code
- Smart symbol resolution across multiple exchanges
- Comprehensive error handling and retry logic
- Final dataset statistics and quality report

In [11]:
import requests
import pandas as pd
import time
import os
from datetime import datetime, timedelta
from functools import reduce
from dotenv import load_dotenv

load_dotenv()

# Configuration
COINAPI_KEY = os.getenv('COIN_API_KEY')
GRANULARITY = 60  # 1 minute in seconds (changed from 3600)
TOTAL_MINUTES = 1051200  # total number of minutes to fetch (~2 years)
CHUNK_SIZE = 10000   # CoinAPI limit per request
MAX_RETRIES = 3
NO_DATA_THRESHOLD = 3  # Stop after this many consecutive empty responses

headers = {
    'Accept': 'application/json',
    'X-CoinAPI-Key': COINAPI_KEY
}

print(f"üîë API Key loaded: {'‚úÖ Yes' if COINAPI_KEY else '‚ùå No'}")
print(f"‚è±Ô∏è  Granularity: 1 minute")
print(f"üìä Target: {TOTAL_MINUTES:,} minutes (~{TOTAL_MINUTES/60/24/365:.1f} years)")

üîë API Key loaded: ‚úÖ Yes
‚è±Ô∏è  Granularity: 1 minute
üìä Target: 1,051,200 minutes (~2.0 years)


In [12]:
# Asset symbol configuration (consolidated from all testing)
# Using the working symbols discovered through iterative testing
SPECIFIC_ASSETS = {
    # Coinbase USD pairs (primary source)
    'BTC': 'COINBASE_SPOT_BTC_USD',
    'ETH': 'COINBASE_SPOT_ETH_USD', 
    'SOL': 'COINBASE_SPOT_SOL_USD',
    'ADA': 'COINBASE_SPOT_ADA_USD',
    'AVAX': 'COINBASE_SPOT_AVAX_USD',
    'LINK': 'COINBASE_SPOT_LINK_USD',
    'UNI': 'COINBASE_SPOT_UNI_USD',
    'AAVE': 'COINBASE_SPOT_AAVE_USD',
    'ARB': 'COINBASE_SPOT_ARB_USD',
    'MKR': 'COINBASE_SPOT_MKR_USD',  # Alternative that worked
    
    # Kraken USD pairs (backup exchange)
    'NEAR': 'KRAKEN_SPOT_NEAR_USD',
    'BNB': 'KRAKEN_SPOT_BNB_USD',
    'STRK': 'KRAKEN_SPOT_STRK_USD',
    'POL': 'KRAKEN_SPOT_POL_USD',    # Alternative that worked
    
    # Bitstamp (alternative exchange)
    'OP': 'BITSTAMP_SPOT_OP_USD'     # Alternative that worked
}

print(f"üìã Configured {len(SPECIFIC_ASSETS)} assets:")
for asset, symbol in SPECIFIC_ASSETS.items():
    exchange = symbol.split('_')[0]
    print(f"  {asset:6s}: {exchange}")

üìã Configured 15 assets:
  BTC   : COINBASE
  ETH   : COINBASE
  SOL   : COINBASE
  ADA   : COINBASE
  AVAX  : COINBASE
  LINK  : COINBASE
  UNI   : COINBASE
  AAVE  : COINBASE
  ARB   : COINBASE
  MKR   : COINBASE
  NEAR  : KRAKEN
  BNB   : KRAKEN
  STRK  : KRAKEN
  POL   : KRAKEN
  OP    : BITSTAMP


In [13]:
def get_historic_candles(name, symbol_id, granularity, total_minutes, chunk_size, now):
    """
    Fetch historical OHLCV data from CoinAPI, starting from now and going backwards.
    Stops early if consecutive empty responses are received.
    """
    all_data = []
    empty_response_count = 0
    
    for i in range(0, total_minutes, chunk_size):
        # Start from now and go backwards
        end_time = (now - timedelta(seconds=granularity * i)).replace(microsecond=0)
        start_time = (end_time - timedelta(seconds=granularity * chunk_size)).replace(microsecond=0)

        url = f"https://rest.coinapi.io/v1/ohlcv/{symbol_id}/history"
        params = {
            'period_id': '1MIN',  # Changed from 1HRS to 1MIN
            'time_start': start_time.isoformat() + 'Z',
            'time_end': end_time.isoformat() + 'Z',
            'limit': chunk_size
        }

        # Retry logic for rate limits
        for attempt in range(3):
            try:
                r = requests.get(url, headers=headers, params=params, timeout=30)
                if r.status_code == 429:
                    print(f"‚è≥ Rate limit hit for {name}. Waiting 60 seconds...")
                    time.sleep(60)
                    continue
                r.raise_for_status()
                data = r.json()
                
                if len(data) == 0:
                    empty_response_count += 1
                    print(f"‚ö†Ô∏è  No data for {name} from {start_time} to {end_time} (empty count: {empty_response_count}/{NO_DATA_THRESHOLD})")
                    
                    # Stop after consecutive empty responses
                    if empty_response_count >= NO_DATA_THRESHOLD:
                        print(f"üõë Stopping {name} after {NO_DATA_THRESHOLD} consecutive empty responses")
                        return pd.DataFrame(all_data)
                    break
                else:
                    # Reset counter when we get data
                    empty_response_count = 0
                
                all_data.extend(data)
                print(f"‚úÖ Fetched {len(data)} rows from {name} | {start_time.strftime('%Y-%m-%d %H:%M')} to {end_time.strftime('%Y-%m-%d %H:%M')} | Total: {len(all_data):,}")
                break
                
            except Exception as e:
                if attempt == 2:  # Last attempt
                    raise Exception(f"Failed after 3 attempts: {e}")
                print(f"‚ö†Ô∏è  Attempt {attempt + 1} failed: {e}. Retrying...")
                time.sleep(5)

        time.sleep(0.5)  # Polite delay between chunks

    # Process and clean data
    if not all_data:
        return pd.DataFrame()
        
    df = pd.DataFrame(all_data)
    df = df.rename(columns={
        'time_period_start': 'time',
        'price_close': f'{name}_close',
        'volume_traded': f'{name}_volume'
    })
    df['time'] = pd.to_datetime(df['time'])
    df = df[['time', f'{name}_close', f'{name}_volume']]
    df = df.sort_values('time').reset_index(drop=True)
    
    print(f"üéØ {name} complete: {len(df):,} total rows")
    return df


def fetch_asset_with_retry(name, symbol_id, max_retries=3):
    """Fetch data for an asset with comprehensive retry and error handling"""
    for attempt in range(1, max_retries + 1):
        try:
            print(f"üîÑ Attempt {attempt}/{max_retries} for {name} ({symbol_id})...")
            
            # Start from current time (now) and work backwards
            df = get_historic_candles(
                name, symbol_id, GRANULARITY, TOTAL_MINUTES, CHUNK_SIZE, 
                datetime.utcnow().replace(second=0, microsecond=0)  # Start at current minute
            )
            
            if df is not None and not df.empty:
                df.to_csv(f"./specific_asset_data/{name}_data.csv", index=False)
                print(f"‚úÖ Success! Saved {len(df)} rows for {name}")
                return True, None
            else:
                error_msg = "No data returned from API"
                
        except Exception as e:
            error_msg = str(e)
            print(f"‚ö†Ô∏è Attempt {attempt} failed: {error_msg}")
            
            # Specific error handling
            if "403" in error_msg or "Forbidden" in error_msg:
                print(f"   üö´ 403 Forbidden - symbol may not exist on this exchange")
            elif "404" in error_msg:
                print(f"   üö´ 404 Not Found - {symbol_id} may not exist")
            elif "422" in error_msg:
                print(f"   üö´ 422 Unprocessable - symbol format issue")
        
        if attempt < max_retries:
            print(f"   Retrying in 5 seconds...")
            time.sleep(5)
    
    return False, error_msg


print("‚úÖ Core functions defined")

‚úÖ Core functions defined


In [14]:
def prep_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """Prepare dataframe for merging: rename columns, parse time, handle duplicates"""
    df = df.copy()
    
    # Rename *_close -> *_price for consistency
    rename_map = {c: f"{c.rsplit('_',1)[0]}_price" for c in df.columns if c.endswith('_close')}
    if rename_map:
        df = df.rename(columns=rename_map)

    # Parse time and set as index
    df['time'] = pd.to_datetime(df['time'], format='ISO8601', utc=True)
    df = df.drop_duplicates('time').sort_values('time').set_index('time')

    # Handle bad price data
    for c in df.columns:
        if c.endswith('_price'):
            df[c] = df[c].mask(df[c] <= 0)  # Remove zero/negative prices
            
    return df


def merge_asof_many(dfs, tolerance='5min', direction='nearest'):
    """Merge multiple dataframes using pandas merge_asof for time-series alignment"""
    dfs = [prep_dataframe(d) for d in dfs]
    
    # Use the densest dataframe as the base timeline
    left = max(dfs, key=lambda d: len(d))
    others = [d for d in dfs if d is not left]
    
    tol = pd.Timedelta(tolerance)
    
    def merge_two(left, right):
        # Handle column name collisions
        intersect = set(left.columns).intersection(right.columns)
        if intersect:
            right = right.rename(columns={c: f"{c}_r" for c in intersect})
            
        return pd.merge_asof(
            left.sort_index(), right.sort_index(),
            left_index=True, right_index=True,
            direction=direction, tolerance=tol
        )
    
    # Sequentially merge all dataframes
    merged = reduce(merge_two, others, left)
    return merged.reset_index().rename(columns={'index': 'time'})


print("‚úÖ Data merging functions defined")

‚úÖ Data merging functions defined


In [15]:
# Main execution: Fetch data for all assets
os.makedirs('./specific_asset_data', exist_ok=True)

# Check existing files to avoid re-downloading
existing_files = []
missing_assets = {}

print("üìÇ Checking existing data files...")
for name, symbol_id in SPECIFIC_ASSETS.items():
    csv_path = f"./specific_asset_data/{name}_data.csv"
    if os.path.exists(csv_path):
        try:
            df = pd.read_csv(csv_path)
            if len(df) > 0:
                existing_files.append(name)
                print(f"‚úÖ {name}: Already exists ({len(df):,} rows)")
                continue
        except Exception:
            pass
    
    missing_assets[name] = symbol_id
    print(f"‚ùå {name}: Will fetch")

print(f"\nüìä Status: {len(existing_files)} exist, {len(missing_assets)} to fetch")

# Fetch missing assets
if missing_assets:
    successful_fetches = []
    failed_fetches = []
    
    for name, symbol_id in missing_assets.items():
        print(f"\nüéØ Fetching data for {name}...")
        success, error = fetch_asset_with_retry(name, symbol_id, MAX_RETRIES)
        
        if success:
            successful_fetches.append(name)
        else:
            failed_fetches.append((name, error))
            print(f"‚ùå Final failure for {name}: {error}")
        
        time.sleep(2)  # Polite delay
    
    print(f"\nüìä Fetch Results:")
    print(f"‚úÖ Successful: {len(successful_fetches)} ({', '.join(successful_fetches)})")
    if failed_fetches:
        print(f"‚ùå Failed: {len(failed_fetches)}")
        for name, error in failed_fetches:
            print(f"   {name}: {error}")
else:
    print("üéâ All assets already exist!")

üìÇ Checking existing data files...
‚ùå BTC: Will fetch
‚ùå ETH: Will fetch
‚ùå SOL: Will fetch
‚ùå ADA: Will fetch
‚ùå AVAX: Will fetch
‚ùå LINK: Will fetch
‚ùå UNI: Will fetch
‚ùå AAVE: Will fetch
‚ùå ARB: Will fetch
‚ùå MKR: Will fetch
‚ùå NEAR: Will fetch
‚ùå BNB: Will fetch
‚ùå STRK: Will fetch
‚ùå POL: Will fetch
‚ùå OP: Will fetch

üìä Status: 0 exist, 15 to fetch

üéØ Fetching data for BTC...
üîÑ Attempt 1/3 for BTC (COINBASE_SPOT_BTC_USD)...


  datetime.utcnow().replace(second=0, microsecond=0)  # Start at current minute


‚úÖ Fetched 6933 rows from BTC | 2025-11-04 20:28 to 2025-11-11 19:08 | Total: 6,933
‚ö†Ô∏è  Attempt 1 failed: 500 Server Error: Internal Server Error for url: https://rest.coinapi.io/v1/ohlcv/COINBASE_SPOT_BTC_USD/history?period_id=1MIN&time_start=2025-10-28T21%3A48%3A00Z&time_end=2025-11-04T20%3A28%3A00Z&limit=10000. Retrying...
‚úÖ Fetched 9944 rows from BTC | 2025-10-28 21:48 to 2025-11-04 20:28 | Total: 16,877
‚úÖ Fetched 9664 rows from BTC | 2025-10-21 23:08 to 2025-10-28 21:48 | Total: 26,541
‚úÖ Fetched 9998 rows from BTC | 2025-10-15 00:28 to 2025-10-21 23:08 | Total: 36,539
‚úÖ Fetched 9968 rows from BTC | 2025-10-08 01:48 to 2025-10-15 00:28 | Total: 46,507
‚úÖ Fetched 9984 rows from BTC | 2025-10-01 03:08 to 2025-10-08 01:48 | Total: 56,491
‚úÖ Fetched 10000 rows from BTC | 2025-09-24 04:28 to 2025-10-01 03:08 | Total: 66,491
‚úÖ Fetched 9997 rows from BTC | 2025-09-17 05:48 to 2025-09-24 04:28 | Total: 76,488
‚úÖ Fetched 9994 rows from BTC | 2025-09-10 07:08 to 2025-09-17 

  datetime.utcnow().replace(second=0, microsecond=0)  # Start at current minute


‚úÖ Fetched 6930 rows from ETH | 2025-11-04 20:31 to 2025-11-11 19:11 | Total: 6,930
‚úÖ Fetched 9944 rows from ETH | 2025-10-28 21:51 to 2025-11-04 20:31 | Total: 16,874
‚úÖ Fetched 9661 rows from ETH | 2025-10-21 23:11 to 2025-10-28 21:51 | Total: 26,535
‚úÖ Fetched 9999 rows from ETH | 2025-10-15 00:31 to 2025-10-21 23:11 | Total: 36,534
‚úÖ Fetched 9967 rows from ETH | 2025-10-08 01:51 to 2025-10-15 00:31 | Total: 46,501
‚úÖ Fetched 9985 rows from ETH | 2025-10-01 03:11 to 2025-10-08 01:51 | Total: 56,486
‚úÖ Fetched 10000 rows from ETH | 2025-09-24 04:31 to 2025-10-01 03:11 | Total: 66,486
‚úÖ Fetched 9993 rows from ETH | 2025-09-17 05:51 to 2025-09-24 04:31 | Total: 76,479
‚úÖ Fetched 9995 rows from ETH | 2025-09-10 07:11 to 2025-09-17 05:51 | Total: 86,474
‚úÖ Fetched 9880 rows from ETH | 2025-09-03 08:31 to 2025-09-10 07:11 | Total: 96,354
‚úÖ Fetched 9988 rows from ETH | 2025-08-27 09:51 to 2025-09-03 08:31 | Total: 106,342
‚úÖ Fetched 9999 rows from ETH | 2025-08-20 11:11 to 

  datetime.utcnow().replace(second=0, microsecond=0)  # Start at current minute


‚úÖ Fetched 6929 rows from SOL | 2025-11-04 20:34 to 2025-11-11 19:14 | Total: 6,929
‚úÖ Fetched 9944 rows from SOL | 2025-10-28 21:54 to 2025-11-04 20:34 | Total: 16,873
‚úÖ Fetched 9666 rows from SOL | 2025-10-21 23:14 to 2025-10-28 21:54 | Total: 26,539
‚úÖ Fetched 9999 rows from SOL | 2025-10-15 00:34 to 2025-10-21 23:14 | Total: 36,538
‚úÖ Fetched 9970 rows from SOL | 2025-10-08 01:54 to 2025-10-15 00:34 | Total: 46,508
‚úÖ Fetched 9983 rows from SOL | 2025-10-01 03:14 to 2025-10-08 01:54 | Total: 56,491
‚úÖ Fetched 10000 rows from SOL | 2025-09-24 04:34 to 2025-10-01 03:14 | Total: 66,491
‚úÖ Fetched 9998 rows from SOL | 2025-09-17 05:54 to 2025-09-24 04:34 | Total: 76,489
‚úÖ Fetched 9995 rows from SOL | 2025-09-10 07:14 to 2025-09-17 05:54 | Total: 86,484
‚úÖ Fetched 9878 rows from SOL | 2025-09-03 08:34 to 2025-09-10 07:14 | Total: 96,362
‚úÖ Fetched 9995 rows from SOL | 2025-08-27 09:54 to 2025-09-03 08:34 | Total: 106,357
‚úÖ Fetched 10000 rows from SOL | 2025-08-20 11:14 to

  datetime.utcnow().replace(second=0, microsecond=0)  # Start at current minute


‚úÖ Fetched 6927 rows from ADA | 2025-11-04 20:36 to 2025-11-11 19:16 | Total: 6,927
‚úÖ Fetched 9944 rows from ADA | 2025-10-28 21:56 to 2025-11-04 20:36 | Total: 16,871
‚úÖ Fetched 9666 rows from ADA | 2025-10-21 23:16 to 2025-10-28 21:56 | Total: 26,537
‚úÖ Fetched 9996 rows from ADA | 2025-10-15 00:36 to 2025-10-21 23:16 | Total: 36,533
‚úÖ Fetched 9971 rows from ADA | 2025-10-08 01:56 to 2025-10-15 00:36 | Total: 46,504
‚úÖ Fetched 9984 rows from ADA | 2025-10-01 03:16 to 2025-10-08 01:56 | Total: 56,488
‚úÖ Fetched 10000 rows from ADA | 2025-09-24 04:36 to 2025-10-01 03:16 | Total: 66,488
‚úÖ Fetched 9998 rows from ADA | 2025-09-17 05:56 to 2025-09-24 04:36 | Total: 76,486
‚úÖ Fetched 9991 rows from ADA | 2025-09-10 07:16 to 2025-09-17 05:56 | Total: 86,477
‚úÖ Fetched 9878 rows from ADA | 2025-09-03 08:36 to 2025-09-10 07:16 | Total: 96,355
‚úÖ Fetched 9997 rows from ADA | 2025-08-27 09:56 to 2025-09-03 08:36 | Total: 106,352
‚úÖ Fetched 9998 rows from ADA | 2025-08-20 11:16 to 

  datetime.utcnow().replace(second=0, microsecond=0)  # Start at current minute


‚úÖ Fetched 6924 rows from AVAX | 2025-11-04 20:39 to 2025-11-11 19:19 | Total: 6,924
‚úÖ Fetched 9944 rows from AVAX | 2025-10-28 21:59 to 2025-11-04 20:39 | Total: 16,868
‚úÖ Fetched 9659 rows from AVAX | 2025-10-21 23:19 to 2025-10-28 21:59 | Total: 26,527
‚úÖ Fetched 9993 rows from AVAX | 2025-10-15 00:39 to 2025-10-21 23:19 | Total: 36,520
‚úÖ Fetched 9970 rows from AVAX | 2025-10-08 01:59 to 2025-10-15 00:39 | Total: 46,490
‚úÖ Fetched 9986 rows from AVAX | 2025-10-01 03:19 to 2025-10-08 01:59 | Total: 56,476
‚úÖ Fetched 10000 rows from AVAX | 2025-09-24 04:39 to 2025-10-01 03:19 | Total: 66,476
‚úÖ Fetched 9999 rows from AVAX | 2025-09-17 05:59 to 2025-09-24 04:39 | Total: 76,475
‚úÖ Fetched 9995 rows from AVAX | 2025-09-10 07:19 to 2025-09-17 05:59 | Total: 86,470
‚úÖ Fetched 9879 rows from AVAX | 2025-09-03 08:39 to 2025-09-10 07:19 | Total: 96,349
‚úÖ Fetched 9999 rows from AVAX | 2025-08-27 09:59 to 2025-09-03 08:39 | Total: 106,348
‚úÖ Fetched 10000 rows from AVAX | 2025-08

  datetime.utcnow().replace(second=0, microsecond=0)  # Start at current minute


‚úÖ Fetched 6921 rows from LINK | 2025-11-04 20:41 to 2025-11-11 19:21 | Total: 6,921
‚úÖ Fetched 9944 rows from LINK | 2025-10-28 22:01 to 2025-11-04 20:41 | Total: 16,865
‚úÖ Fetched 9667 rows from LINK | 2025-10-21 23:21 to 2025-10-28 22:01 | Total: 26,532
‚úÖ Fetched 9998 rows from LINK | 2025-10-15 00:41 to 2025-10-21 23:21 | Total: 36,530
‚úÖ Fetched 9963 rows from LINK | 2025-10-08 02:01 to 2025-10-15 00:41 | Total: 46,493
‚úÖ Fetched 9985 rows from LINK | 2025-10-01 03:21 to 2025-10-08 02:01 | Total: 56,478
‚úÖ Fetched 10000 rows from LINK | 2025-09-24 04:41 to 2025-10-01 03:21 | Total: 66,478
‚úÖ Fetched 9998 rows from LINK | 2025-09-17 06:01 to 2025-09-24 04:41 | Total: 76,476
‚úÖ Fetched 9995 rows from LINK | 2025-09-10 07:21 to 2025-09-17 06:01 | Total: 86,471
‚úÖ Fetched 9873 rows from LINK | 2025-09-03 08:41 to 2025-09-10 07:21 | Total: 96,344
‚úÖ Fetched 9990 rows from LINK | 2025-08-27 10:01 to 2025-09-03 08:41 | Total: 106,334
‚úÖ Fetched 10000 rows from LINK | 2025-08

  datetime.utcnow().replace(second=0, microsecond=0)  # Start at current minute


‚úÖ Fetched 6920 rows from UNI | 2025-11-04 20:43 to 2025-11-11 19:23 | Total: 6,920
‚úÖ Fetched 9944 rows from UNI | 2025-10-28 22:03 to 2025-11-04 20:43 | Total: 16,864
‚úÖ Fetched 9664 rows from UNI | 2025-10-21 23:23 to 2025-10-28 22:03 | Total: 26,528
‚úÖ Fetched 9996 rows from UNI | 2025-10-15 00:43 to 2025-10-21 23:23 | Total: 36,524
‚úÖ Fetched 9969 rows from UNI | 2025-10-08 02:03 to 2025-10-15 00:43 | Total: 46,493
‚úÖ Fetched 9984 rows from UNI | 2025-10-01 03:23 to 2025-10-08 02:03 | Total: 56,477
‚úÖ Fetched 9998 rows from UNI | 2025-09-24 04:43 to 2025-10-01 03:23 | Total: 66,475
‚úÖ Fetched 9995 rows from UNI | 2025-09-17 06:03 to 2025-09-24 04:43 | Total: 76,470
‚úÖ Fetched 9983 rows from UNI | 2025-09-10 07:23 to 2025-09-17 06:03 | Total: 86,453
‚úÖ Fetched 9873 rows from UNI | 2025-09-03 08:43 to 2025-09-10 07:23 | Total: 96,326
‚úÖ Fetched 9999 rows from UNI | 2025-08-27 10:03 to 2025-09-03 08:43 | Total: 106,325
‚úÖ Fetched 9994 rows from UNI | 2025-08-20 11:23 to 2

  datetime.utcnow().replace(second=0, microsecond=0)  # Start at current minute


‚úÖ Fetched 6917 rows from AAVE | 2025-11-04 20:46 to 2025-11-11 19:26 | Total: 6,917
‚úÖ Fetched 9944 rows from AAVE | 2025-10-28 22:06 to 2025-11-04 20:46 | Total: 16,861
‚úÖ Fetched 9667 rows from AAVE | 2025-10-21 23:26 to 2025-10-28 22:06 | Total: 26,528
‚úÖ Fetched 9999 rows from AAVE | 2025-10-15 00:46 to 2025-10-21 23:26 | Total: 36,527
‚úÖ Fetched 9968 rows from AAVE | 2025-10-08 02:06 to 2025-10-15 00:46 | Total: 46,495
‚úÖ Fetched 9973 rows from AAVE | 2025-10-01 03:26 to 2025-10-08 02:06 | Total: 56,468
‚úÖ Fetched 10000 rows from AAVE | 2025-09-24 04:46 to 2025-10-01 03:26 | Total: 66,468
‚úÖ Fetched 10000 rows from AAVE | 2025-09-17 06:06 to 2025-09-24 04:46 | Total: 76,468
‚úÖ Fetched 9981 rows from AAVE | 2025-09-10 07:26 to 2025-09-17 06:06 | Total: 86,449
‚úÖ Fetched 9878 rows from AAVE | 2025-09-03 08:46 to 2025-09-10 07:26 | Total: 96,327
‚úÖ Fetched 9997 rows from AAVE | 2025-08-27 10:06 to 2025-09-03 08:46 | Total: 106,324
‚úÖ Fetched 9997 rows from AAVE | 2025-08

  datetime.utcnow().replace(second=0, microsecond=0)  # Start at current minute


‚úÖ Fetched 6915 rows from ARB | 2025-11-04 20:48 to 2025-11-11 19:28 | Total: 6,915
‚úÖ Fetched 9944 rows from ARB | 2025-10-28 22:08 to 2025-11-04 20:48 | Total: 16,859
‚úÖ Fetched 9658 rows from ARB | 2025-10-21 23:28 to 2025-10-28 22:08 | Total: 26,517
‚úÖ Fetched 9998 rows from ARB | 2025-10-15 00:48 to 2025-10-21 23:28 | Total: 36,515
‚úÖ Fetched 9968 rows from ARB | 2025-10-08 02:08 to 2025-10-15 00:48 | Total: 46,483
‚úÖ Fetched 9985 rows from ARB | 2025-10-01 03:28 to 2025-10-08 02:08 | Total: 56,468
‚úÖ Fetched 9993 rows from ARB | 2025-09-24 04:48 to 2025-10-01 03:28 | Total: 66,461
‚úÖ Fetched 9999 rows from ARB | 2025-09-17 06:08 to 2025-09-24 04:48 | Total: 76,460
‚úÖ Fetched 9994 rows from ARB | 2025-09-10 07:28 to 2025-09-17 06:08 | Total: 86,454
‚úÖ Fetched 9876 rows from ARB | 2025-09-03 08:48 to 2025-09-10 07:28 | Total: 96,330
‚úÖ Fetched 9993 rows from ARB | 2025-08-27 10:08 to 2025-09-03 08:48 | Total: 106,323
‚úÖ Fetched 9999 rows from ARB | 2025-08-20 11:28 to 2

  datetime.utcnow().replace(second=0, microsecond=0)  # Start at current minute


‚úÖ Fetched 5735 rows from MKR | 2025-11-04 20:50 to 2025-11-11 19:30 | Total: 5,735
‚úÖ Fetched 8921 rows from MKR | 2025-10-28 22:10 to 2025-11-04 20:50 | Total: 14,656
‚úÖ Fetched 7818 rows from MKR | 2025-10-21 23:30 to 2025-10-28 22:10 | Total: 22,474
‚úÖ Fetched 9052 rows from MKR | 2025-10-15 00:50 to 2025-10-21 23:30 | Total: 31,526
‚úÖ Fetched 9469 rows from MKR | 2025-10-08 02:10 to 2025-10-15 00:50 | Total: 40,995
‚úÖ Fetched 9628 rows from MKR | 2025-10-01 03:30 to 2025-10-08 02:10 | Total: 50,623
‚úÖ Fetched 9484 rows from MKR | 2025-09-24 04:50 to 2025-10-01 03:30 | Total: 60,107
‚úÖ Fetched 8838 rows from MKR | 2025-09-17 06:10 to 2025-09-24 04:50 | Total: 68,945
‚úÖ Fetched 9825 rows from MKR | 2025-09-10 07:30 to 2025-09-17 06:10 | Total: 78,770
‚úÖ Fetched 9846 rows from MKR | 2025-09-03 08:50 to 2025-09-10 07:30 | Total: 88,616
‚úÖ Fetched 9975 rows from MKR | 2025-08-27 10:10 to 2025-09-03 08:50 | Total: 98,591
‚úÖ Fetched 9993 rows from MKR | 2025-08-20 11:30 to 20

  datetime.utcnow().replace(second=0, microsecond=0)  # Start at current minute


‚úÖ Fetched 9998 rows from NEAR | 2025-11-04 20:52 to 2025-11-11 19:32 | Total: 9,998
‚úÖ Fetched 9609 rows from NEAR | 2025-10-28 22:12 to 2025-11-04 20:52 | Total: 19,607
‚úÖ Fetched 9987 rows from NEAR | 2025-10-21 23:32 to 2025-10-28 22:12 | Total: 29,594
‚úÖ Fetched 9993 rows from NEAR | 2025-10-15 00:52 to 2025-10-21 23:32 | Total: 39,587
‚úÖ Fetched 9998 rows from NEAR | 2025-10-08 02:12 to 2025-10-15 00:52 | Total: 49,585
‚úÖ Fetched 9989 rows from NEAR | 2025-10-01 03:32 to 2025-10-08 02:12 | Total: 59,574
‚úÖ Fetched 9994 rows from NEAR | 2025-09-24 04:52 to 2025-10-01 03:32 | Total: 69,568
‚úÖ Fetched 9999 rows from NEAR | 2025-09-17 06:12 to 2025-09-24 04:52 | Total: 79,567
‚úÖ Fetched 9989 rows from NEAR | 2025-09-10 07:32 to 2025-09-17 06:12 | Total: 89,556
‚úÖ Fetched 9869 rows from NEAR | 2025-09-03 08:52 to 2025-09-10 07:32 | Total: 99,425
‚úÖ Fetched 9937 rows from NEAR | 2025-08-27 10:12 to 2025-09-03 08:52 | Total: 109,362
‚úÖ Fetched 9998 rows from NEAR | 2025-08-2

  datetime.utcnow().replace(second=0, microsecond=0)  # Start at current minute


‚úÖ Fetched 9995 rows from BNB | 2025-11-04 20:55 to 2025-11-11 19:35 | Total: 9,995
‚úÖ Fetched 9593 rows from BNB | 2025-10-28 22:15 to 2025-11-04 20:55 | Total: 19,588
‚úÖ Fetched 9988 rows from BNB | 2025-10-21 23:35 to 2025-10-28 22:15 | Total: 29,576
‚úÖ Fetched 9990 rows from BNB | 2025-10-15 00:55 to 2025-10-21 23:35 | Total: 39,566
‚úÖ Fetched 10000 rows from BNB | 2025-10-08 02:15 to 2025-10-15 00:55 | Total: 49,566
‚úÖ Fetched 9992 rows from BNB | 2025-10-01 03:35 to 2025-10-08 02:15 | Total: 59,558
‚úÖ Fetched 9994 rows from BNB | 2025-09-24 04:55 to 2025-10-01 03:35 | Total: 69,552
‚úÖ Fetched 9998 rows from BNB | 2025-09-17 06:15 to 2025-09-24 04:55 | Total: 79,550
‚úÖ Fetched 9968 rows from BNB | 2025-09-10 07:35 to 2025-09-17 06:15 | Total: 89,518
‚úÖ Fetched 9836 rows from BNB | 2025-09-03 08:55 to 2025-09-10 07:35 | Total: 99,354
‚úÖ Fetched 9926 rows from BNB | 2025-08-27 10:15 to 2025-09-03 08:55 | Total: 109,280
‚úÖ Fetched 9996 rows from BNB | 2025-08-20 11:35 to 

In [16]:
# Combine all CSV files into final dataset
print("\nüîÑ Creating final combined dataset...")

csv_directory = "./specific_asset_data"
output_file = "specific_assets_dataset_minute_granularity.csv"

# Load all CSV files
csv_files = [f for f in os.listdir(csv_directory) if f.endswith('.csv')]
print(f"Found {len(csv_files)} CSV files")

dataframes = []
file_stats = {}

for csv_file in csv_files:
    file_path = os.path.join(csv_directory, csv_file)
    df = pd.read_csv(file_path)
    asset_name = csv_file.split('_')[0]
    
    # Store statistics
    file_stats[asset_name] = {
        'rows': len(df),
        'date_start': df['time'].min() if 'time' in df.columns else 'Unknown',
        'date_end': df['time'].max() if 'time' in df.columns else 'Unknown'
    }
    
    # Handle different CSV formats
    if 'time_period_start' in df.columns:
        # CoinAPI format - rename columns
        df = df.rename(columns={
            "time_period_start": "time",
            "price_close": f"{asset_name}_close",
            "volume_traded": f"{asset_name}_volume"
        })
        df = df[["time", f"{asset_name}_close", f"{asset_name}_volume"]]
    
    dataframes.append(df)

# Merge all dataframes
print(f"üîÑ Merging {len(dataframes)} DataFrames...")
merged = merge_asof_many(dataframes, tolerance='5min', direction='nearest')

# Save final dataset
merged.to_csv(output_file, index=False)
print(f"‚úÖ Final dataset saved: {output_file}")
print(f"üìä Shape: {merged.shape}")


üîÑ Creating final combined dataset...
Found 15 CSV files
üîÑ Merging 15 DataFrames...
‚úÖ Final dataset saved: specific_assets_dataset_minute_granularity.csv
üìä Shape: (1046523, 31)


In [17]:
# Comprehensive Final Statistics Report
print("="*80)
print("üéâ FINAL DATA COLLECTION REPORT")
print("="*80)

# Load final dataset for analysis
final_df = pd.read_csv(output_file)

# Dataset overview
print(f"\nüìä DATASET OVERVIEW:")
print(f"Total rows: {len(final_df):,}")
print(f"Total columns: {len(final_df.columns)}")
print(f"Date range: {final_df['time'].min()} to {final_df['time'].max()}")
time_span = (pd.to_datetime(final_df['time'].max()) - pd.to_datetime(final_df['time'].min())).days
print(f"Time span: {time_span:,} days (~{time_span/365:.1f} years)")

# Asset analysis
assets = [col.split('_')[0] for col in final_df.columns if '_price' in col]
print(f"\nüéØ ASSETS: {len(assets)}/{len(SPECIFIC_ASSETS)}")
print(f"Collected: {', '.join(sorted(assets))}")

# Coverage analysis
print(f"\nüìà DATA COVERAGE:")
print(f"{'Asset':<6} {'Rows':<8} {'Coverage':<10} {'Exchange':<10} {'Start':<12} {'End':<12}")
print("-" * 70)

coverage_stats = []
for asset in sorted(assets):
    price_col = f"{asset}_price"
    if price_col in final_df.columns:
        non_null = final_df[price_col].notna().sum()
        coverage_pct = (non_null / len(final_df)) * 100
        
        # Get exchange info
        symbol_id = SPECIFIC_ASSETS.get(asset, '')
        exchange = symbol_id.split('_')[0] if symbol_id else 'Unknown'
        
        # Date range for this asset
        asset_data = final_df[final_df[price_col].notna()]
        start_date = asset_data['time'].min()[:10] if not asset_data.empty else 'N/A'
        end_date = asset_data['time'].max()[:10] if not asset_data.empty else 'N/A'
        
        coverage_stats.append((asset, non_null, coverage_pct, exchange, start_date, end_date))
        print(f"{asset:<6} {non_null:<8,} {coverage_pct:<9.1f}% {exchange:<10} {start_date:<12} {end_date:<12}")

# Summary metrics
total_data_points = sum(stat[1] for stat in coverage_stats)
avg_coverage = sum(stat[2] for stat in coverage_stats) / len(coverage_stats)
best_asset = max(coverage_stats, key=lambda x: x[2])
worst_asset = min(coverage_stats, key=lambda x: x[2])

print(f"\nüìä SUMMARY:")
print(f"Total data points: {total_data_points:,}")
print(f"Average coverage: {avg_coverage:.1f}%")
print(f"Best coverage: {best_asset[0]} ({best_asset[2]:.1f}%)")
print(f"Worst coverage: {worst_asset[0]} ({worst_asset[2]:.1f}%)")

# Exchange distribution
exchange_counts = {}
for stat in coverage_stats:
    exchange = stat[3]
    exchange_counts[exchange] = exchange_counts.get(exchange, 0) + 1

print(f"\nüè¶ EXCHANGES:")
for exchange, count in sorted(exchange_counts.items()):
    print(f"  {exchange}: {count} assets")

# Final metrics
file_size_mb = os.path.getsize(output_file) / (1024*1024)
success_rate = len(assets) / len(SPECIFIC_ASSETS) * 100

print(f"\n‚úÖ SUCCESS METRICS:")
print(f"Assets requested: {len(SPECIFIC_ASSETS)}")
print(f"Assets collected: {len(assets)}")
print(f"Success rate: {success_rate:.1f}%")
print(f"Dataset size: {file_size_mb:.1f} MB")
print(f"\nüéØ DATASET READY FOR LSTM TRAINING!")
print(f"File: {output_file}")
print("="*80)

üéâ FINAL DATA COLLECTION REPORT

üìä DATASET OVERVIEW:
Total rows: 1,046,523
Total columns: 31
Date range: 2023-11-06 16:28:00+00:00 to 2025-11-10 16:24:00+00:00
Time span: 734 days (~2.0 years)

üéØ ASSETS: 15/15
Collected: AAVE, ADA, ARB, AVAX, BNB, BTC, ETH, LINK, MKR, NEAR, OP, POL, SOL, STRK, UNI

üìà DATA COVERAGE:
Asset  Rows     Coverage   Exchange   Start        End         
----------------------------------------------------------------------
AAVE   1,046,004 100.0    % COINBASE   2023-11-06   2025-11-10  
ADA    1,046,499 100.0    % COINBASE   2023-11-06   2025-11-10  
ARB    1,046,465 100.0    % COINBASE   2023-11-06   2025-11-10  
AVAX   1,046,488 100.0    % COINBASE   2023-11-06   2025-11-10  
BNB    271,478  25.9     % KRAKEN     2025-04-22   2025-11-10  
BTC    1,046,523 100.0    % COINBASE   2023-11-06   2025-11-10  
ETH    1,046,494 100.0    % COINBASE   2023-11-06   2025-11-10  
LINK   1,046,478 100.0    % COINBASE   2023-11-06   2025-11-10  
MKR    1,045,319 9