# Combine ETH Orderbook Datasets

Combines old (CoinAPI) and new (Coinbase) datasets sequentially.

**Key operations:**
- Calculate `eth_volume` as orderbook depth (sum of bid and ask volumes across the top 10 levels)
- Drop `btc_volume` (no BTC orderbook data available)
- Keep `btc_price` as a feature
- Append datasets sequentially (old first, then new)

In [7]:
import pandas as pd
import numpy as np

## Load Old CSV (CoinAPI Data)

In [8]:
# Read the older ETH order book snapshot file (this notebook labels it as the
# CoinAPI dataset). No parse_dates is requested, so every column keeps the
# dtype pandas infers from the CSV.
old_df = pd.read_csv('5s_data/old_eth_orderbook_coinbase_5s_with_price_volume.csv')

# Report the frame's dimensions and its full column list in one print call;
# sep="\n" puts each message on its own line.
print(
    f"Old CSV shape: {old_df.shape}",
    f"Old CSV columns: {old_df.columns.tolist()}",
    f"\nFirst few rows:",
    sep="\n",
)

# Keep the preview as the final expression so the notebook renders it as a table.
old_df.head()

Old CSV shape: (241600, 53)
Old CSV columns: ['timestamp', 'bid_price_1', 'bid_vol_1', 'ask_price_1', 'ask_vol_1', 'bid_price_2', 'bid_vol_2', 'ask_price_2', 'ask_vol_2', 'bid_price_3', 'bid_vol_3', 'ask_price_3', 'ask_vol_3', 'bid_price_4', 'bid_vol_4', 'ask_price_4', 'ask_vol_4', 'bid_price_5', 'bid_vol_5', 'ask_price_5', 'ask_vol_5', 'bid_price_6', 'bid_vol_6', 'ask_price_6', 'ask_vol_6', 'bid_price_7', 'bid_vol_7', 'ask_price_7', 'ask_vol_7', 'bid_price_8', 'bid_vol_8', 'ask_price_8', 'ask_vol_8', 'bid_price_9', 'bid_vol_9', 'ask_price_9', 'ask_vol_9', 'bid_price_10', 'bid_vol_10', 'ask_price_10', 'ask_vol_10', 'eth_price_x', 'eth_volume_x', 'btc_price_x', 'btc_volume_x', 'eth_price_y', 'eth_volume_y', 'btc_price_y', 'btc_volume_y', 'eth_price', 'eth_volume', 'btc_price', 'btc_volume']

First few rows:


Unnamed: 0,timestamp,bid_price_1,bid_vol_1,ask_price_1,ask_vol_1,bid_price_2,bid_vol_2,ask_price_2,ask_vol_2,bid_price_3,...,btc_price_x,btc_volume_x,eth_price_y,eth_volume_y,btc_price_y,btc_volume_y,eth_price,eth_volume,btc_price,btc_volume
0,2025-12-01 02:57:15.727819,2841.16,0.042627,2841.17,0.017375,2841.1,0.114237,2841.27,0.175978,2841.09,...,86740.79,0.269073,2841.165,0.597915,86740.79,0.269073,2841.165,0.597915,86740.79,0.269073
1,2025-12-01 02:57:21.073133,2841.1,0.000496,2841.11,0.06746,2841.09,0.201554,2841.16,0.175985,2841.08,...,86736.72,1.937259,2841.085,0.777114,86736.72,1.937259,2841.085,0.777114,86736.72,1.937259
2,2025-12-01 02:57:26.375175,2840.82,0.000882,2841.0,0.06746,2840.81,0.042241,2841.17,0.029813,2840.8,...,86750.945,4.216855,2840.99,1.177497,86750.945,4.216855,2840.99,1.177497,86750.945,4.216855
3,2025-12-01 02:57:31.660663,2841.0,0.068,2841.01,0.05406,2840.91,0.043133,2841.29,0.0352,2840.9,...,86746.33,1.061931,2841.005,0.025164,86746.33,1.061931,2841.005,0.025164,86746.33,1.061931
4,2025-12-01 02:57:37.007031,2841.0,0.069421,2841.01,0.047244,2840.9,0.11024,2841.22,0.000891,2840.84,...,86750.475,0.440502,2841.005,0.003338,86750.475,0.440502,2841.005,0.003338,86750.475,0.440502


## Prepare Old CSV (CoinAPI Data)

In [9]:
# Columns ending in _x / _y are suffixed copies of eth_price, eth_volume,
# btc_price and btc_volume (presumably left over from earlier merges -- confirm
# upstream). Only the unsuffixed versions are kept.
cols_to_drop = [col for col in old_df.columns if col.endswith(('_x', '_y'))]
print(f"Dropping {len(cols_to_drop)} duplicate columns")

# Remove the suffixed duplicates and, in the same pass, btc_volume (no BTC
# order book is available). errors='ignore' makes the btc_volume drop a no-op
# when the column is absent, e.g. when this cell is re-run.
old_df = (
    old_df
    .drop(columns=cols_to_drop)
    .drop(columns=['btc_volume'], errors='ignore')
)

# ETH order book depth: per-row sum of bid and ask volumes over levels 1-10.
# Levels whose columns are missing from the frame are skipped.
depth_levels = range(1, 11)
eth_bid_cols = [name for name in (f'bid_vol_{i}' for i in depth_levels) if name in old_df.columns]
eth_ask_cols = [name for name in (f'ask_vol_{i}' for i in depth_levels) if name in old_df.columns]

# assign() overwrites the existing eth_volume column in place (same position).
old_df = old_df.assign(
    eth_volume=old_df[eth_bid_cols + eth_ask_cols].sum(axis='columns')
)

print(
    f"✓ Old CSV prepared: {len(old_df)} rows",
    f"Sample: eth_volume range = {old_df['eth_volume'].min():.2f} to {old_df['eth_volume'].max():.2f}",
    sep="\n",
)
old_df.head()

Dropping 8 duplicate columns
✓ Old CSV prepared: 241600 rows
Sample: eth_volume range = 0.70 to 2343.72


Unnamed: 0,timestamp,bid_price_1,bid_vol_1,ask_price_1,ask_vol_1,bid_price_2,bid_vol_2,ask_price_2,ask_vol_2,bid_price_3,...,bid_vol_9,ask_price_9,ask_vol_9,bid_price_10,bid_vol_10,ask_price_10,ask_vol_10,eth_price,eth_volume,btc_price
0,2025-12-01 02:57:15.727819,2841.16,0.042627,2841.17,0.017375,2841.1,0.114237,2841.27,0.175978,2841.09,...,0.529741,2841.81,0.05,2840.8,0.072,2841.82,0.458657,2841.165,3.87627,86740.79
1,2025-12-01 02:57:21.073133,2841.1,0.000496,2841.11,0.06746,2841.09,0.201554,2841.16,0.175985,2841.08,...,0.517682,2841.55,0.001,2840.6,0.85129,2841.66,0.05,2841.085,4.352593,86736.72
2,2025-12-01 02:57:26.375175,2840.82,0.000882,2841.0,0.06746,2840.81,0.042241,2841.17,0.029813,2840.8,...,1.807218,2841.41,0.05,2840.54,0.5,2841.42,0.099998,2840.99,4.260239,86750.945
3,2025-12-01 02:57:31.660663,2841.0,0.068,2841.01,0.05406,2840.91,0.043133,2841.29,0.0352,2840.9,...,0.068,2841.72,0.001,2840.54,0.437317,2841.73,1.341443,2841.005,4.992228,86746.33
4,2025-12-01 02:57:37.007031,2841.0,0.069421,2841.01,0.047244,2840.9,0.11024,2841.22,0.000891,2840.84,...,0.068,2841.54,0.512,2840.55,0.437316,2841.55,0.001,2841.005,4.876928,86750.475


## Prepare New CSV (Coinbase Data)

In [10]:
# Read the newer ETH order book snapshot file (this notebook labels it as the
# Coinbase dataset).
new_df = pd.read_csv('5s_data/eth_orderbook_coinbase_5s_with_price_volume.csv')

# Discard the file's own eth_volume / btc_volume columns (described in this
# notebook as incompatible 24h volumes). errors='ignore' skips whichever of
# the two is not present.
new_df = new_df.drop(columns=['eth_volume', 'btc_volume'], errors='ignore')

# ETH order book depth: per-row sum of bid and ask volumes over levels 1-10.
# Levels whose columns are missing from the frame are skipped.
depth_levels = range(1, 11)
eth_bid_cols = [name for name in (f'bid_vol_{i}' for i in depth_levels) if name in new_df.columns]
eth_ask_cols = [name for name in (f'ask_vol_{i}' for i in depth_levels) if name in new_df.columns]

# eth_volume was dropped above, so assign() appends it as the last column.
new_df = new_df.assign(
    eth_volume=new_df[eth_bid_cols + eth_ask_cols].sum(axis='columns')
)

print(
    f"✓ New CSV prepared: {len(new_df)} rows",
    f"Sample: eth_volume range = {new_df['eth_volume'].min():.2f} to {new_df['eth_volume'].max():.2f}",
    sep="\n",
)
new_df.head()

✓ New CSV prepared: 101700 rows
Sample: eth_volume range = 0.89 to 2796.39


Unnamed: 0,timestamp,eth_price,bid_price_1,bid_vol_1,ask_price_1,ask_vol_1,bid_price_2,bid_vol_2,ask_price_2,ask_vol_2,...,bid_price_9,bid_vol_9,ask_price_9,ask_vol_9,bid_price_10,bid_vol_10,ask_price_10,ask_vol_10,btc_price,eth_volume
0,2025-12-21 18:16:07.574055,2969.9,2969.87,0.003145,2969.88,6.540193,2969.7,0.001,2969.89,0.168357,...,2969.18,0.512,2970.4,0.124,2969.12,0.058943,2970.44,0.00168,87979.56,9.957719
1,2025-12-21 18:16:12.817972,2969.88,2969.87,0.001523,2969.88,2.115605,2969.56,0.168375,2969.89,0.168357,...,2969.21,0.089251,2970.14,0.106921,2969.19,1.403813,2970.15,0.365282,87982.0,5.712254
2,2025-12-21 18:16:17.863189,2970.49,2970.58,0.001593,2970.59,0.156291,2970.33,0.168331,2970.6,0.458838,...,2969.86,0.063,2970.79,0.6567,2969.85,0.380971,2970.8,0.124,87991.99,9.637888
3,2025-12-21 18:16:22.807019,2970.22,2970.48,0.001543,2970.49,2.223427,2970.22,0.179197,2970.5,0.639471,...,2969.85,0.729714,2970.75,0.976,2969.74,0.3476,2970.76,0.518581,87980.48,12.723592
4,2025-12-21 18:16:27.413885,2971.16,2971.19,0.179178,2971.2,0.471433,2971.05,0.168291,2971.26,0.000431,...,2970.55,0.69631,2971.51,0.975,2970.49,0.00196,2971.54,0.134651,87994.0,11.946217


## Align Columns and Append

In [11]:
# Keep only matching columns and align order
old_df = old_df[new_df.columns]
print(f"✓ Column alignment: {len(old_df.columns)} columns match")

# Append sequentially: old data first, then new data
combined_df = pd.concat([old_df, new_df], ignore_index=True)
print(f"✓ Combined: {len(old_df):,} old rows + {len(new_df):,} new rows = {len(combined_df):,} total")

# Check for missing values
missing = combined_df.isnull().sum().sum()
if missing > 0:
    print(f"⚠ Warning: {missing} missing values detected")
else:
    print(f"✓ No missing values")
combined_df.head()

✓ Column alignment: 44 columns match
✓ Combined: 241,600 old rows + 101,700 new rows = 343,300 total


Unnamed: 0,timestamp,eth_price,bid_price_1,bid_vol_1,ask_price_1,ask_vol_1,bid_price_2,bid_vol_2,ask_price_2,ask_vol_2,...,bid_price_9,bid_vol_9,ask_price_9,ask_vol_9,bid_price_10,bid_vol_10,ask_price_10,ask_vol_10,btc_price,eth_volume
0,2025-12-01 02:57:15.727819,2841.165,2841.16,0.042627,2841.17,0.017375,2841.1,0.114237,2841.27,0.175978,...,2840.81,0.529741,2841.81,0.05,2840.8,0.072,2841.82,0.458657,86740.79,3.87627
1,2025-12-01 02:57:21.073133,2841.085,2841.1,0.000496,2841.11,0.06746,2841.09,0.201554,2841.16,0.175985,...,2840.69,0.517682,2841.55,0.001,2840.6,0.85129,2841.66,0.05,86736.72,4.352593
2,2025-12-01 02:57:26.375175,2840.99,2840.82,0.000882,2841.0,0.06746,2840.81,0.042241,2841.17,0.029813,...,2840.55,1.807218,2841.41,0.05,2840.54,0.5,2841.42,0.099998,86750.945,4.260239
3,2025-12-01 02:57:31.660663,2841.005,2841.0,0.068,2841.01,0.05406,2840.91,0.043133,2841.29,0.0352,...,2840.6,0.068,2841.72,0.001,2840.54,0.437317,2841.73,1.341443,86746.33,4.992228
4,2025-12-01 02:57:37.007031,2841.005,2841.0,0.069421,2841.01,0.047244,2840.9,0.11024,2841.22,0.000891,...,2840.6,0.068,2841.54,0.512,2840.55,0.437316,2841.55,0.001,86750.475,4.876928


## Save Combined Dataset

In [12]:
# Destination of the merged dataset, relative to the notebook's working directory.
output_path = 'eth_orderbook_combined_sequential.csv'

# Write the combined frame without its positional index column.
combined_df.to_csv(output_path, index=False)

# Summary of what was written. The source CSVs were read without parse_dates,
# so the timestamp min/max below are taken on the values as stored
# (NOTE(review): if these are strings, ordering is lexicographic -- fine for
# the ISO-style timestamps shown in the previews, but confirm).
print(
    f"✓ Saved to: {output_path}",
    f"  Total rows: {len(combined_df):,}",
    f"  Total columns: {len(combined_df.columns)}",
    f"\nDataset summary:",
    f"  Time range: {combined_df['timestamp'].min()} to {combined_df['timestamp'].max()}",
    f"  ETH volume range: {combined_df['eth_volume'].min():.2f} to {combined_df['eth_volume'].max():.2f}",
    f"  Note: eth_volume = orderbook depth (sum of bid/ask volumes)",
    f"  Note: btc_volume dropped (no orderbook data, only btc_price available)",
    sep="\n",
)

✓ Saved to: eth_orderbook_combined_sequential.csv
  Total rows: 343,300
  Total columns: 44

Dataset summary:
  Time range: 2025-12-01 02:57:15.727819 to 2025-12-27 15:46:38.712373
  ETH volume range: 0.70 to 2796.39
  Note: eth_volume = orderbook depth (sum of bid/ask volumes)
  Note: btc_volume dropped (no orderbook data, only btc_price available)
