In [1]:
import gzip
import datetime
import pandas as pd
from BTrees.OOBTree import OOBTree

def replay_L2(filename, fn, fn_timestamps):
    """Replay a gzip-compressed incremental L2 CSV and sample the book on a schedule.

    Every data row must hold eight comma-separated fields, in this order:
    exchange, symbol, timestamp, local_timestamp, is_snapshot, side, price,
    amount.  Rows are applied in file order to two price-keyed ``OOBTree``
    books, one for bids and one for asks.

    Each time a row with a new ``local_timestamp`` is read, ``fn`` is called
    for every scheduled timestamp strictly below it *before* that row is
    applied.  Once the file is exhausted, ``fn`` is called for every scheduled
    timestamp that is still pending.

    Args:
        filename: Path of the gzip-compressed CSV.  The first line is treated
            as a header and skipped.
        fn: Callable invoked as ``fn(scheduled_ts, last_row_ts, bid, ask)``.
            ``last_row_ts`` is the ``local_timestamp`` of the last row applied
            so far (``None`` if there is none yet).  ``bid`` and ``ask`` are
            the live books, not copies.
        fn_timestamps: Indexable sequence of scheduled timestamps, compared
            against the integer ``local_timestamp`` of each row.  It is walked
            front to back, so it is assumed to be in ascending order.

    Returns:
        None.  All results are delivered through ``fn``.
    """
    bid = OOBTree()
    ask = OOBTree()
    processing_snapshot = False
    prev_timestamp = None
    i = 0
    n_scheduled = len(fn_timestamps)

    with gzip.open(filename, 'rt', encoding='utf-8') as f:
        # Skip header
        f.readline()
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline) instead of
                # failing on the eight-field unpack below.
                continue
            (exchange, symbol, timestamp, local_timestamp,
             is_snapshot, side, price, amount) = line.split(',')

            local_timestamp = int(local_timestamp)
            price = float(price)
            # The CSV field is text; a non-empty string such as 'false' is
            # truthy, so it has to be compared explicitly.
            is_snapshot = is_snapshot.lower() == 'true'

            if prev_timestamp != local_timestamp:
                # Bound the index: the data may extend past the last
                # scheduled timestamp.
                while i < n_scheduled and local_timestamp > fn_timestamps[i]:
                    fn(fn_timestamps[i], prev_timestamp, bid, ask)
                    i += 1
            prev_timestamp = local_timestamp

            # Reconstruct the market depth: the first row of a snapshot
            # replaces whatever state was accumulated before it.
            if is_snapshot and not processing_snapshot:
                processing_snapshot = True
                bid.clear()
                ask.clear()
            elif not is_snapshot and processing_snapshot:
                processing_snapshot = False

            book = bid if side == 'bid' else ask
            if amount != '0':
                book[price] = int(amount)
            else:
                # A zero amount removes the level.  The level may already be
                # absent, so do not raise for it.
                book.pop(price, None)

    # Flush the schedule entries that lie beyond the end of the data.
    for j in range(i, n_scheduled):
        fn(fn_timestamps[j], prev_timestamp, bid, ask)

def make_timestamps(since, to, interval):
    """Build an evenly spaced grid of epoch timestamps in microseconds.

    The grid starts at ``since`` and advances by ``interval`` seconds until a
    point at or after ``to`` has been emitted.  When ``to - since`` is not a
    multiple of ``interval`` the last element therefore lies past ``to``.

    Args:
        since: ``datetime.datetime`` of the first grid point.
        to: ``datetime.datetime`` bounding the grid.
        interval: Step between grid points, in seconds.

    Returns:
        List of floats, each ``datetime.timestamp() * 1000000``.

    Raises:
        ValueError: If stepping is required (``since < to``) but ``interval``
            is not positive; the loop could never reach ``to``.
    """
    if since < to and interval <= 0:
        raise ValueError('interval must be positive, got %r' % (interval,))

    step = datetime.timedelta(seconds=interval)
    index = [since.timestamp() * 1000000]
    while since < to:
        since += step
        index.append(since.timestamp() * 1000000)
    return index

In [2]:
# Sampling window: all of 2019-07-01 (UTC), one callback every 5 seconds.
since = datetime.datetime(2019, 7, 1, tzinfo=datetime.timezone.utc)
to = since + datetime.timedelta(days=1)
timestamps = make_timestamps(since, to, 5)

# One all-NaN row per scheduled timestamp; process_L2 fills rows in during the replay.
df_imbalance = pd.DataFrame(
    float('nan'),
    index=timestamps,
    columns=['best_bid', 'best_ask', '5%', '10%'],
)
        
def process_L2(callback_timestamp, timestamp, bid, ask):
    """Record best quotes and depth imbalance for one scheduled timestamp.

    Writes one row into the module-level ``df_imbalance`` at label
    ``callback_timestamp``: best bid, best ask, and for each band (5% and 10%
    around the mid price) the resting bid amount strictly inside the band
    minus the resting ask amount strictly inside the band.  Nothing is written
    while either side of the book is empty.

    Args:
        callback_timestamp: Row label in ``df_imbalance`` to fill.
        timestamp: Unused; accepted so the function matches the callback
            signature used by ``replay_L2``.
        bid: Price-keyed BTree of bid amounts.
        ask: Price-keyed BTree of ask amounts.
    """
    if not (bid and ask):
        return

    best_bid = bid.maxKey()
    best_ask = ask.minKey()
    mid = (best_bid + best_ask) / 2
    row = [best_bid, best_ask]
    for p in [0.05, 0.1]:
        # Range queries on the ordered trees visit only the levels inside the
        # band instead of scanning the whole book; the bounds are exclusive,
        # i.e. price > lower for bids and price < upper for asks.
        buy = sum(bid.values(min=mid * (1 - p), excludemin=True))
        sell = sum(ask.values(max=mid * (1 + p), excludemax=True))
        row.append(buy - sell)
    df_imbalance.loc[callback_timestamp] = row

# Input files: download the CSVs from https://docs.tardis.dev/historical-data-details/bitmex
replay_L2(
    'bitmex_incremental_book_L2_2019-07-01_XBTUSD.csv.gz',
    process_L2,
    timestamps,
)

df_trades = pd.read_csv(
    'bitmex_trades_2019-07-01_XBTUSD.csv.gz',
    sep=',',
    header=0,
    compression='gzip',
)

In [3]:
# Show the sampled book table; rows that process_L2 never filled stay NaN.
df_imbalance

Unnamed: 0,best_bid,best_ask,5%,10%
1.561939e+15,,,,
1.561939e+15,10760.5,10761.0,100607160.0,117868034.0
1.561939e+15,10760.5,10761.0,100604222.0,117865096.0
1.561939e+15,10762.0,10762.5,100633691.0,118348535.0
1.561939e+15,10767.0,10769.0,98992727.0,116391257.0
...,...,...,...,...
1.562026e+15,10570.5,10571.0,-62871067.0,17574724.0
1.562026e+15,10570.5,10571.0,-62804968.0,17640914.0
1.562026e+15,10570.5,10571.0,-62851820.0,17529659.0
1.562026e+15,10570.5,10571.0,-63188036.0,17191607.0


In [4]:
# Show the raw trades loaded from the trades CSV.
df_trades

Unnamed: 0,exchange,symbol,timestamp,local_timestamp,id,side,price,amount
0,bitmex,XBTUSD,1561939203820000,1561939205934605,95e052c9-d8ed-1b02-2ea1-a47eb3337432,sell,10760.5,17089
1,bitmex,XBTUSD,1561939203850000,1561939205950839,d6cc5a30-d0b0-44f0-4259-2ae73021c2f7,buy,10761.0,1000
2,bitmex,XBTUSD,1561939203880000,1561939205964131,5a794de6-c89a-36d9-d42f-0ff9c314ae11,buy,10761.0,2000
3,bitmex,XBTUSD,1561939203916000,1561939205981308,c7a917d7-b584-e269-3125-0707c3299a0d,buy,10761.0,1076
4,bitmex,XBTUSD,1561939203962000,1561939206003553,896a073d-6cf8-437d-0247-e832db00d1a7,sell,10760.5,1000
...,...,...,...,...,...,...,...,...
1460958,bitmex,XBTUSD,1562025599200000,1562025599213046,3abf4dd3-6cac-b474-459e-0695eb824bc1,sell,10570.5,25
1460959,bitmex,XBTUSD,1562025599200000,1562025599213046,e856d3cb-75bc-94b6-6efa-ae95228a5cab,sell,10570.5,33
1460960,bitmex,XBTUSD,1562025599200000,1562025599213046,68bb2b5d-7026-4dc7-79d6-23435e26c8a9,sell,10570.5,1832
1460961,bitmex,XBTUSD,1562025599204000,1562025599222025,d8370369-91a8-e349-a662-58392c66c62e,sell,10570.5,1


In [9]:
# Put both tables on a datetime index named 'local_timestamp' so they can be
# joined on it.  The imbalance index holds epoch microseconds.
df_imbalance.index = pd.to_datetime(df_imbalance.index, unit='us').rename('local_timestamp')

df_trades.index = pd.to_datetime(df_trades['local_timestamp'], unit='us')

# 5-second trade bars labelled by their right edge; bins with no trades
# inherit the previous bar's values.
ohlc = (
    df_trades['price']
    .resample('5s', closed='right', label='right')
    .ohlc()
    .ffill()
)

# Outer join keeps timestamps present on either side, sorted by time.
df = pd.merge(
    ohlc,
    df_imbalance,
    how='outer',
    left_on='local_timestamp',
    right_on='local_timestamp',
    sort=True,
)
df.to_pickle('imbalance')

In [10]:
# Show the merged table: 5-second trade OHLC bars next to the book columns.
df

Unnamed: 0_level_0,open,high,low,close,best_bid,best_ask,5%,10%
local_timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2019-07-01 00:00:00,,,,,,,,
2019-07-01 00:00:05,,,,,10760.5,10761.0,100607160.0,117868034.0
2019-07-01 00:00:10,10760.5,10761.0,10760.5,10760.5,10760.5,10761.0,100604222.0,117865096.0
2019-07-01 00:00:15,10761.0,10764.5,10760.5,10762.5,10762.0,10762.5,100633691.0,118348535.0
2019-07-01 00:00:20,10762.5,10772.0,10762.0,10770.0,10767.0,10769.0,98992727.0,116391257.0
...,...,...,...,...,...,...,...,...
2019-07-01 23:59:40,10571.0,10571.0,10570.5,10570.5,10570.5,10571.0,-62871067.0,17574724.0
2019-07-01 23:59:45,10570.5,10571.0,10570.5,10570.5,10570.5,10571.0,-62804968.0,17640914.0
2019-07-01 23:59:50,10570.5,10571.0,10570.5,10570.5,10570.5,10571.0,-62851820.0,17529659.0
2019-07-01 23:59:55,10571.0,10571.0,10570.5,10570.5,10570.5,10571.0,-63188036.0,17191607.0
