In [1]:
import gzip
import datetime
import pandas as pd
from BTrees.OOBTree import OOBTree

def replay_L2(filename, target_exchange, target_symbol, fn, fn_timestamps):
    """Replay a gzipped incremental L2 order-book CSV and sample it on a time grid.

    Every data row must hold eight comma-separated fields, in this order:
    ``exchange, symbol, timestamp, local_timestamp, is_snapshot, side, price,
    amount``. The first line of the file is treated as a header and skipped.
    Rows whose exchange or symbol differ from the targets are ignored.

    The book is kept in two ``OOBTree`` objects (price -> size), one per side.
    A row with a zero amount removes its price level, any other amount
    sets the level. When a run of snapshot rows starts, both sides are
    cleared first so the snapshot replaces the previous book.

    Args:
        filename: Path of the gzip-compressed CSV file.
        target_exchange: Exchange name a row must carry to be processed.
        target_symbol: Symbol a row must carry to be processed.
        fn: Callback invoked as ``fn(grid_timestamp, last_update_timestamp,
            bid, ask)``. It is called for a grid point just before the first
            row with a strictly greater ``local_timestamp`` is applied, so the
            book reflects every row up to and including that grid point.
            ``last_update_timestamp`` is the ``local_timestamp`` of the last
            processed row (``None`` if there was none). ``bid`` and ``ask``
            are the live trees, not copies.
        fn_timestamps: Ascending grid timestamps, in the same unit as the
            ``local_timestamp`` column. Grid points that lie after the last
            row are flushed with the final book once the file is exhausted.
    """
    bid = OOBTree()
    ask = OOBTree()
    processing_snapshot = False
    prev_timestamp = None
    i = 0
    n_timestamps = len(fn_timestamps)

    with gzip.open(filename, 'rt', encoding='utf-8') as f:
        # Skip header
        f.readline()
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank lines instead of failing on the unpack below.
                continue
            (exchange, symbol, timestamp, local_timestamp,
             is_snapshot, side, price, amount) = line.split(',')
            if exchange != target_exchange or symbol != target_symbol:
                continue

            local_timestamp = int(local_timestamp)
            price = float(price)
            # The CSV field is text; a non-empty string such as 'false' is
            # truthy, so it has to be compared explicitly.
            is_snapshot = is_snapshot.lower() == 'true'
            # Integer sizes stay ints; decimal sizes fall back to float.
            try:
                size = int(amount)
            except ValueError:
                size = float(amount)

            if prev_timestamp != local_timestamp:
                # Emit every grid point that this row has moved past. The
                # bounds check keeps rows after the last grid point from
                # indexing beyond the end of fn_timestamps.
                while i < n_timestamps and local_timestamp > fn_timestamps[i]:
                    fn(fn_timestamps[i], prev_timestamp, bid, ask)
                    i += 1
            prev_timestamp = local_timestamp

            # Reconstruct the market depth
            if is_snapshot and not processing_snapshot:
                # First row of a snapshot: drop the stale book.
                processing_snapshot = True
                bid.clear()
                ask.clear()
            elif not is_snapshot and processing_snapshot:
                processing_snapshot = False

            book = bid if side == 'bid' else ask
            if size != 0:
                book[price] = size
            elif price in book:
                # Deleting a level that is not in the book is a no-op.
                del book[price]

    # Grid points beyond the end of the data all see the final book.
    for j in range(i, n_timestamps):
        fn(fn_timestamps[j], prev_timestamp, bid, ask)

def make_timestamps(since, to, interval):
    """Build an evenly spaced grid of timestamps in microseconds.

    The grid starts at ``since`` and advances by ``interval`` seconds while
    the current point is still before ``to``. The last point is therefore the
    first one that is at or after ``to``, so it can overshoot ``to`` when the
    span is not a multiple of ``interval``.

    Args:
        since: ``datetime.datetime`` of the first grid point.
        to: ``datetime.datetime`` marking where the grid stops advancing.
        interval: Step in seconds; must be positive.

    Returns:
        list of float: ``datetime.timestamp() * 1000000`` for each grid point.

    Raises:
        ValueError: If ``interval`` is not positive (the loop would otherwise
            never terminate).
    """
    if interval <= 0:
        raise ValueError('interval must be a positive number of seconds')

    step = datetime.timedelta(seconds=interval)
    index = [since.timestamp() * 1000000]
    while since < to:
        # Rebinds the local name only; the caller's datetime is immutable.
        since += step
        index.append(since.timestamp() * 1000000)
    return index

In [2]:
# Sampling grid: one point every 5 seconds from 2019-07-01 to 2019-07-02 (UTC).
since = datetime.datetime(2019, 7, 1, tzinfo=datetime.timezone.utc)
to = since + datetime.timedelta(days=1)
timestamps = make_timestamps(since=since, to=to, interval=5)

# One row per grid point, pre-filled with NaN; the replay callbacks below
# overwrite a row in place whenever both sides of the book are non-empty.
df_imbalance = pd.DataFrame(
    float('nan'),
    index=timestamps,
    columns=[
        'best_bid', 'best_ask',
        '0.5%', '1%', '2.5%', '5%', '7.5%', '10%', '15%', '20%', '25%',
    ],
)
def process_L2(callback_timestamp, timestamp, bid, ask):
    """Record best quotes and raw depth imbalance for one grid point.

    Writes one row into the module-level ``df_imbalance`` at index label
    ``callback_timestamp``: best bid, best ask, and then, for each band
    around the mid price, the resting bid size strictly above
    ``mid * (1 - band)`` minus the resting ask size strictly below
    ``mid * (1 + band)``. Nothing is written if either side is empty.

    Args:
        callback_timestamp: Grid timestamp; used as the row label.
        timestamp: Timestamp of the last book update (unused here).
        bid: Price -> size tree for the bid side.
        ask: Price -> size tree for the ask side.
    """
    if not (bid and ask):
        return

    top_bid = bid.maxKey()
    top_ask = ask.minKey()
    mid = (top_bid + top_ask) / 2

    values = [top_bid, top_ask]
    for band in (0.005, 0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.25):
        floor = mid * (1 - band)
        ceiling = mid * (1 + band)
        bid_depth = sum(size for price, size in bid.iteritems() if price > floor)
        ask_depth = sum(size for price, size in ask.iteritems() if price < ceiling)
        values.append(bid_depth - ask_depth)

    df_imbalance.loc[callback_timestamp] = values
    
#df_imbalance = pd.DataFrame(float('nan'), index=timestamps, columns=['best_bid', 'best_ask', '0.5%', '1%', '2.5%', '5%', '7.5%', '10%', '15%', '20%', '25%'])
def process_L2_normalization(callback_timestamp, timestamp, bid, ask):
    """Record best quotes and normalized depth imbalance for one grid point.

    Writes one row into the module-level ``df_imbalance`` at index label
    ``callback_timestamp``: best bid, best ask, and then, for each band
    around the mid price, ``(buy - sell) / (buy + sell)`` where ``buy`` is
    the bid size strictly above ``mid * (1 - band)`` and ``sell`` is the ask
    size strictly below ``mid * (1 + band)``. Nothing is written if either
    side of the book is empty.

    A band that contains no size on either side (for example when the spread
    is wider than the band) has no defined ratio and is stored as NaN instead
    of raising ``ZeroDivisionError``.

    Args:
        callback_timestamp: Grid timestamp; used as the row label.
        timestamp: Timestamp of the last book update (unused here).
        bid: Price -> size tree for the bid side.
        ask: Price -> size tree for the ask side.
    """
    if bid and ask:
        best_bid = bid.maxKey()
        best_ask = ask.minKey()
        mid = (best_bid + best_ask) / 2
        row = [best_bid, best_ask]
        for p in [0.005, 0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.25]:
            lower = mid * (1 - p)
            upper = mid * (1 + p)
            buy = sum(size for price, size in bid.iteritems() if price > lower)
            sell = sum(size for price, size in ask.iteritems() if price < upper)
            total = buy + sell
            # No depth inside the band: the ratio is undefined.
            row.append((buy - sell) / total if total else float('nan'))
        df_imbalance.loc[callback_timestamp] = row

#df_imbalance = pd.DataFrame(float('nan'), index=timestamps, columns=['best_bid', 'best_ask', '50M', '60M', '70M', '80M', '90M', '100M', '110M', '120M', '130M', '140M', '150M'])
def process_L2_weighted_price(callback_timestamp, timestamp, bid, ask):
    """Record best quotes and size-weighted average book prices for one grid point.

    Writes one row into the module-level ``df_imbalance`` at index label
    ``callback_timestamp``: best bid, best ask, and then one value per
    quantity threshold ``q``. For each ``q`` the levels of each side are
    taken from the best price outwards for as long as the cumulative size
    stays at or below ``q``; the value is the size-weighted average price over
    the selected bid and ask levels together. Nothing is written if either
    side of the book is empty.

    The row holds 13 values (2 quotes + 11 thresholds), so ``df_imbalance``
    must have been created with 13 columns before this callback is used.

    If the best level on both sides already exceeds ``q`` no level is
    selected and the pandas/numpy 0/0 division yields NaN for that threshold.

    Args:
        callback_timestamp: Grid timestamp; used as the row label.
        timestamp: Timestamp of the last book update (unused here).
        bid: Price -> size tree for the bid side.
        ask: Price -> size tree for the ask side.
    """
    if bid and ask:
        best_bid = bid.maxKey()
        best_ask = ask.minKey()
        row = [best_bid, best_ask]

        # The book does not change between thresholds, so order each side
        # from the touch outwards and accumulate sizes once, not once per q.
        df_bid = pd.DataFrame(sorted(bid.iteritems(), key=lambda x: -x[0]), columns=['price', 'size'])
        df_ask = pd.DataFrame(sorted(ask.iteritems(), key=lambda x: x[0]), columns=['price', 'size'])
        bid_cumulative = df_bid['size'].cumsum()
        ask_cumulative = df_ask['size'].cumsum()

        for q in [50000000, 60000000, 70000000, 80000000, 90000000, 100000000, 110000000, 120000000, 130000000, 140000000, 150000000]:
            bid_top = df_bid[bid_cumulative <= q]
            ask_top = df_ask[ask_cumulative <= q]

            notional = (bid_top['price'] * bid_top['size']).sum() + (ask_top['price'] * ask_top['size']).sum()
            row.append(notional / (bid_top['size'].sum() + ask_top['size'].sum()))

        df_imbalance.loc[callback_timestamp] = row

# Download CSV files from https://docs.tardis.dev/historical-data-details/
# Replay the book with one callback; the commented-out calls select the
# alternative callbacks defined above.
replay_L2(
    'bitmex_incremental_book_L2_2019-07-01_XBTUSD.csv.gz',
    'bitmex',
    'XBTUSD',
    process_L2,
    timestamps,
)
# replay_L2('bitmex_incremental_book_L2_2019-07-01_XBTUSD.csv.gz', 'bitmex', 'XBTUSD', process_L2_normalization, timestamps)
# replay_L2('bitmex_incremental_book_L2_2019-07-01_XBTUSD.csv.gz', 'bitmex', 'XBTUSD', process_L2_weighted_price, timestamps)

# Load the trades file and keep only the bitmex / XBTUSD rows.
df_trades = (
    pd.read_csv('bitmex_trades_2019-07-01_XBTUSD.csv.gz', compression='gzip', header=0, sep=',')
    .loc[lambda trades: (trades['exchange'] == 'bitmex') & (trades['symbol'] == 'XBTUSD')]
)

In [3]:
# Turn the microsecond grid labels into datetimes so they line up with the
# resampled trade bars. NOTE: this rewrites df_imbalance's index in place.
df_imbalance.index = pd.to_datetime(df_imbalance.index, unit='us')
df_imbalance.index.name = 'local_timestamp'

# Index trades by their local (receive) timestamp and build 5-second bars.
# Bars are closed and labelled on the right edge, i.e. a bar labelled T
# covers trades in (T - 5s, T].
df_trades.index = pd.to_datetime(df_trades['local_timestamp'], unit='us')
ohlcv = df_trades.resample('5s', closed='right', label='right').agg({'price': 'ohlc', 'amount': 'sum'})
ohlcv.columns = ['open', 'high', 'low', 'close', 'volume']
# Bars without trades have NaN prices: carry the previous close forward and
# use it to fill the remaining gaps. `.ffill()` replaces the deprecated
# `fillna(method='pad')` spelling and does the same thing.
closes = ohlcv['close'].ffill()
ohlcv = ohlcv.apply(lambda x: x.fillna(closes))

# Outer join keeps every timestamp that appears in either frame.
df = ohlcv.merge(df_imbalance, left_on='local_timestamp', right_on='local_timestamp', how='outer', sort=True)

# Drop the first two rows of the merged frame.
# NOTE(review): presumably because they precede the first complete bar - confirm.
df = df.iloc[2:]

df.to_pickle('imbalance')
df.to_csv('imbalance.csv')

In [4]:
# Display the merged OHLCV + imbalance frame (the rendered output elides the middle rows).
df

Unnamed: 0_level_0,open,high,low,close,volume,best_bid,best_ask,0.5%,1%,2.5%,5%,7.5%,10%,15%,20%,25%
local_timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2019-07-01 00:00:10,10760.5,10761.0,10760.5,10760.5,56023.0,10760.5,10761.0,-1389441.0,-999943.0,48096826.0,100604222.0,108973952.0,117865096.0,152406830.0,162300964.0,167728898.0
2019-07-01 00:00:15,10761.0,10764.5,10760.5,10762.5,926323.0,10762.0,10762.5,1571316.0,1703027.0,48454239.0,100633691.0,109541316.0,118348535.0,153029937.0,162910907.0,168437137.0
2019-07-01 00:00:20,10762.5,10772.0,10762.0,10770.0,1244732.0,10767.0,10769.0,224234.0,-1414694.0,46353982.0,98992727.0,107035431.0,116391257.0,151149733.0,160812336.0,167064389.0
2019-07-01 00:00:25,10770.0,10770.5,10760.5,10760.5,1091694.0,10760.5,10761.0,-4498538.0,-3595487.0,42191754.0,94831371.0,103322263.0,112142075.0,146712372.0,156624290.0,161995052.0
2019-07-01 00:00:30,10761.5,10761.5,10750.0,10750.5,1451809.0,10750.0,10750.5,-726613.0,2568195.0,47327022.0,97825154.0,108057793.0,117547053.0,149456366.0,159482185.0,164839505.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-07-01 23:59:40,10571.0,10571.0,10570.5,10570.5,68201.0,10570.5,10571.0,83610.0,-3948019.0,-30913821.0,-62871067.0,-29889465.0,17574724.0,83354457.0,114446177.0,120950296.0
2019-07-01 23:59:45,10570.5,10571.0,10570.5,10570.5,54755.0,10570.5,10571.0,180408.0,-3625694.0,-30847722.0,-62804968.0,-29755246.0,17640914.0,83440647.0,114532367.0,121036486.0
2019-07-01 23:59:50,10570.5,10571.0,10570.5,10570.5,11704.0,10570.5,10571.0,222568.0,-3743971.0,-30961426.0,-62851820.0,-29869566.0,17529659.0,83329442.0,114421162.0,120925281.0
2019-07-01 23:59:55,10571.0,10571.0,10570.5,10570.5,7107.0,10570.5,10571.0,39758.0,-3951059.0,-31297108.0,-63188036.0,-30205079.0,17191607.0,82991390.0,114088310.0,120592429.0


In [5]:
# Display the filtered trades, now indexed by their local_timestamp as a datetime.
df_trades

Unnamed: 0_level_0,exchange,symbol,timestamp,local_timestamp,id,side,price,amount
local_timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2019-07-01 00:00:05.934605,bitmex,XBTUSD,1561939203820000,1561939205934605,95e052c9-d8ed-1b02-2ea1-a47eb3337432,sell,10760.5,17089
2019-07-01 00:00:05.950839,bitmex,XBTUSD,1561939203850000,1561939205950839,d6cc5a30-d0b0-44f0-4259-2ae73021c2f7,buy,10761.0,1000
2019-07-01 00:00:05.964131,bitmex,XBTUSD,1561939203880000,1561939205964131,5a794de6-c89a-36d9-d42f-0ff9c314ae11,buy,10761.0,2000
2019-07-01 00:00:05.981308,bitmex,XBTUSD,1561939203916000,1561939205981308,c7a917d7-b584-e269-3125-0707c3299a0d,buy,10761.0,1076
2019-07-01 00:00:06.003553,bitmex,XBTUSD,1561939203962000,1561939206003553,896a073d-6cf8-437d-0247-e832db00d1a7,sell,10760.5,1000
...,...,...,...,...,...,...,...,...
2019-07-01 23:59:59.213046,bitmex,XBTUSD,1562025599200000,1562025599213046,3abf4dd3-6cac-b474-459e-0695eb824bc1,sell,10570.5,25
2019-07-01 23:59:59.213046,bitmex,XBTUSD,1562025599200000,1562025599213046,e856d3cb-75bc-94b6-6efa-ae95228a5cab,sell,10570.5,33
2019-07-01 23:59:59.213046,bitmex,XBTUSD,1562025599200000,1562025599213046,68bb2b5d-7026-4dc7-79d6-23435e26c8a9,sell,10570.5,1832
2019-07-01 23:59:59.222025,bitmex,XBTUSD,1562025599204000,1562025599222025,d8370369-91a8-e349-a662-58392c66c62e,sell,10570.5,1
