In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw order-flow CSV into `df`; the cells below read from this frame.
raw_orders_path = './data/20170417AEM_original.csv'
df = pd.read_csv(raw_orders_path)
# Preview the first rows as the cell output.
df.head()

Unnamed: 0,time,time_nanos,symbol,venue,side,book_change,price,reason,best_bid,best_offer,best_bid_size,best_offer_size,market_state
0,2017-04-17 03:25:02,413092027,AEM,TSE,Buy,200,29.97,BOD,29.97,999.99,200,0,
1,2017-04-17 03:25:02,413097585,AEM,TSE,Buy,1000,53.0,BOD,53.0,999.99,1000,0,
2,2017-04-17 03:25:02,413107773,AEM,TSE,Buy,100,52.0,BOD,53.0,999.99,1000,0,
3,2017-04-17 03:25:02,413112997,AEM,TSE,Buy,100,55.95,BOD,55.95,999.99,100,0,
4,2017-04-17 03:25:02,413123448,AEM,TSE,Buy,100,55.8,BOD,55.95,999.99,100,0,


When the market state is
* "nan" : The market is closed and only receives orders.
* "Pre-open" : From 7am to 9:30am. Orders may be entered, but will not be executed. The COP is displayed and continuously updated.
* "Opening" :  Market on Open (MOO) at 9:30am. All matching orders are executed at a single opening trade price with any remaining orders carrying through to the continuous limit order book.
* "Open" : Continuous Trading from 9:30 to 4pm - All regular order types are accepted.
* 'MOC Imbalance' : MOC Imbalance market from 3:40pm to 4pm.
* 'CCP Determination' : Calculated Closing Price (“CCP”) Determination at 4pm.
* 'Extended Hours CXLs' : Post Market Cancel Session from 4:10pm to 4:15pm - During this session, open orders may be cancelled by the dealer.
* 'Extended Hours Open' : Extended Trading Session from 4:15pm to 5pm - Orders at the last sale price are accepted.

See https://www.tsx.com/trading/calendars-and-trading-hours/trading-hours

In [4]:
# Distinct values taken by the market_state column, in order of first appearance.
df.market_state.unique()

array([nan, 'Pre-open', 'Opening', 'Open', 'MOC Imbalance',
       'CCP Determination', 'Extended Hours CXLs', 'Extended Hours Open'],
      dtype=object)

Order types :
* "BOD" : Orders sent when the market is closed.
* "Booked" : Booked limit orders.
* "CANCELLED" : Cancelled limit orders.
* "TRADE" : Market orders.
* "Undisclosed" : Compensator of an unfilled market order.
* "PRICE_CHANGE:COP" : 
* "PRICE_CHANGE:AssignLimit" : 

See https://www.tsx.com/resource/en/133

In [28]:
# Distinct values taken by the reason column, in order of first appearance.
df.reason.unique()

array(['BOD', 'Booked', 'CANCELLED', 'PRICE_CHANGE:COP',
       'PRICE_CHANGE:AssignLimit', 'TRADE', 'Undisclosed'], dtype=object)

While the market is in the Pre-open state, we group the orders by price and sum the book changes at each price level to obtain the sequence of all bids and offers.

In [None]:
def nonrecursive_order_book(position, data=None):
    """Aggregate the order flow up to ``position`` into per-price depth.

    Parameters
    ----------
    position : index label
        Last row label to include. The slice is taken with ``.loc[:position]``,
        which is label-based and therefore includes ``position`` itself.
    data : pandas.DataFrame, optional
        Frame with ``side``, ``price`` and ``book_change`` columns. Defaults to
        the notebook-level ``df`` so existing one-argument calls are unchanged.

    Returns
    -------
    (lob_buy, lob_sell) : tuple of pandas.DataFrame
        Each frame is indexed by ``price`` and has one ``book_change`` column
        holding the summed change at that price. Price levels whose sum is not
        strictly positive are dropped.
    """
    if data is None:
        # Fall back to the notebook-level frame, resolved at call time.
        data = df
    slice_df = data.loc[:position]
    return _aggregate_side(slice_df, 'Buy'), _aggregate_side(slice_df, 'Sell')


def _aggregate_side(frame, side):
    """Sum ``book_change`` per price for one side, keeping positive totals only."""
    changes = frame.loc[frame['side'] == side, ['price', 'book_change']]
    depth = changes.groupby('price').sum()
    return depth[depth['book_change'] > 0]

# Depth of the limit order book: number of price levels reported per side.
N = 5

# Label of the row right after the last 'Pre-open' record. The initial book is
# aggregated from every row up to and including this label.
pre_open_rows = df[df['market_state'] == 'Pre-open']
start = pre_open_rows.index[-1] + 1
lob_buy, lob_sell = nonrecursive_order_book(start)

# Flatten each side into a {price: volume} dictionary, preserving index order.
lob_sell_dict = dict(zip(lob_sell.index.values, lob_sell['book_change'].values))
lob_buy_dict = dict(zip(lob_buy.index.values, lob_buy['book_change'].values))

After the market opens, we build a limit order book of depth $N$ from the current highest bid price $p_b$ and lowest ask price $p_a$.
The ask side covers the price levels from $p_a$ to $p_a + (N - 1) \times \text{ticksize}$, and the bid side covers the price levels from $p_b - (N - 1) \times \text{ticksize}$ to $p_b$.
If the price level $p_a + i \times \text{ticksize}$ is already in the sequence, we record its volume; otherwise we set the volume to 0. The bid side is handled in the same way. The book is updated whenever a new order arrives.

In [None]:
def update_dict(d, price, bookChange):
    """Add ``bookChange`` to the volume stored at ``price`` in ``d``.

    If ``price`` is already a key, the stored volume is incremented; otherwise
    a new entry is created with ``bookChange`` as its volume. ``d`` is mutated
    in place and nothing is returned.
    """
    d[price] = bookChange + d[price] if price in d else bookChange


def generate_lob(row, depth=None, buy_book=None, sell_book=None):
    """Apply one order-flow row to the books and return the top-of-book snapshot.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series)
        Must expose ``side``, ``price`` and ``book_change``. Rows whose side is
        neither 'Buy' nor 'Sell' leave both books untouched.
    depth : int, optional
        Number of price levels per side. Defaults to the notebook-level ``N``.
    buy_book, sell_book : dict, optional
        ``{price: volume}`` dictionaries, mutated in place. Default to the
        notebook-level ``lob_buy_dict`` / ``lob_sell_dict``.

    Returns
    -------
    numpy.ndarray of shape ``(1, 4 * depth)``
        For each level ``i`` in order: ask price, bid price, ask volume, bid
        volume. Ask levels start at the lowest ask and go up one tick per
        level; bid levels start at the highest bid and go down one tick per
        level. A level with no entry in the book has volume 0. If a side has
        no non-zero level at all, its prices are NaN and its volumes 0
        (previously this case raised ``ValueError`` from ``min``/``max``).
    """
    # Resolve defaults at call time so the notebook-level state is used when
    # the function is called with a single argument (e.g. via DataFrame.apply).
    if depth is None:
        depth = N
    if buy_book is None:
        buy_book = lob_buy_dict
    if sell_book is None:
        sell_book = lob_sell_dict

    tick = 0.01      # price increment between adjacent levels
    decimals = 2     # rounding applied to level prices before dictionary lookup

    # Update the book on the side this row belongs to.
    side = row['side']
    if side == 'Buy':
        update_dict(buy_book, row['price'], row['book_change'])
    if side == 'Sell':
        update_dict(sell_book, row['price'], row['book_change'])

    # Ignore price levels whose accumulated volume is exactly 0.
    # NOTE(review): levels with a negative running total are kept here, while
    # nonrecursive_order_book keeps only strictly positive totals - confirm
    # which behaviour is intended.
    bids = {p: v for p, v in buy_book.items() if v != 0}
    asks = {p: v for p, v in sell_book.items() if v != 0}

    # Best prices are loop-invariant; compute them once. NaN marks an empty side.
    best_ask = min(asks) if asks else np.nan
    best_bid = max(bids) if bids else np.nan

    snapshot = []
    for i in range(depth):
        ask_price = np.round(best_ask + i * tick, decimals)
        bid_price = np.round(best_bid - i * tick, decimals)
        # Explicit per-level order: pa_i, pb_i, va_i, vb_i.
        snapshot.extend([
            ask_price,
            bid_price,
            asks.get(ask_price, 0),
            bids.get(bid_price, 0),
        ])

    return np.array(snapshot).reshape(1, 4 * depth)

In [None]:
# Replay every row after the initial-book cutoff through generate_lob, one
# snapshot per row. Rows are renumbered from 0 so they line up positionally
# with the snapshots.
# NOTE: generate_lob mutates lob_buy_dict / lob_sell_dict, so re-running this
# cell without first rebuilding those dictionaries applies the order flow twice.
df_open = df.loc[start + 1:].reset_index(drop=True)
result = df_open.apply(generate_lob, axis=1)

Concatenate the limit order book data frame with the original data frame.

In [None]:
# Stack the per-row snapshots into a single frame, one row per order event.
lob = pd.DataFrame(np.concatenate(result.values.tolist()))

# Map positional columns to names: within each level i the order is
# pa_i, pb_i, va_i, vb_i, matching the layout produced by generate_lob.
col = {
    4 * level + offset: label + str(level)
    for level in range(N)
    for offset, label in enumerate(('pa', 'pb', 'va', 'vb'))
}
lob = lob.rename(columns=col)

# Put the snapshots side by side with the order events they correspond to.
merge_lob = pd.concat([df_open, lob], axis=1).reindex(lob.index)
merge_lob.head()