In [1]:
import numpy as np
import pandas as pd

In [60]:
"""
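Additional features, each derived from the level-1 mid price ((bid + ask) / 2):
- High price
- Low price
- Log return
- Realized volatility (computed as the standard deviation of mid-price log returns)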
"""

def calc_high(df, stock='a', n_steps=120):
    """Return, per row, the highest level-1 mid price across the time steps.

    The mid price at step ``i`` is the average of the columns
    ``bid_price1_{stock}_t={i}`` and ``ask_price1_{stock}_t={i}``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding one bid and one ask column per time step.
    stock : str, default 'a'
        Stock identifier embedded in the column names.
    n_steps : int, default 120
        Number of time steps to scan (``t=0`` .. ``t=n_steps-1``).

    Returns
    -------
    pandas.Series
        Element-wise maximum of the per-step mid prices, indexed like ``df``.
        ``np.maximum`` propagates NaN, so a NaN mid price at any step makes
        the result for that row NaN.

    Raises
    ------
    ValueError
        If ``n_steps`` is smaller than 1.
    KeyError
        If one of the expected bid/ask columns is missing from ``df``.
    """
    if n_steps < 1:
        raise ValueError("n_steps must be at least 1")

    high = None
    for i in range(n_steps):
        # Compute the mid price once per step instead of once per branch.
        mid = (df[f"bid_price1_{stock}_t={i}"] + df[f"ask_price1_{stock}_t={i}"]) / 2
        high = mid if high is None else np.maximum(high, mid)
    return high

def calc_low(df, stock='a', n_steps=120):
    """Return, per row, the lowest level-1 mid price across the time steps.

    The mid price at step ``i`` is the average of the columns
    ``bid_price1_{stock}_t={i}`` and ``ask_price1_{stock}_t={i}``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding one bid and one ask column per time step.
    stock : str, default 'a'
        Stock identifier embedded in the column names.
    n_steps : int, default 120
        Number of time steps to scan (``t=0`` .. ``t=n_steps-1``).

    Returns
    -------
    pandas.Series
        Element-wise minimum of the per-step mid prices, indexed like ``df``.
        ``np.minimum`` propagates NaN, so a NaN mid price at any step makes
        the result for that row NaN.

    Raises
    ------
    ValueError
        If ``n_steps`` is smaller than 1.
    KeyError
        If one of the expected bid/ask columns is missing from ``df``.
    """
    if n_steps < 1:
        raise ValueError("n_steps must be at least 1")

    low = None
    for i in range(n_steps):
        # Compute the mid price once per step instead of once per branch.
        mid = (df[f"bid_price1_{stock}_t={i}"] + df[f"ask_price1_{stock}_t={i}"]) / 2
        low = mid if low is None else np.minimum(low, mid)
    return low

def calc_log_return(df, stock='a', n_steps=120):
    """Return, per row, the log return of the mid price from the first to the last step.

    The mid price at step ``i`` is the average of the columns
    ``bid_price1_{stock}_t={i}`` and ``ask_price1_{stock}_t={i}``. The result
    is ``log(mid[n_steps - 1] / mid[0])``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the bid and ask columns for the first and last step.
    stock : str, default 'a'
        Stock identifier embedded in the column names.
    n_steps : int, default 120
        Number of time steps in the window; the last step read is
        ``t=n_steps-1``.

    Returns
    -------
    pandas.Series
        Log return per row, indexed like ``df``.

    Raises
    ------
    ValueError
        If ``n_steps`` is smaller than 1.
    KeyError
        If one of the expected bid/ask columns is missing from ``df``.
    """
    if n_steps < 1:
        raise ValueError("n_steps must be at least 1")

    last_step = n_steps - 1
    start_price = (df[f"bid_price1_{stock}_t=0"] + df[f"ask_price1_{stock}_t=0"]) / 2
    end_price = (
        df[f"bid_price1_{stock}_t={last_step}"] + df[f"ask_price1_{stock}_t={last_step}"]
    ) / 2
    return np.log(end_price / start_price)

def calc_volatility(df, stock='a', n_steps=120):
    """Return, per row, the standard deviation of step-to-step mid-price log returns.

    The mid price at step ``i`` is the average of the columns
    ``bid_price1_{stock}_t={i}`` and ``ask_price1_{stock}_t={i}``. For each
    row the ``n_steps - 1`` consecutive log returns ``log(mid[i+1] / mid[i])``
    are computed and their population standard deviation (``ddof=0``, the
    ``np.std`` default) is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding one bid and one ask column per time step.
    stock : str, default 'a'
        Stock identifier embedded in the column names.
    n_steps : int, default 120
        Number of time steps in the window (``t=0`` .. ``t=n_steps-1``).

    Returns
    -------
    numpy.ndarray
        1-D array with one volatility value per row of ``df``, in row order.

    Raises
    ------
    ValueError
        If ``n_steps`` is smaller than 2 (no log return can be formed).
    KeyError
        If one of the expected bid/ask columns is missing from ``df``.
    """
    if n_steps < 2:
        raise ValueError("n_steps must be at least 2 to form a log return")

    # Stack the per-step mid prices, then transpose so that each row of the
    # array is the price path of one row of df: shape (n_rows, n_steps).
    mid_prices = np.array([
        (df[f"bid_price1_{stock}_t={i}"] + df[f"ask_price1_{stock}_t={i}"]) / 2
        for i in range(n_steps)
    ]).T

    # Vectorized over all rows at once instead of a Python loop per row.
    log_returns = np.log(mid_prices[:, 1:] / mid_prices[:, :-1])
    return np.std(log_returns, axis=1)

In [25]:
# Load the raw training set from the working directory
train_data = pd.read_csv(filepath_or_buffer="train.csv")

In [26]:
# Per-row highest mid price, one output column per stock
for column, stock in (("a_high", "a"), ("b_high", "b")):
    train_data[column] = calc_high(train_data, stock)

In [30]:
# Per-row lowest mid price, one output column per stock
for column, stock in (("a_low", "a"), ("b_low", "b")):
    train_data[column] = calc_low(train_data, stock)

In [36]:
# Per-row first-to-last log return of the mid price, one output column per stock
for column, stock in (("a_log_ret", "a"), ("b_log_ret", "b")):
    train_data[column] = calc_log_return(train_data, stock)

In [63]:
# Per-row standard deviation of mid-price log returns, one output column per stock
for column, stock in (("a_volatility", "a"), ("b_volatility", "b")):
    train_data[column] = calc_volatility(train_data, stock)

In [65]:
# Load the raw test set and append the same engineered columns as for training
test_data = pd.read_csv(filepath_or_buffer="test.csv")

# (output column, feature function, stock), listed in the order the columns are appended
test_feature_specs = (
    ("a_high", calc_high, "a"),
    ("b_high", calc_high, "b"),
    ("a_low", calc_low, "a"),
    ("b_low", calc_low, "b"),
    ("a_log_ret", calc_log_return, "a"),
    ("b_log_ret", calc_log_return, "b"),
    ("a_volatility", calc_volatility, "a"),
    ("b_volatility", calc_volatility, "b"),
)
for column, feature_fn, stock in test_feature_specs:
    test_data[column] = feature_fn(test_data, stock)

In [67]:
# Persist both frames, engineered columns included, without writing the index
for frame, out_path in ((train_data, "train_fe.csv"), (test_data, "test_fe.csv")):
    frame.to_csv(out_path, index=False)