In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from numpy import log, square, sqrt, power, arange, ones, zeros, isscalar,\
    array, outer, pi, sin, cos, expand_dims, repeat, full, concatenate, ravel
from numpy.random import normal
from scipy.optimize import least_squares, minimize
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
# functions for loading the datasets
def load_df(path='NSQ_OneYear100test_Sept21.csv'):
    """Read the intraday trade-bar CSV for the individual stocks.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the CSV file. Defaults to the file name the notebook
        has always used, so ``load_df()`` behaves as before.

    Returns
    -------
    pandas.DataFrame
        Only the columns listed below are kept. Timestamps and RICs are
        strings, prices are float64, trade count and volume are integers.
    """
    price_columns = ['Open', 'High', 'Low', 'Close', 'VWAP']
    count_columns = ['NumberOfTrades', 'Volume']

    column_types = {'Local_Date_Time': str, 'RIC': str}
    column_types.update({name: np.float64 for name in price_columns})
    column_types.update({name: int for name in count_columns})

    # parse_dates=True only asks pandas to parse the *index*; no index_col is
    # set, so 'Local_Date_Time' stays a string (its dtype is pinned to str
    # above) and is converted later with pd.to_datetime.
    return pd.read_csv(path,
                       usecols=list(column_types),
                       dtype=column_types,
                       skipinitialspace=True,
                       parse_dates=True)

def load_qqq(path='QQQ_OneYear100test_Sept21.csv'):
    """Read the intraday trade-bar CSV for QQQ.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the CSV file. Defaults to the file name the notebook
        has always used, so ``load_qqq()`` behaves as before.

    Returns
    -------
    pandas.DataFrame
        Only the columns listed below are kept. Timestamps and RICs are
        strings, prices are float64, trade count and volume are integers.
    """
    price_columns = ['Open', 'High', 'Low', 'Close', 'VWAP']
    count_columns = ['NumberOfTrades', 'Volume']

    column_types = {'Local_Date_Time': str, 'RIC': str}
    column_types.update({name: np.float64 for name in price_columns})
    column_types.update({name: int for name in count_columns})

    # parse_dates=True only asks pandas to parse the *index*; no index_col is
    # set, so 'Local_Date_Time' stays a string (its dtype is pinned to str
    # above) and is converted later with pd.to_datetime.
    return pd.read_csv(path,
                       usecols=list(column_types),
                       dtype=column_types,
                       skipinitialspace=True,
                       parse_dates=True)

def load_auction(path='NSQ_OneYear100closeA_Sept21.csv'):
    """Read the closing-auction CSV for the individual stocks.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the CSV file. Defaults to the file name the notebook
        has always used, so ``load_auction()`` behaves as before.

    Returns
    -------
    pandas.DataFrame
        All columns of the file, with dtypes inferred by pandas.
    """
    # parse_dates=True only targets the index; with no index_col it does not
    # convert any column, so timestamps are parsed later with pd.to_datetime.
    return pd.read_csv(path,
                       skipinitialspace=True,
                       parse_dates=True)

def load_auctionqqq(path='QQQ_OneYear100closeA_Sept21.csv'):
    """Read the closing-auction CSV for QQQ.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the CSV file. Defaults to the file name the notebook
        has always used, so ``load_auctionqqq()`` behaves as before.

    Returns
    -------
    pandas.DataFrame
        All columns of the file, with dtypes inferred by pandas.
    """
    # parse_dates=True only targets the index; with no index_col it does not
    # convert any column, so timestamps are parsed later with pd.to_datetime.
    return pd.read_csv(path,
                       skipinitialspace=True,
                       parse_dates=True)

In [3]:
# Trade data
def load_stockdata(df, qqq, stock_name):
    """Select one stock's trade bars and tag every bar with its calendar date.

    Parameters
    ----------
    df : pandas.DataFrame
        Trade bars for all stocks; must contain 'RIC' and 'Local_Date_Time'.
        It is not modified.
    qqq : pandas.DataFrame
        QQQ trade bars; must contain 'Local_Date_Time'. A 'Date' column is
        added to this frame IN PLACE (existing behaviour, kept so that any
        code relying on ``qqq['Date']`` afterwards keeps working).
    stock_name : str
        RIC of the stock to extract, e.g. "AAPL.O".

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The rows of ``df`` for ``stock_name`` (an independent copy with a
        'Date' column of ``datetime.date`` values) and the same ``qqq``
        object that was passed in.
    """
    # Take an explicit copy: adding a column to a boolean-mask slice triggers
    # pandas' SettingWithCopyWarning and leaves it unclear which frame is
    # being written to.
    df_RIC = df.loc[df['RIC'] == stock_name].copy()
    df_RIC['Date'] = pd.to_datetime(df_RIC['Local_Date_Time']).dt.date

    qqq['Date'] = pd.to_datetime(qqq['Local_Date_Time']).dt.date

    return df_RIC, qqq

In [4]:
# Auction price data
def load_auctiondata(auction, auction_qqq, stock_name):
    """Select one stock's closing-auction rows and tag them with a calendar date.

    Parameters
    ----------
    auction : pandas.DataFrame
        Auction rows for all stocks; must contain 'RIC' and
        'Local_Date_Time'. It is not modified.
    auction_qqq : pandas.DataFrame
        QQQ auction rows; must contain 'Local_Date_Time'. A 'Date' column is
        added to this frame IN PLACE (existing behaviour, kept so that any
        code relying on ``auction_qqq['Date']`` afterwards keeps working).
    stock_name : str
        RIC of the stock to extract, e.g. "AAPL.O".

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The rows of ``auction`` for ``stock_name`` (an independent copy with
        a 'Date' column of ``datetime.date`` values) and the same
        ``auction_qqq`` object that was passed in.
    """
    # Take an explicit copy: adding a column to a boolean-mask slice triggers
    # pandas' SettingWithCopyWarning and leaves it unclear which frame is
    # being written to.
    auction_RIC = auction.loc[auction['RIC'] == stock_name].copy()
    auction_RIC['Date'] = pd.to_datetime(auction_RIC['Local_Date_Time']).dt.date

    auction_qqq['Date'] = pd.to_datetime(auction_qqq['Local_Date_Time']).dt.date

    return auction_RIC, auction_qqq

In [5]:
# Data Processing (10 minute intervals)

def get_voldf(df_RIC, auction_RIC):
    """Build a per-day volatility table from intraday bars and auction prices.

    For every date in ``df_RIC`` the bars are resampled to 10-minute
    high/low/open/close bars and a Garman-Klass volatility is computed per
    bar. The last bar's volatility is recomputed with the closing-auction
    price in place of the bar's close. Per-bar volatilities are then summed
    into daily figures.

    A date is skipped when
      * its resampled grid does not contain exactly 39 ten-minute bars, or
      * ``auction_RIC`` has no row for it (the auction-adjusted last bar
        cannot be computed without a price).

    Parameters
    ----------
    df_RIC : pandas.DataFrame
        Bars of one stock with columns 'Date', 'Local_Date_Time', 'Open',
        'High', 'Low' and 'Close'.
    auction_RIC : pandas.DataFrame
        Auction rows of the same stock with columns 'Date' and 'Price'; the
        first row of each date is used.

    Returns
    -------
    pandas.DataFrame
        One row per retained date, in order of first appearance, with columns
        'Date', 'daily_volatility' (sum over all 39 bars),
        'daily_volatility_minus4pm' (sum over the first 38 bars) and
        'volatility_ewma20' (span-20 EWMA of 'daily_volatility' lagged by one
        row, so each row only uses earlier days; NaN on the first row).
    """
    bars_per_day = 39
    # Weight on the squared open-to-close log return in the Garman-Klass formula.
    gk_close_weight = 2 * np.log(2) - 1

    def _garman_klass(high, low, open_, close):
        # Garman and Klass volatility of a bar (works on scalars and Series).
        # NOTE(review): the term under the square root turns negative, and the
        # result NaN, if `close` lies far enough outside [low, high]; that can
        # only happen for the auction-adjusted bar - confirm this is acceptable.
        return np.sqrt(0.5 * np.square(np.log(high / low))
                       - gk_close_weight * np.square(np.log(close / open_)))

    dates_final = []
    daily_volatility = []
    daily_volatility_minus4pm = []

    for date, trades_today in df_RIC.groupby('Date', sort=False):
        ohlc = trades_today[['High', 'Low', 'Open', 'Close']].copy()
        ohlc.index = pd.DatetimeIndex(trades_today['Local_Date_Time'])
        bars = ohlc.resample("10min").agg(
            {'High': 'max', 'Low': 'min', 'Open': 'first', 'Close': 'last'})

        if len(bars) != bars_per_day:
            continue  # trading day does not span the full 39-bar grid

        auction_today = auction_RIC[auction_RIC['Date'] == date]
        if auction_today.empty:
            continue  # no auction price for this date
        auction_price = auction_today['Price'].iloc[0]

        volatility_today = _garman_klass(
            bars['High'], bars['Low'], bars['Open'], bars['Close']).to_numpy().tolist()

        # Change the 39th 10min interval to include the auction price (instead of close)
        last_bar = bars.iloc[-1]
        volatility_today[-1] = _garman_klass(
            last_bar['High'], last_bar['Low'], last_bar['Open'], auction_price)

        dates_final.append(date)
        daily_volatility.append(sum(volatility_today))
        daily_volatility_minus4pm.append(sum(volatility_today[:-1]))

    volatility_df = pd.DataFrame({
        'Date': dates_final,
        'daily_volatility': np.asarray(daily_volatility, dtype=float),
        'daily_volatility_minus4pm': np.asarray(daily_volatility_minus4pm, dtype=float),
    })
    # Lag by one row so the value attached to a day is built from prior days only.
    volatility_df['volatility_ewma20'] = (
        volatility_df['daily_volatility'].ewm(span=20).mean().shift(1))

    return volatility_df


In [6]:
def mod1B_results(vol_df, warmup=20, test_size=211):
    """Score model 1B: predict a day's auction volatility by a lagged EWMA.

    The auction volatility is 'daily_volatility' minus
    'daily_volatility_minus4pm'. The prediction for a row is the span-20
    EWMA of the auction volatility over all earlier rows.

    Two columns are added to ``vol_df`` IN PLACE: 'auction' and
    'volatility_ewma20_auction'. ``mod1C_results`` reads both.

    Parameters
    ----------
    vol_df : pandas.DataFrame
        Must contain 'daily_volatility' and 'daily_volatility_minus4pm'.
    warmup : int, optional
        Number of leading rows excluded from evaluation (default 20, the
        value previously hard-coded). Must be at least 1, because the first
        row has no prediction.
    test_size : int, optional
        Number of trailing rows (after the warm-up) used for scoring
        (default 211, the value previously hard-coded). Must be positive;
        if fewer rows are available, all of them are used.

    Returns
    -------
    (float, float, float)
        MAE, MSE and R^2 of the prediction against the actual auction
        volatility over the evaluation window.
    """
    volatility_df = vol_df
    volatility_df['auction'] = (
        volatility_df['daily_volatility'] - volatility_df['daily_volatility_minus4pm'])
    # shift(1) lags the EWMA by one row so a prediction never sees its own day.
    volatility_df['volatility_ewma20_auction'] = (
        volatility_df['auction'].ewm(span=20).mean().shift(1))

    actual = volatility_df['auction'].iloc[warmup:].iloc[-test_size:]
    predicted = volatility_df['volatility_ewma20_auction'].iloc[warmup:].iloc[-test_size:]

    MAE = mean_absolute_error(actual, predicted)
    MSE = mean_squared_error(actual, predicted)
    R2 = r2_score(actual, predicted)

    return MAE, MSE, R2

In [7]:
def mod1C_results(vol_df, warmup=20, test_size=211):
    """Score model 1C: the lagged auction EWMA scaled by an intraday ratio.

    The prediction for a row is 'volatility_ewma20_auction' multiplied by
    the ratio of that row's 'daily_volatility_minus4pm' to the previous
    row's 'daily_volatility_minus4pm'.

    Columns are added to ``vol_df`` IN PLACE:
    'daily_volatility_minus4pm_yest' and 'volatility_ewma20_auction_1c'.
    'auction' and 'volatility_ewma20_auction' are also added when they are
    not already present, so this function no longer requires
    ``mod1B_results`` to have been run on the same frame first.

    Parameters
    ----------
    vol_df : pandas.DataFrame
        Must contain 'daily_volatility' and 'daily_volatility_minus4pm'.
    warmup : int, optional
        Number of leading rows excluded from evaluation (default 20, the
        value previously hard-coded). Must be at least 1, because the first
        row has no prediction.
    test_size : int, optional
        Number of trailing rows (after the warm-up) used for scoring
        (default 211, the value previously hard-coded). Must be positive;
        if fewer rows are available, all of them are used.

    Returns
    -------
    (float, float, float)
        MAE, MSE and R^2 of the prediction against the actual auction
        volatility over the evaluation window.
    """
    volatility_df = vol_df

    # Prerequisite columns: reuse them when present, otherwise derive them
    # with the same definitions used for model 1B.
    if 'auction' not in volatility_df.columns:
        volatility_df['auction'] = (
            volatility_df['daily_volatility'] - volatility_df['daily_volatility_minus4pm'])
    if 'volatility_ewma20_auction' not in volatility_df.columns:
        volatility_df['volatility_ewma20_auction'] = (
            volatility_df['auction'].ewm(span=20).mean().shift(1))

    intraday_yesterday = volatility_df['daily_volatility_minus4pm'].shift(1)
    volatility_df['daily_volatility_minus4pm_yest'] = intraday_yesterday

    # The EWMA column may be object dtype with a leading None; coerce it to
    # float so the arithmetic below is plain numeric.
    ewma_auction = pd.to_numeric(volatility_df['volatility_ewma20_auction'])
    volatility_df['volatility_ewma20_auction_1c'] = (
        volatility_df['daily_volatility_minus4pm'] / intraday_yesterday * ewma_auction)

    actual = volatility_df['auction'].iloc[warmup:].iloc[-test_size:]
    predicted = volatility_df['volatility_ewma20_auction_1c'].iloc[warmup:].iloc[-test_size:]

    MAE = mean_absolute_error(actual, predicted)
    MSE = mean_squared_error(actual, predicted)
    R2 = r2_score(actual, predicted)

    return MAE, MSE, R2


In [8]:
def results(df, qqq, auction, auction_qqq, stock_name):
    """Compute and display the model 1B / 1C error metrics for one stock.

    Parameters
    ----------
    df, qqq : pandas.DataFrame
        Trade bars for the stocks and for QQQ, passed to ``load_stockdata``.
    auction, auction_qqq : pandas.DataFrame
        Auction rows for the stocks and for QQQ, passed to
        ``load_auctiondata``.
    stock_name : str
        RIC of the stock to evaluate; also used as the table caption.

    Returns
    -------
    None
        The styled metrics table is shown with ``display``.

    Notes
    -----
    Sets the global pandas option 'display.float_format' (existing
    behaviour, kept for other cells that display plain DataFrames).
    """
    df_RIC, qqq = load_stockdata(df, qqq, stock_name)
    auction_RIC, auction_qqq = load_auctiondata(auction, auction_qqq, stock_name)
    vol_df = get_voldf(df_RIC, auction_RIC)

    MAE_1b, MSE_1b, R2_1b = mod1B_results(vol_df)
    MAE_1c, MSE_1c, R2_1c = mod1C_results(vol_df)

    data = {
        "Metrics": ["EWMA Auction Vol (1B)", "EWMA Auction Vol/ratio (1C)"],
        "MAE": [MAE_1b, MAE_1c],
        "MSE": [MSE_1b, MSE_1c],
        "R^2": [R2_1b, R2_1c]
    }

    def _format_metric(x):
        # Fixed-point for ordinary magnitudes, scientific notation for tiny ones.
        return '{:,.4f}'.format(x) if abs(x) > 0.01 else '{:,.3e}'.format(x)

    pd.set_option('display.float_format', _format_metric)

    table = pd.DataFrame(data, index=None)
    # A Styler does not use 'display.float_format', so the formatter has to be
    # attached to the Styler itself for it to affect the rendered table.
    table = (table.style
             .format(_format_metric, subset=["MAE", "MSE", "R^2"])
             .set_caption(stock_name))
    display(table)

# Main function

In [9]:
# Load the four CSV extracts once (stock trades, QQQ trades, stock auctions,
# QQQ auctions); the per-stock loop below reuses these frames for every RIC.
df = load_df()
qqq = load_qqq()
auction = load_auction()
auction_qqq = load_auctionqqq()

In [10]:
# RICs of the stocks to evaluate; edit this list to change the selection.
stock_list = ["MSFT.O", "AAPL.O", "GOOGL.O", "MRNA.O", "INTC.O", "PEP.O",
              "ZM.O", "EBAY.O", "PTON.O"]

# Display one metrics table (models 1B and 1C) per stock, in list order.
for stock_name in stock_list:
    results(df, qqq, auction, auction_qqq, stock_name)

Unnamed: 0,Metrics,MAE,MSE,R^2
0,EWMA Auction Vol (1B),0.000843,1e-06,0.211904
1,EWMA Auction Vol/ratio (1C),0.0009,1e-06,0.130029


Unnamed: 0,Metrics,MAE,MSE,R^2
0,EWMA Auction Vol (1B),0.000967,2e-06,0.132531
1,EWMA Auction Vol/ratio (1C),0.001046,3e-06,0.081076


Unnamed: 0,Metrics,MAE,MSE,R^2
0,EWMA Auction Vol (1B),0.000798,1e-06,0.229674
1,EWMA Auction Vol/ratio (1C),0.000903,1e-06,0.045543


Unnamed: 0,Metrics,MAE,MSE,R^2
0,EWMA Auction Vol (1B),0.002012,1e-05,0.005607
1,EWMA Auction Vol/ratio (1C),0.002058,9e-06,0.026679


Unnamed: 0,Metrics,MAE,MSE,R^2
0,EWMA Auction Vol (1B),0.001049,4e-06,0.016919
1,EWMA Auction Vol/ratio (1C),0.001098,3e-06,0.181073


Unnamed: 0,Metrics,MAE,MSE,R^2
0,EWMA Auction Vol (1B),0.000646,1e-06,0.131999
1,EWMA Auction Vol/ratio (1C),0.000654,1e-06,0.169625


Unnamed: 0,Metrics,MAE,MSE,R^2
0,EWMA Auction Vol (1B),0.001153,3e-06,0.21253
1,EWMA Auction Vol/ratio (1C),0.001246,3e-06,0.149923


Unnamed: 0,Metrics,MAE,MSE,R^2
0,EWMA Auction Vol (1B),0.000888,2e-06,0.051935
1,EWMA Auction Vol/ratio (1C),0.000998,2e-06,-0.078931


Unnamed: 0,Metrics,MAE,MSE,R^2
0,EWMA Auction Vol (1B),0.001186,3e-06,0.145011
1,EWMA Auction Vol/ratio (1C),0.001283,3e-06,0.108816
