In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from numpy import log, square, sqrt, power, arange, ones, zeros, isscalar,\
    array, outer, pi, sin, cos, expand_dims, repeat, full, concatenate, ravel
from numpy.random import normal
from scipy.optimize import least_squares, minimize
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
# functions for loading the datasets
def load_stock(path='NSQ_OneYear100test_Sept21.csv'):
    """Read the per-stock trade-bar CSV into a DataFrame.

    Parameters
    ----------
    path : str or path-like, optional
        CSV file to read.  Defaults to the file name this notebook has always
        used, resolved against the current working directory.

    Returns
    -------
    pandas.DataFrame
        Only the columns ``Local_Date_Time``, ``RIC``, ``Open``, ``High``,
        ``Low``, ``Close``, ``VWAP``, ``NumberOfTrades`` and ``Volume`` are
        kept.  ``Local_Date_Time`` and ``RIC`` are read as strings, the price
        columns as float64 and the two count columns as integers.
    """
    # Single source of truth for both the columns to keep and their dtypes.
    column_types = {
        'Local_Date_Time': str,
        'RIC': str,
        'Open': np.float64,
        'High': np.float64,
        'Low': np.float64,
        'Close': np.float64,
        'VWAP': np.float64,
        'NumberOfTrades': int,
        'Volume': int,
    }
    # parse_dates=True only asks pandas to try parsing the *index*; no
    # index_col is given, so Local_Date_Time stays a string here and is
    # converted to datetimes by the downstream helpers.
    return pd.read_csv(
        path,
        usecols=list(column_types),
        dtype=column_types,
        skipinitialspace=True,
        parse_dates=True,
    )

def load_stockqqq(path='QQQ_OneYear100test_Sept21.csv'):
    """Read the QQQ trade-bar CSV into a DataFrame.

    Parameters
    ----------
    path : str or path-like, optional
        CSV file to read.  Defaults to the file name this notebook has always
        used, resolved against the current working directory.

    Returns
    -------
    pandas.DataFrame
        Only the columns ``Local_Date_Time``, ``RIC``, ``Open``, ``High``,
        ``Low``, ``Close``, ``VWAP``, ``NumberOfTrades`` and ``Volume`` are
        kept.  ``Local_Date_Time`` and ``RIC`` are read as strings, the price
        columns as float64 and the two count columns as integers.
    """
    # Single source of truth for both the columns to keep and their dtypes.
    column_types = {
        'Local_Date_Time': str,
        'RIC': str,
        'Open': np.float64,
        'High': np.float64,
        'Low': np.float64,
        'Close': np.float64,
        'VWAP': np.float64,
        'NumberOfTrades': int,
        'Volume': int,
    }
    # parse_dates=True only asks pandas to try parsing the *index*; no
    # index_col is given, so Local_Date_Time stays a string here.
    return pd.read_csv(
        path,
        usecols=list(column_types),
        dtype=column_types,
        skipinitialspace=True,
        parse_dates=True,
    )

def load_auction(path='NSQ_OneYear100closeA_Sept21.csv'):
    """Read the per-stock auction CSV into a DataFrame.

    Parameters
    ----------
    path : str or path-like, optional
        CSV file to read.  Defaults to the file name this notebook has always
        used, resolved against the current working directory.

    Returns
    -------
    pandas.DataFrame
        Every column of the file, with dtypes inferred by pandas.  The code
        that consumes this frame reads its ``RIC``, ``Local_Date_Time`` and
        ``Price`` columns.
    """
    # parse_dates=True only asks pandas to try parsing the *index*; no
    # index_col is given, so timestamp columns are left as read.
    return pd.read_csv(path, skipinitialspace=True, parse_dates=True)

def load_auctionqqq(path='QQQ_OneYear100closeA_Sept21.csv'):
    """Read the QQQ auction CSV into a DataFrame.

    Parameters
    ----------
    path : str or path-like, optional
        CSV file to read.  Defaults to the file name this notebook has always
        used, resolved against the current working directory.

    Returns
    -------
    pandas.DataFrame
        Every column of the file, with dtypes inferred by pandas.
    """
    # parse_dates=True only asks pandas to try parsing the *index*; no
    # index_col is given, so timestamp columns are left as read.
    return pd.read_csv(path, skipinitialspace=True, parse_dates=True)

In [3]:
# Trade data
def load_stockdata(df_stock, df_stockqqq, stock_name):
    """Select one stock's trade rows and tag both trade frames with a date.

    Parameters
    ----------
    df_stock : pandas.DataFrame
        Trade rows for many stocks; needs ``RIC`` and ``Local_Date_Time``.
    df_stockqqq : pandas.DataFrame
        QQQ trade rows; needs ``Local_Date_Time``.
    stock_name : str
        Value of ``RIC`` to keep from ``df_stock``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_stockRIC, df_stockqqq)``: the rows of ``df_stock`` whose ``RIC``
        equals ``stock_name``, and the QQQ rows, each with an added ``Date``
        column holding ``datetime.date`` values parsed from
        ``Local_Date_Time``.  Both are new frames; the inputs are not
        modified, so the function can be called repeatedly on the same
        shared frames.
    """
    # Explicit copy: adding a column to a boolean-filtered slice is chained
    # assignment and triggers pandas' SettingWithCopyWarning.
    df_stockRIC = df_stock.loc[df_stock['RIC'] == stock_name].copy()
    df_stockRIC['Date'] = pd.to_datetime(df_stockRIC['Local_Date_Time']).dt.date

    # Copy before adding the column so the caller's QQQ frame stays untouched.
    df_stockqqq = df_stockqqq.copy()
    df_stockqqq['Date'] = pd.to_datetime(df_stockqqq['Local_Date_Time']).dt.date

    return df_stockRIC, df_stockqqq

In [4]:
# Auction price data
def load_auctiondata(df_auction, df_auctionqqq, stock_name):
    """Select one stock's auction rows and tag both auction frames with a date.

    Parameters
    ----------
    df_auction : pandas.DataFrame
        Auction rows for many stocks; needs ``RIC`` and ``Local_Date_Time``.
    df_auctionqqq : pandas.DataFrame
        QQQ auction rows; needs ``Local_Date_Time``.
    stock_name : str
        Value of ``RIC`` to keep from ``df_auction``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_auctionRIC, df_auctionqqq)``: the rows of ``df_auction`` whose
        ``RIC`` equals ``stock_name``, and the QQQ rows, each with an added
        ``Date`` column holding ``datetime.date`` values parsed from
        ``Local_Date_Time``.  Both are new frames; the inputs are not
        modified, so the function can be called repeatedly on the same
        shared frames.
    """
    # Explicit copy: adding a column to a boolean-filtered slice is chained
    # assignment and triggers pandas' SettingWithCopyWarning.
    df_auctionRIC = df_auction.loc[df_auction['RIC'] == stock_name].copy()
    df_auctionRIC['Date'] = pd.to_datetime(df_auctionRIC['Local_Date_Time']).dt.date

    # Copy before adding the column so the caller's QQQ frame stays untouched.
    df_auctionqqq = df_auctionqqq.copy()
    df_auctionqqq['Date'] = pd.to_datetime(df_auctionqqq['Local_Date_Time']).dt.date

    return df_auctionRIC, df_auctionqqq

In [5]:
# Data Processing (10 minute intervals)

def get_voldf(df_stockRIC, df_auctionRIC):
    """Build a per-day volatility table for one stock from its intraday bars.

    For every date in ``df_stockRIC`` the rows are resampled into 10-minute
    OHLC bars and a Garman-Klass volatility is computed for each bar.  For
    the last bar of the day the close is replaced by that day's auction
    price.  A date is skipped when its rows do not resample to exactly 39
    bars, or when ``df_auctionRIC`` has no row for it.

    Parameters
    ----------
    df_stockRIC : pandas.DataFrame
        Intraday rows for one stock; needs the columns ``Date``,
        ``Local_Date_Time``, ``Open``, ``High``, ``Low`` and ``Close``.
    df_auctionRIC : pandas.DataFrame
        Auction rows for the same stock; needs ``Date`` and ``Price``.  When
        a date has several rows the first one is used.

    Returns
    -------
    pandas.DataFrame
        One row per retained date, in order of first appearance, with columns

        * ``Date``
        * ``daily_volatility`` - sum of the 39 bar volatilities, the last bar
          using the auction price as its close
        * ``daily_volatility_minus4pm`` - sum of the first 38 bar volatilities
        * ``volatility_ewma20`` - span-20 EWMA of ``daily_volatility`` lagged
          by one row, so each row only sees earlier days (NaN on the first
          row)
    """
    bars_per_day = 39              # 10-minute bars in a complete session
    gk_weight = 2 * np.log(2) - 1  # Garman-Klass weight on the open/close term

    dates_final = []
    daily_volatility = []
    daily_volatility_minus4pm = []

    for date in df_stockRIC['Date'].unique():
        stock_today = df_stockRIC[df_stockRIC['Date'] == date]
        auction_today = df_auctionRIC[df_auctionRIC['Date'] == date]
        if stock_today.empty:
            continue

        stock_today = stock_today.set_index(pd.DatetimeIndex(stock_today['Local_Date_Time']))
        # "10min" is the supported spelling of the deprecated "10T" alias.
        bars = pd.DataFrame({
            'High': stock_today['High'].resample('10min').max(),
            'Low': stock_today['Low'].resample('10min').min(),
            'Open': stock_today['Open'].resample('10min').first(),
            'Close': stock_today['Close'].resample('10min').last(),
        })

        # Keep only sessions that resample to exactly the expected bar count.
        if len(bars) != bars_per_day:
            continue
        # Without an auction print the last bar cannot be re-based; skip the
        # day instead of raising IndexError on .iloc[0].
        if auction_today.empty:
            continue

        # Garman and Klass volatility per bar.  The expression under the
        # square root can be negative, in which case numpy yields NaN.
        bar_vol = np.sqrt(
            0.5 * np.square(np.log(bars['High'] / bars['Low']))
            - gk_weight * np.square(np.log(bars['Close'] / bars['Open']))
        )
        vol_today = list(bar_vol.values)

        # Re-compute the last bar with the auction price in place of the close.
        # .iloc is required: the bars are labelled by timestamp, so a bare
        # [-1] is not a valid label lookup.
        last_bar = bars.iloc[-1]
        auction_price = auction_today['Price'].iloc[0]
        vol_today[-1] = np.sqrt(
            0.5 * np.square(np.log(last_bar['High'] / last_bar['Low']))
            - gk_weight * np.square(np.log(auction_price / last_bar['Open']))
        )

        dates_final.append(date)
        daily_volatility.append(sum(vol_today))
        daily_volatility_minus4pm.append(sum(vol_today[:-1]))

    vol_df = pd.DataFrame({
        'Date': dates_final,
        'daily_volatility': np.asarray(daily_volatility, dtype=float),
        'daily_volatility_minus4pm': np.asarray(daily_volatility_minus4pm, dtype=float),
    })
    # Lag by one row so the value on day t is built from days before t only.
    vol_df['volatility_ewma20'] = vol_df['daily_volatility'].ewm(span=20).mean().shift(1)

    return vol_df


In [6]:
def mod1B_results(vol_df, warmup=20, eval_window=211):
    """Score model 1B: yesterday's EWMA of auction volatility as the forecast.

    The realised auction volatility of a day is taken as
    ``daily_volatility - daily_volatility_minus4pm``.  The forecast for a day
    is the span-20 EWMA of that series over the preceding days.

    Side effect: two columns are added to (or overwritten in) ``vol_df`` in
    place, ``auction`` and ``volatility_ewma20_auction``.

    Parameters
    ----------
    vol_df : pandas.DataFrame
        Needs the columns ``daily_volatility`` and
        ``daily_volatility_minus4pm``.  It must have more than ``warmup``
        rows, otherwise the metric functions receive empty input.
    warmup : int, optional
        Number of leading rows dropped before scoring.
    eval_window : int, optional
        Number of trailing rows (after the warm-up) that are scored; if fewer
        remain, all of them are used.  Must be positive.

    Returns
    -------
    tuple of float
        ``(MAE, MSE, R2)`` between the realised and forecast auction
        volatility over the evaluation window.
    """
    vol_df['auction'] = vol_df['daily_volatility'] - vol_df['daily_volatility_minus4pm']
    # Lag by one row so the forecast for day t uses days before t only.
    vol_df['volatility_ewma20_auction'] = vol_df['auction'].ewm(span=20).mean().shift(1)

    actual = vol_df['auction'].iloc[warmup:].iloc[-eval_window:]
    predicted = vol_df['volatility_ewma20_auction'].iloc[warmup:].iloc[-eval_window:]

    MAE = mean_absolute_error(actual, predicted)
    MSE = mean_squared_error(actual, predicted)
    R2 = r2_score(actual, predicted)

    return MAE, MSE, R2

In [7]:
def mod1C_results(vol_df, warmup=20, eval_window=211):
    """Score model 1C: the 1B EWMA forecast scaled by today's intraday ratio.

    The forecast for a day is ``volatility_ewma20_auction`` multiplied by
    ``daily_volatility_minus4pm`` of that day divided by the same quantity
    one row earlier.

    Side effect: the columns ``daily_volatility_minus4pm_yest`` and
    ``volatility_ewma20_auction_1c`` are added to (or overwritten in)
    ``vol_df`` in place.  ``auction`` and ``volatility_ewma20_auction`` are
    added too when they are not already present.

    Parameters
    ----------
    vol_df : pandas.DataFrame
        Needs ``daily_volatility`` and ``daily_volatility_minus4pm``.  The
        columns ``auction`` and ``volatility_ewma20_auction`` are reused when
        present.  It must have more than ``warmup`` rows, otherwise the
        metric functions receive empty input.
    warmup : int, optional
        Number of leading rows dropped before scoring.
    eval_window : int, optional
        Number of trailing rows (after the warm-up) that are scored; if fewer
        remain, all of them are used.  Must be positive.

    Returns
    -------
    tuple of float
        ``(MAE, MSE, R2)`` between the realised and forecast auction
        volatility over the evaluation window.
    """
    # Derive the two input columns when the caller has not prepared them, so
    # this function does not depend on another scorer having run first.
    if 'auction' not in vol_df.columns:
        vol_df['auction'] = vol_df['daily_volatility'] - vol_df['daily_volatility_minus4pm']
    if 'volatility_ewma20_auction' not in vol_df.columns:
        vol_df['volatility_ewma20_auction'] = vol_df['auction'].ewm(span=20).mean().shift(1)

    vol_df['daily_volatility_minus4pm_yest'] = vol_df['daily_volatility_minus4pm'].shift(1)
    intraday_ratio = vol_df['daily_volatility_minus4pm'] / vol_df['daily_volatility_minus4pm_yest']
    # The EWMA column may arrive as object dtype with a leading None; coerce
    # it so the product is ordinary float arithmetic.
    ewma_auction = pd.to_numeric(vol_df['volatility_ewma20_auction'])
    vol_df['volatility_ewma20_auction_1c'] = intraday_ratio * ewma_auction

    actual = vol_df['auction'].iloc[warmup:].iloc[-eval_window:]
    predicted = vol_df['volatility_ewma20_auction_1c'].iloc[warmup:].iloc[-eval_window:]

    MAE = mean_absolute_error(actual, predicted)
    MSE = mean_squared_error(actual, predicted)
    R2 = r2_score(actual, predicted)

    return MAE, MSE, R2


In [11]:
def results(df_stock, df_stockqqq, df_auction, df_auctionqqq, stock_name):
    """Evaluate both auction-volatility baselines for one ticker and show them.

    The trade and auction frames are narrowed to ``stock_name``, the daily
    volatility table is built, models 1B and 1C are scored on it, and a
    two-row table of MAE / MSE / R^2 is rendered with ``display``.

    Side effect: pandas' global ``display.float_format`` option is set.
    Nothing is returned.
    """
    # Narrow the inputs to the requested ticker and build its volatility table.
    trades_ric, df_stockqqq = load_stockdata(df_stock, df_stockqqq, stock_name)
    auctions_ric, df_auctionqqq = load_auctiondata(df_auction, df_auctionqqq, stock_name)
    vol_df = get_voldf(trades_ric, auctions_ric)

    # 1B must run before 1C on the same frame.
    mae_b, mse_b, r2_b = mod1B_results(vol_df)
    mae_c, mse_c, r2_c = mod1C_results(vol_df)

    def _format_float(x):
        # Fixed-point for values above 0.01 in magnitude, scientific otherwise.
        if abs(x) > 0.01:
            return '{:,.4f}'.format(x)
        return '{:,.3e}'.format(x)

    pd.set_option('display.float_format', _format_float)

    summary = pd.DataFrame({
        "Metrics": ["EWMA Auction Vol (1B)", "EWMA Auction Vol/ratio (1C)"],
        "MAE": [mae_b, mae_c],
        "MSE": [mse_b, mse_c],
        "R^2": [r2_b, r2_c],
    })
    display(summary)

# Main script: load the datasets once, then evaluate each ticker of interest

In [9]:
# Load the four datasets once, up front, using each loader's default file.
# The resulting notebook-level frames are passed unchanged to every
# results(...) call in the driver loop.
# Trade rows: individual stocks, then QQQ.
df_stock = load_stock()
df_stockqqq = load_stockqqq()
# Auction rows: individual stocks, then QQQ.
df_auction = load_auction()
df_auctionqqq = load_auctionqqq()

In [12]:
# Tickers (RIC codes) to evaluate.
stock_list = [
    "MSFT.O",
    "AAPL.O",
    "GOOGL.O",
    "MRNA.O",
    "INTC.O",
    "PEP.O",
    "ZM.O",
    "EBAY.O",
    "PTON.O",
]

# Print each ticker, then render its metrics table.
for stock_name in stock_list:
    print(stock_name)
    results(df_stock, df_stockqqq, df_auction, df_auctionqqq, stock_name)

MSFT.O


Unnamed: 0,Metrics,MAE,MSE,R^2
0,EWMA Auction Vol (1B),0.0008432,1.271e-06,0.2119
1,EWMA Auction Vol/ratio (1C),0.0008997,1.403e-06,0.13


AAPL.O


Unnamed: 0,Metrics,MAE,MSE,R^2
0,EWMA Auction Vol (1B),0.0009671,2.497e-06,0.1325
1,EWMA Auction Vol/ratio (1C),0.001046,2.645e-06,0.0811


GOOGL.O


Unnamed: 0,Metrics,MAE,MSE,R^2
0,EWMA Auction Vol (1B),0.000798,1.18e-06,0.2297
1,EWMA Auction Vol/ratio (1C),0.0009025,1.462e-06,0.0455


MRNA.O


Unnamed: 0,Metrics,MAE,MSE,R^2
0,EWMA Auction Vol (1B),0.002012,9.559e-06,0.005607
1,EWMA Auction Vol/ratio (1C),0.002058,9.356e-06,0.0267


INTC.O


Unnamed: 0,Metrics,MAE,MSE,R^2
0,EWMA Auction Vol (1B),0.001049,3.722e-06,0.0169
1,EWMA Auction Vol/ratio (1C),0.001098,3.101e-06,0.1811


PEP.O


Unnamed: 0,Metrics,MAE,MSE,R^2
0,EWMA Auction Vol (1B),0.000646,9.931e-07,0.132
1,EWMA Auction Vol/ratio (1C),0.0006536,9.5e-07,0.1696


ZM.O


Unnamed: 0,Metrics,MAE,MSE,R^2
0,EWMA Auction Vol (1B),0.001153,3.07e-06,0.2125
1,EWMA Auction Vol/ratio (1C),0.001246,3.314e-06,0.1499


EBAY.O


Unnamed: 0,Metrics,MAE,MSE,R^2
0,EWMA Auction Vol (1B),0.0008884,1.948e-06,0.0519
1,EWMA Auction Vol/ratio (1C),0.0009985,2.217e-06,-0.0789


PTON.O


Unnamed: 0,Metrics,MAE,MSE,R^2
0,EWMA Auction Vol (1B),0.001186,2.744e-06,0.145
1,EWMA Auction Vol/ratio (1C),0.001283,2.861e-06,0.1088
