In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from numpy import log, square, sqrt, power, arange, ones, zeros, isscalar,\
    array, outer, pi, sin, cos, expand_dims, repeat, full, concatenate, ravel
from numpy.random import normal
from scipy.optimize import least_squares, minimize
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from Module.baseModule.bayesFlexibleFourier import *
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [10]:
# functions for loading the datasets
def load_stock(path='NSQ_OneYear100test_Sept21.csv'):
    """Read the per-stock intraday trade bars from a CSV file.

    Parameters
    ----------
    path : str or path-like, optional
        CSV file to read. Defaults to the file name that used to be
        hard-coded, so ``load_stock()`` behaves exactly as before.

    Returns
    -------
    pandas.DataFrame
        Only the columns ``Local_Date_Time``, ``RIC``, ``Open``, ``High``,
        ``Low``, ``Close``, ``VWAP``, ``NumberOfTrades`` and ``Volume``.
        Timestamps and tickers are read as strings, prices as float64 and
        the two count columns as integers.
    """
    # One mapping drives both `usecols` and `dtype`, so the two cannot drift apart.
    column_types = {'Local_Date_Time': str, 'RIC': str}
    for price_column in ('Open', 'High', 'Low', 'Close', 'VWAP'):
        column_types[price_column] = np.float64
    for count_column in ('NumberOfTrades', 'Volume'):
        column_types[count_column] = int

    # parse_dates=True is kept from the original call. No index_col is given and
    # Local_Date_Time is pinned to str, so timestamps stay strings here and are
    # converted later by the callers.
    return pd.read_csv(path,
                       usecols=list(column_types),
                       dtype=column_types,
                       skipinitialspace=True,
                       parse_dates=True)

def load_stockqqq(path='QQQ_OneYear100test_Sept21.csv'):
    """Read the QQQ intraday trade bars from a CSV file.

    Parameters
    ----------
    path : str or path-like, optional
        CSV file to read. Defaults to the file name that used to be
        hard-coded, so ``load_stockqqq()`` behaves exactly as before.

    Returns
    -------
    pandas.DataFrame
        Only the columns ``Local_Date_Time``, ``RIC``, ``Open``, ``High``,
        ``Low``, ``Close``, ``VWAP``, ``NumberOfTrades`` and ``Volume``.
        Timestamps and tickers are read as strings, prices as float64 and
        the two count columns as integers.
    """
    # One mapping drives both `usecols` and `dtype`, so the two cannot drift apart.
    column_types = {'Local_Date_Time': str, 'RIC': str}
    for price_column in ('Open', 'High', 'Low', 'Close', 'VWAP'):
        column_types[price_column] = np.float64
    for count_column in ('NumberOfTrades', 'Volume'):
        column_types[count_column] = int

    # parse_dates=True is kept from the original call; Local_Date_Time is pinned
    # to str, so timestamps stay strings here.
    return pd.read_csv(path,
                       usecols=list(column_types),
                       dtype=column_types,
                       skipinitialspace=True,
                       parse_dates=True)

def load_auction(path='NSQ_OneYear100closeA_Sept21.csv'):
    """Read the per-stock closing-auction records from a CSV file.

    Parameters
    ----------
    path : str or path-like, optional
        CSV file to read. Defaults to the file name that used to be
        hard-coded, so ``load_auction()`` behaves exactly as before.

    Returns
    -------
    pandas.DataFrame
        Every column of the file, with dtypes inferred by pandas. Other
        functions in this notebook read the ``RIC``, ``Local_Date_Time`` and
        ``Price`` columns from the result.
    """
    # parse_dates=True is kept from the original call; no index_col is given.
    return pd.read_csv(path, skipinitialspace=True, parse_dates=True)

def load_auctionqqq(path='QQQ_OneYear100closeA_Sept21.csv'):
    """Read the QQQ closing-auction records from a CSV file.

    Parameters
    ----------
    path : str or path-like, optional
        CSV file to read. Defaults to the file name that used to be
        hard-coded, so ``load_auctionqqq()`` behaves exactly as before.

    Returns
    -------
    pandas.DataFrame
        Every column of the file, with dtypes inferred by pandas.
    """
    # parse_dates=True is kept from the original call; no index_col is given.
    return pd.read_csv(path, skipinitialspace=True, parse_dates=True)

In [11]:
# function for loading the trade data
def load_stockdata(df_stock, df_stockqqq, stock_name):
    """Select one stock's trade rows and tag both trade frames with a calendar date.

    Parameters
    ----------
    df_stock : pandas.DataFrame
        Trade bars for many stocks; must have ``RIC`` and ``Local_Date_Time``.
    df_stockqqq : pandas.DataFrame
        QQQ trade bars; must have ``Local_Date_Time``.
    stock_name : str
        Value of ``RIC`` to keep from ``df_stock``.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The rows of ``df_stock`` for ``stock_name`` and the QQQ rows, each with
        an added ``Date`` column of ``datetime.date`` values parsed from
        ``Local_Date_Time``. Both are new frames; neither input is modified.
    """
    # Copy the filtered rows: adding a column to a bare boolean-mask slice is
    # chained assignment (SettingWithCopyWarning, silenced by the notebook-wide
    # warnings filter).
    df_stockRIC = df_stock[df_stock['RIC'] == stock_name].copy()
    df_stockRIC['Date'] = pd.to_datetime(df_stockRIC['Local_Date_Time']).dt.date

    # .assign returns a new frame, so the caller's QQQ frame is not mutated by
    # every call made from the per-stock loop.
    df_stockqqq = df_stockqqq.assign(
        Date=pd.to_datetime(df_stockqqq['Local_Date_Time']).dt.date)

    return df_stockRIC, df_stockqqq

In [12]:
# function for loading the auction price data
def load_auctiondata(df_auction, df_auctionqqq, stock_name):
    """Select one stock's auction rows and tag both auction frames with a calendar date.

    Parameters
    ----------
    df_auction : pandas.DataFrame
        Auction records for many stocks; must have ``RIC`` and ``Local_Date_Time``.
    df_auctionqqq : pandas.DataFrame
        QQQ auction records; must have ``Local_Date_Time``.
    stock_name : str
        Value of ``RIC`` to keep from ``df_auction``.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The rows of ``df_auction`` for ``stock_name`` and the QQQ rows, each
        with an added ``Date`` column of ``datetime.date`` values parsed from
        ``Local_Date_Time``. Both are new frames; neither input is modified.
    """
    # Copy the filtered rows so the column assignment below is not chained
    # assignment on a slice of the caller's frame.
    df_auctionRIC = df_auction[df_auction['RIC'] == stock_name].copy()
    df_auctionRIC['Date'] = pd.to_datetime(df_auctionRIC['Local_Date_Time']).dt.date

    # .assign returns a new frame instead of writing into the caller's QQQ frame.
    df_auctionqqq = df_auctionqqq.assign(
        Date=pd.to_datetime(df_auctionqqq['Local_Date_Time']).dt.date)

    return df_auctionRIC, df_auctionqqq

In [13]:
# data processing
def get_voldf(df_stockRIC, df_auctionRIC):
    """Build the per-day volatility table for one stock.

    For every date in ``df_stockRIC`` the rows are resampled into 10-minute
    bars (max High, min Low, first Open, last Close) and a Garman-Klass
    volatility is computed per bar. Days that do not resample to exactly 39
    bars are skipped. On the days that are kept, the last bar's volatility is
    recomputed with the auction price in place of the bar's Close.

    Parameters
    ----------
    df_stockRIC : pandas.DataFrame
        Trade rows for one stock with columns ``Date``, ``Local_Date_Time``,
        ``High``, ``Low``, ``Open`` and ``Close``.
    df_auctionRIC : pandas.DataFrame
        Auction rows for the same stock with columns ``Date`` and ``Price``.
        The first row of each date is used.

    Returns
    -------
    pandas.DataFrame
        One row per retained day with columns ``Date``, ``daily_volatility``
        (sum of the 39 bar volatilities), ``daily_volatility_minus4pm`` (sum of
        the first 38) and ``volatility_ewma20`` (span-20 EWMA of
        ``daily_volatility`` lagged by one day; the first entry is ``None``).

    Raises
    ------
    IndexError
        If a retained day has no row in ``df_auctionRIC``.

    Notes
    -----
    Earlier versions also stacked the per-day vectors after dropping the first
    20 days, which raised ``ValueError`` for inputs with 20 or fewer complete
    days even though nothing derived from that stack was returned. That dead
    code has been removed.
    """
    # Garman-Klass weight on the squared open-to-close log return.
    gk_weight = 2*np.log(2) - 1

    dates_final = []
    daily_volatility = []
    daily_volatility_minus4pm = []

    for date in df_stockRIC['Date'].unique():
        stock_today = df_stockRIC[df_stockRIC['Date'] == date]
        auction_today = df_auctionRIC[df_auctionRIC['Date'] == date]

        # "10min" is the non-deprecated spelling of the old "10T" alias.
        stock_today = stock_today.set_index(pd.DatetimeIndex(stock_today['Local_Date_Time']))
        bars = stock_today.resample("10min").agg(
            {'High': 'max', 'Low': 'min', 'Open': 'first', 'Close': 'last'})

        # Skip days that do not span exactly 39 ten-minute bins.
        if len(bars) != 39:
            continue

        # Garman and Klass volatility formula, one value per bar.
        vol_today = np.sqrt(
            0.5*np.square(np.log(bars['High']/bars['Low']))
            - gk_weight*np.square(np.log(bars['Close']/bars['Open']))).tolist()

        # Dates for which the trade data is complete.
        dates_final.append(date)

        # Change the 39th interval to use the auction price instead of Close.
        # .iloc makes the "last bar" lookup explicitly positional; plain [-1] on
        # a DatetimeIndex-indexed Series relies on a deprecated fallback.
        # NOTE(review): High/Low of the last bar are not widened to include the
        # auction price, so the radicand can go negative (NaN) - confirm intent.
        last_bar = bars.iloc[-1]
        auction_price = auction_today['Price'].iloc[0]
        vol_today[-1] = np.sqrt(
            0.5*np.square(np.log(last_bar['High']/last_bar['Low']))
            - gk_weight*np.square(np.log(auction_price/last_bar['Open'])))

        # Daily totals (the first one feeds the 20-day EWMA below).
        daily_volatility.append(sum(vol_today))
        daily_volatility_minus4pm.append(sum(vol_today[:-1]))

    vol_df = pd.DataFrame({'Date': dates_final,
                           'daily_volatility': daily_volatility,
                           'daily_volatility_minus4pm': daily_volatility_minus4pm})

    # Lag the EWMA by one day so each row only sees information from earlier days.
    ewma = vol_df['daily_volatility'].ewm(span=20).mean()
    vol_df['volatility_ewma20'] = np.array([None] + list(ewma)[:-1])

    return vol_df


In [14]:
########### FOURIER MOVING WINDOW (20Days) ###########
def process_data_moving_window(df_stockRIC, df_auctionRIC):
    """Build the interval-by-day volatility matrix and its lagged 20-day EWMA.

    For every date in ``df_stockRIC`` the rows are resampled into 10-minute
    bars (max High, min Low, first Open, last Close) and a Garman-Klass
    volatility is computed per bar. Days that do not resample to exactly 39
    bars are skipped. On retained days the last bar's volatility is recomputed
    with the auction price in place of the bar's Close. The first 20 retained
    days are then dropped from both outputs.

    Parameters
    ----------
    df_stockRIC : pandas.DataFrame
        Trade rows for one stock with columns ``Date``, ``Local_Date_Time``,
        ``High``, ``Low``, ``Open`` and ``Close``.
    df_auctionRIC : pandas.DataFrame
        Auction rows for the same stock with columns ``Date`` and ``Price``.
        The first row of each date is used.

    Returns
    -------
    stocks : numpy.ndarray
        Shape ``(39, n_days - 20)``; column ``d`` holds the 39 bar
        volatilities of retained day ``d + 20``.
    avg_20day_volatility : numpy.ndarray
        Object array of length ``n_days - 20``: the span-20 EWMA of the daily
        volatility sum, lagged by one day, for the same days.

    Raises
    ------
    ValueError
        If no day has 39 complete bars.
    IndexError
        If a retained day has no row in ``df_auctionRIC``.
    """
    # Garman-Klass weight on the squared open-to-close log return.
    gk_weight = 2*np.log(2) - 1

    stocks = []
    daily_volatility = []

    for date in df_stockRIC['Date'].unique():
        stock_today = df_stockRIC[df_stockRIC['Date'] == date]
        auction_today = df_auctionRIC[df_auctionRIC['Date'] == date]

        # "10min" is the non-deprecated spelling of the old "10T" alias.
        stock_today = stock_today.set_index(pd.DatetimeIndex(stock_today['Local_Date_Time']))
        bars = stock_today.resample("10min").agg(
            {'High': 'max', 'Low': 'min', 'Open': 'first', 'Close': 'last'})

        # Skip days that do not span exactly 39 ten-minute bins.
        if len(bars) != 39:
            continue

        # Garman and Klass volatility formula, one value per bar.
        vol_today = np.sqrt(
            0.5*np.square(np.log(bars['High']/bars['Low']))
            - gk_weight*np.square(np.log(bars['Close']/bars['Open']))).tolist()

        # Change the 39th interval to use the auction price instead of Close.
        # .iloc makes the "last bar" lookup explicitly positional; plain [-1] on
        # a DatetimeIndex-indexed Series relies on a deprecated fallback.
        last_bar = bars.iloc[-1]
        auction_price = auction_today['Price'].iloc[0]
        vol_today[-1] = np.sqrt(
            0.5*np.square(np.log(last_bar['High']/last_bar['Low']))
            - gk_weight*np.square(np.log(auction_price/last_bar['Open'])))

        stocks.append(vol_today)
        # Daily total, used for the 20-day EWMA below.
        daily_volatility.append(sum(vol_today))

    if not stocks:
        # Same exception type the original surfaced, with an explicit message.
        raise ValueError("no trading day with 39 complete 10-minute intervals")

    # Lag the EWMA by one day so each day only sees information from earlier days.
    ewma = pd.Series(daily_volatility).ewm(span=20).mean()
    volatility_ewma20 = np.array([None] + list(ewma)[:-1])

    # Drop the first 20 days (EWMA warm-up) and put intervals on the rows.
    stocks = np.vstack(stocks)[20:].T  # Training data for Anderson model
    avg_20day_volatility = volatility_ewma20[20:]

    return stocks, avg_20day_volatility


In [50]:
# fourier 2A - Sliding window 
def fourier2A_results(stocks, avg_20day_volatility, J, P, window=20):
    """Sliding-window fit/predict of the last intraday interval (variant 2A).

    For each start index ``i`` a fresh ``flexible_fourier_regression`` is
    trained on columns ``i .. i+window-1`` of ``stocks`` and then asked to
    predict from ``avg_20day_volatility[i+window]``. The last element of that
    prediction is compared with the last row of column ``i+window``.

    Parameters
    ----------
    stocks : numpy.ndarray
        2-D array whose columns are days and whose last row is the target
        interval (39 rows are assumed by ``N=39`` below).
    avg_20day_volatility : array-like
        One value per column of ``stocks``.
    J, P : int
        Passed through to ``flexible_fourier_regression``.
    window : int, optional
        Number of consecutive days per training window. Defaults to 20, the
        previously hard-coded value.

    Returns
    -------
    (MAE, MSE, R2, fourier_window, predicted_fourier_window)
        The three error metrics, the model fitted on the LAST window only, and
        the list of per-day predictions.

    Raises
    ------
    ValueError
        If ``stocks`` has no more than ``window`` columns.

    Notes
    -----
    The number of windows used to be the literal 211, which only matches an
    input with exactly 231 columns; it is now ``stocks.shape[1] - window``
    (still 211 for 231 columns).
    """
    n_windows = stocks.shape[1] - window
    if n_windows < 1:
        raise ValueError(
            "need more than {} day columns, got {}".format(window, stocks.shape[1]))

    actual_fourier_window = []
    predicted_fourier_window = []

    for i in range(n_windows):

        # Coarse progress report: fraction of the final index reached.
        if i % 50 == 0:
            print(i/max(n_windows - 1, 1), "done")

        stocks_window = stocks[:, i:i+window]
        avg_20day_volatility_window = avg_20day_volatility[i:i+window]
        fourier_window = flexible_fourier_regression(N=39, di=[], J=J, P=P) # 39 10-minute intervals in each trade day
        # NOTE(review): the meaning of the third positional argument
        # (0.0000005) is defined by the external module - confirm there.
        fourier_window.train(stocks_window, avg_20day_volatility_window, 0.0000005)

        # No Bayes: predict the next day straight from its lagged EWMA volatility.
        result = fourier_window.predict(avg_20day_volatility[i+window])

        actual_fourier_window.append(stocks.T[i+window][-1])
        predicted_fourier_window.append(result[-1][0])

    MAE = mean_absolute_error(actual_fourier_window, predicted_fourier_window)
    MSE = mean_squared_error(actual_fourier_window, predicted_fourier_window)
    R2 = r2_score(actual_fourier_window, predicted_fourier_window)

    return MAE, MSE, R2, fourier_window, predicted_fourier_window


In [34]:
# fourier 2B - Sliding window - Adaptive
def fourier2B_results(stocks, avg_20day_volatility, fourier_window, J, P, window=20):
    """Sliding-window prediction with a per-day volatility update (variant 2B).

    For each day after the first ``window`` columns, the already-fitted
    ``fourier_window`` model updates the day's volatility input via
    ``vol_update`` and predicts from the updated value. The last element of the
    prediction is compared with the last row of that day's column.

    Parameters
    ----------
    stocks : numpy.ndarray
        2-D array whose columns are days and whose last row is the target interval.
    avg_20day_volatility : array-like
        One value per column of ``stocks``.
    fourier_window : object
        Fitted model exposing ``vol_update`` and ``predict``. The same model is
        used for every day; nothing is re-trained here.
    J, P : int
        Unused; kept so existing call sites keep working.
    window : int, optional
        Number of leading day columns to skip. Defaults to 20, the previously
        hard-coded value.

    Returns
    -------
    (MAE, MSE, R2, actual_fourier_window)
        The three error metrics and the list of actual target values.

    Raises
    ------
    ValueError
        If ``stocks`` has no more than ``window`` columns.

    Notes
    -----
    The number of evaluated days used to be the literal 211, which only matches
    an input with exactly 231 columns; it is now ``stocks.shape[1] - window``.
    """
    n_windows = stocks.shape[1] - window
    if n_windows < 1:
        raise ValueError(
            "need more than {} day columns, got {}".format(window, stocks.shape[1]))

    actual_fourier_window = []
    predicted_fourier_window = []

    for i in range(n_windows):
        # After Bayes
        # NOTE(review): `:39` passes every row of the day, including the last
        # one, which is also the value being predicted - confirm that
        # vol_update is meant to see it.
        bayes_dayVol = fourier_window.vol_update(stocks.T[i+window,:39], avg_20day_volatility[i+window], tol=1e-8)
        result = fourier_window.predict(bayes_dayVol)

        actual_fourier_window.append(stocks.T[i+window][-1])
        predicted_fourier_window.append(result[-1][0])

    MAE = mean_absolute_error(actual_fourier_window, predicted_fourier_window)
    MSE = mean_squared_error(actual_fourier_window, predicted_fourier_window)
    R2 = r2_score(actual_fourier_window, predicted_fourier_window)

    return MAE, MSE, R2, actual_fourier_window

In [42]:
# fourier 2C - Sliding window - 2A prediction scaled by today's/yesterday's pre-close volatility ratio
def fourier2C_results(vol_df, actual_fourier_window, predicted_fourier_window):
    """Score the ratio-scaled variant (2C) of the sliding-window prediction.

    Each prediction is multiplied by the ratio of the day's
    ``daily_volatility_minus4pm`` to the previous row's value, and the scaled
    predictions are scored against ``actual_fourier_window``.

    Parameters
    ----------
    vol_df : pandas.DataFrame
        Per-day table with a ``daily_volatility_minus4pm`` column. Rows from
        position 40 onwards are used and must be as many as there are predictions.
    actual_fourier_window : sequence of float
        Actual target values, one per prediction.
    predicted_fourier_window : sequence of float
        Unscaled predictions.

    Returns
    -------
    (MAE, MSE, R2)
        Metrics computed with the first day left out, because it has no
        previous row to form the ratio.
    """
    # Take an explicit copy: adding columns to the bare slice vol_df[40:] is
    # chained assignment (SettingWithCopyWarning, silenced by the notebook-wide
    # warnings filter).
    vol_df_window = vol_df[40:].copy()
    vol_df_window['volatility_fourier_window'] = predicted_fourier_window
    vol_df_window['daily_volatility_minus4pm_yest'] = vol_df_window['daily_volatility_minus4pm'].shift(1)
    vol_df_window['volatility_fourier_2c'] = (
        vol_df_window['daily_volatility_minus4pm']
        / vol_df_window['daily_volatility_minus4pm_yest']
        * vol_df_window['volatility_fourier_window'])
    predicted_fourier_window2C = np.array(vol_df_window['volatility_fourier_2c'])

    # Skip index 0: shift(1) leaves no "yesterday" value for the first row.
    MAE = mean_absolute_error(actual_fourier_window[1:], predicted_fourier_window2C[1:])
    MSE = mean_squared_error(actual_fourier_window[1:], predicted_fourier_window2C[1:])
    R2 = r2_score(actual_fourier_window[1:], predicted_fourier_window2C[1:])

    return MAE, MSE, R2

In [56]:
# combine all functions 
def results(df_stock, df_stockqqq, df_auction, df_auctionqqq, stock_name, J=1, P=15):
    """Run variants 2A, 2B and 2C for one stock and display their metrics.

    Prepares the trade and auction frames for ``stock_name``, builds the
    volatility table and the sliding-window inputs, evaluates the three
    variants, then prints the ticker and displays a table with one row per
    variant and columns MAE, MSE and R^2. Also sets the global pandas
    ``display.float_format`` option. Returns nothing.
    """
    def _format_metric(x):
        # Fixed-point for values above 0.01 in magnitude, scientific otherwise.
        if abs(x) >0.01:
            return '{:,.4f}'.format(x)
        return '{:,.3e}'.format(x)

    # Per-stock inputs.
    trades_ric, df_stockqqq = load_stockdata(df_stock, df_stockqqq, stock_name)
    auctions_ric, df_auctionqqq = load_auctiondata(df_auction, df_auctionqqq, stock_name)
    vol_df = get_voldf(trades_ric, auctions_ric)
    stocks, avg_20day_volatility = process_data_moving_window(trades_ric, auctions_ric)

    # Evaluate the three variants; each outcome starts with (MAE, MSE, R2).
    outcome_2a = fourier2A_results(stocks, avg_20day_volatility, J, P)
    fitted_model, predictions_2a = outcome_2a[3], outcome_2a[4]
    outcome_2b = fourier2B_results(stocks, avg_20day_volatility, fitted_model, J, P)
    actual_values = outcome_2b[3]
    outcome_2c = fourier2C_results(vol_df, actual_values, predictions_2a)

    scores = [outcome_2a[:3], outcome_2b[:3], outcome_2c]
    data = {
        "Metrics": ["Fourier Auction Vol (2A)", "Fourier Auction Vol Adaptive (2B)", "Fourier Auction Vol/ratio (2C)"],
        "MAE": [score[0] for score in scores],
        "MSE": [score[1] for score in scores],
        "R^2": [score[2] for score in scores],
    }

    pd.set_option('display.float_format', _format_metric)

    table = pd.DataFrame(data, index=None)
    print(stock_name)
    display(table)

# Run the code

In [51]:
# Load the four raw CSV datasets once (per-stock trades, QQQ trades, per-stock
# auctions, QQQ auctions); every later cell reads these notebook-level names.
df_stock = load_stock()
df_stockqqq = load_stockqqq()
df_auction = load_auction()
df_auctionqqq = load_auctionqqq()

In [57]:
# Tickers (RIC codes) to evaluate
stock_list = ["MSFT.O", "AAPL.O", "GOOGL.O", "MRNA.O", "INTC.O", "PEP.O",
              "ZM.O", "EBAY.O", "PTON.O"]

# Run the full 2A/2B/2C evaluation per ticker; results() prints the ticker and
# displays its metrics table. J=1, P=15 are the defaults of results().
for stock_name in stock_list:
    results(df_stock, df_stockqqq, df_auction, df_auctionqqq, stock_name, J=1, P=15)

MSFT.O
0.0 done
0.23809523809523808 done
0.47619047619047616 done
0.7142857142857143 done
0.9523809523809523 done
0
Model fit success: True
1
Model fit success: True
2
Model fit success: True
3
Model fit success: True
4
Model fit success: True
5
Model fit success: True
6
Model fit success: True
7
Model fit success: True
8
Model fit success: True
9
Model fit success: True
10
Model fit success: True
11
Model fit success: True
12
Model fit success: True
13
Model fit success: True
14
Model fit success: True
15
Model fit success: True
16
Model fit success: True
17
Model fit success: True
18
Model fit success: True
19
Model fit success: True
20
Model fit success: True
21
Model fit success: True
22
Model fit success: True
23
Model fit success: True
24
Model fit success: True
25
Model fit success: True
26
Model fit success: True
27
Model fit success: True
28
Model fit success: True
29
Model fit success: True
30
Model fit success: True
31
Model fit success: True
32
Model fit success: True
33
Mo

Unnamed: 0,Metrics,MAE,MSE,R^2
0,Fourier Auction Vol (2A),0.001689,5.164e-06,-2.2031
1,Fourier Auction Vol Adaptive (2B),0.00133,3.522e-06,-1.1846
2,Fourier Auction Vol/ratio (2C),0.001671,4.775e-06,-2.0954


AAPL.O
0.0 done


KeyboardInterrupt: 

# Check

In [21]:
# Step-by-step check of the pipeline for a single ticker: reload all four
# datasets so the cells below start from freshly read frames.
stock_name = "MSFT.O"
df_stock = load_stock()
df_stockqqq = load_stockqqq()
df_auction = load_auction()
df_auctionqqq = load_auctionqqq()

In [23]:
# Prepare the per-ticker trade/auction frames and the per-day volatility table.
df_stockRIC, df_stockqqq = load_stockdata(df_stock, df_stockqqq, stock_name)
df_auctionRIC, df_auctionqqq = load_auctiondata(df_auction, df_auctionqqq, stock_name)
# `process_dfs` is not defined anywhere in this notebook (NameError on a fresh
# kernel); get_voldf is the function that builds vol_df, as used in results().
vol_df = get_voldf(df_stockRIC, df_auctionRIC)

In [26]:
# Interval-by-day volatility matrix and the lagged 20-day EWMA used as model input.
stocks, avg_20day_volatility = process_data_moving_window(df_stockRIC, df_auctionRIC)

In [28]:
# Rows are the 10-minute intervals, columns are the retained trading days
# (process_data_moving_window transposes its day-by-interval stack).
stocks.shape

(39, 231)

In [29]:
# Variant 2A with J=1, P=15; also keeps the last fitted model and the predictions for 2B/2C.
MAE_2a, MSE_2a, R2_2a, fourier_window, predicted_fourier_window = fourier2A_results(stocks, avg_20day_volatility, 1, 15)

optimizer success: True
objective function (mse): 0.00000066
0
optimizer success: True
objective function (mse): 0.00000126
1
optimizer success: True
objective function (mse): 0.00000113
2
optimizer success: True
objective function (mse): 0.00000103
3
optimizer success: True
objective function (mse): 0.00000090
4
optimizer success: True
objective function (mse): 0.00000075
5
optimizer success: True
objective function (mse): 0.00000073
6
optimizer success: True
objective function (mse): 0.00000072
7
optimizer success: True
objective function (mse): 0.00000069
8
optimizer success: True
objective function (mse): 0.00000077
9
optimizer success: True
objective function (mse): 0.00000083
10
optimizer success: True
objective function (mse): 0.00000095
11
optimizer success: True
objective function (mse): 0.00000102
12
optimizer success: True
objective function (mse): 0.00000105
13
optimizer success: True
objective function (mse): 0.00000105
14
optimizer success: True
objective function (mse): 

optimizer success: True
objective function (mse): 0.00000115
128
optimizer success: True
objective function (mse): 0.00000111
129
optimizer success: True
objective function (mse): 0.00000106
130
optimizer success: True
objective function (mse): 0.00000100
131
optimizer success: True
objective function (mse): 0.00000099
132
optimizer success: True
objective function (mse): 0.00000102
133
optimizer success: True
objective function (mse): 0.00000107
134
optimizer success: True
objective function (mse): 0.00000115
135
optimizer success: True
objective function (mse): 0.00000119
136
optimizer success: True
objective function (mse): 0.00000120
137
optimizer success: True
objective function (mse): 0.00000123
138
optimizer success: True
objective function (mse): 0.00000121
139
optimizer success: True
objective function (mse): 0.00000120
140
optimizer success: True
objective function (mse): 0.00000118
141
optimizer success: True
objective function (mse): 0.00000113
142
optimizer success: True
o

In [35]:
# Variant 2B: reuses the model returned by the 2A cell above (must be run first).
MAE_2b, MSE_2b, R2_2b, actual_fourier_window = fourier2B_results(stocks, avg_20day_volatility, fourier_window, 1, 15)

0
Model fit success: True
1
Model fit success: True
2
Model fit success: True
3
Model fit success: True
4
Model fit success: True
5
Model fit success: True
6
Model fit success: True
7
Model fit success: True
8
Model fit success: True
9
Model fit success: True
10
Model fit success: True
11
Model fit success: True
12
Model fit success: True
13
Model fit success: True
14
Model fit success: True
15
Model fit success: True
16
Model fit success: True
17
Model fit success: True
18
Model fit success: True
19
Model fit success: True
20
Model fit success: True
21
Model fit success: True
22
Model fit success: True
23
Model fit success: True
24
Model fit success: True
25
Model fit success: True
26
Model fit success: True
27
Model fit success: True
28
Model fit success: True
29
Model fit success: True
30
Model fit success: True
31
Model fit success: True
32
Model fit success: True
33
Model fit success: True
34
Model fit success: True
35
Model fit success: True
36
Model fit success: True
37
Model fi

In [43]:
# Variant 2C: needs vol_df, the actual values from 2B and the predictions from 2A.
MAE_2c, MSE_2c, R2_2c = fourier2C_results(vol_df, actual_fourier_window, predicted_fourier_window)

In [44]:
# Display the 2C mean absolute error computed in the previous cell.
MAE_2c

0.0016707010972491635

In [None]:
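# ## Hyper-parameter tuning using first 200 days
# NOTE: this cell is disabled. It reads `stocks200` and `avg_20day_volatility200`,
# which are not defined at notebook scope, and it assigns to `results`, which would
# shadow the results() function defined above. Fix both before re-enabling it.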

# params_MAE = (0,0)
# params_MSE = (0,0)
# params_r_squared = (0,0)

# MAE_min = float('inf')
# MSE_min = float('inf')
# R_squared_max = -float('inf')

# for j in [1,2,3,4]:
#     for p in [15,17,19,21,23,25]:      
#         print('Checking for:', (j,p))
        
#         # Train the model based on first 200 days
#         fourier = flexible_fourier_regression(N=39, di=[], J=j, P=p) # 39 10-minute intervals in each trade day
#         res = fourier.train(stocks200, avg_20day_volatility200, 0.0000005) 

#         # Prediction on the last 31 days
#         results = fourier.predict(avg_20day_volatility[-31:])

#         if mean_absolute_error(stocks.T[-31:,-1], results[-1,:]) < MAE_min:
#             params_MAE = (j,p)
#             MAE_min = mean_absolute_error(stocks.T[-31:,-1], results[-1,:])
            
#         if mean_squared_error(stocks.T[-31:,-1], results[-1,:]) < MSE_min:
#             params_MSE = (j,p)
#             MSE_min = mean_squared_error(stocks.T[-31:,-1], results[-1,:])
            
#         if r2_score(stocks.T[-31:,-1], results[-1,:]) > R_squared_max:
#             params_r_squared = (j,p)
#             R_squared_max = r2_score(stocks.T[-31:,-1], results[-1,:])

# print(params_MAE)
# print(params_MSE)
# print(params_r_squared)
# print(R_squared_max)

(3, 19)
(4, 19)
(4, 19)
