In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from numpy import log, square, sqrt, power, arange, ones, zeros, isscalar,\
    array, outer, pi, sin, cos, expand_dims, repeat, full, concatenate, ravel
from numpy.random import normal
from scipy.optimize import least_squares, minimize
%matplotlib inline

In [2]:
# Trade data

stock_name = 'MSFT.O'
#NSQ_OneYear100closeA_Sept21.csv
df = pd.read_csv('NSQ_OneYear100test_Sept21 2.csv', 
                   usecols=['Local_Date_Time','RIC','Open','High','Low','Close','VWAP','NumberOfTrades','Volume'],
                   dtype={'Local_Date_Time': str,
                          'RIC': str, 
                          'Open': np.float64,
                          'High': np.float64,
                          'Low': np.float64,
                          'Close': np.float64,
                          'VWAP': np.float64,
                          'NumberOfTrades': int,
                          'Volume': int},
                   skipinitialspace=True,
                   parse_dates=True)
apple = df[df['RIC'] == stock_name]
apple['Date'] = pd.to_datetime(apple['Local_Date_Time']).dt.date

qqq = pd.read_csv('QQQ_OneYear100test_Sept21.csv', 
                   usecols=['Local_Date_Time','RIC','Open','High','Low','Close','VWAP','NumberOfTrades','Volume'],
                   dtype={'Local_Date_Time': str,
                          'RIC': str, 
                          'Open': np.float64,
                          'High': np.float64,
                          'Low': np.float64,
                          'Close': np.float64,
                          'VWAP': np.float64,
                          'NumberOfTrades': int,
                          'Volume': int},
                   skipinitialspace=True,
                   parse_dates=True)
qqq['Date'] = pd.to_datetime(qqq['Local_Date_Time']).dt.date

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  apple['Date'] = pd.to_datetime(apple['Local_Date_Time']).dt.date


In [3]:
# Auction price data
auction = pd.read_csv('NSQ_OneYear100closeA_Sept21.csv',
                      skipinitialspace=True,
                      parse_dates=True)
apple_auction = auction[auction['RIC'] == stock_name]
apple_auction['Date'] = pd.to_datetime(apple_auction['Local_Date_Time']).dt.date

auction_qqq = pd.read_csv('QQQ_OneYear100closeA_Sept21.csv',
                      skipinitialspace=True,
                      parse_dates=True)
auction_qqq['Date'] = pd.to_datetime(auction_qqq['Local_Date_Time']).dt.date

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  apple_auction['Date'] = pd.to_datetime(apple_auction['Local_Date_Time']).dt.date


In [4]:
# Data Processing (10 minute intervals)
df_vol = pd.DataFrame(columns = ['High', 'Low', 'Open', 'Close', 'Volume', 'log_returns'])
dates = apple['Date'].unique()
daily_volatility = []
daily_volatility_minus4pm = []
daily_return = []
avg_20day_volatility = []
stocks = []
dates_final = []
auction_log_returns = []
auction_log_returns_340350 = []
volumes = []
i = 1
for date in dates:
    
    apple_today = apple[apple['Date'] == date]
    apple_auction_today = apple_auction[apple_auction['Date'] == date]
    
    apple_today = apple_today.set_index(pd.DatetimeIndex(apple_today['Local_Date_Time']))
    apple_today_10min_max = pd.DataFrame(apple_today['High'].resample("10T").max())
    apple_today_10min_min = pd.DataFrame(apple_today['Low'].resample("10T").min())
    apple_today_10min_open = pd.DataFrame(apple_today['Open'].resample("10T").first())
    apple_today_10min_close = pd.DataFrame(apple_today['Close'].resample("10T").last())
    apple_today_10min_volume = pd.DataFrame(apple_today['Volume'].resample("10T").sum())
    apple_today_10min = apple_today_10min_max.join(apple_today_10min_min)
    apple_today_10min = apple_today_10min.join(apple_today_10min_open)
    apple_today_10min = apple_today_10min.join(apple_today_10min_close)
    apple_today_10min = apple_today_10min.join(apple_today_10min_volume)
#     apple_today_10min['log_returns'] = abs(np.log(apple_today_10min['Close']/apple_today_10min['Open'])) # alternate volatility formula
    apple_today_10min['log_returns'] = np.sqrt(0.5*np.square(np.log(apple_today_10min['High']/apple_today_10min['Low'])) - (2*np.log(2)-1) * np.square(np.log(apple_today_10min['Close']/apple_today_10min['Open'])))
    volatility_today = apple_today_10min['log_returns'].values
    if len(volatility_today) != 39: # Skip if there are fewer than 390 minutes of trading data
        continue
    frames = [df_vol,apple_today_10min]
    df_vol = pd.concat(frames)



In [5]:
volatility_10min = df_vol['log_returns']

In [6]:
volatility_10min = volatility_10min.reset_index()

In [7]:
volatility_10min.rename(columns={'index':'DateTime'}, inplace=True)

In [11]:
volatility_10min_all = pd.DataFrame(columns=['DateTime','log_returns'])
for i in range(39):
    df_temp1 = volatility_10min.iloc[i::39, :]
    frames = [volatility_10min_all,df_temp1]
    volatility_10min_all = pd.concat(frames)

In [12]:
volatility_10min_all

Unnamed: 0,DateTime,log_returns
0,2020-09-01 09:30:00,0.004888
39,2020-09-02 09:30:00,0.005690
78,2020-09-03 09:30:00,0.006692
117,2020-09-04 09:30:00,0.009842
156,2020-09-08 09:30:00,0.011081
...,...,...
9632,2021-08-26 15:50:00,0.001514
9671,2021-08-27 15:50:00,0.001553
9710,2021-08-30 15:50:00,0.001232
9749,2021-08-31 15:50:00,0.003015


### Model 1A

Question 1A: How well does $\sigma_{t}^{2}$ forecast $\sigma_{t+1}^{2}$ ? Is it biased?

In [14]:
# Data Processing (10 minute intervals)

dates = apple['Date'].unique()
daily_volatility = []
daily_volatility_minus4pm = []
daily_return = []
avg_20day_volatility = []
stocks = []
dates_final = []
auction_log_returns = []
auction_log_returns_340350 = []
volumes = []

for date in dates:
    
    apple_today = apple[apple['Date'] == date]
    apple_auction_today = apple_auction[apple_auction['Date'] == date]
    
    apple_today = apple_today.set_index(pd.DatetimeIndex(apple_today['Local_Date_Time']))
    apple_today_10min_max = pd.DataFrame(apple_today['High'].resample("10T").max())
    apple_today_10min_min = pd.DataFrame(apple_today['Low'].resample("10T").min())
    apple_today_10min_open = pd.DataFrame(apple_today['Open'].resample("10T").first())
    apple_today_10min_close = pd.DataFrame(apple_today['Close'].resample("10T").last())
    apple_today_10min_volume = pd.DataFrame(apple_today['Volume'].resample("10T").sum())
    apple_today_10min = apple_today_10min_max.join(apple_today_10min_min)
    apple_today_10min = apple_today_10min.join(apple_today_10min_open)
    apple_today_10min = apple_today_10min.join(apple_today_10min_close)
    apple_today_10min = apple_today_10min.join(apple_today_10min_volume)
#     apple_today_10min['log_returns'] = abs(np.log(apple_today_10min['Close']/apple_today_10min['Open'])) # alternate volatility formula
    apple_today_10min['log_returns'] = np.sqrt(0.5*np.square(np.log(apple_today_10min['High']/apple_today_10min['Low'])) - (2*np.log(2)-1) * np.square(np.log(apple_today_10min['Close']/apple_today_10min['Open'])))
    volatility_today = apple_today_10min['log_returns'].values
    return_today = (apple_today_10min['Close']/apple_today_10min['Open']).values
    
    if len(volatility_today) != 39: # Skip if there are fewer than 390 minutes of trading data
        continue
        
    # Dates for which the trade data is complete
    dates_final.append(date)
    
    auction_log_returns_today = abs(np.log(apple_auction_today['Price'].iloc[0]/apple_today_10min['Open'][-1]))
    auction_log_returns.append(auction_log_returns_today)
    
    auction_log_returns_today_340350 = abs(np.log(apple_today_10min['Open'][-1]/apple_today_10min['Open'][-2]))
    auction_log_returns_340350.append(auction_log_returns_today_340350)
    
    # 10 minute interval volatility * number of complete trade days
    stocks.append(volatility_today)
    
    # Average daily volatility (later used for 20 day moving average)
    daily_volatility.append(np.sqrt(39)*np.sqrt(np.mean(np.square(apple_today_10min['log_returns'].values))))
    daily_volatility_minus4pm.append(np.sqrt(38)*np.sqrt(np.mean(np.square(apple_today_10min['log_returns'].values[:-1]))))
    daily_return.append(return_today)
    
    # Total volume for each complete trade day between 930 and 1550
    volumes.append(sum(apple_today_10min[:-1]['Volume']))

daily_return = np.array(daily_return)
volatility_df = pd.DataFrame({'Date':dates_final, 'daily_volatility': daily_volatility, 'daily_volatility_minus4pm': daily_volatility_minus4pm})
volatility_df['diff'] = volatility_df['daily_volatility']-volatility_df['daily_volatility_minus4pm']
volatility_df['volatility_ewma20'] = np.array([None] + [i for i in volatility_df['daily_volatility'].ewm(span=20).mean()][:-1])

stocks = stocks[20:]
stocks = np.vstack(stocks).T # Training data for Anderson model
avg_20day_volatility_raw = np.array(volatility_df['volatility_ewma20'])
avg_20day_volatility = np.array(volatility_df['volatility_ewma20'][20:]) # Sigma_t input for Anderson model

In [16]:
df_mod1A = pd.DataFrame()
r1 = volatility_df['Date'][20:].values
df_mod1A['Date'] = r1
df_mod1A['Actual Volatility'] = avg_20day_volatility
avg = avg_20day_volatility
avg = np.insert(avg,0,None)
avg = avg[:-1]
df_mod1A['Predicted Volatility'] = avg

In [17]:
df_mod1A

Unnamed: 0,Date,Actual Volatility,Predicted Volatility
0,2020-09-30,0.0182034,
1,2020-10-01,0.0179497,0.0182034
2,2020-10-02,0.0174781,0.0179497
3,2020-10-05,0.0173409,0.0174781
4,2020-10-06,0.0163856,0.0173409
...,...,...,...
226,2021-08-26,0.00880319,0.008986
227,2021-08-27,0.00865206,0.00880319
228,2021-08-30,0.00860936,0.00865206
229,2021-08-31,0.00844978,0.00860936


### Model 1B

Question 1B: How well does $\sigma_{t,B}^{2}$ forecast $\sigma_{t+1,B}^{2}$ ? Is it biased?

In [18]:
df_mod1B = pd.DataFrame()
r1 = volatility_df['Date'][-231:].values
df_mod1B['Date'] = r1
df_mod1B['Actual Volatility'] = volatility_10min_all['log_returns'][-231:].values
avg = volatility_10min_all['log_returns'][-231:].values
avg = np.insert(avg,0,None)
avg = avg[:-1]
df_mod1B['Predicted Volatility'] = avg

In [19]:
df_mod1B

Unnamed: 0,Date,Actual Volatility,Predicted Volatility
0,2020-09-30,0.004056,
1,2020-10-01,0.002168,0.004056
2,2020-10-02,0.003102,0.002168
3,2020-10-05,0.001469,0.003102
4,2020-10-06,0.004327,0.001469
...,...,...,...
226,2021-08-26,0.001514,0.001018
227,2021-08-27,0.001553,0.001514
228,2021-08-30,0.001232,0.001553
229,2021-08-31,0.003015,0.001232


### Model 1C

Question 1C: How well does $\gamma\sigma_{t}^{2}$, where $\gamma = \dfrac{\sigma_{t+1,A}^{2}}{\sigma_{t,A}^{2}}$ forecast $\sigma_{t+1,B}^{2}$ ? Is it biased? \
(A corresponds to 9:30:00 to 15:49:59\
B corresponds to 15:50:00 to 16:00:00)

In [None]:
df_A = pd.DataFrame()
temp = 0
for i in range(38):
    temp += volatility_10min.iloc[i::39, :]
    frames = [df_temp,df_temp1]
    df_temp = pd.concat(frames)

In [21]:
df_A = []  #List containing aggregated volatilities from 9:30 am to 15:50 pm
df_B = []  #List containing aggregated volatilities from 15:50 pm to 16:00 pm
temp = 0
j = 0
while j <= 9750: #Looping j to get the volaitlity value for every 39th row(at 15:50 pm)
    for i in range(38):
        temp += volatility_10min_all['log_returns'][j+i] #Aggregating the 10 min volatilities from 9:30 to 15:40
    df_A.append(temp)
    df_B.append(volatility_10min_all['log_returns'][j+38])
    j += 39

In [22]:
df_mod1C = pd.DataFrame()
r1 = volatility_df['Date'][-231:].values
df_mod1C['Date'] = r1
df_mod1C['Vol_A'] = df_A[20:]
df_mod1C['Vol_B'] = df_B[20:]

In [23]:
df_mod1C

Unnamed: 0,Date,Vol_A,Vol_B
0,2020-09-30,2.299281,0.004056
1,2020-10-01,2.374427,0.002168
2,2020-10-02,2.466752,0.003102
3,2020-10-05,2.513299,0.001469
4,2020-10-06,2.598020,0.004327
...,...,...,...
226,2021-08-26,15.659527,0.001514
227,2021-08-27,15.698810,0.001553
228,2021-08-30,15.734988,0.001232
229,2021-08-31,15.769956,0.003015


In [24]:
temp_pred = np.insert(df_A,0,None)
temp_pred = temp_pred[:-1]
predicted = np.divide(temp_pred,df_A)
temp_vol_B = np.insert(df_B,0,None)
temp_vol_B = temp_vol_B[:-1]

In [26]:
df_mod1C['Predicted_Vol_B'] = np.multiply(np.divide(temp_pred,df_A), df_B)[20:]

In [28]:
df_mod1C

Unnamed: 0,Date,Vol_A,Vol_B,Predicted_Vol_B
0,2020-09-30,2.299281,0.004056,0.003907
1,2020-10-01,2.374427,0.002168,0.002099
2,2020-10-02,2.466752,0.003102,0.002986
3,2020-10-05,2.513299,0.001469,0.001442
4,2020-10-06,2.598020,0.004327,0.004186
...,...,...,...,...
226,2021-08-26,15.659527,0.001514,0.001510
227,2021-08-27,15.698810,0.001553,0.001549
228,2021-08-30,15.734988,0.001232,0.001229
229,2021-08-31,15.769956,0.003015,0.003008
