In [30]:
# 16/06/2020
# TASI.AI
# Raphael Mourad

# ARIMA + Exogenous variables = ARIMAX

###### IMPORT LIBRARIES AND SET UP PARAMETERS

# Import libraries
import os
import pandas as pd
import numpy as np
import sklearn.metrics as mt
import random
from pandas_datareader import DataReader
from datetime import datetime
from matplotlib import pyplot
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.stats.diagnostic import acorr_ljungbox
from statsmodels.tsa.arima_model import ARIMA
from pmdarima import auto_arima
from scipy.stats import norm

In [2]:
# Set up directory
dir="/media/mourad/diskSave/MCF_Toulouse/recherche/ConsulProj/RamCiri"
os.chdir(dir)
print(os.getcwd())

/media/mourad/diskSave/MCF_Toulouse/recherche/ConsulProj/RamCiri


In [3]:
# Parameters
symbol="GOOG" # Set up the symbol you want
kdays=30 # Number of days for forecasting

In [4]:
# Create folder for symbol
symbolFolder="results/ARIMA/"+symbol
if(os.path.isdir(symbolFolder)==False):
    os.mkdir(symbolFolder, mode=0o777)

In [5]:
###### LOAD AND PREPROCESS DATA

### STOCK PRICE
# Import price
path="data/stockquotes/samples_L3_stockquotes_sample.csv.gz"
data_stockquotes = pd.read_csv(path, compression='gzip', header=0, sep=',', quotechar='"', error_bad_lines=False)
data_stockquotes["quotedate"] = pd.to_datetime(data_stockquotes["quotedate"])
#print(data_stockquotes)
print(np.unique(data_stockquotes["symbol"]))

# Choose the stock
data_stockquotes_sel=data_stockquotes[data_stockquotes["symbol"]==symbol]
data_stockquotes_sel=data_stockquotes_sel.sort_values('quotedate')
data_stockquotes_sel=data_stockquotes_sel[["symbol","quotedate","close"]]
#data_stockquotes_sel=data_stockquotes_sel.set_index('quotedate')
print(data_stockquotes_sel)

# Check the absence of missing data 
NAcount=data_stockquotes.isnull().sum().sum()
print("Missing data=",NAcount)

['AAPL' 'AMZN' 'AXP' 'BA' 'CAT' 'DIS' 'GOOG' 'GS' 'HD' 'IBM' 'JNJ' 'JPM'
 'KO' 'MCD' 'MRK' 'MSFT' 'NFLX' 'NKE' 'PFE' 'PG' 'XOM']
      symbol  quotedate        close
23481   GOOG 2015-05-01   537.900024
23882   GOOG 2015-05-04   540.780029
23974   GOOG 2015-05-05   530.799988
23598   GOOG 2015-05-06   524.219971
23944   GOOG 2015-05-07   530.700012
...      ...        ...          ...
23695   GOOG 2020-05-22  1410.420000
23267   GOOG 2020-05-26  1417.020000
23804   GOOG 2020-05-27  1417.840000
23742   GOOG 2020-05-28  1416.730000
23412   GOOG 2020-05-29  1428.920000

[1278 rows x 3 columns]
Missing data= 0


In [6]:
### STOCK OPTION PRICE
# Import option
#pathOption="data/options/samples_L3_options_"+symbol+".csv.gz"
#data_option = pd.read_csv(pathOption, compression='gzip', header=0, sep=',', quotechar='"', error_bad_lines=False)
#data_option=data_option.sort_values(["Expiration","DataDate"])
#print(data_option[1:5])
#data_option["DataDate"] = pd.to_datetime(data_option["quotedate"])
#print(data_option)

# Stats 
#uniqueDateExp=np.unique(data_option["Expiration"])
#print(uniqueDateExp[0:3])

In [7]:
### STOCK OPTION STATISTICS
# Import optionstats
pathOptionStats="data/optionstats/samples_L3_optionstats_sample.csv.gz"
data_optionStats = pd.read_csv(pathOptionStats, compression='gzip', header=0, sep=',', quotechar='"', error_bad_lines=False)
data_optionStats["quotedate"] = pd.to_datetime(data_optionStats["quotedate"])

# Choose the stock
data_optionStats_sel=data_optionStats[data_optionStats["symbol"]==symbol]
data_optionStats_sel=data_optionStats_sel.sort_values(["quotedate"])
#data_optionStats_sel=data_optionStats_sel.set_index('quotedate')
data_optionStats_sel=pd.DataFrame.drop_duplicates(data_optionStats_sel)
data_optionStats_sel=data_optionStats_sel.drop(columns=["symbol"])
print(data_optionStats_sel[["iv30call","iv30put"]])


       iv30call  iv30put
202      0.2106   0.2012
34324    0.2089   0.1835
11227    0.2205   0.1993
25867    0.2121   0.2186
17833    0.2114   0.1986
...         ...      ...
21701    0.2622   0.2626
46776    0.2629   0.2587
29196    0.2646   0.2688
7300     0.2701   0.2722
10736    0.2629   0.2579

[1114 rows x 2 columns]


In [8]:
### MERGE PRICE WITH OPTION STATS
priceOptionStats=data_stockquotes_sel.merge(data_optionStats_sel, left_on='quotedate', right_on='quotedate')
priceOptionStats=priceOptionStats.drop(columns=["symbol"])
priceOptionStats=priceOptionStats.set_index('quotedate')
print(priceOptionStats[["close","iv30call","iv30put"]])

                  close  iv30call  iv30put
quotedate                                 
2015-05-01   537.900024    0.2106   0.2012
2015-05-04   540.780029    0.2089   0.1835
2015-05-05   530.799988    0.2205   0.1993
2015-05-06   524.219971    0.2121   0.2186
2015-05-07   530.700012    0.2114   0.1986
...                 ...       ...      ...
2020-05-22  1410.420000    0.2622   0.2626
2020-05-26  1417.020000    0.2629   0.2587
2020-05-27  1417.840000    0.2646   0.2688
2020-05-28  1416.730000    0.2701   0.2722
2020-05-29  1428.920000    0.2629   0.2579

[1113 rows x 3 columns]


In [9]:
###### BUILD AND AUTOSELECT ARIMA MODEL

# Split train/test data
priceOptionStatsTrain = priceOptionStats[0:(len(priceOptionStats)-kdays)]
priceOptionStatsTest = priceOptionStats[(len(priceOptionStats)-kdays):len(priceOptionStats)]
#print(priceOptionStatsTrain[])
#print(priceOptionStatsTest[0:5])

In [10]:
# Build ARIMA model and finding automatically the best ARIMA model (function called auto_arima)
priceCloseTrain=priceOptionStatsTrain["close"]
priceCloseTest=priceOptionStatsTest["close"]
exogenousVars=["close","iv30call","iv30put","iv60call","iv60put","iv90call","iv90put","iv120call","iv120put",
                "iv150call","iv150put","iv180call","iv180put","iv360call","iv360put","totalvol","totaloi"]
exogenousTrain=priceOptionStatsTrain.drop(columns=exogenousVars)
exogenousTest=priceOptionStatsTest.drop(columns=exogenousVars)

autoARIMA = auto_arima(priceCloseTrain, seasonal=False, trace=True,
                error_action='ignore', suppress_warnings=True, stepwise=True)

Performing stepwise search to minimize aic
Fit ARIMA(2,1,2)x(0,0,0,0) [intercept=True]; AIC=9172.589, BIC=9202.509, Time=0.522 seconds
Fit ARIMA(0,1,0)x(0,0,0,0) [intercept=True]; AIC=9199.601, BIC=9209.574, Time=0.020 seconds
Fit ARIMA(1,1,0)x(0,0,0,0) [intercept=True]; AIC=9170.555, BIC=9185.515, Time=0.045 seconds
Fit ARIMA(0,1,1)x(0,0,0,0) [intercept=True]; AIC=9172.672, BIC=9187.632, Time=0.118 seconds
Fit ARIMA(0,1,0)x(0,0,0,0) [intercept=False]; AIC=9199.293, BIC=9204.279, Time=0.014 seconds
Fit ARIMA(2,1,0)x(0,0,0,0) [intercept=True]; AIC=9172.257, BIC=9192.203, Time=0.115 seconds
Fit ARIMA(1,1,1)x(0,0,0,0) [intercept=True]; AIC=9172.366, BIC=9192.312, Time=0.237 seconds
Fit ARIMA(2,1,1)x(0,0,0,0) [intercept=True]; AIC=9173.401, BIC=9198.333, Time=0.552 seconds
Total fit time: 1.639 seconds


In [22]:
# Ranking of exogenous features
# The higher the loss the better the feature
print("No exo: \tAIC="+str(round(autoARIMA.aic())))

exogenousVars=exogenousTrain.columns.values
for var in exogenousVars:
    autoARIMAi = auto_arima(priceCloseTrain, exogenous=pd.DataFrame(exogenousTrain[var]), seasonal=False, trace=False,
                error_action='ignore', suppress_warnings=True, stepwise=True)
    print("Exo "+var+": \tAIC loss="+str(round(autoARIMA.aic()-autoARIMAi.aic())))
    


No exo: 	AIC=9171.0
Exo iv30mean: 	AIC loss=339.0
Exo iv60mean: 	AIC loss=313.0
Exo iv90mean: 	AIC loss=276.0
Exo iv120mean: 	AIC loss=225.0
Exo iv150mean: 	AIC loss=204.0
Exo iv180mean: 	AIC loss=224.0
Exo iv360mean: 	AIC loss=156.0
Exo callvol: 	AIC loss=20.0
Exo putvol: 	AIC loss=6.0
Exo calloi: 	AIC loss=-1.0
Exo putoi: 	AIC loss=-2.0


In [21]:
print(exogenousVars)

['iv30mean' 'iv60mean' 'iv90mean' 'iv120mean' 'iv150mean' 'iv180mean'
 'iv360mean' 'callvol' 'putvol' 'calloi' 'putoi']


In [25]:
###### FORECAST AVERAGE PRICE USING ARIMA MODEL

# Exo varsiv150mean
exovars=["iv30mean","iv60mean","iv90mean","iv120mean","iv150mean","callvol","putvol"]
exovars=["iv30mean","iv60mean","iv90mean","iv120mean","iv150mean","iv180mean"]
exovars=["iv30mean"]

# "iv30mean","iv60mean","iv90mean","iv120mean","iv150mean"
#Test RMSE: 90.789  
#Test RMSE: 87.117 

# "iv30mean","iv60mean","iv90mean","iv120mean","iv150mean" "iv180mean"
#Test RMSE: 90.789
#Test RMSE: 87.729

# "iv30mean","iv60mean","iv90mean"
#Test RMSE: 90.789
#Test RMSE: 86.495

# "iv30mean","iv60mean"
#Test RMSE: 90.789
#Test RMSE: 91.499


# ARIMA with exogenous variables
autoARIMAexo = auto_arima(priceCloseTrain, exogenous=pd.DataFrame(exogenousTrain[exovars]), seasonal=False, trace=False,
                error_action='ignore', suppress_warnings=True, stepwise=True)

# Forecast no exo
future_forecast = autoARIMA.predict(n_periods=kdays)

# Forecast for all exogenous variables with ARIMA
future_forecast_forexo=pd.DataFrame()
for var in exogenousVars:
    autoARIMAforexoi = auto_arima(exogenousTrain[var], seasonal=False, trace=False,
                error_action='ignore', suppress_warnings=True, stepwise=True)
    future_forecast_forexoi=pd.DataFrame(autoARIMAforexoi.predict(n_periods=kdays))
    future_forecast_forexo=pd.concat([future_forecast_forexo, future_forecast_forexoi], axis=1)
future_forecast_forexo.columns=exogenousVars
    
# Forecast using ARIMA with exogenous variables
future_forecastexo = autoARIMAexo.predict(n_periods=kdays, exogenous=pd.DataFrame(future_forecast_forexo[exovars]))


# Forecast Error
print("Test RMSE: %.3f" % np.sqrt(mt.mean_squared_error(priceCloseTest, future_forecast)))
print("Test RMSE: %.3f" % np.sqrt(mt.mean_squared_error(priceCloseTest, future_forecastexo)))

KeyError: "None of [Index(['iv30mean'], dtype='object')] are in the [columns]"

In [47]:
#lossDiff=0
currentExogenousVars=np.empty()
#while lossDiff>0.001:
print(type(currentExogenousVars))
print(type(np.random.choice(exogenousVars,1)))
currentExogenousVars=np.concatenate(currentExogenousVars,np.random.choice(exogenousVars,1))
autoARIMAexok = auto_arima(priceCloseTrain, exogenous=pd.DataFrame(exogenousTrain[currentExogenousVars]), seasonal=False, trace=False,
                error_action='ignore', suppress_warnings=True, stepwise=True)
future_forecastexok = autoARIMAexo.predict(n_periods=kdays, exogenous=pd.DataFrame(future_forecast_forexo[exovars]))
np.sqrt(mt.mean_squared_error(priceCloseTest, future_forecastexo))


TypeError: Required argument 'shape' (pos 1) not found

In [28]:
#
exogenousTrainVar=exogenousTrain[exovars]
exogenousTrainVarLastK=exogenousTrainVar[(len(exogenousTrainVar)-kdays):len(exogenousTrainVar)]
#print(exogenousTrainVarLastK)
future_forecastexo = autoARIMAexo.predict(n_periods=kdays, exogenous=pd.DataFrame(exogenousTrainVarLastK))

# Forecast Error
print("Test RMSE: %.3f" % np.sqrt(mt.mean_squared_error(priceCloseTest, future_forecast)))
print("Test RMSE: %.3f" % np.sqrt(mt.mean_squared_error(priceCloseTest, future_forecastexo)))


Test RMSE: 90.789
Test RMSE: 111.152


