# Dependencies and notebook settings

In [1]:
# Third-party dependencies used throughout the notebook.
import pandas as pd
import numpy as np
import matplotlib as mpl
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import statsmodels.api as sm
from tqdm import tqdm
# `display` and `HTML` come from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated since IPython 7.14.
from IPython.display import display, HTML
import ngboost
import talib

# Notebook-wide presentation settings.
# Inject CSS so the notebook container takes 90% of the browser width.
display(HTML("<style>.container { width:90% !important; }</style>"))
# Use the "ggplot" matplotlib style and a figure resolution of 100 dpi.
plt.style.use("ggplot")
mpl.rcParams["figure.dpi"] = 100
# Allow up to 500 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 500)
# pd.set_option('display.max_rows', 500)

# Data import and preparation

In [2]:
# Load the price CSV, keep rows dated before 2020-10-01, add the daily log
# return in percent ("rr") and drop every row that contains a missing value
# (this includes the first row, whose return is undefined).
df = (
    pd.read_csv(
        "../dataset/spx.csv",
        header=0,  # the file's own header row is replaced by `names`
        names=["Date", "Open", "High", "Low", "Close", "Volume"],
        parse_dates=["Date"],
        index_col="Date",
    )
    .loc[lambda prices: prices.index < "2020-10-01"]
    # log(C_t) - log(C_{t-1}), scaled by 100
    .assign(rr=lambda prices: np.log(prices["Close"]).diff() * 100)
    .dropna()
)

# Feature engineering

## Time variables

In [3]:
# Calendar features read directly from the DatetimeIndex.
df["day_of_week"] = df.index.dayofweek
df["day_of_year"] = df.index.dayofyear
# `DatetimeIndex.week` was deprecated in pandas 1.1 and removed in 2.0;
# `isocalendar().week` is the documented replacement. It returns a UInt32
# Series, so cast to int64 and assign positionally (via a NumPy array) to
# keep the plain integer column the old accessor produced.
# NOTE(review): the cast assumes the index contains no NaT values - confirm.
df["week"] = df.index.isocalendar().week.astype("int64").to_numpy()
df["quarter"] = df.index.quarter

## Stationarization (first differences of the OHLC prices)

In [4]:
# First-difference each OHLC price column; the result is stored next to the
# original under "<column>_stationary".
for price_col in ("Open", "High", "Low", "Close"):
    df[f"{price_col}_stationary"] = df[price_col].diff()

## Intra day relations

In [5]:
# Same-day price spreads: close relative to open, and the high-low range.
df["Close_minus_Open"] = df["Close"].sub(df["Open"])
df["High_minus_Low"] = df["High"].sub(df["Low"])

## Shallow technical analysis variables and ARMA proxy 

### Simple statistics 

In [6]:
# Rolling statistics of the return series "rr" from TA-Lib: moving average,
# exponential moving average and standard deviation for three window lengths.
# Each pair is (column-name suffix, timeperiod passed to TA-Lib).
# NOTE(review): the "_2" suffix uses a period of 3 and "_W" a period of 6 -
# confirm these window lengths are intended.
for suffix, window in (("", 2), ("_2", 3), ("_W", 6)):
    df["MA" + suffix] = talib.MA(df["rr"], timeperiod=window)
    df["EMA" + suffix] = talib.EMA(df["rr"], timeperiod=window)
    df["STD" + suffix] = talib.STDDEV(df["rr"], timeperiod=window)

### Volatility Indicators 

In [7]:
# TA-Lib volatility indicators; both take the High, Low and Close series.
hlc_series = (df["High"], df["Low"], df["Close"])
df["ATR"] = talib.ATR(*hlc_series, timeperiod=7)  # ATR with a period of 7
df["TRANGE"] = talib.TRANGE(*hlc_series)

### Volume Indicators 

In [8]:
# TA-Lib volume indicators. No period arguments are passed to ADOSC, so the
# library's default periods apply.
df["OBV"] = talib.OBV(df.Close, df.Volume)
df["ADOSC"] = talib.ADOSC(df.High, df.Low, df.Close, df.Volume)

## Dataset cleaning 

In [9]:
# Show every column created so far, before selecting the ones to keep.
df.columns

Index(['Open', 'High', 'Low', 'Close', 'Volume', 'rr', 'day_of_week',
       'day_of_year', 'week', 'quarter', 'Open_stationary', 'High_stationary',
       'Low_stationary', 'Close_stationary', 'Close_minus_Open',
       'High_minus_Low', 'MA', 'EMA', 'STD', 'MA_2', 'EMA_2', 'STD_2', 'MA_W',
       'EMA_W', 'STD_W', 'ATR', 'TRANGE', 'OBV', 'ADOSC'],
      dtype='object')

In [10]:
# Keep the return, the calendar features, Volume and the engineered features.
# The raw Open/High/Low/Close price levels are not in the list and are dropped.
df = df.loc[:, [
    # return series
    'rr',
    # calendar features
    'day_of_week', 'day_of_year', 'week', 'quarter',
    # volume and differenced prices
    'Volume',
    'Open_stationary', 'High_stationary', 'Low_stationary', 'Close_stationary',
    # intra-day spreads
    'Close_minus_Open', 'High_minus_Low',
    # rolling statistics of rr
    'MA', 'EMA', 'STD',
    'MA_2', 'EMA_2', 'STD_2',
    'MA_W', 'EMA_W', 'STD_W',
    # TA-Lib volatility and volume indicators
    'ATR', 'TRANGE', 'OBV', 'ADOSC',
]]

## Variables lagging/shifting

In [11]:
# Variables for which lagged copies are created.
to_lag = ['rr', 'Volume', 'Open_stationary', 'High_stationary', 'Low_stationary','Close_stationary', 
          'Close_minus_Open', 'High_minus_Low', 'MA', 'EMA',  'STD', 'MA_2', 'EMA_2', 'STD_2', 'MA_W', 'EMA_W', 
          'STD_W', 'ATR', 'TRANGE', 'OBV', 'ADOSC']

# Number of lags per variable: columns "<name>_L1" ... "<name>_L10".
n_lags = 10

# Build every lagged series first and attach them with a single concat.
# Inserting 210 columns one at a time (and dropping columns in place between
# inserts) fragments the frame and triggers pandas' PerformanceWarning.
lag_columns = [
    df[name].shift(lag).rename(f"{name}_L{lag}")
    for name in to_lag
    for lag in range(1, n_lags + 1)
]

# The unlagged version of each variable is removed, except for "rr", which
# stays in the frame alongside its lags.
unlagged = df.drop(columns=[name for name in to_lag if name != "rr"])

# Resulting column order matches the original loop: the remaining unlagged
# columns first, then the lag columns grouped by variable in `to_lag` order.
df = pd.concat([unlagged, *lag_columns], axis=1)

In [14]:
# Sanity check on the frame size: 5 unlagged columns (rr plus the four
# calendar features) + 21 lagged variables x 10 lags = 215 columns.
df.shape

(7664, 215)

# Feature selection using XAI methods