# Dependencies and notebook settings

In [1]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import statsmodels.api as sm
from tqdm import tqdm
from IPython.core.display import display, HTML
import ngboost
import talib
from sklearn.tree import DecisionTreeRegressor

# Notebook-wide presentation settings: wider page container, ggplot look,
# figure resolution and the number of DataFrame columns shown.
display(HTML("<style>.container { width:90% !important; }</style>"))
plt.style.use("ggplot")
mpl.rc("figure", dpi=100)
pd.options.display.max_columns = 500
# Optional: uncomment to also raise the number of displayed rows.
# pd.set_option('display.max_rows', 500)

# Data import and preparation

In [2]:
# Read the daily price file; the file's own header row is replaced by the
# explicit column names and the parsed "Date" column becomes the index.
df = pd.read_csv(
    "../dataset/spx.csv",
    header=0,
    names=["Date", "Open", "High", "Low", "Close", "Volume"],
    parse_dates=["Date"],
    index_col="Date",
)

# Keep only observations dated before 2020-10-01.
df = df.loc[df.index < "2020-10-01"]

# Daily log return in percent: 100 * (ln(Close_t) - ln(Close_{t-1})).
log_close = np.log(df["Close"])
df["rr"] = (log_close - log_close.shift(1)) * 100

# The first row has no previous close, so its return is NaN; drop rows with NaNs.
df = df.dropna()

# Feature engineering

## Time variables

In [3]:
# Calendar features derived from the DatetimeIndex.
df["day_of_week"] = df.index.dayofweek
df["day_of_year"] = df.index.dayofyear
# `DatetimeIndex.week` was deprecated in pandas 1.1 and removed in pandas 2.0.
# The ISO week number from `isocalendar()` is its documented replacement; it is
# cast to int64 (isocalendar returns UInt32) so the column dtype stays the same,
# and converted to an array so the assignment is positional.
df["week"] = df.index.isocalendar().week.to_numpy(dtype="int64")
df["quarter"] = df.index.quarter

## Stationarization 

In [4]:
# First differences of the four price columns, stored as "<column>_stationary".
for price_col in ("Open", "High", "Low", "Close"):
    df[f"{price_col}_stationary"] = df[price_col].diff()

## Intra day relations

In [5]:
# Within-day spreads: close relative to open, and the high-low range.
df["Close_minus_Open"] = df["Close"].sub(df["Open"])
df["High_minus_Low"] = df["High"].sub(df["Low"])

## Shallow technical analysis variables and ARMA proxy 

### Simple statistics 

In [6]:
# TA-Lib MA, EMA and STDDEV of the daily return `rr` for three window lengths.
# The column suffix identifies the window: "" -> 2, "_2" -> 3, "_W" -> 6.
for suffix, window in (("", 2), ("_2", 3), ("_W", 6)):
    df["MA" + suffix] = talib.MA(df["rr"], timeperiod=window)
    df["EMA" + suffix] = talib.EMA(df["rr"], timeperiod=window)
    df["STD" + suffix] = talib.STDDEV(df["rr"], timeperiod=window)

### Volatility Indicators 

In [7]:
# TA-Lib ATR (7-period) and TRANGE computed from the high/low/close columns.
high, low, close = df["High"], df["Low"], df["Close"]
df["ATR"] = talib.ATR(high, low, close, timeperiod=7)
df["TRANGE"] = talib.TRANGE(high, low, close)

### Volume Indicators 

In [8]:
# TA-Lib OBV and ADOSC, both driven by the traded volume.
volume = df["Volume"]
df["OBV"] = talib.OBV(df["Close"], volume)
df["ADOSC"] = talib.ADOSC(df["High"], df["Low"], df["Close"], volume)

## Dataset cleaning 

In [9]:
# Display every column currently in the frame (raw prices plus engineered features).
df.columns

Index(['Open', 'High', 'Low', 'Close', 'Volume', 'rr', 'day_of_week',
       'day_of_year', 'week', 'quarter', 'Open_stationary', 'High_stationary',
       'Low_stationary', 'Close_stationary', 'Close_minus_Open',
       'High_minus_Low', 'MA', 'EMA', 'STD', 'MA_2', 'EMA_2', 'STD_2', 'MA_W',
       'EMA_W', 'STD_W', 'ATR', 'TRANGE', 'OBV', 'ADOSC'],
      dtype='object')

In [10]:
# Keep the target, calendar features, volume and engineered columns;
# the raw Open/High/Low/Close price levels are not carried forward.
selected_columns = [
    'rr',
    'day_of_week', 'day_of_year', 'week', 'quarter',
    'Volume',
    'Open_stationary', 'High_stationary', 'Low_stationary', 'Close_stationary',
    'Close_minus_Open', 'High_minus_Low',
    'MA', 'EMA', 'STD',
    'MA_2', 'EMA_2', 'STD_2',
    'MA_W', 'EMA_W', 'STD_W',
    'ATR', 'TRANGE', 'OBV', 'ADOSC',
]
df = df[selected_columns]

## Variables lagging/shifting

In [11]:
# Columns that get lagged copies (shifted by 1, 2 and 3 rows).
to_lag = [
    'rr', 'Volume',
    'Open_stationary', 'High_stationary', 'Low_stationary', 'Close_stationary',
    'Close_minus_Open', 'High_minus_Low',
    'MA', 'EMA', 'STD',
    'MA_2', 'EMA_2', 'STD_2',
    'MA_W', 'EMA_W', 'STD_W',
    'ATR', 'TRANGE', 'OBV', 'ADOSC',
]

# Build every "<column>_L<k>" series first, in the same order the columns
# would be appended one by one (source column outer, lag inner).
lagged_columns = {
    f"{source}_L{lag}": df[source].shift(lag)
    for source in to_lag
    for lag in range(1, 4)
}

# Unlagged versions are dropped for everything except the target `rr`;
# the lagged columns are then appended after the remaining ones.
unlagged_to_drop = [source for source in to_lag if source != "rr"]
df = pd.concat(
    [df.drop(columns=unlagged_to_drop), pd.DataFrame(lagged_columns)],
    axis=1,
)

In [12]:
# Display the frame's (rows, columns) after the lagged columns were added.
df.shape

(7664, 68)

In [13]:
# Every column except the target `rr` is a candidate feature.
# An ordered list is used instead of a set for two reasons:
#   * pandas >= 2.0 raises a TypeError when a set is used as a DataFrame
#     indexer (`train[features]`);
#   * set iteration order for strings changes between interpreter sessions,
#     so the column order fed to the model would not be reproducible.
features = [column for column in df.columns if column != "rr"]

# Feature selection using tree-based feature selection

"NGBoost does provide methods to interpret models fit with regression tree base learners. Since each parameter in the distribution is fit by a separate sequence of learners, there will be multiple model interpretation results, one for each parameter. The default distribution used is Normal so the following example shows results for the loc and scale parameters." [source](https://stanfordmlgroup.github.io/ngboost/3-interpretation.html)

Taking the above into consideration, I will assume that my default NGBoost model consists of:
* Normal distribution as the output distribution
* 3-depth Decision Tree as the base learner
* 500 iterations as the number of boosting iterations
* 0.01 learning rate
* negative log likelihood score as scoring function

In later modeling steps I will tune these parameters, but for now I keep them fixed.

Here I would like to find the variables that are, on average, the most important in each of the training-validation periods and for each of the distribution parameters.

In [14]:
# Start dates of the three training-validation periods analysed in the cells below.
starts = ["2006-01-12", "2008-01-15", "2014-01-13"]

## Training-validation period 1

In [15]:
# Rolling-window feature ranking for training-validation period 1.
# For each of 252 consecutive window positions the default NGBoost model is
# fitted on 3 * 252 rows, and every feature is ranked by importance separately
# for the loc and the scale parameter (rank 1 = most important).

# Ordered column list: a set cannot be used as a DataFrame indexer on
# pandas >= 2.0, and sorting keeps the column order identical between sessions.
feature_cols = sorted(features)
# Rank labels are sized from the actual number of features instead of a
# hard-coded 67, so the cell keeps working if the feature set changes.
ranks = np.arange(1, len(feature_cols) + 1)

loc_list = []
scale_list = []

start = starts[0]
df_tmp = df.loc[start:].head(252 * 4).copy()

for i in tqdm(range(0, 252)):
    train = df_tmp.iloc[i : i + 252 * 3].copy()
    ngb = ngboost.NGBRegressor(
        Dist=ngboost.distns.Normal,
        Score=ngboost.scores.LogScore,
        Base=DecisionTreeRegressor(criterion="friedman_mse", max_depth=3),
        n_estimators=500,
        learning_rate=0.01,
        minibatch_frac=1.0,
        col_sample=1.0,
        verbose=False,
        verbose_eval=500,
        tol=0.0001,
        random_state=2021)
    ngb.fit(train[feature_cols], train.rr)

    # One importance vector per distribution parameter: [0] is loc, [1] is scale.
    feature_importance_loc = ngb.feature_importances_[0]
    feature_importance_scale = ngb.feature_importances_[1]

    # Sort by importance (descending), then keep only the feature name and its rank.
    df_loc = (
        pd.DataFrame({"feature": feature_cols, "importance": feature_importance_loc})
        .sort_values("importance", ascending=False)
        .drop(columns=["importance"])
    )
    df_loc["index"] = ranks
    df_scale = (
        pd.DataFrame({"feature": feature_cols, "importance": feature_importance_scale})
        .sort_values("importance", ascending=False)
        .drop(columns=["importance"])
    )
    df_scale["index"] = ranks

    loc_list.append(df_loc)
    scale_list.append(df_scale)

# One long frame per parameter: a (feature, rank) row for every window position.
loc_df1 = pd.concat(loc_list)
scale_df1 = pd.concat(scale_list)

100%|██████████████████████████████████████████████████████████████████████████████| 252/252 [1:05:49<00:00, 15.67s/it]


## Training-validation period 2

In [16]:
# Rolling-window feature ranking for training-validation period 2.
# For each of 252 consecutive window positions the default NGBoost model is
# fitted on 3 * 252 rows, and every feature is ranked by importance separately
# for the loc and the scale parameter (rank 1 = most important).

# Ordered column list: a set cannot be used as a DataFrame indexer on
# pandas >= 2.0, and sorting keeps the column order identical between sessions.
feature_cols = sorted(features)
# Rank labels are sized from the actual number of features instead of a
# hard-coded 67, so the cell keeps working if the feature set changes.
ranks = np.arange(1, len(feature_cols) + 1)

loc_list = []
scale_list = []

start = starts[1]
df_tmp = df.loc[start:].head(252 * 4).copy()

for i in tqdm(range(0, 252)):
    train = df_tmp.iloc[i : i + 252 * 3].copy()
    ngb = ngboost.NGBRegressor(
        Dist=ngboost.distns.Normal,
        Score=ngboost.scores.LogScore,
        Base=DecisionTreeRegressor(criterion="friedman_mse", max_depth=3),
        n_estimators=500,
        learning_rate=0.01,
        minibatch_frac=1.0,
        col_sample=1.0,
        verbose=False,
        verbose_eval=500,
        tol=0.0001,
        random_state=2021)
    ngb.fit(train[feature_cols], train.rr)

    # One importance vector per distribution parameter: [0] is loc, [1] is scale.
    feature_importance_loc = ngb.feature_importances_[0]
    feature_importance_scale = ngb.feature_importances_[1]

    # Sort by importance (descending), then keep only the feature name and its rank.
    df_loc = (
        pd.DataFrame({"feature": feature_cols, "importance": feature_importance_loc})
        .sort_values("importance", ascending=False)
        .drop(columns=["importance"])
    )
    df_loc["index"] = ranks
    df_scale = (
        pd.DataFrame({"feature": feature_cols, "importance": feature_importance_scale})
        .sort_values("importance", ascending=False)
        .drop(columns=["importance"])
    )
    df_scale["index"] = ranks

    loc_list.append(df_loc)
    scale_list.append(df_scale)

# One long frame per parameter: a (feature, rank) row for every window position.
loc_df2 = pd.concat(loc_list)
scale_df2 = pd.concat(scale_list)

100%|██████████████████████████████████████████████████████████████████████████████| 252/252 [1:04:25<00:00, 15.34s/it]


## Training-validation period 3

In [17]:
# Rolling-window feature ranking for training-validation period 3.
# For each of 252 consecutive window positions the default NGBoost model is
# fitted on 3 * 252 rows, and every feature is ranked by importance separately
# for the loc and the scale parameter (rank 1 = most important).

# Ordered column list: a set cannot be used as a DataFrame indexer on
# pandas >= 2.0, and sorting keeps the column order identical between sessions.
feature_cols = sorted(features)
# Rank labels are sized from the actual number of features instead of a
# hard-coded 67, so the cell keeps working if the feature set changes.
ranks = np.arange(1, len(feature_cols) + 1)

loc_list = []
scale_list = []

start = starts[2]
df_tmp = df.loc[start:].head(252 * 4).copy()

for i in tqdm(range(0, 252)):
    train = df_tmp.iloc[i : i + 252 * 3].copy()
    ngb = ngboost.NGBRegressor(
        Dist=ngboost.distns.Normal,
        Score=ngboost.scores.LogScore,
        Base=DecisionTreeRegressor(criterion="friedman_mse", max_depth=3),
        n_estimators=500,
        learning_rate=0.01,
        minibatch_frac=1.0,
        col_sample=1.0,
        verbose=False,
        verbose_eval=500,
        tol=0.0001,
        random_state=2021)
    ngb.fit(train[feature_cols], train.rr)

    # One importance vector per distribution parameter: [0] is loc, [1] is scale.
    feature_importance_loc = ngb.feature_importances_[0]
    feature_importance_scale = ngb.feature_importances_[1]

    # Sort by importance (descending), then keep only the feature name and its rank.
    df_loc = (
        pd.DataFrame({"feature": feature_cols, "importance": feature_importance_loc})
        .sort_values("importance", ascending=False)
        .drop(columns=["importance"])
    )
    df_loc["index"] = ranks
    df_scale = (
        pd.DataFrame({"feature": feature_cols, "importance": feature_importance_scale})
        .sort_values("importance", ascending=False)
        .drop(columns=["importance"])
    )
    df_scale["index"] = ranks

    loc_list.append(df_loc)
    scale_list.append(df_scale)

# One long frame per parameter: a (feature, rank) row for every window position.
loc_df3 = pd.concat(loc_list)
scale_df3 = pd.concat(scale_list)

100%|██████████████████████████████████████████████████████████████████████████████| 252/252 [1:04:43<00:00, 15.41s/it]
