In [1]:
import requests
import numpy as np
import pandas as pd
import io
!pip install plotly==4.12
import plotly.express as px
import plotly.graph_objects as go
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn import preprocessing
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.io as pio



In [2]:
pio.templates.default = "plotly_dark"

In [3]:
#%%shell
#jupyter nbconvert --to latex /DIX.ipynb

In [4]:
data = requests.get('https://www.alphavantage.co/query?function=TIME_SERIES_DAILY_ADJUSTED&symbol=SPY&outputsize=full&apikey=1JIZ8QID66LEDMHB').json()

In [456]:
open_df       = pd.DataFrame.from_dict(data['Time Series (Daily)']).T
open_df.index = pd.to_datetime(open_df.index)
open_df       = open_df.apply(pd.to_numeric, errors='coerce')
open_df.rename(columns={'1. open':'open'}, inplace=True)
open_df = open_df['open']
open_df = open_df.shift(1)

In [457]:
sm_data = requests.get('https://squeezemetrics.com/monitor/static/DIX.csv')

In [458]:
sm_df = pd.read_csv(io.BytesIO(sm_data.content), encoding='utf-8')  
sm_df.set_index(['date'], inplace=True)
sm_df = sm_df.apply(pd.to_numeric, errors='coerce')
sm_df.index = pd.to_datetime(sm_df.index)
sm_df.drop(['price'], axis=1, inplace=True)

In [459]:
vix_data = requests.get('http://www.cboe.com/publish/scheduledtask/mktdata/datahouse/vixcurrent.csv')

In [460]:
vix_df = pd.read_csv(io.BytesIO(vix_data.content), encoding='utf-8')  
vix_df.columns = vix_df.iloc[0]
vix_df.drop(vix_df.index[0], inplace=True)
vix_df['Date'] = pd.to_datetime(vix_df['Date'], format='%m/%d/%Y')
vix_df.set_index(['Date'], inplace=True)
vix_df = vix_df.apply(pd.to_numeric, errors='coerce')
vix_df.rename(columns={"VIX Close": "vix"}, inplace=True)
vix_df['vix_change'] = vix_df['vix'].pct_change(1)

In [461]:
df = pd.concat((sm_df, open_df, vix_df[['vix', 'vix_change']]), axis=1, join='inner')

periods = [1, 5, 10, 15, 30, 50, 100]

for p in periods:
    df[ str(p) + 'd_return'] = df['open'].pct_change(p).shift(periods=-p)

In [462]:
df

Unnamed: 0,dix,gex,open,vix,vix_change,1d_return,5d_return,10d_return,15d_return,30d_return,50d_return,100d_return
2011-05-02,0.378842,1.897313e+09,135.96,15.99,0.084068,-0.002133,-0.005811,-0.024051,-0.025890,-0.056781,-0.027876,-0.175419
2011-05-03,0.383411,1.859731e+09,135.67,16.70,0.044403,-0.011720,0.000000,-0.017911,-0.031326,-0.063463,-0.029557,-0.155230
2011-05-04,0.392122,1.717764e+09,134.08,17.08,0.022754,0.006414,0.000000,0.005370,-0.015289,-0.045868,-0.022375,-0.115976
2011-05-05,0.405457,1.361864e+09,134.94,18.20,0.065574,-0.005558,0.001556,-0.004521,-0.011635,-0.061657,-0.026679,-0.127168
2011-05-06,0.418649,1.490329e+09,134.19,18.40,0.010989,0.007303,-0.004695,-0.016469,0.004322,-0.043446,-0.008346,-0.127729
...,...,...,...,...,...,...,...,...,...,...,...,...
2020-11-16,0.414971,5.568672e+09,359.97,22.45,-0.028139,0.002611,,,,,,
2020-11-17,0.440556,5.804662e+09,360.91,22.71,0.011581,-0.014713,,,,,,
2020-11-18,0.416999,3.498179e+09,355.60,23.84,0.049758,0.005343,,,,,,
2020-11-19,0.438597,5.497968e+09,357.50,23.11,-0.030621,0.000000,,,,,,


In [463]:
colors =[[
'#00bfff',
'#00abe5',
'#0098cc',
'#0085b2',
'#007299',
'#005f7f',
'#004c66',
'#00394c',
'#002633',
'#001319',
'#000000'
],
[
'#ff0074',
'#e50068',
'#cc005c',
'#b20051',
'#990045',
'#7f003a',
'#66002e',
'#4c0022',
'#330017',
'#19000b',
'#000000',
]]


In [464]:
fig = px.histogram(df, x='dix', y=df.columns[5:], nbins=50, histfunc='avg',color_discrete_sequence=colors[1], barmode='overlay', title="DIX return histogram")
fig.update_layout(
    hovermode="x"
    )
fig.show()

In [465]:
fig = px.histogram(df, x='gex', y=df.columns[5:], nbins=50, histfunc='avg', color_discrete_sequence=colors[0], barmode='overlay', title="GEX return histogram")
fig.update_layout(
    hovermode="x"
    )
fig.show()

In [466]:
def df_to_plotly(df):
    return {'z': df.corr().values.tolist(),
            'x': df.corr().columns.tolist(),
            'y': df.corr().index.tolist()}

fig = go.Figure(data=go.Heatmap(df_to_plotly(df), colorscale=['#4900ff', '#ff0074']))
fig.update_layout(
    title="DIX & GEX correlation to returns",
    template="plotly_dark",
    hovermode="x"
    )
fig.show()

In [467]:
""" We want to know if the DIX was above the threshold at any point during the n prior days.
    Returns: 1 if DIX > threshold during prior n days, else 0.
"""

thresholds = [0.35, 0.40, 0.45, 0.5]
forward_df = pd.DataFrame()

def n_prior(x, t):
    result = False
    for i in x:
        if i > t:
            result = True
    return result

for p in periods:
    for t in thresholds:
        forward_df[str(p) + 'd_prior_' + 'abv' + str(int(t*100))] = df['dix'].rolling(p).apply(lambda x: n_prior(x, t))

forward_df.head()

Unnamed: 0,1d_prior_abv35,1d_prior_abv40,1d_prior_abv45,1d_prior_abv50,5d_prior_abv35,5d_prior_abv40,5d_prior_abv45,5d_prior_abv50,10d_prior_abv35,10d_prior_abv40,10d_prior_abv45,10d_prior_abv50,15d_prior_abv35,15d_prior_abv40,15d_prior_abv45,15d_prior_abv50,30d_prior_abv35,30d_prior_abv40,30d_prior_abv45,30d_prior_abv50,50d_prior_abv35,50d_prior_abv40,50d_prior_abv45,50d_prior_abv50,100d_prior_abv35,100d_prior_abv40,100d_prior_abv45,100d_prior_abv50
2011-05-02,1.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,
2011-05-03,1.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,
2011-05-04,1.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,
2011-05-05,1.0,1.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,
2011-05-06,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,


In [468]:
""" If DIX was above the threshold, we include the return in the cumulative return"""

forward_return_df = pd.DataFrame()

for p in periods:
    for t in thresholds:
        forward_return_df[str(p) + 'd_cumprod_' + 'abv' + str(int(t * 100))] = (df[forward_df[str(p) + 'd_prior_' + 'abv' + str(int(t*100))] == 1]['1d_return'] + 1).cumprod()

forward_return_df.ffill(axis=0, inplace=True)
forward_return_df.tail()

Unnamed: 0,1d_cumprod_abv35,1d_cumprod_abv40,1d_cumprod_abv45,1d_cumprod_abv50,5d_cumprod_abv35,5d_cumprod_abv40,5d_cumprod_abv45,5d_cumprod_abv50,10d_cumprod_abv35,10d_cumprod_abv40,10d_cumprod_abv45,10d_cumprod_abv50,15d_cumprod_abv35,15d_cumprod_abv40,15d_cumprod_abv45,15d_cumprod_abv50,30d_cumprod_abv35,30d_cumprod_abv40,30d_cumprod_abv45,30d_cumprod_abv50,50d_cumprod_abv35,50d_cumprod_abv40,50d_cumprod_abv45,50d_cumprod_abv50,100d_cumprod_abv35,100d_cumprod_abv40,100d_cumprod_abv45,100d_cumprod_abv50
2020-11-16,2.682978,3.090612,1.91497,1.150855,2.689545,2.720898,2.190798,1.159729,2.702231,2.591416,2.403808,1.159029,2.734581,2.687248,2.997418,1.20764,2.800574,2.792647,2.920335,1.496496,2.732304,2.877213,2.673959,1.555723,3.186843,3.186843,2.752262,1.634526
2020-11-17,2.643504,3.045141,1.91497,1.150855,2.649974,2.680866,2.190798,1.159729,2.662474,2.553289,2.403808,1.159029,2.694348,2.647711,2.997418,1.20764,2.75937,2.751559,2.920335,1.496496,2.692104,2.834881,2.673959,1.555723,3.139956,3.139956,2.711769,1.634526
2020-11-18,2.657628,3.061411,1.91497,1.150855,2.664133,2.69519,2.190798,1.159729,2.6767,2.566932,2.403808,1.159029,2.708744,2.661858,2.997418,1.20764,2.774113,2.766261,2.920335,1.496496,2.706488,2.850028,2.673959,1.555723,3.156733,3.156733,2.726258,1.634526
2020-11-19,2.657628,3.061411,1.91497,1.150855,2.664133,2.69519,2.190798,1.159729,2.6767,2.566932,2.403808,1.159029,2.708744,2.661858,2.997418,1.20764,2.774113,2.766261,2.920335,1.496496,2.706488,2.850028,2.673959,1.555723,3.156733,3.156733,2.726258,1.634526
2020-11-20,2.657628,3.061411,1.91497,1.150855,2.664133,2.69519,2.190798,1.159729,2.6767,2.566932,2.403808,1.159029,2.708744,2.661858,2.997418,1.20764,2.774113,2.766261,2.920335,1.496496,2.706488,2.850028,2.673959,1.555723,3.156733,3.156733,2.726258,1.634526


In [469]:
def make_return_plot(forward_return_df, df, thresholds, p):

    fig = go.Figure()
    fig.add_trace(go.Scatter(x=df.index, y=(df['1d_return'] + 1).cumprod(),
                        mode='lines',
                        name='S&P500 return',
                        line=dict(color='#8900ff', width=2)
                        ))

    i = 2
    for t in thresholds:
        fig.add_trace(go.Scatter(x=forward_return_df.index, y=forward_return_df[str(p) + 'd_cumprod_' + 'abv' + str(int(t * 100))],
                            mode='lines',
                            name='DIX above ' + str(int(t * 100)),
                            line=dict(color=colors[0][i], width=2)
                            ))
        i+=2


    fig.update_layout(
    title=str(p) + 'd_holding',
    yaxis_title="Cumulative returns",
    hovermode="x"
    )
    fig.show()

In [470]:
    for p in periods:
      make_return_plot(forward_return_df, df, thresholds, p)


In [472]:
rfc_data = np.array(df.dropna())

train_l = int(rfc_data.shape[0] * 0.3)

def quantiles(x):
  return pd.qcut(x, 10, labels=False)

features = np.hstack((rfc_data[:, 0:2], rfc_data[:, 4, np.newaxis]))
#features = rfc_data[:, 0:2]
labels = np.apply_along_axis(quantiles, 0, rfc_data[:, 5:])

X_train, X_test, y_train, y_test = features[0:train_l], features[train_l:], labels[0:train_l], labels[train_l:]

rfc = RandomForestClassifier(n_estimators=200, random_state=42, max_leaf_nodes=10, min_impurity_split=0.05)
multi_target_forest = MultiOutputClassifier(rfc, n_jobs=-1)
multi_target_forest.fit(X_train, y_train)

MultiOutputClassifier(estimator=RandomForestClassifier(bootstrap=True,
                                                       ccp_alpha=0.0,
                                                       class_weight=None,
                                                       criterion='gini',
                                                       max_depth=None,
                                                       max_features='auto',
                                                       max_leaf_nodes=10,
                                                       max_samples=None,
                                                       min_impurity_decrease=0.0,
                                                       min_impurity_split=0.05,
                                                       min_samples_leaf=1,
                                                       min_samples_split=2,
                                                       min_weight_fraction_leaf=0.0,
                    

In [473]:
pred = multi_target_forest.predict(X_test)

In [474]:
print(pred.shape)
print(pred)

(1616, 7)
[[8 9 9 ... 8 0 8]
 [9 9 9 ... 9 9 9]
 [8 9 9 ... 8 0 8]
 ...
 [9 9 9 ... 9 8 9]
 [2 8 7 ... 1 1 4]
 [1 2 6 ... 1 1 2]]


In [475]:
idx = df.dropna().index[train_l:]
columns = [str(p) + 'd' for p in periods]
rfc_df = pd.DataFrame(pred, columns=columns, index=idx)

q_thresholds = [5, 6, 7, 8, 9]

pred_df = pd.DataFrame(pred)
forward_rfc_df = pd.DataFrame()

def n_prior(x, q):
    result = False
    for i in x:
        if i >= q:
            result = True
    return result


for p in range(len(periods)):
    for q in q_thresholds:
        forward_rfc_df[str(periods[p]) + 'd_prior_' + 'abv' + str(q)] = pred_df.iloc[:, p].rolling(periods[p]).apply(lambda x: n_prior(x, q))


forward_rfc_df.set_index(idx, inplace=True)
forward_rfc_df


Unnamed: 0,1d_prior_abv5,1d_prior_abv6,1d_prior_abv7,1d_prior_abv8,1d_prior_abv9,5d_prior_abv5,5d_prior_abv6,5d_prior_abv7,5d_prior_abv8,5d_prior_abv9,10d_prior_abv5,10d_prior_abv6,10d_prior_abv7,10d_prior_abv8,10d_prior_abv9,15d_prior_abv5,15d_prior_abv6,15d_prior_abv7,15d_prior_abv8,15d_prior_abv9,30d_prior_abv5,30d_prior_abv6,30d_prior_abv7,30d_prior_abv8,30d_prior_abv9,50d_prior_abv5,50d_prior_abv6,50d_prior_abv7,50d_prior_abv8,50d_prior_abv9,100d_prior_abv5,100d_prior_abv6,100d_prior_abv7,100d_prior_abv8,100d_prior_abv9
2014-01-31,1.0,1.0,1.0,1.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2014-02-03,1.0,1.0,1.0,1.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2014-02-04,1.0,1.0,1.0,1.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2014-02-05,1.0,1.0,1.0,1.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2014-02-06,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-06-25,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2020-06-26,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2020-06-29,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2020-06-30,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [476]:
forward_rfc_return_df = pd.DataFrame()
df_cut = df.loc[forward_rfc_df.index]

for p in periods:
    for q in q_thresholds:
        forward_rfc_return_df[str(p) + 'd_cumprod_' + 'abv' + str(q)] = (df_cut[forward_rfc_df[str(p) + 'd_prior_' + 'abv' + str(q)] == 1]['1d_return'] + 1).cumprod()

forward_rfc_return_df.ffill(axis=0, inplace=True)
forward_rfc_return_df

Unnamed: 0,1d_cumprod_abv5,1d_cumprod_abv6,1d_cumprod_abv7,1d_cumprod_abv8,1d_cumprod_abv9,5d_cumprod_abv5,5d_cumprod_abv6,5d_cumprod_abv7,5d_cumprod_abv8,5d_cumprod_abv9,10d_cumprod_abv5,10d_cumprod_abv6,10d_cumprod_abv7,10d_cumprod_abv8,10d_cumprod_abv9,15d_cumprod_abv5,15d_cumprod_abv6,15d_cumprod_abv7,15d_cumprod_abv8,15d_cumprod_abv9,30d_cumprod_abv5,30d_cumprod_abv6,30d_cumprod_abv7,30d_cumprod_abv8,30d_cumprod_abv9,50d_cumprod_abv5,50d_cumprod_abv6,50d_cumprod_abv7,50d_cumprod_abv8,50d_cumprod_abv9,100d_cumprod_abv5,100d_cumprod_abv6,100d_cumprod_abv7,100d_cumprod_abv8,100d_cumprod_abv9
2014-01-31,0.983031,0.983031,0.983031,0.983031,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2014-02-03,0.982076,0.982076,0.982076,0.982076,0.999028,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2014-02-04,0.986571,0.986571,0.986571,0.986571,0.999028,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2014-02-05,1.001910,1.001910,1.001910,1.001910,0.999028,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2014-02-06,1.009721,1.009721,1.009721,1.009721,0.999028,1.007795,1.007795,1.007795,1.007795,1.007795,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-06-23,1.748294,1.940843,1.838411,1.877831,1.617364,1.527653,1.527653,1.392342,1.392342,1.482820,1.804888,1.804888,1.653449,1.733179,1.797817,1.537454,1.515661,1.490401,1.681688,1.571160,1.622725,1.669853,1.669853,1.549191,1.838414,1.664704,1.664704,1.602035,1.602035,1.734063,1.561943,1.561943,1.561943,1.698946,1.566163
2020-06-24,1.763791,1.958047,1.854707,1.894476,1.631701,1.541194,1.541194,1.404684,1.404684,1.495963,1.820887,1.820887,1.668106,1.748542,1.813753,1.551083,1.529096,1.503612,1.696595,1.585087,1.637109,1.684655,1.684655,1.562923,1.854710,1.679460,1.679460,1.616236,1.616236,1.749434,1.575789,1.575789,1.575789,1.714005,1.580045
2020-06-25,1.736426,1.927668,1.825932,1.865084,1.606385,1.517283,1.517283,1.382891,1.382891,1.472754,1.792636,1.792636,1.642226,1.721414,1.785613,1.527018,1.505372,1.480284,1.670273,1.560495,1.611709,1.658518,1.658518,1.538675,1.825934,1.653404,1.653404,1.591160,1.591160,1.722292,1.551341,1.551341,1.551341,1.687413,1.555531
2020-06-26,1.751290,1.944168,1.841561,1.881049,1.620136,1.530270,1.530270,1.394728,1.394728,1.485360,1.807981,1.807981,1.656283,1.736148,1.800897,1.540089,1.518258,1.492954,1.684570,1.573852,1.625505,1.672714,1.672714,1.551846,1.841564,1.667556,1.667556,1.604780,1.604780,1.737034,1.564620,1.564620,1.564620,1.701857,1.568846


In [477]:
def make_rfc_return_plot(forward_return_df, df, thresholds, p):

    fig = go.Figure()
    fig.add_trace(go.Scatter(x=df.index, y=(df['1d_return'] + 1).cumprod(),
                        mode='lines',
                        name='S&P500 return',
                        line=dict(color='#8900ff', width=2)
                        ))

    i = 2
    for t in thresholds:
        fig.add_trace(go.Scatter(x=forward_return_df.index, y=forward_return_df[str(p) + 'd_cumprod_' + 'abv' + str(t)],
                            mode='lines',
                            name='DIX above ' + str(t),
                            line=dict(color=colors[0][i], width=2)
                            ))
        i+=1


    fig.update_layout(
    title=str(p) + 'd_holding',
    yaxis_title="Cumulative returns",
    hovermode="x"
    )
    fig.show()

In [480]:
    for p in periods:
      make_rfc_return_plot(forward_rfc_return_df, df_cut, q_thresholds, p)

In [479]:
df

Unnamed: 0,dix,gex,open,vix,vix_change,1d_return,5d_return,10d_return,15d_return,30d_return,50d_return,100d_return
2011-05-02,0.378842,1.897313e+09,135.96,15.99,0.084068,-0.002133,-0.005811,-0.024051,-0.025890,-0.056781,-0.027876,-0.175419
2011-05-03,0.383411,1.859731e+09,135.67,16.70,0.044403,-0.011720,0.000000,-0.017911,-0.031326,-0.063463,-0.029557,-0.155230
2011-05-04,0.392122,1.717764e+09,134.08,17.08,0.022754,0.006414,0.000000,0.005370,-0.015289,-0.045868,-0.022375,-0.115976
2011-05-05,0.405457,1.361864e+09,134.94,18.20,0.065574,-0.005558,0.001556,-0.004521,-0.011635,-0.061657,-0.026679,-0.127168
2011-05-06,0.418649,1.490329e+09,134.19,18.40,0.010989,0.007303,-0.004695,-0.016469,0.004322,-0.043446,-0.008346,-0.127729
...,...,...,...,...,...,...,...,...,...,...,...,...
2020-11-16,0.414971,5.568672e+09,359.97,22.45,-0.028139,0.002611,,,,,,
2020-11-17,0.440556,5.804662e+09,360.91,22.71,0.011581,-0.014713,,,,,,
2020-11-18,0.416999,3.498179e+09,355.60,23.84,0.049758,0.005343,,,,,,
2020-11-19,0.438597,5.497968e+09,357.50,23.11,-0.030621,0.000000,,,,,,
