In [1]:
import yfinance as yf
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from statsmodels.tsa.api import SimpleExpSmoothing



## Set the Notebook Parameters
- Start and End Date
- Model File Name
- Exponential Smoothing Value

In [2]:
# Start and End Dates for Ticker Data
start_date = "2019-01-01"
end_date = "2021-06-27"

# Model File
model_file = "models/vix_prediction_20210627.pl"

# Alpha Value (for Smoothing)
alpha = 0.2

## Load the Yahoo Finance Data

In [3]:
# Helper Function
def download_data(ticker, start, end):
    return yf.download(ticker, start, end)

# List of Indices to Retrieve Data
#indices = ["^VIX", "^GSPC", "^DJI", "^TNX", "DX-Y.NYB", "GLD", "TIP", "VNQ"]
indices = ["^VIX", "^GSPC", "^IXIC", "^DJI", "^RUT", "CL=F", "GC=F", "SI=F", "^TNX", "BTC-USD", "ETH-USD", "DXY", "VNQ", "QQQ", "URA", "XAR", "AAPL", "EURUSD=X", "VEMAX", "ICVT", "XLY", "XLP", "VYM", "VFH"]
# List of lists
indices_data = [download_data(index, start_date, end_date) for index in indices]

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%********

## Scale the Independent Values
This uses a percent movement

In [4]:
# Previous Closing
def previous_close(df):
    df['Previous Close'] = df['Close'].shift(1)
    return df

# Percent Movement Helper Function
def market_movement(row):
    return (row['Close'] - row['Previous Close'])/row['Previous Close']

# Get Previous Close to Account for Futures
indices_data_prev = [previous_close(df) for df in indices_data]

# Apply helper function to all index dataframes
percent_movement = [index.apply(market_movement, axis=1) for index in indices_data_prev]

## Exponentially Smoothing

In [5]:
#smooth_percent_movement = [SimpleExpSmoothing(series[1:], initialization_method="heuristic").fit(smoothing_level=0,optimized=False).fittedvalues for series in percent_movement]

## Transform into a DataFrame

In [6]:
# Inital Correlation Dataframe
#all_indices_df = pd.concat(smooth_percent_movement, axis=1)
all_indices_df = pd.concat(percent_movement, axis=1)
all_indices_df.columns = [index[1:] + " Percent Movement" for index in indices]
original_columns = all_indices_df.columns
all_indices_df['Day'] = all_indices_df.index

# Direction Change
all_indices_df['GSPC Same Direction'] = (all_indices_df['GSPC Percent Movement'].shift(1) > 0)^(all_indices_df['GSPC Percent Movement'] < 0)
all_indices_df['VIX Same Direction'] = (all_indices_df['VIX Percent Movement'].shift(1) > 0)^(all_indices_df['VIX Percent Movement'] < 0)

# Dependent Variable
all_indices_df['VIX Same Direction Next Day'] = all_indices_df['VIX Same Direction'].shift(-1)

In [7]:
all_indices_df.columns

Index(['VIX Percent Movement', 'GSPC Percent Movement',
       'IXIC Percent Movement', 'DJI Percent Movement', 'RUT Percent Movement',
       'L=F Percent Movement', 'C=F Percent Movement', 'I=F Percent Movement',
       'TNX Percent Movement', 'TC-USD Percent Movement',
       'TH-USD Percent Movement', 'XY Percent Movement', 'NQ Percent Movement',
       'QQ Percent Movement', 'RA Percent Movement', 'AR Percent Movement',
       'APL Percent Movement', 'URUSD=X Percent Movement',
       'EMAX Percent Movement', 'CVT Percent Movement', 'LY Percent Movement',
       'LP Percent Movement', 'YM Percent Movement', 'FH Percent Movement',
       'Day', 'GSPC Same Direction', 'VIX Same Direction',
       'VIX Same Direction Next Day'],
      dtype='object')

## Train New Model

In [8]:
#independent_variables = ['GSPC Percent Movement', 'TNX Percent Movement', 'X-Y.NYB Percent Movement', 'LD Percent Movement', 'IP Percent Movement', 'NQ Percent Movement']
independent_variables = ['GSPC Percent Movement', 'IXIC Percent Movement', 'DJI Percent Movement', 'RUT Percent Movement', 'L=F Percent Movement', 'C=F Percent Movement', 'I=F Percent Movement', 'TNX Percent Movement', 'TC-USD Percent Movement', 'TH-USD Percent Movement', 'XY Percent Movement', 'NQ Percent Movement', 'QQ Percent Movement', 'RA Percent Movement', 'AR Percent Movement', 'APL Percent Movement', 'URUSD=X Percent Movement', 'EMAX Percent Movement', 'CVT Percent Movement', 'LY Percent Movement', 'LP Percent Movement', 'YM Percent Movement', 'FH Percent Movement']

X = all_indices_df[independent_variables]
y = all_indices_df['VIX Same Direction Next Day']

# Fill Non null values with mean
X.fillna(X.mean(), inplace=True)

# Categorical Label Encoder for Dependent Variable
le = preprocessing.LabelEncoder()

# Encode Categorical Dependent Variable
le.fit(y)
y = le.transform(y)

# Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)


## Score Model Accuracy with Test Split

In [9]:
X, y = make_classification(n_samples=100, n_features=2, n_informative=2, n_redundant=0, random_state=0, shuffle=False)

clf = RandomForestClassifier(max_depth=4, random_state=0)

# Train the Model
model = clf.fit(X_train, y_train)

model.predict(X_test)
model.predict_proba(X_test)

# Score the Model
model.score(X_test, y_test)

0.6318681318681318

## Save Model

In [10]:
model_file_handler = open(model_file,'wb')
pickle.dump(model, model_file_handler)
model_file_handler.close()