### APS 1052 Final Project

The focus of the team's model is going to be the seed model from the book Machine Learning and Data Science Blueprints for Finance. The model is going to be using stock data provided via the Google Drive, sub-folder > IntraDayData > TradeAndQuoteETFData. The goal of our model is to use the stock regression model provided from the book to predict the future price of an ETF. 


For this case study, the independent variables used are the following potentially correlated assets:

Stocks: Apple, Meta, Microsoft, Tesla

ETFs: QQQ, SPY

Indices: S&P500, Dow Jones, VIX

In [240]:
# Load libraries
import numpy as np
import pandas as pd

## Note -  Nov 25 meeting:
Features: SMA, RSI,ROC,MOM,lags

Technical Indicators: Talib (metioned above) VIX, historical(https://www.cboe.com/tradable_products/vix/vix_historical_data/), Alternative Money Flow, Put-Call Parity

Input processing: smoothing(exp)

Metrics: same as homework



In [241]:
# Load libraries
import numpy as np
import pandas as pd
import pandas_datareader.data as web
from matplotlib import pyplot
from pandas.plotting import scatter_matrix
import seaborn as sns
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.neural_network import MLPRegressor

#Libraries for Deep Learning Models
from keras.models import Sequential
from keras.layers import Dense
from tensorflow.keras.optimizers import SGD
from keras.layers import LSTM
from keras.wrappers.scikit_learn import KerasRegressor

#Libraries for Statistical Models
import statsmodels.api as sm

#Libraries for Saving the Model
from pickle import dump
from pickle import load

# Time series Models
from statsmodels.tsa.arima_model import ARIMA
#from statsmodels.tsa.statespace.sarimax import SARIMAX

# Error Metrics
from sklearn.metrics import mean_squared_error

# Feature Selection
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_regression


#Plotting 
from pandas.plotting import scatter_matrix
from statsmodels.graphics.tsaplots import plot_acf


In [254]:
from sklearn.preprocessing import FunctionTransformer
from scipy.stats import spearmanr

from sklearn.metrics import make_scorer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import TimeSeriesSplit

from sklearn import linear_model
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [243]:
# stocks
appl_data = pd.read_csv('AAPL.csv')
jpm_data = pd.read_csv('JPM.csv')
msft_data = pd.read_csv('MSFT.csv') 
amzn_data = pd.read_csv('AMZN.csv')

# ETFs
spy_data = pd.read_csv('SPY.csv')
qqq_data = pd.read_csv('QQQ.csv')
ief_data = pd.read_csv('IEF.csv')
# indices 
vix_data = pd.read_csv('^VIX.csv')
dia_data = pd.read_csv('DIA.csv')
sp500_data = pd.read_csv('SP500.csv')

# predicting stock (y-variable)
ivv_data = pd.read_csv('IVV.csv')

In [244]:
# function to clean the data 
# changes dates to datetime format & ensures all numbers are type float64 
def clean_data(df): 
    cleaned_df = pd.DataFrame()
    cleaned_df["Date"] = pd.to_datetime(df.iloc[:, 0])
    cleaned_df[["Open", "High", "Low", "Close", "Adj Close", "Volume"]] = df[["Open", "High", "Low", "Close", "Adj Close", "Volume"]].astype(str).astype(float)
    return cleaned_df
 
    
df_list = [appl_data, jpm_data, msft_data, amzn_data, spy_data, qqq_data,
        vix_data, dia_data, sp500_data, ivv_data]

# iterates through list of all dataframes and cleans format 
for stock in df_list: 
    df = pd.DataFrame()
    df = clean_data(stock)
    stock = df

In [245]:
stk_tickers = ['AAPL', 'JPM', 'MSFT','AMZN']
etf_tickers = ['SPY', 'QQQ']
idx_tickers = ['SP500', 'DIA','^VIX']

In [246]:
import talib as ta
def get_indicator(df,indicator):
    ret_df = df
    timeperiod = 14
    if 'MACD' in indicator: #Moving Average Convergence Divergence
        macd, macdsignal, macdhist = ta.MACD(df.Close.values, fastperiod=12, slowperiod=26, signalperiod=9)
        ret_df = ret_df.join(pd.DataFrame([macd, macdsignal, macdhist]).T.rename(columns={0: "Macddif", 1: "Macddem", 2: "Macdhist"}))
    if 'MFI' in indicator: 
        mfi = ta.MFI(np.reshape(df.High.values.astype('float64'),-1), \
            np.reshape(df.Low.values.astype('float64'),-1), np.reshape(df.Close.values.astype('float64'),-1),\
                 np.reshape(df.Volume.values.astype('float64'),-1), timeperiod=timeperiod)
        ret_df = ret_df.join(pd.DataFrame([mfi]).T.rename(columns={0: "Mfi"}))
    if 'SMA' in indicator: 
        sma = ta.SMA(df.Close.values)
        ret_df = ret_df.join(pd.DataFrame([sma]).T.rename(columns={0: "Sma"}))
    if 'MOM' in indicator: 
        mom = ta.MOM(df.Close.values, timeperiod=timeperiod)
        ret_df = ret_df.join(pd.DataFrame([mom]).T.rename(columns={0: "Mom"}))  
    if 'ROC' in indicator: 
        roc = ta.ROC(df.Close.values, timeperiod=timeperiod)
        ret_df = ret_df.join(pd.DataFrame([roc]).T.rename(columns={0: "Roc"}))  
    if 'RSI' in indicator: 
        rsi = ta.RSI(df.Close.values, timeperiod=timeperiod)
        ret_df = ret_df.join(pd.DataFrame([rsi]).T.rename(columns={0: "Rsi"}))  
    if 'ATR' in indicator: #Volatility Indicator   -   Average True Range
        rsi = ta.ATR(df.High.values,df.Low.values,df.Close.values, timeperiod=timeperiod)
        ret_df = ret_df.join(pd.DataFrame([rsi]).T.rename(columns={0: "Atr"}))  
    return ret_df

In [247]:
#### TECHNICAL INDICATOR ########################################################################################################### 
#### LAGS 

# create function to calculate lags based on return period (value can be changed)
return_period = 3

def calc_general_lag(df):
    temp = pd.DataFrame(np.log(df["Adj Close"]).diff(return_period).shift(-return_period))
    return temp

# iterate through entire list except for ivv_data 
appended_data = []
for stock in df_list:
    temp = calc_general_lag(stock)
    appended_data.append(temp)


dataset = pd.DataFrame()
dataset = pd.concat(appended_data, axis=1)
col_names = ["AAPL", "JPM", "MSFT", "AMZN", "SPY", "QQQ", "VIX", "DIA", "SP500", "IVV_pred"]
dataset.columns = col_names


# calculate lags specifically for the y-variable (IVV) - lags are in multiples of 3 
ivv_lags = pd.concat([np.log(ivv_data["Adj Close"]).diff(i) for i in [return_period, return_period*3, return_period*6, return_period*10]], axis=1)
ivv_lags.columns = ["IVV_" + str(return_period), "IVV_" + str(return_period*3), "IVV_" + str(return_period*6), "IVV_" + str(return_period*10)]

# full dataset with all X and Y variables 
full_data = pd.concat([dataset, ivv_lags], axis=1).dropna().iloc[::return_period, :]

# data description 
data_descrip = full_data.describe()


# set the X and Y variables 
Y = full_data[["IVV_pred"]].reset_index(drop= True)
X = full_data.loc[:, full_data.columns != 'IVV_pred'].reset_index(drop= True)

In [248]:
# calculate mfi using yahoo data
p1=pd.DataFrame(np.log(spy_data['Adj Close']))
p2=pd.DataFrame(np.log(ief_data['Adj Close']))
mfi=(p1.subtract(p2).dropna())
mfi=mfi.rolling(20).mean()
mfi.columns = ['Mfi']
X = X.join(mfi)

In [249]:
ivv = get_indicator(ivv_data, ['SMA','RSI','ROC','MOM','ATR'])
X = X.join(ivv.iloc[:,-5:])
X

Unnamed: 0,AAPL,JPM,MSFT,AMZN,SPY,QQQ,VIX,DIA,SP500,IVV_3,IVV_9,IVV_18,IVV_30,Mfi,Sma,Mom,Roc,Rsi,Atr
0,0.058253,-0.075365,-0.003345,0.065253,-0.030803,-0.047204,0.046368,-0.025585,-0.023136,-0.024613,-0.025049,-0.064056,0.011124,,,,,,
1,-0.011380,-0.059983,-0.054218,-0.099549,-0.034099,-0.042381,0.055973,-0.030358,-0.041999,-0.030727,-0.018245,-0.108627,0.006382,,,,,,
2,-0.010155,0.001038,0.021219,0.066289,0.024442,0.027219,-0.136994,0.012735,0.025169,-0.033184,-0.088524,-0.112120,-0.083386,,,,,,
3,-0.013010,0.024098,0.000649,-0.011730,-0.000116,0.017440,-0.013970,0.002380,-0.008268,0.025398,-0.038512,-0.063561,-0.068887,,,,,,
4,-0.033636,-0.159492,-0.054681,-0.023882,-0.059109,-0.077052,0.145404,-0.051820,-0.057438,0.000234,-0.007552,-0.025797,-0.102335,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1607,-0.022461,-0.002562,-0.015776,-0.010553,-0.010535,-0.023069,0.127979,-0.006157,-0.010877,0.014216,0.028331,0.077686,0.053848,-0.081495,87.863333,1.919998,2.151982,52.433775,4.372908
1608,0.013964,-0.006344,0.015806,0.018109,0.008184,0.013008,-0.127372,0.000471,0.007737,-0.010385,0.011711,0.039119,0.065051,-0.081245,87.650333,-0.650002,-0.721263,50.035584,4.309843
1609,0.051137,-0.021299,0.017182,0.041537,0.004908,0.017932,0.064576,-0.005246,0.004632,0.008002,0.011832,0.032552,0.086740,-0.076192,87.594667,6.990005,8.499520,49.683890,4.165569
1610,0.022176,0.031572,-0.010575,-0.031893,-0.003284,-0.010685,0.096911,-0.001755,-0.002946,0.004910,0.002527,0.030858,0.068722,-0.074741,87.421333,2.559998,2.994850,47.886576,4.097314


In [250]:
X = X.fillna(0)

In [258]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.5, random_state=1)