In [3]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt

import plotly.express as px
import plotly.graph_objects as go

In [None]:
!pip install yfinance
!pip install quandl

In [4]:
import yfinance as yf
import quandl

In [5]:
#symbols = ['HG=F', 'PL=F','SI=F', 'GC=F', 'LBS=F', 'CL=F']
#start = '2011-01-01'
#end = '2021-07-10'

In [None]:
#df = pd.DataFrame()
#for s in symbols:
 #   df[s] = yf.download(s,start,end)['Adj Close']

In [6]:
quandl.ApiConfig.api_key = "dFWX9cPgKNkz_shZZ2x3"

In [7]:
# Industrial Production
indpro = quandl.get("FRED/INDPRO", start_date='2000-07-01', end_date = '2021-07-01', collapse = 'monthly')
indpro.rename(columns={ "Value" : "INDPRO"}, inplace = True)
indpro.tail()

Unnamed: 0_level_0,INDPRO
Date,Unnamed: 1_level_1
2021-02-28,96.372
2021-03-31,98.94
2021-04-30,98.966
2021-05-31,99.6536
2021-06-30,100.0951


In [8]:
# CPI for All Urban Consumers
CPI = quandl.get("FRED/CPIAUCSL", start_date='2000-07-01', end_date = '2021-07-01', collapse = 'monthly')
CPI.rename(columns={ "Value" : "CPI_All"}, inplace = True)
CPI.tail()

Unnamed: 0_level_0,CPI_All
Date,Unnamed: 1_level_1
2021-02-28,263.161
2021-03-31,264.793
2021-04-30,266.832
2021-05-31,268.551
2021-06-30,270.981


In [9]:
# Aggregate data into single dataframe
fred_data = pd.concat([indpro, CPI], axis=1, join= 'inner')

In [10]:
# Set the lag period for lagging the data
return_period = 3

In [11]:
# Reframe data into supervised regression-based framework 

Y = np.log(fred_data.loc[:, ('CPI_All')]).diff(return_period).shift(-return_period)
Y.name = Y.name[-1]+'_pred'

X = np.log(fred_data.loc[:, ( 'INDPRO')]).diff(return_period)

dataset = pd.concat([Y, X], axis=1).dropna().iloc[::return_period, :]
Y = dataset.loc[:, Y.name]
X = dataset.loc[:, X.name]

In [None]:
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.neural_network import MLPRegressor

#Libraries for Deep Learning Models
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import SGD
from keras.layers import LSTM
from keras.wrappers.scikit_learn import KerasRegressor

#Libraries for Statistical Models
import statsmodels.api as sm

#Libraries for Saving the Model
from pickle import dump
from pickle import load

# Error Metrics
from sklearn.metrics import mean_squared_error

# Feature Selection
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_regression

In [13]:
# Split data into training and test sets
validation_size = 0.2

#In case the data is not dependent on the time series, then train and test split randomly
seed = 7
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=validation_size, random_state=seed)

#In case the data is not dependent on the time series, then train and test split should be done based on sequential sample
#This can be done by selecting an arbitrary split point in the ordered list of observations and creating two new datasets.
train_size = int(len(X) * (1-validation_size))
X_train, X_test = X[0:train_size], X[train_size:len(X)]
Y_train, Y_test = Y[0:train_size], Y[train_size:len(X)]

In [14]:
num_folds = 10
seed = 7
# scikit is moving away from mean_squared_error. 
# In order to avoid confusion, and to allow comparison with other models, we invert the final scores
scoring = 'neg_mean_squared_error' 

In [15]:
# Model selection
models = []
models.append(('LR', LinearRegression()))
models.append(('LASSO', Lasso()))
models.append(('EN', ElasticNet()))
models.append(('KNN', KNeighborsRegressor()))
models.append(('CART', DecisionTreeRegressor()))
models.append(('SVR', SVR()))
models.append(('MLP', MLPRegressor()))
# Boosting methods
models.append(('ABR', AdaBoostRegressor()))
models.append(('GBR', GradientBoostingRegressor()))
# Bagging methods
models.append(('RFR', RandomForestRegressor()))
models.append(('ETR', ExtraTreesRegressor()))

In [19]:
# Loop over the models 
names = []
kfold_results = []
test_results = []
train_results = []
for name, model in models:
    names.append(name)
   
    ## K Fold analysis:
    
    kfold = KFold(n_splits=num_folds)
    #converted mean square error to positive. The lower the beter
    cv_results = -1* cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    kfold_results.append(cv_results)
    

    # Full Training period
    #lr.fit(x.reshape(-1, 1), y)
    
    res = model.fit(X_train.array.reshape(-1,1), Y_train)
    train_result = mean_squared_error(res.predict(X_train), Y_train)
    train_results.append(train_result)
    
    # Test results
    test_result = mean_squared_error(res.predict(X_test), Y_test)
    test_results.append(test_result)
    
    msg = "%s: %f (%f) %f %f" % (name, cv_results.mean(), cv_results.std(), train_result, test_result)
    print(msg)


Estimator fit failed. The score on this train-test partition for these parameters will be set to nan. Details: 
ValueError: Expected 2D array, got 1D array instead:
array=[ 0.01198668 -0.00272389  0.00831039 -0.00813497  0.00613703  0.00584237
  0.00897557  0.00604451  0.00683934  0.01099468  0.01377789  0.00774591
  0.00184316 -0.00410973  0.01734377  0.00576864  0.00245492  0.00146946
  0.00569312  0.01874667 -0.00028309  0.00163451  0.00418703 -0.01382729
 -0.01294039 -0.05053464 -0.06681205 -0.03028361 -0.00225409  0.02205772
  0.01825478  0.01439611  0.01998038  0.00340126  0.00822076  0.00239665
  0.00942175  0.01158421  0.01129554  0.00553041  0.00374066 -0.00147736
  0.00645886  0.00838211 -0.00073062  0.01002235  0.00121227  0.01779098
  0.00991011  0.00139138 -0.00139138 -0.01530399 -0.00101546 -0.01017482
 -0.00680332 -0.01022909  0.00437664 -0.00231716].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it cont

ValueError: ignored