In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn import preprocessing
from sklearn import model_selection
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split,GridSearchCV,RandomizedSearchCV
from sklearn import metrics
import pandas_datareader.data as web 
from time import time
from sklearn.model_selection import KFold

#Read stock data use pandas_datareader.data from web
#Get Stock Data

In [2]:
stk_tickers = ['MSFT', 'IBM', 'GOOGL']
ccy_tickers = ['DEXJPUS', 'DEXUSUK']
idx_tickers = ['SP500', 'DJIA', 'VIXCLS']
stk_data = web.DataReader(stk_tickers, 'yahoo')
ccy_data = web.DataReader(ccy_tickers, 'fred')
idx_data = web.DataReader(idx_tickers, 'fred')

In [3]:
#Select columns
base = stk_data.loc[:, ('Adj Close', 'MSFT')]
X1 = stk_data.loc[:, ('Adj Close', ('GOOGL', 'IBM'))]
X2 = ccy_data
X3 = idx_data
print(X1)
print(X2)
print(X2)

Attributes   Adj Close            
Symbols          GOOGL         IBM
Date                              
2017-09-22   47.162998  109.384438
2017-09-25   46.714001  109.942169
2017-09-26   46.871498  110.462219
2017-09-27   47.994999  109.783897
2017-09-28   48.240501  109.783897
...                ...         ...
2022-09-15  102.910004  125.489998
2022-09-16  102.800003  127.269997
2022-09-19  103.070000  127.730003
2022-09-20  101.139999  126.300003
2022-09-21   99.279999  124.930000

[1258 rows x 2 columns]
            DEXJPUS  DEXUSUK
DATE                        
2017-09-25   111.64   1.3457
2017-09-26   112.16   1.3422
2017-09-27   112.76   1.3401
2017-09-28   112.66   1.3435
2017-09-29   112.64   1.3402
...             ...      ...
2022-09-12   142.41   1.1701
2022-09-13   144.30   1.1526
2022-09-14   142.93   1.1564
2022-09-15   143.57   1.1472
2022-09-16   143.04   1.1419

[1300 rows x 2 columns]
            DEXJPUS  DEXUSUK
DATE                        
2017-09-25   111.64   1.3

In [4]:
#Standardized data (X1, X2, X3) with kept index (date)
standard_scaler = preprocessing.StandardScaler()
x1 = standard_scaler.fit_transform(X1.values)
X1 = pd.DataFrame(data=x1,index=X1.index,columns=X1.columns)
X1

Attributes,Adj Close,Adj Close
Symbols,GOOGL,IBM
Date,Unnamed: 1_level_2,Unnamed: 2_level_2
2017-09-22,-1.143638,-0.560755
2017-09-25,-1.157853,-0.509278
2017-09-26,-1.152867,-0.461280
2017-09-27,-1.117297,-0.523886
2017-09-28,-1.109524,-0.523886
...,...,...
2022-09-15,0.621321,0.925725
2022-09-16,0.617838,1.090012
2022-09-19,0.626387,1.132469
2022-09-20,0.565282,1.000485


#Calculate ความแตกต่างของค่า ราคา 'Adj Close', 'MSFT’)ย้อนหลัง backHistory วัน

In [5]:

backHistory = [30, 45, 60, 90, 180, 240] #-> ทดลองหยิบ 3 ค่า 3 รูปแบบ เพื่อดูระยะเวลาการดูค่าข้อมูลย้อนหลงัหลายๆแบบและเปรียบเทียบ MSE
BH1, BH2, BH3 = backHistory[1], backHistory[3], backHistory[4]
return_period = 1
Y = base.shift(-return_period)
X4_BH1 = base.diff( BH1).shift( - BH1)
X4_BH2 = base.diff( BH2).shift( - BH2)
X4_BH3 = base.diff( BH3).shift( - BH3)

X4 = pd.concat([X4_BH1, X4_BH2, X4_BH3], axis=1)
X4.columns = ['MSFT_3DT', 'MSFT_6DT', 'MSFT_12DT']
X4 = pd.DataFrame(standard_scaler.fit_transform(X4.values), index = X4.index,columns=X4.columns)

In [6]:
# Forming Dataset
X = pd.concat([X1, X2, X3, X4], axis=1)
dataset = pd.concat([Y, X], axis=1)
dataset.dropna(inplace=True)
dataset.describe()


Unnamed: 0,"(Adj Close, MSFT)","(Adj Close, GOOGL)","(Adj Close, IBM)",DEXJPUS,DEXUSUK,SP500,DJIA,VIXCLS,MSFT_3DT,MSFT_6DT,MSFT_12DT
count,1060.0,1060.0,1060.0,1060.0,1060.0,1060.0,1060.0,1060.0,1060.0,1060.0,1060.0
mean,165.595168,-0.206168,-0.223745,109.250689,1.318665,3244.556321,27812.927142,19.6245,0.128459,0.153403,0.001555
std,73.141771,0.91913,0.872918,2.738197,0.053738,639.726308,3910.742829,8.980875,0.88643,0.870535,0.994364
min,68.838745,-1.157853,-3.286945,102.52,1.1492,2237.4,18591.93,9.14,-3.837785,-3.767676,-3.700736
25%,102.671837,-0.860913,-0.698382,107.3,1.283825,2761.265,25016.9675,13.74,-0.269064,-0.23321,-0.416738
50%,140.403389,-0.646407,-0.248734,109.345,1.31405,2975.975,26473.955,17.38,0.089913,0.087077,0.051284
75%,214.511066,0.138314,0.262102,111.2225,1.363175,3668.3275,30003.3225,22.525,0.531877,0.590334,0.626406
max,340.882812,2.107085,1.898889,115.34,1.4332,4796.56,36585.06,82.69,2.578861,2.210963,2.43157


In [7]:
# Assign X, Y (drop datetime index)
Y = pd.DataFrame(dataset[dataset.columns[0]])
X = pd.DataFrame(dataset[dataset.columns[1:]])
print(Y)
print(X)

             Adj Close
                  MSFT
2017-09-25   68.838745
2017-09-26   69.393127
2017-09-27   69.411934
2017-09-28   69.994507
2017-09-29   70.107277
...                ...
2021-12-27  339.034882
2021-12-28  339.730377
2021-12-29  337.117401
2021-12-30  334.136902
2022-01-03  326.874359

[1060 rows x 1 columns]
            (Adj Close, GOOGL)  (Adj Close, IBM)  DEXJPUS  DEXUSUK    SP500  \
2017-09-25           -1.157853         -0.509278   111.64   1.3457  2496.66   
2017-09-26           -1.152867         -0.461280   112.16   1.3422  2496.84   
2017-09-27           -1.117297         -0.523886   112.76   1.3401  2507.04   
2017-09-28           -1.109524         -0.523886   112.66   1.3435  2510.06   
2017-09-29           -1.095420         -0.564233   112.64   1.3402  2519.36   
...                        ...               ...      ...      ...      ...   
2021-12-27            2.045917          1.055486   114.85   1.3438  4791.19   
2021-12-28            2.007308          1.14

In [8]:
X.corr()

Unnamed: 0,"(Adj Close, GOOGL)","(Adj Close, IBM)",DEXJPUS,DEXUSUK,SP500,DJIA,VIXCLS,MSFT_3DT,MSFT_6DT,MSFT_12DT
"(Adj Close, GOOGL)",1.0,0.483811,0.089856,0.483306,0.982652,0.952017,0.047985,0.048202,-0.051809,-0.366191
"(Adj Close, IBM)",0.483811,1.0,0.2323,0.588254,0.503895,0.587237,-0.5327,0.004238,-0.040437,-0.371538
DEXJPUS,0.089856,0.2323,1.0,0.075791,0.005813,0.052438,-0.408304,-0.281791,-0.424017,-0.572584
DEXUSUK,0.483306,0.588254,0.075791,1.0,0.48615,0.528725,-0.304119,0.065906,0.029387,-0.197994
SP500,0.982652,0.503895,0.005813,0.48615,1.0,0.982796,0.003188,0.019159,-0.027135,-0.275541
DJIA,0.952017,0.587237,0.052438,0.528725,0.982796,1.0,-0.126048,-0.000189,-0.021588,-0.265506
VIXCLS,0.047985,-0.5327,-0.408304,-0.304119,0.003188,-0.126048,1.0,0.285457,0.339549,0.260343
MSFT_3DT,0.048202,0.004238,-0.281791,0.065906,0.019159,-0.000189,0.285457,1.0,0.650676,0.382723
MSFT_6DT,-0.051809,-0.040437,-0.424017,0.029387,-0.027135,-0.021588,0.339549,0.650676,1.0,0.667093
MSFT_12DT,-0.366191,-0.371538,-0.572584,-0.197994,-0.275541,-0.265506,0.260343,0.382723,0.667093,1.0


In [9]:
lower = pd.DataFrame(np.tril(X.corr(),-1),columns = X.columns)
to_drop = [column for column in lower.columns if any(lower[column] > 0.9)]
X.drop(columns=to_drop,inplace=True)
to_drop

[('Adj Close', 'GOOGL'), 'SP500']

In [10]:
X

Unnamed: 0,"(Adj Close, IBM)",DEXJPUS,DEXUSUK,DJIA,VIXCLS,MSFT_3DT,MSFT_6DT,MSFT_12DT
2017-09-25,-0.509278,111.64,1.3457,22296.09,10.21,0.237056,0.143192,-0.153080
2017-09-26,-0.461280,112.16,1.3422,22284.32,10.17,0.153310,-0.001935,-0.137317
2017-09-27,-0.523886,112.76,1.3401,22340.71,9.87,0.166523,0.103376,-0.189077
2017-09-28,-0.523886,112.66,1.3435,22381.20,9.55,0.170334,0.036575,-0.169437
2017-09-29,-0.564233,112.64,1.3402,22405.09,9.51,-0.036141,-0.163721,-0.186342
...,...,...,...,...,...,...,...,...
2021-12-27,1.055486,114.85,1.3438,36302.38,17.68,-2.796915,-3.188239,-3.498090
2021-12-28,1.145360,114.75,1.3432,36398.21,17.54,-2.973052,-3.245813,-3.661982
2021-12-29,1.209427,114.97,1.3475,36488.63,16.95,-3.360510,-3.684912,-3.700736
2021-12-30,1.259260,115.17,1.3500,36398.08,17.33,-3.837785,-3.379543,-3.631295


In [11]:
# Train / Test Preparation (try 2 Option)Option#1
from math import ceil, floor
test_size = floor(0.3 * len( X ))
train_size = floor(0.7 * len( X ))
print(f'test size = {test_size}\ntrain size = {train_size}')
X_train, X_test = X[0:train_size], X[train_size:len(X)]
Y_train, Y_test = Y[0:train_size], Y[train_size:len(X)]

# Option #2
# X_train, X_test, y_train, y_test = model_selection.train_test_split(X, Y, test_size=0.3, random_state=seed)

test size = 318
train size = 742


#Create Model List and Parameter Dictionary

In [12]:
# Create Model List
regression = { 'LR': LinearRegression(), 'SVR': SVR(), }
# Create Parameter Dictionary for Linear Regression
fit_intercept = [True, False]
normalize = [True, False]
params_LR = dict( fit_intercept = fit_intercept, normalize = normalize)

#Create Parameter Dictionary for SVR

In [13]:
kernel = ['linear', 'rbf', 'poly']
C_list = [10, 100]
ep_list = [0.1, 1, 5]
gamma = [0.01, 0.1]
degree = [2, 3]
params_SVR = dict( kernel = kernel, C = C_list, epsilon = ep_list, gamma = gamma, degree = degree )

# Cross Validation Model
# set k-fold crossvalidation with shuffle
num_fold = 3
seed = 500
kfold = KFold(n_splits=num_fold, shuffle = True, random_state=seed)
import warnings
warnings.filterwarnings('ignore')

#6.2 GridSearchCV() -> (a)

In [14]:
for EST in regression:
    model = regression[EST]
    if (EST == 'LR'):
        params = params_LR
    else:
        params = params_SVR

    grid = GridSearchCV( estimator=model, n_jobs = 1,
    verbose = 10,
    cv = kfold,
    scoring = 'neg_mean_squared_error',
    param_grid = params )
    grid_result = grid.fit(X_train, Y_train)

Fitting 3 folds for each of 4 candidates, totalling 12 fits
[CV 1/3; 1/4] START fit_intercept=True, normalize=True..........................
[CV 1/3; 1/4] END fit_intercept=True, normalize=True;, score=-294.575 total time=   0.0s
[CV 2/3; 1/4] START fit_intercept=True, normalize=True..........................
[CV 2/3; 1/4] END fit_intercept=True, normalize=True;, score=-240.997 total time=   0.0s
[CV 3/3; 1/4] START fit_intercept=True, normalize=True..........................
[CV 3/3; 1/4] END fit_intercept=True, normalize=True;, score=-259.753 total time=   0.0s
[CV 1/3; 2/4] START fit_intercept=True, normalize=False.........................
[CV 1/3; 2/4] END fit_intercept=True, normalize=False;, score=-294.575 total time=   0.0s
[CV 2/3; 2/4] START fit_intercept=True, normalize=False.........................
[CV 2/3; 2/4] END fit_intercept=True, normalize=False;, score=-240.997 total time=   0.0s
[CV 3/3; 2/4] START fit_intercept=True, normalize=False.........................
[CV 3/3

# Show Best Parameters for both models

In [None]:
print('Best params: ',grid_result.best_params_)
print('Best score: ', grid_result.best_score_)

# Show Score for each parameter combination for both model

In [None]:
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

# Display Mean, std, params

#6.2 Create Model List and Parameter Dictionary

In [None]:
# Create Model List
regression = { 'LR': LinearRegression(), 'SVR': SVR(), }

In [None]:
# Create Parameter Dictionary for Linear Regression
fit_intercept = [True, False]
normalize = [True, False]
params_LR = dict( fit_intercept = fit_intercept, normalize = normalize)

In [None]:
# Create Parameter Dictionary for SVR
kernel = ['linear', 'rbf', 'poly']
C_list = list(np.linspace(0.1, 150, 5, dtype = float))
ep_list = list(np.linspace(0.1, 1, 5, dtype = float))
gamma = list(np.linspace(0.01, 0.1, 5, dtype = float))
degree = [2, 3]
params_SVR = dict( kernel = kernel, C = C_list, epsilon = ep_list, gamma = gamma, degree = degree )

# Show scatter plot compare y_test vs each model prediction

#6.3 RandomizedSearchCV() -> (a)

In [None]:
for EST in regression:
    model = regression[EST]
    if (EST == 'LR'):
        params = params_LR
    else:
        params = params_SVR

    grid_rand = RandomizedSearchCV( estimator=model, n_jobs = 1,

    verbose = 10,
    cv = k,
    scoring = 'neg_mean_squared_error',
    param_distribution = params )

grid_rand_result = grid_rand.fit(X_train, Y_train)

In [None]:
# Show Best Parameters for both models
print('Best params: ',grid_rand_result.best_params_)
print('Best score: ', grid_rand_result.best_score_)

In [None]:
# Show Score for each parameter combination for both model
means = grid_rand_result.cv_results_['mean_test_score']
stds = grid_rand_result.cv_results_['std_test_score']
params = grid_rand_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

# Display Mean, std, params

# Show scatter plot compare y_test vs each model prediction