In [233]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn import preprocessing
from sklearn import model_selection
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split,GridSearchCV,RandomizedSearchCV
from sklearn import metrics
import pandas_datareader.data as web 
from time import time
from sklearn.cluster import KMeans

# Get Stock Data

In [234]:
stk_tickers = ['MSFT', 'IBM', 'GOOGL']
ccy_tickers = ['DEXJPUS', 'DEXUSUK']
idx_tickers = ['SP500', 'DJIA', 'VIXCLS']
stk_data = web.DataReader(stk_tickers, 'yahoo')
ccy_data = web.DataReader(ccy_tickers, 'fred')
idx_data = web.DataReader(idx_tickers, 'fred')

In [235]:
#Select columns
base = stk_data.loc[:, ('Adj Close', 'MSFT')]
X1 = stk_data.loc[:, ('Adj Close', ('GOOGL', 'IBM'))]
X2 = ccy_data
X3 = idx_data
print(X1)
print(X2)
print(X2)

Attributes   Adj Close            
Symbols          GOOGL         IBM
Date                              
2017-09-22   47.162998  109.384445
2017-09-25   46.714001  109.942177
2017-09-26   46.871498  110.462219
2017-09-27   47.994999  109.783913
2017-09-28   48.240501  109.783913
...                ...         ...
2022-09-16  102.800003  127.269997
2022-09-19  103.070000  127.730003
2022-09-20  101.139999  126.300003
2022-09-21   99.279999  124.930000
2022-09-22  100.489998  125.730003

[1259 rows x 2 columns]
            DEXJPUS  DEXUSUK
DATE                        
2017-09-25   111.64   1.3457
2017-09-26   112.16   1.3422
2017-09-27   112.76   1.3401
2017-09-28   112.66   1.3435
2017-09-29   112.64   1.3402
...             ...      ...
2022-09-12   142.41   1.1701
2022-09-13   144.30   1.1526
2022-09-14   142.93   1.1564
2022-09-15   143.57   1.1472
2022-09-16   143.04   1.1419

[1300 rows x 2 columns]
            DEXJPUS  DEXUSUK
DATE                        
2017-09-25   111.64   1.3

In [236]:
#Standardized data (X1, X2, X3) with kept index (date)
standard_scaler = preprocessing.StandardScaler()
sX1 = standard_scaler.fit_transform(X1.values)
sX2 = standard_scaler.fit_transform(X2.values)
sX3 = standard_scaler.fit_transform(X3.values)
X1 = pd.DataFrame(index = X1.index,data = sX1,columns=X1.columns)
X2 = pd.DataFrame(index = X2.index,data = sX2,columns=X2.columns)
X3 = pd.DataFrame(index = X3.index,data = sX3,columns=X3.columns)

## Calculate ความแตกต่างของค่า ราคา 'Adj Close', 'MSFT’)ย้อนหลัง backHistory วัน

In [237]:

backHistory = [30, 45, 60, 90, 180, 240] #-> ทดลองหยิบ 3 ค่า 3 รูปแบบ เพื่อดูระยะเวลาการดูค่าข้อมูลย้อนหลงัหลายๆแบบและเปรียบเทียบ MSE
BH1, BH2, BH3 = backHistory[1], backHistory[3], backHistory[4]
return_period = 1
Y = base.shift(-return_period)
X4_BH1 = base.diff( BH1).shift( - BH1)
X4_BH2 = base.diff( BH2).shift( - BH2)
X4_BH3 = base.diff( BH3).shift( - BH3)

X4 = pd.concat([X4_BH1, X4_BH2, X4_BH3], axis=1)
X4.columns = ['MSFT_3DT', 'MSFT_6DT', 'MSFT_12DT']
X4 = pd.DataFrame(standard_scaler.fit_transform(X4.values), index = X4.index,columns=X4.columns)

In [238]:
# Forming Dataset
X = pd.concat([X1, X2, X3, X4], axis=1)
dataset = pd.concat([Y, X], axis=1)
dataset.dropna(inplace=True)
dataset.describe()

Unnamed: 0,"(Adj Close, MSFT)","(Adj Close, GOOGL)","(Adj Close, IBM)",DEXJPUS,DEXUSUK,SP500,DJIA,VIXCLS,MSFT_3DT,MSFT_6DT,MSFT_12DT
count,1061.0,1061.0,1061.0,1061.0,1061.0,1061.0,1061.0,1061.0,1061.0,1061.0,1061.0
mean,165.735348,-0.20464,-0.222764,-0.338575,0.130736,-0.197839,-0.184988,-0.101468,0.126858,0.151234,0.001493
std,73.249715,0.921302,0.874385,0.35705,0.913578,0.931764,0.9504,1.034052,0.890223,0.87628,0.994518
min,68.838737,-1.15861,-3.287831,-1.214892,-2.751596,-1.663539,-2.42345,-1.308869,-3.834154,-3.765301,-3.679393
25%,102.69104,-0.861349,-0.698637,-0.593148,-0.461011,-0.901739,-0.865044,-0.779002,-0.269338,-0.232158,-0.412228
50%,140.447128,-0.646918,-0.248777,-0.3265,0.052543,-0.590221,-0.510803,-0.362019,0.090991,0.087606,0.053752
75%,214.606918,0.14001,0.261673,-0.081964,0.885793,0.416846,0.34716,0.232355,0.532588,0.590275,0.626092
max,340.882782,2.107241,1.898212,0.554088,2.07785,2.055378,1.992546,7.163257,2.578588,2.211375,2.422772


In [239]:
# Assign X, Y (drop datetime index)
Y = pd.DataFrame(dataset[dataset.columns[0]])
X = pd.DataFrame(dataset[dataset.columns[1:]])
print(Y)
print(X)

             Adj Close
                  MSFT
2017-09-25   68.838737
2017-09-26   69.393120
2017-09-27   69.411942
2017-09-28   69.994514
2017-09-29   70.107254
...                ...
2021-12-28  339.730377
2021-12-29  337.117432
2021-12-30  334.136932
2022-01-03  326.874359
2022-01-04  314.326324

[1061 rows x 1 columns]
            (Adj Close, GOOGL)  (Adj Close, IBM)   DEXJPUS   DEXUSUK  \
2017-09-25           -1.158610         -0.510051 -0.028635  0.589904   
2017-09-26           -1.153622         -0.462052  0.039003  0.530386   
2017-09-27           -1.118042         -0.524659  0.117046  0.494676   
2017-09-28           -1.110267         -0.524659  0.104039  0.552493   
2017-09-29           -1.096159         -0.565007  0.101437  0.496376   
...                        ...               ...       ...       ...   
2021-12-28            2.007436          1.144654  0.375889  0.547391   
2021-12-29            2.006423          1.208725  0.404505  0.620513   
2021-12-30            1.9920

In [240]:
X.corr()

Unnamed: 0,"(Adj Close, GOOGL)","(Adj Close, IBM)",DEXJPUS,DEXUSUK,SP500,DJIA,VIXCLS,MSFT_3DT,MSFT_6DT,MSFT_12DT
"(Adj Close, GOOGL)",1.0,0.486192,0.094845,0.48343,0.98274,0.952258,0.047198,0.04046,-0.059934,-0.370735
"(Adj Close, IBM)",0.486192,1.0,0.236112,0.588217,0.506262,0.589115,-0.532156,-0.002491,-0.047912,-0.375523
DEXJPUS,0.094845,0.236112,1.0,0.07712,0.011478,0.057563,-0.407793,-0.287386,-0.428954,-0.575805
DEXUSUK,0.48343,0.588217,0.07712,1.0,0.486225,0.52874,-0.304232,0.063434,0.026694,-0.199003
SP500,0.98274,0.506262,0.011478,0.486225,1.0,0.982879,0.002491,0.011353,-0.035838,-0.281055
DJIA,0.952258,0.589115,0.057563,0.52874,0.982879,1.0,-0.126383,-0.007451,-0.029897,-0.27078
VIXCLS,0.047198,-0.532156,-0.407793,-0.304232,0.002491,-0.126383,1.0,0.284879,0.338162,0.259866
MSFT_3DT,0.04046,-0.002491,-0.287386,0.063434,0.011353,-0.007451,0.284879,1.0,0.654932,0.389416
MSFT_6DT,-0.059934,-0.047912,-0.428954,0.026694,-0.035838,-0.029897,0.338162,0.654932,1.0,0.671242
MSFT_12DT,-0.370735,-0.375523,-0.575805,-0.199003,-0.281055,-0.27078,0.259866,0.389416,0.671242,1.0


In [241]:
lower = pd.DataFrame(np.tril(X.corr(),-1),columns = X.columns)
to_drop = [column for column in lower.columns if any(lower[column] > 0.9)]
X.drop(columns=to_drop,inplace=True)
to_drop

[('Adj Close', 'GOOGL'), 'SP500']

In [242]:
X

Unnamed: 0,"(Adj Close, IBM)",DEXJPUS,DEXUSUK,DJIA,VIXCLS,MSFT_3DT,MSFT_6DT,MSFT_12DT
2017-09-25,-0.510051,-0.028635,0.589904,-1.525064,-1.185618,0.238209,0.144284,-0.149173
2017-09-26,-0.462052,0.039003,0.530386,-1.527919,-1.190225,0.154514,-0.000797,-0.133489
2017-09-27,-0.524659,0.117046,0.494676,-1.514242,-1.224782,0.167719,0.104480,-0.184994
2017-09-28,-0.524659,0.104039,0.552493,-1.504422,-1.261642,0.171527,0.037701,-0.165449
2017-09-29,-0.565007,0.101437,0.496376,-1.498628,-1.266250,-0.034823,-0.162530,-0.182272
...,...,...,...,...,...,...,...,...
2021-12-28,1.144654,0.375889,0.547391,1.895183,-0.341285,-2.969945,-3.243612,-3.640829
2021-12-29,1.208725,0.404505,0.620513,1.917113,-0.409246,-3.357167,-3.682566,-3.679393
2021-12-30,1.258558,0.430520,0.663026,1.895151,-0.365474,-3.834154,-3.377298,-3.610294
2022-01-03,1.448097,0.443527,0.610310,1.940500,-0.449562,-3.023422,-3.765301,-3.640026


# Train / Test Preparation (try 2 Option)    

In [243]:

from math import ceil, floor
test_size = floor(0.3 * len( X ))
train_size = floor(0.7 * len( X ))
print(f'test size = {test_size}\ntrain size = {train_size}')
X_train, X_test = X[0:train_size], X[train_size:len(X)]
Y_train, Y_test = Y[0:train_size], Y[train_size:len(X)]
# Option #2
# X_train, X_test, y_train, y_test = model_selection.train_test_split(X, Y, test_size=0.3, random_state=seed)

test size = 318
train size = 742


# 6.2 GridSearchCV()

In [244]:
# Create Model List
regression = { 'LR': LinearRegression(), 'SVR': SVR(), }
# Create Parameter Dictionary for Linear Regression
fit_intercept = [True, False]
normalize = [True, False]
params_LR = dict( fit_intercept = fit_intercept, normalize = normalize)
print(params_LR)

{'fit_intercept': [True, False], 'normalize': [True, False]}


In [245]:
#Create Parameter Dictionary for SVR
kernel = ['linear', 'rbf', 'poly']
C_list = [10, 100]
ep_list = [0.1, 1, 5]
gamma = [0.01, 0.1]
degree = [2, 3]       #3x2x3x2x2 = 72
params_SVR = dict( kernel = kernel, C = C_list, epsilon = ep_list, gamma = gamma, degree = degree )

In [246]:
for EST in regression:
    model = regression[EST]
    if (EST == 'LR'):
        params = params_LR
    else:
        params = params_SVR

    grid = GridSearchCV( 
    estimator=model,                        #model
    n_jobs = 8,                             #thread ที่ใช้               
    verbose = 10,
    cv = 3,                                 #kfold
    scoring = 'neg_mean_squared_error',     #mse
    param_grid = params)                    #parameter ที่ใช้
    grid_result = grid.fit(X_train, Y_train)

Fitting 3 folds for each of 4 candidates, totalling 12 fits
Fitting 3 folds for each of 72 candidates, totalling 216 fits


In [247]:
# Show Best Parameters for both models
print('Best params: ',grid_result.best_params_)
print('Best score: ', grid_result.best_score_)

Best params:  {'C': 100, 'degree': 2, 'epsilon': 0.1, 'gamma': 0.01, 'kernel': 'linear'}
Best score:  -1208.8689962960705


## Show Score for each parameter combination for both model

In [248]:
means   = grid_result.cv_results_['mean_test_score']
stds    = grid_result.cv_results_['std_test_score']
params  = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

-1225.817949 (658.480453) with: {'C': 10, 'degree': 2, 'epsilon': 0.1, 'gamma': 0.01, 'kernel': 'linear'}
-1678.130501 (1266.647329) with: {'C': 10, 'degree': 2, 'epsilon': 0.1, 'gamma': 0.01, 'kernel': 'rbf'}
-2594.322478 (2144.303949) with: {'C': 10, 'degree': 2, 'epsilon': 0.1, 'gamma': 0.01, 'kernel': 'poly'}
-1225.817949 (658.480453) with: {'C': 10, 'degree': 2, 'epsilon': 0.1, 'gamma': 0.1, 'kernel': 'linear'}
-1697.742504 (1135.229966) with: {'C': 10, 'degree': 2, 'epsilon': 0.1, 'gamma': 0.1, 'kernel': 'rbf'}
-2458.360483 (1790.037870) with: {'C': 10, 'degree': 2, 'epsilon': 0.1, 'gamma': 0.1, 'kernel': 'poly'}
-1231.369831 (646.473911) with: {'C': 10, 'degree': 2, 'epsilon': 1, 'gamma': 0.01, 'kernel': 'linear'}
-1662.000794 (1242.012497) with: {'C': 10, 'degree': 2, 'epsilon': 1, 'gamma': 0.01, 'kernel': 'rbf'}
-2602.479674 (2145.296073) with: {'C': 10, 'degree': 2, 'epsilon': 1, 'gamma': 0.01, 'kernel': 'poly'}
-1231.369831 (646.473911) with: {'C': 10, 'degree': 2, 'epsilon'

## Display Mean, std, params

In [249]:
#barn_clusters=np.unique(Y_test_op1)
kmeans = KMeans(n_clusters= n_clusters.size, random_state=0)
clusters_test = kmeans.fit_predict(X_test_op1)
### y_train
n_clusters=np.unique(Y_train_op1)
kmeans = KMeans(n_clusters= n_clusters.size, random_state=0)
clusters_train = kmeans.fit_predict(X_train_op1)
### LM_pred
Model_LM = LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
LM_pred = Model_LM.fit(X_train_op1, Y_train_op1).predict(X_test_op1)
n_clusters_LM = np.unique(LM_pred)
kmeans_LM = KMeans(n_clusters=n_clusters_LM.size,random_state=0)
clusters_LM = kmeans_LM.fit_predict(X_test_op1)
plt.scatter(np.arange(len(LM_pred)),LM_pred,c=clusters_LM, edgecolors='r',alpha=0.75,s=3)
plt.scatter(np.arange(len(Y_train_op1)),Y_train_op1,c=clusters_train, edgecolors='b',alpha=0.75,s=2)
plt.scatter(np.arange(len(Y_test_op1)),Y_test_op1,c=clusters_test, edgecolors='m',alpha=0.75,s=1)
plt.show()

NameError: name 'n_clusters' is not defined

# 6.3 RandomizedSearchCV()

In [None]:
# Create Model List
regression = { 'LR': LinearRegression(), 'SVR': SVR()}

In [None]:
# Create Parameter Dictionary for Linear Regression
fit_intercept = [True, False]
normalize = [True, False]
params_LR = dict( fit_intercept = fit_intercept, normalize = normalize)

In [None]:
# Create Parameter Dictionary for SVR
kernel = ['linear', 'rbf', 'poly']
C_list = list(np.linspace(0.1, 150, 5, dtype = float))
ep_list = list(np.linspace(0.1, 1, 5, dtype = float))
gamma = list(np.linspace(0.01, 0.1, 5, dtype = float))
degree = [2, 3]
params_SVR = dict( kernel = kernel, C = C_list, epsilon = ep_list, gamma = gamma, degree = degree )

## Show scatter plot compare y_test vs each model prediction

In [None]:
for EST in regression:
    model = regression[EST]
    if (EST == 'LR'):
        params = params_LR
    else:
        params = params_SVR

    grid_rand = RandomizedSearchCV( 
                                    estimator=model, 
                                    n_jobs = 8,
                                    verbose = 10,
                                    cv = 5,
                                    scoring = 'neg_mean_squared_error',
                                    param_distributions = params )
    grid_rand_result = grid_rand.fit(X_train, Y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [None]:
# Show Best Parameters for both models
print('Best params: ',grid_rand_result.best_params_)
print('Best score: ', grid_rand_result.best_score_)

Best params:  {'kernel': 'rbf', 'gamma': 0.0325, 'epsilon': 1.0, 'degree': 3, 'C': 112.525}
Best score:  -894.7746876984313


In [None]:
# Show Score for each parameter combination for both model
means = grid_rand_result.cv_results_['mean_test_score']
stds = grid_rand_result.cv_results_['std_test_score']
params = grid_rand_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

-1245.777634 (1412.131269) with: {'kernel': 'linear', 'gamma': 0.01, 'epsilon': 1.0, 'degree': 2, 'C': 75.05}
-2388.738798 (2322.162828) with: {'kernel': 'rbf', 'gamma': 0.01, 'epsilon': 1.0, 'degree': 3, 'C': 0.1}
-2251.145525 (2302.916618) with: {'kernel': 'poly', 'gamma': 0.0325, 'epsilon': 0.1, 'degree': 3, 'C': 37.575}
-1236.433812 (1410.460413) with: {'kernel': 'linear', 'gamma': 0.0775, 'epsilon': 0.775, 'degree': 3, 'C': 150.0}
-1766.108337 (2534.100239) with: {'kernel': 'poly', 'gamma': 0.05500000000000001, 'epsilon': 0.325, 'degree': 3, 'C': 112.525}
-1195.829091 (1362.953497) with: {'kernel': 'linear', 'gamma': 0.0775, 'epsilon': 0.1, 'degree': 2, 'C': 75.05}
-1219.004043 (1385.493243) with: {'kernel': 'linear', 'gamma': 0.1, 'epsilon': 0.55, 'degree': 3, 'C': 150.0}
-894.774688 (1137.036632) with: {'kernel': 'rbf', 'gamma': 0.0325, 'epsilon': 1.0, 'degree': 3, 'C': 112.525}
-2241.632136 (2489.361174) with: {'kernel': 'poly', 'gamma': 0.01, 'epsilon': 0.1, 'degree': 2, 'C': 

## Display Mean, std, params

## Show scatter plot compare y_test vs each model prediction