<a href="https://colab.research.google.com/github/nohwiin/ML/blob/master/KOSPI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Import**

In [None]:
pip install pandas-datareader

In [None]:
import pandas_datareader as pdr
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from datetime import datetime, date, timedelta
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM
from keras.models import Sequential
from sklearn.model_selection import train_test_split
from keras.callbacks import EarlyStopping, ModelCheckpoint
from google.colab import drive
from sklearn.preprocessing import MinMaxScaler
import copy
from pandas import Series

In [None]:
# Mount Google Drive at /gdrive (Colab only).
drive.mount('/gdrive')

# Switch the working directory to the checkpoint folder; model_path is the
# directory that later cells join checkpoint filenames onto.
%cd "/gdrive/My Drive/KOSPI/Model"
model_path = "/gdrive/My Drive/KOSPI/Model"

# **Make Dataset**

> Train : Val : Test = 6 : 2 : 2

In [None]:
# Date ranges: 2015-2018 feeds training/validation, 2019 is the held-out test year.
start_train, end_train = datetime(2015, 1, 1), datetime(2018, 12, 31)
start_test, end_test = datetime(2019, 1, 1), datetime(2019, 12, 31)

# Download the "^KS11" quotes from Yahoo Finance for each period.
df_train = pdr.get_data_yahoo("^KS11", start=start_train, end=end_train)
df_test = pdr.get_data_yahoo("^KS11", start=start_test, end=end_test)

In [None]:
def make_dataset(data, label, window_size=20):
    """Build sliding-window samples and their next-step labels.

    For every start row ``s`` in ``range(len(data) - window_size)`` the sample
    is ``data.iloc[s:s + window_size]`` and its label is
    ``label.iloc[s + window_size]``, i.e. the value immediately after the
    window.

    Args:
        data: Object supporting ``len`` and ``.iloc`` slicing (a DataFrame in
            this notebook) holding the input features.
        label: Object supporting ``.iloc`` indexing (a Series in this
            notebook) aligned row-for-row with ``data``.
        window_size: Number of consecutive rows per sample.

    Returns:
        A ``(features, labels)`` tuple of NumPy arrays. Both are empty when
        ``data`` has no more than ``window_size`` rows.
    """
    starts = range(len(data) - window_size)
    windows = [np.array(data.iloc[s:s + window_size]) for s in starts]
    targets = [np.array(label.iloc[s + window_size]) for s in starts]
    return np.array(windows), np.array(targets)

# **Window_size Selection**
---
다른 외부 정보를 사용하기에 앞서, Window_size를 결정하기 위해 KOSPI 자체의 정보만 사용해 학습을 진행

> [단기: 5, 10, 20][중기: 60][장기: 120, 240]

In [None]:
# Candidate look-back window lengths, in rows of daily data:
# short term (5, 10, 20), mid term (60), long term (120, 240).
window_size_list = [5, 10, 20, 60, 120, 240]

> 정규화: 가격지수를 반영하기 위해 Min-Max 방식이 아닌 Z-score 방식을 사용

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scale_cols = ["High", "Low", "Open", "Close", "Volume", "Adj Close"]

# Learn the z-score statistics (mean / std per column) from the training period only.
df_train_Normalized = pd.DataFrame(scaler.fit_transform(df_train[scale_cols]), columns=scale_cols)

# Apply the *training* statistics to the test period. Refitting the scaler on the
# test frame would put test inputs on a different scale from the one the model is
# trained on and would use statistics of the held-out period.
df_test_Normalized = pd.DataFrame(scaler.transform(df_test[scale_cols]), columns=scale_cols)

> 10일 채택

In [None]:
trained_features = ["High", "Low", "Open", "Volume", "Adj Close"]

# Train one small LSTM per candidate window size, print its best validation loss
# and plot its predictions against the normalized test-period Close.
for window_size in window_size_list:
  # Each sample is `window_size` consecutive feature rows; the label is the
  # normalized Close of the row right after the window.
  train_feature, train_label = make_dataset(df_train_Normalized[trained_features], df_train_Normalized["Close"], window_size)
  test_feature, test_label = make_dataset(df_test_Normalized[trained_features], df_test_Normalized["Close"], window_size)

  # Chronological train/validation split. Consecutive windows overlap in all but
  # one row, so the default shuffled split would place near-duplicates of the
  # validation samples in the training set and make val_loss overly optimistic.
  x_train, x_valid, y_train, y_valid = train_test_split(train_feature, train_label, test_size=0.2, shuffle=False)

  model = Sequential()
  model.add(LSTM(16, input_shape=(train_feature.shape[1], train_feature.shape[2]), activation='relu', return_sequences=False))
  model.add(Dense(1))

  model.compile(loss='mean_squared_error', optimizer='adam')
  early_stop = EarlyStopping(monitor='val_loss', patience=5)
  # One checkpoint file per window size, holding the weights with the lowest val_loss.
  filename = os.path.join(model_path, 'tmp_checkpoint_'+str(window_size)+'.h5')
  checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='auto')

  history = model.fit(x_train, y_train, epochs=200, batch_size=16, validation_data=(x_valid, y_valid), callbacks=[early_stop, checkpoint])
  print(window_size, min(history.history['val_loss']))

  # Restore the best checkpoint (early stopping leaves the last epoch's weights in memory).
  model.load_weights(filename)

  # Predict on the test-period windows.
  pred = model.predict(test_feature)

  plt.figure(figsize=(12, 9))
  plt.plot(test_label, label='actual')
  plt.plot(pred, label='prediction'+str(window_size))
  plt.legend()
  plt.title(window_size, loc="center")

plt.show()

# **Parameter Selection**
---
변수선택법을 적용하기 위해 SHAP를 사용하고자 했지만, 버전 문제인지 LSTM에는 적용되지 않음.  
직접 학습해 가며 전진선택법으로 파라미터를 추가.  
무역 비중이 큰 한국의 특성상, 파라미터로는 주요 원자재 선물과 각국의 주요 경제지수를 활용.


In [None]:
from sklearn.preprocessing import StandardScaler

model_path = "/gdrive/My Drive/KOSPI/Model/GSPC"
# Make sure the checkpoint directory exists before ModelCheckpoint writes into it.
os.makedirs(model_path, exist_ok=True)
window_size = 10

trained_features = ["High", "Low", "Open", "Volume", "Adj Close"]

tmp_df_train = df_train.copy()
tmp_df_test = df_test.copy()

scaler = StandardScaler()
scale_cols = tmp_df_train.columns

# Fit the z-score statistics on the training period only ...
df_train_Normalized = pd.DataFrame(scaler.fit_transform(tmp_df_train[scale_cols]), columns=scale_cols)

# ... and reuse them for the test period so both frames share one scale.
df_test_Normalized = pd.DataFrame(scaler.transform(tmp_df_test[scale_cols]), columns=scale_cols)

# Sliding windows of KOSPI-only features; the label is the next row's normalized Close.
train_feature, train_label = make_dataset(df_train_Normalized[trained_features], df_train_Normalized["Close"], window_size)
test_feature, test_label = make_dataset(df_test_Normalized[trained_features], df_test_Normalized["Close"], window_size)

# Chronological train/validation split: overlapping windows must not be shuffled
# across the two sets, otherwise the validation loss is inflated by leakage.
x_train, x_valid, y_train, y_valid = train_test_split(train_feature, train_label, test_size=0.2, shuffle=False)

model = Sequential()
model.add(LSTM(16, input_shape=(train_feature.shape[1], train_feature.shape[2]), activation='relu', return_sequences=False))
model.add(Dense(1))

model.compile(loss='mean_squared_error', optimizer='adam')
early_stop = EarlyStopping(monitor='val_loss', patience=5)
filename = os.path.join(model_path, 'tmp_checkpoint_Original.h5')
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='auto')

history = model.fit(x_train, y_train, verbose=0, epochs=200, batch_size=16, validation_data=(x_valid, y_valid), callbacks=[early_stop, checkpoint])

# Baseline score (best validation loss without any external feature); the
# feature-selection cell compares every candidate against this value.
Original = min(history.history['val_loss'])
print(Original)

> KE=F를 포함했을 때 가장 효과가 좋았음

In [None]:
left_features = ["^TWII", "^IPSA", "^TA125.TA", "SI=F", "HG=F", "CL=F", "CC=F", "^GSPC", "^IXIC", "^NYA", "^XAX", "^VIX", "^FCHI", "^N100", "^BFX", "^N225", "000001.SS", "399001.SZ", "^AXJO", "^STOXX50E"]
Selected_Futures = ["KE=F"]

# best_score_history[0] is the baseline; one entry per candidate is appended below.
best_score_history = [Original]
# Remember the baseline as the y-axis limit so bars below it are the improvements.
y_range = Original

# One forward-selection step: retrain with the already selected features plus
# one candidate at a time and record the best validation loss of each run.
for feature in left_features:
  compared_features = Selected_Futures + [feature]

  # Attach the Close of every compared symbol to the KOSPI frame. Left join keeps
  # KOSPI trading days; gaps from foreign holidays are filled forward, and any
  # leading gap is filled backward.
  tmp_df_train = df_train.merge(pdr.get_data_yahoo(compared_features, start_train, end_train)["Close"], left_on='Date', right_on='Date', how='left')
  tmp_df_train = tmp_df_train.ffill().bfill()

  tmp_df_test = df_test.merge(pdr.get_data_yahoo(compared_features, start_test, end_test)["Close"], left_on='Date', right_on='Date', how='left')
  tmp_df_test = tmp_df_test.ffill().bfill()

  scale_cols = tmp_df_train.columns

  # Fit the scaler on the training period and reuse its statistics for the test period.
  df_train_Normalized = pd.DataFrame(scaler.fit_transform(tmp_df_train[scale_cols]), columns=scale_cols)
  df_test_Normalized = pd.DataFrame(scaler.transform(tmp_df_test[scale_cols]), columns=scale_cols)

  # Sliding windows over the KOSPI features plus the compared external features.
  train_feature, train_label = make_dataset(df_train_Normalized[trained_features + compared_features], df_train_Normalized["Close"], window_size)
  test_feature, test_label = make_dataset(df_test_Normalized[trained_features + compared_features], df_test_Normalized["Close"], window_size)

  # Chronological split: shuffling overlapping windows would leak validation
  # samples into training and make candidates look better than they are.
  x_train, x_valid, y_train, y_valid = train_test_split(train_feature, train_label, test_size=0.2, shuffle=False)

  model = Sequential()
  model.add(LSTM(16, input_shape=(train_feature.shape[1], train_feature.shape[2]), activation='relu', return_sequences=False))
  model.add(Dense(1))

  model.compile(loss='mean_squared_error', optimizer='adam')
  early_stop = EarlyStopping(monitor='val_loss', patience=5)
  filename = os.path.join(model_path, 'tmp_checkpoint_'+feature+'.h5')
  checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='auto')

  history = model.fit(x_train, y_train, verbose=0, epochs=200, batch_size=16, validation_data=(x_valid, y_valid), callbacks=[early_stop, checkpoint])
  best_val_loss = min(history.history['val_loss'])
  best_score_history.append(best_val_loss)

  # Track the lowest validation loss seen so far (baseline included).
  Original = min(Original, best_val_loss)

# Bar chart: baseline first, then one bar per candidate feature.
x = np.arange(len(left_features) + 1)
features = ['Original'] + left_features

plt.figure(figsize=(50, 9))
plt.bar(x, best_score_history)
plt.xticks(x, features)
plt.ylim(0, y_range)
plt.show()
print(Original)

In [None]:
compared_features = list(Selected_Futures)

# Rebuild the training-period frame with the selected features so the scaler can
# be fitted on exactly the columns (and period) the checkpoint was trained with.
train_frame = df_train.merge(pdr.get_data_yahoo(compared_features, start_train, end_train)["Close"], left_on='Date', right_on='Date', how='left')
train_frame = train_frame.ffill().bfill()

tmp_df_test = df_test.merge(pdr.get_data_yahoo(compared_features, start_test, end_test)["Close"], left_on='Date', right_on='Date', how='left')
tmp_df_test = tmp_df_test.ffill().bfill()

scale_cols = train_frame.columns

# Normalize the test period with the training statistics; refitting on the test
# frame would feed the model inputs on a different scale from its training data.
scaler.fit(train_frame[scale_cols])
df_test_Normalized = pd.DataFrame(scaler.transform(tmp_df_test[scale_cols]), columns=scale_cols)

test_feature, test_label = make_dataset(df_test_Normalized[trained_features + compared_features], df_test_Normalized["Close"], window_size)

# Recreate the architecture used during training, then load the saved weights.
model = Sequential()
model.add(LSTM(16, input_shape=(test_feature.shape[1], test_feature.shape[2]), activation='relu', return_sequences=False))
model.add(Dense(1))

model.load_weights(os.path.join(model_path, 'tmp_checkpoint_KE=F.h5'))

# Inference only: no compile step or training callbacks are needed here, and the
# previous callback setup depended on the stale loop variable `feature`.
pred = model.predict(test_feature)

plt.figure(figsize=(12, 9))
plt.plot(test_label, label='actual')
plt.plot(pred, label='prediction')
plt.legend()
plt.title(window_size, loc="center")

# **Final Test**

---



> 지난 10일간의 데이터를 기반으로 다음날의 종가 예상

In [None]:
def make_dataset(data, window_size=20):
    """Build sliding-window feature samples without labels.

    NOTE: this definition replaces the earlier three-argument
    ``make_dataset(data, label, window_size)`` once this cell has run, so the
    training cells above must be re-run only after re-running that earlier
    definition.

    Window start positions run over ``range(len(data) - window_size)``, so the
    last window ends one row *before* the final row of ``data``.

    Args:
        data: Object supporting ``len`` and ``.iloc`` slicing (a DataFrame in
            this notebook).
        window_size: Number of consecutive rows per sample.

    Returns:
        NumPy array of windows; empty when ``data`` has no more than
        ``window_size`` rows.
    """
    return np.array([
        np.array(data.iloc[start:start + window_size])
        for start in range(len(data) - window_size)
    ])

In [None]:
window_size = 10

# Roughly one month of calendar days, which is enough to cover window_size + 1 rows.
start_test = date.today() - timedelta(days=30)
end_test = date.today()

compared_features = ["KE=F"]

# Latest KOSPI quotes plus the Close of the selected external feature.
tmp_df_test = pdr.get_data_yahoo("^KS11", start_test, end_test)
tmp_df_test = tmp_df_test.merge(pdr.get_data_yahoo(compared_features, start_test, end_test)["Close"], left_on='Date', right_on='Date', how='left')
tmp_df_test = tmp_df_test.ffill().bfill()

# Keep window_size + 1 rows for the plot: D-10 ... D-1 and D-Day.
tmp_df_test = tmp_df_test.iloc[-(window_size + 1):]

# Normalize with statistics of the training period (same columns the checkpoint
# was trained on). Fitting the scaler on these few recent rows would put the
# inputs on a scale the model has never seen.
train_frame = df_train.merge(pdr.get_data_yahoo(compared_features, start_train, end_train)["Close"], left_on='Date', right_on='Date', how='left')
train_frame = train_frame.ffill().bfill()
scale_cols = train_frame.columns
scaler.fit(train_frame[scale_cols])

df_test_Normalized = pd.DataFrame(scaler.transform(tmp_df_test[scale_cols]), columns=scale_cols)

# The model maps `window_size` rows to the Close of the *following* row, so the
# input for D+1 must be the last `window_size` rows, D-Day included. (Using the
# first 10 of the 11 rows would only re-predict the already known D-Day value.)
latest_window = df_test_Normalized[trained_features + compared_features].iloc[-window_size:]
test_feature = np.array(latest_window)[np.newaxis, ...]

# Recreate the architecture used during training, then load the saved weights.
model = Sequential()
model.add(LSTM(16, input_shape=(test_feature.shape[1], test_feature.shape[2]), activation='relu', return_sequences=False))
model.add(Dense(1))

model.load_weights(os.path.join(model_path, 'tmp_checkpoint_KE=F.h5'))

# Inference only: no compile step or training callbacks are needed.
# predict() returns one row per window; there is exactly one window here.
pred = model.predict(test_feature)[-1]
# Place the forecast one position after D-Day on the x-axis.
pred_series = Series(pred, index=[window_size + 1])

tick_labels = ['D-' + str(k) for k in range(window_size, 0, -1)] + ['D-Day', 'D+1']

plt.figure(figsize=(12, 9))
plt.xticks(np.arange(0, window_size + 2), labels=tick_labels)
# Series.append was removed in pandas 2.0; pd.concat works on old and new versions.
plt.plot(pd.concat([df_test_Normalized["Close"], pred_series]), label='prediction')
plt.legend()