# Regression with news (only one stock included)

In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import sys
sys.version

'3.5.2 (default, Nov 23 2017, 16:37:01) \n[GCC 5.4.0 20160609]'

In [2]:
import pickle
import pandas as pd
import os
import copy
import numpy as np

from pythainlp.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error, mean_absolute_error, accuracy_score
from sklearn import preprocessing
from sklearn import linear_model, tree, ensemble
from sklearn import svm

from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

import plotly.graph_objs as go
from datetime import datetime, timedelta
from tqdm import tqdm_notebook
import matplotlib.pyplot as plt
% matplotlib inline

target_stocks = ['BANPU','IRPC','PTT','BBL','KBANK','SCB','AOT','THAI','CPF','MINT',
                 'TU','SCC','CPN','CK','CPALL','HMPRO','BDMS','BH','ADVANC','JAS','TRUE']

df_price = pd.read_csv('merged_2013_2018.csv')
df_price['Date'] = pd.to_datetime(df_price['Date'], format='%Y-%m-%d')
df_price = df_price.loc[df_price['Ticker'].isin(target_stocks)]
df_price['Date'] = df_price['Date'].dt.date
df_price = df_price.set_index('Date')
df_price.tail(3)
len(df_price)

df_kaohoon = pd.read_csv('data/kaohoon.csv')
df_kaohoon['Date'] = pd.to_datetime(df_kaohoon['Date'], format='%Y-%m-%d')
df_kaohoon = df_kaohoon.set_index('Date')
df_kaohoon = df_kaohoon[:'2018-2-8']
df_kaohoon.index = df_kaohoon.index.date
df_kaohoon.tail(3)
len(df_kaohoon)

df_moneych = pd.read_csv('data/moneychanel.csv')
df_moneych['Date'] = pd.to_datetime(df_moneych['Date'], format='%Y-%m-%d')
df_moneych = df_moneych.set_index('Date')
df_moneych = df_moneych[:'2018-2-8']
df_moneych.index = df_moneych.index.date
df_moneych.tail(3)
len(df_moneych)

df_news = pd.concat([df_moneych, df_kaohoon])
'Total:', len(df_news.index)

Unnamed: 0_level_0,Ticker,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-02-16,THAI,15.6,15.9,15.6,15.6,2907700
2018-02-16,TRUE,6.4,6.4,6.3,6.3,35851300
2018-02-16,TU,20.0,20.5,20.0,20.1,7299300


26331

Unnamed: 0,Ticker,Text
2018-02-01,KBANK,สำนักงาน คณะกรรมการ กำกับ หลักทรัพย์ และ ตลาดห...
2018-02-05,CPF,บริษัท เจริญ โภคภัณฑ์ อาหาร จำกัด มหาชน หรือ จ...
2018-02-06,KBANK,นาย ศีล วัต สัน ติวิสัฎฐ์ นั่ง ขวา รอง กรรมการ...


2162

Unnamed: 0,Ticker,Text
2018-01-24,SCC,แจ้ง กำไร ออกมา ดีกว่า คาด ไตรมาส จำนวน หมื่น ...
2018-02-01,PTT,ปตท ตอบรับ นโยบาย สังคม ไร้ เงินสด ตอบสนอง ไลฟ...
2018-02-08,PTT,ตลาดหุ้น ไทย เปิด บวก แรง ซื้อ หุ้น นำ ตลาด ผส...


693

('Total:', 2855)

# Lag & Horizon Construction

In [3]:
N_lags = 3
N_horizon = 1

df_train = []
df_test = []
for stock in tqdm_notebook(target_stocks):
    news_stocks = []
    df_stock = df_news.loc[df_news['Ticker'] == stock]
    prev_date = None
    prev_text = None
    for date, row in df_stock.iterrows():
        if prev_date == None:
            prev_date = date
            prev_text = row['Text']
        elif prev_date != date:
            # horizon
            tmp_date = copy.deepcopy(prev_date)
            tmp_date += timedelta(days=1)
            prices = []
            count_lags = 0 
            while count_lags < N_horizon:
                price = df_price.loc[(df_price.index == tmp_date) & (df_price['Ticker'] == stock)].values
                tmp_date += timedelta(days=1)
                if len(price) == 0: continue
                prices.append(price[0][4]) # Close price next day(s)
                count_lags+=1
            
            # lag
            tmp_date = copy.deepcopy(prev_date)
            count_lags = 0 
            while count_lags <= N_lags:
                price = df_price.loc[(df_price.index == tmp_date) & (df_price['Ticker'] == stock)].values
                tmp_date -= timedelta(days=1)
                if len(price) == 0: continue
                for val in price[0][:-1]: 
                    if type(val) != str: prices.append(val)
                count_lags+=1

            news_stocks.append([prev_date, stock, prev_text] + prices)
            
            prev_date = date
            prev_text = row['Text']
        elif prev_date == date:
            prev_text += ' '+row['Text']

    news_stocks = pd.DataFrame.from_records(news_stocks)
    news_stocks.columns = ['Date', 'Ticker', 'Text', 
                           'Close(t+1)', # 'Close(t+2)','Close(t+3)','Close(t+4)','Close(t+5)',
                           'Open(t)', 'High(t)', 'Low(t)', 'Close(t)',
                           'Open(t-1)', 'High(t-1)', 'Low(t-1)', 'Close(t-1)',
                           'Open(t-2)', 'High(t-2)', 'Low(t-2)', 'Close(t-2)',
                           'Open(t-3)', 'High(t-3)', 'Low(t-3)', 'Close(t-3)',
#                            'Open(t-4)', 'High(t-4)', 'Low(t-4)', 'Close(t-4)',
#                            'Open(t-5)', 'High(t-5)', 'Low(t-5)', 'Close(t-5)'
                          ]
    news_stocks = news_stocks.set_index('Date')
    
    train_size = int(len(news_stocks) * 0.80)
    test_size = len(news_stocks) - train_size
    train, test = news_stocks.iloc[:train_size], news_stocks.iloc[train_size:]
    print(stock, ':\t',len(train), len(test))    
    df_train.append(train)
    df_test.append(test)

df_train = pd.concat(df_train, axis=0)
df_test = pd.concat(df_test, axis=0)

len(df_train), len(df_test) 
df_train.head(1)
df_test.head(1)

BANPU :	 84 21
IRPC :	 72 19
PTT :	 209 53
BBL :	 101 26
KBANK :	 126 32
SCB :	 119 30
AOT :	 137 35
THAI :	 123 31
CPF :	 144 36
MINT :	 72 19
TU :	 52 14
SCC :	 88 23
CPN :	 67 17
CK :	 63 16
CPALL :	 44 11
HMPRO :	 50 13
BDMS :	 67 17
BH :	 48 13
ADVANC :	 114 29
JAS :	 101 26
TRUE :	 86 22



(1967, 503)

Unnamed: 0_level_0,Ticker,Text,Close(t+1),Open(t),High(t),Low(t),Close(t),Open(t-1),High(t-1),Low(t-1),Close(t-1),Open(t-2),High(t-2),Low(t-2),Close(t-2),Open(t-3),High(t-3),Low(t-3),Close(t-3)
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2015-02-12,BANPU,ใน ช่วง ภาค บ่าย จับตา หุ้น วันนี้ วอ ลุ่ม หนา...,27.75,26.0,27.75,26.0,27.5,25.75,26.0,25.25,25.75,25.5,26.0,25.25,25.75,25.5,26.0,25.5,25.5


Unnamed: 0_level_0,Ticker,Text,Close(t+1),Open(t),High(t),Low(t),Close(t),Open(t-1),High(t-1),Low(t-1),Close(t-1),Open(t-2),High(t-2),Low(t-2),Close(t-2),Open(t-3),High(t-3),Low(t-3),Close(t-3)
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2017-01-27,BANPU,บ ลดี บีเอ สวิคเคอร์ส ระบุ ใน บทวิเคราะห์ วัน ...,19.2,19.6,19.8,19.4,19.6,20.1,20.1,19.6,19.6,19.8,20.1,19.8,20.0,19.6,19.9,19.6,19.7


# TF-IDF Vetorization

In [4]:
stop_words = stopwords.words('thai')
stop_words.remove('ขึ้น')
stop_words.remove('ลง')

vertorizer = TfidfVectorizer(stop_words=stop_words, 
                             max_df=0.9, 
                             min_df=2, 
                             max_features=3000)

tfidf_train = vertorizer.fit_transform(df_train['Text'])
tfidf_test = vertorizer.transform(df_test['Text'])

df_tfidf_train = pd.DataFrame.from_records(tfidf_train.toarray())
df_tfidf_test = pd.DataFrame.from_records(tfidf_test.toarray())

df_tfidf_train = df_tfidf_train.set_index(df_train.index)
df_tfidf_test = df_tfidf_test.set_index(df_test.index)

len(df_tfidf_train), len(df_tfidf_test)

# replace Text with TF-IDF vector
x_train = df_train.drop(['Text'], axis=1)
x_train = pd.concat([x_train, df_tfidf_train], axis=1)

x_test = df_test.drop(['Text'], axis=1)
x_test = pd.concat([x_test, df_tfidf_test], axis=1)

# Label Encoding
le = preprocessing.LabelEncoder()
x_train['Ticker'] = le.fit_transform(x_train['Ticker'])
x_test['Ticker'] = le.transform(x_test['Ticker'])
x_train.head(2)
x_test.head(2)
le.classes_

(1967, 503)

Unnamed: 0_level_0,Ticker,Close(t+1),Open(t),High(t),Low(t),Close(t),Open(t-1),High(t-1),Low(t-1),Close(t-1),...,2990,2991,2992,2993,2994,2995,2996,2997,2998,2999
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-02-12,2,27.75,26.0,27.75,26.0,27.5,25.75,26.0,25.25,25.75,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2015-02-24,2,29.75,28.5,28.75,28.0,28.5,29.0,29.25,28.25,28.25,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0_level_0,Ticker,Close(t+1),Open(t),High(t),Low(t),Close(t),Open(t-1),High(t-1),Low(t-1),Close(t-1),...,2990,2991,2992,2993,2994,2995,2996,2997,2998,2999
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-01-27,2,19.2,19.6,19.8,19.4,19.6,20.1,20.1,19.6,19.6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-01-31,2,19.3,19.2,19.6,19.1,19.4,19.6,19.7,19.1,19.2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


array(['ADVANC', 'AOT', 'BANPU', 'BBL', 'BDMS', 'BH', 'CK', 'CPALL',
       'CPF', 'CPN', 'HMPRO', 'IRPC', 'JAS', 'KBANK', 'MINT', 'PTT',
       'SCB', 'SCC', 'THAI', 'TRUE', 'TU'], dtype=object)

In [5]:
pickle.dump(le, open('models/le.pkl', 'wb'))

# Create x_train and y_train

In [6]:
Horizon = 'Close(t+1)'
y_train = x_train[[Horizon]]
x_train = x_train.drop(['Close(t+1)'], axis=1).copy()
x_train.shape, y_train.shape

((1967, 3017), (1967, 1))

# Evaluate Each Stcok

In [7]:
def mean_absolute_percentage_error(y_true, y_pred): 
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

def evaluator(clf, df_test, le, isXGB=False, isLSTM=False):
    RMSEs, MAEs, MAPEs, DAs = [], [], [], []
    results = []
    for stock in target_stocks:
        x_tmp = df_test.loc[df_test['Ticker'] == le.transform([stock])[0]].copy()
        
        y_tmp = x_tmp[Horizon].values
        
        # Directional Accuracy
        changes = x_tmp[Horizon] -  x_tmp['Close(t)']
        y_true_da = []
        for change in changes:
            y_true_da.append(1 if change >= 0 else 0)
                
        x_tmp = x_tmp.drop(['Close(t+1)'], axis=1)
        
        if isXGB:
            y_pred = clf.predict(xgboost.DMatrix(x_tmp))
        elif isLSTM:
            x = x_tmp.values
            x = x.reshape((x.shape[0], x.shape[1], 1))
            y_pred = clf.predict(x)
        else:
            y_pred = clf.predict(x_tmp.as_matrix())
        
        # Directional Accuracy Pred
        changes = np.reshape(y_pred, (-1,1)) -  np.reshape(x_tmp['Close(t)'].values,(-1,1))
        y_pred_da = []
        for change in changes:
            y_pred_da.append(1 if change >= 0 else 0)
        
        RMSE = np.sqrt(mean_squared_error(y_tmp, y_pred))
        MAE = mean_absolute_error(y_tmp, y_pred)
        MAPE = mean_absolute_percentage_error(y_tmp, y_pred)
        DA = accuracy_score(y_true_da, y_pred_da)
        print(stock, "\tRMSE: %.2f\t MAE: %.2f \tMAPE: %.2f \tDA: %.2f" % (RMSE, MAE, MAPE, DA))
        RMSEs.append(RMSE)
        MAEs.append(MAE)
        MAPEs.append(MAPE)
        DAs.append(DA)
    
    print('\nmean RMSE:', round(np.mean(RMSEs),2))
    print('mean MAE:', round(np.mean(MAEs),2))
    print('mean MAPE:', round(np.mean(MAPEs),2))
    print('mean DA:', round(np.mean(DAs),4))

In [148]:
def ensemble_evaluator(bagging, ada_dt, ada_rf,  xgb, df_test, le, feature_importances, feature_importances_da):
    RMSEs, MAEs, MAPEs, DAs = [], [], [], []
    results = []
    for stock in target_stocks:
        x_tmp = df_test.loc[df_test['Ticker'] == le.transform([stock])[0]].copy()
        
        # Directional Accuracy
        changes = x_tmp[Horizon] - x_tmp['Close(t)']
        y_true_da = []
        for change in changes:
            y_true_da.append(1 if change >= 0 else 0)
        
        y_tmp = x_tmp[Horizon].values.reshape(-1,1)
        x_tmp = x_tmp.drop([Horizon], axis=1)        
        
        # Prediction
        y_pred = np.concatenate((
                                 bagging.predict(x_tmp).reshape(-1,1),
                                 ada_dt.predict(x_tmp).reshape(-1,1),
                                 ada_rf.predict(x_tmp).reshape(-1,1),
                                 xgb.predict(xgboost.DMatrix(x_tmp)).reshape(-1,1)), 
                                axis=1)
        
        df_pred = pd.DataFrame.from_records(y_pred).round(2)
        df_pred.columns = ['Bagging_DT', 'Ada_DT', 'Ada_RF', 'XGB']
        df_pred.head()

        # Directional Accuracy Pred
        close_t = np.reshape(x_tmp['Close(t)'].values, (-1, 1))
        y_changes = np.concatenate((
                                    np.array(y_pred[:,0]).reshape(-1,1)-close_t,
                                    np.array(y_pred[:,1]).reshape(-1,1)-close_t,
                                    np.array(y_pred[:,2]).reshape(-1,1)-close_t,
                                    np.array(y_pred[:,3]).reshape(-1,1)-close_t,
                                   ), axis=1)
        y_pred_da = []
        for row in y_changes:
            tmp_row = []
            for change in row:
                tmp_row.append(1 if change>=0 else 0)
            y_pred_da.append(tmp_row)

        df_pred_da = pd.DataFrame.from_records(y_pred_da)
        df_pred_da.columns = ['Bagging_DT', 'Ada_DT', 'Ada_RF', 'XGB']
        df_pred_da.head()
        
        
        df_pred['weight'] = (
                     df_pred['Bagging_DT']*feature_importances[0] + 
                     df_pred['Ada_DT']*feature_importances[1] + 
                     df_pred['Ada_RF']*feature_importances[2] + 
                     df_pred['XGB']*feature_importances[3]
        )
        
        df_pred_da['vote'] = (
                     df_pred_da['Bagging_DT']*feature_importances_da[0] + 
                     df_pred_da['Ada_DT']*feature_importances_da[1] + 
                     df_pred_da['Ada_RF']*feature_importances_da[2] + 
                     df_pred_da['XGB']*feature_importances_da[3]
        ).round(0).astype(int)

        y_pred = df_pred['weight'].values.reshape(-1,1)
        y_pred_da = df_pred_da['vote'].values.reshape(-1,1)
        
        RMSE = np.sqrt(mean_squared_error(y_tmp, y_pred))
        MAE = mean_absolute_error(y_tmp, y_pred)
        MAPE = mean_absolute_percentage_error(y_tmp, y_pred)
        DA = accuracy_score(y_true_da, y_pred_da)
        print(stock, "\tRMSE: %.2f\t MAE: %.2f \tMAPE: %.2f \tDA: %.2f" % (RMSE, MAE, MAPE, DA))
        RMSEs.append(RMSE)
        MAEs.append(MAE)
        MAPEs.append(MAPE)
        DAs.append(DA)
    
    print('\nmean RMSE:', round(np.mean(RMSEs),2))
    print('mean MAE:', round(np.mean(MAEs),2))
    print('mean MAPE:', round(np.mean(MAPEs),2))
    print('mean DA:', round(np.mean(DAs),4))

# Ensemble

In [149]:
ensemble_evaluator( 
                   bagging,
                   adaboost_dt_regr, 
                   adaboost_rf_regr, 
                   xgb,
                   x_test, le, feature_importances, feature_importances_da)

BANPU 	RMSE: 0.31	 MAE: 0.24 	MAPE: 1.33 	DA: 0.90
IRPC 	RMSE: 0.11	 MAE: 0.08 	MAPE: 1.40 	DA: 0.47
PTT 	RMSE: 6.11	 MAE: 4.89 	MAPE: 1.20 	DA: 0.53
BBL 	RMSE: 2.67	 MAE: 1.84 	MAPE: 0.94 	DA: 0.38
KBANK 	RMSE: 2.79	 MAE: 2.35 	MAPE: 1.18 	DA: 0.44
SCB 	RMSE: 2.24	 MAE: 1.57 	MAPE: 1.03 	DA: 0.50
AOT 	RMSE: 1.46	 MAE: 0.92 	MAPE: 1.85 	DA: 0.57
THAI 	RMSE: 0.68	 MAE: 0.44 	MAPE: 2.19 	DA: 0.61
CPF 	RMSE: 0.32	 MAE: 0.22 	MAPE: 0.86 	DA: 0.72
MINT 	RMSE: 0.90	 MAE: 0.76 	MAPE: 1.86 	DA: 0.58
TU 	RMSE: 0.49	 MAE: 0.32 	MAPE: 1.59 	DA: 0.57
SCC 	RMSE: 5.50	 MAE: 3.75 	MAPE: 0.74 	DA: 0.70
CPN 	RMSE: 3.46	 MAE: 2.84 	MAPE: 3.78 	DA: 0.59
CK 	RMSE: 0.54	 MAE: 0.37 	MAPE: 1.32 	DA: 0.50
CPALL 	RMSE: 2.06	 MAE: 1.60 	MAPE: 2.68 	DA: 0.45
HMPRO 	RMSE: 0.20	 MAE: 0.16 	MAPE: 1.50 	DA: 0.62
BDMS 	RMSE: 0.45	 MAE: 0.30 	MAPE: 1.51 	DA: 0.59
BH 	RMSE: 2.01	 MAE: 1.67 	MAPE: 0.89 	DA: 0.54
ADVANC 	RMSE: 2.07	 MAE: 1.63 	MAPE: 0.98 	DA: 0.55
JAS 	RMSE: 0.39	 MAE: 0.27 	MAPE: 3.27 	DA: 0.58
TRUE 	RM

# Linear Regression

In [9]:
lineregr = linear_model.LinearRegression()
lineregr.fit(x_train, y_train)

evaluator(lineregr, x_test, le)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

BANPU 	RMSE: 24.30	 MAE: 17.37 	MAPE: 100.63 	DA: 0.71
IRPC 	RMSE: 20.75	 MAE: 15.71 	MAPE: 273.52 	DA: 0.47
PTT 	RMSE: 41.14	 MAE: 32.50 	MAPE: 9.76 	DA: 0.57
BBL 	RMSE: 24.67	 MAE: 17.89 	MAPE: 12.72 	DA: 0.58
KBANK 	RMSE: 33.21	 MAE: 24.68 	MAPE: 16.62 	DA: 0.56
SCB 	RMSE: 28.74	 MAE: 23.79 	MAPE: 16.09 	DA: 0.53
AOT 	RMSE: 66.41	 MAE: 37.36 	MAPE: 80.20 	DA: 0.51
THAI 	RMSE: 28.35	 MAE: 21.84 	MAPE: 112.89 	DA: 0.52
CPF 	RMSE: 25.72	 MAE: 19.62 	MAPE: 76.02 	DA: 0.53
MINT 	RMSE: 21.41	 MAE: 16.85 	MAPE: 45.50 	DA: 0.58
TU 	RMSE: 37.81	 MAE: 26.42 	MAPE: 133.11 	DA: 0.57
SCC 	RMSE: 30.12	 MAE: 23.06 	MAPE: 5.97 	DA: 0.52
CPN 	RMSE: 29.56	 MAE: 25.66 	MAPE: 37.45 	DA: 0.71
CK 	RMSE: 27.40	 MAE: 19.88 	MAPE: 69.48 	DA: 0.38
CPALL 	RMSE: 26.16	 MAE: 19.37 	MAPE: 36.59 	DA: 0.55
HMPRO 	RMSE: 32.72	 MAE: 25.32 	MAPE: 251.28 	DA: 0.31
BDMS 	RMSE: 27.22	 MAE: 20.60 	MAPE: 102.24 	DA: 0.41
BH 	RMSE: 21.98	 MAE: 16.42 	MAPE: 10.83 	DA: 0.62
ADVANC 	RMSE: 37.58	 MAE: 30.41 	MAPE: 21.43 	DA: 0

# Support Vector Regressor

In [None]:
# from sklearn.svm import SVR
# svr = SVR()
# svr.fit(x_train, y_train)

# evaluator(svr, x_test, le)

# Decistion Tree Regressor

In [12]:
decis_tree_regr = tree.DecisionTreeRegressor(max_depth=None)
decis_tree_regr.fit(x_train, y_train.values.ravel())

evaluator(decis_tree_regr, x_test, le)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

BANPU 	RMSE: 0.55	 MAE: 0.39 	MAPE: 2.16 	DA: 0.81
IRPC 	RMSE: 0.15	 MAE: 0.11 	MAPE: 1.92 	DA: 0.53
PTT 	RMSE: 7.91	 MAE: 6.13 	MAPE: 1.50 	DA: 0.57
BBL 	RMSE: 4.91	 MAE: 3.58 	MAPE: 1.86 	DA: 0.42
KBANK 	RMSE: 3.92	 MAE: 3.09 	MAPE: 1.54 	DA: 0.59
SCB 	RMSE: 4.03	 MAE: 2.45 	MAPE: 1.60 	DA: 0.57
AOT 	RMSE: 1.52	 MAE: 0.98 	MAPE: 1.97 	DA: 0.57
THAI 	RMSE: 0.85	 MAE: 0.60 	MAPE: 3.04 	DA: 0.35
CPF 	RMSE: 0.64	 MAE: 0.47 	MAPE: 1.80 	DA: 0.58
MINT 	RMSE: 1.66	 MAE: 1.28 	MAPE: 3.11 	DA: 0.47
TU 	RMSE: 0.60	 MAE: 0.42 	MAPE: 2.10 	DA: 0.64
SCC 	RMSE: 8.14	 MAE: 5.48 	MAPE: 1.11 	DA: 0.70
CPN 	RMSE: 2.92	 MAE: 2.35 	MAPE: 3.31 	DA: 0.47
CK 	RMSE: 0.85	 MAE: 0.56 	MAPE: 1.97 	DA: 0.56
CPALL 	RMSE: 2.26	 MAE: 1.59 	MAPE: 2.62 	DA: 0.64
HMPRO 	RMSE: 0.24	 MAE: 0.20 	MAPE: 1.92 	DA: 0.69
BDMS 	RMSE: 0.47	 MAE: 0.35 	MAPE: 1.75 	DA: 0.65
BH 	RMSE: 4.42	 MAE: 3.42 	MAPE: 1.86 	DA: 0.46
ADVANC 	RMSE: 3.37	 MAE: 2.84 	MAPE: 1.69 	DA: 0.59
JAS 	RMSE: 0.41	 MAE: 0.28 	MAPE: 3.40 	DA: 0.54
TRUE 	RM

# Random Forest Regrssor

In [15]:
rnd_forest_regr = ensemble.RandomForestRegressor(n_jobs=-1,
                                                max_depth=None,
                                                n_estimators=30)
rnd_forest_regr.fit(x_train, y_train.values.ravel())

evaluator(rnd_forest_regr, x_test, le)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

BANPU 	RMSE: 0.32	 MAE: 0.25 	MAPE: 1.37 	DA: 0.86
IRPC 	RMSE: 0.12	 MAE: 0.10 	MAPE: 1.67 	DA: 0.42
PTT 	RMSE: 7.30	 MAE: 5.65 	MAPE: 1.39 	DA: 0.58
BBL 	RMSE: 2.55	 MAE: 1.81 	MAPE: 0.92 	DA: 0.42
KBANK 	RMSE: 2.77	 MAE: 2.30 	MAPE: 1.15 	DA: 0.44
SCB 	RMSE: 2.30	 MAE: 1.66 	MAPE: 1.09 	DA: 0.47
AOT 	RMSE: 4.86	 MAE: 1.56 	MAPE: 3.43 	DA: 0.54
THAI 	RMSE: 0.65	 MAE: 0.43 	MAPE: 2.16 	DA: 0.61
CPF 	RMSE: 0.39	 MAE: 0.29 	MAPE: 1.09 	DA: 0.61
MINT 	RMSE: 0.97	 MAE: 0.83 	MAPE: 2.02 	DA: 0.42
TU 	RMSE: 0.50	 MAE: 0.33 	MAPE: 1.61 	DA: 0.43
SCC 	RMSE: 5.92	 MAE: 4.17 	MAPE: 0.83 	DA: 0.61
CPN 	RMSE: 3.94	 MAE: 3.39 	MAPE: 4.55 	DA: 0.53
CK 	RMSE: 0.69	 MAE: 0.51 	MAPE: 1.83 	DA: 0.19
CPALL 	RMSE: 1.97	 MAE: 1.36 	MAPE: 2.28 	DA: 0.55
HMPRO 	RMSE: 0.23	 MAE: 0.20 	MAPE: 1.89 	DA: 0.54
BDMS 	RMSE: 0.42	 MAE: 0.27 	MAPE: 1.32 	DA: 0.76
BH 	RMSE: 2.47	 MAE: 1.88 	MAPE: 1.01 	DA: 0.54
ADVANC 	RMSE: 2.21	 MAE: 1.72 	MAPE: 1.04 	DA: 0.59
JAS 	RMSE: 0.39	 MAE: 0.28 	MAPE: 3.44 	DA: 0.54
TRUE 	RM

# Bagging Regressor

In [17]:
bagging = ensemble.BaggingRegressor(base_estimator=None,
                                    n_estimators=30,n_jobs=-1)
bagging.fit(x_train, y_train.values.ravel())
evaluator(bagging, x_test, le)

BaggingRegressor(base_estimator=None, bootstrap=True,
         bootstrap_features=False, max_features=1.0, max_samples=1.0,
         n_estimators=30, n_jobs=-1, oob_score=False, random_state=None,
         verbose=0, warm_start=False)

BANPU 	RMSE: 0.35	 MAE: 0.28 	MAPE: 1.56 	DA: 0.81
IRPC 	RMSE: 0.12	 MAE: 0.10 	MAPE: 1.64 	DA: 0.53
PTT 	RMSE: 6.66	 MAE: 5.20 	MAPE: 1.28 	DA: 0.53
BBL 	RMSE: 2.53	 MAE: 1.82 	MAPE: 0.93 	DA: 0.54
KBANK 	RMSE: 2.62	 MAE: 2.15 	MAPE: 1.08 	DA: 0.47
SCB 	RMSE: 2.40	 MAE: 1.77 	MAPE: 1.16 	DA: 0.47
AOT 	RMSE: 4.70	 MAE: 1.52 	MAPE: 3.33 	DA: 0.49
THAI 	RMSE: 0.68	 MAE: 0.44 	MAPE: 2.21 	DA: 0.58
CPF 	RMSE: 0.37	 MAE: 0.27 	MAPE: 1.02 	DA: 0.58
MINT 	RMSE: 1.04	 MAE: 0.90 	MAPE: 2.18 	DA: 0.42
TU 	RMSE: 0.50	 MAE: 0.32 	MAPE: 1.56 	DA: 0.64
SCC 	RMSE: 6.49	 MAE: 4.30 	MAPE: 0.85 	DA: 0.57
CPN 	RMSE: 3.53	 MAE: 3.11 	MAPE: 4.22 	DA: 0.53
CK 	RMSE: 0.59	 MAE: 0.44 	MAPE: 1.55 	DA: 0.38
CPALL 	RMSE: 2.15	 MAE: 1.70 	MAPE: 2.85 	DA: 0.36
HMPRO 	RMSE: 0.26	 MAE: 0.22 	MAPE: 2.05 	DA: 0.62
BDMS 	RMSE: 0.43	 MAE: 0.27 	MAPE: 1.33 	DA: 0.88
BH 	RMSE: 1.76	 MAE: 1.37 	MAPE: 0.73 	DA: 0.69
ADVANC 	RMSE: 2.23	 MAE: 1.75 	MAPE: 1.05 	DA: 0.59
JAS 	RMSE: 0.39	 MAE: 0.26 	MAPE: 3.19 	DA: 0.50
TRUE 	RM

# AdaBoost Regressor

In [18]:
adaboost_dt_regr = ensemble.AdaBoostRegressor(base_estimator=tree.DecisionTreeRegressor(),
                                           learning_rate=0.3, 
                                           n_estimators=50, 
                                           loss='linear')
adaboost_dt_regr.fit(x_train, y_train.values.ravel())

evaluator(adaboost_dt_regr, x_test, le)

AdaBoostRegressor(base_estimator=DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best'),
         learning_rate=0.3, loss='linear', n_estimators=50,
         random_state=None)

BANPU 	RMSE: 0.30	 MAE: 0.20 	MAPE: 1.11 	DA: 0.90
IRPC 	RMSE: 0.11	 MAE: 0.08 	MAPE: 1.38 	DA: 0.47
PTT 	RMSE: 6.53	 MAE: 5.21 	MAPE: 1.28 	DA: 0.53
BBL 	RMSE: 3.16	 MAE: 2.08 	MAPE: 1.05 	DA: 0.38
KBANK 	RMSE: 3.30	 MAE: 2.70 	MAPE: 1.35 	DA: 0.44
SCB 	RMSE: 2.17	 MAE: 1.47 	MAPE: 0.96 	DA: 0.50
AOT 	RMSE: 1.24	 MAE: 0.77 	MAPE: 1.44 	DA: 0.57
THAI 	RMSE: 0.66	 MAE: 0.43 	MAPE: 2.15 	DA: 0.61
CPF 	RMSE: 0.34	 MAE: 0.24 	MAPE: 0.92 	DA: 0.72
MINT 	RMSE: 0.94	 MAE: 0.76 	MAPE: 1.86 	DA: 0.58
TU 	RMSE: 0.49	 MAE: 0.33 	MAPE: 1.63 	DA: 0.57
SCC 	RMSE: 5.21	 MAE: 3.13 	MAPE: 0.63 	DA: 0.70
CPN 	RMSE: 3.71	 MAE: 2.71 	MAPE: 3.59 	DA: 0.59
CK 	RMSE: 0.58	 MAE: 0.42 	MAPE: 1.48 	DA: 0.50
CPALL 	RMSE: 2.01	 MAE: 1.66 	MAPE: 2.78 	DA: 0.45
HMPRO 	RMSE: 0.21	 MAE: 0.18 	MAPE: 1.69 	DA: 0.62
BDMS 	RMSE: 0.46	 MAE: 0.32 	MAPE: 1.61 	DA: 0.59
BH 	RMSE: 2.27	 MAE: 1.88 	MAPE: 1.01 	DA: 0.54
ADVANC 	RMSE: 2.07	 MAE: 1.66 	MAPE: 0.99 	DA: 0.55
JAS 	RMSE: 0.40	 MAE: 0.28 	MAPE: 3.42 	DA: 0.58
TRUE 	RM

In [19]:
adaboost_rf_regr = ensemble.AdaBoostRegressor(base_estimator=ensemble.RandomForestRegressor(n_jobs=-1),
                                           learning_rate=0.3, 
                                           n_estimators=50, 
                                           loss='linear')
adaboost_rf_regr.fit(x_train, y_train.values.ravel())

evaluator(adaboost_rf_regr, x_test, le)

AdaBoostRegressor(base_estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
         learning_rate=0.3, loss='linear', n_estimators=50,
         random_state=None)

BANPU 	RMSE: 0.32	 MAE: 0.26 	MAPE: 1.46 	DA: 0.86
IRPC 	RMSE: 0.12	 MAE: 0.10 	MAPE: 1.64 	DA: 0.53
PTT 	RMSE: 6.03	 MAE: 4.81 	MAPE: 1.18 	DA: 0.57
BBL 	RMSE: 2.50	 MAE: 1.81 	MAPE: 0.92 	DA: 0.46
KBANK 	RMSE: 2.71	 MAE: 2.26 	MAPE: 1.14 	DA: 0.47
SCB 	RMSE: 2.29	 MAE: 1.66 	MAPE: 1.08 	DA: 0.40
AOT 	RMSE: 0.97	 MAE: 0.77 	MAPE: 1.51 	DA: 0.54
THAI 	RMSE: 0.70	 MAE: 0.45 	MAPE: 2.26 	DA: 0.58
CPF 	RMSE: 0.30	 MAE: 0.22 	MAPE: 0.84 	DA: 0.67
MINT 	RMSE: 0.89	 MAE: 0.76 	MAPE: 1.86 	DA: 0.53
TU 	RMSE: 0.49	 MAE: 0.33 	MAPE: 1.66 	DA: 0.50
SCC 	RMSE: 5.57	 MAE: 4.01 	MAPE: 0.80 	DA: 0.57
CPN 	RMSE: 3.83	 MAE: 2.96 	MAPE: 3.91 	DA: 0.59
CK 	RMSE: 0.50	 MAE: 0.32 	MAPE: 1.15 	DA: 0.56
CPALL 	RMSE: 2.08	 MAE: 1.55 	MAPE: 2.58 	DA: 0.45
HMPRO 	RMSE: 0.20	 MAE: 0.15 	MAPE: 1.35 	DA: 0.46
BDMS 	RMSE: 0.48	 MAE: 0.32 	MAPE: 1.60 	DA: 0.65
BH 	RMSE: 2.12	 MAE: 1.73 	MAPE: 0.92 	DA: 0.69
ADVANC 	RMSE: 2.18	 MAE: 1.68 	MAPE: 1.01 	DA: 0.66
JAS 	RMSE: 0.39	 MAE: 0.28 	MAPE: 3.43 	DA: 0.50
TRUE 	RM

# Gradient Boosting Regressor

In [28]:
# gbr = ensemble.GradientBoostingRegressor(n_estimators=300, 
#                                          learning_rate=0.01,
#                                          max_depth=8,
#                                          min_samples_split=3,
#                                          loss='ls',
#                                         )
# gbr.fit(x_train, y_train.values.ravel())

# evaluator(gbr, x_test, le)

# XGBoost Regressor

In [20]:
import xgboost
from sklearn.model_selection import train_test_split

d_train, d_valid, y_d_train, y_d_valid = train_test_split(x_train, y_train, test_size=0.1, random_state=10)
len(d_train), len(d_valid)

d_train = xgboost.DMatrix(d_train, label=y_d_train)
d_valid = xgboost.DMatrix(d_valid, label=y_d_valid)

(1770, 197)

Parameters
http://xgboost.readthedocs.io/en/latest//parameter.html

In [21]:
params = {
    'booster':'dart',
    'max_depth': 4,
    'learning_rate': 0.01,
    'subsample': 1,
    'objective': 'reg:tweedie',
    'eval_metric': 'mae',
    'reg_lambda': 0.8,
    'reg_alpha': 0.2,
    'silent': 1,
    'sample_type':"weighted"
}

xgb = xgboost.train(params, d_train, 
                    num_boost_round=5000, 
                    evals=[(d_train, 'train'), (d_valid, 'valid')], 
                    early_stopping_rounds=100,
                    verbose_eval=100
                   )


[0]	train-mae:132.578	valid-mae:129.04
Multiple eval metrics have been passed: 'valid-mae' will be used for early stopping.

Will train until valid-mae hasn't improved in 100 rounds.
[100]	train-mae:125.589	valid-mae:122.369
[200]	train-mae:101.302	valid-mae:99.7523
[300]	train-mae:59.43	valid-mae:59.5976
[400]	train-mae:26.6896	valid-mae:27.2333
[500]	train-mae:10.6007	valid-mae:11.089
[600]	train-mae:4.10481	valid-mae:4.47542
[700]	train-mae:1.80499	valid-mae:2.3193
[800]	train-mae:1.17024	valid-mae:1.72158
[900]	train-mae:0.99784	valid-mae:1.61305
[1000]	train-mae:0.926056	valid-mae:1.60496
Stopping. Best iteration:
[982]	train-mae:0.935969	valid-mae:1.60355



In [22]:
evaluator(xgb, x_test, le, isXGB=True)

BANPU 	RMSE: 0.31	 MAE: 0.25 	MAPE: 1.37 	DA: 0.71
IRPC 	RMSE: 0.10	 MAE: 0.08 	MAPE: 1.37 	DA: 0.47
PTT 	RMSE: 5.00	 MAE: 3.91 	MAPE: 0.95 	DA: 0.60
BBL 	RMSE: 2.60	 MAE: 1.78 	MAPE: 0.90 	DA: 0.42
KBANK 	RMSE: 3.09	 MAE: 2.57 	MAPE: 1.28 	DA: 0.50
SCB 	RMSE: 2.40	 MAE: 1.77 	MAPE: 1.16 	DA: 0.40
AOT 	RMSE: 1.09	 MAE: 0.76 	MAPE: 1.49 	DA: 0.66
THAI 	RMSE: 0.69	 MAE: 0.46 	MAPE: 2.29 	DA: 0.55
CPF 	RMSE: 0.32	 MAE: 0.24 	MAPE: 0.94 	DA: 0.53
MINT 	RMSE: 0.93	 MAE: 0.75 	MAPE: 1.82 	DA: 0.53
TU 	RMSE: 0.54	 MAE: 0.32 	MAPE: 1.56 	DA: 0.57
SCC 	RMSE: 7.19	 MAE: 5.61 	MAPE: 1.11 	DA: 0.48
CPN 	RMSE: 2.93	 MAE: 2.23 	MAPE: 3.00 	DA: 0.47
CK 	RMSE: 0.57	 MAE: 0.40 	MAPE: 1.41 	DA: 0.50
CPALL 	RMSE: 2.42	 MAE: 1.86 	MAPE: 3.09 	DA: 0.27
HMPRO 	RMSE: 0.15	 MAE: 0.11 	MAPE: 1.04 	DA: 0.92
BDMS 	RMSE: 0.43	 MAE: 0.31 	MAPE: 1.51 	DA: 0.71
BH 	RMSE: 2.35	 MAE: 1.93 	MAPE: 1.01 	DA: 0.46
ADVANC 	RMSE: 2.03	 MAE: 1.53 	MAPE: 0.90 	DA: 0.62
JAS 	RMSE: 0.38	 MAE: 0.25 	MAPE: 3.08 	DA: 0.46
TRUE 	RM

# Save ML models

In [24]:
pickle.dump(lineregr, open('models/lineregr.pkl', 'wb'))
pickle.dump(bagging, open('models/bagging_regr.pkl', 'wb'))
pickle.dump(decis_tree_regr, open('models/decis_tree_regr.pkl', 'wb'))
pickle.dump(rnd_forest_regr, open('models/rnd_forest_regr.pkl', 'wb'))
pickle.dump(adaboost_dt_regr, open('models/adaboost_dt_regr.pkl', 'wb'))
pickle.dump(adaboost_rf_regr, open('models/adaboost_rf_regr.pkl', 'wb'))
# pickle.dump(gbr, open('models/gbr.pkl', 'wb'))
pickle.dump(xgb, open('models/xgb.pkl', 'wb'))

x_test.to_csv('data/x_test_unique_news.csv')

# Ensemble Stacking

In [27]:
dt = pickle.load(open('models/decis_tree_regr.pkl', 'rb'))
rf = pickle.load(open('models/rnd_forest_regr.pkl', 'rb'))
bagging_dt = pickle.load(open('models/bagging_regr.pkl', 'rb'))
ada_dt = pickle.load(open('models/adaboost_dt_regr.pkl', 'rb'))
ada_rf = pickle.load(open('models/adaboost_rf_regr.pkl', 'rb'))
xgb = pickle.load(open('models/xgb.pkl', 'rb'))

In [28]:
x_train_stack = np.concatenate((
#                          dt.predict(x_train).reshape(-1,1),
#                          rf.predict(x_train).reshape(-1,1),
                         bagging_dt.predict(x_train).reshape(-1,1),
                         ada_dt.predict(x_train).reshape(-1,1),
                         ada_rf.predict(x_train).reshape(-1,1),
                         xgb.predict(xgboost.DMatrix(x_train)).reshape(-1,1)), axis=1)

In [53]:
y_test = x_test[Horizon].values.reshape(-1,1)

x_test_stack = np.concatenate((
#                          dt.predict(x_test.drop(['Close(t+1)'], axis=1)).reshape(-1,1),
#                          rf.predict(x_test.drop(['Close(t+1)'], axis=1)).reshape(-1,1),
                         bagging_dt.predict(x_test.drop(['Close(t+1)'], axis=1)).reshape(-1,1),
                         ada_dt.predict(x_test.drop(['Close(t+1)'], axis=1)).reshape(-1,1),
                         ada_rf.predict(x_test.drop(['Close(t+1)'], axis=1)).reshape(-1,1),
                         xgb.predict(xgboost.DMatrix((x_test.drop(['Close(t+1)'], axis=1)))).reshape(-1,1)), axis=1)

In [106]:
stack = ensemble.RandomForestRegressor()
stack.fit(x_train_stack, y_train.values.ravel())

y_pred_stack = stack.predict(x_test_stack).reshape(-1,1)

for i in range(x_test_stack.shape[1]):
    RMSE = np.sqrt(mean_squared_error(y_test, x_test_stack[:,i]))
    MAE = mean_absolute_error(y_test, x_test_stack[:,i])
    MAPE = mean_absolute_percentage_error(y_test, x_test_stack[:,i].reshape(-1,1))
    print("RMSE: %.2f \tMAE: %.2f \tMAPE: %.2f" % (RMSE, MAE, MAPE))
    
RMSE = np.sqrt(mean_squared_error(y_test, y_pred_stack))
MAE = mean_absolute_error(y_test, y_pred_stack)
MAPE = mean_absolute_percentage_error(y_test, y_pred_stack)
print("\nRMSE: %.2f \tMAE: %.2f \tMAPE: %.2f" % (RMSE, MAE, MAPE))

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

RMSE: 3.20 	MAE: 1.62 	MAPE: 1.75
RMSE: 2.89 	MAE: 1.53 	MAPE: 1.56
RMSE: 2.74 	MAE: 1.50 	MAPE: 1.56
RMSE: 2.70 	MAE: 1.48 	MAPE: 1.50

RMSE: 2.69 	MAE: 1.47 	MAPE: 1.52


In [108]:
feature_importances = stack.feature_importances_
feature_importances

array([0.21326136, 0.34588015, 0.39886349, 0.04199499])

# Stack DA

In [120]:
close_t = np.reshape(x_train['Close(t)'].values, (-1, 1))
changes = y_train.values.reshape(-1,1) - close_t
y_train_da = np.array([1 if change >= 0 else 0 for change in changes]).reshape(-1,1)

x_changes = np.concatenate((
                            np.array(x_train_stack[:,0]).reshape(-1,1)-close_t,
                            np.array(x_train_stack[:,1]).reshape(-1,1)-close_t,
                            np.array(x_train_stack[:,2]).reshape(-1,1)-close_t,
                            np.array(x_train_stack[:,3]).reshape(-1,1)-close_t,
                           ), axis=1)
x_train_stack_da = []
for row in x_changes:
    tmp_row = []
    for change in row:
        tmp_row.append(1 if change>=0 else 0)
    x_train_stack_da.append(tmp_row)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [127]:
close_t = np.reshape(x_test['Close(t)'].values, (-1, 1))
changes = y_test - close_t
y_test_da = np.array([1 if change >= 0 else 0 for change in changes]).reshape(-1,1)

x_changes = np.concatenate((
                            np.array(x_test_stack[:,0]).reshape(-1,1)-close_t,
                            np.array(x_test_stack[:,1]).reshape(-1,1)-close_t,
                            np.array(x_test_stack[:,2]).reshape(-1,1)-close_t,
                            np.array(x_test_stack[:,3]).reshape(-1,1)-close_t,
                           ), axis=1)
x_test_stack_da = []
for row in x_changes:
    tmp_row = []
    for change in row:
        tmp_row.append(1 if change>=0 else 0)
    x_test_stack_da.append(tmp_row)

x_test_stack_da = np.array(x_test_stack_da)

In [143]:
stack_da = ensemble.GradientBoostingRegressor()
stack_da.fit(x_train_stack_da, y_train_da.ravel())

y_stack_da = stack_da.predict(x_test_stack_da).reshape(-1,1).round(0).astype(int)

for i in range(x_test_stack_da.shape[1]):
    acc = accuracy_score(y_test_da, x_test_stack_da[:,i])
    print("Accuracy: %.4f" % (acc))
    
acc = accuracy_score(y_test_da, y_stack_da)
print("\nAccuracy: %.4f" % (acc))

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, presort='auto', random_state=None,
             subsample=1.0, verbose=0, warm_start=False)

Accuracy: 0.5567
Accuracy: 0.5726
Accuracy: 0.5646
Accuracy: 0.5507

Accuracy: 0.5726


In [145]:
feature_importances_da = stack_da.feature_importances_
feature_importances_da

array([0.16075052, 0.59401102, 0.1055127 , 0.13972575])

# LSTM

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM

In [None]:
train_X = x_train.values
val_X = x_valid.values
# test_X = x_test.values

train_y = y_train.values
val_y = y_valid.values
# test_y = y_test.values

train_X = train_X.reshape(train_X.shape[0],  train_X.shape[1], 1)
val_X = val_X.reshape(val_X.shape[0], val_X.shape[1], 1)
# test_X = test_X.reshape((test_X.shape[0], 1, test_X.shape[1]))

print(train_X.shape, train_y.shape)
print(val_X.shape, val_y.shape)
# print(test_X.shape, test_y.shape)

In [None]:
model = Sequential()
model.add(LSTM(64, return_sequences=True, input_shape=(train_X.shape[1], train_X.shape[2])))
model.add(Dropout(0.1))
model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(256, return_sequences=True))
model.add(Dropout(0.3))
model.add(LSTM(256, return_sequences=True))
model.add(Dropout(0.4))
model.add(LSTM(128))
model.add(Dense(1))
model.compile(loss='mean_absolute_error', optimizer='adam')

In [None]:
from keras.callbacks import ModelCheckpoint, TensorBoard, EarlyStopping

checkpoint = ModelCheckpoint(filepath="models/LSTM.h5", 
                             monitor='val_loss', 
                             verbose=1, 
                             save_best_only=True, 
                             save_weights_only=False, 
                             mode='auto', 
                             period=1
                            )

earlystopping = EarlyStopping(monitor='val_loss', 
                      min_delta=0, 
                      patience=10,
                      verbose=1, 
                      mode='auto')

# access via $ tensorboard --logdir=./logs
tensorboard = TensorBoard(log_dir='./logs')

In [None]:
model.fit(x=train_X, 
          y=train_y,
          epochs=1000,
          batch_size=256,
          validation_data=(val_X, val_y),
          verbose=1,
          shuffle=False,
          callbacks=[checkpoint, earlystopping, tensorboard]
         )

In [None]:
evaluator(model, x_test, le, isLSTM=True)

In [None]:
from keras.layers import Bidirectional

In [None]:
model = Sequential()
model.add(Bidirectional(LSTM(128, return_sequences=True), input_shape=(train_X.shape[1], train_X.shape[2])))
model.add(Dropout(0.2))
model.add(Bidirectional(LSTM(128, return_sequences=True)))
model.add(Dropout(0.3))
model.add(Bidirectional(LSTM(128, return_sequences=True)))
model.add(Dropout(0.4))
model.add(Bidirectional(LSTM(128)))
model.add(Dense(1))
model.compile(loss='mean_absolute_error', optimizer='adam')

In [None]:
model.fit(x=train_X, 
          y=train_y,
          epochs=100,
          batch_size=128,
          validation_data=(val_X, val_y),
          verbose=1,
          shuffle=False,
          callbacks=[checkpoint, earlystopping, tensorboard]
         )

In [None]:
evaluator(model, x_test, le, isLSTM=True)