# Regression with Pantip (all)

In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import sys
sys.version

import pickle
import pandas as pd
import os
import copy
import numpy as np

from pythainlp.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error, mean_absolute_error, accuracy_score
from sklearn import preprocessing
from sklearn import linear_model, tree, ensemble
from sklearn import svm

from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

import plotly.graph_objs as go
from datetime import datetime, timedelta
from tqdm import tqdm_notebook
import matplotlib.pyplot as plt
% matplotlib inline

target_stocks = ['BANPU','IRPC','PTT','BBL','KBANK','SCB','AOT','THAI','CPF','MINT',
                 'TU','SCC','CPN','CK','CPALL','HMPRO','BDMS','BH','ADVANC','JAS','TRUE']

# target_stocks = ['BANPU']

df_price = pd.read_csv('merged_2013_2018.csv')
df_price['Date'] = pd.to_datetime(df_price['Date'], format='%Y-%m-%d')
df_price = df_price.loc[df_price['Ticker'].isin(target_stocks)]
df_price['Date'] = df_price['Date'].dt.date
df_price = df_price.set_index('Date')
df_price.tail(3)
len(df_price)

df_pantip = pd.read_csv('data/pantip_all.csv')
df_pantip['Date'] = pd.to_datetime(df_pantip['Date'], format='%Y-%m-%d')
df_pantip = df_pantip.set_index('Date')
df_pantip = df_pantip.sort_index()
df_pantip = df_pantip['2014-1-1':'2018-2-8']
df_pantip.index = df_pantip.index.date
df_pantip.head(3)
df_pantip.tail(3)

'Total:', len(df_pantip)

('Total:', 9873)

# Lag & Horizon Construction

In [2]:
N_lags = 3
N_horizon = 1

df_train = []
df_test = []
for stock in tqdm_notebook(target_stocks):
    news_stocks = []
    df_stock = df_pantip.loc[df_pantip['Ticker'] == stock]
    prev_date = None
    prev_text = None
    
#     pbar = tqdm_notebook(total=len(df_stock))
    for date, row in df_stock.iterrows():
        if prev_date == None:
            prev_date = date
            prev_text = row['Text']
        elif prev_date != date:
            # horizon
            tmp_date = copy.deepcopy(prev_date)
            tmp_date += timedelta(days=1)
            prices = []
            count_lags = 0 
            while count_lags < N_horizon:
                price = df_price.loc[(df_price.index == tmp_date) & (df_price['Ticker'] == stock)].values
                tmp_date += timedelta(days=1)
                if len(price) == 0: continue
                prices.append(price[0][4]) # Close price next day(s)
                count_lags+=1
            
            # lag
            tmp_date = copy.deepcopy(prev_date)
            count_lags = 0 
            while count_lags <= N_lags:
                price = df_price.loc[(df_price.index == tmp_date) & (df_price['Ticker'] == stock)].values
                tmp_date -= timedelta(days=1)
                if len(price) == 0: continue
                for val in price[0][:-1]: 
                    if type(val) != str: prices.append(val)
                count_lags+=1

            news_stocks.append([prev_date, stock, prev_text] + prices)
            
            prev_date = date
            prev_text = row['Text']
        elif prev_date == date:
            prev_text += ' '+row['Text']
        
#         pbar.update(1)
#     pbar.close()

    news_stocks = pd.DataFrame.from_records(news_stocks)
    news_stocks.columns = ['Date', 'Ticker', 'Text', 
                           'Close(t+1)', # 'Close(t+2)','Close(t+3)','Close(t+4)','Close(t+5)',
                           'Open(t)', 'High(t)', 'Low(t)', 'Close(t)',
                           'Open(t-1)', 'High(t-1)', 'Low(t-1)', 'Close(t-1)',
                           'Open(t-2)', 'High(t-2)', 'Low(t-2)', 'Close(t-2)',
                           'Open(t-3)', 'High(t-3)', 'Low(t-3)', 'Close(t-3)',
#                            'Open(t-4)', 'High(t-4)', 'Low(t-4)', 'Close(t-4)',
#                            'Open(t-5)', 'High(t-5)', 'Low(t-5)', 'Close(t-5)'
                          ]
    news_stocks = news_stocks.set_index('Date')
    
    train_size = int(len(news_stocks) * 0.80)
    test_size = len(news_stocks) - train_size
    train, test = news_stocks.iloc[:train_size], news_stocks.iloc[train_size:]
    print(stock, ':\t',len(train), len(test))    
    df_train.append(train)
    df_test.append(test)
    

df_train = pd.concat(df_train, axis=0)
df_test = pd.concat(df_test, axis=0)

len(df_train), len(df_test) 
df_train.head(1)
df_test.head(1)

BANPU :	 172 43
IRPC :	 100 26
PTT :	 408 103
BBL :	 145 37
KBANK :	 115 29
SCB :	 778 195
AOT :	 294 74
THAI :	 168 43
CPF :	 348 88
MINT :	 68 17
TU :	 47 12
SCC :	 122 31
CPN :	 112 28
CK :	 131 33
CPALL :	 188 48
HMPRO :	 185 47
BDMS :	 111 28
BH :	 81 21
ADVANC :	 309 78
JAS :	 489 123
TRUE :	 359 90



(4730, 1194)

Unnamed: 0_level_0,Ticker,Text,Close(t+1),Open(t),High(t),Low(t),Close(t),Open(t-1),High(t-1),Low(t-1),Close(t-1),Open(t-2),High(t-2),Low(t-2),Close(t-2),Open(t-3),High(t-3),Low(t-3),Close(t-3)
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2014-01-06,BANPU,ดอย เกิด ไร ขึ้น คะ ดอย บ้าน ปู ที่ ลง ทุกวัน ...,26.0,26.0,26.25,24.8,25.25,27.5,28.25,26.5,26.5,30.0,30.25,28.0,28.0,30.0,30.25,29.0,30.25


Unnamed: 0_level_0,Ticker,Text,Close(t+1),Open(t),High(t),Low(t),Close(t),Open(t-1),High(t-1),Low(t-1),Close(t-1),Open(t-2),High(t-2),Low(t-2),Close(t-2),Open(t-3),High(t-3),Low(t-3),Close(t-3)
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2017-04-26,BANPU,ราคา เท่ากัน ตัว ไหน น่าสนใจ กว่า ช่วงนี้ กับ ...,19.5,19.5,19.7,19.0,19.3,20.6,20.6,19.3,19.5,20.8,20.9,20.4,20.6,20.6,20.8,20.5,20.7


In [25]:
# df_train.to_csv('data/pantip_train_(t-3).csv', index=False)
# df_test.to_csv('data/pantip_test_(t-3).csv', index=False)

# TF-IDF Vetorization

In [21]:
stop_words = stopwords.words('thai')
stop_words.remove('ขึ้น')
stop_words.remove('ลง')

vertorizer = TfidfVectorizer(stop_words=stop_words, 
                             max_df=0.9, 
                             min_df=2, 
                             max_features=3000)

tfidf_train = vertorizer.fit_transform(df_train['Text'])
tfidf_test = vertorizer.transform(df_test['Text'])

df_tfidf_train = pd.DataFrame.from_records(tfidf_train.toarray())
df_tfidf_test = pd.DataFrame.from_records(tfidf_test.toarray())

df_tfidf_train = df_tfidf_train.set_index(df_train.index)
df_tfidf_test = df_tfidf_test.set_index(df_test.index)

len(df_tfidf_train), len(df_tfidf_test)

# replace Text with TF-IDF vector
x_train = df_train.drop(['Text'], axis=1)
x_train = pd.concat([x_train, df_tfidf_train], axis=1)

x_test = df_test.drop(['Text'], axis=1)
x_test = pd.concat([x_test, df_tfidf_test], axis=1)

# Label Encoding
le = preprocessing.LabelEncoder()
x_train['Ticker'] = le.fit_transform(x_train['Ticker'])
x_test['Ticker'] = le.transform(x_test['Ticker'])
x_train.head(2)
x_test.head(2)
le.classes_

(4730, 1194)

Unnamed: 0_level_0,Ticker,Close(t+1),Open(t),High(t),Low(t),Close(t),Open(t-1),High(t-1),Low(t-1),Close(t-1),...,2990,2991,2992,2993,2994,2995,2996,2997,2998,2999
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2014-01-06,2,26.0,26.0,26.25,24.8,25.25,27.5,28.25,26.5,26.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2014-01-07,2,26.5,25.0,26.5,25.0,26.0,26.0,26.25,24.8,25.25,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0_level_0,Ticker,Close(t+1),Open(t),High(t),Low(t),Close(t),Open(t-1),High(t-1),Low(t-1),Close(t-1),...,2990,2991,2992,2993,2994,2995,2996,2997,2998,2999
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-04-26,2,19.5,19.5,19.7,19.0,19.3,20.6,20.6,19.3,19.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-05-10,2,17.9,19.2,19.3,18.4,18.5,19.2,19.4,19.1,19.2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


array(['ADVANC', 'AOT', 'BANPU', 'BBL', 'BDMS', 'BH', 'CK', 'CPALL',
       'CPF', 'CPN', 'HMPRO', 'IRPC', 'JAS', 'KBANK', 'MINT', 'PTT',
       'SCB', 'SCC', 'THAI', 'TRUE', 'TU'], dtype=object)

# Create x_train and y_train

In [22]:
Horizon = 'Close(t+1)'
y_train = x_train[[Horizon]]
x_train = x_train.drop(['Close(t+1)'], axis=1).copy()
x_train.shape, y_train.shape

((4730, 3017), (4730, 1))

# Evaluate Each Stcok

In [50]:
def mean_absolute_percentage_error(y_true, y_pred): 
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

def evaluator(clf, df_test, le, isXGB=False, isLSTM=False):
    RMSEs, MAEs, MAPEs, DAs = [], [], [], []
    results = []
    for stock in target_stocks:
        x_tmp = df_test.loc[df_test['Ticker'] == le.transform([stock])[0]].copy()
        
        y_tmp = x_tmp[Horizon].values
        
        # Directional Accuracy
        changes = x_tmp[Horizon] -  x_tmp['Close(t)']
        y_true_da = []
        for change in changes:
            y_true_da.append(1 if change >= 0 else 0)
                
        x_tmp = x_tmp.drop(['Close(t+1)'], axis=1)
        
        if isXGB:
            y_pred = clf.predict(xgboost.DMatrix(x_tmp))
        elif isLSTM:
            x = x_tmp.values
            x = x.reshape((x.shape[0], x.shape[1], 1))
            y_pred = clf.predict(x)
        else:
            y_pred = clf.predict(x_tmp.as_matrix())
        
        # Directional Accuracy Pred
        changes = np.reshape(y_pred, (-1,1)) -  np.reshape(x_tmp['Close(t)'].values,(-1,1))
        y_pred_da = []
        for change in changes:
            y_pred_da.append(1 if change >= 0 else 0)
        
        RMSE = np.sqrt(mean_squared_error(y_tmp, y_pred))
        MAE = mean_absolute_error(y_tmp, y_pred)
        MAPE = mean_absolute_percentage_error(y_tmp, y_pred)
        DA = accuracy_score(y_true_da, y_pred_da)
        print(stock, "\tRMSE: %.2f\t MAE: %.2f \tMAPE: %.2f \tDA: %.2f" % (RMSE, MAE, MAPE, DA))
        RMSEs.append(RMSE)
        MAEs.append(MAE)
        MAPEs.append(MAPE)
        DAs.append(DA)
    
    print('\nmean RMSE:', round(np.mean(RMSEs),2))
    print('mean MAE:', round(np.mean(MAEs),2))
    print('mean MAPE:', round(np.mean(MAPEs),2))
    print('mean DA:', round(np.mean(DAs),4))

In [51]:
def ensemble_evaluator(bagging, ada_dt, ada_rf,  xgb, stack, stack_da, df_test, le, feature_importances, feature_importances_da):
    RMSEs, MAEs, MAPEs, DAs = [], [], [], []
    results = []
    for stock in target_stocks:
        x_tmp = df_test.loc[df_test['Ticker'] == le.transform([stock])[0]].copy()
        
        # Directional Accuracy
        changes = x_tmp[Horizon] - x_tmp['Close(t)']
        y_true_da = []
        for change in changes:
            y_true_da.append(1 if change >= 0 else 0)
        
        y_tmp = x_tmp[Horizon].values.reshape(-1,1)
        x_tmp = x_tmp.drop([Horizon], axis=1)        
        
        # Prediction
        y_pred = np.concatenate((
                                 bagging.predict(x_tmp).reshape(-1,1),
                                 ada_dt.predict(x_tmp).reshape(-1,1),
                                 ada_rf.predict(x_tmp).reshape(-1,1),
                                 xgb.predict(xgboost.DMatrix(x_tmp)).reshape(-1,1)), 
                                axis=1)
        
        df_pred = pd.DataFrame.from_records(y_pred).round(2)
        df_pred.columns = ['Bagging_DT', 'Ada_DT', 'Ada_RF', 'XGB']
        df_pred.head()

        # Directional Accuracy Pred
        close_t = np.reshape(x_tmp['Close(t)'].values, (-1, 1))
        y_changes = np.concatenate((
                                    np.array(y_pred[:,0]).reshape(-1,1)-close_t,
                                    np.array(y_pred[:,1]).reshape(-1,1)-close_t,
                                    np.array(y_pred[:,2]).reshape(-1,1)-close_t,
                                    np.array(y_pred[:,3]).reshape(-1,1)-close_t,
                                   ), axis=1)
        y_pred_da = []
        for row in y_changes:
            tmp_row = []
            for change in row:
                tmp_row.append(1 if change>=0 else 0)
            y_pred_da.append(tmp_row)

        df_pred_da = pd.DataFrame.from_records(y_pred_da)
        df_pred_da.columns = ['Bagging_DT', 'Ada_DT', 'Ada_RF', 'XGB']
        df_pred_da.head()
        
        
        
        df_pred['weight'] = (
                     df_pred['Bagging_DT']*feature_importances[0] + 
                     df_pred['Ada_DT']*feature_importances[1] + 
                     df_pred['Ada_RF']*feature_importances[2] + 
                     df_pred['XGB']*feature_importances[3]
        )
        
        df_pred_da['vote'] = (
                     df_pred_da['Bagging_DT']*feature_importances_da[0] + 
                     df_pred_da['Ada_DT']*feature_importances_da[1] + 
                     df_pred_da['Ada_RF']*feature_importances_da[2] + 
                     df_pred_da['XGB']*feature_importances_da[3]
        ).round(0).astype(int)

        y_pred = stack.predict(y_pred).reshape(-1,1)
        y_pred_da = stack_da.predict(y_pred_da).reshape(-1,1).round(0).astype(int)
#         y_pred = df_pred['weight'].values.reshape(-1,1)
#         y_pred_da = df_pred_da['vote'].values.reshape(-1,1)
        
        RMSE = np.sqrt(mean_squared_error(y_tmp, y_pred))
        MAE = mean_absolute_error(y_tmp, y_pred)
        MAPE = mean_absolute_percentage_error(y_tmp, y_pred)
        DA = accuracy_score(y_true_da, y_pred_da)
        print(stock, "\tRMSE: %.2f\t MAE: %.2f \tMAPE: %.2f \tDA: %.2f" % (RMSE, MAE, MAPE, DA))
        RMSEs.append(RMSE)
        MAEs.append(MAE)
        MAPEs.append(MAPE)
        DAs.append(DA)
    
    print('\nmean RMSE:', round(np.mean(RMSEs),2))
    print('mean MAE:', round(np.mean(MAEs),2))
    print('mean MAPE:', round(np.mean(MAPEs),2))
    print('mean DA:', round(np.mean(DAs),4))

# Ensemble

In [52]:
ensemble_evaluator( 
                   bagging,
                   adaboost_dt_regr, 
                   adaboost_rf_regr, 
                   xgb,
                   stack, stack_da,
                   x_test, le, feature_importances, feature_importances_da)

BANPU 	RMSE: 0.44	 MAE: 0.35 	MAPE: 1.85 	DA: 0.51
IRPC 	RMSE: 0.11	 MAE: 0.09 	MAPE: 1.70 	DA: 0.35
PTT 	RMSE: 6.90	 MAE: 5.66 	MAPE: 1.40 	DA: 0.50
BBL 	RMSE: 2.29	 MAE: 1.82 	MAPE: 1.07 	DA: 0.62
KBANK 	RMSE: 4.39	 MAE: 3.10 	MAPE: 1.68 	DA: 0.45
SCB 	RMSE: 1.90	 MAE: 1.45 	MAPE: 0.96 	DA: 0.53
AOT 	RMSE: 6.95	 MAE: 4.66 	MAPE: 6.79 	DA: 0.46
THAI 	RMSE: 0.67	 MAE: 0.43 	MAPE: 2.12 	DA: 0.67
CPF 	RMSE: 0.43	 MAE: 0.33 	MAPE: 1.28 	DA: 0.52
MINT 	RMSE: 0.75	 MAE: 0.65 	MAPE: 1.68 	DA: 0.47
TU 	RMSE: 0.38	 MAE: 0.28 	MAPE: 1.39 	DA: 0.58
SCC 	RMSE: 7.28	 MAE: 6.04 	MAPE: 1.22 	DA: 0.45
CPN 	RMSE: 6.94	 MAE: 5.80 	MAPE: 8.31 	DA: 0.43
CK 	RMSE: 0.47	 MAE: 0.40 	MAPE: 1.37 	DA: 0.58
CPALL 	RMSE: 5.67	 MAE: 4.45 	MAPE: 6.65 	DA: 0.33
HMPRO 	RMSE: 0.24	 MAE: 0.17 	MAPE: 1.68 	DA: 0.68
BDMS 	RMSE: 0.33	 MAE: 0.27 	MAPE: 1.31 	DA: 0.61
BH 	RMSE: 4.10	 MAE: 2.50 	MAPE: 1.38 	DA: 0.76
ADVANC 	RMSE: 2.12	 MAE: 1.64 	MAPE: 0.97 	DA: 0.45
JAS 	RMSE: 0.24	 MAE: 0.16 	MAPE: 2.09 	DA: 0.50
TRUE 	RM

# Linear Regression

In [7]:
from sklearn import linear_model

lineregr = linear_model.LinearRegression()
lineregr.fit(x_train, y_train)

evaluator(lineregr, x_test, le)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

BANPU 	RMSE: 3.35	 MAE: 2.53 	MAPE: 20.05 	DA: 0.65
IRPC 	RMSE: 3.24	 MAE: 2.66 	MAPE: 49.80 	DA: 0.50
PTT 	RMSE: 6.81	 MAE: 5.14 	MAPE: 13.22 	DA: 0.55
BBL 	RMSE: 4.81	 MAE: 3.44 	MAPE: 9.39 	DA: 0.46
KBANK 	RMSE: 5.24	 MAE: 3.88 	MAPE: 10.75 	DA: 0.55
SCB 	RMSE: 1.97	 MAE: 1.55 	MAPE: 3.27 	DA: 0.52
AOT 	RMSE: 3.29	 MAE: 2.74 	MAPE: 13.21 	DA: 0.54
THAI 	RMSE: 3.81	 MAE: 2.79 	MAPE: 19.78 	DA: 0.58
CPF 	RMSE: 3.30	 MAE: 2.62 	MAPE: 12.04 	DA: 0.52
MINT 	RMSE: 2.52	 MAE: 2.01 	MAPE: 10.67 	DA: 0.47
TU 	RMSE: 3.08	 MAE: 2.11 	MAPE: 11.63 	DA: 0.50
SCC 	RMSE: 6.10	 MAE: 4.94 	MAPE: 2.51 	DA: 0.52
CPN 	RMSE: 2.97	 MAE: 2.53 	MAPE: 15.82 	DA: 0.57
CK 	RMSE: 2.97	 MAE: 2.42 	MAPE: 9.76 	DA: 0.61
CPALL 	RMSE: 3.73	 MAE: 2.91 	MAPE: 9.19 	DA: 0.62
HMPRO 	RMSE: 4.48	 MAE: 2.85 	MAPE: 35.29 	DA: 0.60
BDMS 	RMSE: 3.08	 MAE: 2.37 	MAPE: 13.61 	DA: 0.50
BH 	RMSE: 5.74	 MAE: 4.76 	MAPE: 11.87 	DA: 0.38
ADVANC 	RMSE: 3.68	 MAE: 2.78 	MAPE: 10.76 	DA: 0.54
JAS 	RMSE: 3.07	 MAE: 2.36 	MAPE: 35.48 	DA

# Support Vector Regressor

In [8]:
# from sklearn.svm import SVR
# svr = SVR()
# svr.fit(x_train, y_train)

# evaluator(svr, x_test, le)

# Decistion Tree Regressor

In [9]:
decis_tree_regr = tree.DecisionTreeRegressor(max_depth=None)
decis_tree_regr.fit(x_train, y_train.values.ravel())

evaluator(decis_tree_regr, x_test, le)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

BANPU 	RMSE: 0.52	 MAE: 0.38 	MAPE: 2.02 	DA: 0.58
IRPC 	RMSE: 0.22	 MAE: 0.16 	MAPE: 3.01 	DA: 0.50
PTT 	RMSE: 10.88	 MAE: 8.51 	MAPE: 2.09 	DA: 0.48
BBL 	RMSE: 3.82	 MAE: 3.07 	MAPE: 1.79 	DA: 0.57
KBANK 	RMSE: 4.48	 MAE: 3.62 	MAPE: 1.94 	DA: 0.55
SCB 	RMSE: 2.77	 MAE: 2.09 	MAPE: 1.38 	DA: 0.53
AOT 	RMSE: 7.15	 MAE: 4.73 	MAPE: 6.92 	DA: 0.49
THAI 	RMSE: 0.75	 MAE: 0.46 	MAPE: 2.24 	DA: 0.67
CPF 	RMSE: 0.72	 MAE: 0.51 	MAPE: 1.94 	DA: 0.59
MINT 	RMSE: 0.73	 MAE: 0.57 	MAPE: 1.51 	DA: 0.47
TU 	RMSE: 0.39	 MAE: 0.33 	MAPE: 1.70 	DA: 0.50
SCC 	RMSE: 6.45	 MAE: 5.29 	MAPE: 1.06 	DA: 0.48
CPN 	RMSE: 7.48	 MAE: 6.21 	MAPE: 8.79 	DA: 0.43
CK 	RMSE: 0.70	 MAE: 0.58 	MAPE: 1.98 	DA: 0.52
CPALL 	RMSE: 5.70	 MAE: 3.97 	MAPE: 5.86 	DA: 0.42
HMPRO 	RMSE: 0.30	 MAE: 0.23 	MAPE: 2.20 	DA: 0.49
BDMS 	RMSE: 1.53	 MAE: 0.69 	MAPE: 3.44 	DA: 0.57
BH 	RMSE: 4.95	 MAE: 3.33 	MAPE: 1.81 	DA: 0.67
ADVANC 	RMSE: 3.67	 MAE: 2.90 	MAPE: 1.72 	DA: 0.53
JAS 	RMSE: 0.31	 MAE: 0.22 	MAPE: 2.83 	DA: 0.56
TRUE 	R

# Random Forest Regrssor

In [11]:
rnd_forest_regr = ensemble.RandomForestRegressor(n_jobs=-1,
                                                max_depth=None,
                                                n_estimators=10)
rnd_forest_regr.fit(x_train, y_train.values.ravel())

evaluator(rnd_forest_regr, x_test, le)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

BANPU 	RMSE: 0.50	 MAE: 0.37 	MAPE: 1.93 	DA: 0.58
IRPC 	RMSE: 0.13	 MAE: 0.11 	MAPE: 1.96 	DA: 0.35
PTT 	RMSE: 8.99	 MAE: 6.28 	MAPE: 1.55 	DA: 0.40
BBL 	RMSE: 2.41	 MAE: 1.91 	MAPE: 1.12 	DA: 0.62
KBANK 	RMSE: 4.19	 MAE: 3.22 	MAPE: 1.74 	DA: 0.38
SCB 	RMSE: 1.79	 MAE: 1.40 	MAPE: 0.91 	DA: 0.50
AOT 	RMSE: 6.75	 MAE: 5.07 	MAPE: 7.56 	DA: 0.39
THAI 	RMSE: 0.74	 MAE: 0.47 	MAPE: 2.26 	DA: 0.67
CPF 	RMSE: 0.51	 MAE: 0.41 	MAPE: 1.59 	DA: 0.48
MINT 	RMSE: 0.65	 MAE: 0.51 	MAPE: 1.31 	DA: 0.41
TU 	RMSE: 0.34	 MAE: 0.26 	MAPE: 1.30 	DA: 0.75
SCC 	RMSE: 5.35	 MAE: 4.54 	MAPE: 0.91 	DA: 0.32
CPN 	RMSE: 8.19	 MAE: 6.98 	MAPE: 10.03 	DA: 0.36
CK 	RMSE: 0.51	 MAE: 0.41 	MAPE: 1.39 	DA: 0.48
CPALL 	RMSE: 6.82	 MAE: 5.99 	MAPE: 9.12 	DA: 0.33
HMPRO 	RMSE: 0.27	 MAE: 0.20 	MAPE: 1.93 	DA: 0.60
BDMS 	RMSE: 0.44	 MAE: 0.30 	MAPE: 1.48 	DA: 0.57
BH 	RMSE: 4.04	 MAE: 2.47 	MAPE: 1.38 	DA: 0.62
ADVANC 	RMSE: 2.08	 MAE: 1.62 	MAPE: 0.97 	DA: 0.50
JAS 	RMSE: 0.28	 MAE: 0.19 	MAPE: 2.51 	DA: 0.42
TRUE 	R

# Bagging Regressor

In [12]:
bagging = ensemble.BaggingRegressor(base_estimator=None,
                                    n_estimators=30,n_jobs=-1)
bagging.fit(x_train, y_train.values.ravel())
evaluator(bagging, x_test, le)

BaggingRegressor(base_estimator=None, bootstrap=True,
         bootstrap_features=False, max_features=1.0, max_samples=1.0,
         n_estimators=30, n_jobs=-1, oob_score=False, random_state=None,
         verbose=0, warm_start=False)

BANPU 	RMSE: 0.43	 MAE: 0.34 	MAPE: 1.77 	DA: 0.51
IRPC 	RMSE: 0.12	 MAE: 0.10 	MAPE: 1.86 	DA: 0.31
PTT 	RMSE: 10.20	 MAE: 7.44 	MAPE: 1.82 	DA: 0.50
BBL 	RMSE: 2.61	 MAE: 2.07 	MAPE: 1.22 	DA: 0.59
KBANK 	RMSE: 4.11	 MAE: 2.86 	MAPE: 1.56 	DA: 0.48
SCB 	RMSE: 1.81	 MAE: 1.37 	MAPE: 0.90 	DA: 0.49
AOT 	RMSE: 6.45	 MAE: 4.59 	MAPE: 6.77 	DA: 0.41
THAI 	RMSE: 0.68	 MAE: 0.43 	MAPE: 2.09 	DA: 0.67
CPF 	RMSE: 0.40	 MAE: 0.30 	MAPE: 1.16 	DA: 0.51
MINT 	RMSE: 0.64	 MAE: 0.52 	MAPE: 1.33 	DA: 0.47
TU 	RMSE: 0.40	 MAE: 0.30 	MAPE: 1.50 	DA: 0.67
SCC 	RMSE: 5.74	 MAE: 4.78 	MAPE: 0.96 	DA: 0.48
CPN 	RMSE: 7.43	 MAE: 6.28 	MAPE: 8.97 	DA: 0.39
CK 	RMSE: 0.49	 MAE: 0.41 	MAPE: 1.41 	DA: 0.42
CPALL 	RMSE: 6.39	 MAE: 5.46 	MAPE: 8.26 	DA: 0.33
HMPRO 	RMSE: 0.28	 MAE: 0.20 	MAPE: 1.88 	DA: 0.66
BDMS 	RMSE: 0.35	 MAE: 0.29 	MAPE: 1.44 	DA: 0.54
BH 	RMSE: 3.82	 MAE: 2.55 	MAPE: 1.42 	DA: 0.62
ADVANC 	RMSE: 2.12	 MAE: 1.65 	MAPE: 0.98 	DA: 0.42
JAS 	RMSE: 0.24	 MAE: 0.16 	MAPE: 2.07 	DA: 0.56
TRUE 	R

# AdaBoost Regressor

In [13]:
adaboost_dt_regr = ensemble.AdaBoostRegressor(base_estimator=tree.DecisionTreeRegressor(),
                                           learning_rate=0.3, 
                                           n_estimators=50, 
                                           loss='linear')

adaboost_dt_regr.fit(x_train, y_train.values.ravel())

evaluator(adaboost_dt_regr, x_test, le)

AdaBoostRegressor(base_estimator=DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best'),
         learning_rate=0.3, loss='linear', n_estimators=50,
         random_state=None)

BANPU 	RMSE: 0.45	 MAE: 0.35 	MAPE: 1.88 	DA: 0.51
IRPC 	RMSE: 0.12	 MAE: 0.10 	MAPE: 1.76 	DA: 0.35
PTT 	RMSE: 6.90	 MAE: 5.39 	MAPE: 1.33 	DA: 0.50
BBL 	RMSE: 2.30	 MAE: 1.84 	MAPE: 1.08 	DA: 0.62
KBANK 	RMSE: 4.41	 MAE: 3.14 	MAPE: 1.70 	DA: 0.45
SCB 	RMSE: 1.90	 MAE: 1.45 	MAPE: 0.96 	DA: 0.53
AOT 	RMSE: 7.66	 MAE: 5.26 	MAPE: 7.69 	DA: 0.46
THAI 	RMSE: 0.68	 MAE: 0.43 	MAPE: 2.13 	DA: 0.67
CPF 	RMSE: 0.44	 MAE: 0.33 	MAPE: 1.27 	DA: 0.52
MINT 	RMSE: 0.65	 MAE: 0.49 	MAPE: 1.24 	DA: 0.47
TU 	RMSE: 0.39	 MAE: 0.28 	MAPE: 1.42 	DA: 0.58
SCC 	RMSE: 5.97	 MAE: 5.03 	MAPE: 1.01 	DA: 0.45
CPN 	RMSE: 7.49	 MAE: 6.33 	MAPE: 9.11 	DA: 0.43
CK 	RMSE: 0.48	 MAE: 0.39 	MAPE: 1.35 	DA: 0.58
CPALL 	RMSE: 6.52	 MAE: 5.53 	MAPE: 8.36 	DA: 0.33
HMPRO 	RMSE: 0.25	 MAE: 0.18 	MAPE: 1.72 	DA: 0.68
BDMS 	RMSE: 0.32	 MAE: 0.25 	MAPE: 1.24 	DA: 0.61
BH 	RMSE: 4.10	 MAE: 2.50 	MAPE: 1.38 	DA: 0.76
ADVANC 	RMSE: 2.13	 MAE: 1.63 	MAPE: 0.97 	DA: 0.45
JAS 	RMSE: 0.24	 MAE: 0.16 	MAPE: 2.06 	DA: 0.50
TRUE 	RM

In [14]:
adaboost_rf_regr = ensemble.AdaBoostRegressor(base_estimator=ensemble.RandomForestRegressor(n_jobs=-1),
                                           learning_rate=0.5, 
                                           n_estimators=30, 
                                           loss='linear')
adaboost_rf_regr.fit(x_train, y_train.values.ravel())

evaluator(adaboost_rf_regr, x_test, le)

AdaBoostRegressor(base_estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
         learning_rate=0.5, loss='linear', n_estimators=30,
         random_state=None)

BANPU 	RMSE: 0.45	 MAE: 0.35 	MAPE: 1.83 	DA: 0.51
IRPC 	RMSE: 0.10	 MAE: 0.07 	MAPE: 1.34 	DA: 0.58
PTT 	RMSE: 9.19	 MAE: 5.79 	MAPE: 1.43 	DA: 0.44
BBL 	RMSE: 2.53	 MAE: 1.96 	MAPE: 1.15 	DA: 0.54
KBANK 	RMSE: 4.07	 MAE: 3.00 	MAPE: 1.62 	DA: 0.52
SCB 	RMSE: 1.77	 MAE: 1.38 	MAPE: 0.90 	DA: 0.51
AOT 	RMSE: 6.92	 MAE: 4.93 	MAPE: 7.27 	DA: 0.43
THAI 	RMSE: 0.66	 MAE: 0.41 	MAPE: 2.00 	DA: 0.65
CPF 	RMSE: 0.42	 MAE: 0.32 	MAPE: 1.21 	DA: 0.50
MINT 	RMSE: 0.67	 MAE: 0.56 	MAPE: 1.44 	DA: 0.29
TU 	RMSE: 0.38	 MAE: 0.28 	MAPE: 1.43 	DA: 0.58
SCC 	RMSE: 5.59	 MAE: 4.67 	MAPE: 0.94 	DA: 0.35
CPN 	RMSE: 6.94	 MAE: 5.76 	MAPE: 8.33 	DA: 0.39
CK 	RMSE: 0.42	 MAE: 0.36 	MAPE: 1.22 	DA: 0.48
CPALL 	RMSE: 6.37	 MAE: 5.45 	MAPE: 8.26 	DA: 0.33
HMPRO 	RMSE: 0.25	 MAE: 0.18 	MAPE: 1.74 	DA: 0.66
BDMS 	RMSE: 0.29	 MAE: 0.23 	MAPE: 1.15 	DA: 0.43
BH 	RMSE: 3.69	 MAE: 2.29 	MAPE: 1.27 	DA: 0.62
ADVANC 	RMSE: 1.87	 MAE: 1.42 	MAPE: 0.84 	DA: 0.56
JAS 	RMSE: 0.24	 MAE: 0.17 	MAPE: 2.15 	DA: 0.44
TRUE 	RM

# Gradient Boosting Regressor

In [24]:
gbr = ensemble.GradientBoostingRegressor(n_estimators=500, 
                                         learning_rate=0.1,
                                         max_depth=4,
                                         min_samples_split=2,
                                         loss='ls',
                                        )
gbr.fit(x_train, y_train)

evaluator(gbr, x_test, le)


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().



GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=4, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=500, presort='auto', random_state=None,
             subsample=1.0, verbose=0, warm_start=False)

BANPU 	RMSE: 0.94	 MAE: 0.67 	MAPE: 3.58 	DA: 0.40
IRPC 	RMSE: 0.47	 MAE: 0.32 	MAPE: 5.82 	DA: 0.46
PTT 	RMSE: 27.18	 MAE: 9.43 	MAPE: 2.31 	DA: 0.47
BBL 	RMSE: 2.57	 MAE: 2.04 	MAPE: 1.20 	DA: 0.51
KBANK 	RMSE: 3.97	 MAE: 3.03 	MAPE: 1.63 	DA: 0.59
SCB 	RMSE: 1.74	 MAE: 1.36 	MAPE: 0.90 	DA: 0.47
AOT 	RMSE: 5.94	 MAE: 4.18 	MAPE: 6.16 	DA: 0.54
THAI 	RMSE: 1.22	 MAE: 0.88 	MAPE: 4.20 	DA: 0.51
CPF 	RMSE: 1.41	 MAE: 0.71 	MAPE: 2.76 	DA: 0.47
MINT 	RMSE: 1.05	 MAE: 0.86 	MAPE: 2.31 	DA: 0.35
TU 	RMSE: 0.60	 MAE: 0.45 	MAPE: 2.27 	DA: 0.75
SCC 	RMSE: 10.57	 MAE: 6.00 	MAPE: 1.21 	DA: 0.45
CPN 	RMSE: 5.57	 MAE: 4.42 	MAPE: 6.41 	DA: 0.39
CK 	RMSE: 0.79	 MAE: 0.64 	MAPE: 2.19 	DA: 0.42
CPALL 	RMSE: 4.81	 MAE: 3.23 	MAPE: 4.75 	DA: 0.48
HMPRO 	RMSE: 1.23	 MAE: 0.79 	MAPE: 8.57 	DA: 0.62
BDMS 	RMSE: 0.57	 MAE: 0.44 	MAPE: 2.16 	DA: 0.43
BH 	RMSE: 3.62	 MAE: 2.46 	MAPE: 1.35 	DA: 0.71
ADVANC 	RMSE: 2.03	 MAE: 1.59 	MAPE: 0.94 	DA: 0.50
JAS 	RMSE: 0.49	 MAE: 0.35 	MAPE: 4.66 	DA: 0.49
TRUE 	


reshape is deprecated and will raise in a subsequent release. Please use .values.reshape(...) instead



# XGBoost Regressor

In [15]:
import xgboost
from sklearn.model_selection import train_test_split

d_train, d_valid, y_d_train, y_d_valid = train_test_split(x_train, y_train, test_size=0.1, random_state=10)
len(d_train), len(d_valid)

d_train = xgboost.DMatrix(d_train, label=y_d_train)
d_valid = xgboost.DMatrix(d_valid, label=y_d_valid)

(4020, 710)

In [16]:
params = {
    'booster':'dart',
    'max_depth': 4,
    'learning_rate': 0.1,
    'n_estimators':300,
    'subsample': 0.9,
    'objective': 'reg:tweedie',
    'eval_metric': 'mae',
    'reg_lambda': 0.8,
    'reg_alpha': 0.2,
    'silent': 1,
}

xgb = xgboost.train(params, d_train, 
                    num_boost_round=5000, 
                    evals=[(d_train, 'train'), (d_valid, 'valid')], 
                    early_stopping_rounds=50,
                    verbose_eval=100
                   )

[0]	train-mae:114.16	valid-mae:113.395
Multiple eval metrics have been passed: 'valid-mae' will be used for early stopping.

Will train until valid-mae hasn't improved in 50 rounds.
[100]	train-mae:1.2573	valid-mae:1.94089
Stopping. Best iteration:
[95]	train-mae:1.27692	valid-mae:1.93602



In [17]:
evaluator(xgb, x_test, le, isXGB=True)

BANPU 	RMSE: 0.42	 MAE: 0.33 	MAPE: 1.74 	DA: 0.56
IRPC 	RMSE: 0.12	 MAE: 0.10 	MAPE: 1.92 	DA: 0.46
PTT 	RMSE: 9.81	 MAE: 6.35 	MAPE: 1.55 	DA: 0.52
BBL 	RMSE: 2.62	 MAE: 2.21 	MAPE: 1.28 	DA: 0.32
KBANK 	RMSE: 4.09	 MAE: 3.06 	MAPE: 1.66 	DA: 0.55
SCB 	RMSE: 1.76	 MAE: 1.35 	MAPE: 0.89 	DA: 0.50
AOT 	RMSE: 5.95	 MAE: 4.05 	MAPE: 5.94 	DA: 0.46
THAI 	RMSE: 0.66	 MAE: 0.43 	MAPE: 2.12 	DA: 0.58
CPF 	RMSE: 0.45	 MAE: 0.34 	MAPE: 1.30 	DA: 0.47
MINT 	RMSE: 0.66	 MAE: 0.54 	MAPE: 1.37 	DA: 0.29
TU 	RMSE: 0.33	 MAE: 0.27 	MAPE: 1.37 	DA: 0.50
SCC 	RMSE: 5.38	 MAE: 4.48 	MAPE: 0.90 	DA: 0.35
CPN 	RMSE: 5.73	 MAE: 4.64 	MAPE: 6.74 	DA: 0.36
CK 	RMSE: 0.47	 MAE: 0.38 	MAPE: 1.32 	DA: 0.52
CPALL 	RMSE: 4.55	 MAE: 3.14 	MAPE: 4.64 	DA: 0.42
HMPRO 	RMSE: 0.28	 MAE: 0.19 	MAPE: 1.84 	DA: 0.53
BDMS 	RMSE: 0.31	 MAE: 0.25 	MAPE: 1.24 	DA: 0.46
BH 	RMSE: 4.61	 MAE: 2.85 	MAPE: 1.57 	DA: 0.67
ADVANC 	RMSE: 2.45	 MAE: 1.80 	MAPE: 1.06 	DA: 0.46
JAS 	RMSE: 0.25	 MAE: 0.17 	MAPE: 2.17 	DA: 0.45
TRUE 	RM

# Save Models

In [18]:
import pickle
pickle.dump(decis_tree_regr, open('models/decis_tree_regr_pantip_all.pkl', 'wb'))
pickle.dump(rnd_forest_regr, open('models/rnd_forest_regr_pantip_all.pkl', 'wb'))
pickle.dump(bagging, open('models/bagging_regr_pantip_all.pkl', 'wb'))
pickle.dump(adaboost_dt_regr, open('models/adaboost_dt_regr_pantip_all.pkl', 'wb'))
pickle.dump(adaboost_rf_regr, open('models/adaboost_rf_regr_pantip_all.pkl', 'wb'))
pickle.dump(xgb, open('models/xgb_pantip_all.pkl', 'wb'))

In [19]:
dt = pickle.load(open('models/decis_tree_regr_pantip_all.pkl', 'rb'))
rf = pickle.load(open('models/rnd_forest_regr_pantip_all.pkl', 'rb'))
bagging = pickle.load(open('models/bagging_regr_pantip_all.pkl', 'rb'))
ada_dt = pickle.load(open('models/adaboost_dt_regr_pantip_all.pkl', 'rb'))
ada_rf = pickle.load(open('models/adaboost_rf_regr_pantip_all.pkl', 'rb'))
xgb = pickle.load(open('models/xgb_pantip_all.pkl', 'rb'))

In [24]:
x_train_stack = np.concatenate((
                         bagging.predict(x_train).reshape(-1,1),
                         ada_dt.predict(x_train).reshape(-1,1),
                         ada_rf.predict(x_train).reshape(-1,1),
                         xgb.predict(xgboost.DMatrix(x_train)).reshape(-1,1)), axis=1)

In [25]:
y_test = x_test[Horizon].values.reshape(-1,1)

x_test_stack = np.concatenate((
                         bagging.predict(x_test.drop(['Close(t+1)'], axis=1)).reshape(-1,1),
                         ada_dt.predict(x_test.drop(['Close(t+1)'], axis=1)).reshape(-1,1),
                         ada_rf.predict(x_test.drop(['Close(t+1)'], axis=1)).reshape(-1,1),
                         xgb.predict(xgboost.DMatrix((x_test.drop(['Close(t+1)'], axis=1)))).reshape(-1,1)), axis=1)

In [44]:
stack = ensemble.RandomForestRegressor(n_jobs=-1)
stack.fit(x_train_stack, y_train.values.ravel())

y_pred_stack = stack.predict(x_test_stack).reshape(-1,1)

for i in range(x_test_stack.shape[1]):
    RMSE = np.sqrt(mean_squared_error(y_test, x_test_stack[:,i]))
    MAE = mean_absolute_error(y_test, x_test_stack[:,i])
    MAPE = mean_absolute_percentage_error(y_test, x_test_stack[:,i].reshape(-1,1))
    print("RMSE: %.2f \tMAE: %.2f \tMAPE: %.2f" % (RMSE, MAE, MAPE))
    
RMSE = np.sqrt(mean_squared_error(y_test, y_pred_stack))
MAE = mean_absolute_error(y_test, y_pred_stack)
MAPE = mean_absolute_percentage_error(y_test, y_pred_stack)
print("\nRMSE: %.2f \tMAE: %.2f \tMAPE: %.2f" % (RMSE, MAE, MAPE))

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

RMSE: 4.14 	MAE: 2.04 	MAPE: 2.24
RMSE: 3.69 	MAE: 1.93 	MAPE: 2.26
RMSE: 3.93 	MAE: 1.88 	MAPE: 2.18
RMSE: 5.49 	MAE: 2.72 	MAPE: 2.37

RMSE: 3.58 	MAE: 1.89 	MAPE: 2.14


In [45]:
feature_importances = stack.feature_importances_
feature_importances

array([0.00353503, 0.50136105, 0.27341371, 0.2216902 ])

# Stacking DA

In [46]:
close_t = np.reshape(x_train['Close(t)'].values, (-1, 1))
changes = y_train.values.reshape(-1,1) - close_t
y_train_da = np.array([1 if change >= 0 else 0 for change in changes]).reshape(-1,1)

x_changes = np.concatenate((
                            np.array(x_train_stack[:,0]).reshape(-1,1)-close_t,
                            np.array(x_train_stack[:,1]).reshape(-1,1)-close_t,
                            np.array(x_train_stack[:,2]).reshape(-1,1)-close_t,
                            np.array(x_train_stack[:,3]).reshape(-1,1)-close_t,
                           ), axis=1)
x_train_stack_da = []
for row in x_changes:
    tmp_row = []
    for change in row:
        tmp_row.append(1 if change>=0 else 0)
    x_train_stack_da.append(tmp_row)

In [47]:
close_t = np.reshape(x_test['Close(t)'].values, (-1, 1))
changes = y_test - close_t
y_test_da = np.array([1 if change >= 0 else 0 for change in changes]).reshape(-1,1)

x_changes = np.concatenate((
                            np.array(x_test_stack[:,0]).reshape(-1,1)-close_t,
                            np.array(x_test_stack[:,1]).reshape(-1,1)-close_t,
                            np.array(x_test_stack[:,2]).reshape(-1,1)-close_t,
                            np.array(x_test_stack[:,3]).reshape(-1,1)-close_t,
                           ), axis=1)
x_test_stack_da = []
for row in x_changes:
    tmp_row = []
    for change in row:
        tmp_row.append(1 if change>=0 else 0)
    x_test_stack_da.append(tmp_row)

x_test_stack_da = np.array(x_test_stack_da)

In [48]:
stack_da = ensemble.GradientBoostingRegressor()
stack_da.fit(x_train_stack_da, y_train_da.ravel())

y_stack_da = stack_da.predict(x_test_stack_da).reshape(-1,1).round(0).astype(int)

for i in range(x_test_stack_da.shape[1]):
    acc = accuracy_score(y_test_da, x_test_stack_da[:,i])
    print("Accuracy: %.4f" % (acc))
    
acc = accuracy_score(y_test_da, y_stack_da)
print("\nAccuracy: %.4f" % (acc))

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, presort='auto', random_state=None,
             subsample=1.0, verbose=0, warm_start=False)

Accuracy: 0.4941
Accuracy: 0.5092
Accuracy: 0.4908
Accuracy: 0.4539

Accuracy: 0.5092


In [49]:
feature_importances_da = stack_da.feature_importances_
feature_importances_da

array([0.08275431, 0.78415671, 0.03098645, 0.10210254])