In [1]:
import pandas as pd
import numpy as np
from talib.abstract import *
from datetime import datetime
import matplotlib.pyplot as plt
%matplotlib inline
pd.options.display.max_rows = 10

# Data preprocessing

In [2]:
# Load the raw daily records from the local CSV file into a DataFrame.
input_data = pd.read_csv("2330.csv")

In [3]:
# Preview the first rows to check how the columns were parsed.
input_data.head()

Unnamed: 0,證券代碼,年月日,開盤價(元),最高價(元),最低價(元),收盤價(元),成交量(千股),外資買賣超(千股),投信買賣超(千股),自營買賣超(千股),外資總投資股率%,投信持股率%,自營持股率%
0,2330 台積電,1/2/2008,39.58,39.58,38.43,38.62,38970,394,-8824,-365,71.79,0.53,0.02
1,2330 台積電,1/3/2008,38.24,38.3,37.92,37.92,34838,-9307,-804,482,71.76,0.52,0.02
2,2330 台積電,1/4/2008,37.86,38.56,37.34,38.3,48400,-6802,-1357,711,71.73,0.52,0.02
3,2330 台積電,1/7/2008,36.39,37.03,35.62,35.62,103229,-65640,-5021,-1424,71.48,0.5,0.02
4,2330 台積電,1/8/2008,35.62,36.13,35.43,35.75,47333,-13831,77,-1248,71.43,0.5,0.01


In [4]:
# Parse the trade-date column (month/day/year text) into timestamps so the rows
# can be ordered chronologically. pd.to_datetime is vectorised and, unlike a
# row-wise datetime.strptime, does not fail when the cell is re-run on a column
# that has already been converted.
input_data["年月日"] = pd.to_datetime(input_data["年月日"], format="%m/%d/%Y")

# Rebind rather than sort in place; the row order must be chronological before
# any lagged feature or forward return is derived from it.
input_data = input_data.sort_values("年月日")

In [5]:
# Count the missing values in each column.
input_data.isna().sum()

證券代碼         0
年月日          0
開盤價(元)       0
最高價(元)       0
最低價(元)       0
            ..
投信買賣超(千股)    0
自營買賣超(千股)    0
外資總投資股率%     0
投信持股率%       0
自營持股率%       0
Length: 13, dtype: int64

In [6]:
def transform(x):
    """Strip comma separators from a cell that holds text.

    Parameters
    ----------
    x : object
        A single cell value. Strings such as ``"103,229"`` have every comma
        removed; any other value is passed through untouched.

    Returns
    -------
    object
        ``x`` without commas when it is a string, otherwise ``x`` itself.
    """
    # An explicit type check replaces the former bare ``except``, which hid
    # every kind of error (including KeyboardInterrupt), not just the
    # "value has no .replace" case it was meant for.
    if isinstance(x, str):
        return x.replace(",", "")
    return x

# Convert every cell from the third column onwards to float; string cells have
# their comma separators removed by transform() first.
input_data.iloc[:, 2:] = input_data.iloc[:, 2:].applymap(lambda x: float(transform(x)))

In [7]:
# Keep only the numeric columns: the first two columns (security code and date) are dropped.
# NOTE(review): this cell is not idempotent — running it twice drops two more columns.
input_data = input_data.iloc[:, 2:]

In [8]:
# English names for the eleven remaining columns, kept in their original order:
# prices and volume, net buy/sell per investor group, holding ratio per investor group.
columns = (
    ['open', 'high', 'low', 'close', 'volume']
    + ['foreign', 'trust', 'dealer']
    + ['foreign_ratio', 'trust_ratio', 'dealer_ratio']
)
input_data.columns = columns

In [9]:
# Technical indicators computed by the TA-Lib abstract functions, each called with
# the whole frame. Presumably each function picks the price/volume columns it needs
# by name — TODO confirm against the TA-Lib abstract API documentation.
# The order of these statements determines the column order of the final frame.
input_data["EMA_5"] = EMA(input_data, timeperiod = 5) # EMA(5)
input_data["EMA_10"] = EMA(input_data, timeperiod = 10) # EMA(10)
input_data["EMA_20"] = EMA(input_data, timeperiod = 20) # EMA(20)
input_data = pd.concat([input_data, BBANDS(input_data, 20, 2, 2)], axis = 1) # Bollinger Bands; positional args presumably timeperiod / nbdevup / nbdevdn — TODO confirm
input_data = pd.concat([input_data, STOCH(input_data)], axis = 1) # stochastic oscillator (KD values)
input_data["Momentum"] = MOM(input_data) # momentum
input_data["RSI"] = RSI(input_data) # RSI
input_data["WILLR"] = WILLR(input_data) # Williams indicator
input_data["ADOSC"] = ADOSC(input_data) # ADOSC

input_data.dropna(inplace = True) # drop the rows that contain missing values

In [10]:
# Preview the feature table after the indicator columns were added.
input_data.head()

Unnamed: 0,open,high,low,close,volume,foreign,trust,dealer,foreign_ratio,trust_ratio,...,EMA_20,upperband,middleband,lowerband,slowk,slowd,Momentum,RSI,WILLR,ADOSC
19,35.68,36.39,35.49,36.39,58212.0,38559.0,198.0,406.0,71.31,0.54,...,35.4565,38.787946,35.4565,32.125054,88.01956,77.289222,-0.32,47.382253,-10.754717,-32111.853521
20,36.71,37.22,36.39,36.9,110696.0,38869.0,1103.0,614.0,71.46,0.54,...,35.593976,38.450133,35.3705,32.290867,87.414695,85.294176,1.53,49.330626,-5.755396,-11745.196156
21,36.71,38.24,36.58,38.24,171328.0,103404.0,438.0,401.0,71.85,0.55,...,35.845978,38.521772,35.3865,32.251228,96.787149,90.740468,3.32,54.136038,-0.0,52167.817872
22,38.94,39.71,38.56,39.2,134709.0,90112.0,-120.0,598.0,72.19,0.55,...,36.165409,38.753049,35.4315,32.109951,93.542874,92.581573,4.34,57.263119,-6.335404,78416.786412
23,38.11,38.49,38.11,38.3,103197.0,16243.0,-2641.0,-201.0,72.25,0.54,...,36.368703,39.115066,35.5655,32.015934,85.618284,91.982769,4.27,53.575153,-17.515528,82026.20205


In [11]:
def lag_operator(data, lag):
    """Build a wide table holding each row's current and lagged column values.

    Every column ``c`` of ``data`` appears ``lag`` times in the result:
    ``c_1`` is the unshifted value, ``c_2`` the value one row earlier, ...,
    ``c_<lag>`` the value ``lag - 1`` rows earlier.

    Parameters
    ----------
    data : pandas.DataFrame
        Source table, ordered so that "one row earlier" means "one period
        earlier". It is not modified (the previous version renamed its columns
        in place, which made the calling cell non-repeatable).
    lag : int
        Total number of periods per column, the current one included.

    Returns
    -------
    pandas.DataFrame
        Columns ordered as all ``*_1``, then all ``*_2``, and so on. Rows that
        contain any missing value (notably the first ``lag - 1`` rows, which
        have no complete history) are dropped; the original index labels are
        kept.
    """
    # Column names are derived from `data` itself, not from the notebook-level
    # `input_data` the previous version silently depended on.
    frames = [data.add_suffix("_1")]
    for i in range(1, lag):
        frames.append(data.shift(i).add_suffix("_{}".format(i + 1)))

    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(frames, axis=1).dropna()

In [12]:
def shift_close(close_price, window_size):
    """Relative change of a price series over a fixed number of rows.

    For every position ``t >= window_size`` the value is
    ``(close_price[t] - close_price[t - window_size]) / close_price[t - window_size]``.
    The leading positions without a reference price are dropped and the result
    is re-indexed from 0, so entry ``k`` of the result is the change between
    row ``k`` and row ``k + window_size`` of the input.

    Parameters
    ----------
    close_price : pandas.Series
        Price series; it is not modified.
    window_size : int
        Number of rows between the reference price and the compared price.

    Returns
    -------
    pandas.Series
        The relative changes with a fresh 0-based index.
    """
    reference = close_price.shift(window_size)
    relative_change = ((close_price - reference) / reference).dropna()
    return relative_change.reset_index(drop=True)

In [30]:
def threshold_check_binary(close_price, ratio, low_bound, high_bound, iter_now):
    """Bisect for the threshold that leaves a given share of values above it.

    Each step takes the midpoint of ``[low_bound, high_bound]`` and measures
    the share of ``close_price`` strictly greater than it. If the share is too
    small the search continues in the lower half, if it is too large in the
    upper half.

    Parameters
    ----------
    close_price : array-like of numbers
        Values to be split (a pandas Series, NumPy array or plain sequence).
    ratio : float
        Target share of values that should lie above the threshold.
    low_bound, high_bound : float
        Initial search interval.
    iter_now : int
        Maximum number of halvings. A value of zero or less returns the
        midpoint of the initial interval.

    Returns
    -------
    float
        The first midpoint whose share equals ``ratio`` exactly, or the last
        midpoint examined once the halvings are used up.

    Raises
    ------
    ZeroDivisionError
        If ``close_price`` is empty.
    """
    values = np.asarray(close_price)
    total = len(values)

    # Iterative form of the former recursion, which called the undefined name
    # `threshold_check` and therefore raised NameError on the first halving.
    while True:
        threshold = (low_bound + high_bound) / 2
        rate = np.count_nonzero(values > threshold) / total
        # `<= 0` (rather than `== 0`) also terminates on a negative budget.
        if rate == ratio or iter_now <= 0:
            return threshold
        if rate < ratio:
            # Too few values above the cut-off: move it down.
            high_bound = threshold
        else:
            # Too many values above the cut-off: move it up.
            low_bound = threshold
        iter_now -= 1
    

In [31]:
def threshold_check_segment(close_price, ratio, low_bound, high_bound, section_num):
    """Grid search for the threshold that leaves a given share of values above it.

    ``section_num`` evenly spaced candidates between ``low_bound`` and
    ``high_bound`` (both included) are tried. For each one the share of
    ``close_price`` strictly greater than the candidate is compared with
    ``ratio``, and the candidate with the smallest squared difference wins
    (the first one on ties).

    Parameters
    ----------
    close_price : pandas.Series or numpy.ndarray
        Values to be split.
    ratio : float
        Target share of values that should lie above the threshold.
    low_bound, high_bound : float
        End points of the candidate grid.
    section_num : int
        Number of grid points.

    Returns
    -------
    numpy.float64
        The selected threshold (a grid point, not the share it achieves).
    """
    candidates = np.linspace(low_bound, high_bound, section_num)

    squared_errors = []
    for candidate in candidates:
        share_above = sum(close_price > candidate) / len(close_price)
        squared_errors.append((share_above - ratio) ** 2)

    # Position of the first smallest error.
    best_position = min(range(len(squared_errors)), key=squared_errors.__getitem__)
    return candidates[best_position]

In [15]:
# Build the feature matrix: every row holds the current values plus 19 lagged
# copies of each column (suffixes _1 .. _20).
X = lag_operator(input_data, 20)

In [32]:
window_size = 5
# Relative change of the close price over window_size rows, re-indexed from 0
# by shift_close.
Y = shift_close(X.close_1, window_size)

# Two independent searches over [0, 0.01] for the cut-off that leaves about 7/17
# of the returns above it (i.e. up : not-up odds of 7 : 10).
threshold_binary = threshold_check_binary(Y, 7/17, 0, 0.01, 10)
threshold_segment = threshold_check_segment(Y, 7/17, 0, 0.01, 10)

# `threshold` is used below and by the labelling cell, but was never assigned in
# the visible cells (and `threshold_section` did not exist at all), so a fresh
# kernel stopped here with NameError. The bisection result is the working cut-off.
threshold = threshold_binary
print(threshold)
print(threshold_segment)

# Odds of "up" versus "not up" under the chosen cut-off.
total_num = len(Y)
up_num = sum(Y > threshold)
up_num / (total_num - up_num)

0.009819335937500003
0.01


0.7003841229193342

In [24]:
# shift_close dropped window_size rows from Y and re-indexed it from 0, so keep only
# the first total_num rows of X; row k of X then pairs positionally with Y[k].
X = X.iloc[:total_num, :]

In [54]:
# Sanity check: features and labels must have the same number of rows.
print("  ",len(X), len(Y))

   2661 2661


In [60]:
# Binarise the returns in one step: 1 where the return exceeds the cut-off, else 0.
# The former two sequential masked assignments re-compared the freshly written 1s
# against `threshold` on the second pass, which is only correct while threshold < 1.
# The original float dtype is kept so downstream cells see the same label values.
Y = (Y > threshold).astype(Y.dtype)

# Train Model

In [27]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
#from confusion_matrix import *

In [63]:
# Split first, then fit the scaler on the training rows only, so the min/max of
# the held-out rows never leaks into the features the models are trained on.
# Test-set values may therefore fall slightly outside [0, 1].
x_train_raw, x_test_raw, y_train, y_test = train_test_split(X, Y, test_size = 0.3, random_state = 666)

scaler = MinMaxScaler()
scaler.fit(x_train_raw)
x_train = scaler.transform(x_train_raw)
x_test = scaler.transform(x_test_raw)

# Later cells pass the full matrix to cross_val_score and read X.shape, so X is
# still replaced by its scaled array form.
X = scaler.transform(X)

# NOTE(review): the split is a random shuffle, while neighbouring rows share most
# of their lagged features and their return windows overlap; a chronological
# split would give a more honest test score — confirm with the author.

## Random Forest

In [25]:
from sklearn.ensemble import RandomForestClassifier

In [26]:
# Baseline model: a small, depth-limited random forest with a fixed seed.
clf = RandomForestClassifier(
    n_estimators=3,
    criterion="gini",
    max_depth=7,
    max_features=0.7,
    min_samples_leaf=5,
    random_state=20,
)

clf.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=7, max_features=0.7, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=5, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=3, n_jobs=None,
            oob_score=False, random_state=20, verbose=0, warm_start=False)

In [27]:
from sklearn.metrics import accuracy_score

# Predictions of the baseline forest on the training and the test partition
y_predict_train = clf.predict(x_train)
y_predict_test = clf.predict(x_test)

# Share of correctly classified rows in each partition
train_acc = accuracy_score(y_train, y_predict_train)
test_acc = accuracy_score(y_test, y_predict_test)

print("Accuracy_train : %.2f%%" % (train_acc * 100.0),"\n"
      "Accuracy_test  : %.2f%%" % (test_acc * 100.0))

Accuracy_train : 75.89% 
Accuracy_test  : 55.57%


In [28]:
%%time
#使用RandomizedSearchCV在給定所有的模型參數空間中，透過隨機抽取參數的方式建立模型，最後選擇mse最佳者做為最終採用模型
from sklearn.model_selection import RandomizedSearchCV

#設定模型參數區間
rfr_param_grid = {"n_estimators" : np.arange(10, 31, 1),
                  "criterion" : ["entropy", "gini"],
                  "max_features" : ["auto", "sqrt", "log2"],
                  "max_depth" : np.arange(1, 31, 1),
                  "min_samples_leaf" : np.arange(1, 11, 1)
                 }

#同樣選擇隨機森林演算法，此處採用回歸模型
rfr = RandomForestClassifier(random_state = 666) #rfr == random forest regressor

#將模型導入GridSearchCV，同時考量交叉檢驗
rfr_GridSearchCV = RandomizedSearchCV(estimator = rfr, 
                                      param_distributions = rfr_param_grid, 
                                      scoring = "accuracy",
                                      cv = 10,
                                      n_iter = 30) #隨機抽取30組參數

#訓練GridSearchCV模型
rfr_GridSearchCV.fit(x_train, y_train)

#選擇GridSearchCV模型中表現最佳者做為最終採用模型
rfr = rfr_GridSearchCV.best_estimator_
rfr.fit(x_train, y_train)

Wall time: 1min 53s


In [29]:
# Predictions of the tuned forest on the training partition
y_predict_train = rfr.predict(x_train)

# Predictions of the tuned forest on the test partition
y_predict_test = rfr.predict(x_test)

# Share of correctly classified rows in each partition
train_acc = accuracy_score(y_train, y_predict_train )
test_acc = accuracy_score(y_test, y_predict_test)

print("Accuracy_train : %.2f%%" % (train_acc * 100.0),"\n"
      "Accuracy_test  : %.2f%%" % (test_acc * 100.0))

Accuracy_train : 99.95% 
Accuracy_test  : 68.84%


In [30]:
# 10-fold cross-validation of the tuned forest over the full X / Y (not only the
# training split); the mean fold score is displayed.
# NOTE(review): the recorded mean (~0.51) is far below the recorded test accuracy
# of the same model (68.84%) — worth investigating how the two splits differ.
scores = cross_val_score(rfr, X, Y, cv=10)
scores.mean()

0.5121566370877374

In [31]:
#confusion_matrix(confusion_matrix(y_test, y_predict_test), classes = ["1", "2"], normalize = True)

## XGBoost

In [32]:
from xgboost import XGBClassifier

In [33]:
xgb_clf = XGBClassifier(random_state = 666) # XGBoost classifier; only the seed is set explicitly

xgb_clf.fit(x_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic',
       random_state=666, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1)

In [34]:
# Predictions of the XGBoost model on the training partition
y_predict_train = xgb_clf.predict(x_train)

# Predictions of the XGBoost model on the test partition
y_predict_test = xgb_clf.predict(x_test)

# Share of correctly classified rows in each partition
train_acc = accuracy_score(y_train, y_predict_train )
test_acc = accuracy_score(y_test, y_predict_test)

print("Accuracy_train : %.2f%%" % (train_acc * 100.0),"\n"
      "Accuracy_test  : %.2f%%" % (test_acc * 100.0))

Accuracy_train : 90.28% 
Accuracy_test  : 64.46%


In [35]:
# 10-fold cross-validation of the XGBoost model over the full X / Y (not only the
# training split); the mean fold score is displayed.
scores = cross_val_score(xgb_clf, X, Y, cv=10)
scores.mean()

0.5377421346454965

## DNN

In [21]:
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import SGD
from keras import utils

In [37]:
batch_size = 100 # number of samples per gradient update
n_epochs = 10 # number of passes over the training data
# The labels are binary (0 = return at or below the threshold, 1 = above) and the
# network has two output units; the former value of 10 with its "digits 0-9"
# comment was left over from a different problem.
n_classes = 2

In [38]:
# Fully connected ("dense") classifier: three ReLU hidden layers of 512, 256 and
# 128 units followed by a 2-unit softmax output, one unit per class label.
model = Sequential([
    # the first layer has to declare the input dimension (one entry per feature column)
    Dense(units=512, activation='relu', input_shape=(X.shape[1],)),
    Dense(units=256, activation='relu'),
    Dense(units=128, activation='relu'),
    # the output layer needs exactly as many neurons as there are classes
    Dense(units=2, activation='softmax'),
])

# compile with plain SGD; the sparse loss is paired with the 0/1 labels used as-is
model.compile(loss='sparse_categorical_crossentropy', optimizer=SGD(lr=0.01), metrics=['accuracy'])

In [39]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 512)               236032    
_________________________________________________________________
dense_2 (Dense)              (None, 256)               131328    
_________________________________________________________________
dense_3 (Dense)              (None, 128)               32896     
_________________________________________________________________
dense_4 (Dense)              (None, 2)                 258       
Total params: 400,514
Trainable params: 400,514
Non-trainable params: 0
_________________________________________________________________


In [40]:
# Train in mini-batches; validation_split=0.4 sets aside 40% of x_train for
# validation (1117 training / 745 validation rows in the recorded run).
model.fit(x_train, y_train, batch_size=batch_size, epochs=n_epochs, validation_split=0.4)

Train on 1117 samples, validate on 745 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x18a54ce5c0>

In [41]:
# Evaluate on the held-out test partition. The model was compiled with
# metrics=['accuracy'], so scores[0] is printed as the loss and scores[1] as the accuracy.
scores = model.evaluate(x_test, y_test)

print('\nloss:', scores[0])
print('accuracy:', scores[1])


loss: 0.6736836202899565
accuracy: 0.5832290366683645


# LSTM


Using TensorFlow backend.


  
  


compilation time :  0.018995285034179688


  from ipykernel import kernelapp as app




ValueError: Error when checking input: expected lstm_1_input to have shape (None, 1) but got array with shape (1, 460)