In [1]:
import numpy as np
import pandas as pd
import pandas_datareader as pdr
import matplotlib.pyplot as plt
import talib
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import RFE
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, accuracy_score
from datetime import date,datetime

In [2]:
INDEX = ['^N225', '^HSI', '^DJI', '000001.SS']
start_date = '2016-01-01'
end_date = '2019-01-01'
scaler = MinMaxScaler((0,1))
start_date = datetime.strptime(start_date, "%Y-%m-%d")
end_date = datetime.strptime(end_date, "%Y-%m-%d")
shift_list = [1,3,7,10,30,60,90]
features_list = ['macd','RSI','STOCH_slowk','EMA','OBV','ADLine','ADX','SMA','CMO','CCI']

In [3]:
def prepare_index(INDEX_NAME, start_date, end_date):
    index = pdr.get_data_yahoo(INDEX_NAME, start=start_date, end=end_date)
    index = index[['Close','High','Low','Volume','Open']]
    index.reset_index(inplace=True)
    index = macd(index, 12, 26, 9) #fast=12 slow=26 signal=9
    index = rsi(index, 14) #time=14
    index = ema(index, 14) #time=14
    index = stoch(index) 
    index = obv(index) 
    index = ad(index)
    index = adx(index, 14) #time=14
    index = sma(index, 14) #time=14
    index = cmo(index, 14) #time=14
    index = cci(index, 14) #time=14
    index = index[['Date','Close','macd','RSI','STOCH_slowk','EMA','OBV','ADLine','ADX','SMA','CMO','CCI']]
    index.set_index("Date", inplace=True)
    index = index.bfill(axis ='rows')

    return index
    
def target(df, col="Close", shift=-1, sideway=0):
    return df.assign(Target = np.where( df[col].shift(shift) > df[col] * (1 + sideway), 1, 0))

def macd(idx, fast, slow, signal):
    macd, macdsignal, macdhist = talib.MACD(idx['Close'], fastperiod=fast, slowperiod=slow, signalperiod=signal)
    d = pd.DataFrame(macd)
    d = d.assign(signal=macdsignal)
    d = d.rename({0:'macd'},axis=1)
    idx = idx.assign(macd=d['macd'])
    return idx

def rsi(idx, time):
    RSI= talib.RSI(idx['Close'],timeperiod=time)
    RSI = RSI.to_frame()
    RSI = RSI.rename({0:'RSI'},axis=1)
    idx = idx.assign(RSI=RSI['RSI'])
    return idx

def ema(idx, time):
    real = talib.EMA(idx['Close'], timeperiod=time)
    idx = idx.assign(EMA=real)
    return idx

def stoch(idx):
    slowk, slowd = talib.STOCH(idx['High'], idx['Low'], idx['Close'])
    idx = idx.assign(STOCH_slowk=slowk)
    return idx

def obv(idx):
    real = talib.OBV(idx['Close'], idx['Volume'])
    idx = idx.assign(OBV=real)
    return idx

def ad(idx):
    real = talib.AD(idx['High'], idx['Low'], idx['Close'], idx['Volume'])
    idx = idx.assign(ADLine=real)
    return idx

def adx(idx, time):
    real = talib.ADX(idx['High'], idx['Low'], idx['Close'], timeperiod=time)
    idx = idx.assign(ADX=real)
    return idx

def aroon(idx, time):
    aroondown, aroonup = talib.AROON(idx['High'], idx['Low'], timeperiod=time)
    
def sma(idx, time):
    real = talib.SMA(idx['Close'], timeperiod=time)
    idx = idx.assign(SMA=real)
    return idx

def cmo(idx, time):
    real = talib.CMO(idx['Close'], timeperiod=time)
    idx = idx.assign(CMO=real)
    return idx

def cci(idx, time):
    real = talib.CCI(idx['High'], idx['Low'], idx['Close'], timeperiod=time)
    idx = idx.assign(CCI=real)
    return idx

In [4]:
index_data = {}
for shift in shift_list:
    for index_name in INDEX:
        name = index_name + "-" + str(shift)
        index_data[name] = prepare_index(index_name, start_date=start_date, end_date=end_date)
        index_data[name] = target(index_data[name],shift=shift)

<h2>Feature selection for Classification Model</h2> 
(Random Forest, Logistic, Decision Tree)

In [5]:
hit_features = pd.Series(np.zeros(len(features_list)), index=features_list)
for shift in shift_list:
    rf_model = RandomForestClassifier(random_state=0)
    lg_model = LogisticRegression(random_state=0)
    dt_model = DecisionTreeClassifier(random_state=0)
    class_model = [rf_model, lg_model, dt_model]
    for model in class_model:
        rfe = RFE(estimator=model, n_features_to_select=5)
        
        for index_name in INDEX:
            name = index_name + "-" + str(shift)
            index = index_data[name]
            X = index[features_list]
            y = index[["Target"]]
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle = False, random_state=0)
            X_train = scaler.fit_transform(X_train)
            y_train = scaler.fit_transform(y_train)
            rfe = rfe.fit(X_train, y_train.ravel())
        
        temp = pd.Series(rfe.support_, index = X.columns.to_list())
        for k in temp.keys(): 
            if temp[k] == True: hit_features[k] += 1
hit_features

macd           15.0
RSI            13.0
STOCH_slowk    11.0
EMA             9.0
OBV             7.0
ADLine         10.0
ADX             7.0
SMA             7.0
CMO            12.0
CCI            14.0
dtype: float64

<h2>Feature selection for Linear Regression Model</h2>

In [6]:
for shift in shift_list:
    ln_model = LinearRegression()
    rfe = RFE(estimator=ln_model, n_features_to_select=5)
    for index_name in INDEX:
        name = index_name + "-" + str(shift)
        index = index_data[name]
        X = index.iloc[:-shift,:][features_list]
        y = index.iloc[shift:,:][["Close"]]
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle = False, random_state=0)
        X_train = scaler.fit_transform(X_train)
        y_train = scaler.fit_transform(y_train)
        rfe = rfe.fit(X_train, y_train.ravel())
    temp = pd.Series(rfe.support_, index = X.columns.to_list())
    for k in temp.keys(): 
        if temp[k] == True: hit_features[k] += 1
hit_features

macd           16.0
RSI            16.0
STOCH_slowk    11.0
EMA            16.0
OBV            13.0
ADLine         17.0
ADX             7.0
SMA            11.0
CMO            16.0
CCI            17.0
dtype: float64

In [12]:
selected_features = hit_features[hit_features > 15].index.tolist()
print("Selected features are :",selected_features)

Selected features are : ['macd', 'RSI', 'EMA', 'ADLine', 'CMO', 'CCI']
