In [1]:
import pandas as pd
df = pd.read_csv('stock_market_data/sp500/csv/GOOG.csv')

In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4194 entries, 0 to 4193
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Date            4194 non-null   object 
 1   Low             4194 non-null   float64
 2   Open            4194 non-null   float64
 3   Volume          4194 non-null   int64  
 4   High            4194 non-null   float64
 5   Close           4194 non-null   float64
 6   Adjusted Close  4194 non-null   float64
dtypes: float64(5), int64(1), object(1)
memory usage: 229.5+ KB


In [3]:
df_metrix = df[['High', 'Low', 'Open', 'Close', 'Volume']]
df_metrix.columns = ['high', 'low', 'open', 'close', 'volume']

In [4]:
# 前N天的變動量
timeperiod=5

# Metrix Preprocess
https://medium.com/ai股仔/用-python-快速計算-158-種技術指標-26f9579b8f3a


# Ta-Lib
https://mrjbq7.github.io/ta-lib/func.html

In [5]:
import talib
from talib import abstract

# RSI

In [6]:
RSI = abstract.RSI(df.Close, timeperiod)
RSI = pd.DataFrame({'RSI' : RSI})

# MOM

In [7]:
MOM = abstract.MOM(df.Close, timeperiod)
MOM = pd.DataFrame({'MOM':MOM})

# KD

In [8]:
KD = abstract.STOCH(df_metrix)
KD = pd.DataFrame(KD) # KD has slowd and slowk

# MACD 

In [9]:
MACD = abstract.MACD(df_metrix) 
MACD = pd.DataFrame(MACD) # MACD has its own column names

# ADX

In [10]:
ADX = abstract.ADX(df_metrix)
ADX = pd.DataFrame({'ADX': ADX})

# Simple Moving Average

In [11]:
SMA = abstract.SMA(df.Close)
SMA = pd.DataFrame({'SMA': SMA})

# Bollinger Bands

In [12]:
from talib import MA_Type
upper, middle, lower = talib.BBANDS(df.Close, matype=MA_Type.T3)
bb_df = pd.DataFrame({ \
    'upper_bb' : upper,
    'middel_bb' : middle,
    'lower_bb' : lower})

In [13]:
frames = [df,
          RSI, 
          MOM, 
          KD, 
          MACD,
          ADX,
          SMA,
          bb_df]

combined = pd.concat(frames, axis = 1)

# Add target

Target 為今日的收盤價格是否為過去N天的還要高，1:漲，0:跌

In [14]:
import numpy as np
combined['week_trend'] = np.where(combined.Close.shift(-timeperiod) > combined.Close, 1, 0)
combined = combined.dropna()

# Date to Timestamp

In [15]:
import datetime
def date_to_day(date_str):
    d = datetime.datetime.strptime(date_str, '%d-%m-%Y')
    return d.timestamp()

combined = combined.copy()
combined['Date'] = combined['Date'].apply(date_to_day)

# Split Data

Note: cannot randomly split the dataset because the order of time matters.

In [16]:
train_size = int(len(combined) * 0.75)
train = combined.iloc[:train_size, :]
test = combined.iloc[train_size:, :]

In [17]:
# 訓練樣本再分成目標序列 y 以及因子矩陣 X
train_X = train.drop('week_trend', axis = 1)
train_y = train.week_trend

# 測試樣本再分成目標序列 y 以及因子矩陣 X
test_X = test.drop('week_trend', axis = 1)
test_y = test.week_trend

In [18]:
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier(max_depth=5)
clf.fit(train_X, train_y)
prediction = clf.predict(test_X)

from sklearn.metrics import accuracy_score
accuracy_score(test_y, prediction)

0.5206532180595581

In [19]:
feature_importance = dict()
for i, c in enumerate(train_X.columns):
    feature_importance[c] = round(clf.feature_importances_[i] * 100, 2)
feature_importance

{'Date': 2.77,
 'Low': 0.0,
 'Open': 0.0,
 'Volume': 12.06,
 'High': 15.11,
 'Close': 0.0,
 'Adjusted Close': 2.35,
 'RSI': 5.84,
 'MOM': 0.0,
 'slowk': 0.0,
 'slowd': 0.0,
 'macd': 0.0,
 'macdsignal': 16.11,
 'macdhist': 9.53,
 'ADX': 23.73,
 'SMA': 4.38,
 'upper_bb': 5.09,
 'middel_bb': 0.0,
 'lower_bb': 3.03}

# Drop unimportant features and train again

In [20]:
important_features = combined.drop(['Low', 'Open', 'High', 'Close', 'slowd', 'macd', 'SMA', 'middel_bb'], axis=1)
train = important_features.iloc[:train_size, :]
test = important_features.iloc[train_size:, :]
train_X = train.drop('week_trend', axis = 1)
train_y = train.week_trend
test_X = test.drop('week_trend', axis = 1)
test_y = test.week_trend

clf = DecisionTreeClassifier(max_depth=6)
clf.fit(train_X, train_y)
prediction = clf.predict(test_X)
accuracy_score(test_y, prediction)

0.5485110470701249