In [1]:
import pandas as pd
df = pd.read_csv('stock_market_data/sp500/csv/BA.csv')

In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12938 entries, 0 to 12937
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Date            12938 non-null  object 
 1   Low             12938 non-null  float64
 2   Open            12938 non-null  float64
 3   Volume          12938 non-null  int64  
 4   High            12938 non-null  float64
 5   Close           12938 non-null  float64
 6   Adjusted Close  12938 non-null  float64
dtypes: float64(5), int64(1), object(1)
memory usage: 707.7+ KB


In [3]:
df_metrix = df[['High', 'Low', 'Open', 'Close', 'Volume']]
df_metrix.columns = ['high', 'low', 'open', 'close', 'volume']

In [4]:
# 前N天的變動量
timeperiod=5

# Metrix Preprocess
https://medium.com/ai股仔/用-python-快速計算-158-種技術指標-26f9579b8f3a


# Ta-Lib
https://mrjbq7.github.io/ta-lib/func.html

In [5]:
import talib
from talib import abstract

# RSI

In [6]:
rsi = abstract.RSI(df.Close, timeperiod)

# MOM
https://blog.csdn.net/The_Time_Runner/article/details/101512714    
https://en.wikipedia.org/wiki/Momentum_(technical_analysis)

In [7]:
mom = abstract.MOM(df.Close, timeperiod)

# KD

In [8]:
KD = abstract.STOCH(df_metrix)

# MACD 

In [9]:
MACD = abstract.MACD(df_metrix) 

In [10]:
frames = [df,
          pd.DataFrame({'rsi':rsi}), # rsi is an array
          pd.DataFrame({'mom': mom}), # mom is also an array
          KD, # KD is a 2-column dataframe
          MACD] # MACD is a  3-column dataframe

combined = pd.concat(frames, axis = 1)

# Add target

Target 為今日的收盤價格是否為過去N天的還要高，1:漲，0:跌

In [11]:
import numpy as np
combined['week_trend'] = np.where(combined.Close.shift(-timeperiod) > combined.Close, 1, 0)
combined = combined.dropna()

# Date to Timestamp

In [12]:
import datetime
def date_to_day(date_str):
    d = datetime.datetime.strptime(date_str, '%d-%m-%Y')
    return d.timestamp()

combined = combined.copy()
combined['Date'] = combined['Date'].apply(date_to_day)

# Split Data

Note: cannot randomly split the dataset because the order of time matters.

In [13]:
train_size = int(len(combined) * 0.75)
train = combined.iloc[:train_size, :]
test = combined.iloc[train_size:, :]

In [14]:
# 訓練樣本再分成目標序列 y 以及因子矩陣 X
train_X = train.drop('week_trend', axis = 1)
train_y = train.week_trend

# 測試樣本再分成目標序列 y 以及因子矩陣 X
test_X = test.drop('week_trend', axis = 1)
test_y = test.week_trend

In [15]:
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier(max_depth=6)
clf.fit(train_X, train_y)
prediction = clf.predict(test_X)

from sklearn.metrics import accuracy_score
accuracy_score(test_y, prediction)

0.5447784319801673

In [16]:
feature_importance = dict()
for i, c in enumerate(train_X.columns):
    feature_importance[c] = round(clf.feature_importances_[i] * 100, 2)
feature_importance

{'Date': 31.25,
 'Low': 0.85,
 'Open': 3.57,
 'Volume': 3.33,
 'High': 11.76,
 'Close': 3.25,
 'Adjusted Close': 6.61,
 'rsi': 6.52,
 'mom': 3.09,
 'slowk': 3.52,
 'slowd': 0.0,
 'macd': 7.03,
 'macdsignal': 13.24,
 'macdhist': 5.98}

# Drop unimportant features and train again

In [17]:
important_features = combined.drop(['Low', 'slowd', 'mom'], axis=1)
train = important_features.iloc[:train_size, :]
test = important_features.iloc[train_size:, :]
train_X = train.drop('week_trend', axis = 1)
train_y = train.week_trend
test_X = test.drop('week_trend', axis = 1)
test_y = test.week_trend

clf = DecisionTreeClassifier(max_depth=6)
clf.fit(train_X, train_y)
prediction = clf.predict(test_X)
accuracy_score(test_y, prediction)

0.5447784319801673