In [4]:
import pandas as pd
df = pd.read_csv('stock_market_data/sp500/csv/GOOG.csv')

In [5]:
df

Unnamed: 0,Date,Low,Open,Volume,High,Close,Adjusted Close
0,19-08-2004,47.800831,49.813290,44871361,51.835709,49.982655,49.982655
1,20-08-2004,50.062355,50.316402,22942874,54.336334,53.952770,53.952770
2,23-08-2004,54.321388,55.168217,18342897,56.528118,54.495735,54.495735
3,24-08-2004,51.591621,55.412300,15319808,55.591629,52.239197,52.239197
4,25-08-2004,51.746044,52.284027,9232276,53.798351,52.802086,52.802086
...,...,...,...,...,...,...,...
4189,12-04-2021,2238.465088,2266.250000,1565900,2275.320068,2254.790039,2254.790039
4190,13-04-2021,2256.090088,2261.469971,1165500,2277.209961,2267.270020,2267.270020
4191,14-04-2021,2249.189941,2275.159912,1011000,2277.989990,2254.840088,2254.840088
4192,15-04-2021,2266.000000,2276.979980,1373600,2306.596924,2296.659912,2296.659912


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4194 entries, 0 to 4193
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Date            4194 non-null   object 
 1   Low             4194 non-null   float64
 2   Open            4194 non-null   float64
 3   Volume          4194 non-null   int64  
 4   High            4194 non-null   float64
 5   Close           4194 non-null   float64
 6   Adjusted Close  4194 non-null   float64
dtypes: float64(5), int64(1), object(1)
memory usage: 229.5+ KB


In [7]:
df_metrix = df[['High', 'Low', 'Open', 'Close', 'Volume']]
df_metrix.columns = ['high', 'low', 'open', 'close', 'volume']

In [8]:
timeperiod=5

# Generate other indicators

Here I am going to use the TA-lib, which helps us to calculate those stock market indicators. 


In [9]:
import talib
from talib import abstract

# RSI

In [10]:
RSI = abstract.RSI(df.Close, timeperiod)
RSI = pd.DataFrame({'RSI' : RSI})

# MOM

In [11]:
MOM = abstract.MOM(df.Close, timeperiod)
MOM = pd.DataFrame({'MOM':MOM})

# KD

In [12]:
KD = abstract.STOCH(df_metrix)
KD = pd.DataFrame(KD) # KD has slowd and slowk

# MACD 

In [13]:
MACD = abstract.MACD(df_metrix) 
MACD = pd.DataFrame(MACD) # MACD has its own column names

# ADX

In [14]:
ADX = abstract.ADX(df_metrix)
ADX = pd.DataFrame({'ADX': ADX})

# Simple Moving Average

In [15]:
SMA = abstract.SMA(df.Close)
SMA = pd.DataFrame({'SMA': SMA})

# Bollinger Bands

In [16]:
from talib import MA_Type
upper, middle, lower = talib.BBANDS(df.Close, matype=MA_Type.T3)
bb_df = pd.DataFrame({ \
    'upper_bb' : upper,
    'middel_bb' : middle,
    'lower_bb' : lower})

In [17]:
frames = [df,
          RSI, 
          MOM, 
          KD, 
          MACD,
          ADX,
          SMA,
          bb_df]

combined = pd.concat(frames, axis = 1)

# Target

In [18]:
import numpy as np
combined['week_trend'] = np.where(combined.Close.shift(-timeperiod) > combined.Close, 1, 0)
combined = combined.dropna()

In [20]:
dd = combined[['Close', 'week_trend']]

In [24]:
dd.iloc[0:20, :]

Unnamed: 0,Close,week_trend
33,68.284058,1
34,69.165749,1
35,68.607841,1
36,67.377457,1
37,68.443459,1
38,70.18692,0
39,70.734871,1
40,71.785927,1
41,74.301498,1
42,73.693779,1


# Date to Timestamp

In [None]:
th = (date64 - np.datetime64('1970-01-01T00:00:00Z')) / np.timedelta64(1, 'h')
  
print("Printing the converted datetime in Timestamp in hour:",


In [15]:
import datetime
def date_to_day(date_str):
    d = datetime.datetime.strptime(date_str, '%d-%m-%Y')
    return d.timestamp()

combined = combined.copy()
combined['Date'] = combined['Date'].apply(date_to_day)

# Split Data

Note: cannot randomly split the dataset because the order of time matters.

In [16]:
train_size = int(len(combined) * 0.75)
train = combined.iloc[:train_size, :]
test = combined.iloc[train_size:, :]

In [17]:

train_X = train.drop('week_trend', axis = 1)
train_y = train.week_trend

test_X = test.drop('week_trend', axis = 1)
test_y = test.week_trend

In [18]:
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier(max_depth=5)
clf.fit(train_X, train_y)
prediction = clf.predict(test_X)

from sklearn.metrics import accuracy_score
accuracy_score(test_y, prediction)

0.5206532180595581

In [19]:
feature_importance = dict()
for i, c in enumerate(train_X.columns):
    feature_importance[c] = round(clf.feature_importances_[i] * 100, 2)
feature_importance

{'Date': 2.77,
 'Low': 0.0,
 'Open': 0.0,
 'Volume': 12.06,
 'High': 15.11,
 'Close': 0.0,
 'Adjusted Close': 2.35,
 'RSI': 5.84,
 'MOM': 0.0,
 'slowk': 0.0,
 'slowd': 0.0,
 'macd': 0.0,
 'macdsignal': 16.11,
 'macdhist': 9.53,
 'ADX': 23.73,
 'SMA': 4.38,
 'upper_bb': 5.09,
 'middel_bb': 0.0,
 'lower_bb': 3.03}

# Drop unimportant features and train again

Because a stock dataset may have different feature importance. Here I just want to see if I drop those unimportant features, would the accuracy raise?   
  
Disappointedly, it didn't.

In [20]:
important_features = combined.drop(['Low', 'Open', 'High', 'Close', 'slowd', 'macd', 'SMA', 'middel_bb'], axis=1)
train = important_features.iloc[:train_size, :]
test = important_features.iloc[train_size:, :]
train_X = train.drop('week_trend', axis = 1)
train_y = train.week_trend
test_X = test.drop('week_trend', axis = 1)
test_y = test.week_trend

clf = DecisionTreeClassifier(max_depth=6)
clf.fit(train_X, train_y)
prediction = clf.predict(test_X)
accuracy_score(test_y, prediction)

0.5485110470701249