In [1]:
import pandas as pd
import numpy as np

In [2]:
input_df = pd.read_csv('../raw_data/processed/AAPL.csv')
input_df.head()

Unnamed: 0,ticker,date,RSI,Stochastic,Stochastic_signal,ADI,OBV,ATR,ADX,ADX_pos,ADX_neg,MACD,MACD_diff,MACD_signal,5TD_return,10TD_return,20TD_return
0,AAPL,1984-10-24,57.978007,76.690103,69.424914,-5139242000.0,387549732,0.003297,19.036526,35.250645,23.13537,-0.000432,0.001254,-0.001687,-0.05201,-0.018263,-0.119206
1,AAPL,1984-10-25,49.247645,56.625203,68.325221,-5322591000.0,204200382,0.003326,18.051296,32.458308,29.224107,-0.00039,0.001037,-0.001427,-0.009378,-0.022156,-0.060181
2,AAPL,1984-10-26,44.483899,43.374797,58.896701,-5425815000.0,71070262,0.003286,17.088681,30.521333,33.447714,-0.000548,0.000704,-0.001251,0.009726,-0.055397,-0.026218
3,AAPL,1984-10-29,45.183124,44.997296,48.332432,-5484525000.0,129780480,0.003117,16.194824,29.881216,32.746222,-0.000641,0.000488,-0.001129,0.0,-0.025925,-0.003162
4,AAPL,1984-10-30,48.093222,51.703624,46.691906,-5570994000.0,216248865,0.003048,15.133939,31.950924,31.104445,-0.000608,0.000417,-0.001025,0.04806,-0.060959,0.035265


## Split Data

In [3]:
def train_test_split(df, test_set_size):
    """
    Split the preprocessed stock data file into a train and test dataset
    INPUT: the dataframe to be split, and size of the test set in months or years ('3M' or '2Y')
    OUTPUT: returns a train_set and test_set dataframe, index is set to the date
    
    EXAMPLE: train_set, test_set = train_test_split(input_df, '3Y')
    """
    if not np.issubdtype(df['date'].dtype, np.datetime64):
        df['date'] = pd.to_datetime(df['date'], format=('%Y-%m-%d'))
    test_set = df.sort_values(by="date",ascending=True).set_index("date").last(test_set_size)
    train_set = df.drop(df.tail(len(test_set)).index).set_index("date")
    test_set.reset_index(inplace=True)
    train_set.reset_index(inplace=True)
    return train_set, test_set

In [4]:
train_set, test_set = train_test_split(input_df, '3Y')

## Binary Threshold

In [5]:
def returns_classification(return_column, returns_threshold):
    """
    Classify the returns versus a defined threshold, and returning either a 1 or 0
    INPUT: the dataframes column, and return threshold
    OUTPUT: returns a column with 1/0 binary classification 
    
    EXAMPLE: train_set['5TD_return_B'] = returns_classification(train_set['5TD_return'], 0.0006)
    """
    return (return_column > returns_threshold).astype(np.int)

In [6]:
train_set['5TD_return'] = returns_classification(train_set['5TD_return'], 0.0006)

In [7]:
train_set.head()

Unnamed: 0,date,ticker,RSI,Stochastic,Stochastic_signal,ADI,OBV,ATR,ADX,ADX_pos,ADX_neg,MACD,MACD_diff,MACD_signal,5TD_return,10TD_return,20TD_return
0,1984-10-24,AAPL,57.978007,76.690103,69.424914,-5139242000.0,387549732,0.003297,19.036526,35.250645,23.13537,-0.000432,0.001254,-0.001687,0,-0.018263,-0.119206
1,1984-10-25,AAPL,49.247645,56.625203,68.325221,-5322591000.0,204200382,0.003326,18.051296,32.458308,29.224107,-0.00039,0.001037,-0.001427,0,-0.022156,-0.060181
2,1984-10-26,AAPL,44.483899,43.374797,58.896701,-5425815000.0,71070262,0.003286,17.088681,30.521333,33.447714,-0.000548,0.000704,-0.001251,1,-0.055397,-0.026218
3,1984-10-29,AAPL,45.183124,44.997296,48.332432,-5484525000.0,129780480,0.003117,16.194824,29.881216,32.746222,-0.000641,0.000488,-0.001129,0,-0.025925,-0.003162
4,1984-10-30,AAPL,48.093222,51.703624,46.691906,-5570994000.0,216248865,0.003048,15.133939,31.950924,31.104445,-0.000608,0.000417,-0.001025,1,-0.060959,0.035265


## Scale Dataset

In [9]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

INPUT_COLS = ['RSI', 'Stochastic', 'Stochastic_signal', 'ADI','OBV', 'ATR', 'ADX', 'ADX_pos', 'ADX_neg', 'MACD', 'MACD_diff','MACD_signal', '5TD_return', '10TD_return', '20TD_return']

scaler = RobustScaler()

scaler.fit(train_set[INPUT_COLS])

train_set.loc[:, INPUT_COLS] = scaler.transform(train_set[INPUT_COLS])

In [10]:
scaler

RobustScaler()

In [12]:
train_set.head()

Unnamed: 0,date,ticker,RSI,Stochastic,Stochastic_signal,ADI,OBV,ATR,ADX,ADX_pos,ADX_neg,MACD,MACD_diff,MACD_signal,5TD_return,10TD_return,20TD_return
0,1984-10-24,AAPL,0.263514,0.34493,0.229024,0.279435,-0.702443,-0.079254,-0.406791,0.894287,0.097887,-0.109651,0.163234,-0.145903,-1.0,-0.285421,-0.946308
1,1984-10-25,AAPL,-0.19804,-0.01388,0.20811,0.276948,-0.703459,-0.079075,-0.478593,0.66788,0.619549,-0.108427,0.132317,-0.138488,-1.0,-0.325153,-0.546074
2,1984-10-26,AAPL,-0.449887,-0.25083,0.028799,0.275548,-0.704197,-0.079319,-0.548747,0.510827,0.981414,-0.112992,0.08477,-0.133458,0.0,-0.664431,-0.315778
3,1984-10-29,AAPL,-0.412921,-0.221816,-0.172112,0.274752,-0.703872,-0.080339,-0.613889,0.458925,0.921313,-0.115696,0.054067,-0.129969,-1.0,-0.363621,-0.159434
4,1984-10-30,AAPL,-0.259071,-0.10189,-0.203312,0.273579,-0.703393,-0.080756,-0.691205,0.62674,0.780651,-0.114742,0.043918,-0.12699,0.0,-0.721201,0.101128


## Window Dataset

#### Ian Version

In [7]:
def window_column(df_series, window_size=30, stride_size=5):
    """
    Turns data series into array of windowed arrays
    INPUT: the input data series, window size, stride size
    OUTPUT: array of windowed arrays 
    
    EXAMPLE: y = window_column(train_set['RSI'], 30, 5)
    """
    np_array = df_series.to_numpy()
    nrows = ((np_array.size-window_size)//stride_size)+1
    n = np_array.strides[0]
    return np.lib.stride_tricks.as_strided(
        np_array, shape=(nrows, window_size), strides=(stride_size*n, n))


def window_dataframe(df, window=30, stride_size=5, target=['5TD_return'], feature_cols=['RSI', 'Stochastic', 'Stochastic_signal', 'ADI',
       'OBV', 'ATR', 'ADX', 'ADX_pos', 'ADX_neg', 'MACD', 'MACD_diff', 'MACD_signal']):
    """
    Turns the input dataframe into an array of windowed arrays
    INPUT: the input dataframe, window size, stride size, target column, feature columns
    OUTPUT: array of windowed arrays 
    
    EXAMPLE: windowed_array = window_dataframe(train_set)
    """
    if not np.issubdtype(df['date'].dtype, np.datetime64):
        df['date'] = pd.to_datetime(df['date'], format=('%Y-%m-%d'))
    inverse_df = df.sort_values(by="date", ascending=False)
    feature_array = []
    target_array = []
    for column in inverse_df:
        if column in feature_cols: 
            feature_array.append(window_column(inverse_df[column], window, stride_size))
            
        elif column in target:
            target_array.append(window_column(inverse_df[column], window, stride_size))
            
    
    return np.array(feature_array), np.array(target_array)


In [15]:
X_ian, y_ian = window_dataframe(scaled_X)

#### Marcin Version

In [14]:
def build_arrays(df, time_window=5, stride=3, input_cols=['RSI', 'Stochastic', 'Stochastic_signal', 'ADI',
       'OBV', 'ATR', 'ADX', 'ADX_pos', 'ADX_neg', 'MACD', 'MACD_diff',
       'MACD_signal'] , target_col='5TD_return'):
    output_array = []
    target_array = []
    df_sorted = df.sort_values('date', ascending=False)
    df_sorted.reset_index(drop=True, inplace=True)
    for row in range(0, len(df), stride):
        df_slice = df_sorted.iloc[row: row + time_window]
        if df_slice.shape[0]==time_window:
            output_array.append(np.array(df_slice[input_cols].values))
            target_array.append(df_slice[target_col].iloc[0])
    return np.array(output_array), np.array(target_array)

In [19]:
X_marcin, y_marcin = build_arrays(scaled_X)

## Build Model

In [20]:
from tensorflow.keras import Sequential
from tensorflow.keras import layers, models 
from tensorflow.keras.optimizers import RMSprop, Adam

optim = RMSprop(learning_rate=0.00001)

In [28]:
model = Sequential()

model.add(layers.LSTM(units=10,  activation='tanh')) 
model.add(layers.Dense(1, activation='relu'))

model.compile(loss='binary_crossentropy', optimizer=optim, metrics=['accuracy'])

# –– The following returns an error
model.fit(X_marcin, y_marcin)



<tensorflow.python.keras.callbacks.History at 0x2035c26bd88>

In [29]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_3 (LSTM)                (None, 10)                920       
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 11        
Total params: 931
Trainable params: 931
Non-trainable params: 0
_________________________________________________________________


In [30]:
col_to_scale_df = test_set.drop(columns=['ticker', 'date', '5TD_return', '10TD_return', '20TD_return'])
col_to_scale = list(col_to_scale_df)

scaled_df = test_set

for col in col_to_scale:
    scaled_df[col] = scaler.transform(scaled_df[[col]])

In [32]:
test_set.head()

Unnamed: 0,date,ticker,RSI,Stochastic,Stochastic_signal,ADI,OBV,ATR,ADX,ADX_pos,ADX_neg,MACD,MACD_diff,MACD_signal,5TD_return,10TD_return,20TD_return
0,2018-01-02,AAPL,52.023193,38.152401,23.652648,65334150000.0,254106400000.0,0.580472,13.608768,25.773273,27.130529,0.102056,-0.121981,0.224037,0.012041,0.039743,-0.027992
1,2018-01-03,AAPL,51.94029,37.787056,27.077955,65238010000.0,253984000000.0,0.583509,13.326351,30.417985,25.061481,0.09854,-0.100397,0.198938,0.011971,0.040884,-0.025872
2,2018-01-04,AAPL,54.143797,48.016701,41.31872,65272810000.0,254076900000.0,0.566044,13.064107,29.116762,23.989399,0.110298,-0.070912,0.18121,0.012947,0.031372,-0.072472
3,2018-01-05,AAPL,58.989231,72.494781,52.76618,65338080000.0,254174900000.0,0.565755,13.494921,32.807955,22.287245,0.155667,-0.020435,0.176101,0.011877,0.011473,-0.105798
4,2018-01-08,AAPL,57.011158,65.031315,61.847599,65299330000.0,254089600000.0,0.55413,13.975125,31.838404,21.129476,0.17803,0.001543,0.176487,0.01032,0.015159,-0.065164


In [31]:
scaled_df

Unnamed: 0,date,ticker,RSI,Stochastic,Stochastic_signal,ADI,OBV,ATR,ADX,ADX_pos,ADX_neg,MACD,MACD_diff,MACD_signal,5TD_return,10TD_return,20TD_return
0,2018-01-02,AAPL,52.023193,38.152401,23.652648,6.533415e+10,2.541064e+11,0.580472,13.608768,25.773273,27.130529,0.102056,-0.121981,0.224037,0.012041,0.039743,-0.027992
1,2018-01-03,AAPL,51.940290,37.787056,27.077955,6.523801e+10,2.539840e+11,0.583509,13.326351,30.417985,25.061481,0.098540,-0.100397,0.198938,0.011971,0.040884,-0.025872
2,2018-01-04,AAPL,54.143797,48.016701,41.318720,6.527281e+10,2.540769e+11,0.566044,13.064107,29.116762,23.989399,0.110298,-0.070912,0.181210,0.012947,0.031372,-0.072472
3,2018-01-05,AAPL,58.989231,72.494781,52.766180,6.533808e+10,2.541749e+11,0.565755,13.494921,32.807955,22.287245,0.155667,-0.020435,0.176101,0.011877,0.011473,-0.105798
4,2018-01-08,AAPL,57.011158,65.031315,61.847599,6.529933e+10,2.540896e+11,0.554130,13.975125,31.838404,21.129476,0.178030,0.001543,0.176487,0.010320,0.015159,-0.065164
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
746,2020-12-17,AAPL,67.111474,93.108849,96.114229,6.962657e+10,2.576077e+11,2.967725,18.541790,31.807016,14.677821,2.569341,0.616523,1.952818,0.025408,0.005517,-0.006760
747,2020-12-18,AAPL,61.153688,69.435737,85.978511,6.950316e+10,2.574151e+11,2.968602,18.901932,29.526363,18.257190,2.573448,0.496504,2.076944,0.079231,0.034385,0.042438
748,2020-12-21,AAPL,63.817814,85.683987,82.742858,6.962042e+10,2.575364e+11,3.103773,18.119520,26.223295,22.361683,2.672979,0.476828,2.196151,0.051782,-0.012712,0.067379
749,2020-12-22,AAPL,69.105683,82.286917,79.135547,6.960991e+10,2.577057e+11,3.323147,18.952436,35.843505,19.393683,3.011667,0.652413,2.359254,0.013952,-0.007279,0.054519


In [None]:
y_new = model.predict(X_new)