In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the pre-processed AAPL file; the path is relative to the notebook's working directory.
input_df = pd.read_csv('../raw_data/processed/AAPL.csv')
# Preview the first rows to check the columns that were read.
input_df.head()

Unnamed: 0,ticker,date,RSI,Stochastic,Stochastic_signal,ADI,OBV,ATR,ADX,ADX_pos,ADX_neg,MACD,MACD_diff,MACD_signal,5TD_return,10TD_return,20TD_return
0,AAPL,1984-10-24,57.978007,76.690103,69.424914,-5139242000.0,387549732,0.003297,19.036526,35.250645,23.13537,-0.000432,0.001254,-0.001687,-0.05201,-0.018263,-0.119206
1,AAPL,1984-10-25,49.247645,56.625203,68.325221,-5322591000.0,204200382,0.003326,18.051296,32.458308,29.224107,-0.00039,0.001037,-0.001427,-0.009378,-0.022156,-0.060181
2,AAPL,1984-10-26,44.483899,43.374797,58.896701,-5425815000.0,71070262,0.003286,17.088681,30.521333,33.447714,-0.000548,0.000704,-0.001251,0.009726,-0.055397,-0.026218
3,AAPL,1984-10-29,45.183124,44.997296,48.332432,-5484525000.0,129780480,0.003117,16.194824,29.881216,32.746222,-0.000641,0.000488,-0.001129,0.0,-0.025925,-0.003162
4,AAPL,1984-10-30,48.093222,51.703624,46.691906,-5570994000.0,216248865,0.003048,15.133939,31.950924,31.104445,-0.000608,0.000417,-0.001025,0.04806,-0.060959,0.035265


## Split Data

In [3]:
def train_test_split(df, test_set_size):
    """
    Split the preprocessed stock data file into a train and test dataset.

    The rows whose date falls within the last ``test_set_size`` (counted back
    from the latest date found in ``df``) form the test set; all other rows
    form the train set. The input dataframe is not modified.

    INPUT: the dataframe to be split (needs a 'date' column), and size of the
           test set in calendar months or years ('3M' or '2Y'). Any other value
           is passed to pandas' ``to_offset`` (for example '30D' or a DateOffset).
    OUTPUT: returns a train_set and test_set dataframe, both sorted by date
            (oldest first), with 'date' as the first column and a fresh
            0..n-1 integer index

    EXAMPLE: train_set, test_set = train_test_split(input_df, '3Y')
    """
    dates = df['date']
    if not pd.api.types.is_datetime64_any_dtype(dates):
        dates = pd.to_datetime(dates, format='%Y-%m-%d')

    # assign() builds a new frame, so the caller's 'date' column keeps its dtype.
    # Both halves are cut from this single date-sorted frame so they can never
    # overlap or miss rows, whatever the row order of the input was.
    ordered = df.assign(date=dates).sort_values(by='date', ascending=True, kind='mergesort')
    ordered = ordered[['date'] + [col for col in ordered.columns if col != 'date']]

    # '<n>M' / '<n>Y' are read as calendar months / years; a bare 'M' or 'Y' means 1.
    span = None
    if isinstance(test_set_size, str):
        spec = test_set_size.strip()
        unit, count = spec[-1:], spec[:-1].strip()
        if unit in ('M', 'Y') and (count == '' or count.isdigit()):
            periods = int(count) if count else 1
            if unit == 'M':
                span = pd.DateOffset(months=periods)
            else:
                span = pd.DateOffset(years=periods)
    if span is None:
        span = pd.tseries.frequencies.to_offset(test_set_size)

    # Rows strictly after the cutoff are the test period.
    cutoff = ordered['date'].max() - span
    in_test = ordered['date'] > cutoff

    test_set = ordered[in_test].reset_index(drop=True)
    train_set = ordered[~in_test].reset_index(drop=True)
    return train_set, test_set

In [4]:
# Hold out the most recent '3Y' of rows as the test set; the rest is the train set.
train_set, test_set = train_test_split(input_df, '3Y')

## Binary Threshold

In [5]:
def returns_classification(return_column, returns_threshold):
    """
    Classify the returns versus a defined threshold, returning either a 1 or 0.

    A value is labelled 1 when it is strictly greater than the threshold and
    0 otherwise (so a value equal to the threshold is labelled 0).

    INPUT: the dataframe column holding the returns, and the return threshold
    OUTPUT: returns a column of the same length with 1/0 integer classification

    EXAMPLE: train_set['5TD_return_B'] = returns_classification(train_set['5TD_return'], 0.0006)
    """
    # The builtin int replaces np.int, an alias that NumPy removed in release 1.24.
    return (return_column > returns_threshold).astype(int)

In [6]:
# Overwrite the raw 5-day return with its 0/1 class (1 when the return is above 0.0006).
# The column name is kept because the windowing functions further down use
# '5TD_return' as their default target column.
train_set['5TD_return'] = returns_classification(train_set['5TD_return'], 0.0006)

In [9]:
# Preview the train set to confirm that '5TD_return' now holds 0/1 labels.
train_set.head()

Unnamed: 0,date,ticker,RSI,Stochastic,Stochastic_signal,ADI,OBV,ATR,ADX,ADX_pos,ADX_neg,MACD,MACD_diff,MACD_signal,5TD_return,10TD_return,20TD_return
0,1984-10-24,AAPL,57.978007,76.690103,69.424914,-5139242000.0,387549732,0.003297,19.036526,35.250645,23.13537,-0.000432,0.001254,-0.001687,0,-0.018263,-0.119206
1,1984-10-25,AAPL,49.247645,56.625203,68.325221,-5322591000.0,204200382,0.003326,18.051296,32.458308,29.224107,-0.00039,0.001037,-0.001427,0,-0.022156,-0.060181
2,1984-10-26,AAPL,44.483899,43.374797,58.896701,-5425815000.0,71070262,0.003286,17.088681,30.521333,33.447714,-0.000548,0.000704,-0.001251,1,-0.055397,-0.026218
3,1984-10-29,AAPL,45.183124,44.997296,48.332432,-5484525000.0,129780480,0.003117,16.194824,29.881216,32.746222,-0.000641,0.000488,-0.001129,0,-0.025925,-0.003162
4,1984-10-30,AAPL,48.093222,51.703624,46.691906,-5570994000.0,216248865,0.003048,15.133939,31.950924,31.104445,-0.000608,0.000417,-0.001025,1,-0.060959,0.035265


## Scale Dataset

In [10]:
from sklearn.preprocessing import StandardScaler

def std_scaler(df):
    """
    Scale the data with SKlearn StandardScaler.

    Every column except 'ticker', 'date' and the three return columns
    ('5TD_return', '10TD_return', '20TD_return') is standardised; those five
    columns must be present and are copied through unchanged.

    INPUT: the dataframe to scale
    OUTPUT: a new dataframe with the same columns and index; the input
            dataframe is left untouched

    NOTE: the scaler is fitted on the dataframe passed in and is not returned,
    so the fitted statistics cannot be re-applied to another dataframe.
    """

    scaler = StandardScaler()

    col_to_scale = list(df.drop(columns=['ticker', 'date', '5TD_return', '10TD_return', '20TD_return']))

    # Work on a copy: a plain `scaled_df = df` would only alias the caller's
    # frame and silently overwrite its unscaled values.
    scaled_df = df.copy()

    # StandardScaler standardises each column independently, so one call over
    # all columns gives the same values as fitting column by column.
    if col_to_scale:
        scaled_df[col_to_scale] = scaler.fit_transform(scaled_df[col_to_scale])

    return scaled_df

In [11]:
# Standardise the indicator columns of the train set; ticker, date and the return columns are not scaled.
scaled_X = std_scaler(train_set)
scaled_X.head()

Unnamed: 0,date,ticker,RSI,Stochastic,Stochastic_signal,ADI,OBV,ATR,ADX,ADX_pos,ADX_neg,MACD,MACD_diff,MACD_signal,5TD_return,10TD_return,20TD_return
0,1984-10-24,AAPL,0.370168,0.719809,0.515734,-0.057869,-1.532051,-0.655419,-0.719394,1.168775,0.050118,-0.18341,0.020991,-0.199775,0,-0.018263,-0.119206
1,1984-10-25,AAPL,-0.301928,0.06415,0.477299,-0.06246,-1.53413,-0.655251,-0.812577,0.850479,0.768844,-0.183187,0.016966,-0.198334,0,-0.022156,-0.060181
2,1984-10-26,AAPL,-0.668659,-0.368832,0.147772,-0.065044,-1.535639,-0.655481,-0.903622,0.629684,1.267407,-0.184019,0.010776,-0.197356,1,-0.055397,-0.026218
3,1984-10-29,AAPL,-0.61483,-0.315814,-0.22145,-0.066514,-1.534974,-0.656442,-0.988163,0.556718,1.184601,-0.184511,0.006778,-0.196678,0,-0.025925,-0.003162
4,1984-10-30,AAPL,-0.3908,-0.096672,-0.278787,-0.068679,-1.533993,-0.656835,-1.088502,0.792642,0.990803,-0.184337,0.005457,-0.196099,1,-0.060959,0.035265


## Window Dataset

#### Ian Version

In [7]:
def window_column(df_series, window_size=30, stride_size=5):
    """
    Turns data series into array of windowed arrays.

    Window i holds the values at positions
    [i * stride_size, i * stride_size + window_size). Trailing values that do
    not fill a complete window are dropped.

    INPUT: the input data series (a pandas Series or any 1-D array-like),
           window size, stride size
    OUTPUT: 2-D array of shape (n_windows, window_size); it has zero rows when
            the series is shorter than one window. The result is an independent
            copy, so writing to it never touches the input data.
    RAISES: ValueError if window_size or stride_size is smaller than 1

    EXAMPLE: y = window_column(train_set['RSI'], 30, 5)
    """
    if window_size < 1 or stride_size < 1:
        raise ValueError("window_size and stride_size must be positive integers")

    values = np.asarray(df_series)

    # Too short for even one window: return an empty result instead of asking
    # NumPy for a negative number of rows.
    if values.size < window_size:
        return np.empty((0, window_size), dtype=values.dtype)

    # sliding_window_view yields every window at stride 1 as a read-only view;
    # keeping every stride_size-th one gives ((size - window_size) // stride_size) + 1 rows.
    windows = np.lib.stride_tricks.sliding_window_view(values, window_size)[::stride_size]

    # Copy so that overlapping windows do not share memory with each other or the source.
    return windows.copy()


def window_dataframe(df, window=30, stride_size=5, target=['5TD_return'], feature_cols=['RSI', 'Stochastic', 'Stochastic_signal', 'ADI',
       'OBV', 'ATR', 'ADX', 'ADX_pos', 'ADX_neg', 'MACD', 'MACD_diff', 'MACD_signal']):
    """
    Turns the input dataframe into arrays of windowed arrays.

    Rows are ordered from the most recent date to the oldest before windowing,
    so every window runs from newer to older rows. Each selected column is
    windowed on its own with ``window_column``. Columns are taken in the order
    they appear in ``df``; a column named in both lists is treated as a feature.
    The input dataframe is not modified.

    INPUT: the input dataframe (needs a 'date' column), window size, stride size,
           target column(s), feature columns. ``target`` and ``feature_cols``
           may each be a list of names or a single name.
    OUTPUT: (features, targets): two arrays, each stacking one
            (n_windows, window) array per selected column, i.e. shaped
            (n_columns, n_windows, window). An array is empty when none of
            its columns exist in ``df``.

    EXAMPLE: features, targets = window_dataframe(train_set)
    """
    # A bare string would make `column in target` a substring test, so wrap it.
    if isinstance(target, str):
        target = [target]
    if isinstance(feature_cols, str):
        feature_cols = [feature_cols]

    dates = df['date']
    if not pd.api.types.is_datetime64_any_dtype(dates):
        dates = pd.to_datetime(dates, format='%Y-%m-%d')

    # assign() returns a new frame, so the caller's 'date' column is left as it was.
    inverse_df = df.assign(date=dates).sort_values(by="date", ascending=False)

    feature_array = [
        window_column(inverse_df[column], window, stride_size)
        for column in inverse_df.columns
        if column in feature_cols
    ]
    target_array = [
        window_column(inverse_df[column], window, stride_size)
        for column in inverse_df.columns
        if column not in feature_cols and column in target
    ]

    return np.array(feature_array), np.array(target_array)


In [15]:
# Window with the defaults (30-row windows, stride 5). Both results stack one
# (n_windows, 30) array per column, i.e. they are shaped (n_columns, n_windows, 30).
X_ian, y_ian = window_dataframe(scaled_X)

#### Marcin Version

In [14]:
def build_arrays(df, time_window=5, stride=3, input_cols=['RSI', 'Stochastic', 'Stochastic_signal', 'ADI',
       'OBV', 'ATR', 'ADX', 'ADX_pos', 'ADX_neg', 'MACD', 'MACD_diff',
       'MACD_signal'] , target_col='5TD_return'):
    """
    Cut the dataframe into fixed-length windows of consecutive rows.

    Rows are ordered from the most recent date to the oldest, then a window of
    ``time_window`` rows is taken every ``stride`` rows. Windows that would run
    past the end of the data are skipped.

    INPUT: the dataframe (needs a 'date' column), window length, stride,
           feature columns, target column
    OUTPUT: (X, y): X stacks the ``input_cols`` values of every window as
            (n_windows, time_window, n_features); y holds the ``target_col``
            value of the first (most recent) row of each window

    EXAMPLE: X, y = build_arrays(scaled_X, time_window=5, stride=3)
    """
    newest_first = df.sort_values('date', ascending=False).reset_index(drop=True)

    windows = []
    labels = []
    for start in range(0, len(df), stride):
        chunk = newest_first.iloc[start: start + time_window]
        # Skip incomplete windows at the tail of the data.
        if len(chunk) != time_window:
            continue
        windows.append(np.array(chunk[input_cols].values))
        labels.append(chunk[target_col].iloc[0])

    return np.array(windows), np.array(labels)

In [19]:
# Window with the defaults (5-row windows, stride 3): X is (n_windows, 5, n_features)
# and y holds one target value per window.
X_marcin, y_marcin = build_arrays(scaled_X)

## Build Model

In [20]:
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers