In [1]:
import pandas as pd
import numpy as np

In [148]:
# Load the preprocessed AAPL table; the path is relative to the notebook's working directory.
input_df = pd.read_csv('../raw_data/processed/AAPL.csv')
# Preview the first rows to sanity-check the columns that were read.
input_df.head()

Unnamed: 0,ticker,date,RSI,Stochastic,Stochastic_signal,ADI,OBV,ATR,ADX,ADX_pos,ADX_neg,MACD,MACD_diff,MACD_signal,5TD_return,10TD_return,20TD_return
0,AAPL,1984-10-24,57.978007,76.690103,69.424914,-5139242000.0,387549732,0.003297,19.036526,35.250645,23.13537,-0.000432,0.001254,-0.001687,-0.05201,-0.018263,-0.119206
1,AAPL,1984-10-25,49.247645,56.625203,68.325221,-5322591000.0,204200382,0.003326,18.051296,32.458308,29.224107,-0.00039,0.001037,-0.001427,-0.009378,-0.022156,-0.060181
2,AAPL,1984-10-26,44.483899,43.374797,58.896701,-5425815000.0,71070262,0.003286,17.088681,30.521333,33.447714,-0.000548,0.000704,-0.001251,0.009726,-0.055397,-0.026218
3,AAPL,1984-10-29,45.183124,44.997296,48.332432,-5484525000.0,129780480,0.003117,16.194824,29.881216,32.746222,-0.000641,0.000488,-0.001129,0.0,-0.025925,-0.003162
4,AAPL,1984-10-30,48.093222,51.703624,46.691906,-5570994000.0,216248865,0.003048,15.133939,31.950924,31.104445,-0.000608,0.000417,-0.001025,0.04806,-0.060959,0.035265


In [149]:
len(input_df)

9115

## Split into Train/Test
Make it generic (i.e. the size of the test set can be changed).

In [150]:
def train_test_split(df, test_set_size):
    """
    Split the preprocessed stock data file into a train and test dataset
    INPUT: the dataframe to be split (needs a 'date' column, either datetimes or
           'YYYY-MM-DD' strings), and size of the test set in months or years
           ('3M' or '2Y'); other pandas offset strings are passed to pandas as-is
    OUTPUT: returns a train_set and test_set dataframe, both sorted by date
            (oldest first), with 'date' as the first column and a fresh 0..n-1 index

    The test set is every row dated after (latest date - offset). Month and year
    sizes are anchored to month-end / year-end, so '3Y' on data ending 2020-12-10
    puts everything after 2017-12-31 in the test set.
    The input dataframe is not modified.

    EXAMPLE: train_set, test_set = train_test_split(input_df, '3Y')
    """
    dates = df['date']
    # pandas' own dtype check also copes with tz-aware and string extension dtypes,
    # which np.issubdtype cannot interpret.
    if not pd.api.types.is_datetime64_any_dtype(dates):
        dates = pd.to_datetime(dates, format='%Y-%m-%d')

    # assign() builds a new frame, so the caller's dataframe is left untouched.
    ordered = df.assign(date=dates).sort_values(by='date', ascending=True)
    # Keep the established column layout: 'date' first, the rest in original order.
    ordered = ordered[['date'] + [col for col in ordered.columns if col != 'date']]
    if ordered.empty:
        empty = ordered.reset_index(drop=True)
        return empty, empty.copy()

    # Build the offset explicitly for the documented 'M' / 'Y' units so the split
    # does not depend on which alias spelling the installed pandas accepts.
    size_text = str(test_set_size).strip()
    count_text, unit = size_text[:-1].strip(), size_text[-1:]
    anchored = {'M': pd.offsets.MonthEnd, 'Y': pd.offsets.YearEnd}
    if unit in anchored and (count_text == '' or count_text.isdigit()):
        offset = anchored[unit](int(count_text or 1))
    else:
        offset = pd.tseries.frequencies.to_offset(size_text)

    # Both halves are cut from the same sorted frame, so they can never overlap
    # even when the input rows were not in date order.
    cutoff = ordered['date'].max() - offset
    in_test = ordered['date'] > cutoff
    train_set = ordered[~in_test].reset_index(drop=True)
    test_set = ordered[in_test].reset_index(drop=True)
    return train_set, test_set

In [151]:
train_set, test_set = train_test_split(input_df, '3Y')

# Report the row counts; the last figure should equal the first one.
row_counts = (
    ('len input_df: ', len(input_df)),
    ('len test_set: ', len(test_set)),
    ('len train_set: ', len(train_set)),
    ('len train_set + test_set: ', len(train_set) + len(test_set)),
)
for label, count in row_counts:
    print(label, count)

len input_df:  9115
len test_set:  751
len train_set:  8364
len train_set + test_set:  9115


## Binary Threshold for Returns
Able to input any threshold value

In [77]:
def returns_classification(return_column, returns_threshold):
    """
    Classify the returns versus a defined threshold, returning either a 1 or 0
    INPUT: the dataframe column holding the returns, and the return threshold
    OUTPUT: returns a column with 1/0 binary classification
            (1 where return > threshold, 0 otherwise; a return exactly equal to
            the threshold, or a missing return, is classified as 0)

    EXAMPLE: train_set['5TD_return_B'] = returns_classification(train_set['5TD_return'], 0.0006)
    """
    # The builtin int replaces the np.int alias, which NumPy 1.24 removed.
    return (return_column > returns_threshold).astype(int)

# Threshold applied to each return horizon; each label column is named '<return column>_B'.
return_thresholds = {
    '5TD_return': 0.0006,
    '10TD_return': 0.0012,
    '20TD_return': 0.0024,
}
for return_col, threshold in return_thresholds.items():
    train_set[return_col + '_B'] = returns_classification(train_set[return_col], threshold)
train_set.head()

Unnamed: 0,date,ticker,RSI,Stochastic,Stochastic_signal,ADI,OBV,ATR,ADX,ADX_pos,ADX_neg,MACD,MACD_diff,MACD_signal,5TD_return,10TD_return,20TD_return,5TD_return_B,10TD_return_B,20TD_return_B
0,1984-10-24,AAPL,57.978007,76.690103,69.424914,-5139242000.0,387549732,0.003297,19.036526,35.250645,23.13537,-0.000432,0.001254,-0.001687,-0.05201,-0.018263,-0.119206,0,0,0
1,1984-10-25,AAPL,49.247645,56.625203,68.325221,-5322591000.0,204200382,0.003326,18.051296,32.458308,29.224107,-0.00039,0.001037,-0.001427,-0.009378,-0.022156,-0.060181,0,0,0
2,1984-10-26,AAPL,44.483899,43.374797,58.896701,-5425815000.0,71070262,0.003286,17.088681,30.521333,33.447714,-0.000548,0.000704,-0.001251,0.009726,-0.055397,-0.026218,1,0,0
3,1984-10-29,AAPL,45.183124,44.997296,48.332432,-5484525000.0,129780480,0.003117,16.194824,29.881216,32.746222,-0.000641,0.000488,-0.001129,0.0,-0.025925,-0.003162,0,0,0
4,1984-10-30,AAPL,48.093222,51.703624,46.691906,-5570994000.0,216248865,0.003048,15.133939,31.950924,31.104445,-0.000608,0.000417,-0.001025,0.04806,-0.060959,0.035265,1,0,1


## Scale Train Dataset

## Turn Windows Into Arrays
Able to set:

- window size
- stride size

Start from the newest values

In [78]:
train_set.head()

Unnamed: 0,date,ticker,RSI,Stochastic,Stochastic_signal,ADI,OBV,ATR,ADX,ADX_pos,ADX_neg,MACD,MACD_diff,MACD_signal,5TD_return,10TD_return,20TD_return,5TD_return_B,10TD_return_B,20TD_return_B
0,1984-10-24,AAPL,57.978007,76.690103,69.424914,-5139242000.0,387549732,0.003297,19.036526,35.250645,23.13537,-0.000432,0.001254,-0.001687,-0.05201,-0.018263,-0.119206,0,0,0
1,1984-10-25,AAPL,49.247645,56.625203,68.325221,-5322591000.0,204200382,0.003326,18.051296,32.458308,29.224107,-0.00039,0.001037,-0.001427,-0.009378,-0.022156,-0.060181,0,0,0
2,1984-10-26,AAPL,44.483899,43.374797,58.896701,-5425815000.0,71070262,0.003286,17.088681,30.521333,33.447714,-0.000548,0.000704,-0.001251,0.009726,-0.055397,-0.026218,1,0,0
3,1984-10-29,AAPL,45.183124,44.997296,48.332432,-5484525000.0,129780480,0.003117,16.194824,29.881216,32.746222,-0.000641,0.000488,-0.001129,0.0,-0.025925,-0.003162,0,0,0
4,1984-10-30,AAPL,48.093222,51.703624,46.691906,-5570994000.0,216248865,0.003048,15.133939,31.950924,31.104445,-0.000608,0.000417,-0.001025,0.04806,-0.060959,0.035265,1,0,1


In [155]:
train_set.keys()

Index(['date', 'ticker', 'RSI', 'Stochastic', 'Stochastic_signal', 'ADI',
       'OBV', 'ATR', 'ADX', 'ADX_pos', 'ADX_neg', 'MACD', 'MACD_diff',
       'MACD_signal', '5TD_return', '10TD_return', '20TD_return'],
      dtype='object')

In [140]:
def window_column(df_series, window_size=30, stride_size=5):
    """
    Turns data series into array of windowed arrays
    INPUT: the input data series, window size (values per window), stride size
           (number of values the window start advances between consecutive windows)
    OUTPUT: 2-D array of shape (number of windows, window_size); windows are taken
            from the start of the series and a trailing remainder that cannot fill
            a whole window is dropped. A series shorter than window_size gives an
            empty (0, window_size) array.
            The result is a read-only view onto the series data; call .copy() on it
            if you need to modify the values.
    RAISES: ValueError if window_size or stride_size is not a positive integer

    EXAMPLE: y = window_column(train_set['RSI'], 30, 5)
    """
    if window_size < 1 or stride_size < 1:
        raise ValueError('window_size and stride_size must be positive integers')
    np_array = df_series.to_numpy()
    # Too few values for even one window: return an explicit empty result instead
    # of handing as_strided a negative row count.
    if np_array.size < window_size:
        return np.empty((0, window_size), dtype=np_array.dtype)
    nrows = ((np_array.size - window_size) // stride_size) + 1
    # Byte step between consecutive elements; read from the array so that
    # non-contiguous column views are handled correctly.
    n = np_array.strides[0]
    # Windows overlap in memory, so the view is made read-only: a write through it
    # would otherwise change several windows and the source series at once.
    return np.lib.stride_tricks.as_strided(
        np_array, shape=(nrows, window_size), strides=(stride_size * n, n),
        writeable=False)

# Example: window the RSI column into 30-value windows whose start advances 5 values at a time.
y = window_column(train_set['RSI'], 30, 5)

In [141]:
y[0]

array([57.97800667, 49.2476453 , 44.48389879, 45.18312437, 48.09322183,
       46.75665291, 48.30386636, 46.83808663, 45.35589207, 58.66884091,
       54.20341905, 45.97092954, 37.51473793, 43.71371141, 40.66673425,
       42.52432268, 42.52432268, 39.65975297, 32.89959579, 38.83490824,
       42.51207194, 47.0463878 , 48.65957779, 52.87317557, 59.91596901,
       56.46878408, 51.73749434, 49.73746898, 52.69922749, 59.1697874 ])

In [142]:
y[1]

array([46.75665291, 48.30386636, 46.83808663, 45.35589207, 58.66884091,
       54.20341905, 45.97092954, 37.51473793, 43.71371141, 40.66673425,
       42.52432268, 42.52432268, 39.65975297, 32.89959579, 38.83490824,
       42.51207194, 47.0463878 , 48.65957779, 52.87317557, 59.91596901,
       56.46878408, 51.73749434, 49.73746898, 52.69922749, 59.1697874 ,
       64.71649173, 64.10772421, 60.04022654, 57.77137702, 51.92969841])

In [143]:
y[2]

array([54.20341905, 45.97092954, 37.51473793, 43.71371141, 40.66673425,
       42.52432268, 42.52432268, 39.65975297, 32.89959579, 38.83490824,
       42.51207194, 47.0463878 , 48.65957779, 52.87317557, 59.91596901,
       56.46878408, 51.73749434, 49.73746898, 52.69922749, 59.1697874 ,
       64.71649173, 64.10772421, 60.04022654, 57.77137702, 51.92969841,
       53.33462402, 56.88592509, 60.13943292, 66.91140131, 59.3288684 ])

In [164]:
def window_dataframe(df, window=30, stride_size=5, target=['5TD_return'], feature_cols=['RSI', 'Stochastic', 'Stochastic_signal', 'ADI',
       'OBV', 'ATR', 'ADX', 'ADX_pos', 'ADX_neg', 'MACD', 'MACD_diff', 'MACD_signal']):
    """
    Turns the input dataframe into arrays of windowed arrays
    INPUT: the input dataframe (needs a 'date' column, either datetimes or
           'YYYY-MM-DD' strings), window size, stride size, target column(s),
           feature columns
    OUTPUT: a tuple (feature_array, target_array); each is an array with one entry
            per selected column, and each entry is that column's windows as
            produced by window_column, e.g. shape (12, 1667, 30) for 12 features

    Rows are sorted newest-first before windowing, so the first window holds the
    most recent rows, and the values inside every window also run newest to oldest.
    NOTE(review): confirm that the reversed in-window order is what the model expects.
    Columns are emitted in the dataframe's column order, not in the order given in
    feature_cols / target; names missing from the dataframe are skipped, and a
    column listed in both feature_cols and target is treated as a feature only.
    The input dataframe is not modified.

    EXAMPLE: windowed_array, target_array = window_dataframe(train_set)
    """
    dates = df['date']
    # pandas' own dtype check also copes with tz-aware and string extension dtypes,
    # which np.issubdtype cannot interpret.
    if not pd.api.types.is_datetime64_any_dtype(dates):
        dates = pd.to_datetime(dates, format='%Y-%m-%d')
    # assign() returns a new frame (same column order), so the caller's 'date'
    # column is no longer overwritten as a side effect.
    inverse_df = df.assign(date=dates).sort_values(by="date", ascending=False)
    feature_array = []
    target_array = []
    for column in inverse_df:
        if column in feature_cols:
            feature_array.append(window_column(inverse_df[column], window, stride_size))
        elif column in target:
            target_array.append(window_column(inverse_df[column], window, stride_size))

    # np.array() copies the per-column windows into one stacked array.
    return np.array(feature_array), np.array(target_array)

# Window the training set using the default window, stride, target and feature columns.
windowed_array, target_array = window_dataframe(train_set)

In [165]:
windowed_array.shape

(12, 1667, 30)

In [166]:
target_array.shape

(1, 1667, 30)

In [167]:
windowed_array

array([[[ 4.32662483e+01,  4.83092053e+01,  4.67694665e+01, ...,
          5.79274676e+01,  5.84282959e+01,  6.10856298e+01],
        [ 6.05459106e+01,  5.90130329e+01,  5.95640312e+01, ...,
          7.24868750e+01,  7.49659579e+01,  7.70856109e+01],
        [ 5.66041720e+01,  5.67865293e+01,  5.53480108e+01, ...,
          7.64260181e+01,  7.46666038e+01,  6.94240909e+01],
        ...,
        [ 5.90013256e+01,  5.86117173e+01,  5.62163194e+01, ...,
          4.25243227e+01,  4.25243227e+01,  4.06667342e+01],
        [ 6.69114013e+01,  6.01394329e+01,  5.68859251e+01, ...,
          4.59709295e+01,  5.42034190e+01,  5.86688409e+01],
        [ 5.77713770e+01,  6.00402265e+01,  6.41077242e+01, ...,
          4.83038664e+01,  4.67566529e+01,  4.80932218e+01]],

       [[ 5.29440871e+00,  2.75111331e+01,  2.15734785e+01, ...,
          4.83453785e+01,  4.99809814e+01,  6.39573618e+01],
        [ 7.97292070e+01,  7.36943907e+01,  7.52417795e+01, ...,
          9.21837709e+01,  9.54455052e

In [168]:
windowed_array[0]

array([[43.26624827, 48.30920533, 46.76946646, ..., 57.92746764,
        58.4282959 , 61.08562979],
       [60.54591065, 59.0130329 , 59.56403124, ..., 72.48687504,
        74.96595785, 77.08561095],
       [56.604172  , 56.78652927, 55.34801078, ..., 76.42601809,
        74.66660376, 69.42409085],
       ...,
       [59.00132556, 58.61171726, 56.21631942, ..., 42.52432268,
        42.52432268, 40.66673425],
       [66.91140131, 60.13943292, 56.88592509, ..., 45.97092954,
        54.20341905, 58.66884091],
       [57.77137702, 60.04022654, 64.10772421, ..., 48.30386636,
        46.75665291, 48.09322183]])

In [169]:
windowed_array[0][0]

array([43.26624827, 48.30920533, 46.76946646, 46.68344079, 60.54591065,
       60.54591065, 59.0130329 , 59.56403124, 66.14462232, 61.0121307 ,
       56.604172  , 56.78652927, 55.34801078, 58.39056452, 49.78933085,
       49.63063378, 48.85705349, 50.43513694, 50.81877937, 53.92782325,
       55.89397765, 50.85846633, 60.6915276 , 63.87319345, 66.74313489,
       66.72348804, 63.86224513, 57.92746764, 58.4282959 , 61.08562979])

In [170]:
windowed_array[0][len(windowed_array[0])-1]

array([57.77137702, 60.04022654, 64.10772421, 64.71649173, 59.1697874 ,
       52.69922749, 49.73746898, 51.73749434, 56.46878408, 59.91596901,
       52.87317557, 48.65957779, 47.0463878 , 42.51207194, 38.83490824,
       32.89959579, 39.65975297, 42.52432268, 42.52432268, 40.66673425,
       43.71371141, 37.51473793, 45.97092954, 54.20341905, 58.66884091,
       45.35589207, 46.83808663, 48.30386636, 46.75665291, 48.09322183])

In [113]:
len(windowed_array[0])

1667

In [122]:
len(windowed_array[0][len(windowed_array[0])-1])

30

In [112]:
windowed_array[0]

array([['1984-10-24T00:00:00.000000000', '1984-10-25T00:00:00.000000000',
        '1984-10-26T00:00:00.000000000', ...,
        '1984-12-03T00:00:00.000000000', '1984-12-04T00:00:00.000000000',
        '1984-12-05T00:00:00.000000000'],
       ['1984-10-31T00:00:00.000000000', '1984-11-01T00:00:00.000000000',
        '1984-11-02T00:00:00.000000000', ...,
        '1984-12-10T00:00:00.000000000', '1984-12-11T00:00:00.000000000',
        '1984-12-12T00:00:00.000000000'],
       ['1984-11-07T00:00:00.000000000', '1984-11-08T00:00:00.000000000',
        '1984-11-09T00:00:00.000000000', ...,
        '1984-12-17T00:00:00.000000000', '1984-12-18T00:00:00.000000000',
        '1984-12-19T00:00:00.000000000'],
       ...,
       ['2017-10-27T00:00:00.000000000', '2017-10-30T00:00:00.000000000',
        '2017-10-31T00:00:00.000000000', ...,
        '2017-12-06T00:00:00.000000000', '2017-12-07T00:00:00.000000000',
        '2017-12-08T00:00:00.000000000'],
       ['2017-11-03T00:00:00.000000000', '20