In [1]:
import pandas as pd
import numpy as np

In [72]:
# Load the pre-processed AAPL file (technical indicators plus forward returns).
# NOTE: the path is relative, so it resolves against the kernel's working directory.
input_df = pd.read_csv('../../code/project/processed_files/AAPL.csv')
input_df.head()

Unnamed: 0,ticker,date,RSI,Stochastic,Stochastic_signal,ADI,OBV,ATR,ADX,ADX_pos,ADX_neg,MACD,MACD_diff,MACD_signal,5TD_return,10TD_return,20TD_return
0,AAPL,1984-10-24,57.978007,76.690103,69.424914,-5139242000.0,387549732,0.003297,19.036526,35.250645,23.13537,-0.000432,0.001254,-0.001687,-0.05201,-0.018263,-0.119206
1,AAPL,1984-10-25,49.247645,56.625203,68.325221,-5322591000.0,204200382,0.003326,18.051296,32.458308,29.224107,-0.00039,0.001037,-0.001427,-0.009378,-0.022156,-0.060181
2,AAPL,1984-10-26,44.483899,43.374797,58.896701,-5425815000.0,71070262,0.003286,17.088681,30.521333,33.447714,-0.000548,0.000704,-0.001251,0.009726,-0.055397,-0.026218
3,AAPL,1984-10-29,45.183124,44.997296,48.332432,-5484525000.0,129780480,0.003117,16.194824,29.881216,32.746222,-0.000641,0.000488,-0.001129,0.0,-0.025925,-0.003162
4,AAPL,1984-10-30,48.093222,51.703624,46.691906,-5570994000.0,216248865,0.003048,15.133939,31.950924,31.104445,-0.000608,0.000417,-0.001025,0.04806,-0.060959,0.035265


In [73]:
len(input_df)

9115

## Split into Train/Test
Make it generic (i.e. able to change the test set size)

In [74]:
def _test_period_offset(test_set_size):
    """Translate a test-set size such as '3M' or '2Y' into a pandas date offset."""
    if isinstance(test_set_size, str):
        text = test_set_size.strip()
        count, unit = text[:-1], text[-1:]
        if unit in ('M', 'Y') and (count == '' or count.isdecimal()):
            periods = int(count) if count else 1
            # Build the offsets explicitly: the 'M' / 'Y' string aliases are
            # deprecated in recent pandas releases.
            if unit == 'M':
                return pd.offsets.MonthEnd(periods)
            return pd.offsets.YearEnd(periods)
    # Anything else (e.g. '30D' or an offset object) is left to pandas to interpret.
    return pd.tseries.frequencies.to_offset(test_set_size)


def train_test_split(df, test_set_size):
    """
    Split the preprocessed stock data file into a train and test dataset
    INPUT: the dataframe to be split (needs a 'date' column), and size of the test set
           in months or years ('3M' or '2Y')
    OUTPUT: returns a train_set and test_set dataframe, both sorted by ascending date,
            with 'date' as the first column and a fresh 0..n-1 index

    The test set holds every row dated after (latest date - offset). 'M' and 'Y' are
    month-end / year-end offsets, so '3Y' cuts at a 31 December rather than exactly
    three calendar years back. The input dataframe is left unmodified.

    EXAMPLE: train_set, test_set = train_test_split(input_df, '3Y')
    """
    # Work on a copy so the caller's 'date' column is never converted in place.
    frame = df.copy()
    if not pd.api.types.is_datetime64_any_dtype(frame['date']):
        frame['date'] = pd.to_datetime(frame['date'], format='%Y-%m-%d')
    frame = frame.sort_values(by="date", ascending=True).set_index("date")

    if frame.empty:
        return frame.reset_index(), frame.reset_index()

    # Split by date on the sorted frame, so train and test are derived from the
    # same ordering and from `df` itself rather than a notebook global.
    cutoff = frame.index[-1] - _test_period_offset(test_set_size)
    is_test = frame.index > cutoff
    test_set = frame[is_test].reset_index()
    train_set = frame[~is_test].reset_index()
    return train_set, test_set

In [75]:
train_set, test_set = train_test_split(input_df, '3Y')

# Sanity check: the two parts should add up to the full input.
n_input, n_test, n_train = len(input_df), len(test_set), len(train_set)
print('len input_df: ', n_input)
print('len test_set: ', n_test)
print('len train_set: ', n_train)
print('len train_set + test_set: ', n_train + n_test)

len input_df:  9115
len test_set:  751
len train_set:  8364
len train_set + test_set:  9115


## Binary Threshold for Returns
Able to input any threshold value

In [77]:
def returns_classification(return_column, returns_threshold):
    """
    Classify the returns versus a defined threshold, returning either a 1 or 0
    INPUT: the dataframe column of returns, and the return threshold
    OUTPUT: returns a column with 1/0 binary classification
            (1 where the return is strictly greater than the threshold, else 0)

    EXAMPLE: train_set['5TD_return_B'] = returns_classification(train_set['5TD_return'], 0.0006)
    """
    # `np.int` was only an alias of the builtin `int` and has been removed from
    # NumPy (1.24+); the builtin gives the same integer dtype.
    return (return_column > returns_threshold).astype(int)

# One binary label column per return horizon, each with its own threshold.
for horizon, threshold in (('5TD', 0.0006), ('10TD', 0.0012), ('20TD', 0.0024)):
    train_set[f'{horizon}_return_B'] = returns_classification(
        train_set[f'{horizon}_return'], threshold)
train_set.head()

Unnamed: 0,date,ticker,RSI,Stochastic,Stochastic_signal,ADI,OBV,ATR,ADX,ADX_pos,ADX_neg,MACD,MACD_diff,MACD_signal,5TD_return,10TD_return,20TD_return,5TD_return_B,10TD_return_B,20TD_return_B
0,1984-10-24,AAPL,57.978007,76.690103,69.424914,-5139242000.0,387549732,0.003297,19.036526,35.250645,23.13537,-0.000432,0.001254,-0.001687,-0.05201,-0.018263,-0.119206,0,0,0
1,1984-10-25,AAPL,49.247645,56.625203,68.325221,-5322591000.0,204200382,0.003326,18.051296,32.458308,29.224107,-0.00039,0.001037,-0.001427,-0.009378,-0.022156,-0.060181,0,0,0
2,1984-10-26,AAPL,44.483899,43.374797,58.896701,-5425815000.0,71070262,0.003286,17.088681,30.521333,33.447714,-0.000548,0.000704,-0.001251,0.009726,-0.055397,-0.026218,1,0,0
3,1984-10-29,AAPL,45.183124,44.997296,48.332432,-5484525000.0,129780480,0.003117,16.194824,29.881216,32.746222,-0.000641,0.000488,-0.001129,0.0,-0.025925,-0.003162,0,0,0
4,1984-10-30,AAPL,48.093222,51.703624,46.691906,-5570994000.0,216248865,0.003048,15.133939,31.950924,31.104445,-0.000608,0.000417,-0.001025,0.04806,-0.060959,0.035265,1,0,1


## Scale Train Dataset

## Turn Windows Into Arrays
Able to set:

- window size
- stride size

Start from the newest values

In [78]:
train_set.head()

Unnamed: 0,date,ticker,RSI,Stochastic,Stochastic_signal,ADI,OBV,ATR,ADX,ADX_pos,ADX_neg,MACD,MACD_diff,MACD_signal,5TD_return,10TD_return,20TD_return,5TD_return_B,10TD_return_B,20TD_return_B
0,1984-10-24,AAPL,57.978007,76.690103,69.424914,-5139242000.0,387549732,0.003297,19.036526,35.250645,23.13537,-0.000432,0.001254,-0.001687,-0.05201,-0.018263,-0.119206,0,0,0
1,1984-10-25,AAPL,49.247645,56.625203,68.325221,-5322591000.0,204200382,0.003326,18.051296,32.458308,29.224107,-0.00039,0.001037,-0.001427,-0.009378,-0.022156,-0.060181,0,0,0
2,1984-10-26,AAPL,44.483899,43.374797,58.896701,-5425815000.0,71070262,0.003286,17.088681,30.521333,33.447714,-0.000548,0.000704,-0.001251,0.009726,-0.055397,-0.026218,1,0,0
3,1984-10-29,AAPL,45.183124,44.997296,48.332432,-5484525000.0,129780480,0.003117,16.194824,29.881216,32.746222,-0.000641,0.000488,-0.001129,0.0,-0.025925,-0.003162,0,0,0
4,1984-10-30,AAPL,48.093222,51.703624,46.691906,-5570994000.0,216248865,0.003048,15.133939,31.950924,31.104445,-0.000608,0.000417,-0.001025,0.04806,-0.060959,0.035265,1,0,1


In [140]:
def window_column(df_series, window_size=30, stride_size=5):
    """
    Turns data series into array of windowed arrays
    INPUT: the input data series, window size, stride size (both positive integers)
    OUTPUT: 2-D array of shape (n_windows, window_size); window i starts at
            position i * stride_size. Trailing values that do not fill a whole
            window are dropped. A series shorter than window_size gives an empty
            (0, window_size) array.

    The result is a read-only strided view of the array returned by
    `df_series.to_numpy()`; copy it before modifying.

    EXAMPLE: y = window_column(train_set['RSI'], 30, 5)
    """
    if window_size < 1 or stride_size < 1:
        raise ValueError("window_size and stride_size must be positive integers")

    np_array = df_series.to_numpy()
    if np_array.size < window_size:
        # Not even one full window: return a consistent empty result instead of
        # letting a negative row count reach as_strided.
        return np.empty((0, window_size), dtype=np_array.dtype)

    nrows = ((np_array.size - window_size) // stride_size) + 1
    n = np_array.strides[0]
    # writeable=False: the windows overlap and alias the source memory, so an
    # in-place write would silently corrupt several windows and the source data.
    return np.lib.stride_tricks.as_strided(
        np_array, shape=(nrows, window_size), strides=(stride_size * n, n),
        writeable=False)

y = window_column(train_set['RSI'], 30, 5)

In [141]:
y[0]

array([57.97800667, 49.2476453 , 44.48389879, 45.18312437, 48.09322183,
       46.75665291, 48.30386636, 46.83808663, 45.35589207, 58.66884091,
       54.20341905, 45.97092954, 37.51473793, 43.71371141, 40.66673425,
       42.52432268, 42.52432268, 39.65975297, 32.89959579, 38.83490824,
       42.51207194, 47.0463878 , 48.65957779, 52.87317557, 59.91596901,
       56.46878408, 51.73749434, 49.73746898, 52.69922749, 59.1697874 ])

In [142]:
y[1]

array([46.75665291, 48.30386636, 46.83808663, 45.35589207, 58.66884091,
       54.20341905, 45.97092954, 37.51473793, 43.71371141, 40.66673425,
       42.52432268, 42.52432268, 39.65975297, 32.89959579, 38.83490824,
       42.51207194, 47.0463878 , 48.65957779, 52.87317557, 59.91596901,
       56.46878408, 51.73749434, 49.73746898, 52.69922749, 59.1697874 ,
       64.71649173, 64.10772421, 60.04022654, 57.77137702, 51.92969841])

In [143]:
y[2]

array([54.20341905, 45.97092954, 37.51473793, 43.71371141, 40.66673425,
       42.52432268, 42.52432268, 39.65975297, 32.89959579, 38.83490824,
       42.51207194, 47.0463878 , 48.65957779, 52.87317557, 59.91596901,
       56.46878408, 51.73749434, 49.73746898, 52.69922749, 59.1697874 ,
       64.71649173, 64.10772421, 60.04022654, 57.77137702, 51.92969841,
       53.33462402, 56.88592509, 60.13943292, 66.91140131, 59.3288684 ])

In [144]:
def window_dataframe(df, window=30, stride_size=5, exclude_columns=None):
    """
    Turns the input dataframe into a list of windowed arrays, one per column
    INPUT: the input dataframe (needs a 'date' column), window size, stride size,
           and optionally the column name(s) that should not be windowed
    OUTPUT: list with one 2-D windowed array per remaining column, in column order;
            rows are sorted by descending date first, so the first window holds
            the newest values

    The input dataframe is left unmodified.

    EXAMPLE: windowed_array = window_dataframe(train_set)
             windowed_array = window_dataframe(train_set, exclude_columns=['ticker'])
    """
    if exclude_columns is None:
        excluded = set()
    elif isinstance(exclude_columns, str):
        # a bare name means one column, not a collection of characters
        excluded = {exclude_columns}
    else:
        excluded = set(exclude_columns)

    if pd.api.types.is_datetime64_any_dtype(df['date']):
        dated_df = df
    else:
        # assign() returns a new frame, so the caller's 'date' column keeps its dtype
        dated_df = df.assign(date=pd.to_datetime(df['date'], format='%Y-%m-%d'))

    inverse_df = dated_df.sort_values(by="date", ascending=False)
    return [
        window_column(inverse_df[column], window, stride_size)
        for column in inverse_df.columns
        if column not in excluded
    ]

windowed_array = window_dataframe(train_set)

In [146]:
windowed_array

[array([['2017-12-29T00:00:00.000000000', '2017-12-28T00:00:00.000000000',
         '2017-12-27T00:00:00.000000000', ...,
         '2017-11-20T00:00:00.000000000', '2017-11-17T00:00:00.000000000',
         '2017-11-16T00:00:00.000000000'],
        ['2017-12-21T00:00:00.000000000', '2017-12-20T00:00:00.000000000',
         '2017-12-19T00:00:00.000000000', ...,
         '2017-11-13T00:00:00.000000000', '2017-11-10T00:00:00.000000000',
         '2017-11-09T00:00:00.000000000'],
        ['2017-12-14T00:00:00.000000000', '2017-12-13T00:00:00.000000000',
         '2017-12-12T00:00:00.000000000', ...,
         '2017-11-06T00:00:00.000000000', '2017-11-03T00:00:00.000000000',
         '2017-11-02T00:00:00.000000000'],
        ...,
        ['1984-12-26T00:00:00.000000000', '1984-12-24T00:00:00.000000000',
         '1984-12-21T00:00:00.000000000', ...,
         '1984-11-15T00:00:00.000000000', '1984-11-14T00:00:00.000000000',
         '1984-11-13T00:00:00.000000000'],
        ['1984-12-18T00:00:

In [147]:
windowed_array[0]

array([['2017-12-29T00:00:00.000000000', '2017-12-28T00:00:00.000000000',
        '2017-12-27T00:00:00.000000000', ...,
        '2017-11-20T00:00:00.000000000', '2017-11-17T00:00:00.000000000',
        '2017-11-16T00:00:00.000000000'],
       ['2017-12-21T00:00:00.000000000', '2017-12-20T00:00:00.000000000',
        '2017-12-19T00:00:00.000000000', ...,
        '2017-11-13T00:00:00.000000000', '2017-11-10T00:00:00.000000000',
        '2017-11-09T00:00:00.000000000'],
       ['2017-12-14T00:00:00.000000000', '2017-12-13T00:00:00.000000000',
        '2017-12-12T00:00:00.000000000', ...,
        '2017-11-06T00:00:00.000000000', '2017-11-03T00:00:00.000000000',
        '2017-11-02T00:00:00.000000000'],
       ...,
       ['1984-12-26T00:00:00.000000000', '1984-12-24T00:00:00.000000000',
        '1984-12-21T00:00:00.000000000', ...,
        '1984-11-15T00:00:00.000000000', '1984-11-14T00:00:00.000000000',
        '1984-11-13T00:00:00.000000000'],
       ['1984-12-18T00:00:00.000000000', '19

In [138]:
windowed_array[0][0]

array(['2017-12-29T00:00:00.000000000', '2017-12-28T00:00:00.000000000',
       '2017-12-27T00:00:00.000000000', '2017-12-26T00:00:00.000000000',
       '2017-12-22T00:00:00.000000000', '2017-12-21T00:00:00.000000000',
       '2017-12-20T00:00:00.000000000', '2017-12-19T00:00:00.000000000',
       '2017-12-18T00:00:00.000000000', '2017-12-15T00:00:00.000000000',
       '2017-12-14T00:00:00.000000000', '2017-12-13T00:00:00.000000000',
       '2017-12-12T00:00:00.000000000', '2017-12-11T00:00:00.000000000',
       '2017-12-08T00:00:00.000000000', '2017-12-07T00:00:00.000000000',
       '2017-12-06T00:00:00.000000000', '2017-12-05T00:00:00.000000000',
       '2017-12-04T00:00:00.000000000', '2017-12-01T00:00:00.000000000',
       '2017-11-30T00:00:00.000000000', '2017-11-29T00:00:00.000000000',
       '2017-11-28T00:00:00.000000000', '2017-11-27T00:00:00.000000000',
       '2017-11-24T00:00:00.000000000', '2017-11-22T00:00:00.000000000',
       '2017-11-21T00:00:00.000000000', '2017-11-20

In [137]:
windowed_array[0][len(windowed_array[0])-1]

array(['1984-12-11T00:00:00.000000000', '1984-12-10T00:00:00.000000000',
       '1984-12-07T00:00:00.000000000', '1984-12-06T00:00:00.000000000',
       '1984-12-05T00:00:00.000000000', '1984-12-04T00:00:00.000000000',
       '1984-12-03T00:00:00.000000000', '1984-11-30T00:00:00.000000000',
       '1984-11-29T00:00:00.000000000', '1984-11-28T00:00:00.000000000',
       '1984-11-27T00:00:00.000000000', '1984-11-26T00:00:00.000000000',
       '1984-11-23T00:00:00.000000000', '1984-11-21T00:00:00.000000000',
       '1984-11-20T00:00:00.000000000', '1984-11-19T00:00:00.000000000',
       '1984-11-16T00:00:00.000000000', '1984-11-15T00:00:00.000000000',
       '1984-11-14T00:00:00.000000000', '1984-11-13T00:00:00.000000000',
       '1984-11-12T00:00:00.000000000', '1984-11-09T00:00:00.000000000',
       '1984-11-08T00:00:00.000000000', '1984-11-07T00:00:00.000000000',
       '1984-11-06T00:00:00.000000000', '1984-11-05T00:00:00.000000000',
       '1984-11-02T00:00:00.000000000', '1984-11-01

In [113]:
len(windowed_array[0])

1667

In [122]:
len(windowed_array[0][len(windowed_array[0])-1])

30

In [112]:
windowed_array[0]

array([['1984-10-24T00:00:00.000000000', '1984-10-25T00:00:00.000000000',
        '1984-10-26T00:00:00.000000000', ...,
        '1984-12-03T00:00:00.000000000', '1984-12-04T00:00:00.000000000',
        '1984-12-05T00:00:00.000000000'],
       ['1984-10-31T00:00:00.000000000', '1984-11-01T00:00:00.000000000',
        '1984-11-02T00:00:00.000000000', ...,
        '1984-12-10T00:00:00.000000000', '1984-12-11T00:00:00.000000000',
        '1984-12-12T00:00:00.000000000'],
       ['1984-11-07T00:00:00.000000000', '1984-11-08T00:00:00.000000000',
        '1984-11-09T00:00:00.000000000', ...,
        '1984-12-17T00:00:00.000000000', '1984-12-18T00:00:00.000000000',
        '1984-12-19T00:00:00.000000000'],
       ...,
       ['2017-10-27T00:00:00.000000000', '2017-10-30T00:00:00.000000000',
        '2017-10-31T00:00:00.000000000', ...,
        '2017-12-06T00:00:00.000000000', '2017-12-07T00:00:00.000000000',
        '2017-12-08T00:00:00.000000000'],
       ['2017-11-03T00:00:00.000000000', '20

In [64]:
def my_function(input_array):
    """Return the mean of `input_array`, computed with its own `.mean()` method."""
    window_mean = input_array.mean()
    return window_mean

def apply_rolling_data(data, col, function, window, step=1, labels=None):
    """Perform a rolling window analysis at the column `col` from `data`

    Given a dataframe `data` with time series, call `function` at
    sections of length `window` at the data of column `col`. Append
    the results to `data` as new columns named by `labels`. Each
    result is stored on the last row of its window; all other rows of
    the new columns are NaN.

    Parameters
    ----------
    data : DataFrame
        Data to be analyzed, the dataframe must store time series
        columnwise, i.e., each column represents a time series and each
        row a time index. It is modified in place.
    col : str
        Name of the column from `data` to be analyzed
    function : callable
        Function to be called to calculate the rolling window
        analysis. It receives one window as a read-only 1-D numpy
        array and must return either a number or a 1-D array-like of
        fixed length
    window : int
        length of the window to perform the analysis
    step : int
        step to take between two consecutive windows
    labels : str or list of str, optional
        Name(s) of the output column(s), one per value returned by
        `function`. If None, defaults to 'metric_0', 'metric_1', ...

    Returns
    -------
    data : DataFrame
        Input dataframe with added columns with the result of the
        analysis performed

    Raises
    ------
    ValueError
        If `window` or `step` is not positive, if `data` has fewer
        rows than `window`, or if the number of labels does not match
        the number of values returned by `function`

    """
    if window < 1 or step < 1:
        raise ValueError("window and step must be positive integers")
    if len(data) < window:
        raise ValueError(
            f"data has {len(data)} rows, fewer than the window length {window}")

    x = _strided_app(data[col].to_numpy(), window, step)
    rolled = np.apply_along_axis(function, 1, x)
    if rolled.ndim == 1:
        # scalar-valued `function`: treat the result as a single output column
        rolled = rolled.reshape(-1, 1)

    if labels is None:
        labels = [f"metric_{i}" for i in range(rolled.shape[1])]
    elif isinstance(labels, str):
        # a bare string is one column name, not an iterable of characters
        labels = [labels]
    else:
        labels = list(labels)
    if len(labels) != rolled.shape[1]:
        raise ValueError(
            f"got {len(labels)} labels for {rolled.shape[1]} values per window")

    for label in labels:
        data[label] = np.nan

    # Positions of the last row of every window; there are exactly as many of
    # them as there are rows in `rolled`. Positional assignment keeps this
    # correct even when the index contains duplicate labels.
    window_ends = np.arange(window - 1, len(data), step)
    data.iloc[window_ends, data.columns.get_indexer(labels)] = rolled

    return data


def _strided_app(a, L, S):  # Window len = L, Stride len/stepsize = S
    """Return a read-only 2-D strided view of the 1-D array `a`.

    Row i holds a[i*S : i*S + L]; trailing elements that do not fill a
    whole window are dropped.
    """
    nrows = ((a.size - L) // S) + 1
    n = a.strides[0]
    # writeable=False: rows overlap and alias `a`, so writes would corrupt it
    return np.lib.stride_tricks.as_strided(
        a, shape=(nrows, L), strides=(S * n, n), writeable=False)