In [2]:
import pandas as pd
import numpy as np

In [4]:
def build_arrays(
    df,
    time_window=5,
    stride=3,
    input_cols=['RSI', 'Stochastic', 'Stochastic_signal', 'ADI',
                'OBV', 'ATR', 'ADX', 'ADX_pos', 'ADX_neg', 'MACD', 'MACD_diff',
                'MACD_signal', '1D_past_return', '5D_past_return', '10D_past_return'],
    target_col='5TD_return',
):
    """
    Cut a dataframe into fixed-length windows and return them as arrays.

    The rows are ordered by the 'date' column, newest first, and a window of
    `time_window` consecutive rows is taken every `stride` rows. Windows that
    would run past the end of the frame are discarded.

    Takes:
    df - input dataframe; must contain a 'date' column plus every column
        named in `input_cols` and `target_col`
    time_window (default=5) - number of rows in each window
    stride (default=3) - number of rows between the starts of two consecutive windows
    input_cols - feature columns copied into each input window
        (default: the indicator columns and the 1D/5D/10D past-return columns)
    target_col (default = '5TD_return') - target column; the value from the
        first (newest) row of each window is used as that window's target

    Return tuple (input_array, target_array).

    input_array dim: (number_of_samples x time_window x features_number)
    target_array dim: number_of_samples
    """
    newest_first = df.sort_values('date', ascending=False).reset_index(drop=True)

    # Candidate windows start every `stride` rows; keep only the full-length ones.
    candidates = (
        newest_first.iloc[start: start + time_window]
        for start in range(0, len(df), stride)
    )
    full_windows = [window for window in candidates if len(window) == time_window]

    features = [np.array(window[input_cols].values) for window in full_windows]
    # The target is read from the first row of the window, i.e. the newest date.
    targets = [window[target_col].iloc[0] for window in full_windows]

    return np.array(features), np.array(targets)

In [7]:
# Load the processed AAPL CSV and list its column names.
test = pd.read_csv('../raw_data/processed/AAPL.csv')
test.keys()

Index(['ticker', 'date', 'RSI', 'Stochastic', 'Stochastic_signal', 'ADI',
       'OBV', 'ATR', 'ADX', 'ADX_pos', 'ADX_neg', 'MACD', 'MACD_diff',
       'MACD_signal', '5TD_return', '10TD_return', '20TD_return'],
      dtype='object')

In [9]:
# Feature columns passed to build_arrays.
# NOTE(review): the target column '20TD_return' is also listed as an input, and
# build_arrays reads the target from the first row of each window, so the target
# value itself is part of the input window - confirm this is intended.
INPUT_COLS = ['RSI', 'Stochastic', 'Stochastic_signal', 'ADI','OBV', 'ATR', 'ADX', 'ADX_pos', 'ADX_neg', 'MACD', 'MACD_diff','MACD_signal', '5TD_return', '10TD_return', '20TD_return']
#INPUT_COLS = ['RSI', 'Stochastic', 'Stochastic_signal']

# Build 6-row windows, moving 5 rows between consecutive window starts.
ticker_train_x, ticker_train_y = build_arrays(test,input_cols=INPUT_COLS, target_col='20TD_return', time_window=6, stride=5)

## Redo the function: sample windows at random start positions

In [116]:
# Rename the return columns to the names build_randomised_arrays uses in its defaults.
# NOTE(review): this rebinds `test`, so earlier cells that refer to the old
# '*TD_return' column names will fail if they are re-run after this cell.
test = test.rename(columns={'5TD_return':'1D_past_return', '10TD_return': '5D_past_return', '20TD_return':'10D_past_return'})

In [141]:
import random

def build_randomised_arrays(df, time_window=5, stride=3, check_outliers=False, input_cols=['RSI', 'Stochastic', 'Stochastic_signal', 'ADI',
       'OBV', 'ATR', 'ADX', 'ADX_pos', 'ADX_neg', 'MACD', 'MACD_diff',
       'MACD_signal', '1D_past_return', '5D_past_return', '10D_past_return'], target_col=['1D_past_return', '5D_past_return', '10D_past_return'], 
        outlier_validation={'ATR': [-100, 100], 'Stochastic': [0, 100], 'Stochastic_signal': [-10, 110], '5D_past_return': [-0.5, 0.5]},
        seed=None):
    """
    Transform a dataframe into input and target arrays built from windows
    that start at randomly chosen rows.

    The rows are ordered by the 'date' column, newest first. Up to
    int(len(df) / stride) window starts are drawn at random (with replacement);
    repeated starts are dropped, so the number of returned windows is usually
    smaller than that cap.

    Takes:
    df - input dataframe; must contain a 'date' column plus every column named
        in input_cols and target_col (and in outlier_validation when
        check_outliers is True)
    time_window (default=5) - time series length (rows per window)
    stride (default=3) - controls the number of windows taken
        (i.e. max_num_windows = int(len(df) / stride))
    check_outliers (default=False) - when True, windows containing a value
        outside the bounds given in outlier_validation are discarded
    input_cols - all input features that should be included in the input array
        (default: the indicator columns and the 1D/5D/10D past-return columns)
    target_col - all columns that should be included in the target array
        (default: ['1D_past_return', '5D_past_return', '10D_past_return'])
    outlier_validation - a dict that sets the outlier checks to be completed,
        in the format {'column_name': [lower_threshold, upper_threshold]}
        Example: {'Stochastic': [0, 100], 'Stochastic_signal': [-10, 110], '5D_past_return': [-0.5, 0.5]}
    seed (default=None) - when given, window starts are drawn from a private
        random.Random(seed) so the result is reproducible; when None the
        module-level `random` state is used

    Return tuple (input_array, target_array).

    input_array dim: (number_of_samples x time_window x features_number)
    target_array dim: (number_of_samples x time_window x returns_number)
    Both arrays are empty (shape (0,)) when no window can be produced.

    Raises ValueError if time_window or stride is smaller than 1.
    """
    if time_window < 1:
        raise ValueError("time_window must be at least 1")
    if stride < 1:
        raise ValueError("stride must be at least 1")

    df_sorted = df.sort_values('date', ascending=False).reset_index(drop=True)
    n_rows = len(df_sorted)
    max_num_windows = int(n_rows / stride)
    # Last row index at which a full window still fits inside the frame.
    last_start = n_rows - time_window
    if last_start < 0 or max_num_windows == 0:
        return np.array([]), np.array([])

    rng = random if seed is None else random.Random(seed)

    # Draw candidate starts over the whole valid range [0, last_start] so the
    # newest rows can be sampled too. A set gives O(1) duplicate detection while
    # the list keeps the draw order.
    window_starts = []
    seen_starts = set()
    for _ in range(max_num_windows):
        start = rng.randint(0, last_start)
        if start not in seen_starts:
            seen_starts.add(start)
            window_starts.append(start)

    input_array = []
    target_array = []
    for start in window_starts:
        df_slice = df_sorted.iloc[start: start + time_window]
        # Skip the window as soon as one checked column leaves its allowed range.
        if check_outliers and any(
            (df_slice[column] < bounds[0]).any() or (df_slice[column] > bounds[1]).any()
            for column, bounds in outlier_validation.items()
        ):
            continue
        input_array.append(np.array(df_slice[input_cols].values))
        target_array.append(np.array(df_slice[target_col].values))

    return np.array(input_array), np.array(target_array)

In [142]:
# Feature columns, using the renamed return columns.
INPUT_COLS = ['RSI', 'Stochastic', 'Stochastic_signal', 'ADI','OBV', 'ATR', 'ADX', 'ADX_pos', 'ADX_neg', 'MACD', 'MACD_diff','MACD_signal', '1D_past_return', '5D_past_return', '10D_past_return']

# Sample random 6-row windows and drop any window that violates the default outlier bounds.
# NOTE(review): the three return columns are used both as inputs and as targets - confirm this is intended.
ticker_train_x, ticker_train_y = build_randomised_arrays(test, input_cols=INPUT_COLS, check_outliers=True, target_col=['1D_past_return', '5D_past_return', '10D_past_return'], time_window=6)

In [136]:
# Cap on the number of sampled windows for the default stride=3 (before int truncation).
len(test)/3

3038.3333333333335

In [137]:
# Windows actually kept: repeated random starts and outlier windows are dropped, so this can be below the cap.
len(ticker_train_x)

2546

In [138]:
# (number_of_samples, time_window, features_number)
ticker_train_x.shape

(2546, 6, 15)

In [139]:
# One target window is stored per input window, so this should equal len(ticker_train_x).
len(ticker_train_y)

2546

In [132]:
# NOTE(review): the saved output of this cell (In [132], 2561 samples) comes from a
# different run than the len() cells (In [136]-[139], 2546 samples). Window starts are
# drawn at random, so the counts differ between runs; re-run the cells in order.
ticker_train_y.shape

(2561, 6, 3)

In [133]:
# NOTE(review): this displays the whole array; a slice such as ticker_train_x[:2] would keep the output short.
ticker_train_x

array([[[ 7.82105005e+01,  8.09254328e+01,  9.32309439e+01, ...,
          9.46876763e-02,  5.53930122e-02,  2.59252942e-02],
        [ 8.30196656e+01,  9.93976858e+01,  9.83023801e+01, ...,
          7.06107866e-02,  7.92040476e-02,  3.79068433e-02],
        [ 8.06952878e+01,  9.93697131e+01,  9.70907704e+01, ...,
          5.49677010e-02,  6.47300493e-02,  6.67644223e-02],
        [ 7.78155759e+01,  9.61397414e+01,  9.71019501e+01, ...,
          8.95714228e-03,  8.09623907e-02,  5.82013416e-02],
        [ 7.76685691e+01,  9.57628567e+01,  9.53602083e+01, ...,
          1.98014496e-02,  1.12418108e-01,  4.83112498e-02],
        [ 7.72307412e+01,  9.94032523e+01,  9.47386742e+01, ...,
          2.31974332e-02,  1.20081621e-01, -2.44861795e-02]],

       [[ 2.68325510e+01,  5.34484643e+00,  1.33786600e+01, ...,
          4.73708069e-02,  4.26110607e-03,  1.43608341e-01],
        [ 3.46211589e+01,  2.15833068e+01,  1.87370781e+01, ...,
         -9.53035510e-02, -1.76365025e-01,  3.43642