In [1]:
from mlchartist.preprocessing import fit_train_scaler, train_test_split, train_test_split_multiple_companies
from mlchartist.array_builder import full_dataset_randomised_arrays

import pandas as pd
import numpy as np

In [2]:
# Load the pre-processed per-company data.
apple = pd.read_csv('../../raw_data/processed/aapl.csv')
google = pd.read_csv('../../raw_data/processed/googl.csv')
amzn = pd.read_csv('../../raw_data/processed/amzn.csv')

# Stack the companies in a single pd.concat call. DataFrame.append was
# deprecated in pandas 1.4 and removed in 2.0, and appending to an empty frame
# copies the data on every call. The original row index of each frame is kept,
# exactly as the repeated append calls did.
joined_df = pd.concat([apple, google, amzn])

print('joined_df', len(joined_df))
print('')


# Split every company separately (500 is passed as the second argument of
# train_test_split) so the totals can be compared with the multi-company
# split performed in the next section.
(apple_train, apple_test), (google_train, google_test), (amazon_train, amazon_test) = (
    train_test_split(company_df, 500) for company_df in (apple, google, amzn)
)

print('trains', sum(len(part) for part in (apple_train, google_train, amazon_train)))
print('tests', sum(len(part) for part in (apple_test, google_test, amazon_test)))

joined_df 19103

trains 17603
tests 1500


## Train/Test Split Function

In [3]:
# Split the joined dataframe in one call; the row counts printed below can be
# compared with the per-company totals printed by the previous cell.
train_set, test_set = train_test_split_multiple_companies(joined_df, 500)

for label, frame in (('train_set', train_set), ('test_set', test_set)):
    print(label, len(frame))

train_set 17603
test_set 1500


## Fit Scaler Function
Fits a scaler on the training dataframe after outliers have been removed.

In [4]:
# Column -> [lower, upper] pair handed to the outlier check
# (presumably the allowed value range for that column -- TODO confirm).
outlier_validation = {'5TD_return': [-0.5, 0.5]}

# Columns handed to the scaler fit.
input_cols = [
    'RSI', 'Stochastic', 'Stochastic_signal', 'ADI', 'OBV',
    'ATR', 'ADX', 'ADX_pos', 'ADX_neg', 'MACD',
    'MACD_diff', 'MACD_signal', '5TD_return', '10TD_return', '20TD_return',
]

fitted_scaler = fit_train_scaler(
    train_set,
    outlier_validation=outlier_validation,
    input_cols=input_cols,
)
fitted_scaler

RobustScaler()

## New Build Arrays Function

#### Old Functionality
Pass the full dataframe; the function splits it into train/test sets and fits a scaler on the train set.

In [6]:
# Feature columns, target columns and the outlier check configuration.
INPUT_COLS = [
    'RSI', 'Stochastic', 'Stochastic_signal', 'ADI', 'OBV',
    'ATR', 'ADX', 'ADX_pos', 'ADX_neg', 'MACD',
    'MACD_diff', 'MACD_signal', '5TD_return', '10TD_return', '20TD_return',
]
TARGET_COLS = ['5TD_return', '10TD_return', '20TD_return']
outlier_validation = {'5TD_return': [-0.5, 0.5]}

stride = 100

# IMPORTANT: the full dataframe has to be passed by keyword (unsplit_df=...).
# No scaler and no pre-split frames are supplied here, so the function does
# the train/test split and the scaler fit itself (see the log output below).
train_x, train_y, test_x, test_y, scaler = full_dataset_randomised_arrays(
    unsplit_df=joined_df,
    stride=stride,
    input_cols=INPUT_COLS,
    target_col=TARGET_COLS,
    time_window=6,
    test_set_size=500,
    outlier_threshold=1,
    outlier_validation=outlier_validation,
    check_train_outliers=True,
    check_test_outliers=True,
)

# Two blank lines, then the shape summary.
print('', '', '### Stats ###', sep='\n')
for label, array in (
    ('train_x', train_x),
    ('train_y', train_y),
    ('test_x', test_x),
    ('test_y', test_y),
):
    print(label, array.shape)
print('scaler', scaler)

Train/Test Split: Splitting unsplit dataframe
Scaler: Fitting scaler on train set
3 Companies in Dataset
Starting AAPL: Company 1 of 3
Starting GOOGL: Company 2 of 3
Starting AMZN: Company 3 of 3
All Companies Completed



### Stats ###
train_x (177, 6, 15)
train_y (177, 3)
test_x (15, 6, 15)
test_y (15, 3)
scaler RobustScaler()


#### Pass Fitted Scaler to Function
Pass the full dataframe together with a pre-fitted scaler; the function uses it instead of fitting a new one.

In [7]:
# Feature columns, target columns and the outlier check configuration.
INPUT_COLS = [
    'RSI', 'Stochastic', 'Stochastic_signal', 'ADI', 'OBV',
    'ATR', 'ADX', 'ADX_pos', 'ADX_neg', 'MACD',
    'MACD_diff', 'MACD_signal', '5TD_return', '10TD_return', '20TD_return',
]
TARGET_COLS = ['5TD_return', '10TD_return', '20TD_return']
outlier_validation = {'5TD_return': [-0.5, 0.5]}

stride = 100

# IMPORTANT: the full dataframe has to be passed by keyword (unsplit_df=...).
train_x, train_y, test_x, test_y, scaler = full_dataset_randomised_arrays(
    unsplit_df=joined_df,
    fitted_scaler=fitted_scaler,  # <-- HERE: reuse the scaler fitted in the "Fit Scaler Function" section
    stride=stride,
    input_cols=INPUT_COLS,
    target_col=TARGET_COLS,
    time_window=6,
    test_set_size=500,
    outlier_threshold=1,
    outlier_validation=outlier_validation,
    check_train_outliers=True,
    check_test_outliers=True,
)

# Two blank lines, then the shape summary.
print('', '', '### Stats ###', sep='\n')
for label, array in (
    ('train_x', train_x),
    ('train_y', train_y),
    ('test_x', test_x),
    ('test_y', test_y),
):
    print(label, array.shape)
print('scaler', scaler)

Train/Test Split: Splitting unsplit dataframe
Scaler: Using provided fitted_scaler
3 Companies in Dataset
Starting AAPL: Company 1 of 3
Starting GOOGL: Company 2 of 3
Starting AMZN: Company 3 of 3
All Companies Completed



### Stats ###
train_x (177, 6, 15)
train_y (177, 3)
test_x (15, 6, 15)
test_y (15, 3)
scaler RobustScaler()


#### Pass Pre Split Train & Test Set
Instead of passing a full dataframe, pass a pre-split train set and test set to the function. <br>
NOTE: By default the function uses `unsplit_df`, so make sure to set `unsplit_df=None`.

In [8]:
# Feature columns, target columns and the outlier check configuration.
INPUT_COLS = [
    'RSI', 'Stochastic', 'Stochastic_signal', 'ADI', 'OBV',
    'ATR', 'ADX', 'ADX_pos', 'ADX_neg', 'MACD',
    'MACD_diff', 'MACD_signal', '5TD_return', '10TD_return', '20TD_return',
]
TARGET_COLS = ['5TD_return', '10TD_return', '20TD_return']
outlier_validation = {'5TD_return': [-0.5, 0.5]}

stride = 100

# unsplit_df is explicitly None so the pre-split frames are the ones used.
train_x, train_y, test_x, test_y, scaler = full_dataset_randomised_arrays(
    unsplit_df=None,
    train_df=train_set,  # <-- HERE
    test_df=test_set,    # <-- HERE
    stride=stride,
    input_cols=INPUT_COLS,
    target_col=TARGET_COLS,
    time_window=6,
    test_set_size=500,
    outlier_threshold=1,
    outlier_validation=outlier_validation,
    check_train_outliers=True,
    check_test_outliers=True,
)

# Two blank lines, then the shape summary.
print('', '', '### Stats ###', sep='\n')
for label, array in (
    ('train_x', train_x),
    ('train_y', train_y),
    ('test_x', test_x),
    ('test_y', test_y),
):
    print(label, array.shape)
print('scaler', scaler)

Train/Test Split: Using provided train/test split dataframe
Scaler: Fitting scaler on train set
3 Companies in Dataset
Starting AAPL: Company 1 of 3
Starting GOOGL: Company 2 of 3
Starting AMZN: Company 3 of 3
All Companies Completed



### Stats ###
train_x (177, 6, 15)
train_y (177, 3)
test_x (15, 6, 15)
test_y (15, 3)
scaler RobustScaler()


#### Don't Split The Data (Holdout Dataset)
Passes the full dataframe as `unsplit_df` with `split_dataframe=False`, so the data is not split and a single set of arrays is returned.

In [9]:
# Feature columns, target columns and the outlier check configuration.
INPUT_COLS = [
    'RSI', 'Stochastic', 'Stochastic_signal', 'ADI', 'OBV',
    'ATR', 'ADX', 'ADX_pos', 'ADX_neg', 'MACD',
    'MACD_diff', 'MACD_signal', '5TD_return', '10TD_return', '20TD_return',
]
TARGET_COLS = ['5TD_return', '10TD_return', '20TD_return']
outlier_validation = {'5TD_return': [-0.5, 0.5]}

stride = 100

# With split_dataframe=False only three values come back: x, y and the scaler.
# NOTE(review): no fitted_scaler is passed, and the log below reports that a
# scaler is being fitted, so it is presumably fit on the holdout data itself.
# For a real holdout set consider passing fitted_scaler=fitted_scaler -- confirm
# that the function supports this together with split_dataframe=False.
holdout_x, holdout_y, scaler = full_dataset_randomised_arrays(
    unsplit_df=joined_df,
    split_dataframe=False,  # <-- HERE, use for holdout
    stride=stride,
    input_cols=INPUT_COLS,
    target_col=TARGET_COLS,
    time_window=6,
    test_set_size=500,
    outlier_threshold=1,
    outlier_validation=outlier_validation,
    check_train_outliers=True,
    check_test_outliers=True,
)

# Two blank lines, then the shape summary.
print('', '', '### Stats ###', sep='\n')
for label, array in (('holdout_x', holdout_x), ('holdout_y', holdout_y)):
    print(label, array.shape)
print('scaler', scaler)

Train/Test Split: Not splitting dataframe. (Holdout Data)
Scaler: Fitting scaler on train set
3 Companies in Dataset
Starting AAPL: Company 1 of 3
Starting GOOGL: Company 2 of 3
Starting AMZN: Company 3 of 3
All Companies Completed



### Stats ###
holdout_x (192, 6, 15)
holdout_y (192, 3)
scaler RobustScaler()
