In [1]:
from mlchartist.array_builder import full_dataset_randomised_arrays
from mlchartist.preprocessing import train_test_split

import pandas as pd
import numpy as np

In [2]:
# Load the pre-processed per-company frames.
apple = pd.read_csv('../raw_data/processed/aapl.csv')
google = pd.read_csv('../raw_data/processed/googl.csv')
amzn = pd.read_csv('../raw_data/processed/amzn.csv')

# Stack the three companies into one frame. DataFrame.append was deprecated in
# pandas 1.4 and removed in 2.0; a single pd.concat gives the same rows and
# keeps each frame's original row index, exactly as the chained appends did.
joined_df = pd.concat([apple, google, amzn])

print('joined_df', len(joined_df))
print('')


# Per-company splits, used further down to sanity-check the window counts
# produced by the array builder.
apple_train, apple_test = train_test_split(apple, '3Y')
google_train, google_test = train_test_split(google, '3Y')
amazon_train, amazon_test = train_test_split(amzn, '3Y')

print('trains', len(apple_train) + len(google_train) + len(amazon_train))
print('tests', len(apple_test) + len(google_test) + len(amazon_test))

joined_df 19103

trains 16850
tests 2253


In [9]:
# Feature / target configuration for this run.
INPUT_COLS = [
    'RSI', 'Stochastic', 'Stochastic_signal', 'ADI', 'OBV', 'ATR', 'ADX', 'ADX_pos', 'ADX_neg',
    'MACD', 'MACD_diff', 'MACD_signal', '5TD_return', '10TD_return', '20TD_return',
]
TARGET_COLS = ['5TD_return', '10TD_return', '20TD_return']
outlier_validation = {'5TD_return': [-0.5, 0.5]}

stride = 80


# Build the arrays with the library implementation.
train_x, train_y, test_x, test_y, scaler = full_dataset_randomised_arrays(
    joined_df,
    test_set_size='3Y',
    time_window=6,
    stride=stride,
    check_train_outliers=True,
    check_test_outliers=True,
    outlier_threshold=1,
    input_cols=INPUT_COLS,
    target_col=TARGET_COLS,
    outlier_validation=outlier_validation,
)

# Shapes of what came back.
print('\n')
print('### Stats ###')
for label, value in (
    ('train_x', train_x.shape),
    ('train_y', train_y.shape),
    ('test_x', test_x.shape),
    ('test_y', test_y.shape),
    ('scaler', scaler),
):
    print(label, value)

# Rows-per-stride for every split, to compare against the window counts above.
print('\n')
print('### Validation ###')
for label, frame in (
    ('apple_train', apple_train),
    ('apple_test', apple_test),
    ('google_train', google_train),
    ('google_test', google_test),
    ('amazon_train', amazon_train),
    ('amazon_test', amazon_test),
):
    print(label, len(frame) / stride)

train_rows = len(apple_train) + len(google_train) + len(amazon_train)
test_rows = len(apple_test) + len(google_test) + len(amazon_test)
print('all trains sets', train_rows / stride)
print('all tests sets', test_rows / stride)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['date'] = pd.to_datetime(df['date'], format=('%Y-%m-%d'))


3 Companies in Dataset
Starting AAPL: Company 1 of 3
Starting GOOGL: Company 2 of 3
Starting AMZN: Company 3 of 3
All Companies Completed

Processing Stats: {'AAPL': {'train_possible_windows': 167.28, 'train_outliers': 0, 'train_windows': 168, 'test_possible_windows': 15.02, 'test_outliers': 1, 'test_windows': 15}, 'GOOGL': {'train_possible_windows': 66.66, 'train_outliers': 0, 'train_windows': 67, 'test_possible_windows': 15.02, 'test_outliers': 1, 'test_windows': 15}, 'AMZN': {'train_possible_windows': 103.06, 'train_outliers': 1, 'train_windows': 103, 'test_possible_windows': 15.02, 'test_outliers': 1, 'test_windows': 15}}


### Stats ###
train_x (338, 6, 15)
train_y (338, 6, 3)
test_x (45, 6, 15)
test_y (45, 6, 3)
scaler RobustScaler()


### Validation ###
apple_train 167.28
apple_test 15.02
google_train 66.66
google_test 15.02
amazon_train 103.06
amazon_test 15.02
all trains sets 337.0
all tests sets 45.06


In [34]:
from mlchartist.preprocessing import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
import random
import pandas as pd
import numpy as np

def _collect_split_windows(split_df, scaler, time_window, stride, check_outliers,
                           outlier_validation, input_cols, target_col):
    """
    Cut one company's train or test frame into scaled, fixed-length windows.

    Returns (x_windows, y_targets, rejected_count, possible_windows) where
    possible_windows is len(split_df) / stride (a float) and rejected_count
    covers both outlier windows and windows shorter than time_window.
    """
    # Newest row first, so position 0 of every window is its most recent date.
    ordered = split_df.sort_values('date', ascending=False).reset_index(drop=True)

    x_windows = []
    y_targets = []
    rejected = 0
    for start in range(0, len(ordered), stride):
        window = ordered.iloc[start: start + time_window].copy()

        # A window is an outlier if any checked column leaves its [low, high] band.
        is_outlier = bool(check_outliers) and any(
            (window[col] < bounds[0]).any() or (window[col] > bounds[1]).any()
            for col, bounds in outlier_validation.items()
        )

        if len(window) == time_window and not is_outlier:
            # Scale in place; the target is read AFTER scaling, so any target
            # column that is also an input column comes back in scaled units.
            window.loc[:, input_cols] = scaler.transform(window[input_cols])
            x_windows.append(np.array(window[input_cols].values))
            y_targets.append(np.array(window[target_col].iloc[0]))
        else:
            rejected += 1

    return x_windows, y_targets, rejected, len(ordered) / stride


def full_dataset_randomised_arrays_FIXED(df, 
                                         test_set_size='3Y', 
                                         time_window=5, 
                                         stride=3, 
                                         check_train_outliers=False, 
                                         check_test_outliers=False, 
                                         outlier_threshold=1, 
                                         input_cols=['RSI', 'Stochastic', 'Stochastic_signal', 'ADI','OBV', 'ATR', 'ADX', 
                                                     'ADX_pos', 'ADX_neg', 'MACD', 'MACD_diff', 'MACD_signal', '1D_past_return', 
                                                     '5D_past_return', '10D_past_return'], 
                                         target_col=['1D_past_return', '5D_past_return', '10D_past_return'], 
                                         outlier_validation={'ATR': [-100, 100], 'Stochastic': [0, 100], 
                                                             'Stochastic_signal': [-10, 110], '5D_past_return': [-0.5, 0.5]}):
    """
    Transform a multi-company dataframe into train/test input and target arrays.

    Each company (unique value of df['ticker']) is split into train and test
    with train_test_split, a RobustScaler is fitted on the outlier-free train
    rows of all companies, and every split is then cut into windows of
    time_window rows taken every stride rows, newest date first.

    Takes:
    df - input dataframe; must contain 'ticker', 'date', every column in
        input_cols / target_col and every key of outlier_validation
    test_set_size (default='3Y') - passed straight through to train_test_split
    time_window (default=5) - time series length (rows per window)
    stride (default=3) - step between window starts
        (i.e. max_num_windows = len(company_split) / stride)
    check_train_outliers (default=False) - reject train windows that violate outlier_validation
    check_test_outliers (default=False) - reject test windows that violate outlier_validation
    outlier_threshold (default=1) - maximum allowed ratio of rejected windows
        (outliers plus windows shorter than time_window) to possible windows;
        a company split above this ratio is dropped entirely
    input_cols - all input features that should be included in the input array
    target_col - all columns that should be included in the target array; the
        value is taken from the first (newest) row of each window
    outlier_validation - a dict that sets the outlier checks to be completed. Enter data in the format:
        outlier_validation={'column_name': [lower_threshold, upper_threshold]} 
        Example: {'Stochastic': [0, 100], 'Stochastic_signal': [-10, 110], '5D_past_return': [-0.5, 0.5]}

    Return tuple (train_x, train_y, test_x, test_y, scaler).

    train_x / test_x dim: (number_of_samples x time_window x features_number)
    train_y / test_y dim: (number_of_samples x returns_number) when target_col is a list
    scaler: the fitted RobustScaler

    Only the train arrays are shuffled (x and y with the same permutation);
    the test arrays keep company order. The list/dict defaults are shared
    between calls but are only read here, never mutated.

    Raises ValueError if df contains no tickers.
    """
    tickers = df['ticker'].unique()
    if tickers.size == 0:
        raise ValueError("df contains no tickers; nothing to build arrays from")

    ## split into train/test split, one company at a time
    company_splits = []
    for ticker in tickers:
        company_df = df[df['ticker'] == ticker]
        company_train, company_test = train_test_split(company_df, test_set_size)
        company_splits.append((ticker, company_train, company_test))

    ## fit scaler on train rows that pass every outlier check
    # pd.concat replaces the removed DataFrame.append and avoids re-copying the
    # accumulated frame once per company.
    scaler_fit_df = pd.concat([train for _, train, _ in company_splits])
    for col, bounds in outlier_validation.items():
        scaler_fit_df = scaler_fit_df[scaler_fit_df[col].between(bounds[0], bounds[1])]
    scaler = RobustScaler()
    scaler.fit(scaler_fit_df[input_cols])

    train_x = []
    train_y = []
    test_x = []
    test_y = []
    stats = {}

    ## go company by company
    print(f"{tickers.size} Companies in Dataset")
    for position, (ticker, company_train, company_test) in enumerate(company_splits, start=1):
        stats[ticker] = {}
        print(f"Starting {ticker}: Company {position} of {tickers.size}")

        for split_name, split_df, check_outliers, x_out, y_out in (
            ('train', company_train, check_train_outliers, train_x, train_y),
            ('test', company_test, check_test_outliers, test_x, test_y),
        ):
            windows_x, windows_y, rejected, possible = _collect_split_windows(
                split_df, scaler, time_window, stride, check_outliers,
                outlier_validation, input_cols, target_col)

            # Each split is gated on its OWN rejection ratio (the test split
            # used to be gated on the train counts). An empty split has no
            # possible windows; treat its ratio as 0 instead of dividing by zero.
            rejected_ratio = rejected / possible if possible else 0.0
            if rejected_ratio <= outlier_threshold:
                stats[ticker][f'{split_name}_possible_windows'] = possible
                stats[ticker][f'{split_name}_outliers'] = rejected
                stats[ticker][f'{split_name}_windows'] = len(windows_x)
                x_out.extend(windows_x)
                y_out.extend(windows_y)

    print('All Companies Completed')
    print('')
    print('Processing Stats:', stats)

    ## shuffle train arrays with one shared permutation so x/y stay aligned
    shuffled_order = random.sample(range(len(train_x)), len(train_x))
    output_train_x = [train_x[i] for i in shuffled_order]
    output_train_y = [train_y[i] for i in shuffled_order]
    return np.array(output_train_x), np.array(output_train_y), np.array(test_x), np.array(test_y), scaler

In [31]:
# Scratch check: draw 5 distinct integers from 0..99 without replacement.
random.sample(range(100), k=5)

[75, 83, 66, 87, 39]

In [30]:
# Scratch check: pick a single element from the integers 0..9.
x = [*range(10)]
random.choice(x)

0

In [35]:
# Feature / target configuration for this run.
INPUT_COLS = [
    'RSI', 'Stochastic', 'Stochastic_signal', 'ADI', 'OBV', 'ATR', 'ADX', 'ADX_pos', 'ADX_neg',
    'MACD', 'MACD_diff', 'MACD_signal', '5TD_return', '10TD_return', '20TD_return',
]
TARGET_COLS = ['5TD_return', '10TD_return', '20TD_return']
outlier_validation = {'5TD_return': [-0.5, 0.5]}

stride = 100


# Build the arrays with the notebook-local implementation.
train_x, train_y, test_x, test_y, scaler = full_dataset_randomised_arrays_FIXED(
    joined_df,
    test_set_size='3Y',
    time_window=6,
    stride=stride,
    check_train_outliers=True,
    check_test_outliers=True,
    outlier_threshold=1,
    input_cols=INPUT_COLS,
    target_col=TARGET_COLS,
    outlier_validation=outlier_validation,
)

# Shapes of what came back.
print('\n')
print('### Stats ###')
for label, value in (
    ('train_x', train_x.shape),
    ('train_y', train_y.shape),
    ('test_x', test_x.shape),
    ('test_y', test_y.shape),
    ('scaler', scaler),
):
    print(label, value)

# Rows-per-stride for every split, to compare against the window counts above.
print('\n')
print('### Validation ###')
for label, frame in (
    ('apple_train', apple_train),
    ('apple_test', apple_test),
    ('google_train', google_train),
    ('google_test', google_test),
    ('amazon_train', amazon_train),
    ('amazon_test', amazon_test),
):
    print(label, len(frame) / stride)

train_rows = len(apple_train) + len(google_train) + len(amazon_train)
test_rows = len(apple_test) + len(google_test) + len(amazon_test)
print('all trains sets', train_rows / stride)
print('all tests sets', test_rows / stride)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['date'] = pd.to_datetime(df['date'], format=('%Y-%m-%d'))


3 Companies in Dataset
Starting AAPL: Company 1 of 3
Starting GOOGL: Company 2 of 3
Starting AMZN: Company 3 of 3
All Companies Completed

Processing Stats: {'AAPL': {'train_possible_windows': 83.64, 'train_outliers': 0, 'train_windows': 84, 'test_possible_windows': 7.51, 'test_outliers': 0, 'test_windows': 8}, 'GOOGL': {'train_possible_windows': 33.33, 'train_outliers': 0, 'train_windows': 34, 'test_possible_windows': 7.51, 'test_outliers': 0, 'test_windows': 8}, 'AMZN': {'train_possible_windows': 51.53, 'train_outliers': 0, 'train_windows': 52, 'test_possible_windows': 7.51, 'test_outliers': 0, 'test_windows': 8}}
len(train_x) 170
index_list [151, 111, 55, 92, 70, 120, 41, 131, 113, 13, 91, 57, 37, 30, 71, 136, 123, 22, 14, 154, 11, 119, 61, 62, 5, 26, 46, 168, 98, 38, 77, 7, 84, 122, 158, 68, 112, 21, 23, 54, 49, 33, 94, 85, 66, 9, 142, 141, 15, 74, 130, 82, 104, 137, 48, 109, 27, 95, 20, 16, 1, 44, 162, 51, 139, 166, 116, 34, 110, 0, 42, 4, 125, 133, 107, 102, 150, 75, 24, 134, 101

array([ 0.13040668, -0.07898534, -0.03820554])