In [1]:
from mlchartist.array_builder import full_dataset_randomised_arrays_
from mlchartist.preprocessing import train_test_split, thresholds_encoding

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random

# Preprocessing

## Importing NASDAQ_100 Files

In [2]:
# Load the Nasdaq 100 components file.
# header=None makes pandas treat the first row as data rather than as column names.
NASDAQ100_CSV = '../../raw_data/metadata/nasdaq100.csv'
nasdaq100 = pd.read_csv(NASDAQ100_CSV, header=None)

In [3]:
# Collapse the frame's 2-D block of values into one flat Python list
nasdaq100_list = [symbol for symbol in nasdaq100.values.ravel()]

In [26]:
# Samples n random companies from Nasdaq 100 list.
# A dedicated, seeded generator makes the sample (and everything derived from
# it) identical on every Restart & Run All, without touching the global
# `random` state used elsewhere in the notebook.
SAMPLE_SEED = 42
N_TICKERS = 5

ticker_rng = random.Random(SAMPLE_SEED)
# Never ask for more tickers than the list holds (random.sample would raise).
rand_tickers = ticker_rng.sample(nasdaq100_list, min(N_TICKERS, len(nasdaq100_list)))
rand_tickers

['INTC', 'CTSH', 'ALXN', 'ILMN', 'ALGN']

In [27]:
# Takes nrows from history from each company from the sample
nrows = 3000

# Read every ticker's processed CSV first and concatenate once at the end.
# DataFrame.append inside a loop re-copies the accumulated frame on each
# iteration and no longer exists in pandas 2.x.
ticker_frames = []
for ticker in rand_tickers:
    # File names are the lower-cased ticker symbol with surrounding whitespace removed
    ticker_path = '../../raw_data/processed/' + ticker.strip().lower() + '.csv'
    ticker_frames.append(pd.read_csv(ticker_path, nrows=nrows))

# pd.concat rejects an empty list, so fall back to an empty frame in that case.
# Each frame keeps its own row index, exactly as repeated append() did.
joined_df = pd.concat(ticker_frames) if ticker_frames else pd.DataFrame()
    

In [28]:
# Echo the sampled tickers, one symbol per line

for symbol in rand_tickers:
    print(symbol)

INTC
CTSH
ALXN
ILMN
ALGN


In [29]:
# Feature columns fed to the model.
# 'ADI' and 'OBV' exist in the data but are deliberately left out; the earlier
# list placed them between 'Stochastic_signal' and 'ATR'.
INPUT_COLS = [
    'RSI', 'Stochastic', 'Stochastic_signal',
    'ATR',
    'ADX', 'ADX_pos', 'ADX_neg',
    'MACD', 'MACD_diff', 'MACD_signal',
    '1D_past_return', '5D_past_return', '10D_past_return',
]

# Convert the 'date' column into pandas datetimes
joined_df['date'] = pd.to_datetime(joined_df['date'])

# Target encoding is delegated to the project helper.
# NOTE(review): it is expected to provide the '5D_return_bin', '10D_return_bin'
# and '20D_return_bin' columns used later - confirm against mlchartist.preprocessing.
joined_df = thresholds_encoding(joined_df)

# Earlier hand-rolled encoding, kept for reference only:
#   FIVE_TR = 0.0006, TEN_TR = 0.0012, TWENTY_TR = 0.0024
#   joined_df['5D_return_bin'] = (joined_df['5TD_return'] >= FIVE_TR)
#   joined_df['10D_return_bin'] = (joined_df['10TD_return'] >= TEN_TR)
#   joined_df['20D_return_bin'] = (joined_df['20TD_return'] >= TWENTY_TR)

In [30]:
# Preview the first rows of the combined frame after date parsing and target encoding
joined_df.head()

Unnamed: 0,ticker,date,RSI,Stochastic,Stochastic_signal,ADI,OBV,ATR,ADX,ADX_pos,...,MACD_signal,5TD_return,10TD_return,20TD_return,1D_past_return,5D_past_return,10D_past_return,5D_return_bin,10D_return_bin,20D_return_bin
0,INTC,1972-06-26,99.964229,100.0,100.0,0.0,714414509,0.000461,19.897066,99.906352,...,0.000528,0.0,0.0,0.0,0.0,0.506089,0.506089,0,0,0
1,INTC,1972-06-27,99.964229,100.0,100.0,0.0,720290438,0.000429,25.605326,99.906352,...,0.000755,0.0,0.0,0.0,0.0,0.506089,0.506089,0,0,0
2,INTC,1972-06-28,99.964229,100.0,100.0,0.0,747548205,0.000398,30.905853,99.906352,...,0.000974,0.0,0.0,0.0,0.0,0.506089,0.506089,0,0,0
3,INTC,1972-06-29,99.964229,100.0,100.0,0.0,754729896,0.000369,35.827771,99.906352,...,0.001173,0.0,0.0,0.0,0.0,0.0,0.506089,0,0,0
4,INTC,1972-06-30,99.964229,100.0,100.0,0.0,764686322,0.000343,40.398123,99.906352,...,0.001346,0.0,0.0,0.0,0.0,0.0,0.506089,0,0,0


In [None]:
# One histogram per input feature, drawn before any outlier filtering

feature_frame = joined_df[INPUT_COLS]
feature_frame.hist(bins=50, figsize=(20, 15))
plt.show()

In [31]:

# Target column(s) and the accepted value range used for outlier validation
TARGET_COLS = ['10D_return_bin']
outlier_validation = {'1D_past_return': [-0.3, 0.3]}

stride = 1

# Keyword arguments for the project's array builder, gathered in one place
builder_kwargs = dict(
    stride=stride,
    input_cols=INPUT_COLS,
    outlier_threshold=1,
    outlier_validation=outlier_validation,
    check_train_outliers=True,
    check_test_outliers=False,
    target_col=TARGET_COLS,
    time_window=30,
    test_set_size='2Y',
)

train_x, train_y, test_x, test_y, scaler = full_dataset_randomised_arrays_(joined_df, **builder_kwargs)

# Two blank lines, then a summary of the array shapes that came back
print('\n\n### Stats ###')
for label, array in (('train_x', train_x),
                     ('train_y', train_y),
                     ('test_x', test_x),
                     ('test_y', test_y)):
    print(label, array.shape)
print('scaler', scaler)

print('\n\n### Validation ###')


5 Companies in Dataset
Starting INTC: Company 1 of 5
Starting CTSH: Company 2 of 5
Starting ALXN: Company 3 of 5
Starting ILMN: Company 4 of 5
Starting ALGN: Company 5 of 5
All Companies Completed

Processing Stats: {'INTC': {'train_possible_windows': 2569.0, 'train_outliers': 249, 'train_windows': 2320, 'test_possible_windows': 431.0, 'test_outliers': 29, 'test_windows': 402}, 'CTSH': {'train_possible_windows': 2698.0, 'train_outliers': 29, 'train_windows': 2669, 'test_possible_windows': 302.0, 'test_outliers': 29, 'test_windows': 273}, 'ALXN': {'train_possible_windows': 2698.0, 'train_outliers': 29, 'train_windows': 2669, 'test_possible_windows': 302.0, 'test_outliers': 29, 'test_windows': 273}, 'ILMN': {'train_possible_windows': 2698.0, 'train_outliers': 89, 'train_windows': 2609, 'test_possible_windows': 302.0, 'test_outliers': 29, 'test_windows': 273}, 'ALGN': {'train_possible_windows': 2698.0, 'train_outliers': 149, 'train_windows': 2549, 'test_possible_windows': 302.0, 'test_out

In [32]:
# Samples n rows of train_x (and the matching rows of train_y)

indx = list(range(len(train_x)))
# Cap the request at the number of available windows; random.sample raises
# ValueError when asked for more items than the population holds.
n_sample = min(10000, len(indx))
sample_indx = random.sample(indx, n_sample)

# Index along the first axis with the flat list of row positions.
# The previous nested-list form (`train_y[[sample_indx]]`) relied on NumPy's
# removed "non-tuple sequence" indexing and produces an extra leading axis
# on current NumPy releases.
X_train_sample = train_x[sample_indx]
y_train_sample = train_y[sample_indx]

  


In [33]:
# Sanity check: shape of the sampled training targets
y_train_sample.shape

(10000, 1)

In [34]:
# Shape of the test targets, for comparison with the training sample
test_y.shape

(1494, 1)

In [55]:
from tensorflow.keras.metrics import Precision
from tensorflow.keras import regularizers
from tensorflow.keras import Sequential
from tensorflow.keras import layers, models 
from tensorflow.keras.optimizers import RMSprop, Adam, Adamax

# Module-level instances retained so that any cell referring to `optim` or
# `precision` by name keeps working; init_model() builds its own instances
# and does not use these.
optim = RMSprop(learning_rate=0.0001)
precision = Precision()

def init_model(input_shape=(30, 13), learning_rate=0.0001):
    """Build and compile the stacked-LSTM binary classifier.

    Architecture: two LSTM(200, tanh) layers, then Dense 200 -> 100 -> 50
    (ReLU) with dropout 0.3 after the second LSTM and after the first two
    dense layers, and a single sigmoid output unit.

    A new optimizer and a new Precision metric are created on every call, so
    each model starts with clean optimizer and metric state. Sharing one
    optimizer instance between models carries state over, and newer Keras
    versions reject an optimizer already built for another model's variables.

    Args:
        input_shape: (timesteps, features) of one input window. The default
            matches time_window=30 and the 13 entries of INPUT_COLS.
        learning_rate: learning rate passed to RMSprop.

    Returns:
        A compiled Sequential model using binary cross-entropy loss and
        reporting precision and accuracy, in that order.
    """
    reg_l1 = regularizers.l1(0.001)
    reg_l2 = regularizers.l2(0.001)
    reg_l1_l2 = regularizers.l1_l2(l1=0.001, l2=0.001)

    model = Sequential()
    # return_sequences=True so the second LSTM receives the full sequence
    model.add(layers.LSTM(200, return_sequences=True, input_shape=input_shape, activation='tanh'))
    model.add(layers.LSTM(200, activation='tanh'))
    model.add(layers.Dropout(0.3))
    #model.add(layers.BatchNormalization())
    model.add(layers.Dense(200, activation='relu', kernel_regularizer=reg_l1))
    model.add(layers.Dropout(0.3))
    model.add(layers.Dense(100, activation='relu', bias_regularizer=reg_l2))
    model.add(layers.Dropout(0.3))
    model.add(layers.Dense(50, activation='relu', activity_regularizer=reg_l1_l2))
    model.add(layers.Dense(1, activation='sigmoid'))

    model.compile(
        loss='binary_crossentropy',
        optimizer=RMSprop(learning_rate=learning_rate),
        # Explicit name keeps the history/log key stable ('precision') no
        # matter how many Precision objects the session has created.
        metrics=[Precision(name='precision'), 'accuracy'],
    )

    return model

In [56]:
from tensorflow.keras.callbacks import EarlyStopping

# Fresh, compiled network
model = init_model()

# Halt after 5 epochs without improvement of the monitored quantity
# (EarlyStopping's default monitor) and roll back to the best weights seen.
es = EarlyStopping(patience=5, restore_best_weights=True)

# Training configuration; 20% of the training arrays is held out for validation
fit_options = dict(
    epochs=500,
    batch_size=16,
    validation_split=0.2,
    callbacks=[es],
)

model.fit(train_x, train_y, **fit_options)

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500




<tensorflow.python.keras.callbacks.History at 0x7fa82c40b290>

In [57]:
# Total of the training targets divided by the number of training rows
np.sum(train_y) / train_y.shape[0]

0.5372191011235955

In [58]:
# Total of the test targets divided by the number of test rows
np.sum(test_y) / test_y.shape[0]

0.5414993306559571

In [59]:
# Score the trained model on the test arrays.
# The result is [loss, precision, accuracy]: the loss first, then the metrics
# in the order they were passed to model.compile().
model.evaluate(test_x, test_y)



[1.4125611782073975, 0.5266821384429932, 0.4892905056476593]