In [1]:
import os
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

from mlchartist.array_builder import build_arrays
from mlchartist.preprocessing import thresholds_encoding

# Function implementation from marcin_first_model

In [2]:
def stocks_selector_local(min_len = 2000, num_samp = 2, path='../raw_data/processed/'):
    """
    Return one concatenated pandas DataFrame with the data of randomly chosen stocks.

    Every ``.csv`` file in ``path`` with at least ``min_len`` data rows (the
    header line is not counted) is a candidate. ``num_samp`` candidates are
    drawn with ``random.sample`` and their contents are concatenated in draw
    order.

    Args:
        min_len: minimum number of data rows a file needs to be eligible.
        num_samp: number of files to draw.
        path: directory that holds the per-ticker CSV files.

    Returns:
        pandas.DataFrame built with ``pd.concat`` from the sampled files
        (each file keeps its own index).

    Raises:
        ValueError: if fewer than ``num_samp`` files satisfy ``min_len``.
    """
    candidates = []

    # sorted() gives a stable candidate order, so a seeded random.sample picks
    # the same files regardless of the order os.listdir happens to return.
    for file in sorted(os.listdir(path)):
        if not file.endswith('.csv'):
            continue
        with open(os.path.join(path, file)) as f:
            # The first line is the CSV header, not a trading day.
            rows_num = sum(1 for _ in f) - 1
        if rows_num >= min_len:
            candidates.append(file)

    if len(candidates) < num_samp:
        # Fail loudly here; carrying on would only crash later on an undefined sample.
        raise ValueError(
            f'Sample size ({num_samp}) is bigger than the number of companies '
            f'available ({len(candidates)}) with at least {min_len} rows in {path!r}'
        )

    sample = random.sample(candidates, num_samp)

    # Read from the directory that was scanned, not from a hard-coded location.
    dfs = [pd.read_csv(os.path.join(path, ticker)) for ticker in sample]

    return pd.concat(dfs)

In [3]:
# Seed Python's RNG first: stocks_selector_local draws its files with
# random.sample, so without a fixed seed a re-run may load different stocks.
random.seed(42)
df = stocks_selector_local()

In [4]:
# Preview the first rows of the freshly loaded stock data.
df.head()

Unnamed: 0,ticker,date,RSI,Stochastic,Stochastic_signal,ADI,OBV,ATR,ADX,ADX_pos,ADX_neg,MACD,MACD_diff,MACD_signal,5TD_return,10TD_return,20TD_return
0,VLGEA,2005-04-22,63.928477,86.705551,47.296591,-87605.671388,178930,0.251243,18.457228,31.986514,28.45944,0.110486,0.019626,0.090861,-0.030657,-0.028699,-0.033524
1,VLGEA,2005-04-25,50.468196,34.271923,47.302038,-123304.876703,139179,0.27254,17.388709,27.31721,29.297569,0.094898,0.00323,0.091668,0.043353,0.034041,0.024803
2,VLGEA,2005-04-26,56.047626,63.626307,61.534594,-118705.876703,143778,0.273923,16.396512,25.21143,27.039132,0.104889,0.010576,0.094312,-0.016811,-0.008925,-0.00195
3,VLGEA,2005-04-27,55.562718,61.514481,53.13757,-121147.876703,141336,0.255857,15.475186,25.061752,26.878603,0.109846,0.012427,0.097419,0.0,-0.008866,0.002998
4,VLGEA,2005-04-28,56.249866,65.074417,63.405068,-119423.876703,143060,0.243089,14.784136,24.486955,27.502602,0.115302,0.014306,0.100996,-0.003992,0.023667,-0.002046


In [6]:
# Parse the textual 'date' column into datetimes so the .dt accessor can be used later.
df = df.assign(date=pd.to_datetime(df['date']))

In [8]:
# Replace df with the result of mlchartist's thresholds_encoding.
# NOTE(review): judging by the df.head() previews before and after this cell, the
# *TD_return columns go from fractional returns to 0/1 labels — confirm in the helper.
# df is reassigned from itself, so re-running this cell feeds the already
# processed frame back into the helper.
df = thresholds_encoding(df)

Missing return columns


In [9]:
# Preview the frame again after thresholds_encoding.
df.head()

Unnamed: 0,ticker,date,RSI,Stochastic,Stochastic_signal,ADI,OBV,ATR,ADX,ADX_pos,ADX_neg,MACD,MACD_diff,MACD_signal,5TD_return,10TD_return,20TD_return
0,VLGEA,2005-04-22,63.928477,86.705551,47.296591,-87605.671388,178930,0.251243,18.457228,31.986514,28.45944,0.110486,0.019626,0.090861,0,0,0
1,VLGEA,2005-04-25,50.468196,34.271923,47.302038,-123304.876703,139179,0.27254,17.388709,27.31721,29.297569,0.094898,0.00323,0.091668,1,1,1
2,VLGEA,2005-04-26,56.047626,63.626307,61.534594,-118705.876703,143778,0.273923,16.396512,25.21143,27.039132,0.104889,0.010576,0.094312,0,0,0
3,VLGEA,2005-04-27,55.562718,61.514481,53.13757,-121147.876703,141336,0.255857,15.475186,25.061752,26.878603,0.109846,0.012427,0.097419,0,0,1
4,VLGEA,2005-04-28,56.249866,65.074417,63.405068,-119423.876703,143060,0.243089,14.784136,24.486955,27.502602,0.115302,0.014306,0.100996,0,1,0


In [None]:
# Thresholds compared against the 5-, 10- and 20-day return columns.
FIVE_TR, TEN_TR, TWENTY_TR = 0.0006, 0.0012, 0.0024

# Feature columns fed to the scaler and to build_arrays.
# NOTE(review): the three *_past_return columns do not appear in the df.head()
# previews earlier in the notebook — confirm the processed CSVs provide them.
INPUT_COLS = [
    'RSI', 'Stochastic', 'Stochastic_signal',
    'ADI', 'OBV', 'ATR',
    'ADX', 'ADX_pos', 'ADX_neg',
    'MACD', 'MACD_diff', 'MACD_signal',
    '1D_past_return', '5D_past_return', '10D_past_return',
]
# Reduced alternative kept for quick experiments:
#INPUT_COLS = ['RSI', 'Stochastic', 'Stochastic_signal']

In [None]:
# Boolean target per horizon: True where the return column reaches its threshold.
# NOTE(review): the preview after thresholds_encoding shows these return columns
# already holding 0/1, in which case the comparison only maps 1 -> True — confirm.
df['5D_return_bin'] = df['5TD_return'].ge(FIVE_TR)
df['10D_return_bin'] = df['10TD_return'].ge(TEN_TR)
df['20D_return_bin'] = df['20TD_return'].ge(TWENTY_TR)

In [None]:
# Chronological split: rows dated 2018 or later form the test set, earlier rows the training set.
# .copy() makes each split an independent frame; the scaled features are
# assigned into these frames later, and assigning into a bare slice of df
# triggers pandas' SettingWithCopyWarning.
years = df['date'].dt.year
test_df = df[years >= 2018].copy()
train_df = df[years < 2018].copy()

In [None]:
# Scaler used for all INPUT_COLS features (scikit-learn RobustScaler, default settings).
scaler = RobustScaler()

In [None]:
# Fit on the training rows only; the test rows are transformed with these same statistics.
scaler.fit(train_df[INPUT_COLS])

In [None]:
# Replace the feature columns with their scaled values.
# Plain column assignment swaps the columns out, whereas `.loc[:, cols] = ...`
# writes into the existing ones; OBV is shown as integers in the data preview,
# and recent pandas versions warn when floats are written into an integer
# column that way.
# Note: this cell overwrites its own input, so run it once per fresh split.
train_df[INPUT_COLS] = scaler.transform(train_df[INPUT_COLS])
test_df[INPUT_COLS] = scaler.transform(test_df[INPUT_COLS])

In [None]:
# Display the scaled training frame.
train_df

In [None]:
# One histogram per input feature of the training frame.
# Each figure is created explicitly and closed after display so that the loop
# does not keep every figure alive in memory.
for col in INPUT_COLS:
    fig, ax = plt.subplots()
    ax.hist(train_df[col], bins=20)
    ax.set(title=col, xlabel=col, ylabel='count')
    plt.show()
    plt.close(fig)

In [None]:
# Distribution of the 10TD_return column, restricted to [-0.5, 0.5].
# NOTE(review): the preview after thresholds_encoding shows this column holding
# 0/1 — confirm the histogram is meant for the encoded values.
fig, ax = plt.subplots()
ax.hist(df['10TD_return'], bins=50, range=(-0.5, 0.5))
ax.set(title='10TD_return', xlabel='10TD_return', ylabel='count');

In [None]:
# Percentage of rows whose 10TD_return equals exactly 0.
df['10TD_return'].eq(0).sum() / len(df) * 100

In [None]:
# Number of training rows whose 10D_past_return exceeds 0.8.
train_df['10D_past_return'].gt(0.8).sum()

In [None]:
# Distinct tickers present in the training frame, in order of first appearance.
tickers = pd.unique(train_df['ticker'])

In [None]:
# Per-ticker arrays are collected in these four independent lists before concatenation.
train_x_dfs, train_y_dfs, test_x_dfs, test_y_dfs = [], [], [], []

In [None]:
# Windowing settings shared by the train and test arrays.
TARGET_COL = '20D_return_bin'
TIME_WINDOW = 6  # the LSTM model's input_shape=(6, 15) uses the same window length
STRIDE = 5

# Start from empty accumulators so that re-running this cell rebuilds the lists
# instead of appending every ticker a second time.
train_x_dfs, train_y_dfs, test_x_dfs, test_y_dfs = [], [], [], []

# Arrays are built per ticker, from that ticker's rows only.
for ticker in tickers:
    ticker_train_df = train_df[train_df['ticker'] == ticker]
    ticker_test_df = test_df[test_df['ticker'] == ticker]
    ticker_train_x, ticker_train_y = build_arrays(
        ticker_train_df, input_cols=INPUT_COLS, target_col=TARGET_COL,
        time_window=TIME_WINDOW, stride=STRIDE)
    ticker_test_x, ticker_test_y = build_arrays(
        ticker_test_df, input_cols=INPUT_COLS, target_col=TARGET_COL,
        time_window=TIME_WINDOW, stride=STRIDE)
    train_x_dfs.append(ticker_train_x)
    train_y_dfs.append(ticker_train_y)
    test_x_dfs.append(ticker_test_x)
    test_y_dfs.append(ticker_test_y)

In [None]:
# Stack the per-ticker arrays along the first axis into the final train/test sets.
X_train, y_train, X_test, y_test = (
    np.concatenate(parts)
    for parts in (train_x_dfs, train_y_dfs, test_x_dfs, test_y_dfs)
)

In [None]:
# Sanity check: shape of the stacked training targets.
y_train.shape

In [None]:
# Cast the feature arrays to float.
X_train, X_test = X_train.astype(float), X_test.astype(float)

In [None]:
# Sanity check: shape of the stacked training features.
X_train.shape

In [None]:
# Cast the targets to float for the binary cross-entropy loss.
y_train, y_test = y_train.astype(float), y_test.astype(float)

In [None]:
# from tensorflow.keras.backend import expand_dims
# X_train = expand_dims(X_train, axis=-1)
# X_test = expand_dims(X_test, axis=-1)

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Show up to three evenly spaced training windows as images.
# Positions are derived from the actual sample count: fixed indices such as
# 30000 or 100000 raise IndexError whenever fewer windows were built (the
# loader samples only two tickers by default).
sample_positions = np.linspace(0, len(X_train) - 1, num=min(3, len(X_train)), dtype=int)
for position in sample_positions:
    fig, ax = plt.subplots()
    ax.imshow(X_train[position])
    ax.set(title=f'X_train[{position}]')
    plt.show()
    plt.close(fig)

In [None]:
from tensorflow.keras import Sequential
from tensorflow.keras import layers, models 
from tensorflow.keras.optimizers import RMSprop, Adam

# Default learning rate for the RMSprop optimizer of the LSTM model.
LEARNING_RATE = 0.0000001

def init_model(input_shape=(6, 15), learning_rate=LEARNING_RATE):
    """
    Build and compile the stacked-LSTM binary classifier.

    Architecture: two LSTM layers (200 units, tanh), dropout 0.3, a 200-unit
    ReLU dense layer, dropout 0.3 and a single sigmoid output unit. The model
    is compiled with binary cross-entropy and the accuracy metric.

    Args:
        input_shape: shape of one sample passed to the first LSTM layer; the
            default matches the previously hard-coded (6, 15).
        learning_rate: learning rate of the RMSprop optimizer.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    model = Sequential()
    model.add(layers.LSTM(200, return_sequences=True, input_shape=input_shape, activation='tanh'))
    model.add(layers.LSTM(200, activation='tanh'))
    model.add(layers.Dropout(0.3))
    model.add(layers.Dense(200, activation='relu'))
    model.add(layers.Dropout(0.3))
    model.add(layers.Dense(1, activation='sigmoid'))

    # A fresh optimizer per model: one optimizer instance shared by every call
    # would carry its state from one model over to the next.
    model.compile(loss='binary_crossentropy',
                  optimizer=RMSprop(learning_rate=learning_rate),
                  metrics=['accuracy'])

    return model

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

model = init_model()

# Early stopping with a patience of 5 epochs; the best weights are restored afterwards.
es = EarlyStopping(restore_best_weights=True, patience=5)

# The last 20% of the training data is held out for validation.
model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    batch_size=8,
    epochs=500,
    callbacks=[es],
)

In [None]:
# Predictions of the LSTM model for the test windows (sigmoid output layer).
y_pred = model.predict(X_test)

In [None]:
# Sanity check: shape of the prediction array.
y_pred.shape

In [None]:
# Share of positive labels in the test set, for comparison with the model's accuracy.
y_test.sum() / y_test.shape[0]

In [None]:
# Evaluate the LSTM model on the test set with the loss and metric it was compiled with.
model.evaluate(X_test, y_test)

In [None]:
# Spot-check every 500th prediction, thresholded at 0.5, against its true label.
for i in range(len(y_pred))[::500]:
    print(f'Predicted {y_pred[i][0] >= 0.5}, test {y_test[i]}')

In [None]:
def initialize_model_2(input_shape=(6, 15)):
    """
    Build the (uncompiled) convolutional binary classifier.

    Architecture: three Conv2D + MaxPooling2D stages (32, 64 and 128 filters),
    then Flatten, Dense(120), Dense(60), Dropout(0.5) and a sigmoid output unit.

    Args:
        input_shape: shape of one sample. A 2-D shape such as the default
            ``(6, 15)`` — the window shape the LSTM model in this notebook is
            built for — gets a trailing channel axis added inside the model, so
            the same arrays can be passed to ``fit`` unchanged. A 3-D shape
            such as ``(30, 4, 1)`` is used as is.

    Returns:
        The Keras ``Sequential`` model; the caller compiles it.
    """
    model = models.Sequential()

    if len(input_shape) == 2:
        # Conv2D needs a channel axis: (rows, cols) -> (rows, cols, 1).
        model.add(layers.Reshape((*input_shape, 1), input_shape=input_shape))
        first_conv_kwargs = {}
    else:
        first_conv_kwargs = {'input_shape': input_shape}

    model.add(layers.Conv2D(32, (3, 3), padding='same', activation='relu', **first_conv_kwargs))
    # padding='same' on the pooling layers keeps every axis at size >= 1; with
    # the default 'valid' padding three 2x2 poolings shrink an axis of length 4
    # (or 6) down to 0.
    model.add(layers.MaxPooling2D(pool_size=(2, 2), padding='same'))

    model.add(layers.Conv2D(64, (2, 2), padding='same', activation='relu'))
    model.add(layers.MaxPooling2D(pool_size=(2, 2), padding='same'))

    model.add(layers.Conv2D(128, (3, 3), padding='same', activation='relu'))
    model.add(layers.MaxPooling2D(pool_size=(2, 2), padding='same'))

    model.add(layers.Flatten())
    model.add(layers.Dense(120, activation='relu'))
    model.add(layers.Dense(60, activation='relu'))
    model.add(layers.Dropout(0.5))
    model.add(layers.Dense(1, activation='sigmoid'))

    return model

In [None]:
# Build the convolutional model and compile it for binary classification.
model_2 = initialize_model_2()

model_2.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Fresh early-stopping callback for the second model (patience of 5 epochs, best weights restored).
es = EarlyStopping(restore_best_weights=True, patience=5)

# The last 20% of the training data is held out for validation.
model_2.fit(
    X_train,
    y_train,
    validation_split=0.2,
    batch_size=8,
    epochs=200,
    callbacks=[es],
)

In [None]:
# Share of positive labels in the test set.
y_test.sum() / len(y_test)

In [None]:
# Predict with the convolutional model trained just above; the previous
# `model.predict` call only repeated the earlier LSTM predictions.
y_pred = model_2.predict(X_test)