In [25]:
# Example from https://github.com/wcneill/jn-ml-textbook/blob/master/Deep%20Learning/04%20Recurrent%20Networks/pytorch13b_LSTM.ipynb

import torch
from torch import nn, optim
import optuna

import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import classification_report

from datetime import datetime

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

from ta import add_all_ta_features #pip install --upgrade ta https://github.com/bukosabino/ta https://medium.datadriveninvestor.com/predicting-the-stock-market-with-python-bba3cf4c56ef
from fastai.tabular.all import add_datepart #pip install fastai https://docs.fast.ai/tabular.core.html https://www.analyticsvidhya.com/blog/2018/10/predicting-stock-price-machine-learningnd-deep-learning-techniques-python/

mpl.rcParams['figure.figsize'] = [12,8]
viz_dict = {
    'axes.titlesize':18,
    'axes.labelsize':16,
}
sns.set_context("notebook", rc=viz_dict)
sns.set_style("whitegrid")

In [2]:
def convert(date_string):
    year, month, day = [int(i) for i in date_string.split('-')]
    return datetime(year=year, month=month, day=day)

In [3]:
def convert_df(df):
    '''
    Converts the dataframe to the appropriate types.
    '''
    # Convert data to float32 for PyTorch
    df = df.astype(np.float32)

    # I'm going to convert this to train and test instead of train and valid.
    # Train will include validation set through cross validation.
    y = np.where(df['Close'].shift(-1) > df['Close'], 1, -1)
    df = df.drop(['Close'], axis=1)
    if 'trend_psar_down' in df.columns:
        df = df.drop(['trend_psar_down', 'trend_psar_up', 'Adj Close'], axis=1)
    # Split training and validation data
    split = int(0.8 * len(df))
    x_train = df[:split].to_numpy() #.reshape(-1, 1)
    x_test = df[split:].to_numpy() #.reshape(-1, 1)

    y_train = y[:split].reshape(-1, 1)
    y_test = y[split:].reshape(-1, 1)

    # scale data: MOVED THIS TO HAPPEN WITHIN CROSS VALIDATION

    # t_scaler = MinMaxScaler(feature_range=(-1, 1))
    # v_scaler = MinMaxScaler(feature_range=(-1, 1))
    # x_train = t_scaler.fit_transform(x_train)
    # x_test = v_scaler.fit_transform(x_test)

    # convert training data to tensor
    x_train = torch.tensor(x_train, dtype=torch.float32)
    x_test = torch.tensor(x_test, dtype=torch.float32)

    y_train = torch.tensor(y_train, dtype=torch.float32)
    y_test = torch.tensor(y_test, dtype=torch.float32)
    
    return x_train, y_train, x_test, y_test

In [4]:
def get_batches(data, window):
    """
    Takes data with shape (n_samples, n_features) and creates mini-batches
    with shape (1, window). 
    """
    x_data, y_data = data
    L = len(x_data)
    for i in range(L - window):
        x_sequence = x_data[i:i + window]
        y_sequence = y_data[i:i + window]
        yield x_sequence, y_sequence

In [5]:
class stocksLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size, dropout_p):
        super().__init__()
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, dropout=dropout_p)
        self.fc   = nn.Linear(hidden_size, output_size)
        
    def forward(self, x, hs):
   
        out, hs = self.lstm(x, hs)           # out.shape = (batch_size, seq_len, hidden_size)
        out = out.view(-1, self.hidden_size) # out.shape = (seq_len, hidden_size)     
        out = self.fc(out)
        
        return out, hs

In [6]:
def train(model, epochs, train_set, valid_data=None, lr=0.001, print_every=100):

    criterion = nn.MSELoss()
    opt = optim.Adam(model.parameters(), lr=lr)
    
    train_loss = []
    valid_loss = []
    
    for e in range(epochs):
        
        hs = None
        t_loss = 0
        for x, y in get_batches(train_set, 12):

            opt.zero_grad()
            
            # Create batch_size dimension
            x = x.unsqueeze(0)
            out, hs = model(x, hs)
            hs = tuple([h.data for h in hs])
            
            loss = criterion(out, y)
            loss.backward()
            opt.step()
            t_loss += loss.item()
            
        if valid_data is not None:
                model.eval()
                val_x, val_y = valid_data
                val_x = val_x.unsqueeze(0)
                preds, _ = model(val_x, hs)
                v_loss = criterion(preds, val_y)
                valid_loss.append(v_loss.item())
                
                model.train()
            
        train_loss.append(np.mean(t_loss))
            
        if print_every:    
            if e % print_every == 0:
                print(f'Epoch {e}:\nTraining Loss: {train_loss[-1]}')
                if valid_data is not None:
                    print(f'Validation Loss: {valid_loss[-1]}')
        
    return train_loss, valid_loss
    
#     plt.figure(figsize=[8., 6.])
#     plt.plot(train_loss, label='Training Loss')
#     plt.plot(valid_loss, label='Validation Loss')
#     plt.title('Loss vs Epochs')
#     plt.xlabel('Epochs')
#     plt.legend()
#     plt.show()

In [7]:
def train_cross_valid(model, epochs, lr, training_data):
    train_losses = []
    valid_losses = []
    X_train, y_train = training_data
    tscv = TimeSeriesSplit(n_splits=5)
    for train_index, validation_index in tscv.split(X_train):
        # print("TRAIN:", train_index, "VALIDATION:", validation_index)
        # Split training and validation data
        # split = int(0.8 * len(df))
        # x_train = X_train[train_index].to_numpy() #.reshape(-1, 1)
        # x_valid = X_train[validation_index].to_numpy() #.reshape(-1, 1)
        x_train_fold = X_train[train_index]
        x_valid_fold = X_train[validation_index]

        y_train_fold = y_train[train_index].reshape(-1, 1)
        y_valid_fold = y_train[validation_index].reshape(-1, 1)

        # scale data
        # t_scaler = MinMaxScaler(feature_range=(-1, 1))
        # v_scaler = MinMaxScaler(feature_range=(-1, 1))
        t_scaler = MinMaxScaler()
        x_train_fold = t_scaler.fit_transform(x_train_fold)
        x_valid_fold = t_scaler.transform(x_valid_fold)

        # convert training data to tensor
        x_train_fold = torch.tensor(x_train_fold, dtype=torch.float32)
        x_valid_fold = torch.tensor(x_valid_fold, dtype=torch.float32)

        # y_train_fold = torch.tensor(y_train_fold, dtype=torch.float32)
        # y_valid_fold = torch.tensor(y_valid_fold, dtype=torch.float32)

        # Create training and validation set:
        train_data_fold = (x_train_fold, y_train_fold)
        valid_data_fold = (x_valid_fold, y_valid_fold)
        
        train_loss, valid_loss = train(model, epochs, train_data_fold, valid_data=valid_data_fold, lr=lr, print_every=None)
                #Only append losses from the last epoch
        train_losses.append(train_loss[-1])
        valid_losses.append(valid_loss[-1])

    # Compute average loss of folds             
    train_losses_avg = sum(train_losses) / len(train_losses)   
    valid_losses_avg = sum(valid_losses) / len(valid_losses)  
    return train_losses_avg, valid_losses_avg

In [8]:
# Optuna objective function

def objective(trial):

    params = {
              'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-1),
              'hidden_size': trial.suggest_int('hidden_size', 40, 100, 20),
              'num_layers': trial.suggest_int('num_layers', 2, 4),
              'dropout_p': trial.suggest_float('dropout_p', 0.0, 0.2, step=0.1)
              }
    
    input_size = x_train.size()[1]  # 21 in current implementation
    hidden_size = 100
    #num_layers = 1     # tuned by optuna
    output_size = 1

    model = stocksLSTM(input_size, params['hidden_size'], params['num_layers'], output_size, params['dropout_p'])
    
    train_losses_avg, valid_losses_avg = train_cross_valid(model, EPOCHS, params['learning_rate'], train_data)

    return valid_losses_avg

In [9]:
def optimize_hyperparameters(df):
    '''
    Find the optimal hyperparameters given the training data.
    '''
    x_train, y_train, x_test, y_test = convert_df(df)
    train_data = (x_train, y_train)
    test_data = (x_test, y_test)

    study = optuna.create_study(direction="minimize", sampler=optuna.samplers.TPESampler())
    study.optimize(objective, n_trials=30)

    best_trial = study.best_trial

    lr = best_trial.params.get('learning_rate')
    hs = best_trial.params.get('hidden_size')
    nl = best_trial.params.get('num_layers')
    dp = best_trial.params.get('dropout_p')
    return lr, hs, nl, dp, study

In [10]:
EPOCHS = 100

## Obtaining the DataFrames

In this cell, we obtain the four desired DataFrames corresponding to 1.) long-term with all features, 2.) short-term with all features, 3.) short-term with selected features (including StockTwits sentiment), and 4.) short-term with selected features (excluding StockTwits sentiment).

In [11]:
# df = pd.read_csv('final/Stock_Stalkers/data/AAPL_data.csv')
df = pd.read_csv('data/AAPL_data.csv')
df.dropna()

df["Date"]=pd.to_datetime(df.Date, format="%Y-%m-%d")
df.index=df['Date']
data = df.sort_index(ascending=True, axis=0)
add_datepart(df, 'Date', drop=False)
df.drop('Elapsed', axis=1, inplace=True)

df = add_all_ta_features(
    df, high="High", low="Low", open="Open", close="Close", volume="Volume"
)

  dip[idx] = 100 * (self._dip[idx] / value)
  din[idx] = 100 * (self._din[idx] / value)


In [12]:
#All parameters, whole time period:
df_all = df
df_all = df[df['Date'] >= '2017-08-09']
df_all.index=df_all['Date']
df_all = df_all.loc[:, df_all.columns !='Date']
# All parameters, short time period:
df_all_short = df[df['Date'] >= '2022-01-01']
# Add sentiment
df_all_short.index = np.array(range(len(df_all_short)))
# df_sentiment = pd.read_csv('final/Stock_Stalkers/data/AAPL_byday_RoBERTa.csv')
df_sentiment = pd.read_csv('data/AAPL_byday_RoBERTa.csv')
# Drop the first empty column for AAPL
df_sentiment = df_sentiment.iloc[: , 1:]
df_sentiment.date = df_sentiment.date.apply(convert)
df_sentiment.rename(columns={'date':'Date'}, inplace=True)
df_all_short = df_all_short.merge(df_sentiment, how='inner', on='Date').fillna(0)
df_all_short.index=df_all_short['Date']
df_all_short = df_all_short.loc[:, df_all_short.columns !='Date']

# Selected parameters, with sentiment:
# sentiment_params = pd.read_csv('feature_selection_AAPL_Sentiment.csv').feature[:20]
sentiment_params = pd.read_csv('data/feature_selection_AAPL_Sentiment.csv').feature[:20]
df_selected_sentiment = df_all_short[list(sentiment_params) + ['Close']]

# Selected parameters, no sentiment:
# noSentiment_params = pd.read_csv('feature_selection_AAPL_NoSentiment.csv').feature[:20]
noSentiment_params = pd.read_csv('data/feature_selection_AAPL_NoSentiment.csv').feature[:20]

df_selected_noSentiment = df_all_short[list(noSentiment_params) + ['Close']]

In [13]:
df_all_short

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Adj Close,Year,Month,Week,Day,...,momentum_ppo_hist,momentum_pvo,momentum_pvo_signal,momentum_pvo_hist,momentum_kama,others_dr,others_dlr,others_cr,bullish,bearish
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-01-03,182.880005,177.710007,177.830002,182.009995,104487900.0,181.778397,2022,1,1,3,...,-0.143274,-8.251705,-2.942693,-5.309012,173.552778,2.500415,2.469666,406.815182,0,0
2022-01-04,182.940002,179.119995,182.630005,179.699997,99310400.0,179.471344,2022,1,1,4,...,-0.189433,-6.848703,-3.723895,-3.124807,174.349880,-1.269160,-1.277282,400.382888,0,0
2022-01-05,180.169998,174.639999,179.610001,174.919998,94537600.0,174.697418,2022,1,1,5,...,-0.407488,-6.083336,-4.195783,-1.887553,174.357792,-2.659988,-2.696006,387.072762,545,195
2022-01-06,175.300003,171.639999,172.699997,172.000000,96904000.0,171.781143,2022,1,1,6,...,-0.653812,-5.210295,-4.398686,-0.811609,174.294544,-1.669334,-1.683424,378.941893,548,170
2022-01-07,174.139999,171.029999,172.889999,172.169998,86709100.0,171.950928,2022,1,1,7,...,-0.791825,-5.341759,-4.587300,-0.754458,174.226442,0.098836,0.098787,379.415261,478,110
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-04-22,167.869995,161.500000,166.460007,161.789993,84775200.0,161.789993,2022,4,16,22,...,-0.658369,-4.834595,-5.769592,0.934997,170.303486,-2.782120,-2.821555,350.511661,756,510
2022-04-25,163.169998,158.460007,161.119995,162.880005,96046400.0,162.880005,2022,4,17,25,...,-0.666083,-2.708870,-5.157448,2.448578,169.837855,0.673720,0.671461,353.546848,951,395
2022-04-26,162.339996,156.720001,162.250000,156.800003,95623200.0,156.800003,2022,4,17,26,...,-0.870720,-1.087910,-4.343540,3.255630,168.820698,-3.732810,-3.804264,336.616804,921,531
2022-04-27,159.789993,155.380005,155.910004,156.570007,88063200.0,156.570007,2022,4,17,27,...,-0.963265,-0.543871,-3.583606,3.039735,167.321938,-0.146681,-0.146789,335.976370,1158,391


In [14]:
df_all

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Adj Close,Year,Month,Week,Day,...,momentum_ppo,momentum_ppo_signal,momentum_ppo_hist,momentum_pvo,momentum_pvo_signal,momentum_pvo_hist,momentum_kama,others_dr,others_dlr,others_cr
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-08-09,40.317501,39.777500,39.814999,40.264999,104526000.0,38.173512,2017,8,32,9,...,1.965409,1.310218,0.655191,5.497509,3.675388,1.822121,38.576629,0.612191,0.610325,12.119739
2017-08-10,40.000000,38.657501,39.974998,38.830002,163217200.0,36.957611,2017,8,32,10,...,1.809656,1.410105,0.399550,8.404348,4.621180,3.783168,38.585672,-3.563883,-3.628940,8.123922
2017-08-11,39.642502,39.017502,39.150002,39.369999,105028400.0,37.471577,2017,8,32,11,...,1.777848,1.483654,0.294194,6.635481,5.024040,1.611441,38.640159,1.390670,1.381089,9.627569
2017-08-14,40.052502,39.687500,39.830002,39.962502,88490800.0,38.035515,2017,8,33,14,...,1.853388,1.557601,0.295788,4.022824,4.823797,-0.800973,38.779533,1.504960,1.493747,11.277419
2017-08-15,40.549999,40.035000,40.165001,40.400002,117862000.0,38.451912,2017,8,33,15,...,1.979255,1.641932,0.337324,3.959375,4.650913,-0.691537,38.956294,1.094776,1.088827,12.495658
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-04-22,167.869995,161.500000,166.460007,161.789993,84775200.0,161.789993,2022,4,16,22,...,-0.502188,0.156180,-0.658369,-4.834595,-5.769592,0.934997,170.303486,-2.782120,-2.821555,350.511661
2022-04-25,163.169998,158.460007,161.119995,162.880005,96046400.0,162.880005,2022,4,17,25,...,-0.676423,-0.010340,-0.666083,-2.708870,-5.157448,2.448578,169.837855,0.673720,0.671461,353.546848
2022-04-26,162.339996,156.720001,162.250000,156.800003,95623200.0,156.800003,2022,4,17,26,...,-1.098741,-0.228020,-0.870720,-1.087910,-4.343540,3.255630,168.820698,-3.732810,-3.804264,336.616804
2022-04-27,159.789993,155.380005,155.910004,156.570007,88063200.0,156.570007,2022,4,17,27,...,-1.432102,-0.468837,-0.963265,-0.543871,-3.583606,3.039735,167.321938,-0.146681,-0.146789,335.976370


In [15]:
df_selected_sentiment

Unnamed: 0_level_0,momentum_pvo,trend_mass_index,trend_vortex_ind_pos,momentum_pvo_hist,trend_sma_fast,momentum_stoch_signal,trend_dpo,volume_vwap,volume_nvi,volume_adi,...,volatility_ui,trend_sma_slow,trend_vortex_ind_neg,trend_kst_sig,bullish,bearish,volume_cmf,momentum_ppo_hist,volatility_bbw,Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-01-03,-8.251705,25.725812,1.011669,-5.309012,176.236666,78.828218,-3.230006,175.442399,7115.616430,6.564038e+09,...,2.669928,172.404616,0.969559,118.009490,0,0,0.095208,-0.143274,9.273015,182.009995
2022-01-04,-6.848703,25.634624,1.094392,-3.124807,176.856667,80.781268,-5.069000,175.881553,7025.307892,6.494884e+09,...,2.581105,173.285000,0.891153,115.975285,0,0,0.084438,-0.189433,7.780912,179.699997
2022-01-05,-6.083336,25.531909,1.015064,-1.887553,177.171666,73.872936,-6.646000,175.817775,6838.435528,6.409920e+09,...,2.783068,173.849615,0.969154,113.793983,545,195,0.002168,-0.407488,7.359295,174.919998
2022-01-06,-5.210295,25.348191,0.987762,-0.811609,177.359166,52.196343,-3.251994,175.732804,6838.435528,6.332079e+09,...,2.959604,174.107307,1.097695,110.848332,548,170,-0.072365,-0.653812,7.659172,172.000000
2022-01-07,-5.341759,25.079790,0.996898,-0.754458,177.290833,35.981871,-0.482500,176.202866,6845.194374,6.308938e+09,...,3.052086,174.391923,0.964433,107.146247,478,110,-0.055469,-0.791825,7.887688,172.169998
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-04-22,-4.834595,24.250758,0.792892,0.934997,167.589165,15.113140,0.803002,169.248558,7803.812186,6.774359e+09,...,6.010746,170.139231,1.049587,39.461044,756,510,-0.023885,-0.658369,11.489273,161.789993
2022-04-25,-2.708870,24.390685,0.706046,2.448578,166.843332,14.362295,1.705000,168.009840,7803.812186,6.858578e+09,...,6.291326,170.226155,1.187449,33.289143,951,395,0.000494,-0.666083,12.062967,162.880005
2022-04-26,-1.087910,24.584276,0.682444,3.255630,165.564999,8.152724,0.594997,166.676075,7512.510670,6.765678e+09,...,6.701780,169.950001,1.184898,26.559597,921,531,-0.107833,-0.870720,13.658505,156.800003
2022-04-27,-0.543871,24.718761,0.724501,3.039735,164.438333,9.789935,-2.625500,165.509576,7501.491248,6.725141e+09,...,7.044409,169.611155,1.175419,18.978894,1158,391,-0.193393,-0.963265,14.276945,156.570007


In [16]:
df_selected_noSentiment

Unnamed: 0_level_0,momentum_pvo,trend_mass_index,trend_sma_fast,trend_vortex_ind_pos,volume_vwap,volume_nvi,trend_dpo,momentum_stoch_signal,volatility_ui,momentum_pvo_hist,...,volume_adi,trend_stc,momentum_ao,volume_cmf,volatility_dcp,trend_sma_slow,volatility_bbl,volatility_atr,Day,Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-01-03,-8.251705,25.725812,176.236666,1.011669,175.442399,7115.616430,-3.230006,78.828218,2.669928,-5.309012,...,6.564038e+09,79.968965,10.892441,0.095208,0.953225,172.404616,167.353393,3.993418,3,182.009995
2022-01-04,-6.848703,25.634624,176.856667,1.094392,175.881553,7025.307892,-5.069000,80.781268,2.581105,-3.124807,...,6.494884e+09,83.025870,10.219059,0.084438,0.790697,173.285000,169.353666,3.976077,4,179.699997
2022-01-05,-6.083336,25.531909,177.171666,1.015064,175.817775,6838.435528,-6.646000,73.872936,2.783068,-1.887553,...,6.409920e+09,41.512935,9.029235,0.002168,0.481912,173.849615,169.905249,4.131469,5,174.919998
2022-01-06,-5.210295,25.348191,177.359166,0.987762,175.732804,6838.435528,-3.251994,52.196343,2.959604,-0.811609,...,6.332079e+09,20.756467,7.255029,-0.072365,0.293281,174.107307,169.492661,4.084322,6,172.000000
2022-01-07,-5.341759,25.079790,177.290833,0.996898,176.202866,6845.194374,-0.482500,35.981871,3.052086,-0.754458,...,6.308938e+09,10.378234,5.631118,-0.055469,0.304263,174.391923,169.176504,3.986890,7,172.169998
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-04-22,-4.834595,24.250758,167.589165,0.792892,169.248558,7803.812186,0.803002,15.113140,6.010746,0.934997,...,6.774359e+09,0.112911,-0.699527,-0.023885,0.016013,170.139231,161.202121,4.367901,22,161.789993
2022-04-25,-2.708870,24.390685,166.843332,0.706046,168.009840,7803.812186,1.705000,14.362295,6.291326,2.448578,...,6.858578e+09,0.056456,-1.517793,0.000494,0.208983,170.226155,160.155241,4.402110,25,162.880005
2022-04-26,-1.087910,24.584276,165.564999,0.682444,166.676075,7512.510670,0.594997,8.152724,6.701780,3.255630,...,6.765678e+09,0.028228,-2.790382,-0.107833,0.003495,169.950001,157.919758,4.577899,26,156.800003
2022-04-27,-0.543871,24.718761,164.438333,0.724501,165.509576,7501.491248,-2.625500,9.789935,7.044409,3.039735,...,6.725141e+09,0.014114,-4.658883,-0.193393,0.049113,169.611155,156.356061,4.561108,27,156.570007


# Hyperparameter optimization

### Study with long-term data and all parameters.

In [31]:
x_train, y_train, x_test, y_test = convert_df(df_all)
train_data = (x_train, y_train)
test_data = (x_test, y_test)

In [32]:
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)

torch.Size([951, 100]) torch.Size([951, 1]) torch.Size([238, 100]) torch.Size([238, 1])


In [None]:
lr_lt, hs_lt, nl_lt, dp_lt, study_lt = optimize_hyperparameters(df_all)

### Study with short-term data and all parameters.

In [20]:
x_train, y_train, x_test, y_test = convert_df(df_all_short)
train_data = (x_train, y_train)
test_data = (x_test, y_test)

In [21]:
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)

torch.Size([64, 102]) torch.Size([64, 1]) torch.Size([17, 102]) torch.Size([17, 1])


In [None]:
lr_sta, hs_sta, nl_sta, dp_sta, study_sta = optimize_hyperparameters(df_all_short)

In [None]:
print("learning rate: " + str(lr_sta))
print("hidden size: " + str(hs_sta))
print("number of layers: " + str(nl_sta))
print("dropout rate: " + str(dp_sta))

### Study with short-term data and selected parameters, including StockTwits sentiment.

In [17]:
x_train, y_train, x_test, y_test = convert_df(df_selected_sentiment)
train_data = (x_train, y_train)
test_data = (x_test, y_test)

In [18]:
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)

torch.Size([64, 20]) torch.Size([64, 1]) torch.Size([17, 20]) torch.Size([17, 1])


In [None]:
lr_sts, hs_sts, nl_sts, dp_sts, study_sts = optimize_hyperparameters(df_selected_sentiment)

In [None]:
print("learning rate: " + str(lr_sts))
print("hidden size: " + str(hs_sts))
print("number of layers: " + str(nl_sts))
print("dropout rate: " + str(dp_sts))

### Study with short-term data and selected parameters, excluding StockTwits sentiment.

In [47]:
x_train, y_train, x_test, y_test = convert_df(df_selected_noSentiment)
train_data = (x_train, y_train)
test_data = (x_test, y_test)

In [48]:
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)

torch.Size([64, 20]) torch.Size([64, 1]) torch.Size([17, 20]) torch.Size([17, 1])


In [23]:
lr_stns, hs_stns, nl_stns, dp_stns, study_stns = optimize_hyperparameters(df_selected_noSentiment)

[32m[I 2022-06-01 16:01:07,457][0m A new study created in memory with name: no-name-a6ffbcef-6540-45e9-ad1c-6469d2b303f9[0m
[32m[I 2022-06-01 16:06:50,393][0m Trial 0 finished with value: 1.770090913772583 and parameters: {'learning_rate': 0.008825182499014803, 'hidden_size': 80, 'num_layers': 4, 'dropout_p': 0.2}. Best is trial 0 with value: 1.770090913772583.[0m
[32m[I 2022-06-01 16:09:06,453][0m Trial 1 finished with value: 1.9544145345687867 and parameters: {'learning_rate': 0.0009991500382845982, 'hidden_size': 80, 'num_layers': 2, 'dropout_p': 0.1}. Best is trial 0 with value: 1.770090913772583.[0m
[32m[I 2022-06-01 16:12:35,485][0m Trial 2 finished with value: 1.7041390419006348 and parameters: {'learning_rate': 0.0012883546479614287, 'hidden_size': 60, 'num_layers': 4, 'dropout_p': 0.2}. Best is trial 2 with value: 1.7041390419006348.[0m
[32m[I 2022-06-01 16:14:25,721][0m Trial 3 finished with value: 1.7596633672714233 and parameters: {'learning_rate': 0.055358243

In [25]:
print("learning rate: " + str(lr_stns))
print("hidden size: " + str(hs_stns))
print("number of layers: " + str(nl_stns))
print("dropout rate: " + str(dp_stns))

learning rate: 1.7691961498826596e-05
hidden size: 60
number of layers: 3
dropout rate: 0.2


## Testing the models

In [54]:
# Impute obtained HP to avoid HP optimization, which takes a long time
    # long-term data and all parameters
lr_lta =  1.1883247534334845e-05
hs_lta =  80
nl_lta =  4
dp_lta =  0.1
    # short-term data and all parameters
lr_sta =  1.1883247534334845e-05
hs_sta =  80
nl_sta =  4
dp_sta =  0.1
   # short-term data and selected parameters, including StockTwits sentiment 
lr_sts = 1.1615177249554967e-05
hs_sts = 100
nl_sts =  3
dp_sts =  0.2
    # short-term data and selected parameters, excluding StockTwits sentiment
lr_stns = 1.7691961498826596e-05
hs_stns = 60
nl_stns = 3
dp_stns = 0.2


# Scale data
    # long-term data and all parameters
x_train_lta, y_train_lta, x_test_lta, y_test_lta = convert_df(df_all)

t_scaler_lta = MinMaxScaler()
x_train_lta = t_scaler_lta.fit_transform(x_train_lta)
x_test_lta = t_scaler_lta.transform(x_test_lta)

x_train_lta = torch.tensor(x_train_lta, dtype=torch.float32)
x_test_lta = torch.tensor(x_test_lta, dtype=torch.float32)

train_data_lta = (x_train_lta, y_train_lta)

    # short-term data and all parameters
x_train_sta, y_train_sta, x_test_sta, y_test_sta = convert_df(df_all_short)

t_scaler_sta = MinMaxScaler()
x_train_sta = t_scaler_sta.fit_transform(x_train_sta)
x_test_sta = t_scaler_sta.transform(x_test_sta)

x_train_sta = torch.tensor(x_train_sta, dtype=torch.float32)
x_test_sta = torch.tensor(x_test_sta, dtype=torch.float32)

train_data_sta = (x_train_sta, y_train_sta)

   # short-term data and selected parameters, including StockTwits sentiment 
x_train_sts, y_train_sts, x_test_sts, y_test_sts = convert_df(df_selected_sentiment)

t_scaler_sts = MinMaxScaler()
x_train_sts = t_scaler_sts.fit_transform(x_train_sts)
x_test_sts = t_scaler_sts.transform(x_test_sts)

x_train_sts = torch.tensor(x_train_sts, dtype=torch.float32)
x_test_sts = torch.tensor(x_test_sts, dtype=torch.float32)

train_data_sts = (x_train_sts, y_train_sts)

    # short-term data and selected parameters, excluding StockTwits sentiment
x_train_stns, y_train_stns, x_test_stns, y_test_stns = convert_df(df_selected_noSentiment)

t_scaler_stns = MinMaxScaler()
x_train_stns = t_scaler_stns.fit_transform(x_train_stns)
x_test_stns = t_scaler_stns.transform(x_test_stns)

x_train_stns = torch.tensor(x_train_stns, dtype=torch.float32)
x_test_stns = torch.tensor(x_test_stns, dtype=torch.float32)

train_data_stns = (x_train_stns, y_train_stns)


In [58]:
# Create models with tuned HP
output_size = 1
model_lta = stocksLSTM(x_train_lta.size()[1], hs_lta, nl_lta, output_size, dp_lta)
model_sta = stocksLSTM(x_train_sta.size()[1], hs_sta, nl_sta, output_size, dp_sta)
model_sts = stocksLSTM(x_train_sts.size()[1], hs_sts, nl_sts, output_size, dp_sts)
model_stns = stocksLSTM(x_train_stns.size()[1], hs_stns, nl_stns, output_size, dp_stns)

In [59]:
# Train models with tuned HP
EPOCHS = 100*2
#train(model_lta, EPOCHS, train_data_lta, valid_data=None, lr=lr_lta)
train(model_sta, EPOCHS, train_data_sta, valid_data=None, lr=lr_sta*10)
train(model_sts, EPOCHS, train_data_sts, valid_data=None, lr=lr_sts*10)
train(model_stns, EPOCHS, train_data_stns, valid_data=None, lr=lr_stns*10)


Epoch 0:
Training Loss: 53.12024164199829
Epoch 100:
Training Loss: 6.150647629285231
Epoch 0:
Training Loss: 52.54952275753021
Epoch 100:
Training Loss: 33.36127424240112
Epoch 0:
Training Loss: 52.30368655920029
Epoch 100:
Training Loss: 26.348262026906013


([52.30368655920029,
  51.54582995176315,
  51.426953077316284,
  50.947267174720764,
  48.65872746706009,
  47.536966383457184,
  46.97531741857529,
  46.13974440097809,
  46.180207282304764,
  46.11367058753967,
  46.13580694794655,
  46.251478016376495,
  46.29696208238602,
  46.86714246869087,
  47.75264385342598,
  45.94577306509018,
  47.75531557202339,
  46.25571918487549,
  46.54837319254875,
  46.8564487695694,
  46.88105607032776,
  46.38868635892868,
  47.79576623439789,
  45.79392620921135,
  47.56597054004669,
  46.122165113687515,
  45.881316512823105,
  49.885848343372345,
  45.4810888171196,
  46.085785895586014,
  45.67219805717468,
  45.58818903565407,
  45.09038320183754,
  44.43429210782051,
  43.64019954204559,
  41.854240626096725,
  42.26430416107178,
  42.70684140920639,
  41.919868499040604,
  41.09028574824333,
  38.81782707571983,
  38.772187143564224,
  38.738134652376175,
  38.0724393427372,
  38.42644736170769,
  37.384124130010605,
  37.98183351755142,
  

In [60]:
# Make predictions and compute testing indicators
    # long-term data and all parameters
# hs = None
# train_preds_lta, hs = model_lta(x_train_lta.unsqueeze(0), hs)
# test_preds_lta, hs = model_lta(x_test_lta.unsqueeze(0), hs)
# test_preds_class_lta = np.where(test_preds_lta > 0, 1, -1)
# print('long-term data and all parameters \n', classification_report(y_test_lta, test_preds_class_lta))
    # short-term data and all parameters
hs = None
train_preds_sta, hs = model_sta(x_train_sta.unsqueeze(0), hs)
train_preds_class_sta = np.where(train_preds_sta > 0, 1, -1)
print('TRAINING: short-term data and all parameters \n', classification_report(y_train_sta, train_preds_class_sta))

test_preds_sta, hs = model_sta(x_test_sta.unsqueeze(0), hs)
test_preds_class_sta = np.where(test_preds_sta > 0, 1, -1)
print('TESTING: short-term data and all parameters \n', classification_report(y_test_sta, test_preds_class_sta))

   # short-term data and selected parameters, including StockTwits sentiment 
hs = None
train_preds_sts, hs = model_sts(x_train_sts.unsqueeze(0), hs)
train_preds_class_sts = np.where(train_preds_sts > 0, 1, -1)
print('TRAINING: short-term data and selected parameters, including StockTwits sentiment \n', classification_report(y_train_sts, train_preds_class_sts))

test_preds_sts, hs = model_sts(x_test_sts.unsqueeze(0), hs)
test_preds_class_sts = np.where(test_preds_sts > 0, 1, -1)
print('TESTING: short-term data and selected parameters, including StockTwits sentiment \n', classification_report(y_test_sts, test_preds_class_sts))

   # short-term data and selected parameters, excluding StockTwits sentiment 
hs = None
train_preds_stns, hs = model_stns(x_train_stns.unsqueeze(0), hs)
train_preds_class_stns = np.where(train_preds_stns > 0, 1, -1)
print('TRAINING: short-term data and selected parameters, excluding StockTwits sentiment \n', classification_report(y_train_stns, train_preds_class_stns))

test_preds_stns, hs = model_stns(x_test_stns.unsqueeze(0), hs)
test_preds_class_stns = np.where(test_preds_stns > 0, 1, -1)
print('TESTING: short-term data and selected parameters, excluding StockTwits sentiment \n', classification_report(y_test_stns, test_preds_class_stns))

TRAINING: short-term data and all parameters 
               precision    recall  f1-score   support

        -1.0       1.00      1.00      1.00        35
         1.0       1.00      1.00      1.00        29

    accuracy                           1.00        64
   macro avg       1.00      1.00      1.00        64
weighted avg       1.00      1.00      1.00        64

TESTING: short-term data and all parameters 
               precision    recall  f1-score   support

        -1.0       0.71      0.91      0.80        11
         1.0       0.67      0.33      0.44         6

    accuracy                           0.71        17
   macro avg       0.69      0.62      0.62        17
weighted avg       0.70      0.71      0.67        17

TRAINING: short-term data and selected parameters, including StockTwits sentiment 
               precision    recall  f1-score   support

        -1.0       0.79      0.74      0.76        35
         1.0       0.71      0.76      0.73        29

    a

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
