In [2]:
# Example from https://github.com/wcneill/jn-ml-textbook/blob/master/Deep%20Learning/04%20Recurrent%20Networks/pytorch13b_LSTM.ipynb

import torch
from torch import nn, optim
import optuna

import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import classification_report

from datetime import datetime

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

from ta import add_all_ta_features #pip install --upgrade ta https://github.com/bukosabino/ta https://medium.datadriveninvestor.com/predicting-the-stock-market-with-python-bba3cf4c56ef
from fastai.tabular.all import add_datepart #pip install fastai https://docs.fast.ai/tabular.core.html https://www.analyticsvidhya.com/blog/2018/10/predicting-stock-price-machine-learningnd-deep-learning-techniques-python/

mpl.rcParams['figure.figsize'] = [12,8]
viz_dict = {
    'axes.titlesize':18,
    'axes.labelsize':16,
}
sns.set_context("notebook", rc=viz_dict)
sns.set_style("whitegrid")

In [3]:
def convert(date_string):
    year, month, day = [int(i) for i in date_string.split('-')]
    return datetime(year=year, month=month, day=day)

In [4]:
def convert_df(df):
    '''
    Converts the dataframe to the appropriate types.
    '''
    # Convert data to float32 for PyTorch
    df = df.astype(np.float32)

    # I'm going to convert this to train and test instead of train and valid.
    # Train will include validation set through cross validation.
    y = np.where(df['Close'].shift(-1) > df['Close'], 1, -1)
    df = df.drop(['Close'], axis=1)
    if 'trend_psar_down' in df.columns:
        df = df.drop(['trend_psar_down', 'trend_psar_up', 'Adj Close'], axis=1)
    # Split training and validation data
    split = int(0.8 * len(df))
    x_train = df[:split].to_numpy() #.reshape(-1, 1)
    x_test = df[split:].to_numpy() #.reshape(-1, 1)

    y_train = y[:split].reshape(-1, 1)
    y_test = y[split:].reshape(-1, 1)

    # scale data: MOVED THIS TO HAPPEN WITHIN CROSS VALIDATION

    # t_scaler = MinMaxScaler(feature_range=(-1, 1))
    # v_scaler = MinMaxScaler(feature_range=(-1, 1))
    # x_train = t_scaler.fit_transform(x_train)
    # x_test = v_scaler.fit_transform(x_test)

    # convert training data to tensor
    x_train = torch.tensor(x_train, dtype=torch.float32)
    x_test = torch.tensor(x_test, dtype=torch.float32)

    y_train = torch.tensor(y_train, dtype=torch.float32)
    y_test = torch.tensor(y_test, dtype=torch.float32)
    
    return x_train, y_train, x_test, y_test

In [5]:
def get_batches(data, window):
    """
    Takes data with shape (n_samples, n_features) and creates mini-batches
    with shape (1, window). 
    """
    x_data, y_data = data
    L = len(x_data)
    for i in range(L - window):
        x_sequence = x_data[i:i + window]
        y_sequence = y_data[i:i + window]
        yield x_sequence, y_sequence

In [6]:
class stocksLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size, dropout_p):
        super().__init__()
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, dropout=dropout_p)
        self.fc   = nn.Linear(hidden_size, output_size)
        
    def forward(self, x, hs):
   
        out, hs = self.lstm(x, hs)           # out.shape = (batch_size, seq_len, hidden_size)
        out = out.view(-1, self.hidden_size) # out.shape = (seq_len, hidden_size)     
        out = self.fc(out)
        
        return out, hs

In [7]:
def train(model, epochs, train_set, valid_data=None, lr=0.001, print_every=100):

    criterion = nn.MSELoss()
    opt = optim.Adam(model.parameters(), lr=lr)
    
    train_loss = []
    valid_loss = []
    
    for e in range(epochs):
        
        hs = None
        t_loss = 0
        for x, y in get_batches(train_set, 12):

            opt.zero_grad()
            
            # Create batch_size dimension
            x = x.unsqueeze(0)
            out, hs = model(x, hs)
            hs = tuple([h.data for h in hs])
            
            loss = criterion(out, y)
            loss.backward()
            opt.step()
            t_loss += loss.item()
            
        if valid_data is not None:
                model.eval()
                val_x, val_y = valid_data
                val_x = val_x.unsqueeze(0)
                preds, _ = model(val_x, hs)
                v_loss = criterion(preds, val_y)
                valid_loss.append(v_loss.item())
                
                model.train()
            
        train_loss.append(np.mean(t_loss))
            
        if print_every:    
            if e % print_every == 0:
                print(f'Epoch {e}:\nTraining Loss: {train_loss[-1]}')
                if valid_data is not None:
                    print(f'Validation Loss: {valid_loss[-1]}')
        
    return train_loss, valid_loss
    
#     plt.figure(figsize=[8., 6.])
#     plt.plot(train_loss, label='Training Loss')
#     plt.plot(valid_loss, label='Validation Loss')
#     plt.title('Loss vs Epochs')
#     plt.xlabel('Epochs')
#     plt.legend()
#     plt.show()

In [8]:
def train_cross_valid(model, epochs, lr, training_data):
    train_losses = []
    valid_losses = []
    X_train, y_train = training_data
    tscv = TimeSeriesSplit(n_splits=5)
    for train_index, validation_index in tscv.split(X_train):
        # print("TRAIN:", train_index, "VALIDATION:", validation_index)
        # Split training and validation data
        # split = int(0.8 * len(df))
        # x_train = X_train[train_index].to_numpy() #.reshape(-1, 1)
        # x_valid = X_train[validation_index].to_numpy() #.reshape(-1, 1)
        x_train_fold = X_train[train_index]
        x_valid_fold = X_train[validation_index]

        y_train_fold = y_train[train_index].reshape(-1, 1)
        y_valid_fold = y_train[validation_index].reshape(-1, 1)

        # scale data
        # t_scaler = MinMaxScaler(feature_range=(-1, 1))
        # v_scaler = MinMaxScaler(feature_range=(-1, 1))
        t_scaler = MinMaxScaler()
        x_train_fold = t_scaler.fit_transform(x_train_fold)
        x_valid_fold = t_scaler.transform(x_valid_fold)

        # convert training data to tensor
        x_train_fold = torch.tensor(x_train_fold, dtype=torch.float32)
        x_valid_fold = torch.tensor(x_valid_fold, dtype=torch.float32)

        # y_train_fold = torch.tensor(y_train_fold, dtype=torch.float32)
        # y_valid_fold = torch.tensor(y_valid_fold, dtype=torch.float32)

        # Create training and validation set:
        train_data_fold = (x_train_fold, y_train_fold)
        valid_data_fold = (x_valid_fold, y_valid_fold)
        
        train_loss, valid_loss = train(model, epochs, train_data_fold, valid_data=valid_data_fold, lr=lr, print_every=None)
                #Only append losses from the last epoch
        train_losses.append(train_loss[-1])
        valid_losses.append(valid_loss[-1])

    # Compute average loss of folds             
    train_losses_avg = sum(train_losses) / len(train_losses)   
    valid_losses_avg = sum(valid_losses) / len(valid_losses)  
    return train_losses_avg, valid_losses_avg

In [9]:
# Optuna objective function

def objective(trial):

    params = {
              'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-1),
              'hidden_size': trial.suggest_int('hidden_size', 40, 100, 20),
              'num_layers': trial.suggest_int('num_layers', 2, 4),
              'dropout_p': trial.suggest_float('dropout_p', 0.0, 0.2, step=0.1)
              }
    
    input_size = x_train.size()[1]  # 21 in current implementation
    hidden_size = 100
    #num_layers = 1     # tuned by optuna
    output_size = 1

    model = stocksLSTM(input_size, params['hidden_size'], params['num_layers'], output_size, params['dropout_p'])
    
    train_losses_avg, valid_losses_avg = train_cross_valid(model, EPOCHS, params['learning_rate'], train_data)

    return valid_losses_avg

In [10]:
def optimize_hyperparameters(df):
    '''
    Find the optimal hyperparameters given the training data.
    '''
    x_train, y_train, x_test, y_test = convert_df(df)
    train_data = (x_train, y_train)
    test_data = (x_test, y_test)

    study = optuna.create_study(direction="minimize", sampler=optuna.samplers.TPESampler())
    study.optimize(objective, n_trials=30)

    best_trial = study.best_trial

    lr = best_trial.params.get('learning_rate')
    hs = best_trial.params.get('hidden_size')
    nl = best_trial.params.get('num_layers')
    dp = best_trial.params.get('dropout_p')
    return lr, hs, nl, dp, study

In [11]:
EPOCHS = 100

## Obtaining the DataFrames

In this cell, we obtain the four desired DataFrames corresponding to 1.) long-term with all features, 2.) short-term with all features, 3.) short-term with selected features (including StockTwits sentiment), and 4.) short-term with selected features (excluding StockTwits sentiment).

In [12]:
# df = pd.read_csv('final/Stock_Stalkers/data/AAPL_data.csv')
df = pd.read_csv('data/MSFT_data.csv')
df.dropna()

df["Date"]=pd.to_datetime(df.Date, format="%Y-%m-%d")
df.index=df['Date']
data = df.sort_index(ascending=True, axis=0)
add_datepart(df, 'Date', drop=False)
df.drop('Elapsed', axis=1, inplace=True)

df = add_all_ta_features(
    df, high="High", low="Low", open="Open", close="Close", volume="Volume"
)

  dip[idx] = 100 * (self._dip[idx] / value)
  din[idx] = 100 * (self._din[idx] / value)
  self._psar_up = pd.Series(index=self._psar.index)
  self._psar_down = pd.Series(index=self._psar.index)


In [16]:
#All parameters, whole time period:
df_all = df
df_all = df[df['Date'] >= '2017-08-09']
df_all.index=df_all['Date']
df_all = df_all.loc[:, df_all.columns !='Date']
# All parameters, short time period:
df_all_short = df[df['Date'] >= '2022-01-01']
# Add sentiment
df_all_short.index = np.array(range(len(df_all_short)))
# df_sentiment = pd.read_csv('final/Stock_Stalkers/data/AAPL_byday_RoBERTa.csv')
df_sentiment = pd.read_csv('data/MSFT_byday_RoBERTa.csv')
# Drop the first empty column for AAPL
df_sentiment.date = df_sentiment.date.apply(convert)
df_sentiment.rename(columns={'date':'Date'}, inplace=True)
df_all_short = df_all_short.merge(df_sentiment, how='inner', on='Date').fillna(0)
df_all_short.index=df_all_short['Date']
df_all_short = df_all_short.loc[:, df_all_short.columns !='Date']

# Selected parameters, with sentiment:
# sentiment_params = pd.read_csv('feature_selection_AAPL_Sentiment.csv').feature[:20]
sentiment_params = ["momentum_ppo_hist", "volume_nvi", "trend_cci", "volume_adi", "volatility_kcp", "momentum_roc", "trend_adx_neg", "trend_adx_pos", "momentum_pvo_hist", "volatility_atr", "volume_obv", "momentum_stoch_rsi", "momentum_rsi", "trend_macd_signal", "momentum_pvo", "trend_dpo", "momentum_uo", "momentum_stoch_rsi_k", "volume_fi", "trend_mass_index"]
df_selected_sentiment = df_all_short[sentiment_params + ['Close']]

# Selected parameters, no sentiment:
# noSentiment_params = pd.read_csv('feature_selection_AAPL_NoSentiment.csv').feature[:20]
noSentiment_params = ["momentum_ppo_hist", "trend_cci", "volume_nvi", "volatility_kcp", "volume_adi", "momentum_roc", "trend_adx_neg", "momentum_stoch_rsi_k", "trend_adx_pos", "trend_macd_signal", "momentum_pvo_hist", "volume_obv", "volatility_atr", "trend_dpo",  "momentum_pvo", "bearish", "volatility_bbl", "trend_ema_fast", "trend_macd_diff", "bullish"]

df_selected_noSentiment = df_all_short[noSentiment_params + ['Close']]

In [17]:
df_all_short

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Adj Close,Year,Month,Week,Day,...,momentum_ppo_hist,momentum_pvo,momentum_pvo_signal,momentum_pvo_hist,momentum_kama,others_dr,others_dlr,others_cr,bullish,bearish
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-01-03,338.000000,329.779999,335.350006,334.750000,28865100.0,334.059265,2022,1,1,3,...,0.100116,-9.156754,-5.183771,-3.972983,333.680862,-0.466819,-0.467912,388.971669,177,25
2022-01-04,335.200012,326.119995,334.829987,329.010010,32674300.0,328.331116,2022,1,1,4,...,-0.082147,-5.503078,-5.247632,-0.255446,333.472700,-1.714710,-1.729581,380.587225,188,22
2022-01-05,326.070007,315.980011,325.859985,316.380005,40054300.0,315.727173,2022,1,1,5,...,-0.439052,-0.459711,-4.290048,3.830337,332.655758,-3.838790,-3.914414,362.138488,320,63
2022-01-06,318.700012,311.489990,313.149994,313.880005,39646100.0,313.232330,2022,1,1,6,...,-0.693850,3.112368,-2.809565,5.921933,330.207000,-0.790189,-0.793328,358.486721,308,83
2022-01-07,316.500000,310.089996,314.149994,314.040009,32720000.0,313.391998,2022,1,1,7,...,-0.818308,3.860324,-1.475587,5.335911,327.714002,0.050976,0.050963,358.720439,226,35
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-04-21,293.299988,280.059998,288.579987,280.809998,29454600.0,280.809998,2022,4,16,21,...,-0.756290,-6.947910,-6.569408,-0.378502,290.412432,-1.938116,-1.957143,310.181130,231,93
2022-04-22,283.200012,273.380005,281.679993,274.029999,29379300.0,274.029999,2022,4,16,22,...,-0.840588,-5.665255,-6.388577,0.723323,287.996014,-2.414444,-2.444069,300.277538,301,72
2022-04-25,281.109985,270.769989,273.290009,280.720001,35678900.0,280.720001,2022,4,17,25,...,-0.693346,-2.804787,-5.671819,2.867033,287.554414,2.441339,2.412015,310.049671,509,127
2022-04-26,278.359985,270.000000,277.500000,270.220001,46518400.0,270.220001,2022,4,17,26,...,-0.790015,2.250299,-4.087396,6.337695,286.583302,-3.740382,-3.812129,294.712248,3278,1095


In [18]:
df_all

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Adj Close,Year,Month,Week,Day,...,momentum_ppo,momentum_ppo_signal,momentum_ppo_hist,momentum_pvo,momentum_pvo_signal,momentum_pvo_hist,momentum_kama,others_dr,others_dlr,others_cr
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-08-09,72.510002,72.050003,72.250000,72.470001,22213400.0,68.037392,2017,8,32,9,...,0.551929,0.822208,-0.270279,-4.138656,-2.439150,-1.699505,73.014823,-0.439620,-0.440590,5.857438
2017-08-10,72.190002,71.349998,71.900002,71.410004,24734500.0,67.042236,2017,8,32,10,...,0.374954,0.732757,-0.357804,-3.151685,-2.581657,-0.570028,72.809093,-1.462671,-1.473473,4.309092
2017-08-11,72.699997,71.279999,71.610001,72.500000,21443700.0,68.065552,2017,8,32,11,...,0.351996,0.656605,-0.304609,-3.481337,-2.761593,-0.719744,72.803390,1.526392,1.514860,5.901258
2017-08-14,73.720001,72.949997,73.059998,73.589996,20096600.0,69.088890,2017,8,33,14,...,0.449873,0.615259,-0.165386,-4.182864,-3.045847,-1.137017,72.824921,1.503443,1.492254,7.493423
2017-08-15,73.589996,73.040001,73.589996,73.220001,19181400.0,69.107765,2017,8,33,15,...,0.480556,0.588318,-0.107763,-5.033554,-3.443389,-1.590165,72.831996,-0.502779,-0.504047,6.952968
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-04-21,293.299988,280.059998,288.579987,280.809998,29454600.0,280.809998,2022,4,16,21,...,-1.559924,-0.803634,-0.756290,-6.947910,-6.569408,-0.378502,290.412432,-1.938116,-1.957143,310.181130
2022-04-22,283.200012,273.380005,281.679993,274.029999,29379300.0,274.029999,2022,4,16,22,...,-1.854370,-1.013781,-0.840588,-5.665255,-6.388577,0.723323,287.996014,-2.414444,-2.444069,300.277538
2022-04-25,281.109985,270.769989,273.290009,280.720001,35678900.0,280.720001,2022,4,17,25,...,-1.880464,-1.187118,-0.693346,-2.804787,-5.671819,2.867033,287.554414,2.441339,2.412015,310.049671
2022-04-26,278.359985,270.000000,277.500000,270.220001,46518400.0,270.220001,2022,4,17,26,...,-2.174636,-1.384621,-0.790015,2.250299,-4.087396,6.337695,286.583302,-3.740382,-3.812129,294.712248


In [19]:
df_selected_sentiment

Unnamed: 0_level_0,momentum_ppo_hist,volume_nvi,trend_cci,volume_adi,volatility_kcp,momentum_roc,trend_adx_neg,trend_adx_pos,momentum_pvo_hist,volatility_atr,...,momentum_stoch_rsi,momentum_rsi,trend_macd_signal,momentum_pvo,trend_dpo,momentum_uo,momentum_stoch_rsi_k,volume_fi,trend_mass_index,Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-01-03,0.100116,4576.568248,12.405226,2.567303e+09,0.512020,0.029884,25.726828,23.997685,-3.972983,6.683878,...,0.559966,51.386552,1.720436,-9.156754,-8.797009,53.954190,0.635441,-6.959757e+06,25.835517,334.750000
2022-01-04,-0.082147,4576.568248,-43.766263,2.555428e+09,-0.053099,1.265010,27.081926,21.767517,-0.255446,6.923492,...,0.263428,46.073676,1.652434,-5.503078,-10.038016,51.691331,0.489965,-3.275839e+07,25.748110,329.010010
2022-01-05,-0.439052,4576.568248,-151.069499,2.518549e+09,-1.010018,-2.291533,33.453783,19.033895,3.830337,7.534143,...,0.000000,37.007117,1.287172,-0.459711,-13.000999,35.821721,0.274465,-1.003480e+08,25.680380,316.380005
2022-01-06,-0.693850,4540.404709,-169.964791,2.505187e+09,-1.098192,-1.884905,35.460437,17.708663,5.921933,7.501731,...,0.000000,35.517241,0.711923,3.112368,-4.566495,37.568780,0.087809,-1.001719e+08,25.534831,313.880005
2022-01-07,-0.818308,4542.719228,-153.408600,2.512793e+09,-0.852450,-4.048397,34.609277,16.601962,5.335911,7.392558,...,0.007512,35.695676,0.036144,3.860324,2.296509,38.175834,0.002504,-8.511375e+07,25.441753,314.040009
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-04-21,-0.756290,4758.370190,-81.936750,2.555256e+09,0.061190,-10.845478,28.170727,16.131696,-0.378502,7.946752,...,0.110107,39.726472,-2.356649,-6.947910,13.033507,37.196570,0.194790,-5.437912e+07,23.879157,280.809998
2022-04-22,-0.840588,4643.482029,-112.391102,2.529766e+09,-0.207226,-11.853450,31.612556,14.738533,0.723323,8.134078,...,0.000000,35.805977,-2.967971,-5.665255,3.157002,35.141443,0.123213,-7.506662e+07,24.034462,274.029999
2022-04-25,-0.693346,4643.482029,-96.134716,2.562754e+09,0.361616,-6.270450,31.044329,13.423929,2.867033,8.354670,...,0.284143,41.898912,-3.469144,-2.804787,6.174997,38.388492,0.131416,-3.024397e+07,24.211691,280.720001
2022-04-26,-0.790015,4643.482029,-108.986832,2.518684e+09,-0.171164,-10.336130,28.883254,12.208163,6.337695,8.591203,...,0.022079,36.106435,-4.034620,2.250299,3.799004,36.177822,0.102074,-9.570100e+07,24.409068,270.220001


In [20]:
df_selected_noSentiment

Unnamed: 0_level_0,momentum_ppo_hist,trend_cci,volume_nvi,volatility_kcp,volume_adi,momentum_roc,trend_adx_neg,momentum_stoch_rsi_k,trend_adx_pos,trend_macd_signal,...,volume_obv,volatility_atr,trend_dpo,momentum_pvo,bearish,volatility_bbl,trend_ema_fast,trend_macd_diff,bullish,Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-01-03,0.100116,12.405226,4576.568248,0.512020,2.567303e+09,0.029884,25.726828,0.635441,23.997685,1.720436,...,2.792682e+09,6.683878,-8.797009,-9.156754,25,320.664423,335.595533,0.337788,177,334.750000
2022-01-04,-0.082147,-43.766263,4576.568248,-0.053099,2.555428e+09,1.265010,27.081926,0.489965,21.767517,1.652434,...,2.760008e+09,6.923492,-10.038016,-5.503078,22,321.075120,334.582375,-0.272011,188,329.010010
2022-01-05,-0.439052,-151.069499,4576.568248,-1.010018,2.518549e+09,-2.291533,33.453783,0.274465,19.033895,1.287172,...,2.719954e+09,7.534143,-13.000999,-0.459711,63,318.072672,331.782011,-1.461045,320,316.380005
2022-01-06,-0.693850,-169.964791,4540.404709,-1.098192,2.505187e+09,-1.884905,35.460437,0.087809,17.708663,0.711923,...,2.680308e+09,7.501731,-4.566495,3.112368,83,314.906103,329.027856,-2.300996,308,313.880005
2022-01-07,-0.818308,-153.408600,4542.719228,-0.852450,2.512793e+09,-4.048397,34.609277,0.002504,16.601962,0.036144,...,2.713028e+09,7.392558,2.296509,3.860324,35,312.279339,326.722033,-2.703119,226,314.040009
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-04-21,-0.756290,-81.936750,4758.370190,0.061190,2.555256e+09,-10.845478,28.170727,0.194790,16.131696,-2.356649,...,2.731496e+09,7.946752,13.033507,-6.947910,93,272.381593,288.774306,-2.219394,231,280.809998
2022-04-22,-0.840588,-112.391102,4643.482029,-0.207226,2.529766e+09,-11.853450,31.612556,0.123213,14.738533,-2.967971,...,2.702117e+09,8.134078,3.157002,-5.665255,72,269.047518,286.505951,-2.445290,301,274.029999
2022-04-25,-0.693346,-96.134716,4643.482029,0.361616,2.562754e+09,-6.270450,31.044329,0.131416,13.423929,-3.469144,...,2.737796e+09,8.354670,6.174997,-2.804787,127,267.305558,285.615805,-2.004692,509,280.720001
2022-04-26,-0.790015,-108.986832,4643.482029,-0.171164,2.518684e+09,-10.336130,28.883254,0.102074,12.208163,-4.034620,...,2.691277e+09,8.591203,3.799004,2.250299,1095,264.220979,283.247220,-2.261902,3278,270.220001


## Testing the Model

In [21]:
# Impute obtained HP to avoid HP optimization, which takes a long time
    # long-term data and all parameters
lr_lta =  1.1883247534334845e-05
hs_lta =  80
nl_lta =  4
dp_lta =  0.1
    # short-term data and all parameters
lr_sta =  1.1883247534334845e-05
hs_sta =  80
nl_sta =  4
dp_sta =  0.1
   # short-term data and selected parameters, including StockTwits sentiment 
lr_sts = 1.1615177249554967e-05
hs_sts = 100
nl_sts =  3
dp_sts =  0.2
    # short-term data and selected parameters, excluding StockTwits sentiment
lr_stns = 1.7691961498826596e-05
hs_stns = 60
nl_stns = 3
dp_stns = 0.2


# Scale data
    # long-term data and all parameters
x_train_lta, y_train_lta, x_test_lta, y_test_lta = convert_df(df_all)

t_scaler_lta = MinMaxScaler()
x_train_lta = t_scaler_lta.fit_transform(x_train_lta)
x_test_lta = t_scaler_lta.transform(x_test_lta)

x_train_lta = torch.tensor(x_train_lta, dtype=torch.float32)
x_test_lta = torch.tensor(x_test_lta, dtype=torch.float32)

train_data_lta = (x_train_lta, y_train_lta)

    # short-term data and all parameters
x_train_sta, y_train_sta, x_test_sta, y_test_sta = convert_df(df_all_short)

t_scaler_sta = MinMaxScaler()
x_train_sta = t_scaler_sta.fit_transform(x_train_sta)
x_test_sta = t_scaler_sta.transform(x_test_sta)

x_train_sta = torch.tensor(x_train_sta, dtype=torch.float32)
x_test_sta = torch.tensor(x_test_sta, dtype=torch.float32)

train_data_sta = (x_train_sta, y_train_sta)

   # short-term data and selected parameters, including StockTwits sentiment 
x_train_sts, y_train_sts, x_test_sts, y_test_sts = convert_df(df_selected_sentiment)

t_scaler_sts = MinMaxScaler()
x_train_sts = t_scaler_sts.fit_transform(x_train_sts)
x_test_sts = t_scaler_sts.transform(x_test_sts)

x_train_sts = torch.tensor(x_train_sts, dtype=torch.float32)
x_test_sts = torch.tensor(x_test_sts, dtype=torch.float32)

train_data_sts = (x_train_sts, y_train_sts)

    # short-term data and selected parameters, excluding StockTwits sentiment
x_train_stns, y_train_stns, x_test_stns, y_test_stns = convert_df(df_selected_noSentiment)

t_scaler_stns = MinMaxScaler()
x_train_stns = t_scaler_stns.fit_transform(x_train_stns)
x_test_stns = t_scaler_stns.transform(x_test_stns)

x_train_stns = torch.tensor(x_train_stns, dtype=torch.float32)
x_test_stns = torch.tensor(x_test_stns, dtype=torch.float32)

train_data_stns = (x_train_stns, y_train_stns)


In [30]:
# Create models with tuned HP
output_size = 1
# model_lta = stocksLSTM(x_train_lta.size()[1], hs_lta, nl_lta, output_size, dp_lta)
model_sta = stocksLSTM(x_train_sta.size()[1], hs_sta, nl_sta, output_size, dp_sta)
model_sts = stocksLSTM(x_train_sts.size()[1], hs_sts, nl_sts, output_size, dp_sts)
model_stns = stocksLSTM(x_train_stns.size()[1], hs_stns, nl_stns, output_size, dp_stns)

In [31]:
# Train models with tuned HP
EPOCHS = 100*2
# train(model_lta, EPOCHS, train_data_lta, valid_data=None, lr=lr_lta*10)
train(model_sta, EPOCHS, train_data_sta, valid_data=None, lr=lr_sta*10)
train(model_sts, EPOCHS, train_data_sts, valid_data=None, lr=lr_sts*10)
train(model_stns, EPOCHS, train_data_stns, valid_data=None, lr=lr_stns*10)

Epoch 0:
Training Loss: 52.3973713517189
Epoch 100:
Training Loss: 11.721979612484574
Epoch 0:
Training Loss: 52.43548119068146
Epoch 100:
Training Loss: 34.634449884295464
Epoch 0:
Training Loss: 52.40644037723541
Epoch 100:
Training Loss: 27.888370171189308


([52.40644037723541,
  52.16966491937637,
  52.10980695486069,
  52.045419335365295,
  51.9400731921196,
  51.784115731716156,
  51.52215391397476,
  51.119039952754974,
  50.66797870397568,
  49.68977463245392,
  48.21837604045868,
  47.043761909008026,
  46.307760536670685,
  48.01606410741806,
  45.553411185741425,
  45.453852474689484,
  44.48126792907715,
  43.88927811384201,
  43.85631346702576,
  43.02502006292343,
  43.03279083967209,
  41.63334000110626,
  42.34192901849747,
  43.44953149557114,
  41.071510285139084,
  41.023627519607544,
  40.12959411740303,
  39.997285574674606,
  39.22275570034981,
  39.98687946796417,
  38.30376523733139,
  39.24128922820091,
  40.63916239142418,
  39.42154571413994,
  39.648467630147934,
  39.06526857614517,
  39.10265004634857,
  39.788908928632736,
  38.293706610798836,
  38.07891121506691,
  37.76626121997833,
  38.644502341747284,
  36.91480705142021,
  36.14734676480293,
  37.54391369223595,
  36.05950079858303,
  36.05584980547428,


In [32]:
# Make predictions and compute testing indicators
    # long-term data and all parameters
# hs = None
# train_preds_lta, hs = model_lta(x_train_lta.unsqueeze(0), hs)
# train_preds_class_lta = np.where(train_preds_lta > 0, 1, -1)
# print('lTRAINING: long-term data and all parameters \n', classification_report(y_train_lta, train_preds_class_lta))
# test_preds_lta, hs = model_lta(x_test_lta.unsqueeze(0), hs)
# test_preds_class_lta = np.where(test_preds_lta > 0, 1, -1)
# print('lTESTING: long-term data and all parameters \n', classification_report(y_test_lta, test_preds_class_lta))
    # short-term data and all parameters
hs = None
train_preds_sta, hs = model_sta(x_train_sta.unsqueeze(0), hs)
train_preds_class_sta = np.where(train_preds_sta > 0, 1, -1)
print('TRAINING: short-term data and all parameters \n', classification_report(y_train_sta, train_preds_class_sta))

test_preds_sta, hs = model_sta(x_test_sta.unsqueeze(0), hs)
test_preds_class_sta = np.where(test_preds_sta > 0, 1, -1)
print('TESTING: short-term data and all parameters \n', classification_report(y_test_sta, test_preds_class_sta))

   # short-term data and selected parameters, including StockTwits sentiment 
hs = None
train_preds_sts, hs = model_sts(x_train_sts.unsqueeze(0), hs)
train_preds_class_sts = np.where(train_preds_sts > 0, 1, -1)
print('TRAINING: short-term data and selected parameters, including StockTwits sentiment \n', classification_report(y_train_sts, train_preds_class_sts))

test_preds_sts, hs = model_sts(x_test_sts.unsqueeze(0), hs)
test_preds_class_sts = np.where(test_preds_sts > 0, 1, -1)
print('TESTING: short-term data and selected parameters, including StockTwits sentiment \n', classification_report(y_test_sts, test_preds_class_sts))

   # short-term data and selected parameters, excluding StockTwits sentiment 
hs = None
train_preds_stns, hs = model_stns(x_train_stns.unsqueeze(0), hs)
train_preds_class_stns = np.where(train_preds_stns > 0, 1, -1)
print('TRAINING: short-term data and selected parameters, excluding StockTwits sentiment \n', classification_report(y_train_stns, train_preds_class_stns))

test_preds_stns, hs = model_stns(x_test_stns.unsqueeze(0), hs)
test_preds_class_stns = np.where(test_preds_stns > 0, 1, -1)
print('TESTING: short-term data and selected parameters, excluding StockTwits sentiment \n', classification_report(y_test_stns, test_preds_class_stns))

TRAINING: short-term data and all parameters 
               precision    recall  f1-score   support

        -1.0       0.85      1.00      0.92        33
         1.0       1.00      0.81      0.89        31

    accuracy                           0.91        64
   macro avg       0.92      0.90      0.90        64
weighted avg       0.92      0.91      0.91        64

TESTING: short-term data and all parameters 
               precision    recall  f1-score   support

        -1.0       0.75      0.67      0.71         9
         1.0       0.62      0.71      0.67         7

    accuracy                           0.69        16
   macro avg       0.69      0.69      0.69        16
weighted avg       0.70      0.69      0.69        16

TRAINING: short-term data and selected parameters, including StockTwits sentiment 
               precision    recall  f1-score   support

        -1.0       0.77      0.91      0.83        33
         1.0       0.88      0.71      0.79        31

    a

In [27]:
output_size = 1
model_lta = stocksLSTM(x_train_lta.size()[1], hs_lta, nl_lta, output_size, dp_lta)
EPOCHS = 100*2
train(model_lta, EPOCHS, train_data_lta, valid_data=None, lr=lr_lta*10)

Epoch 0:
Training Loss: 927.5938911437988
Epoch 100:
Training Loss: 303.3563450118527


([927.5938911437988,
  924.6823346614838,
  922.1299894452095,
  916.1848202347755,
  903.8533713817596,
  901.0459917187691,
  899.4245812296867,
  899.2450836896896,
  899.5562472343445,
  895.495986700058,
  895.4941044449806,
  899.5702518224716,
  898.9428333640099,
  889.975484251976,
  891.1516668200493,
  883.5601490736008,
  875.9467582702637,
  868.4541966915131,
  865.2690479755402,
  846.2658216059208,
  837.2826673090458,
  837.7952584028244,
  822.7619053125381,
  817.3093300461769,
  817.0510545670986,
  803.0243611931801,
  798.53095908463,
  807.1839850693941,
  808.9341454952955,
  807.2079176902771,
  785.7745993286371,
  774.670002296567,
  762.273717738688,
  739.6757710054517,
  741.217675998807,
  721.3470095545053,
  704.4957345053554,
  710.9141810685396,
  703.5214901715517,
  690.5587439984083,
  695.4286284819245,
  691.8389503508806,
  686.3379921391606,
  669.2114994898438,
  658.4758292771876,
  655.8680106177926,
  639.5973670892417,
  626.3875535465777,

In [30]:
    # long-term data and all parameters
hs = None
train_preds_lta, hs = model_lta(x_train_lta.unsqueeze(0), hs)
train_preds_class_lta = np.where(train_preds_lta > 0, 1, -1)
print('lTRAINING: long-term data and all parameters \n', classification_report(y_train_lta, train_preds_class_lta))
test_preds_lta, hs = model_lta(x_test_lta.unsqueeze(0), hs)
test_preds_class_lta = np.where(test_preds_lta > 0, 1, -1)
print('lTESTING: long-term data and all parameters \n', classification_report(y_test_lta, test_preds_class_lta))

lTRAINING: long-term data and all parameters 
               precision    recall  f1-score   support

        -1.0       0.96      0.94      0.95       413
         1.0       0.96      0.97      0.96       537

    accuracy                           0.96       950
   macro avg       0.96      0.96      0.96       950
weighted avg       0.96      0.96      0.96       950

lTESTING: long-term data and all parameters 
               precision    recall  f1-score   support

        -1.0       0.53      0.67      0.59       117
         1.0       0.57      0.42      0.48       121

    accuracy                           0.54       238
   macro avg       0.55      0.54      0.54       238
weighted avg       0.55      0.54      0.54       238

