# Notebook to test the parameters of the BiLSTM trend model with shuffling

In [None]:
#imports
import numpy as np
from math import sqrt
from numpy import concatenate

import sklearn
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.utils import shuffle

from matplotlib import pyplot
from matplotlib import pyplot as plt
from matplotlib.pyplot import figure

import pandas as pd
from pandas import read_csv
from pandas import DataFrame
from pandas import concat

import keras.utils
from keras.models import Sequential
from keras.layers import Dropout
from keras.layers import Activation
from keras.layers import Bidirectional
from keras.layers import Dense
from keras.layers import LSTM
from keras.optimizers import Adam

import tensorflow as tf
import seaborn as sn
import seed
import os
tf.get_logger().setLevel('ERROR')

In [None]:
"""
method to create lagged features

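data - observations in time order (list, array or DataFrame)
to_keep - number of lagged steps to add as columns: (t-to_keep) ... (t-1)
to_remove - number of steps from t onwards to add as columns: (t) ... (t+to_remove-1)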

"""
def create_lagged_features(data, to_keep=1, to_remove=1):
    """
    Build a table of time-shifted copies of the input columns.

    data - observations in time order: a list, a NumPy array or a DataFrame
    to_keep - number of past steps to add, giving columns (t-to_keep) ... (t-1)
    to_remove - number of steps from t onwards to add, giving columns
                (t), (t+1) ... (t+to_remove-1)

    Returns a DataFrame whose columns are named var<j>(t-<i>), var<j>(t) and
    var<j>(t+<i>); every row that contains a NaN is dropped.
    """
    df = DataFrame(data)
    # count the variables on the frame itself: this is right for nested lists
    # and 1-D arrays as well as for 2-D arrays and DataFrames
    variables = df.shape[1]
    columns, names = [], []

    # past observations, oldest first: (t-to_keep) ... (t-1)
    for i in range(to_keep, 0, -1):
        columns.append(df.shift(i))
        names += ['var%d(t-%d)' % (j + 1, i) for j in range(variables)]

    # current and following observations: (t), (t+1) ... (t+to_remove-1)
    for i in range(to_remove):
        columns.append(df.shift(-i))
        if i == 0:
            names += ['var%d(t)' % (j + 1) for j in range(variables)]
        else:
            names += ['var%d(t+%d)' % (j + 1, i) for j in range(variables)]

    #put it all together
    final = concat(columns, axis=1)
    final.columns = names

    #drop rows with NaN values (the shifting leaves the edge rows incomplete)
    final.dropna(inplace=True)

    return final

In [None]:
"""
function to calculate rsi

data - data
period - RSI period

"""
def rsi(data, period: int = 14):
    """
    Relative Strength Index computed from the "Close" column.

    data - DataFrame with a "Close" column
    period - RSI period; sets the smoothing (com = period - 1) and the
             minimum number of observations needed before a value is produced

    Returns a Series with one RSI value per row of data (NaN until enough
    observations are available).
    """
    change = data["Close"].diff()

    # gains keep the rises (falls become 0); losses keep the size of the falls
    gains = change.mask(change < 0, 0)
    losses = change.mask(change > 0, 0).abs()

    # both averages use the same exponentially weighted smoothing
    smoothing = {"com": period - 1, "min_periods": period}
    avg_gain = gains.ewm(**smoothing).mean()
    avg_loss = losses.ewm(**smoothing).mean()

    relative_strength = avg_gain / avg_loss
    return 100 - (100 / (1 + relative_strength))

In [None]:
"""
method to load lag dataset

step - shows the current iteration - used to know how to shuffle
lag_features - the number of lagged features
train_ratio - percentage of dataset to use as training data
lag_granularity - day or hours - the type of lag for dataset to import
lag - day or hours - actual value of lag of dataset to import
dataset_grouped_by - shows whether dataset is grouped daily or hourly
cleaned - whether or not to use cleaned data

"""
def load_lag_sets(step, lag_features, train_ratio, lag_granularity, lag, dataset_grouped_by, cleaned):
    """
    Load one dataset, build lagged samples, shuffle them and split them into
    scaled train/test sets.

    step - current iteration; the samples are shuffled step+1 times from a fixed seed
    lag_features - number of lagged time steps per sample
    train_ratio - fraction of the samples to use as training data
    lag_granularity - day or hours - the type of lag of the dataset to import
    lag - value of the lag of the dataset to import; a value <= 0 selects the no-lag file
    dataset_grouped_by - whether the dataset is grouped daily or hourly (part of the folder name)
    cleaned - whether or not to use the cleaned data

    Returns (train_X, test_X, train_y, test_y, n_features, df, train_size):
    train_X, test_X - min-max scaled inputs shaped [samples, lag_features, n_features]
    train_y - training labels, one-hot encoded over 2 classes
    test_y - test labels as a flat array of class ids
    n_features - number of feature columns per time step
    df - the feature frame before the lagged columns are added
    train_size - number of training samples
    """
    analyser = "vader"

    #folder to read
    folder = "./../../datasets/tweets_prices_volumes_sentiment/"+analyser+"/"+dataset_grouped_by+"_datasets"
    #add cleaned if to use cleaned data
    if cleaned:
        folder = folder + '/cleaned'
    #get full filename
    if lag > 0:
        filename = folder+"/final_data_lag_"+lag_granularity+"_"+str(lag)+".csv"
    else:
        filename = folder+"/final_data_no_lag.csv"

    #read dataset
    df = pd.read_csv(filename)

    #group by datetime, averaging the rows that share the same DateTime
    df = df.groupby('DateTime').agg(lambda x: x.mean())

    #calculate price direction: 1 when Close is above the previous Close, otherwise 0
    df["Change"] = (df["Close"] > df["Close"].shift(1)).astype(int)

    #whether to add RSI, long and short moving averages
    add_RSI = False
    add_longMAvg = False
    add_shortMAvg = False

    #calculate RSI
    if add_RSI:
        rsi_period = 14
        df['RSI'] = rsi(df, rsi_period)

    #calculate moving averages
    if add_shortMAvg:
        short_window = 9
        df['short_mavg'] = df.rolling(window=short_window)["Close"].mean()

    if add_longMAvg:
        long_window = 21
        df["long_mavg"] = df.rolling(window=long_window)["Close"].mean()

    #drop the warm-up rows exactly once, using the longest enabled window
    #(trimming right after the RSI as well would discard the RSI rows twice)
    if add_longMAvg:
        df = df.iloc[long_window:]
    elif add_RSI:
        df = df.iloc[rsi_period:]
    elif add_shortMAvg:
        df = df.iloc[short_window:]

    #keep only wanted columns; the label ('Change') must stay first so that it becomes var1
    features = ['Change', 'subjectivity', 'polarity','Tweet_vol','Volume_(BTC)'] if analyser == "Textblob" else ['Change', 'Close', 'pos_pol', 'neg_pol', 'Tweet_vol']

    if add_RSI:
        features.append("RSI")

    if add_longMAvg:
        features.append("long_mavg")

    if add_shortMAvg:
        features.append("short_mavg")

    #keep only wanted features
    df = df[features]

    #number of previous records to consider for every example
    n_lag = lag_features
    #number of features
    n_features = len(features)
    #number of input columns: every feature at every lagged step
    total_features = n_lag*n_features

    #NOTE(review): with lag_features == 0 the reshape to [samples, 0, n_features]
    #below cannot succeed for non-empty data - confirm whether 0 lags must be supported
    if total_features == 0:
        total_features = n_features

    #add lagged features and replace the DateTime index by a plain positional one
    df_with_lagged = create_lagged_features(df, n_lag, 1)
    df_with_lagged = df_with_lagged.reset_index()
    df_with_lagged = df_with_lagged.drop(['DateTime'], axis=1)

    #shuffle step+1 times from a fixed seed, so every step gives a different but repeatable order
    np.random.seed(1)
    for _ in range(0, step+1):
        df_with_lagged = shuffle(df_with_lagged)

    #divide df into train and test
    data_len = len(df_with_lagged)
    train_size = int(data_len*train_ratio)

    #get training data
    train = df_with_lagged.iloc[:train_size]
    #get testing data
    test = df_with_lagged.iloc[train_size:]

    #get labels (taken before scaling): var1(t) is the 'Change' value of the step to predict
    train_y = train["var1(t)"].values
    test_y = test["var1(t)"].values

    #normalise features; the scaler is fitted on the training rows only
    xscaler = MinMaxScaler(feature_range=(0, 1))
    train = xscaler.fit_transform(train)
    test = xscaler.transform(test)

    #keep only the lagged columns, i.e. remove the values of the time to be predicted
    train_X = train[:, :total_features]
    test_X = test[:, :total_features]

    # reshape input to be 3D [samples, timesteps, features]
    train_X = train_X.reshape((train_X.shape[0], n_lag, n_features))
    test_X = test_X.reshape((test_X.shape[0], n_lag, n_features))

    #get training labels as categorical; the test labels stay as class ids
    train_y = keras.utils.to_categorical(train_y, 2)

    return train_X, test_X, train_y, test_y, len(features), df, train_size

In [None]:
"""
function to create a model and test it

epochs - number of epochs
neurons - number of neurons per layer
batch_size - batch size
layers - number of layers
train_X - training data
test_X - testing data
train_y - training labels
test_y - testing labels
lag_features - number of lagged features
features - number of features per time step
df - whole data
train_size - number of samples used as training data

"""
def create_model_test(epochs, neurons, batch_size, layers, train_X, test_X, train_y, test_y, lag_features, features, df, train_size):
    """
    Build a stacked bidirectional LSTM classifier, train it and return its
    accuracy on the test data.

    epochs - maximum number of epochs (early stopping on val_loss, patience 20)
    neurons - number of neurons per layer
    batch_size - batch size
    layers - number of bidirectional LSTM layers
    train_X - training data, shaped [samples, timesteps, features]
    test_X - testing data
    train_y - training labels, one-hot encoded over 2 classes
    test_y - testing labels as class ids
    lag_features - number of lagged time steps per sample
    features - number of features per time step
    df - whole data (not used by this function)
    train_size - number of training samples (not used by this function)

    Returns the fraction of test samples whose predicted class equals test_y.
    """
    #set seed to reproduce results
    np.random.seed(1)
    tf.random.set_seed(1)

    # design network
    model = Sequential()
    dropout = 0.25
    activ_func = "linear"

    #the first layer must return sequences when another LSTM layer follows it
    return_seq = layers > 1

    #add first layer
    model.add(Bidirectional(LSTM(neurons, return_sequences=return_seq, input_shape=(train_X.shape[1], train_X.shape[2]), activation=activ_func)))
    model.add(Dropout(dropout))

    #add the other layers; only the last one collapses the sequence
    for i in range(1, layers):
        ret_seq = i != (layers-1)
        model.add(Bidirectional(LSTM(neurons, return_sequences=ret_seq, activation=activ_func)))
        model.add(Dropout(dropout))

    #add a dense layer to output the prediction
    model.add(Dense(2, activation="softmax"))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    callback = keras.callbacks.EarlyStopping(monitor='val_loss', patience =20)

    # fit network; validation_split holds out part of the training data for early stopping
    model.fit(train_X, train_y, epochs=epochs, batch_size=batch_size, verbose=0, shuffle=False,validation_split=0.2, callbacks=[callback])

    #make sure the test data is 3D [samples, timesteps, features]
    test_X = test_X.reshape((test_X.shape[0], lag_features, features))

    #predicted class = index of the highest softmax output
    pred = np.argmax(model.predict(test_X), axis=1)

    #calculate accuracy as correct predictions over all test samples
    actual = np.asarray(test_y).ravel()
    correct = int(np.sum(actual == pred))
    accuracy = correct / len(actual)

    return accuracy

In [None]:
"""
function to implement the running

lag_granularity - this can be days or hours - the lag granularity for the dataset to use
lag - actual value of lag of dataset to use
dataset_grouped_by - whether the dataset to use is grouped daily or hourly - can be day or hour
cleaned - whether to use cleaned dataset

"""
def implement(lag_granularity, lag, dataset_grouped_by, cleaned):
    """
    Run the parameter search and collect the accuracy statistics.

    Every combination of lagged steps, neurons, layers and batch size is
    trained on 5 differently shuffled splits; the mean, minimum and maximum
    accuracy and their spread are stored as one result row.

    lag_granularity - this can be days or hours - the lag granularity for the dataset to use
    lag - actual value of lag of dataset to use
    dataset_grouped_by - whether the dataset to use is grouped daily or hourly - can be day or hour
    cleaned - whether to use cleaned dataset

    Returns (results, full_filename): the results table (earlier results read
    from the result file, followed by the new rows) and the path of the csv
    result file. The file itself is not written here.
    """
    #build result filename
    filename = '../results/bilstm_trend/lstm_groupedby_'+dataset_grouped_by+"_lag_"+lag_granularity+"_"+str(lag)
    if cleaned:
        filename = filename + '_cleaned'
    full_filename = filename+".csv"

    #columns for csv result file
    columns = ["lag", "batch_size", "neurons", "layers", "split", "mean_acc", "min_acc", "max_acc", "diff_acc"]

    #read earlier results if the result file exists and has content, otherwise start empty
    try:
        results = pd.read_csv(full_filename)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        results = pd.DataFrame(columns=columns)

    #lagged_features
    lags = [1, 3, 7, 14]
    #train_ratio
    train_ratio = 0.85
    #combinations
    neurons = [16, 32, 64, 128, 256]
    layers = [1, 2, 3]
    batch_sizes = [5, 20, 50, 80]
    #number of shuffled splits per combination and maximum number of epochs
    runs = 5
    max_epochs = 10000

    rows = []

    #for each lag feature
    for lag_features in lags:

        #the splits depend only on the lag and the run index, so load them
        #once per lag instead of once per parameter combination
        splits = [
            load_lag_sets(i, lag_features, train_ratio, lag_granularity, lag, dataset_grouped_by, cleaned)
            for i in range(runs)
        ]

        #for each neuron, layers and batch_size value
        for n in neurons:
            for n_layers in layers:
                for b in batch_sizes:

                    print("Testing model: lag:", lag_features, ", neurons:", n, ", layers:", n_layers, ", batch_size:", b)

                    #train and test once per split
                    accuracies = np.array([
                        create_model_test(max_epochs, n, b, n_layers, train_X, test_X, train_y, test_y, lag_features, features, df, train_size)
                        for train_X, test_X, train_y, test_y, features, df, train_size in splits
                    ])

                    #summarise the runs
                    mean_acc = accuracies.mean()
                    min_acc = accuracies.min()
                    max_acc = accuracies.max()
                    diff_acc = max_acc - min_acc

                    rows.append({"lag": lag_features, "batch_size": b, "neurons": n, "layers": n_layers, "split": train_ratio, "mean_acc": mean_acc, "min_acc": min_acc, "max_acc": max_acc, "diff_acc": diff_acc})

    #add the new rows after the earlier results (DataFrame.append no longer exists in pandas 2)
    new_results = pd.DataFrame(rows, columns=columns)
    if results.empty:
        results = new_results
    else:
        results = pd.concat([results, new_results], ignore_index=True)

    return results, full_filename
        

In [None]:
#run configuration: these values select the dataset file to load and name the result file
#lag granularity - days or hours
lag_granularity = "days"
#lag value of the dataset to use
lag = 1
#dataset grouped type - day or hour
dataset_grouped_by = "day"
#whether to use the cleaned version of the dataset
cleaned = True

In [None]:
#run the whole parameter search; gives back the results table and the path of the csv result file
results, full_filename = implement(lag_granularity, lag, dataset_grouped_by, cleaned)

In [None]:
#show the results table as the output of this notebook cell
results

In [None]:
#create the results folder if it does not exist
#(check and create the folder, not the csv path: otherwise a directory named
# like the csv file is created and the results cannot be written to it)
folder = os.path.dirname(full_filename)
if folder:
    os.makedirs(folder, exist_ok=True)

In [None]:
#write the results table to the csv result file, without the index column
results.to_csv(full_filename, index=False)