<div align="center"><h1>HW6</h1></div>
<div align="center"><h2>Mohammadreza Ghofrani, 400131076</h2></div>

# Dependancies

In [2]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import backend
from matplotlib import pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# Dataset

## Downloading

In [3]:
!gdown 1SLrmOfW66WQc-zrfo-gX-g2dDA4pAw1F
!unzip -o NN_HW6_Dataset.zip -d data/
!rm 'data/IRX6XTPI0009[2022-05-23-10-18-57].csv'

Downloading...
From: https://drive.google.com/uc?id=1SLrmOfW66WQc-zrfo-gX-g2dDA4pAw1F
To: /content/NN_HW6_Dataset.zip
  0% 0.00/1.80M [00:00<?, ?B/s]100% 1.80M/1.80M [00:00<00:00, 129MB/s]
Archive:  NN_HW6_Dataset.zip
  inflating: data/IRX6XAFF0005.csv   
  inflating: data/IRX6XALS0002.csv   
  inflating: data/IRX6XS300003.csv   
  inflating: data/IRX6XSLC0000.csv   
  inflating: data/IRX6XSNT0009.csv   
  inflating: data/IRX6XTAL0001.csv   
  inflating: data/IRX6XTDP0004.csv   
  inflating: data/IRX6XTPI0009.csv   
  inflating: data/IRX6XTPI0009[2022-05-23-10-18-57].csv  
  inflating: data/IRX6XTPI0025.csv   
  inflating: data/IRX6XWAI0001.csv   
  inflating: data/IRX6XWTH0001.csv   
  inflating: data/IRXWXEXR0007.csv   
  inflating: data/IRXWXEXR0023.csv   
  inflating: data/IRXWXOCI0001.csv   
  inflating: data/IRXWXOCI0027.csv   
  inflating: data/IRXYXTPI0009.csv   
  inflating: data/IRXYXTPI0025.csv   
  inflating: data/IRXZXAGR0009.csv   
  inflating: data/IRXZXBNK0006.csv   


## Read files

In [4]:
data_directory = 'data'
target_file = 'IRX6XTPI0009.csv'
excluded_feature_file = 'IRXZXOBM0003.csv'
dataframes = dict()
for item in os.listdir(data_directory):
    item_path = os.path.join(data_directory, item)
    df = pd.read_csv(item_path, encoding='utf-16')
    if not df.empty and not item == excluded_feature_file:
        dataframes[item] = df

## Preprocess

In [5]:
def preprocess_df(df):
    outdf = df.copy()
    for col in outdf.columns:
        if len(outdf[col].unique()) <= 2:
            outdf.drop(col, axis=1, inplace=True)
    return outdf

In [6]:
df_names = list(dataframes.keys())

merged_df = pd.DataFrame()
for n in df_names:
    df = dataframes[n]
    df = preprocess_df(df)
    if merged_df.empty:
        merged_df = df
    else:
        merged_df = pd.merge(merged_df, df, how='outer', on='<DTYYYYMMDD>', suffixes=('', f'_{n}'))
        
merged_df = merged_df[(merged_df['<DTYYYYMMDD>'] > 20210523) & (merged_df['<DTYYYYMMDD>'] < 20220522)]
merged_df.reset_index(inplace=True, drop=True)

In [7]:
label = []
close_today = 0
close_yesterday = 0

for row in merged_df.iterrows():
    idx, seri = row

    if idx == 0:
        close_yesterday = seri[f'<CLOSE>_{target_file}']
        continue

    close_today = seri[f'<CLOSE>_{target_file}']
    l = np.ceil((np.sign(close_today - close_yesterday) + 1) / 2)
    close_yesterday = close_today
    label.append(l)

merged_df['label'] = np.pad(label, (0,1), mode='constant',constant_values=(np.nan,))

In [8]:
tmp = merged_df.sort_values(by=['<DTYYYYMMDD>'])

tmp.reset_index(drop=True, inplace=True)
for icol, col in enumerate(tmp.columns):
    nan_indices = tmp[tmp[col].isnull()].index.tolist()
    if col == 'label':
        tmp.iloc[irow, icol] = 1
    else:
        for irow in nan_indices:
            tmp.iloc[irow, icol] = (tmp.iloc[irow+1, icol] + tmp.iloc[irow-1, icol])/2

tmp = tmp.drop(labels=[col for col in tmp if col.endswith(target_file)], axis=1)
tmp = tmp.drop(labels=['<DTYYYYMMDD>'], axis=1)
tmp.reset_index(drop=True, inplace=True)
tmp.drop(tmp.tail(1).index, inplace=True) # drop last n rows
xdf, ydf = tmp, tmp['label']

In [9]:
scaler = MinMaxScaler()
xdf = scaler.fit_transform(xdf.to_numpy())

In [10]:
step = 10
n_feature = xdf.shape[1]
n_data = len(ydf) - step
X, y = np.zeros((n_data, step, n_feature)), np.zeros((n_data,))
for i in range(n_data):
    X[i]  = xdf[i:i+step]
    y[i] = ydf[i+step]

## Splitting into sets

In [11]:
xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=46)
xtrain, xval, ytrain, yval = train_test_split(xtrain, ytrain, test_size=0.125, random_state=121)

# Question 2

### Baseline

In [12]:
baseline_train_acc = np.sum(ytrain[ytrain==1])/len(ytrain)
baseline_val_acc = np.sum(yval[yval==1])/len(yval)
baseline_test_acc = np.sum(ytest[ytest==1])/len(ytest)

print("Baseline Accuracy")
print(f'train {baseline_train_acc:.4f}')
print(f'val {baseline_val_acc:.4f}')
print(f'test {baseline_test_acc:.4f}')

Baseline Accuracy
train 0.5796
val 0.5652
test 0.5556


### LSTM & GRU
single layer

In [14]:
def rnn_model_creator(hunits, rnn_layer, optimizer):
    inputs = keras.Input((step, n_feature))
    output_layer = keras.layers.Dense(2, activation='softmax')

    out = inputs
    for hunit in hunits[:-1]:
        out = rnn_layer(hunit, return_sequences=True)(out)

    out = rnn_layer(hunits[-1])(out)
    preds = output_layer(out)

    model = keras.Model(inputs, preds)
    model.compile(optimizer=optimizer,
                    loss='sparse_categorical_crossentropy',
                    metrics=['acc'])
    return model

In [71]:
es_callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)

rnn_layer_repr = ['LSTM', 'GRU']
optimizer_repr = ['Adam', 'SGD']
for i ,rnn_layer in enumerate([keras.layers.LSTM, keras.layers.GRU]):
    for j, optimizer in enumerate([keras.optimizers.Adam, keras.optimizers.SGD]):
        for lr in [1e-2, 1e-3, 1e-4]:
            for hunit in [8, 32, 64, 256, 512]:
                tries = 5
                acc_train, acc_val, acc_test = 0,0,0
                for _ in range(tries):
                    model = rnn_model_creator(hunits=[hunit,], rnn_layer=keras.layers.LSTM, optimizer=optimizer(lr))
                    model.fit(xtrain, ytrain, validation_data=(xval, yval), epochs=300, batch_size=8,
                            callbacks=[es_callback], verbose=0)

                    result_train = model.evaluate(xtrain, ytrain, verbose=0)
                    result_val  = model.evaluate(xval, yval, verbose=0)
                    result_test = model.evaluate(xtest, ytest, verbose=0)

                    acc_train += result_train[1] /tries
                    acc_val += result_val[1] /tries
                    acc_test += result_test[1] /tries


                print('rnn_layer', rnn_layer_repr[i], end=' ')
                print('optimizer', optimizer_repr[j], end=' ')
                print('hunit', hunit, end=' ')
                print('lr', lr, end=' ')
                print(f'train acc {acc_train:.2f}', end=' ')
                print(f'val acc {acc_val:.2f}', end=' ')
                print(f'test acc {acc_test:.2f}', end=' ')
                print(f'baseline val {acc_val - baseline_val_acc:.2f}', end=' ')
                print(f'baseline test {acc_test - baseline_test_acc:.2f}', end=' ')
                print()

rnn_layer LSTM optimizer Adam hunit 8 lr 0.01 train acc 0.60 val acc 0.57 test acc 0.54 baseline val -0.00 baseline test -0.01 
rnn_layer LSTM optimizer Adam hunit 32 lr 0.01 train acc 0.63 val acc 0.55 test acc 0.54 baseline val -0.02 baseline test -0.02 
rnn_layer LSTM optimizer Adam hunit 64 lr 0.01 train acc 0.59 val acc 0.57 test acc 0.54 baseline val -0.00 baseline test -0.01 
rnn_layer LSTM optimizer Adam hunit 256 lr 0.01 train acc 0.60 val acc 0.56 test acc 0.54 baseline val -0.01 baseline test -0.01 
rnn_layer LSTM optimizer Adam hunit 512 lr 0.01 train acc 0.66 val acc 0.60 test acc 0.58 baseline val 0.03 baseline test 0.03 
rnn_layer LSTM optimizer Adam hunit 8 lr 0.001 train acc 0.60 val acc 0.57 test acc 0.56 baseline val -0.00 baseline test 0.00 
rnn_layer LSTM optimizer Adam hunit 32 lr 0.001 train acc 0.61 val acc 0.59 test acc 0.56 baseline val 0.03 baseline test 0.00 
rnn_layer LSTM optimizer Adam hunit 64 lr 0.001 train acc 0.62 val acc 0.57 test acc 0.58 baseline v

# Question 3

## Stacked LSTM & GRU

In [None]:
es_callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)

rnn_layer_repr = ['LSTM', 'GRU']
optimizer_repr = ['Adam', 'SGD']
for i ,rnn_layer in enumerate([keras.layers.LSTM]):
    for j, optimizer in enumerate([keras.optimizers.Adam, keras.optimizers.SGD]):
        for lr in [1e-2, 1e-3, 1e-4]:
            for repeatition in [2, 3, 4]:
                for hunit in [8, 32, 64, 256, 512]:
                    tries = 5
                    acc_train, acc_val, acc_test = 0,0,0
                    for _ in range(tries):
                        model = rnn_model_creator(hunits=[hunit]*repeatition, rnn_layer=keras.layers.LSTM, optimizer=optimizer(lr))
                        model.fit(xtrain, ytrain, validation_data=(xval, yval), epochs=300, batch_size=8,
                                callbacks=[es_callback], verbose=0)

                        result_train = model.evaluate(xtrain, ytrain, verbose=0)
                        result_val  = model.evaluate(xval, yval, verbose=0)
                        result_test = model.evaluate(xtest, ytest, verbose=0)

                        acc_train += result_train[1] /tries
                        acc_val += result_val[1] /tries
                        acc_test += result_test[1] /tries


                    print('rnn_layer', rnn_layer_repr[i], end=' ')
                    print('optimizer', optimizer_repr[j], end=' ')
                    print('hunit', hunit, end=' ')
                    print('repeatition', repeatition, end=' ')
                    print('lr', lr, end=' ')
                    print(f'train acc {acc_train:.2f}', end=' ')
                    print(f'val acc {acc_val:.2f}', end=' ')
                    print(f'test acc {acc_test:.2f}', end=' ')
                    print(f'baseline val {acc_val - baseline_val_acc:.2f}', end=' ')
                    print(f'baseline test {acc_test - baseline_test_acc:.2f}', end=' ')
                    print()
                print('------------ repeatition ---------------')
            print('------------ learning rate ---------------')
        print('------------ Optimizer ---------------')
    print('------------ Cell ---------------')

rnn_layer LSTM optimizer Adam hunit 8 repeatition 2 lr 0.01 train acc 0.66 val acc 0.57 test acc 0.55 baseline val -0.00 baseline test -0.00 
rnn_layer LSTM optimizer Adam hunit 32 repeatition 2 lr 0.01 train acc 0.65 val acc 0.59 test acc 0.53 baseline val 0.03 baseline test -0.02 
rnn_layer LSTM optimizer Adam hunit 64 repeatition 2 lr 0.01 train acc 0.66 val acc 0.63 test acc 0.56 baseline val 0.06 baseline test 0.01 
rnn_layer LSTM optimizer Adam hunit 256 repeatition 2 lr 0.01 train acc 0.67 val acc 0.63 test acc 0.58 baseline val 0.07 baseline test 0.03 
rnn_layer LSTM optimizer Adam hunit 512 repeatition 2 lr 0.01 train acc 0.58 val acc 0.57 test acc 0.56 baseline val -0.00 baseline test 0.00 
------------ repeatition ---------------
rnn_layer LSTM optimizer Adam hunit 8 repeatition 3 lr 0.01 train acc 0.64 val acc 0.64 test acc 0.58 baseline val 0.08 baseline test 0.02 
rnn_layer LSTM optimizer Adam hunit 32 repeatition 3 lr 0.01 train acc 0.64 val acc 0.58 test acc 0.57 baseli

In [None]:
es_callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)

rnn_layer_repr = ['GRU']
optimizer_repr = ['Adam', 'SGD']
for i ,rnn_layer in enumerate([keras.layers.GRU]):
    for j, optimizer in enumerate([keras.optimizers.Adam, keras.optimizers.SGD]):
        for lr in [1e-2, 1e-3, 1e-4]:
            for repeatition in [2, 3, 4]:
                for hunit in [8, 32, 64, 256, 512]:
                    tries = 5
                    acc_train, acc_val, acc_test = 0,0,0
                    for _ in range(tries):
                        model = rnn_model_creator(hunits=[hunit]*repeatition, rnn_layer=keras.layers.LSTM, optimizer=optimizer(lr))
                        model.fit(xtrain, ytrain, validation_data=(xval, yval), epochs=300, batch_size=8,
                                callbacks=[es_callback], verbose=0)

                        result_train = model.evaluate(xtrain, ytrain, verbose=0)
                        result_val  = model.evaluate(xval, yval, verbose=0)
                        result_test = model.evaluate(xtest, ytest, verbose=0)

                        acc_train += result_train[1] /tries
                        acc_val += result_val[1] /tries
                        acc_test += result_test[1] /tries


                    print('rnn_layer', rnn_layer_repr[i], end=' ')
                    print('optimizer', optimizer_repr[j], end=' ')
                    print('hunit', hunit, end=' ')
                    print('repeatition', repeatition, end=' ')
                    print('lr', lr, end=' ')
                    print(f'train acc {acc_train:.2f}', end=' ')
                    print(f'val acc {acc_val:.2f}', end=' ')
                    print(f'test acc {acc_test:.2f}', end=' ')
                    print(f'baseline val {acc_val - baseline_val_acc:.2f}', end=' ')
                    print(f'baseline test {acc_test - baseline_test_acc:.2f}', end=' ')
                    print()
                print('------------ repeatition ---------------')
            print('------------ learning rate ---------------')
        print('------------ Optimizer ---------------')
    print('------------ Cell ---------------')

rnn_layer GRU optimizer Adam hunit 8 repeatition 2 lr 0.01 train acc 0.63 val acc 0.57 test acc 0.56 baseline val 0.01 baseline test 0.00 
rnn_layer GRU optimizer Adam hunit 32 repeatition 2 lr 0.01 train acc 0.65 val acc 0.57 test acc 0.54 baseline val 0.01 baseline test -0.01 
rnn_layer GRU optimizer Adam hunit 64 repeatition 2 lr 0.01 train acc 0.64 val acc 0.57 test acc 0.55 baseline val -0.00 baseline test -0.00 
rnn_layer GRU optimizer Adam hunit 256 repeatition 2 lr 0.01 train acc 0.60 val acc 0.60 test acc 0.57 baseline val 0.03 baseline test 0.02 
rnn_layer GRU optimizer Adam hunit 512 repeatition 2 lr 0.01 train acc 0.58 val acc 0.57 test acc 0.56 baseline val -0.00 baseline test 0.00 
------------ repeatition ---------------
rnn_layer GRU optimizer Adam hunit 8 repeatition 3 lr 0.01 train acc 0.65 val acc 0.62 test acc 0.60 baseline val 0.05 baseline test 0.04 
rnn_layer GRU optimizer Adam hunit 32 repeatition 3 lr 0.01 train acc 0.62 val acc 0.60 test acc 0.58 baseline val 