## Importing libraries

In [20]:
from pandas import read_csv
import numpy as np
from keras import Model
from keras.layers import Layer
import keras.backend as K
from keras.layers import Input, Dense, SimpleRNN,LSTM,Bidirectional
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.metrics import mean_squared_error

## Preparing the Dataset
The following function generates a sequence of n Fibonacci numbers (not counting the starting two values). If scale_data is set to True, then it would also use the MinMaxScaler from scikit-learn to scale the values between 0 and 1. Let’s see its output for n=10.

In [2]:
# Prepare data
def get_fib_seq(n, scale_data=True):
    # Get the Fibonacci sequence
    seq = np.zeros(n)
    fib_n1 = 0.0
    fib_n = 1.0 
    for i in range(n):
            seq[i] = fib_n1 + fib_n
            fib_n1 = fib_n
            fib_n = seq[i] 
    scaler = []
    if scale_data:
        scaler = MinMaxScaler(feature_range=(0, 1))
        seq = np.reshape(seq, (n, 1))
        seq = scaler.fit_transform(seq).flatten()        
    return seq, scaler

In [3]:
fib_seq = get_fib_seq(10, False)[0]
print(fib_seq)

[ 1.  2.  3.  5.  8. 13. 21. 34. 55. 89.]


- Function get_fib_XY() that reformats the sequence into training examples and target values to be used by the Keras input layer. 

- Time_steps is number of columns.

In [4]:
def get_fib_XY(total_fib_numbers, time_steps, train_percent, scale_data=True):
    dat, scaler = get_fib_seq(total_fib_numbers, scale_data)    
    Y_ind = np.arange(time_steps, len(dat), 1)
    Y = dat[Y_ind]
    rows_x = len(Y)
    X = dat[0:rows_x]
    for i in range(time_steps-1):
        temp = dat[i+1:rows_x+i+1]
        X = np.column_stack((X, temp))
    # random permutation with fixed seed   
    rand = np.random.RandomState(seed=13)
    idx = rand.permutation(rows_x)
    split = int(train_percent*rows_x)
    train_ind = idx[0:split]
    test_ind = idx[split:]
    trainX = X[train_ind]
    trainY = Y[train_ind]
    testX = X[test_ind]
    testY = Y[test_ind]
    trainX = np.reshape(trainX, (len(trainX), time_steps, 1))    
    testX = np.reshape(testX, (len(testX), time_steps, 1))
    return trainX, trainY, testX, testY, scaler

In [5]:
# Set up parameters
time_steps = 20
hidden_units = 2
epochs = 30

## Modeling

In [None]:
import pandas as pd
performance = pd.DataFrame({"model":[] ,'Train_set_MSE':[],'Test_set_MSE':[]})

### Simple RNN

In [6]:
# Create a traditional RNN network
def create_simple_RNN(hidden_units, dense_units, input_shape, activation):
    model = Sequential()
    model.add(SimpleRNN(hidden_units, input_shape=input_shape, activation=activation[0]))
    model.add(Dense(units=dense_units, activation=activation[1]))
    model.compile(loss='mse', optimizer='adam')
    return model

model_RNN = create_simple_RNN(hidden_units=hidden_units, dense_units=1, input_shape=(time_steps,1), 
                   activation=['tanh', 'tanh'])





In [7]:
# Generate the dataset for the network
trainX, trainY, testX, testY, scaler  = get_fib_XY(1200, time_steps, 0.7)
# Train the network
model_RNN.fit(trainX, trainY, epochs=epochs, batch_size=1, verbose=2)


# Evalute model
train_mse = model_RNN.evaluate(trainX, trainY)
test_mse = model_RNN.evaluate(testX, testY)

# Print error
print(f"Train set MSE = {train_mse:.{7}f}")
print(f"Test set MSE = {test_mse:.{7}f}")

Epoch 1/30

826/826 - 3s - loss: 0.0031 - 3s/epoch - 4ms/step
Epoch 2/30
826/826 - 2s - loss: 0.0028 - 2s/epoch - 3ms/step
Epoch 3/30
826/826 - 2s - loss: 0.0026 - 2s/epoch - 3ms/step
Epoch 4/30
826/826 - 2s - loss: 0.0024 - 2s/epoch - 3ms/step
Epoch 5/30
826/826 - 2s - loss: 0.0022 - 2s/epoch - 3ms/step
Epoch 6/30
826/826 - 2s - loss: 0.0020 - 2s/epoch - 3ms/step
Epoch 7/30
826/826 - 2s - loss: 0.0018 - 2s/epoch - 3ms/step
Epoch 8/30
826/826 - 2s - loss: 0.0016 - 2s/epoch - 3ms/step
Epoch 9/30
826/826 - 2s - loss: 0.0015 - 2s/epoch - 3ms/step
Epoch 10/30
826/826 - 2s - loss: 0.0013 - 2s/epoch - 3ms/step
Epoch 11/30
826/826 - 2s - loss: 0.0012 - 2s/epoch - 3ms/step
Epoch 12/30
826/826 - 2s - loss: 0.0011 - 2s/epoch - 3ms/step
Epoch 13/30
826/826 - 2s - loss: 0.0010 - 2s/epoch - 3ms/step
Epoch 14/30
826/826 - 2s - loss: 9.1181e-04 - 2s/epoch - 3ms/step
Epoch 15/30
826/826 - 2s - loss: 8.0716e-04 - 2s/epoch - 3ms/step
Epoch 16/30
826/826 - 2s - loss: 7.2742e-04 - 2s/epoch - 3ms/step
Epoc

In [9]:
performance = pd.concat([performance, pd.DataFrame({'model':'Simple RNN','Train_set_MSE':train_mse,'Test_set_MSE':test_mse},index=[0])])
performance

Unnamed: 0,model,Train_set_MSE,Test_set_MSE
0,Simple RNN,6.5e-05,2.9e-05


### Attention class 

In [10]:
# Add attention layer to the deep learning network
class attention(Layer):
    def __init__(self,**kwargs):
        super(attention,self).__init__(**kwargs)

    def build(self,input_shape): #weights are automatically tuned
        self.W=self.add_weight(name='attention_weight', shape=(input_shape[-1],1), 
                               initializer='random_normal', trainable=True)
        self.b=self.add_weight(name='attention_bias', shape=(input_shape[1],1), 
                               initializer='zeros', trainable=True)        
        super(attention, self).build(input_shape)

    def call(self,x): # forward pass
        # Alignment scores. Pass them through tanh function
        e = K.tanh(K.dot(x,self.W)+self.b)
        # Remove dimension of size 1
        e = K.squeeze(e, axis=-1)   
        # Compute the weights
        alpha = K.softmax(e) # 0 to 1 - normalization
        # Reshape to tensorFlow format
        alpha = K.expand_dims(alpha, axis=-1)
        # Compute the context vector
        context = x * alpha
        context = K.sum(context, axis=1)
        return context

### Simple RNN with Attention

In [11]:
def create_RNN_with_attention(hidden_units, dense_units, input_shape, activation):
    x=Input(shape=input_shape)
    RNN_layer = SimpleRNN(hidden_units, return_sequences=True, activation=activation)(x)
    attention_layer = attention()(RNN_layer)
    outputs=Dense(dense_units, trainable=True, activation=activation)(attention_layer)
    model=Model(x,outputs)
    model.compile(loss='mse', optimizer='adam')    
    return model    

# Create the model with attention, train and evaluate
model_attention = create_RNN_with_attention(hidden_units=hidden_units, dense_units=1, 
                                  input_shape=(time_steps,1), activation='tanh')
model_attention.summary()    


model_attention.fit(trainX, trainY, epochs=epochs, batch_size=1, verbose=2)

# Evalute model
train_mse = model_attention.evaluate(trainX, trainY)
test_mse = model_attention.evaluate(testX, testY)

# Print error
print(f"Train set MSE = {train_mse:.{7}f}")
print(f"Test set MSE = {test_mse:.{7}f}")

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 20, 1)]           0         
                                                                 
 simple_rnn_1 (SimpleRNN)    (None, 20, 2)             8         
                                                                 
 attention (attention)       (None, 2)                 22        
                                                                 
 dense_1 (Dense)             (None, 1)                 3         
                                                                 
Total params: 33 (132.00 Byte)
Trainable params: 33 (132.00 Byte)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/30
826/826 - 4s - loss: 0.0017 - 4s/epoch - 5ms/step
Epoch 2/30
826/826 - 2s - loss: 0.0017 - 2s/epoch - 3ms/step
Epoch 3/30
826/826 - 2s - loss: 0.0

In [12]:
performance = pd.concat([performance, pd.DataFrame({'model':'Simple RNN with Attention','Train_set_MSE':train_mse,'Test_set_MSE':test_mse},index=[0])])
performance

Unnamed: 0,model,Train_set_MSE,Test_set_MSE
0,Simple RNN,6.5e-05,2.9e-05
0,Simple RNN with Attention,0.000929,0.000813


### LSTM

In [13]:
def create_LSTM(hidden_units, dense_units, input_shape, activation):
    model = Sequential()
    model.add(LSTM(hidden_units, input_shape=input_shape, activation=activation[0]))
    model.add(Dense(units=dense_units, activation=activation[1]))
    model.compile(loss='mse', optimizer='adam')
    return model

model_LSTM = create_LSTM(hidden_units=hidden_units, dense_units=1, input_shape=(time_steps,1), 
                   activation=['tanh', 'tanh'])

In [14]:
model_LSTM.summary()    


model_LSTM.fit(trainX, trainY, epochs=epochs, batch_size=1, verbose=2)

# Evalute model
train_mse = model_LSTM.evaluate(trainX, trainY)
test_mse = model_LSTM.evaluate(testX, testY)

# Print error
print(f"Train set MSE = {train_mse:.{7}f}")
print(f"Test set MSE = {test_mse:.{7}f}")

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 2)                 32        
                                                                 
 dense_2 (Dense)             (None, 1)                 3         
                                                                 
Total params: 35 (140.00 Byte)
Trainable params: 35 (140.00 Byte)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/30
826/826 - 7s - loss: 0.0014 - 7s/epoch - 8ms/step
Epoch 2/30
826/826 - 3s - loss: 0.0014 - 3s/epoch - 4ms/step
Epoch 3/30
826/826 - 3s - loss: 0.0013 - 3s/epoch - 4ms/step
Epoch 4/30
826/826 - 3s - loss: 0.0013 - 3s/epoch - 4ms/step
Epoch 5/30
826/826 - 3s - loss: 0.0012 - 3s/epoch - 4ms/step
Epoch 6/30
826/826 - 3s - loss: 0.0012 - 3s/epoch - 4ms/step
Epoch 7/30
826/826 - 3s - loss: 0.0011 - 3s/epoc

In [15]:
performance = pd.concat([performance, pd.DataFrame({'model':'LSTM','Train_set_MSE':train_mse,'Test_set_MSE':test_mse},index=[0])])
performance

Unnamed: 0,model,Train_set_MSE,Test_set_MSE
0,Simple RNN,6.5e-05,2.9e-05
0,Simple RNN with Attention,0.000929,0.000813
0,LSTM,3.1e-05,1e-05


### LSTM with Attention

In [16]:
def create_LSTM_with_attention(hidden_units, dense_units, input_shape, activation):
    x=Input(shape=input_shape)
    LSTM_layer = LSTM(hidden_units, return_sequences=True, activation=activation)(x)
    attention_layer = attention()(LSTM_layer)
    outputs=Dense(dense_units, trainable=True, activation=activation)(attention_layer)
    model=Model(x,outputs)
    model.compile(loss='mse', optimizer='adam')    
    return model    

# Create the model with attention, train and evaluate
model_LSTM_with_attention = create_LSTM_with_attention(hidden_units=hidden_units, dense_units=1, 
                                  input_shape=(time_steps,1), activation='tanh')
model_LSTM_with_attention.summary()    


Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 20, 1)]           0         
                                                                 
 lstm_1 (LSTM)               (None, 20, 2)             32        
                                                                 
 attention_1 (attention)     (None, 2)                 22        
                                                                 
 dense_3 (Dense)             (None, 1)                 3         
                                                                 
Total params: 57 (228.00 Byte)
Trainable params: 57 (228.00 Byte)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [17]:
model_LSTM_with_attention.fit(trainX, trainY, epochs=epochs, batch_size=1, verbose=2)

# Evalute model
train_mse = model_LSTM_with_attention.evaluate(trainX, trainY)
test_mse = model_LSTM_with_attention.evaluate(testX, testY)

# Print error
print(f"Train set MSE = {train_mse:.{7}f}")
print(f"Test set MSE = {test_mse:.{7}f}")

Epoch 1/30
826/826 - 7s - loss: 0.0014 - 7s/epoch - 8ms/step
Epoch 2/30
826/826 - 3s - loss: 0.0014 - 3s/epoch - 4ms/step
Epoch 3/30
826/826 - 3s - loss: 0.0014 - 3s/epoch - 4ms/step
Epoch 4/30
826/826 - 3s - loss: 0.0014 - 3s/epoch - 4ms/step
Epoch 5/30
826/826 - 3s - loss: 0.0014 - 3s/epoch - 4ms/step
Epoch 6/30
826/826 - 3s - loss: 0.0014 - 3s/epoch - 4ms/step
Epoch 7/30
826/826 - 3s - loss: 0.0014 - 3s/epoch - 4ms/step
Epoch 8/30
826/826 - 3s - loss: 0.0014 - 3s/epoch - 4ms/step
Epoch 9/30
826/826 - 3s - loss: 0.0014 - 3s/epoch - 4ms/step
Epoch 10/30
826/826 - 3s - loss: 0.0014 - 3s/epoch - 4ms/step
Epoch 11/30
826/826 - 3s - loss: 0.0014 - 3s/epoch - 4ms/step
Epoch 12/30
826/826 - 3s - loss: 0.0013 - 3s/epoch - 4ms/step
Epoch 13/30
826/826 - 4s - loss: 0.0013 - 4s/epoch - 4ms/step
Epoch 14/30
826/826 - 3s - loss: 0.0013 - 3s/epoch - 4ms/step
Epoch 15/30
826/826 - 3s - loss: 0.0013 - 3s/epoch - 4ms/step
Epoch 16/30
826/826 - 3s - loss: 0.0012 - 3s/epoch - 4ms/step
Epoch 17/30
826/8

In [18]:
performance = pd.concat([performance, pd.DataFrame({'model':'LSTM_with_attention','Train_set_MSE':train_mse,'Test_set_MSE':test_mse},index=[0])])
performance

Unnamed: 0,model,Train_set_MSE,Test_set_MSE
0,Simple RNN,6.5e-05,2.9e-05
0,Simple RNN with Attention,0.000929,0.000813
0,LSTM,3.1e-05,1e-05
0,LSTM_with_attention,0.000387,0.000308


### Bi-Directional LSTM

In [21]:
def create_BiLSTM(hidden_units, dense_units, input_shape, activation):
    x=Input(shape=input_shape)
    BiLSTM_layer = Bidirectional(LSTM(hidden_units, return_sequences=True, activation=activation))(x)
    #attention_layer = attention()(LSTM_layer)
    outputs=Dense(dense_units, trainable=True, activation=activation)(BiLSTM_layer)
    model=Model(x,outputs)
    model.compile(loss='mse', optimizer='adam')    
    return model    

# Create the model , train and evaluate
model_BiLSTM = create_BiLSTM(hidden_units=hidden_units, dense_units=1, 
                                  input_shape=(time_steps,1), activation='tanh')
model_BiLSTM.summary()    


Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, 20, 1)]           0         
                                                                 
 bidirectional (Bidirection  (None, 20, 4)             64        
 al)                                                             
                                                                 
 dense_4 (Dense)             (None, 20, 1)             5         
                                                                 
Total params: 69 (276.00 Byte)
Trainable params: 69 (276.00 Byte)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [22]:
model_BiLSTM.fit(trainX, trainY, epochs=epochs, batch_size=1, verbose=2)

# Evalute model
train_mse = model_BiLSTM.evaluate(trainX, trainY)
test_mse = model_BiLSTM.evaluate(testX, testY)

# Print error
print(f"Train set MSE = {train_mse:.{7}f}")
print(f"Test set MSE = {test_mse:.{7}f}")

Epoch 1/30
826/826 - 10s - loss: 0.0014 - 10s/epoch - 12ms/step
Epoch 2/30
826/826 - 4s - loss: 0.0014 - 4s/epoch - 5ms/step
Epoch 3/30
826/826 - 3s - loss: 0.0014 - 3s/epoch - 4ms/step
Epoch 4/30
826/826 - 4s - loss: 0.0014 - 4s/epoch - 4ms/step
Epoch 5/30
826/826 - 3s - loss: 0.0014 - 3s/epoch - 4ms/step
Epoch 6/30
826/826 - 3s - loss: 0.0014 - 3s/epoch - 4ms/step
Epoch 7/30
826/826 - 3s - loss: 0.0013 - 3s/epoch - 4ms/step
Epoch 8/30
826/826 - 4s - loss: 0.0013 - 4s/epoch - 5ms/step
Epoch 9/30
826/826 - 4s - loss: 0.0013 - 4s/epoch - 5ms/step
Epoch 10/30
826/826 - 4s - loss: 0.0013 - 4s/epoch - 5ms/step
Epoch 11/30
826/826 - 4s - loss: 0.0013 - 4s/epoch - 5ms/step
Epoch 12/30
826/826 - 4s - loss: 0.0012 - 4s/epoch - 4ms/step
Epoch 13/30
826/826 - 4s - loss: 0.0012 - 4s/epoch - 4ms/step
Epoch 14/30
826/826 - 4s - loss: 0.0011 - 4s/epoch - 5ms/step
Epoch 15/30
826/826 - 4s - loss: 0.0010 - 4s/epoch - 5ms/step
Epoch 16/30
826/826 - 3s - loss: 8.3156e-04 - 3s/epoch - 4ms/step
Epoch 17/3

In [23]:
performance = pd.concat([performance, pd.DataFrame({'model':'Bi-LSTM','Train_set_MSE':train_mse,'Test_set_MSE':test_mse},index=[0])])
performance

Unnamed: 0,model,Train_set_MSE,Test_set_MSE
0,Simple RNN,6.5e-05,2.9e-05
0,Simple RNN with Attention,0.000929,0.000813
0,LSTM,3.1e-05,1e-05
0,LSTM_with_attention,0.000387,0.000308
0,Bi-LSTM,0.002267,0.002346


### Bi-Directional With Attention

In [24]:
def create_BiLSTM_with_attention(hidden_units, dense_units, input_shape, activation):
    x=Input(shape=input_shape)
    BiLSTM_layer = Bidirectional(LSTM(hidden_units, return_sequences=True, activation=activation))(x)
    attention_layer = attention()(BiLSTM_layer)
    outputs=Dense(dense_units, trainable=True, activation=activation)(BiLSTM_layer)
    model=Model(x,outputs)
    model.compile(loss='mse', optimizer='adam')    
    return model    

# Create the model with attention, train and evaluate
model_BiLSTM_with_attention = create_BiLSTM_with_attention(hidden_units=hidden_units, dense_units=1, 
                                  input_shape=(time_steps,1), activation='tanh')
model_BiLSTM_with_attention.summary()    


Model: "model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_5 (InputLayer)        [(None, 20, 1)]           0         
                                                                 
 bidirectional_1 (Bidirecti  (None, 20, 4)             64        
 onal)                                                           
                                                                 
 dense_5 (Dense)             (None, 20, 1)             5         
                                                                 
Total params: 69 (276.00 Byte)
Trainable params: 69 (276.00 Byte)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [25]:
model_BiLSTM_with_attention.fit(trainX, trainY, epochs=epochs, batch_size=1, verbose=2)

# Evalute model
train_mse = model_BiLSTM_with_attention.evaluate(trainX, trainY)
test_mse = model_BiLSTM_with_attention.evaluate(testX, testY)

# Print error
print(f"Train set MSE = {train_mse:.{7}f}")
print(f"Test set MSE = {test_mse:.{7}f}")

Epoch 1/30
826/826 - 10s - loss: 0.0016 - 10s/epoch - 12ms/step
Epoch 2/30
826/826 - 3s - loss: 0.0015 - 3s/epoch - 4ms/step
Epoch 3/30
826/826 - 3s - loss: 0.0015 - 3s/epoch - 4ms/step
Epoch 4/30
826/826 - 4s - loss: 0.0015 - 4s/epoch - 5ms/step
Epoch 5/30
826/826 - 3s - loss: 0.0015 - 3s/epoch - 4ms/step
Epoch 6/30
826/826 - 3s - loss: 0.0015 - 3s/epoch - 4ms/step
Epoch 7/30
826/826 - 4s - loss: 0.0014 - 4s/epoch - 4ms/step
Epoch 8/30
826/826 - 4s - loss: 0.0015 - 4s/epoch - 5ms/step
Epoch 9/30
826/826 - 4s - loss: 0.0014 - 4s/epoch - 4ms/step
Epoch 10/30
826/826 - 4s - loss: 0.0014 - 4s/epoch - 5ms/step
Epoch 11/30
826/826 - 4s - loss: 0.0014 - 4s/epoch - 5ms/step
Epoch 12/30
826/826 - 4s - loss: 0.0014 - 4s/epoch - 5ms/step
Epoch 13/30
826/826 - 3s - loss: 0.0014 - 3s/epoch - 4ms/step
Epoch 14/30
826/826 - 3s - loss: 0.0014 - 3s/epoch - 4ms/step
Epoch 15/30
826/826 - 4s - loss: 0.0014 - 4s/epoch - 5ms/step
Epoch 16/30
826/826 - 4s - loss: 0.0014 - 4s/epoch - 4ms/step
Epoch 17/30
82

In [26]:
performance = pd.concat([performance, pd.DataFrame({'model':'Bi-LSTM_with_attention','Train_set_MSE':train_mse,'Test_set_MSE':test_mse},index=[0])])
performance

Unnamed: 0,model,Train_set_MSE,Test_set_MSE
0,Simple RNN,6.5e-05,2.9e-05
0,Simple RNN with Attention,0.000929,0.000813
0,LSTM,3.1e-05,1e-05
0,LSTM_with_attention,0.000387,0.000308
0,Bi-LSTM,0.002267,0.002346
0,Bi-LSTM_with_attention,0.001516,0.001364


In [27]:
performance.sort_values(by=['Train_set_MSE','Test_set_MSE'], ascending=True)

Unnamed: 0,model,Train_set_MSE,Test_set_MSE
0,LSTM,3.1e-05,1e-05
0,Simple RNN,6.5e-05,2.9e-05
0,LSTM_with_attention,0.000387,0.000308
0,Simple RNN with Attention,0.000929,0.000813
0,Bi-LSTM_with_attention,0.001516,0.001364
0,Bi-LSTM,0.002267,0.002346


- After comparing the MSE scores of all the Models LSTM model is the one with least MSE scores so it is better to choose this model. 
- Due to the complexity of the attention models secure higher MSE scores.
- There isnt much difference between train and test accuracies so there is no overfitting in the models.