# Assignment 7 - RNNs and LSTMs

In [1]:
import sys
import os
import json
import pandas
import numpy
import optparse


from keras.callbacks import TensorBoard
from keras.models import Sequential, load_model
from keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.layers import Embedding
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from collections import OrderedDict
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from keras.layers import SimpleRNN

## Part A: Recurrent Neural Network & Classification
### Data Preprocessing 

In [2]:
dataframe = pandas.read_csv("/Users/parthbansal/Downloads/dev-access.csv", engine='python', quotechar='|', header=None)

In [3]:
dataset = dataframe.values

In [4]:
print(dataset.shape)

(26773, 2)


In [5]:
X = dataset[:,0]
Y = dataset[:,1]

In [6]:
for index, item in enumerate(X):
    reqJson = json.loads(item, object_pairs_hook=OrderedDict)
    del reqJson['timestamp']
    del reqJson['headers']
    del reqJson['source']
    del reqJson['route']
    del reqJson['responsePayload']
    X[index] = json.dumps(reqJson, separators=(',', ':'))

In [7]:
tokenizer = Tokenizer(filters='\t\n', char_level=True)
tokenizer.fit_on_texts(X)

num_words = len(tokenizer.word_index)+1
X = tokenizer.texts_to_sequences(X)

In [10]:
max_log_length = 1024
X_processed = pad_sequences(X, maxlen=max_log_length)

In [11]:
X_train, X_test, Y_train, Y_test = train_test_split(X_processed, Y, test_size=0.25, random_state=42)

### Model 1 - RNN

In [12]:
model = Sequential()

In [13]:
model.add(Embedding(input_dim=num_words, output_dim=32, input_length=max_log_length))

In [14]:
model.add(SimpleRNN(units=32, activation='relu'))

In [15]:
from keras.layers import Dense

model.add(Dense(units=1, activation='sigmoid'))

In [16]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [17]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 1024, 32)          2016      
                                                                 
 simple_rnn (SimpleRNN)      (None, 32)                2080      
                                                                 
 dense (Dense)               (None, 1)                 33        
                                                                 
Total params: 4,129
Trainable params: 4,129
Non-trainable params: 0
_________________________________________________________________


In [18]:
X_processed = X_processed.astype('float32')
Y = Y.astype('float32')

In [19]:
history = model.fit(X_processed, Y, validation_split=0.25, epochs=3, batch_size=128)

Epoch 1/3


2023-05-08 11:26:37.264164: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 2/3
Epoch 3/3


In [20]:
loss, accuracy = model.evaluate(X_processed, Y, batch_size=128)
print(f"Test loss: {loss}")
print(f"Test accuracy: {accuracy}")

Test loss: 0.05769364908337593
Test accuracy: 0.9853583574295044


### Model 2 - LSTM + Dropout Layers

In [21]:
model2 = Sequential()
model2.add(Embedding(input_dim=num_words, output_dim=32, input_length=max_log_length))
model2.add(LSTM(units=64, recurrent_dropout=0.5))
model2.add(Dropout(0.5))
model2.add(Dense(units=1, activation='sigmoid'))

In [22]:
model2.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [23]:
print(model2.summary())

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 1024, 32)          2016      
                                                                 
 lstm (LSTM)                 (None, 64)                24832     
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 65        
                                                                 
Total params: 26,913
Trainable params: 26,913
Non-trainable params: 0
_________________________________________________________________
None


In [24]:
history2 = model2.fit(X_processed, Y, validation_split=0.25, epochs=3, batch_size=128)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [25]:
test_loss, test_acc = model2.evaluate(X_processed, Y, batch_size=128)
print(f'Test loss: {test_loss}, Test accuracy: {test_acc}')

Test loss: 0.08070356398820877, Test accuracy: 0.9808015823364258


### Recurrent Neural Net Model 3

In [26]:
model = Sequential()

model.add(Embedding(input_dim=num_words, output_dim=64, input_length=max_log_length))

model.add(LSTM(units=64, dropout=0.3, recurrent_dropout=0.3, return_sequences=True))
model.add(LSTM(units=32, dropout=0.3, recurrent_dropout=0.3, return_sequences=True))
model.add(LSTM(units=16, dropout=0.3, recurrent_dropout=0.3))

model.add(Dense(units=1, activation='sigmoid'))

model.add(Dropout(rate=0.3))

In [27]:
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

In [28]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 1024, 64)          4032      
                                                                 
 lstm_1 (LSTM)               (None, 1024, 64)          33024     
                                                                 
 lstm_2 (LSTM)               (None, 1024, 32)          12416     
                                                                 
 lstm_3 (LSTM)               (None, 16)                3136      
                                                                 
 dense_2 (Dense)             (None, 1)                 17        
                                                                 
 dropout_1 (Dropout)         (None, 1)                 0         
                                                                 
Total params: 52,625
Trainable params: 52,625
Non-trai

In [29]:
history = model.fit(X_processed, Y, validation_split=0.25, epochs=3, batch_size=128)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [31]:
loss, accuracy = model.evaluate(X_processed, Y, batch_size=128)
print('Test Loss:', loss)
print('Test Accuracy:', accuracy)

Test Loss: 0.21218332648277283
Test Accuracy: 0.9821462035179138


### Conceptual Questions 


**Explain the difference between the relu activation function and the sigmoid activation function.**
- The relu (rectified linear unit) activation function outputs 0 for any input less than 0 and the input value for any input greater than or equal to 0, while the sigmoid activation function maps any input to a value between 0 and 1, with outputs closer to 0 indicating negative inputs and outputs closer to 1 indicating positive inputs.

**Describe what one epoch actually is (epoch was a parameter used in the .fit() method).**
- One epoch is a single pass through the entire training dataset during training. In other words, the model is shown every training example once during an epoch, and the model weights are updated based on the errors generated by the predictions.

**Explain how dropout works (you can look at the keras code and/or documentation) for (a) training, and (b) test data sets.**
- Dropout is a regularization technique for neural networks that randomly drops out (sets to zero) a proportion of the neuron outputs during training. During training, dropout works by randomly dropping out neurons with a certain probability so that other neurons have to take over the representation. During testing, dropout is not applied and the model uses all the neurons. Dropout can help to prevent overfitting.

**Explain why problems such as this homework assignment are better modeled with RNNs than CNNs. What type of problem will CNNs outperform RNNs on?**
- RNNs are better suited for modeling sequential data, such as natural language processing or time-series data, where the order of input data is important. CNNs are better suited for problems that involve spatial relationships between inputs, such as image or audio classification.

**Explain what RNN problem is solved using LSTM and briefly describe how.**
- The vanishing gradient problem can occur when training RNNs with traditional gradient descent, where the gradients become very small and the model stops learning. LSTM (Long Short-Term Memory) is a type of RNN that solves this problem by using a gating mechanism to control the flow of information through the network. LSTMs can selectively forget or remember previous inputs, allowing them to learn long-term dependencies in the input sequence.

## Part B: Time Series with LSTMs

In [1]:
import pandas as pd
import numpy as np
from typing import Tuple


def create_data_for_NN(
    data: pd.DataFrame, Y_var: str, lag: int, test_ratio: float
) -> Tuple[np.array, np.array, np.array, np.array]:
    """Function to return lagged time series data after train-test split

    Args:
        data (pd.DataFrame): Raw time series data frame
        Y_var (str): String with the name of y variable
        lag (int): number of lagged records to consider
        test_ratio (float): ratio of data to consider for test set

    Returns:
        Tuple[np.array, np.array, np.array, np.array]: Lagged and split numpy arrays
    """
    y = data[Y_var].tolist()

    X, Y = [], []

    if len(y) - lag <= 0:
        X.append(y)
    else:
        for i in range(len(y) - lag):
            Y.append(y[i + lag])
            X.append(y[i : (i + lag)])

    X, Y = np.array(X), np.array(Y)

    # Reshaping the X array to an LSTM input shape
    X = np.reshape(X, (X.shape[0], X.shape[1], 1))

    # Creating training and test sets
    X_train = X
    X_test = []

    Y_train = Y
    Y_test = []

    if test_ratio > 0:
        index = round(len(X) * test_ratio)
        X_train = X[: (len(X) - index)]
        X_test = X[-index:]

        Y_train = Y[: (len(X) - index)]
        Y_test = Y[-index:]

    return X_train, X_test, Y_train, Y_test


lag = 3
test_ratio = 0.15

data = pd.read_csv('/Users/parthbansal/Downloads/DAYTON_hourly-2.csv', parse_dates=['Datetime'])
data.Datetime = pd.to_datetime(data.Datetime)
data.sort_values(by="Datetime", inplace=True)

X_train, X_test, Y_train, Y_test = create_data_for_NN(
    data, data.columns[-1], lag, test_ratio)

In [2]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
import tensorflow as tf

model1 = tf.keras.models.Sequential([
    tf.keras.layers.LSTM(units=64, input_shape=(3, 1)),
    tf.keras.layers.Dense(units=1)
])
model1.compile(optimizer="adam", loss="mean_squared_error")
history1 = model1.fit(X_train, Y_train, epochs=20, validation_split=0.2)

Epoch 1/20


2023-05-08 12:27:15.941507: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [4]:
X_train, X_test, Y_train, Y_test = create_data_for_NN(
    data, data.columns[-1], lag=24, test_ratio=test_ratio)

model2 = tf.keras.models.Sequential([
    tf.keras.layers.LSTM(units=64, input_shape=(24, 1)),
    tf.keras.layers.Dense(units=1)
])
model2.compile(optimizer="adam", loss="mean_squared_error")
history2 = model2.fit(X_train, Y_train, epochs=20, validation_split=0.2)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [5]:
model3 = tf.keras.models.Sequential([
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(units=64), input_shape=(24, 1)),
    tf.keras.layers.Dense(units=1)
])
model3.compile(optimizer="adam", loss="mean_squared_error")
history3 = model3.fit(X_train, Y_train, epochs=10, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
y_pred1 = model1.predict(X_test)
rmse1 = np.sqrt(np.mean((Y_test - y_pred1)**2))
print("RMSE (model1):", rmse1)

y_pred2 = model2.predict(X_test)
rmse2 = np.sqrt(np.mean((Y_test - y_pred2)**2))
print("RMSE (model2):", rmse2)

y_pred3 = model3.predict(X_test)
rmse3 = np.sqrt(np.mean((Y_test - y_pred3)**2))
print("RMSE (model3):", rmse3)

fig, axs = plt.subplots(3, 1, figsize=(10, 10))