In [1]:
!pip install wandb


Collecting wandb
  Downloading wandb-0.16.4-py3-none-any.whl (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
Collecting GitPython!=3.1.29,>=1.0.0 (from wandb)
  Downloading GitPython-3.1.42-py3-none-any.whl (195 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m195.4/195.4 kB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
Collecting sentry-sdk>=1.0.0 (from wandb)
  Downloading sentry_sdk-1.43.0-py2.py3-none-any.whl (264 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m264.6/264.6 kB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting docker-pycreds>=0.4.0 (from wandb)
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting setproctitle (from wandb)
  Downloading setproctitle-1.3.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (30 kB)
Collecting gitdb<5,>=4.0.1 (from GitPython!=3.1.29,>=1.0.0->wa

In [2]:
%mkdir ../data
!wget -O ../data/aclImdb_v1.tar.gz http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -zxf ../data/aclImdb_v1.tar.gz -C ../data

--2024-03-23 07:51:00--  http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10
Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84125825 (80M) [application/x-gzip]
Saving to: ‘../data/aclImdb_v1.tar.gz’


2024-03-23 07:51:05 (15.7 MB/s) - ‘../data/aclImdb_v1.tar.gz’ saved [84125825/84125825]



In [3]:
import os
import glob

def read_imdb_data(data_dir='../data/aclImdb'):
    data = {}
    labels = {}

    for data_type in ['train', 'test']:
        data[data_type] = {}
        labels[data_type] = {}

        for sentiment in ['pos', 'neg']:
            data[data_type][sentiment] = []
            labels[data_type][sentiment] = []

            path = os.path.join(data_dir, data_type, sentiment, '*.txt')
            files = glob.glob(path)

            for f in files:
                with open(f) as review:
                    data[data_type][sentiment].append(review.read())
                    # Here we represent a positive review by '1' and a negative review by '0'
                    labels[data_type][sentiment].append(1 if sentiment == 'pos' else 0)

            assert len(data[data_type][sentiment]) == len(labels[data_type][sentiment]), \
                    "{}/{} data size does not match labels size".format(data_type, sentiment)

    return data, labels

In [4]:
data, labels = read_imdb_data()
print("IMDB reviews: train = {} pos / {} neg, test = {} pos / {} neg".format(
            len(data['train']['pos']), len(data['train']['neg']),
            len(data['test']['pos']), len(data['test']['neg'])))


IMDB reviews: train = 12500 pos / 12500 neg, test = 12500 pos / 12500 neg


In [5]:
from sklearn.utils import shuffle

def prepare_imdb_data(data, labels):
    #Combine positive and negative reviews and labels
    data_train = data['train']['pos'] + data['train']['neg']
    data_test = data['test']['pos'] + data['test']['neg']
    labels_train = labels['train']['pos'] + labels['train']['neg']
    labels_test = labels['test']['pos'] + labels['test']['neg']

    #Shuffle reviews and corresponding labels within training and test sets
    data_train, labels_train = shuffle(data_train, labels_train)
    data_test, labels_test = shuffle(data_test, labels_test)

    # Return a unified training data, test data, training labels, test labels
    return data_train, data_test, labels_train, labels_test

In [6]:
train_X, test_X, train_y, test_y = prepare_imdb_data(data, labels)
print(len(train_X))
print(len(train_y))

25000
25000


The first step in processing the reviews is to make sure that any html tags that appear should be removed. In addition we wish to tokenize our input, that way words such as entertained and entertaining are considered the same with regard to sentiment analysis.

In [7]:
# Import the necessary cleaning Libraries
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import *
import re
from bs4 import BeautifulSoup


In [8]:

def review_to_words(review):
    nltk.download("stopwords", quiet=True)
    stemmer = PorterStemmer()

    text = BeautifulSoup(review, "html.parser").get_text() # Remove HTML tags
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower()) # Convert to lower case
    words = text.split() # Split string into words
    words = [w for w in words if w not in stopwords.words("english")] # Remove stopwords
    words = [PorterStemmer().stem(w) for w in words] # stem

    return words

In [9]:
print(train_X[100])
review_to_words(train_X[100])


I enjoyed this movie, granted it is mainly because I enjoy seeing Sean Connery act and this one has the added bonus of having Ed Harris and Lawrence Fishburne in it too. The story has a grandma seeking out Connery's assistance because her grandson is in prison and she says he was wrongly convicted. At first it seems there may have been some racist aspects of the case, however it later turns out the main officer on the case was black himself and it seems he did some rather bad things to coerce a confession out of the boy. Well the boy tries to point to another killer locked up in the same prison, one who is about to be put to death. He is a particularly nasty person too, as he takes a lot of joy in what he did, writing the relatives of his victims and trying to get people to mail them. A lot of twists and turns in this one with some of it being somewhat unexpected. Me I just enjoyed Sean Connery's character trying to make sense of the whole ordeal. The movie also made me mad in areas, e

['enjoy',
 'movi',
 'grant',
 'mainli',
 'enjoy',
 'see',
 'sean',
 'conneri',
 'act',
 'one',
 'ad',
 'bonu',
 'ed',
 'harri',
 'lawrenc',
 'fishburn',
 'stori',
 'grandma',
 'seek',
 'conneri',
 'assist',
 'grandson',
 'prison',
 'say',
 'wrongli',
 'convict',
 'first',
 'seem',
 'may',
 'racist',
 'aspect',
 'case',
 'howev',
 'later',
 'turn',
 'main',
 'offic',
 'case',
 'black',
 'seem',
 'rather',
 'bad',
 'thing',
 'coerc',
 'confess',
 'boy',
 'well',
 'boy',
 'tri',
 'point',
 'anoth',
 'killer',
 'lock',
 'prison',
 'one',
 'put',
 'death',
 'particularli',
 'nasti',
 'person',
 'take',
 'lot',
 'joy',
 'write',
 'rel',
 'victim',
 'tri',
 'get',
 'peopl',
 'mail',
 'lot',
 'twist',
 'turn',
 'one',
 'somewhat',
 'unexpect',
 'enjoy',
 'sean',
 'conneri',
 'charact',
 'tri',
 'make',
 'sens',
 'whole',
 'ordeal',
 'movi',
 'also',
 'made',
 'mad',
 'area',
 'especi',
 'find',
 'ultim',
 'happen',
 'get',
 'good',
 'interplay',
 'conneri',
 'harri',
 'conneri',
 'fishburn',
 

In [10]:
import os
import pickle

cache_dir = "../data"  # where to store cache files
os.makedirs(cache_dir, exist_ok=True)  # ensure cache directory exists

def preprocess_data(data_train, data_test, labels_train, labels_test,
                    cache_dir=cache_dir, cache_file="preprocessed_data.pkl"):
    """Convert each review to words; read from cache if available."""

    # If cache_file is not None, try to read from it first
    cache_data = None
    if cache_file is not None:
        try:
            with open(os.path.join(cache_dir, cache_file), "rb") as f:
                cache_data = pickle.load(f)
            print("Read preprocessed data from cache file:", cache_file)
        except:
            pass  # unable to read from cache, but that's okay

    # If cache is missing, then do the heavy lifting
    if cache_data is None:
        # Preprocess training and test data to obtain words for each review
        #words_train = list(map(review_to_words, data_train))
        #words_test = list(map(review_to_words, data_test))
        words_train = [review_to_words(review) for review in data_train]
        words_test = [review_to_words(review) for review in data_test]

        # Write to cache file for future runs
        if cache_file is not None:
            cache_data = dict(words_train=words_train, words_test=words_test,
                              labels_train=labels_train, labels_test=labels_test)
            with open(os.path.join(cache_dir, cache_file), "wb") as f:
                pickle.dump(cache_data, f)
            print("Wrote preprocessed data to cache file:", cache_file)
    else:
        # Unpack data loaded from cache file
        words_train, words_test, labels_train, labels_test = (cache_data['words_train'],
                cache_data['words_test'], cache_data['labels_train'], cache_data['labels_test'])

    return words_train, words_test, labels_train, labels_test


In [11]:
train_X, test_X, train_y, test_y = preprocess_data(train_X, test_X, train_y, test_y)


  text = BeautifulSoup(review, "html.parser").get_text() # Remove HTML tags


Wrote preprocessed data to cache file: preprocessed_data.pkl


In [12]:
# Determine the size of the validation set (20% of the training data)
val_size = int(0.2 * len(train_X))

# Split the data into training and validation sets
val_X = train_X[:val_size]
val_y = train_y[:val_size]

# Update the training data to exclude the validation set
train_X = train_X[val_size:]
train_y = train_y[val_size:]


In [13]:
print(len(train_X))

20000


Since we will be using a recurrent neural network, it will be convenient if the length of each review is the same. To do this, we will fix a size for our reviews and then pad short reviews with the category 'no word' (which we will label 0) and truncate long reviews. Basically in the dictionary the one with the most higher rank is the one that occurs most frequently. You dont care for the first two these are no words

In [14]:
import numpy as np

def build_dict(data, vocab_size = 5000):
    """Construct and return a dictionary mapping each of the most frequently appearing words to a unique integer."""
    word_count = {}

    for sentence in data:
        for word in sentence:
            if word in word_count:
                word_count[word] += 1
            else:
                word_count[word] = 1

    # DONE: Sort the words found in `data` so that sorted_words[0] is the most frequently appearing word and
    #       sorted_words[-1] is the least frequently appearing word.

    sorted_words = sorted(word_count, key=word_count.get, reverse=True)
    print("this is the first word having most frequency",sorted_words[0])
    print("this is the word having least frequency",sorted_words[-1])

    word_dict = {} # This is what we are building, a dictionary that translates words into integers
    for idx, word in enumerate(sorted_words[:vocab_size - 2]): # The -2 is so that we save room for the 'no word'
        word_dict[word] = idx + 2                              # 'infrequent' labels

    return word_dict

In [15]:
import os
import pickle

data_dir = '../data'  # The folder we will use for storing data
word_dict_file = os.path.join(data_dir, 'word_dict.pkl')

if os.path.exists(word_dict_file):
    # If the word dictionary file exists, load it
    with open(word_dict_file, "rb") as f:
        word_dict = pickle.load(f)
    print("Loaded word dictionary from:", word_dict_file)
else:
    # If the word dictionary file doesn't exist, build it
    word_dict = build_dict(train_X)
    with open(word_dict_file, "wb") as f:
        pickle.dump(word_dict, f)
    print("Built and saved word dictionary to:", word_dict_file)

print(list(word_dict.keys())[0:5])


this is the first word having most frequency movi
this is the word having least frequency lauper
Built and saved word dictionary to: ../data/word_dict.pkl
['movi', 'film', 'one', 'like', 'time']


Now that we have our word dictionary which allows us to transform the words appearing in the reviews into integers, it is time to make use of it and convert our reviews to their integer sequence representation, making sure to pad or truncate to a fixed length, which in our case is 500.

In [16]:
def convert_and_pad(word_dict, sentence, pad=500):
    NOWORD = 0 # We will use 0 to represent the 'no word' category
    INFREQ = 1 # and we use 1 to represent the infrequent words, i.e., words not appearing in word_dict

    working_sentence = [NOWORD] * pad

    for word_index, word in enumerate(sentence[:pad]):
        if word in word_dict:
            working_sentence[word_index] = word_dict[word]
        else:
            working_sentence[word_index] = INFREQ

    return working_sentence, min(len(sentence), pad)

In [17]:
def convert_and_pad_data(word_dict, data, pad=500):
    result = []
    lengths = []

    for sentence in data:
        converted, leng = convert_and_pad(word_dict, sentence, pad)
        result.append(converted)
        lengths.append(leng)

    return np.array(result), np.array(lengths)

In [18]:
train_X, train_X_len = convert_and_pad_data(word_dict, train_X)
test_X, test_X_len = convert_and_pad_data(word_dict, test_X)
val_X,val_X_len=convert_and_pad_data(word_dict,val_X)


In [19]:
import os
import pandas as pd

data_dir = '../data'  # The folder we will use for storing data
train_csv_file = os.path.join(data_dir, 'train.csv')

if not os.path.exists(train_csv_file):
    # If the train.csv file doesn't exist, create it
    pd.concat([pd.DataFrame(train_y), pd.DataFrame(train_X_len), pd.DataFrame(train_X)], axis=1) \
        .to_csv(train_csv_file, header=False, index=False)
    print("Created train.csv file:", train_csv_file)
else:
    print("train.csv file already exists, skipping creation.")

Created train.csv file: ../data/train.csv


In [20]:
data_dir = '../data'  # The folder we will use for storing data
val_csv_file = os.path.join(data_dir, 'val.csv')

if not os.path.exists(val_csv_file):
    # If the train.csv file doesn't exist, create it
    pd.concat([pd.DataFrame(val_y), pd.DataFrame(val_X_len), pd.DataFrame(val_X)], axis=1) \
        .to_csv(val_csv_file, header=False, index=False)
    print("Created val.csv file:", val_csv_file)
else:
    print("val.csv file already exists, skipping creation.")

Created val.csv file: ../data/val.csv


In [21]:
pd.concat([pd.DataFrame(test_y), pd.DataFrame(test_X_len), pd.DataFrame(test_X)], axis=1) \
        .to_csv(os.path.join(data_dir, 'test.csv'), header=False, index=False)

In [22]:
print(train_X[0])
print(train_X_len[0])

[   5    3  563  619  468   22  214  132    5    2 2146    1 2918   52
  134    1    1 2184    1    1    1 3305    1 1261    1 1111    3 1083
  193  191  257  352  658   90  264  692    1  313 2490 1889    1 2163
   94 1157   60  149    1  235    1   27 3651  422   90 1332  815   80
  329  711  497    1  815   95 1080    3 1734  115   22 2146  588 2174
    1   44  267    1    1  257  418  921    3 3096  150   22  545 1001
   85   17   33  357  393  449    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0 

In [23]:
import torch.nn as nn

class RNNClassifier(nn.Module):
    """ RNNClassifier class for initializing the layers for the simple
    recurrent neural network model (RNN) used for Sentiment Analysis of
    IMDB reviews.

    Attributes:
        embedding_dim (int): Dimensionality of the embedding layer
        hidden_dim (int): Dimensionality of the hidden layer(s)
        vocab_size (int): Size of the vocabulary used by Bag of Words

    """

    def __init__(self, embedding_dim, hidden_dim, vocab_size):
        super(RNNClassifier, self).__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True)
        self.dense = nn.Linear(hidden_dim, 1)
        self.sig = nn.Sigmoid()

        self.word_dict = None

    def forward(self, x):
        lengths = x[:, 0]  # Extract lengths from the input
        reviews = x[:, 1:]  # Extract reviews from the input
        embeds = self.embedding(reviews)
        rnn_out, _ = self.rnn(embeds)
        out = self.dense(rnn_out[:, -1, :])  # Select the last time step's output
        return self.sig(out.squeeze())


In [24]:
import torch.nn as nn

class LSTMClassifier(nn.Module):
    """ LSTMClassifier class for initializing the layers for the simple
    recurrent neural network model (RNN) used for Sentiment Analysis of
    IMDB reviews.

    Attributes:
        embedding_dim (int) dimensionality of the embedding layer
        hidden_dim (int) dimensionality of the hidden layer(s)
        vocab_size (int) size of the vocabulary used by Bag of Words

    """

    def __init__(self, embedding_dim, hidden_dim, vocab_size):

        super(LSTMClassifier, self).__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        self.dense = nn.Linear(in_features=hidden_dim, out_features=1)
        self.sig = nn.Sigmoid()

        self.word_dict = None


    def forward(self, x):
        x = x.t()
        lengths = x[0,:]
        reviews = x[1:,:]
        embeds = self.embedding(reviews)
        lstm_out, _ = self.lstm(embeds)
        out = self.dense(lstm_out)
        out = out[lengths - 1, range(len(lengths))]
        return self.sig(out.squeeze())

In [25]:
def evaluate(model, dataloader, loss_fn, device):
    """
    Function to evaluate the model on a given dataset.

    Args:
    - model: the PyTorch model to evaluate
    - dataloader: DataLoader object for the dataset
    - loss_fn: loss function used for training
    - device: device to run the evaluation on (e.g., "cpu" or "cuda")

    Returns:
    - accuracy: accuracy of the model on the dataset
    - loss: average loss on the dataset
    """

    model.eval()  # Set the model to evaluation mode

    correct = 0
    total = 0
    running_loss = 0.0

    with torch.no_grad():  # Disable gradient calculation during evaluation
        for inputs, labels in dataloader:
            inputs, labels = inputs.to(device), labels.to(device)

            # Forward pass
            outputs = model(inputs)

            # Compute loss
            loss = loss_fn(outputs, labels)
            running_loss += loss.item()

            # Compute accuracy
            predicted = torch.round(outputs)
            correct += (predicted == labels).sum().item()
            total += labels.size(0)

    accuracy = correct / total
    avg_loss = running_loss / len(dataloader)

    return accuracy, avg_loss


In [26]:
def train(model, train_loader, val_loader, epochs, optimizer, loss_fn, device):
    for epoch in range(1, epochs + 1):
        model.train()
        total_loss = 0
        for batch in train_loader:
            batch_X, batch_y = batch

            batch_X = batch_X.to(device)
            batch_y = batch_y.to(device)

            optimizer.zero_grad()
            output = model(batch_X)
            loss = loss_fn(output, batch_y)
            loss.backward()
            optimizer.step()

            total_loss += loss.data.item()

        # Validation phase
        val_accuracy, val_loss = evaluate(model, val_loader, loss_fn, device)

        # Calculate average training loss
        avg_train_loss = total_loss / len(train_loader)

        # Print and log the losses
        print("Epoch: {}, Train Loss: {:.4f}, Val Loss: {:.4f}, Val Accuracy: {:.4f}".format(epoch, avg_train_loss, val_loss, val_accuracy))
        wandb.log({"train_loss": avg_train_loss, "val_loss": val_loss})


In [27]:
import torch
import torch.utils.data
# Read in only the first 250 rows
train_sample = pd.read_csv(os.path.join(data_dir, 'train.csv'), header=None, names=None)

# Turn the input pandas dataframe into tensors
train_sample_y = torch.from_numpy(train_sample[[0]].values).float().squeeze()
train_sample_X = torch.from_numpy(train_sample.drop([0], axis=1).values).long()

# Build the dataset
train_sample_ds = torch.utils.data.TensorDataset(train_sample_X, train_sample_y)

# Read in only the first 250 rows
val_sample = pd.read_csv(os.path.join(data_dir, 'val.csv'), header=None, names=None)

# Turn the input pandas dataframe into tensors
val_sample_y = torch.from_numpy(val_sample[[0]].values).float().squeeze()
val_sample_X = torch.from_numpy(val_sample.drop([0], axis=1).values).long()

# Build the dataset
val_sample_ds = torch.utils.data.TensorDataset(val_sample_X, val_sample_y)

test_sample = pd.read_csv(os.path.join(data_dir, 'test.csv'), header=None, names=None)

# Turn the input pandas dataframe into tensors
test_sample_y = torch.from_numpy(test_sample[[0]].values).float().squeeze()
test_sample_X = torch.from_numpy(test_sample.drop([0], axis=1).values).long()

# Build the dataset
test_sample_ds = torch.utils.data.TensorDataset(test_sample_X, test_sample_y)

In [28]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [29]:
import wandb
wandb.login(key="ac8ef4bdae5edab6284aaf71af5502f523fcb79f")
# Define hyperparameter configurations
hyperparameters = [
    {"embedding_dim": 32, "hidden_dim": 64, "batch_size": 64},
    {"embedding_dim": 64, "hidden_dim": 128, "batch_size": 128},
    {"embedding_dim": 128, "hidden_dim": 256, "batch_size": 128},
    {"embedding_dim": 256, "hidden_dim": 512, "batch_size": 512},
    {"embedding_dim": 512, "hidden_dim": 1024, "batch_size": 512},
]

# Initialize Wandb
wandb.init(project="hyperparameter-tuning_lstm_final", name="LSTM_Experiment")

# Experiment loop
for i, config in enumerate(hyperparameters, 1):
    config_name = f"run_{i}"
    train_sample_dl = torch.utils.data.DataLoader(train_sample_ds, batch_size=config["batch_size"])
    test_sample_dl = torch.utils.data.DataLoader(test_sample_ds,  batch_size=config["batch_size"])
    val_sample_dl = torch.utils.data.DataLoader(val_sample_ds,  batch_size=config["batch_size"])

    # Initialize Wandb for the experiment
    wandb.init(config=config, name=config_name)
    config = wandb.config

    # Create model with specified hyperparameters
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = LSTMClassifier(config.embedding_dim, config.hidden_dim, vocab_size=5000).to(device)

    # Define optimizer and loss function
    optimizer = torch.optim.Adam(model.parameters())
    loss_fn = torch.nn.BCELoss()

    # Train the model
    train(model, train_sample_dl,val_sample_dl, 15, optimizer, loss_fn, device)

    # Evaluate the model on validation set
    accuracy, avg_loss = evaluate(model, test_sample_dl, loss_fn, device)

    # Log metrics using Wandb
    print(f"LSTM Run {i}: Accuracy on validation set: {accuracy:.4f}, Average loss on validation set: {avg_loss:.4f}")
    wandb.log({"accuracy": accuracy, "avg_loss": avg_loss})

    # Finish Wandb run
    wandb.finish()


[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mprishadcmc[0m ([33mprisha[0m). Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='0.010 MB of 0.010 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

Epoch: 1, Train Loss: 0.6278, Val Loss: 0.5434, Val Accuracy: 0.7302
Epoch: 2, Train Loss: 0.4529, Val Loss: 0.4608, Val Accuracy: 0.7820
Epoch: 3, Train Loss: 0.3680, Val Loss: 0.3908, Val Accuracy: 0.8224
Epoch: 4, Train Loss: 0.3229, Val Loss: 0.3730, Val Accuracy: 0.8366
Epoch: 5, Train Loss: 0.2995, Val Loss: 0.3516, Val Accuracy: 0.8480
Epoch: 6, Train Loss: 0.2659, Val Loss: 0.3455, Val Accuracy: 0.8498
Epoch: 7, Train Loss: 0.2335, Val Loss: 0.3631, Val Accuracy: 0.8592
Epoch: 8, Train Loss: 0.2323, Val Loss: 0.3734, Val Accuracy: 0.8536
Epoch: 9, Train Loss: 0.2053, Val Loss: 0.3642, Val Accuracy: 0.8592
Epoch: 10, Train Loss: 0.1960, Val Loss: 0.3633, Val Accuracy: 0.8596
Epoch: 11, Train Loss: 0.1746, Val Loss: 0.3749, Val Accuracy: 0.8578
Epoch: 12, Train Loss: 0.1622, Val Loss: 0.3845, Val Accuracy: 0.8608
Epoch: 13, Train Loss: 0.1540, Val Loss: 0.4032, Val Accuracy: 0.8432
Epoch: 14, Train Loss: 0.1443, Val Loss: 0.4035, Val Accuracy: 0.8580
Epoch: 15, Train Loss: 0.1300

VBox(children=(Label(value='0.012 MB of 0.012 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁
avg_loss,▁
train_loss,█▆▄▄▃▃▂▂▂▂▂▁▁▁▁
val_loss,█▅▃▂▁▁▂▂▂▂▂▂▃▃▃

0,1
accuracy,0.85188
avg_loss,0.41014
train_loss,0.12997
val_loss,0.4095


Epoch: 1, Train Loss: 0.6061, Val Loss: 0.4785, Val Accuracy: 0.7684
Epoch: 2, Train Loss: 0.4235, Val Loss: 0.4012, Val Accuracy: 0.8110
Epoch: 3, Train Loss: 0.3401, Val Loss: 0.3820, Val Accuracy: 0.8212
Epoch: 4, Train Loss: 0.3122, Val Loss: 0.3782, Val Accuracy: 0.8414
Epoch: 5, Train Loss: 0.2906, Val Loss: 0.3404, Val Accuracy: 0.8510
Epoch: 6, Train Loss: 0.2383, Val Loss: 0.3391, Val Accuracy: 0.8596
Epoch: 7, Train Loss: 0.2210, Val Loss: 0.3465, Val Accuracy: 0.8510
Epoch: 8, Train Loss: 0.2204, Val Loss: 0.3481, Val Accuracy: 0.8488
Epoch: 9, Train Loss: 0.1936, Val Loss: 0.3438, Val Accuracy: 0.8596
Epoch: 10, Train Loss: 0.2062, Val Loss: 0.3668, Val Accuracy: 0.8454
Epoch: 11, Train Loss: 0.1646, Val Loss: 0.3671, Val Accuracy: 0.8514
Epoch: 12, Train Loss: 0.1708, Val Loss: 0.4269, Val Accuracy: 0.8560
Epoch: 13, Train Loss: 0.1299, Val Loss: 0.3938, Val Accuracy: 0.8462
Epoch: 14, Train Loss: 0.1035, Val Loss: 0.4146, Val Accuracy: 0.8492
Epoch: 15, Train Loss: 0.0931

VBox(children=(Label(value='0.012 MB of 0.012 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁
avg_loss,▁
train_loss,█▆▄▄▄▃▃▃▂▃▂▂▂▁▁
val_loss,█▄▃▃▁▁▁▁▁▂▂▅▄▅▆

0,1
accuracy,0.84764
avg_loss,0.4529
train_loss,0.0931
val_loss,0.43909


Epoch: 1, Train Loss: 0.5871, Val Loss: 0.4859, Val Accuracy: 0.7752
Epoch: 2, Train Loss: 0.5186, Val Loss: 0.5062, Val Accuracy: 0.7520
Epoch: 3, Train Loss: 0.4512, Val Loss: 0.4168, Val Accuracy: 0.8098
Epoch: 4, Train Loss: 0.3808, Val Loss: 0.4570, Val Accuracy: 0.7776
Epoch: 5, Train Loss: 0.3777, Val Loss: 0.3855, Val Accuracy: 0.8254
Epoch: 6, Train Loss: 0.2766, Val Loss: 0.3601, Val Accuracy: 0.8468
Epoch: 7, Train Loss: 0.2721, Val Loss: 0.3745, Val Accuracy: 0.8514
Epoch: 8, Train Loss: 0.2262, Val Loss: 0.3653, Val Accuracy: 0.8414
Epoch: 9, Train Loss: 0.2153, Val Loss: 0.3935, Val Accuracy: 0.8518
Epoch: 10, Train Loss: 0.1723, Val Loss: 0.3902, Val Accuracy: 0.8562
Epoch: 11, Train Loss: 0.1901, Val Loss: 0.3815, Val Accuracy: 0.8562
Epoch: 12, Train Loss: 0.1668, Val Loss: 0.4449, Val Accuracy: 0.8508
Epoch: 13, Train Loss: 0.1183, Val Loss: 0.4584, Val Accuracy: 0.8570
Epoch: 14, Train Loss: 0.0822, Val Loss: 0.4722, Val Accuracy: 0.8520
Epoch: 15, Train Loss: 0.0631

VBox(children=(Label(value='0.012 MB of 0.012 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁
avg_loss,▁
train_loss,█▇▆▅▅▄▄▃▃▂▃▂▂▁▁
val_loss,▇█▄▆▂▁▂▁▃▂▂▅▆▆█

0,1
accuracy,0.85032
avg_loss,0.51538
train_loss,0.06314
val_loss,0.50257


Epoch: 1, Train Loss: 0.6025, Val Loss: 0.6764, Val Accuracy: 0.7290
Epoch: 2, Train Loss: 0.4569, Val Loss: 0.4228, Val Accuracy: 0.8102
Epoch: 3, Train Loss: 0.3985, Val Loss: 0.4187, Val Accuracy: 0.8156
Epoch: 4, Train Loss: 0.3101, Val Loss: 0.3719, Val Accuracy: 0.8444
Epoch: 5, Train Loss: 0.2865, Val Loss: 0.4561, Val Accuracy: 0.8054
Epoch: 6, Train Loss: 0.2654, Val Loss: 0.3901, Val Accuracy: 0.8456
Epoch: 7, Train Loss: 0.2142, Val Loss: 0.4242, Val Accuracy: 0.8492
Epoch: 8, Train Loss: 0.1956, Val Loss: 0.3949, Val Accuracy: 0.8622
Epoch: 9, Train Loss: 0.1389, Val Loss: 0.4286, Val Accuracy: 0.8588
Epoch: 10, Train Loss: 0.1011, Val Loss: 0.4608, Val Accuracy: 0.8502
Epoch: 11, Train Loss: 0.0750, Val Loss: 0.5055, Val Accuracy: 0.8492
Epoch: 12, Train Loss: 0.0840, Val Loss: 0.9708, Val Accuracy: 0.8140
Epoch: 13, Train Loss: 0.2522, Val Loss: 0.4428, Val Accuracy: 0.8378
Epoch: 14, Train Loss: 0.1025, Val Loss: 0.5312, Val Accuracy: 0.8352
Epoch: 15, Train Loss: 0.0799

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁
avg_loss,▁
train_loss,█▆▅▄▄▄▃▃▂▁▁▁▃▁▁
val_loss,▅▂▂▁▂▁▂▁▂▂▃█▂▃▃

0,1
accuracy,0.83296
avg_loss,0.56893
train_loss,0.07993
val_loss,0.57394


Epoch: 1, Train Loss: 0.6170, Val Loss: 0.5422, Val Accuracy: 0.7040
Epoch: 2, Train Loss: 0.4494, Val Loss: 0.4412, Val Accuracy: 0.8006
Epoch: 3, Train Loss: 0.3552, Val Loss: 0.3831, Val Accuracy: 0.8458
Epoch: 4, Train Loss: 0.2604, Val Loss: 0.4144, Val Accuracy: 0.8394
Epoch: 5, Train Loss: 0.2288, Val Loss: 0.4488, Val Accuracy: 0.8112
Epoch: 6, Train Loss: 0.1690, Val Loss: 0.4397, Val Accuracy: 0.8144
Epoch: 7, Train Loss: 0.0977, Val Loss: 0.5149, Val Accuracy: 0.7856
Epoch: 8, Train Loss: 0.0874, Val Loss: 0.5798, Val Accuracy: 0.7770
Epoch: 9, Train Loss: 0.0811, Val Loss: 0.5469, Val Accuracy: 0.8344
Epoch: 10, Train Loss: 0.0351, Val Loss: 0.5998, Val Accuracy: 0.8272
Epoch: 11, Train Loss: 0.0141, Val Loss: 0.7831, Val Accuracy: 0.8122
Epoch: 12, Train Loss: 0.0170, Val Loss: 0.7537, Val Accuracy: 0.8202
Epoch: 13, Train Loss: 0.0227, Val Loss: 0.7027, Val Accuracy: 0.8178
Epoch: 14, Train Loss: 0.0265, Val Loss: 0.7421, Val Accuracy: 0.8448
Epoch: 15, Train Loss: 0.0066

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁
avg_loss,▁
train_loss,█▆▅▄▄▃▂▂▂▁▁▁▁▁▁
val_loss,▄▂▁▂▂▂▃▄▄▅█▇▇▇█

0,1
accuracy,0.84728
avg_loss,0.74928
train_loss,0.00661
val_loss,0.78936


In [30]:
# Initialize Wandb for the RNN experiment
wandb.init(project="hyperparameter-tuning_rnn_final", name="RNN_Experiment")

# Define hyperparameters for the RNN
config_rnn = {"embedding_dim": 64, "hidden_dim": 128, "batch_size": 100}

# Create the RNN model with specified hyperparameters
model_rnn = RNNClassifier(config_rnn["embedding_dim"], config_rnn["hidden_dim"], vocab_size=5000).to(device)

# Define optimizer and loss function for RNN
optimizer_rnn = torch.optim.Adam(model_rnn.parameters())
loss_fn_rnn = torch.nn.BCELoss()

# DataLoader for RNN
train_sample_dl_rnn = torch.utils.data.DataLoader(train_sample_ds, batch_size=config_rnn["batch_size"])
test_sample_dl_rnn = torch.utils.data.DataLoader(test_sample_ds, batch_size=config_rnn["batch_size"])
val_sample_dl_rnn = torch.utils.data.DataLoader(val_sample_ds, batch_size=config_rnn["batch_size"])

# Train the RNN model
train(model_rnn, train_sample_dl_rnn, val_sample_dl_rnn, 15, optimizer_rnn, loss_fn_rnn, device)

# Evaluate the RNN model on the test set
accuracy_rnn, avg_loss_rnn = evaluate(model_rnn, test_sample_dl_rnn, loss_fn_rnn, device)

# Log metrics using Wandb
print(f"RNN: Accuracy on test set: {accuracy_rnn:.4f}, Average loss on test set: {avg_loss_rnn:.4f}")
wandb.log({"accuracy_rnn": accuracy_rnn, "avg_loss_rnn": avg_loss_rnn})

# Finish Wandb run for RNN
wandb.finish()


Epoch: 1, Train Loss: 0.6937, Val Loss: 0.6941, Val Accuracy: 0.4984
Epoch: 2, Train Loss: 0.6985, Val Loss: 0.6923, Val Accuracy: 0.5174
Epoch: 3, Train Loss: 0.6950, Val Loss: 0.6949, Val Accuracy: 0.5026
Epoch: 4, Train Loss: 0.6954, Val Loss: 0.6929, Val Accuracy: 0.5106
Epoch: 5, Train Loss: 0.6947, Val Loss: 0.6951, Val Accuracy: 0.4954
Epoch: 6, Train Loss: 0.6949, Val Loss: 0.6945, Val Accuracy: 0.4920
Epoch: 7, Train Loss: 0.6943, Val Loss: 0.6932, Val Accuracy: 0.5042
Epoch: 8, Train Loss: 0.6947, Val Loss: 0.6945, Val Accuracy: 0.4956
Epoch: 9, Train Loss: 0.6940, Val Loss: 0.6939, Val Accuracy: 0.5070
Epoch: 10, Train Loss: 0.6947, Val Loss: 0.6962, Val Accuracy: 0.4944
Epoch: 11, Train Loss: 0.6958, Val Loss: 0.6967, Val Accuracy: 0.4948
Epoch: 12, Train Loss: 0.6956, Val Loss: 0.6964, Val Accuracy: 0.4948
Epoch: 13, Train Loss: 0.6955, Val Loss: 0.6964, Val Accuracy: 0.4948
Epoch: 14, Train Loss: 0.6955, Val Loss: 0.6963, Val Accuracy: 0.4950
Epoch: 15, Train Loss: 0.6954

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy_rnn,▁
avg_loss_rnn,▁
train_loss,▁█▃▃▂▃▂▂▁▂▄▄▄▄▃
val_loss,▄▁▅▂▅▄▂▄▄▇█▇█▇▇

0,1
accuracy_rnn,0.48712
avg_loss_rnn,0.69713
train_loss,0.69543
val_loss,0.69613


In [31]:
import torch
import torch.nn as nn
import torch.nn.functional as F
wandb.init(project="hyperparameter-tuning_rnn_2_FINAL", name="RNN_Experiment_2")

class RNNLastOutput(nn.Module):
    """ RNNLastOutput class for initializing the layers for the RNN model
    that only picks the last output of the RNN layer.

    Attributes:
        embedding_dim (int): Dimensionality of the embedding layer
        hidden_dim (int): Dimensionality of the hidden layer(s)
        vocab_size (int): Size of the vocabulary used by Bag of Words

    """

    def __init__(self, embedding_dim, hidden_dim, vocab_size):
        super(RNNLastOutput, self).__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True)
        self.dense = nn.Linear(hidden_dim, 1)
        self.sig = nn.Sigmoid()

    def forward(self, x):
        lengths = x[:, 0]  # Extract lengths from the input
        reviews = x[:, 1:]  # Extract reviews from the input
        embeds = self.embedding(reviews)
        rnn_out, _ = self.rnn(embeds)
        last_output = rnn_out[:, -1, :]  # Select the last time step's output
        output = self.dense(last_output)
        return self.sig(output.squeeze())

class RNNMeanOutput(nn.Module):
    """ RNNMeanOutput class for initializing the layers for the RNN model
    that takes the mean of all outputs in the RNN layer.

    Attributes:
        embedding_dim (int): Dimensionality of the embedding layer
        hidden_dim (int): Dimensionality of the hidden layer(s)
        vocab_size (int): Size of the vocabulary used by Bag of Words

    """

    def __init__(self, embedding_dim, hidden_dim, vocab_size):
        super(RNNMeanOutput, self).__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True)
        self.dense = nn.Linear(hidden_dim, 1)
        self.sig = nn.Sigmoid()

    def forward(self, x):
        lengths = x[:, 0]  # Extract lengths from the input
        reviews = x[:, 1:]  # Extract reviews from the input
        embeds = self.embedding(reviews)
        rnn_out, _ = self.rnn(embeds)
        mean_output = torch.mean(rnn_out, dim=1)
        output = self.dense(mean_output)
        return self.sig(output.squeeze())

# Initialize the RNN models
model_last_output = RNNLastOutput(embedding_dim=32, hidden_dim=100, vocab_size=5000).to(device)
model_mean_output = RNNMeanOutput(embedding_dim=32, hidden_dim=100, vocab_size=5000).to(device)

# Define optimizer and loss function
optimizer = optimizer = torch.optim.Adam(model_last_output.parameters())
loss_fn = torch.nn.BCELoss()

# Train and evaluate the RNN model with last output
train(model_last_output, train_sample_dl, val_sample_dl, 15, optimizer, loss_fn, device)
accuracy_last_output, avg_loss_last_output = evaluate(model_last_output, test_sample_dl, loss_fn, device)
print("RNN with last output: Accuracy =", accuracy_last_output, ", Avg Loss =", avg_loss_last_output)
optimizer = torch.optim.Adam(model_mean_output.parameters())

# Train and evaluate the RNN model with mean output
train(model_mean_output, train_sample_dl, val_sample_dl, 15, optimizer, loss_fn, device)
accuracy_mean_output, avg_loss_mean_output = evaluate(model_mean_output, test_sample_dl, loss_fn, device)
print("RNN with mean output: Accuracy =", accuracy_mean_output, ", Avg Loss =", avg_loss_mean_output)


Epoch: 1, Train Loss: 0.6936, Val Loss: 0.6930, Val Accuracy: 0.4998
Epoch: 2, Train Loss: 0.6928, Val Loss: 0.6927, Val Accuracy: 0.4996
Epoch: 3, Train Loss: 0.6923, Val Loss: 0.6925, Val Accuracy: 0.4992
Epoch: 4, Train Loss: 0.6917, Val Loss: 0.6922, Val Accuracy: 0.5034
Epoch: 5, Train Loss: 0.6911, Val Loss: 0.6925, Val Accuracy: 0.5030
Epoch: 6, Train Loss: 0.6938, Val Loss: 0.6919, Val Accuracy: 0.5034
Epoch: 7, Train Loss: 0.6919, Val Loss: 0.6921, Val Accuracy: 0.5038
Epoch: 8, Train Loss: 0.6916, Val Loss: 0.6920, Val Accuracy: 0.5038
Epoch: 9, Train Loss: 0.6913, Val Loss: 0.6920, Val Accuracy: 0.5042
Epoch: 10, Train Loss: 0.6910, Val Loss: 0.6919, Val Accuracy: 0.5036
Epoch: 11, Train Loss: 0.6907, Val Loss: 0.6919, Val Accuracy: 0.5030
Epoch: 12, Train Loss: 0.6904, Val Loss: 0.6920, Val Accuracy: 0.5008
Epoch: 13, Train Loss: 0.6901, Val Loss: 0.6920, Val Accuracy: 0.5008
Epoch: 14, Train Loss: 0.6900, Val Loss: 0.6920, Val Accuracy: 0.5044
Epoch: 15, Train Loss: 0.6897