In [None]:
import json
import gzip
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
from torch.nn.utils.rnn import pad_sequence
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer
from sklearn.metrics import accuracy_score
import random
from collections import Counter
from transformers import BertTokenizer
import numpy as np



In [None]:
import pandas as pd

In [None]:
!wget 'https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_v2/categoryFilesSmall/Musical_Instruments_5.json.gz'

--2024-06-11 08:20:40--  https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_v2/categoryFilesSmall/Musical_Instruments_5.json.gz
Resolving datarepo.eng.ucsd.edu (datarepo.eng.ucsd.edu)... 132.239.8.30
Connecting to datarepo.eng.ucsd.edu (datarepo.eng.ucsd.edu)|132.239.8.30|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 39356138 (38M) [application/x-gzip]
Saving to: ‘Musical_Instruments_5.json.gz’


2024-06-11 08:20:47 (6.91 MB/s) - ‘Musical_Instruments_5.json.gz’ saved [39356138/39356138]



In [None]:
!wget 'imdb.csv' 'https://huggingface.co/datasets/scikit-learn/imdb/resolve/main/IMDB%20Dataset.csv?download=true'

--2024-06-11 08:20:47--  http://imdb.csv/
Resolving imdb.csv (imdb.csv)... failed: Name or service not known.
wget: unable to resolve host address ‘imdb.csv’
--2024-06-11 08:20:47--  https://huggingface.co/datasets/scikit-learn/imdb/resolve/main/IMDB%20Dataset.csv?download=true
Resolving huggingface.co (huggingface.co)... 54.230.71.56, 54.230.71.103, 54.230.71.2, ...
Connecting to huggingface.co (huggingface.co)|54.230.71.56|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://cdn-lfs.huggingface.co/repos/77/fa/77fa70b48eef1c98bf08d7b3e43b710623c24c69b4f78d4484f43c3361e9d2af/dfc447764f82be365fa9c2beef4e8df89d3919e3da95f5088004797d79695aa2?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27IMDB%2520Dataset.csv%3B+filename%3D%22IMDB+Dataset.csv%22%3B&response-content-type=text%2Fcsv&Expires=1718353247&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcxODM1MzI0N319LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmc

In [None]:
!mv '/content/IMDB Dataset.csv?download=true' '/content/IMDB Dataset.csv'

In [None]:
def load_data(filepath):
    """Load a gzip-compressed JSON Lines file into a list of parsed records.

    Args:
        filepath: Path to a gzip file containing one JSON document per line.

    Returns:
        list: One parsed object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened or is not valid gzip data.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with gzip.open(filepath, 'rt', encoding='utf-8') as f:
        # Blank lines (e.g. a trailing newline at end of file) hold no record;
        # passing them to json.loads would raise, so they are skipped.
        data = [json.loads(line) for line in f if line.strip()]
    return data

# Parse the downloaded review file into a list with one record per line.
data = load_data('/content/Musical_Instruments_5.json.gz')

In [None]:
# Number of review records loaded.
len(data)

231392

# **Torch Tokenizer**

In [None]:
# Word-level pipeline: torchtext's "basic_english" tokenizer plus a vocabulary
# built from the review texts themselves.
tokenizer = get_tokenizer("basic_english")


def yield_tokens(data):
    """Yield the token list of every entry that has a 'reviewText' field."""
    for entry in data:
        if 'reviewText' in entry:
            yield tokenizer(entry['reviewText'])


# "<pad>" is listed first so that it receives index 0, the value collate_fn
# pads with. Previously index 0 belonged to "<unk>", which made padding
# indistinguishable from out-of-vocabulary words.
vocab = build_vocab_from_iterator(yield_tokens(data), specials=["<pad>", "<unk>"])
vocab.set_default_index(vocab["<unk>"])


def text_pipeline(text):
    """Tokenize `text` and map each token to its vocabulary index."""
    return vocab(tokenizer(text))


# Entries without 'reviewText' are dropped from both lists so they stay aligned.
labels = [entry['overall'] for entry in data if 'reviewText' in entry]
texts = [text_pipeline(entry['reviewText']) for entry in data if 'reviewText' in entry]


# **Bert Tokenizer**

In [None]:
# Alternative pipeline: a pretrained BERT WordPiece tokenizer. Running this
# cell rebinds `tokenizer`, `yield_tokens` and `vocab` from the torchtext cell.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def yield_tokens(data):
    """Yield the WordPiece tokens of each entry that carries a 'reviewText'."""
    for record in data:
        if 'reviewText' not in record:
            continue
        yield tokenizer.tokenize(record['reviewText'])


# The vocabulary ships with the pretrained tokenizer; nothing is built from the data.
vocab = tokenizer.get_vocab()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
def text_pipeline(text):
    """Encode `text` to BERT token ids (with special tokens), truncated to 512.

    No padding is applied here. collate_fn already pads each batch to its own
    longest sequence with 0 (the [PAD] id of bert-base-uncased), so padding
    every review to 512 up front only made every batch 512 steps long.
    """
    return tokenizer.encode(text, add_special_tokens=True, max_length=512, truncation=True)


# Entries without 'reviewText' are dropped from both lists so they stay aligned.
labels = [entry['overall'] for entry in data if 'reviewText' in entry]
texts = [text_pipeline(entry['reviewText']) for entry in data if 'reviewText' in entry]

# **Data Loading**

In [None]:
class ReviewsDataset(Dataset):
    """Map-style dataset pairing token-id sequences with their labels.

    Each item is a dict with a 'text' tensor and a 'label' tensor, both of
    dtype ``torch.long``.
    """

    def __init__(self, texts, labels):
        self.texts, self.labels = texts, labels

    def __len__(self):
        # The label list defines the dataset size.
        return len(self.labels)

    def __getitem__(self, idx):
        token_ids = torch.tensor(self.texts[idx], dtype=torch.long)
        target = torch.tensor(self.labels[idx], dtype=torch.long)
        return {'text': token_ids, 'label': target}

# Wrap whichever `texts` / `labels` lists were produced by the tokenizer cell that ran last.
dataset = ReviewsDataset(texts, labels)


In [None]:
def collate_fn(batch):
    """Collate dataset items into a padded text batch and a label vector.

    Args:
        batch: Sequence of dicts, each with a 'text' and a 'label' entry.

    Returns:
        tuple: ``(texts_padded, labels_tensor)`` where the texts are
        right-padded with 0 to the longest sequence in the batch
        (batch-first layout) and the labels form a ``torch.long`` tensor.
    """
    sequences, targets = [], []
    for sample in batch:
        sequences.append(sample['text'])
        targets.append(sample['label'])
    return (
        pad_sequence(sequences, batch_first=True, padding_value=0),
        torch.tensor(targets, dtype=torch.long),
    )


In [None]:

# Split the dataset into training and testing sets (80% / 20%)
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
# A seeded generator keeps the split identical across kernel restarts, so
# reported test metrics refer to the same held-out examples on every run.
train_dataset, test_dataset = random_split(
    dataset, [train_size, test_size], generator=torch.Generator().manual_seed(42)
)

# Only the training loader is shuffled; collate_fn pads each batch.
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=collate_fn)
test_dataloader = DataLoader(test_dataset, batch_size=64, shuffle=False, collate_fn=collate_fn)


# **LSTM**

In [None]:
import torch
import torch.nn as nn

class RNNModel(nn.Module):
    """Embedding -> two stacked LSTMs -> last time step -> two-layer classifier.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_size: Embedding dimension.
        hidden_size: Width of the hidden fully connected layer.
        output_size: Number of classes; ``forward`` returns one logit per class.
        lstm_units: Hidden size of both LSTM layers. This used to be read from
            a notebook-level global; the default matches the value (128) the
            notebook assigns to that global.
    """

    def __init__(self, vocab_size, embed_size, hidden_size, output_size, lstm_units=128):
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm1 = nn.LSTM(embed_size, lstm_units, batch_first=True)
        self.lstm2 = nn.LSTM(lstm_units, lstm_units, batch_first=True)
        self.fc1 = nn.Linear(lstm_units, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return raw class logits for a batch-first tensor of token ids."""
        x = self.embedding(x)
        x, _ = self.lstm1(x)
        x, _ = self.lstm2(x)
        # Keep only the output at the final time step.
        # NOTE(review): for right-padded batches this is a padding position
        # for every sequence shorter than the batch maximum.
        x = x[:, -1, :]
        x = self.fc1(x)
        x = self.relu(x)
        # No softmax: the result is returned as logits.
        x = self.fc2(x)
        return x

# Model parameters
vocab_size = len(vocab)  # size of whichever `vocab` is currently bound
embed_size = 200
lstm_units = 128  # hidden size of the recurrent layers
hidden_size = 64
output_size = 5  # one logit per class (labels are shifted to 0..4 in the training loop)
maxlen = 100  # NOTE(review): not referenced anywhere in the visible notebook

model = RNNModel(vocab_size, embed_size, hidden_size, output_size)
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


RNNModel(
  (embedding): Embedding(112680, 200)
  (lstm1): LSTM(200, 128, batch_first=True)
  (lstm2): LSTM(128, 128, batch_first=True)
  (fc1): Linear(in_features=128, out_features=64, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=64, out_features=5, bias=True)
)

In [None]:
# Train `model` on the full (imbalanced) training split with Adam and
# cross-entropy, then report loss and accuracy on the held-out split.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 20
model.train()
for epoch in range(num_epochs):
    all_preds = []
    all_labels = []
    train_loss = 0
    # Batch tensors get their own names: rebinding the notebook-level
    # `texts` / `labels` lists here would break any cell that reads them
    # afterwards (e.g. the class-balancing cell).
    for batch_texts, batch_labels in train_dataloader:
        batch_texts = batch_texts.to(device)
        # Shift labels down by one so they are valid 0-based class indices.
        batch_labels = batch_labels.to(device) - 1
        optimizer.zero_grad()
        outputs = model(batch_texts)
        loss = criterion(outputs, batch_labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        _, preds = torch.max(outputs, 1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(batch_labels.cpu().numpy())
    avg_loss = train_loss / len(train_dataloader)
    train_accuracy = accuracy_score(all_labels, all_preds)
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}, Train Accuracy: {train_accuracy:.4f}')

# Evaluate the model
model.eval()
all_preds = []
all_labels = []
test_loss = 0
with torch.no_grad():
    for batch_texts, batch_labels in test_dataloader:
        batch_texts = batch_texts.to(device)
        batch_labels = batch_labels.to(device) - 1
        outputs = model(batch_texts)
        loss = criterion(outputs, batch_labels)
        test_loss += loss.item()
        _, preds = torch.max(outputs, 1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(batch_labels.cpu().numpy())

test_loss /= len(test_dataloader)
accuracy = accuracy_score(all_labels, all_preds)
print(f'Test Loss: {test_loss:.4f}, Test Accuracy: {accuracy:.4f}')

Epoch 1/20, Loss: 0.9556, Train Accuracy: 0.6995
Epoch 2/20, Loss: 0.7966, Train Accuracy: 0.7079
Epoch 3/20, Loss: 0.6546, Train Accuracy: 0.7434
Epoch 4/20, Loss: 0.5750, Train Accuracy: 0.7761
Epoch 5/20, Loss: 0.5100, Train Accuracy: 0.8049
Epoch 6/20, Loss: 0.4478, Train Accuracy: 0.8330
Epoch 7/20, Loss: 0.3933, Train Accuracy: 0.8578
Epoch 8/20, Loss: 0.3490, Train Accuracy: 0.8762
Epoch 9/20, Loss: 0.3128, Train Accuracy: 0.8907
Epoch 10/20, Loss: 0.2846, Train Accuracy: 0.9010
Epoch 11/20, Loss: 0.2635, Train Accuracy: 0.9091
Epoch 12/20, Loss: 0.2444, Train Accuracy: 0.9165
Epoch 13/20, Loss: 0.2304, Train Accuracy: 0.9216
Epoch 14/20, Loss: 0.2179, Train Accuracy: 0.9259
Epoch 15/20, Loss: 0.2140, Train Accuracy: 0.9269
Epoch 16/20, Loss: 0.1996, Train Accuracy: 0.9324
Epoch 17/20, Loss: 0.1885, Train Accuracy: 0.9364
Epoch 18/20, Loss: 0.1840, Train Accuracy: 0.9379
Epoch 19/20, Loss: 0.1774, Train Accuracy: 0.9402
Epoch 20/20, Loss: 0.1726, Train Accuracy: 0.9425
Test Loss

# **Data Balance**

# **Balanced Data**

In [None]:
# Count the occurrences of each class
class_counts = Counter(labels)
print(class_counts)
# Determine the size of the minority class
min_class_size = min(class_counts.values())

# Define the maximum desired size for each class
max_class_size = min_class_size

# Group example indices by label in a single pass over `labels`
# (instead of rescanning the whole list once per class).
indices_by_label = {}
for i, l in enumerate(labels):
    indices_by_label.setdefault(l, []).append(i)

# Sample indices for each class to balance the dataset. A dedicated seeded RNG
# makes the down-sampling reproducible without touching global random state.
balance_rng = random.Random(42)
balanced_indices = []
for label in class_counts:
    indices = indices_by_label[label]
    if len(indices) > max_class_size:
        indices = balance_rng.sample(indices, max_class_size)
    balanced_indices.extend(indices)

# Create balanced dataset
balanced_texts = [texts[i] for i in balanced_indices]
balanced_labels = [labels[i] for i in balanced_indices]
balanced_dataset = ReviewsDataset(balanced_texts, balanced_labels)
train_balanced_size = int(0.8 * len(balanced_dataset))
test_balanced_size = len(balanced_dataset) - train_balanced_size
# Seeded generator: the same 80/20 split on every run.
train_balanced, test_balanced = random_split(
    balanced_dataset,
    [train_balanced_size, test_balanced_size],
    generator=torch.Generator().manual_seed(42),
)

train_balancedloader = DataLoader(train_balanced, batch_size=64, shuffle=True, collate_fn=collate_fn)
test_balancedloader = DataLoader(test_balanced, batch_size=64, shuffle=False, collate_fn=collate_fn)

Counter({5.0: 161798, 4.0: 38776, 3.0: 16029, 1.0: 7465, 2.0: 7276})


In [None]:
# Sanity check: per-class counts after down-sampling.
class_counts = Counter(balanced_labels)
print(class_counts)

Counter({5.0: 7276, 4.0: 7276, 1.0: 7276, 3.0: 7276, 2.0: 7276})


# **Balanced LSTM**

In [None]:
# Train `model` on the balanced training split, then evaluate on the balanced
# held-out split.
# NOTE(review): this reuses the existing `model` object; re-run the model
# definition cell first if a freshly initialised network is intended.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 20
model.train()
for epoch in range(num_epochs):
    all_preds = []
    all_labels = []
    train_loss = 0
    # Batch tensors get their own names: rebinding the notebook-level
    # `texts` / `labels` lists here would break any cell that reads them
    # afterwards (e.g. the class-balancing cell).
    for batch_texts, batch_labels in train_balancedloader:
        batch_texts = batch_texts.to(device)
        # Shift labels down by one so they are valid 0-based class indices.
        batch_labels = batch_labels.to(device) - 1
        optimizer.zero_grad()
        outputs = model(batch_texts)
        loss = criterion(outputs, batch_labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        _, preds = torch.max(outputs, 1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(batch_labels.cpu().numpy())
    avg_loss = train_loss / len(train_balancedloader)
    train_accuracy = accuracy_score(all_labels, all_preds)
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}, Train Accuracy: {train_accuracy:.4f}')

# Evaluate the model
model.eval()
all_preds = []
all_labels = []
test_loss = 0
with torch.no_grad():
    for batch_texts, batch_labels in test_balancedloader:
        batch_texts = batch_texts.to(device)
        batch_labels = batch_labels.to(device) - 1
        outputs = model(batch_texts)
        loss = criterion(outputs, batch_labels)
        test_loss += loss.item()
        _, preds = torch.max(outputs, 1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(batch_labels.cpu().numpy())

test_loss /= len(test_balancedloader)
accuracy = accuracy_score(all_labels, all_preds)
print(f'Test Loss: {test_loss:.4f}, Test Accuracy: {accuracy:.4f}')

Epoch 1/20, Loss: 1.6099, Train Accuracy: 0.1996
Epoch 2/20, Loss: 1.6089, Train Accuracy: 0.2004
Epoch 3/20, Loss: 1.6077, Train Accuracy: 0.1993
Epoch 4/20, Loss: 1.6056, Train Accuracy: 0.2047
Epoch 5/20, Loss: 1.5882, Train Accuracy: 0.2267
Epoch 6/20, Loss: 1.3318, Train Accuracy: 0.4239
Epoch 7/20, Loss: 1.1183, Train Accuracy: 0.5249
Epoch 8/20, Loss: 0.9673, Train Accuracy: 0.5974
Epoch 9/20, Loss: 0.8263, Train Accuracy: 0.6727
Epoch 10/20, Loss: 0.7048, Train Accuracy: 0.7351
Epoch 11/20, Loss: 0.5939, Train Accuracy: 0.7873
Epoch 12/20, Loss: 0.5105, Train Accuracy: 0.8250
Epoch 13/20, Loss: 0.4412, Train Accuracy: 0.8535
Epoch 14/20, Loss: 0.3761, Train Accuracy: 0.8794
Epoch 15/20, Loss: 0.3361, Train Accuracy: 0.8939
Epoch 16/20, Loss: 0.3007, Train Accuracy: 0.9069
Epoch 17/20, Loss: 0.2774, Train Accuracy: 0.9150
Epoch 18/20, Loss: 0.2567, Train Accuracy: 0.9222
Epoch 19/20, Loss: 0.2362, Train Accuracy: 0.9282
Epoch 20/20, Loss: 0.2235, Train Accuracy: 0.9326
Test Loss

In [None]:
import torch
import torch.nn as nn

class RNNModel2(nn.Module):
    """Embedding -> two stacked LSTMs -> mean over time -> two linear layers.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_size: Embedding dimension.
        hidden_size: Output width of the first linear layer.
        output_size: Number of classes; ``forward`` returns one logit per class.
        lstm_units: Hidden size of both LSTM layers. This used to be read from
            a notebook-level global; the default matches the value (128) the
            notebook assigns to that global.
    """

    def __init__(self, vocab_size, embed_size, hidden_size, output_size, lstm_units=128):
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm1 = nn.LSTM(embed_size, lstm_units, batch_first=True)
        self.lstm2 = nn.LSTM(lstm_units, lstm_units, batch_first=True)
        self.fc1 = nn.Linear(lstm_units, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return raw class logits for a batch-first tensor of token ids."""
        x = self.embedding(x)
        x, _ = self.lstm1(x)
        x, _ = self.lstm2(x)
        # Average the LSTM outputs over the time dimension (padding included).
        x = torch.mean(x, dim=1)
        # fc1 and fc2 are applied back to back with no activation in between.
        x = self.fc1(x)
        x = self.fc2(x)
        return x

# Model parameters
vocab_size = len(vocab)  # size of whichever `vocab` is currently bound
embed_size = 200
lstm_units = 128  # hidden size of the recurrent layers
hidden_size = 64
output_size = 5  # one logit per class (labels are shifted to 0..4 in the training loop)
maxlen = 100  # NOTE(review): not referenced anywhere in the visible notebook

model2 = RNNModel2(vocab_size, embed_size, hidden_size, output_size)
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model2.to(device)


RNNModel2(
  (embedding): Embedding(30522, 200)
  (lstm1): LSTM(200, 128, batch_first=True)
  (lstm2): LSTM(128, 128, batch_first=True)
  (fc1): Linear(in_features=128, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=5, bias=True)
)

# Bert Tokenizer

In [None]:
# Train `model2` on the balanced training split, then evaluate on the balanced
# held-out split.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model2.parameters(), lr=0.001)

num_epochs = 20
model2.train()
for epoch in range(num_epochs):
    all_preds = []
    all_labels = []
    train_loss = 0
    # Batch tensors get their own names: rebinding the notebook-level
    # `texts` / `labels` lists here would break any cell that reads them
    # afterwards (e.g. the class-balancing cell).
    for batch_texts, batch_labels in train_balancedloader:
        batch_texts = batch_texts.to(device)
        # Shift labels down by one so they are valid 0-based class indices.
        batch_labels = batch_labels.to(device) - 1
        optimizer.zero_grad()
        outputs = model2(batch_texts)
        loss = criterion(outputs, batch_labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        _, preds = torch.max(outputs, 1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(batch_labels.cpu().numpy())
    avg_loss = train_loss / len(train_balancedloader)
    train_accuracy = accuracy_score(all_labels, all_preds)
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}, Train Accuracy: {train_accuracy:.4f}')

# Evaluate the model
model2.eval()
all_preds = []
all_labels = []
test_loss = 0
with torch.no_grad():
    for batch_texts, batch_labels in test_balancedloader:
        batch_texts = batch_texts.to(device)
        batch_labels = batch_labels.to(device) - 1
        outputs = model2(batch_texts)
        loss = criterion(outputs, batch_labels)
        test_loss += loss.item()
        _, preds = torch.max(outputs, 1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(batch_labels.cpu().numpy())

test_loss /= len(test_balancedloader)
accuracy = accuracy_score(all_labels, all_preds)
print(f'Test Loss: {test_loss:.4f}, Test Accuracy: {accuracy:.4f}')

Epoch 1/20, Loss: 1.5541, Train Accuracy: 0.2695
Epoch 2/20, Loss: 1.4804, Train Accuracy: 0.3162
Epoch 3/20, Loss: 1.2816, Train Accuracy: 0.4111
Epoch 4/20, Loss: 1.1684, Train Accuracy: 0.4688
Epoch 5/20, Loss: 1.0808, Train Accuracy: 0.5159
Epoch 6/20, Loss: 0.9961, Train Accuracy: 0.5634
Epoch 7/20, Loss: 0.9144, Train Accuracy: 0.6064
Epoch 8/20, Loss: 0.8325, Train Accuracy: 0.6491
Epoch 9/20, Loss: 0.7494, Train Accuracy: 0.6941
Epoch 10/20, Loss: 0.6688, Train Accuracy: 0.7354
Epoch 11/20, Loss: 0.5951, Train Accuracy: 0.7686
Epoch 12/20, Loss: 0.5234, Train Accuracy: 0.8017
Epoch 13/20, Loss: 0.4607, Train Accuracy: 0.8304
Epoch 14/20, Loss: 0.4115, Train Accuracy: 0.8507
Epoch 15/20, Loss: 0.3571, Train Accuracy: 0.8733
Epoch 16/20, Loss: 0.3216, Train Accuracy: 0.8886
Epoch 17/20, Loss: 0.3017, Train Accuracy: 0.8950
Epoch 18/20, Loss: 0.2685, Train Accuracy: 0.9070
Epoch 19/20, Loss: 0.2450, Train Accuracy: 0.9168
Epoch 20/20, Loss: 0.2203, Train Accuracy: 0.9253
Test Loss

# Torch Tokenizer

In [None]:
# Train `model2` on the balanced training split, then evaluate on the balanced
# held-out split.
# NOTE(review): this cell duplicates the previous `model2` training cell and
# reuses the existing `model2` object; re-create the model (and the loaders for
# the intended tokenizer) first if an independent run is intended.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model2.parameters(), lr=0.001)

num_epochs = 20
model2.train()
for epoch in range(num_epochs):
    all_preds = []
    all_labels = []
    train_loss = 0
    # Batch tensors get their own names: rebinding the notebook-level
    # `texts` / `labels` lists here would break any cell that reads them
    # afterwards (e.g. the class-balancing cell).
    for batch_texts, batch_labels in train_balancedloader:
        batch_texts = batch_texts.to(device)
        # Shift labels down by one so they are valid 0-based class indices.
        batch_labels = batch_labels.to(device) - 1
        optimizer.zero_grad()
        outputs = model2(batch_texts)
        loss = criterion(outputs, batch_labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        _, preds = torch.max(outputs, 1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(batch_labels.cpu().numpy())
    avg_loss = train_loss / len(train_balancedloader)
    train_accuracy = accuracy_score(all_labels, all_preds)
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}, Train Accuracy: {train_accuracy:.4f}')

# Evaluate the model
model2.eval()
all_preds = []
all_labels = []
test_loss = 0
with torch.no_grad():
    for batch_texts, batch_labels in test_balancedloader:
        batch_texts = batch_texts.to(device)
        batch_labels = batch_labels.to(device) - 1
        outputs = model2(batch_texts)
        loss = criterion(outputs, batch_labels)
        test_loss += loss.item()
        _, preds = torch.max(outputs, 1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(batch_labels.cpu().numpy())

test_loss /= len(test_balancedloader)
accuracy = accuracy_score(all_labels, all_preds)
print(f'Test Loss: {test_loss:.4f}, Test Accuracy: {accuracy:.4f}')

Epoch 1/20, Loss: 1.4824, Train Accuracy: 0.3067
Epoch 2/20, Loss: 1.2369, Train Accuracy: 0.4475
Epoch 3/20, Loss: 1.0933, Train Accuracy: 0.5176
Epoch 4/20, Loss: 0.9795, Train Accuracy: 0.5763
Epoch 5/20, Loss: 0.8686, Train Accuracy: 0.6331
Epoch 6/20, Loss: 0.7624, Train Accuracy: 0.6863
Epoch 8/20, Loss: 0.5682, Train Accuracy: 0.7820
Epoch 9/20, Loss: 0.4816, Train Accuracy: 0.8226
Epoch 10/20, Loss: 0.4036, Train Accuracy: 0.8546
Epoch 11/20, Loss: 0.3513, Train Accuracy: 0.8748
Epoch 12/20, Loss: 0.3090, Train Accuracy: 0.8922
Epoch 13/20, Loss: 0.2636, Train Accuracy: 0.9087
Epoch 14/20, Loss: 0.2395, Train Accuracy: 0.9204
Epoch 15/20, Loss: 0.2155, Train Accuracy: 0.9277
Epoch 16/20, Loss: 0.2064, Train Accuracy: 0.9288
Epoch 17/20, Loss: 0.1927, Train Accuracy: 0.9361
Epoch 18/20, Loss: 0.1896, Train Accuracy: 0.9349
Epoch 19/20, Loss: 0.1628, Train Accuracy: 0.9443
Epoch 20/20, Loss: 0.1551, Train Accuracy: 0.9470
Test Loss: 3.9185, Test Accuracy: 0.5014


In [None]:
import torch
import torch.nn as nn

class RNNModel3(nn.Module):
    """Embedding -> three stacked LSTMs -> mean over time -> linear classifier.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_size: Embedding dimension.
        hidden_size: Stored on the instance only; no layer of this model uses it.
        output_size: Number of classes; ``forward`` returns one logit per class.
        lstm_units: Hidden size of all LSTM layers. This used to be read from
            a notebook-level global; the default matches the value (128) the
            notebook assigns to that global.
    """

    def __init__(self, vocab_size, embed_size, hidden_size, output_size, lstm_units=128):
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm1 = nn.LSTM(embed_size, lstm_units, batch_first=True)
        self.lstm2 = nn.LSTM(lstm_units, lstm_units, batch_first=True)
        self.lstm3 = nn.LSTM(lstm_units, lstm_units, batch_first=True)
        self.fc = nn.Linear(lstm_units, output_size)

    def forward(self, x):
        """Return raw class logits for a batch-first tensor of token ids."""
        x = self.embedding(x)
        x, _ = self.lstm1(x)
        x, _ = self.lstm2(x)
        x, _ = self.lstm3(x)

        # Average the LSTM outputs over the time dimension (padding included).
        x = torch.mean(x, dim=1)
        x = self.fc(x)
        return x

# Model parameters
vocab_size = len(vocab)  # size of whichever `vocab` is currently bound
embed_size = 200
lstm_units = 128  # hidden size of the recurrent layers
hidden_size = 64
output_size = 5  # one logit per class (labels are shifted to 0..4 in the training loop)
maxlen = 100  # NOTE(review): not referenced anywhere in the visible notebook

model3 = RNNModel3(vocab_size, embed_size, hidden_size, output_size)
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model3.to(device)


RNNModel3(
  (embedding): Embedding(112680, 200)
  (lstm1): LSTM(200, 128, batch_first=True)
  (lstm2): LSTM(128, 128, batch_first=True)
  (lstm3): LSTM(128, 128, batch_first=True)
  (fc): Linear(in_features=128, out_features=5, bias=True)
)

In [None]:
# Train `model3` on the balanced training split, then evaluate on the balanced
# held-out split.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model3.parameters(), lr=0.001)

num_epochs = 20
model3.train()
for epoch in range(num_epochs):
    all_preds = []
    all_labels = []
    train_loss = 0
    # Batch tensors get their own names: rebinding the notebook-level
    # `texts` / `labels` lists here would break any cell that reads them
    # afterwards (e.g. the class-balancing cell).
    for batch_texts, batch_labels in train_balancedloader:
        batch_texts = batch_texts.to(device)
        # Shift labels down by one so they are valid 0-based class indices.
        batch_labels = batch_labels.to(device) - 1
        optimizer.zero_grad()
        outputs = model3(batch_texts)
        loss = criterion(outputs, batch_labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        _, preds = torch.max(outputs, 1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(batch_labels.cpu().numpy())
    avg_loss = train_loss / len(train_balancedloader)
    train_accuracy = accuracy_score(all_labels, all_preds)
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}, Train Accuracy: {train_accuracy:.4f}')

# Evaluate the model
model3.eval()
all_preds = []
all_labels = []
test_loss = 0
with torch.no_grad():
    for batch_texts, batch_labels in test_balancedloader:
        batch_texts = batch_texts.to(device)
        batch_labels = batch_labels.to(device) - 1
        outputs = model3(batch_texts)
        loss = criterion(outputs, batch_labels)
        test_loss += loss.item()
        _, preds = torch.max(outputs, 1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(batch_labels.cpu().numpy())

test_loss /= len(test_balancedloader)
accuracy = accuracy_score(all_labels, all_preds)
print(f'Test Loss: {test_loss:.4f}, Test Accuracy: {accuracy:.4f}')

Epoch 1/20, Loss: 1.5600, Train Accuracy: 0.2634
Epoch 2/20, Loss: 1.5248, Train Accuracy: 0.2987
Epoch 3/20, Loss: 1.5306, Train Accuracy: 0.3083
Epoch 4/20, Loss: 1.4827, Train Accuracy: 0.3223
Epoch 5/20, Loss: 1.4127, Train Accuracy: 0.3558
Epoch 6/20, Loss: 1.3943, Train Accuracy: 0.3591
Epoch 7/20, Loss: 1.3660, Train Accuracy: 0.3705
Epoch 8/20, Loss: 1.2665, Train Accuracy: 0.4134
Epoch 9/20, Loss: 1.1932, Train Accuracy: 0.4584
Epoch 10/20, Loss: 1.1354, Train Accuracy: 0.4878
Epoch 11/20, Loss: 1.0661, Train Accuracy: 0.5299
Epoch 12/20, Loss: 1.0034, Train Accuracy: 0.5657
Epoch 13/20, Loss: 0.9320, Train Accuracy: 0.6076
Epoch 14/20, Loss: 0.8700, Train Accuracy: 0.6439
Epoch 15/20, Loss: 0.8065, Train Accuracy: 0.6795
Epoch 16/20, Loss: 0.7403, Train Accuracy: 0.7147
Epoch 17/20, Loss: 0.6933, Train Accuracy: 0.7379
Epoch 18/20, Loss: 0.6286, Train Accuracy: 0.7685
Epoch 19/20, Loss: 0.5768, Train Accuracy: 0.7937
Epoch 20/20, Loss: 0.5223, Train Accuracy: 0.8189
Test Loss

# **GRU**

In [None]:
import torch
import torch.nn as nn

class RNNModel4(nn.Module):
    """Embedding -> three stacked GRUs -> mean over time -> two-layer classifier.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_size: Embedding dimension.
        hidden_size: Width of the hidden fully connected layer.
        output_size: Number of classes; ``forward`` returns one logit per class.
        lstm_units: Hidden size of all GRU layers (the name is kept for
            consistency with the other models). This used to be read from a
            notebook-level global; the default matches the value (128) the
            notebook assigns to that global.
    """

    def __init__(self, vocab_size, embed_size, hidden_size, output_size, lstm_units=128):
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.gru1 = nn.GRU(embed_size, lstm_units, batch_first=True)
        self.gru2 = nn.GRU(lstm_units, lstm_units, batch_first=True)
        self.gru3 = nn.GRU(lstm_units, lstm_units, batch_first=True)
        self.fc1 = nn.Linear(lstm_units, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return raw class logits for a batch-first tensor of token ids."""
        x = self.embedding(x)
        x, _ = self.gru1(x)
        x, _ = self.gru2(x)
        x, _ = self.gru3(x)

        # Average the GRU outputs over the time dimension (padding included).
        x = torch.mean(x, dim=1)
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

# Model parameters
vocab_size = len(vocab)  # size of whichever `vocab` is currently bound
embed_size = 200
lstm_units = 128  # hidden size of the recurrent layers
hidden_size = 64
output_size = 5  # one logit per class (labels are shifted to 0..4 in the training loop)
maxlen = 100  # NOTE(review): not referenced anywhere in the visible notebook

model4 = RNNModel4(vocab_size, embed_size, hidden_size, output_size)
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model4.to(device)


RNNModel4(
  (embedding): Embedding(112680, 200)
  (gru1): GRU(200, 128, batch_first=True)
  (gru2): GRU(128, 128, batch_first=True)
  (gru3): GRU(128, 128, batch_first=True)
  (fc1): Linear(in_features=128, out_features=64, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=64, out_features=5, bias=True)
)

# Imbalanced

In [None]:
# Train `model4` on the full (imbalanced) training split, then evaluate on the
# corresponding held-out split.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model4.parameters(), lr=0.001)

num_epochs = 20
model4.train()
for epoch in range(num_epochs):
    all_preds = []
    all_labels = []
    train_loss = 0
    # Batch tensors get their own names: rebinding the notebook-level
    # `texts` / `labels` lists here would break any cell that reads them
    # afterwards (e.g. the class-balancing cell).
    for batch_texts, batch_labels in train_dataloader:
        batch_texts = batch_texts.to(device)
        # Shift labels down by one so they are valid 0-based class indices.
        batch_labels = batch_labels.to(device) - 1
        optimizer.zero_grad()
        outputs = model4(batch_texts)
        loss = criterion(outputs, batch_labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        _, preds = torch.max(outputs, 1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(batch_labels.cpu().numpy())
    avg_loss = train_loss / len(train_dataloader)
    train_accuracy = accuracy_score(all_labels, all_preds)
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}, Train Accuracy: {train_accuracy:.4f}')

# Evaluate the model
model4.eval()
all_preds = []
all_labels = []
test_loss = 0
with torch.no_grad():
    for batch_texts, batch_labels in test_dataloader:
        batch_texts = batch_texts.to(device)
        batch_labels = batch_labels.to(device) - 1
        outputs = model4(batch_texts)
        loss = criterion(outputs, batch_labels)
        test_loss += loss.item()
        _, preds = torch.max(outputs, 1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(batch_labels.cpu().numpy())

test_loss /= len(test_dataloader)
accuracy = accuracy_score(all_labels, all_preds)
print(f'Test Loss: {test_loss:.4f}, Test Accuracy: {accuracy:.4f}')

Epoch 1/20, Loss: 0.7047, Train Accuracy: 0.7348
Epoch 2/20, Loss: 0.5778, Train Accuracy: 0.7770
Epoch 3/20, Loss: 0.5103, Train Accuracy: 0.8027
Epoch 4/20, Loss: 0.4432, Train Accuracy: 0.8314
Epoch 5/20, Loss: 0.3785, Train Accuracy: 0.8583
Epoch 6/20, Loss: 0.3243, Train Accuracy: 0.8805
Epoch 7/20, Loss: 0.2834, Train Accuracy: 0.8975
Epoch 8/20, Loss: 0.2524, Train Accuracy: 0.9086
Epoch 9/20, Loss: 0.2292, Train Accuracy: 0.9175
Epoch 10/20, Loss: 0.2124, Train Accuracy: 0.9235
Epoch 11/20, Loss: 0.1980, Train Accuracy: 0.9290
Epoch 12/20, Loss: 0.1868, Train Accuracy: 0.9327
Epoch 13/20, Loss: 0.1784, Train Accuracy: 0.9354
Epoch 14/20, Loss: 0.1695, Train Accuracy: 0.9390
Epoch 15/20, Loss: 0.1642, Train Accuracy: 0.9413
Epoch 16/20, Loss: 0.1580, Train Accuracy: 0.9426
Epoch 17/20, Loss: 0.1528, Train Accuracy: 0.9449
Epoch 18/20, Loss: 0.1494, Train Accuracy: 0.9457
Epoch 19/20, Loss: 0.1472, Train Accuracy: 0.9466
Epoch 20/20, Loss: 0.1435, Train Accuracy: 0.9479
Test Loss

# Balanced

In [None]:
import torch
import torch.nn as nn

class RNNModel5(nn.Module):
    """Embedding -> three stacked GRUs -> mean over time -> dropout -> two-layer classifier.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_size: Embedding dimension.
        hidden_size: Width of the hidden fully connected layer.
        output_size: Number of classes; ``forward`` returns one logit per class.
        lstm_units: Hidden size of all GRU layers (the name is kept for
            consistency with the other models). This used to be read from a
            notebook-level global; the default matches the value (128) the
            notebook assigns to that global.
    """

    def __init__(self, vocab_size, embed_size, hidden_size, output_size, lstm_units=128):
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.gru1 = nn.GRU(embed_size, lstm_units, batch_first=True)
        self.dropout = nn.Dropout(0.15)
        self.gru2 = nn.GRU(lstm_units, lstm_units, batch_first=True)
        self.gru3 = nn.GRU(lstm_units, lstm_units, batch_first=True)
        self.fc1 = nn.Linear(lstm_units, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return raw class logits for a batch-first tensor of token ids."""
        x = self.embedding(x)
        x, _ = self.gru1(x)

        x, _ = self.gru2(x)
        x, _ = self.gru3(x)

        # Average the GRU outputs over the time dimension (padding included).
        x = torch.mean(x, dim=1)
        # Dropout is applied to the pooled features, just before the classifier.
        x = self.dropout(x)
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

# Model parameters
vocab_size = len(vocab)  # size of whichever `vocab` is currently bound
embed_size = 200
lstm_units = 128  # hidden size of the recurrent layers
hidden_size = 64
output_size = 5  # one logit per class (labels are shifted to 0..4 in the training loop)
maxlen = 100  # NOTE(review): not referenced anywhere in the visible notebook

model5 = RNNModel5(vocab_size, embed_size, hidden_size, output_size)
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Move to the selected device rather than a hard-coded "cuda", which raised on
# CPU-only machines and could disagree with where the batches are sent.
model5.to(device)


RNNModel5(
  (embedding): Embedding(112680, 200)
  (gru1): GRU(200, 128, batch_first=True)
  (dropout): Dropout(p=0.15, inplace=False)
  (gru2): GRU(128, 128, batch_first=True)
  (gru3): GRU(128, 128, batch_first=True)
  (fc1): Linear(in_features=128, out_features=64, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=64, out_features=5, bias=True)
)

In [None]:
# Train `model5` on the balanced training split, then evaluate on the balanced
# held-out split.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model5.parameters(), lr=0.001)

num_epochs = 20
model5.train()
for epoch in range(num_epochs):
    all_preds = []
    all_labels = []
    train_loss = 0
    # Batch tensors get their own names: rebinding the notebook-level
    # `texts` / `labels` lists here would break any cell that reads them
    # afterwards (e.g. the class-balancing cell).
    for batch_texts, batch_labels in train_balancedloader:
        batch_texts = batch_texts.to(device)
        # Shift labels down by one so they are valid 0-based class indices.
        batch_labels = batch_labels.to(device) - 1
        optimizer.zero_grad()
        outputs = model5(batch_texts)
        loss = criterion(outputs, batch_labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        _, preds = torch.max(outputs, 1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(batch_labels.cpu().numpy())
    avg_loss = train_loss / len(train_balancedloader)
    train_accuracy = accuracy_score(all_labels, all_preds)
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}, Train Accuracy: {train_accuracy:.4f}')

# Evaluate the model
model5.eval()
all_preds = []
all_labels = []
test_loss = 0
with torch.no_grad():
    for batch_texts, batch_labels in test_balancedloader:
        batch_texts = batch_texts.to(device)
        batch_labels = batch_labels.to(device) - 1
        outputs = model5(batch_texts)
        loss = criterion(outputs, batch_labels)
        test_loss += loss.item()
        _, preds = torch.max(outputs, 1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(batch_labels.cpu().numpy())

test_loss /= len(test_balancedloader)
accuracy = accuracy_score(all_labels, all_preds)
print(f'Test Loss: {test_loss:.4f}, Test Accuracy: {accuracy:.4f}')

Epoch 1/20, Loss: 1.4748, Train Accuracy: 0.3073
Epoch 2/20, Loss: 1.1779, Train Accuracy: 0.4748
Epoch 3/20, Loss: 1.0266, Train Accuracy: 0.5495
Epoch 4/20, Loss: 0.9014, Train Accuracy: 0.6160
Epoch 5/20, Loss: 0.7781, Train Accuracy: 0.6792
Epoch 6/20, Loss: 0.6433, Train Accuracy: 0.7459
Epoch 7/20, Loss: 0.5213, Train Accuracy: 0.8014
Epoch 8/20, Loss: 0.4129, Train Accuracy: 0.8482
Epoch 9/20, Loss: 0.3347, Train Accuracy: 0.8810
Epoch 10/20, Loss: 0.2759, Train Accuracy: 0.9014
Epoch 11/20, Loss: 0.2363, Train Accuracy: 0.9172
Epoch 12/20, Loss: 0.2078, Train Accuracy: 0.9274
Epoch 13/20, Loss: 0.1853, Train Accuracy: 0.9340
Epoch 14/20, Loss: 0.1690, Train Accuracy: 0.9409
Epoch 15/20, Loss: 0.1553, Train Accuracy: 0.9458
Epoch 16/20, Loss: 0.1418, Train Accuracy: 0.9499
Epoch 17/20, Loss: 0.1370, Train Accuracy: 0.9533
Epoch 18/20, Loss: 0.1327, Train Accuracy: 0.9530
Epoch 19/20, Loss: 0.1260, Train Accuracy: 0.9550
Epoch 20/20, Loss: 0.1150, Train Accuracy: 0.9588
Test Loss

# **RNN**

In [None]:

class RNNModel6(nn.Module):
    """Embedding -> three stacked single-layer RNNs -> mean over time -> two linear layers.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_size: Embedding dimension fed to the first RNN.
        hidden_size: Output width of ``fc1`` (and input width of ``fc2``).
        output_size: Number of logits produced per sequence.
        rnn_units: Hidden width of every RNN layer. When ``None`` (the default)
            the notebook-level ``lstm_units`` variable is read at construction
            time, which is what this class did before the parameter existed.
    """

    def __init__(self, vocab_size, embed_size, hidden_size, output_size, rnn_units=None):
        super().__init__()
        if rnn_units is None:
            # Backward-compatible fallback for existing four-argument calls.
            rnn_units = lstm_units
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn1 = nn.RNN(embed_size, rnn_units, batch_first=True)
        self.rnn2 = nn.RNN(rnn_units, rnn_units, batch_first=True)
        self.rnn3 = nn.RNN(rnn_units, rnn_units, batch_first=True)
        self.fc1 = nn.Linear(rnn_units, hidden_size)
        # Registered but not applied in forward(); kept so the module layout is unchanged.
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return one row of ``output_size`` logits per input sequence.

        ``x`` holds integer token ids laid out batch-first. The RNN outputs are
        averaged over dimension 1 (every time step, so any padding positions in
        ``x`` contribute to the mean). No activation, dropout or softmax is
        applied: ``fc1`` feeds ``fc2`` directly and raw logits are returned.
        """
        x = self.embedding(x)
        x, _ = self.rnn1(x)
        x, _ = self.rnn2(x)
        x, _ = self.rnn3(x)

        # Collapse the time dimension into a single vector per sequence.
        x = torch.mean(x, dim=1)
        x = self.fc1(x)
        x = self.fc2(x)
        return x

# Hyperparameters for the three-RNN classifier.
vocab_size = len(vocab)
embed_size, lstm_units, hidden_size = 200, 128, 64
output_size = 5
maxlen = 100  # not used within this cell

# Pick the GPU when one is available, then build the model directly on that device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model6 = RNNModel6(vocab_size, embed_size, hidden_size, output_size).to(device)
model6

RNNModel6(
  (embedding): Embedding(112680, 200)
  (rnn1): RNN(200, 128, batch_first=True)
  (rnn2): RNN(128, 128, batch_first=True)
  (rnn3): RNN(128, 128, batch_first=True)
  (fc1): Linear(in_features=128, out_features=64, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=64, out_features=5, bias=True)
)

In [None]:
# Loss and optimiser for this run.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model6.parameters(), lr=0.001)

num_epochs = 20
model6.train()
for epoch in range(num_epochs):
    # Fresh accumulators each epoch: summed batch loss plus predictions/targets for accuracy.
    train_loss = 0
    all_preds, all_labels = [], []
    for batch in train_dataloader:
        texts = batch[0].to(device)
        # Targets are shifted down by one before being used as class indices.
        labels = batch[1].to(device) - 1

        optimizer.zero_grad()
        outputs = model6(texts)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        preds = torch.max(outputs, dim=1).indices
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

    # Epoch summary: mean of the per-batch losses and accuracy over every batch seen.
    avg_loss = train_loss / len(train_dataloader)
    train_accuracy = accuracy_score(all_labels, all_preds)
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}, Train Accuracy: {train_accuracy:.4f}')

# Held-out evaluation: eval mode, no gradient tracking.
model6.eval()
test_loss = 0
all_preds, all_labels = [], []
with torch.no_grad():
    for batch in test_dataloader:
        texts = batch[0].to(device)
        labels = batch[1].to(device) - 1
        outputs = model6(texts)
        loss = criterion(outputs, labels)
        test_loss += loss.item()
        preds = torch.max(outputs, dim=1).indices
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

test_loss = test_loss / len(test_dataloader)
accuracy = accuracy_score(all_labels, all_preds)
print(f'Test Loss: {test_loss:.4f}, Test Accuracy: {accuracy:.4f}')

Epoch 1/20, Loss: 0.9358, Train Accuracy: 0.6975
Epoch 2/20, Loss: 0.9210, Train Accuracy: 0.6980
Epoch 3/20, Loss: 0.9260, Train Accuracy: 0.6992
Epoch 4/20, Loss: 0.9003, Train Accuracy: 0.6990
Epoch 5/20, Loss: 0.8866, Train Accuracy: 0.6989


KeyboardInterrupt: 

In [None]:

class RNNModel6(nn.Module):
    """Embedding -> three stacked single-layer RNNs -> mean over time -> two linear layers.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_size: Embedding dimension fed to the first RNN.
        hidden_size: Output width of ``fc1`` (and input width of ``fc2``).
        output_size: Number of logits produced per sequence.
        rnn_units: Hidden width of every RNN layer. When ``None`` (the default)
            the notebook-level ``lstm_units`` variable is read at construction
            time, which is what this class did before the parameter existed.
    """

    def __init__(self, vocab_size, embed_size, hidden_size, output_size, rnn_units=None):
        super().__init__()
        if rnn_units is None:
            # Backward-compatible fallback for existing four-argument calls.
            rnn_units = lstm_units
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn1 = nn.RNN(embed_size, rnn_units, batch_first=True)
        self.rnn2 = nn.RNN(rnn_units, rnn_units, batch_first=True)
        self.rnn3 = nn.RNN(rnn_units, rnn_units, batch_first=True)
        self.fc1 = nn.Linear(rnn_units, hidden_size)
        # Registered but not applied in forward(); kept so the module layout is unchanged.
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return one row of ``output_size`` logits per input sequence.

        ``x`` holds integer token ids laid out batch-first. The RNN outputs are
        averaged over dimension 1 (every time step, so any padding positions in
        ``x`` contribute to the mean). No activation, dropout or softmax is
        applied: ``fc1`` feeds ``fc2`` directly and raw logits are returned.
        """
        x = self.embedding(x)
        x, _ = self.rnn1(x)
        x, _ = self.rnn2(x)
        x, _ = self.rnn3(x)

        # Collapse the time dimension into a single vector per sequence.
        x = torch.mean(x, dim=1)
        x = self.fc1(x)
        x = self.fc2(x)
        return x

# Hyperparameters for the three-RNN classifier.
vocab_size = len(vocab)
embed_size, lstm_units, hidden_size = 200, 128, 64
output_size = 5
maxlen = 100  # not used within this cell

# Pick the GPU when one is available, then build the model directly on that device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model6 = RNNModel6(vocab_size, embed_size, hidden_size, output_size).to(device)
model6

RNNModel6(
  (embedding): Embedding(112680, 200)
  (rnn1): RNN(200, 128, batch_first=True)
  (rnn2): RNN(128, 128, batch_first=True)
  (rnn3): RNN(128, 128, batch_first=True)
  (fc1): Linear(in_features=128, out_features=64, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=64, out_features=5, bias=True)
)

In [None]:
# Loss and optimiser for this run (higher learning rate than the previous attempt).
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model6.parameters(), lr=0.002)

num_epochs = 20
model6.train()
for epoch in range(num_epochs):
    # Fresh accumulators each epoch: summed batch loss plus predictions/targets for accuracy.
    train_loss = 0
    all_preds, all_labels = [], []
    for batch in train_dataloader:
        texts = batch[0].to(device)
        # Targets are shifted down by one before being used as class indices.
        labels = batch[1].to(device) - 1

        optimizer.zero_grad()
        outputs = model6(texts)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        preds = torch.max(outputs, dim=1).indices
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

    # Epoch summary: mean of the per-batch losses and accuracy over every batch seen.
    avg_loss = train_loss / len(train_dataloader)
    train_accuracy = accuracy_score(all_labels, all_preds)
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}, Train Accuracy: {train_accuracy:.4f}')

# Held-out evaluation: eval mode, no gradient tracking.
model6.eval()
test_loss = 0
all_preds, all_labels = [], []
with torch.no_grad():
    for batch in test_dataloader:
        texts = batch[0].to(device)
        labels = batch[1].to(device) - 1
        outputs = model6(texts)
        loss = criterion(outputs, labels)
        test_loss += loss.item()
        preds = torch.max(outputs, dim=1).indices
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

test_loss = test_loss / len(test_dataloader)
accuracy = accuracy_score(all_labels, all_preds)
print(f'Test Loss: {test_loss:.4f}, Test Accuracy: {accuracy:.4f}')

Epoch 1/20, Loss: 0.9493, Train Accuracy: 0.6975
Epoch 2/20, Loss: 0.9167, Train Accuracy: 0.6995


KeyboardInterrupt: 

In [None]:

class RNNModel6(nn.Module):
    """Embedding -> single-layer RNN -> four-layer RNN -> mean over time -> two linear layers.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_size: Embedding dimension fed to the first RNN.
        hidden_size: Output width of ``fc1`` (and input width of ``fc2``).
        output_size: Number of logits produced per sequence.
        rnn_units: Hidden width of both RNN modules. When ``None`` (the default)
            the notebook-level ``rnn_units`` variable is read at construction
            time, which is what this class did before the parameter existed.
    """

    def __init__(self, vocab_size, embed_size, hidden_size, output_size, rnn_units=None):
        super().__init__()
        if rnn_units is None:
            # Backward-compatible fallback for existing four-argument calls:
            # the parameter shadows the global of the same name, so look it up explicitly.
            rnn_units = globals()["rnn_units"]
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn1 = nn.RNN(embed_size, rnn_units, batch_first=True)
        # Third positional argument of nn.RNN is num_layers; spelled out for readability.
        self.rnn2 = nn.RNN(rnn_units, rnn_units, num_layers=4, batch_first=True)
        self.fc1 = nn.Linear(rnn_units, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return one row of ``output_size`` logits per input sequence.

        ``x`` holds integer token ids laid out batch-first. The RNN outputs are
        averaged over dimension 1 (every time step, so any padding positions in
        ``x`` contribute to the mean). No activation, dropout or softmax is
        applied: ``fc1`` feeds ``fc2`` directly and raw logits are returned.
        """
        x = self.embedding(x)
        x, _ = self.rnn1(x)
        x, _ = self.rnn2(x)

        # Collapse the time dimension into a single vector per sequence.
        x = torch.mean(x, dim=1)
        x = self.fc1(x)
        x = self.fc2(x)
        return x

# Hyperparameters for the 1 + 4-layer RNN classifier.
vocab_size = len(vocab)
embed_size, rnn_units, hidden_size = 200, 128, 64
output_size = 5
maxlen = 100  # not used within this cell

# Pick the GPU when one is available, then build the model directly on that device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model6 = RNNModel6(vocab_size, embed_size, hidden_size, output_size).to(device)
model6

RNNModel6(
  (embedding): Embedding(112680, 200)
  (rnn1): RNN(200, 128, batch_first=True)
  (rnn2): RNN(128, 128, num_layers=4, batch_first=True)
  (fc1): Linear(in_features=128, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=5, bias=True)
)

In [None]:
# Loss and optimiser for this run.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model6.parameters(), lr=0.001)

num_epochs = 20
model6.train()
for epoch in range(num_epochs):
    # Fresh accumulators each epoch: summed batch loss plus predictions/targets for accuracy.
    train_loss = 0
    all_preds, all_labels = [], []
    for batch in train_dataloader:
        texts = batch[0].to(device)
        # Targets are shifted down by one before being used as class indices.
        labels = batch[1].to(device) - 1

        optimizer.zero_grad()
        outputs = model6(texts)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        preds = torch.max(outputs, dim=1).indices
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

    # Epoch summary: mean of the per-batch losses and accuracy over every batch seen.
    avg_loss = train_loss / len(train_dataloader)
    train_accuracy = accuracy_score(all_labels, all_preds)
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}, Train Accuracy: {train_accuracy:.4f}')

# Held-out evaluation: eval mode, no gradient tracking.
model6.eval()
test_loss = 0
all_preds, all_labels = [], []
with torch.no_grad():
    for batch in test_dataloader:
        texts = batch[0].to(device)
        labels = batch[1].to(device) - 1
        outputs = model6(texts)
        loss = criterion(outputs, labels)
        test_loss += loss.item()
        preds = torch.max(outputs, dim=1).indices
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

test_loss = test_loss / len(test_dataloader)
accuracy = accuracy_score(all_labels, all_preds)
print(f'Test Loss: {test_loss:.4f}, Test Accuracy: {accuracy:.4f}')

Epoch 1/20, Loss: 0.9495, Train Accuracy: 0.6961
Epoch 2/20, Loss: 0.9282, Train Accuracy: 0.6994
Epoch 3/20, Loss: 0.8975, Train Accuracy: 0.6989
Epoch 4/20, Loss: 0.8879, Train Accuracy: 0.6996
Epoch 5/20, Loss: 0.8843, Train Accuracy: 0.6997
Epoch 6/20, Loss: 0.8758, Train Accuracy: 0.6996
Epoch 7/20, Loss: 0.8695, Train Accuracy: 0.6994
Epoch 8/20, Loss: 0.8542, Train Accuracy: 0.6993
Epoch 9/20, Loss: 0.8606, Train Accuracy: 0.6995
Epoch 10/20, Loss: 0.8542, Train Accuracy: 0.6995


KeyboardInterrupt: 

In [None]:

class RNNModel6(nn.Module):
    """Embedding -> single-layer RNN -> two-layer RNN -> mean over time -> one linear layer.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_size: Embedding dimension fed to the first RNN.
        hidden_size: Stored on the instance only; no layer in this variant uses it.
        output_size: Number of logits produced per sequence.
        rnn_units: Hidden width of both RNN modules. When ``None`` (the default)
            the notebook-level ``rnn_units`` variable is read at construction
            time, which is what this class did before the parameter existed.
    """

    def __init__(self, vocab_size, embed_size, hidden_size, output_size, rnn_units=None):
        super().__init__()
        if rnn_units is None:
            # Backward-compatible fallback for existing four-argument calls:
            # the parameter shadows the global of the same name, so look it up explicitly.
            rnn_units = globals()["rnn_units"]
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn1 = nn.RNN(embed_size, rnn_units, batch_first=True)
        # Third positional argument of nn.RNN is num_layers; spelled out for readability.
        self.rnn2 = nn.RNN(rnn_units, rnn_units, num_layers=2, batch_first=True)
        self.fc2 = nn.Linear(rnn_units, output_size)

    def forward(self, x):
        """Return one row of ``output_size`` logits per input sequence.

        ``x`` holds integer token ids laid out batch-first. The RNN outputs are
        averaged over dimension 1 (every time step, so any padding positions in
        ``x`` contribute to the mean) and projected to raw logits by ``fc2``.
        """
        x = self.embedding(x)
        x, _ = self.rnn1(x)
        x, _ = self.rnn2(x)
        # Collapse the time dimension into a single vector per sequence.
        x = torch.mean(x, dim=1)
        x = self.fc2(x)
        return x

# Hyperparameters for the 1 + 2-layer RNN classifier with a single FC head.
vocab_size = len(vocab)
embed_size, rnn_units, hidden_size = 200, 128, 64
output_size = 5
maxlen = 100  # not used within this cell

# Pick the GPU when one is available, then build the model directly on that device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model6 = RNNModel6(vocab_size, embed_size, hidden_size, output_size).to(device)
model6

RNNModel6(
  (embedding): Embedding(30522, 200)
  (rnn1): RNN(200, 128, batch_first=True)
  (rnn2): RNN(128, 128, num_layers=2, batch_first=True)
  (fc2): Linear(in_features=128, out_features=5, bias=True)
)

In [None]:
# Loss and optimiser for this run.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model6.parameters(), lr=0.0007)

num_epochs = 10
model6.train()
for epoch in range(num_epochs):
    # Fresh accumulators each epoch: summed batch loss plus predictions/targets for accuracy.
    train_loss = 0
    all_preds, all_labels = [], []
    for batch in train_dataloader:
        texts = batch[0].to(device)
        # Targets are shifted down by one before being used as class indices.
        labels = batch[1].to(device) - 1

        optimizer.zero_grad()
        outputs = model6(texts)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        preds = torch.max(outputs, dim=1).indices
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

    # Epoch summary: mean of the per-batch losses and accuracy over every batch seen.
    avg_loss = train_loss / len(train_dataloader)
    train_accuracy = accuracy_score(all_labels, all_preds)
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}, Train Accuracy: {train_accuracy:.4f}')

# Held-out evaluation: eval mode, no gradient tracking.
model6.eval()
test_loss = 0
all_preds, all_labels = [], []
with torch.no_grad():
    for batch in test_dataloader:
        texts = batch[0].to(device)
        labels = batch[1].to(device) - 1
        outputs = model6(texts)
        loss = criterion(outputs, labels)
        test_loss += loss.item()
        preds = torch.max(outputs, dim=1).indices
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

test_loss = test_loss / len(test_dataloader)
accuracy = accuracy_score(all_labels, all_preds)
print(f'Test Loss: {test_loss:.4f}, Test Accuracy: {accuracy:.4f}')

Epoch 1/10, Loss: 0.9398, Train Accuracy: 0.6970
Epoch 2/10, Loss: 0.9271, Train Accuracy: 0.6961
Epoch 3/10, Loss: 0.9057, Train Accuracy: 0.6976
Epoch 4/10, Loss: 0.8908, Train Accuracy: 0.6988
Epoch 5/10, Loss: 0.8770, Train Accuracy: 0.6994
Epoch 6/10, Loss: 0.8353, Train Accuracy: 0.7032
Epoch 7/10, Loss: 0.8215, Train Accuracy: 0.7039
Epoch 8/10, Loss: 0.8005, Train Accuracy: 0.7076
Epoch 9/10, Loss: 0.7895, Train Accuracy: 0.7113
Epoch 10/10, Loss: 0.7949, Train Accuracy: 0.7106
Test Loss: 0.8130, Test Accuracy: 0.7058


# **RNN Balanced**

# RNN Balanced, BERT tokenizer, one FC layer

In [None]:
# Loss and optimiser for the balanced-data run.
# NOTE(review): model6 is not re-created in this cell; if the instance trained in an
# earlier cell is still in the kernel, optimisation continues from those weights — confirm.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model6.parameters(), lr=0.005)

num_epochs = 20
model6.train()
for epoch in range(num_epochs):
    # Fresh accumulators each epoch: summed batch loss plus predictions/targets for accuracy.
    train_loss = 0
    all_preds, all_labels = [], []
    for batch in train_balancedloader:
        texts = batch[0].to(device)
        # Targets are shifted down by one before being used as class indices.
        labels = batch[1].to(device) - 1

        optimizer.zero_grad()
        outputs = model6(texts)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        preds = torch.max(outputs, dim=1).indices
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

    # Epoch summary: mean of the per-batch losses and accuracy over every batch seen.
    avg_loss = train_loss / len(train_balancedloader)
    train_accuracy = accuracy_score(all_labels, all_preds)
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}, Train Accuracy: {train_accuracy:.4f}')

# Held-out evaluation: eval mode, no gradient tracking.
model6.eval()
test_loss = 0
all_preds, all_labels = [], []
with torch.no_grad():
    for batch in test_balancedloader:
        texts = batch[0].to(device)
        labels = batch[1].to(device) - 1
        outputs = model6(texts)
        loss = criterion(outputs, labels)
        test_loss += loss.item()
        preds = torch.max(outputs, dim=1).indices
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

test_loss = test_loss / len(test_balancedloader)
accuracy = accuracy_score(all_labels, all_preds)
print(f'Test Loss: {test_loss:.4f}, Test Accuracy: {accuracy:.4f}')

Epoch 1/20, Loss: 1.6053, Train Accuracy: 0.2224
Epoch 2/20, Loss: 1.5991, Train Accuracy: 0.2279
Epoch 3/20, Loss: 1.5818, Train Accuracy: 0.2410
Epoch 4/20, Loss: 1.5649, Train Accuracy: 0.2488
Epoch 5/20, Loss: 1.5489, Train Accuracy: 0.2758
Epoch 6/20, Loss: 1.5108, Train Accuracy: 0.2982
Epoch 7/20, Loss: 1.5178, Train Accuracy: 0.3103
Epoch 8/20, Loss: 1.5083, Train Accuracy: 0.2941
Epoch 9/20, Loss: 1.5029, Train Accuracy: 0.2991
Epoch 10/20, Loss: 1.4908, Train Accuracy: 0.3189
Epoch 11/20, Loss: 1.4642, Train Accuracy: 0.3405
Epoch 12/20, Loss: 1.4454, Train Accuracy: 0.3330
Epoch 13/20, Loss: 1.4416, Train Accuracy: 0.3366
Epoch 14/20, Loss: 1.4408, Train Accuracy: 0.3338
Epoch 15/20, Loss: 1.5277, Train Accuracy: 0.2991
Epoch 16/20, Loss: 1.4326, Train Accuracy: 0.3368
Epoch 17/20, Loss: 1.5101, Train Accuracy: 0.3160
Epoch 18/20, Loss: 1.4660, Train Accuracy: 0.3400
Epoch 19/20, Loss: 1.3987, Train Accuracy: 0.3732
Epoch 20/20, Loss: 1.3110, Train Accuracy: 0.3984
Test Loss

# RNN with two FC layers

In [None]:

class RNNModel6(nn.Module):
    """Embedding -> single-layer RNN -> two-layer RNN -> mean over time -> two linear layers.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_size: Embedding dimension fed to the first RNN.
        hidden_size: Output width of ``fc1`` (and input width of ``fc2``).
        output_size: Number of logits produced per sequence.
        rnn_units: Hidden width of both RNN modules. When ``None`` (the default)
            the notebook-level ``rnn_units`` variable is read at construction
            time, which is what this class did before the parameter existed.
    """

    def __init__(self, vocab_size, embed_size, hidden_size, output_size, rnn_units=None):
        super().__init__()
        if rnn_units is None:
            # Backward-compatible fallback for existing four-argument calls:
            # the parameter shadows the global of the same name, so look it up explicitly.
            rnn_units = globals()["rnn_units"]
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn1 = nn.RNN(embed_size, rnn_units, batch_first=True)
        # Third positional argument of nn.RNN is num_layers; spelled out for readability.
        self.rnn2 = nn.RNN(rnn_units, rnn_units, num_layers=2, batch_first=True)
        self.fc1 = nn.Linear(rnn_units, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return one row of ``output_size`` logits per input sequence.

        ``x`` holds integer token ids laid out batch-first. The RNN outputs are
        averaged over dimension 1 (every time step, so any padding positions in
        ``x`` contribute to the mean). ``fc1`` feeds ``fc2`` directly, with no
        activation in between, and raw logits are returned.
        """
        x = self.embedding(x)
        x, _ = self.rnn1(x)
        x, _ = self.rnn2(x)
        # Collapse the time dimension into a single vector per sequence.
        x = torch.mean(x, dim=1)
        x = self.fc1(x)
        x = self.fc2(x)
        return x

# Hyperparameters for the 1 + 2-layer RNN classifier with two FC layers (wider RNN state).
vocab_size = len(vocab)
embed_size, rnn_units, hidden_size = 200, 256, 64
output_size = 5
maxlen = 100  # not used within this cell

# Pick the GPU when one is available, then build the model directly on that device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model6 = RNNModel6(vocab_size, embed_size, hidden_size, output_size).to(device)
model6

RNNModel6(
  (embedding): Embedding(30522, 200)
  (rnn1): RNN(200, 256, batch_first=True)
  (rnn2): RNN(256, 256, num_layers=2, batch_first=True)
  (fc1): Linear(in_features=256, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=5, bias=True)
)

# RNN Balanced, Torch tokenizer, two FC layers

In [None]:
# Loss and optimiser for the balanced-data run.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model6.parameters(), lr=0.0003)

num_epochs = 20
model6.train()
for epoch in range(num_epochs):
    # Fresh accumulators each epoch: summed batch loss plus predictions/targets for accuracy.
    train_loss = 0
    all_preds, all_labels = [], []
    for batch in train_balancedloader:
        texts = batch[0].to(device)
        # Targets are shifted down by one before being used as class indices.
        labels = batch[1].to(device) - 1

        optimizer.zero_grad()
        outputs = model6(texts)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        preds = torch.max(outputs, dim=1).indices
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

    # Epoch summary: mean of the per-batch losses and accuracy over every batch seen.
    avg_loss = train_loss / len(train_balancedloader)
    train_accuracy = accuracy_score(all_labels, all_preds)
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}, Train Accuracy: {train_accuracy:.4f}')

# Held-out evaluation: eval mode, no gradient tracking.
model6.eval()
test_loss = 0
all_preds, all_labels = [], []
with torch.no_grad():
    for batch in test_balancedloader:
        texts = batch[0].to(device)
        labels = batch[1].to(device) - 1
        outputs = model6(texts)
        loss = criterion(outputs, labels)
        test_loss += loss.item()
        preds = torch.max(outputs, dim=1).indices
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

test_loss = test_loss / len(test_balancedloader)
accuracy = accuracy_score(all_labels, all_preds)
print(f'Test Loss: {test_loss:.4f}, Test Accuracy: {accuracy:.4f}')

Epoch 1/20, Loss: 1.6093, Train Accuracy: 0.2066
Epoch 2/20, Loss: 1.6049, Train Accuracy: 0.2230
Epoch 3/20, Loss: 1.6014, Train Accuracy: 0.2314
Epoch 4/20, Loss: 1.5893, Train Accuracy: 0.2447
Epoch 5/20, Loss: 1.5313, Train Accuracy: 0.2926
Epoch 6/20, Loss: 1.4635, Train Accuracy: 0.3287
Epoch 7/20, Loss: 1.4170, Train Accuracy: 0.3546
Epoch 8/20, Loss: 1.4611, Train Accuracy: 0.3470
Epoch 9/20, Loss: 1.7289, Train Accuracy: 0.3164
Epoch 10/20, Loss: 1.5812, Train Accuracy: 0.2586
Epoch 11/20, Loss: 1.5560, Train Accuracy: 0.3168
Epoch 12/20, Loss: 1.4113, Train Accuracy: 0.3628
Epoch 13/20, Loss: 1.4085, Train Accuracy: 0.3604
Epoch 14/20, Loss: 1.3429, Train Accuracy: 0.3930
Epoch 15/20, Loss: 1.3145, Train Accuracy: 0.4101
Epoch 16/20, Loss: 1.2874, Train Accuracy: 0.4231
Epoch 17/20, Loss: 1.2640, Train Accuracy: 0.4399
Epoch 18/20, Loss: 1.2194, Train Accuracy: 0.4517
Epoch 19/20, Loss: 1.1911, Train Accuracy: 0.4722
Epoch 20/20, Loss: 1.1564, Train Accuracy: 0.4901
Test Loss

# **Second Part**

# Hold out reviews with label 2 or 4 as a separate test set

In [None]:
# Partition the reviews that have text by rating: 2 and 4 are set aside,
# every other rating stays in the filtered pool.
filtered_data = []
separated_data = []
for entry in data:
    if 'reviewText' not in entry:
        continue
    if entry['overall'] in [2, 4]:
        separated_data.append(entry)
    else:
        filtered_data.append(entry)

# Both lists already contain only entries with 'reviewText', so no further filtering is needed.
# Labels and encoded texts for the filtered pool.
filtered_labels = [entry['overall'] for entry in filtered_data]
filtered_texts = [text_pipeline(entry['reviewText']) for entry in filtered_data]

# Labels and encoded texts for the set-aside ratings.
separated_labels = [entry['overall'] for entry in separated_data]
separated_texts = [text_pipeline(entry['reviewText']) for entry in separated_data]

In [None]:
# 80/20 random train/test split of the filtered reviews.
filtered_dataset = ReviewsDataset(filtered_texts, filtered_labels)
filtered_train_size = int(0.8 * len(filtered_dataset))
filtered_test_size = len(filtered_dataset) - filtered_train_size
filtered_train, filtered_test = random_split(filtered_dataset, [filtered_train_size, filtered_test_size])

# The separated reviews are not split; the whole set gets its own loader.
separated_dataset = ReviewsDataset(separated_texts, separated_labels)

filtered_train_dataloader = DataLoader(filtered_train, batch_size=64, shuffle=True, collate_fn=collate_fn)
# Evaluation gains nothing from shuffling; a fixed order matches the balanced
# test loader built later in the notebook (shuffle=False).
filtered_test_dataloader = DataLoader(filtered_test, batch_size=64, shuffle=False, collate_fn=collate_fn)
separated_dataloader = DataLoader(separated_dataset, batch_size=64, shuffle=True, collate_fn=collate_fn)

# Filtered & Unbalanced

In [None]:
# Loss and optimiser for the filtered (ratings 2 and 4 removed), unbalanced run.
# NOTE(review): model4 is not constructed in this cell; whatever instance is in the
# kernel (possibly already trained) is optimised further — confirm this is intended.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model4.parameters(), lr=0.001)

num_epochs = 10
model4.train()
for epoch in range(num_epochs):
    # Fresh accumulators each epoch: summed batch loss plus predictions/targets for accuracy.
    train_loss = 0
    all_preds, all_labels = [], []
    for batch in filtered_train_dataloader:
        texts = batch[0].to(device)
        # Targets are shifted down by one before being used as class indices.
        labels = batch[1].to(device) - 1

        optimizer.zero_grad()
        outputs = model4(texts)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        preds = torch.max(outputs, dim=1).indices
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

    # Epoch summary: mean of the per-batch losses and accuracy over every batch seen.
    avg_loss = train_loss / len(filtered_train_dataloader)
    train_accuracy = accuracy_score(all_labels, all_preds)
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}, Train Accuracy: {train_accuracy:.4f}')

# Held-out evaluation: eval mode, no gradient tracking.
model4.eval()
test_loss = 0
all_preds, all_labels = [], []
with torch.no_grad():
    for batch in filtered_test_dataloader:
        texts = batch[0].to(device)
        labels = batch[1].to(device) - 1
        outputs = model4(texts)
        loss = criterion(outputs, labels)
        test_loss += loss.item()
        preds = torch.max(outputs, dim=1).indices
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

test_loss = test_loss / len(filtered_test_dataloader)
accuracy = accuracy_score(all_labels, all_preds)
print(f'Test Loss: {test_loss:.4f}, Test Accuracy: {accuracy:.4f}')

Epoch 1/10, Loss: 0.2879, Train Accuracy: 0.8986
Epoch 2/10, Loss: 0.1814, Train Accuracy: 0.9345
Epoch 3/10, Loss: 0.1353, Train Accuracy: 0.9531
Epoch 4/10, Loss: 0.0989, Train Accuracy: 0.9671
Epoch 5/10, Loss: 0.0739, Train Accuracy: 0.9765
Epoch 6/10, Loss: 0.0578, Train Accuracy: 0.9820
Epoch 7/10, Loss: 0.0480, Train Accuracy: 0.9851
Epoch 8/10, Loss: 0.0422, Train Accuracy: 0.9869
Epoch 9/10, Loss: 0.0376, Train Accuracy: 0.9882
Epoch 10/10, Loss: 0.0340, Train Accuracy: 0.9896
Test Loss: 0.3247, Test Accuracy: 0.9318


In [None]:
def select_random_samples(dataloader, num_samples):
    """Draw ``num_samples`` (input, label) pairs at random, without replacement.

    Every batch yielded by ``dataloader`` is flattened into individual pairs
    first, so the whole loader is consumed once before sampling.

    Args:
        dataloader: Iterable of two-item batches ``(inputs, labels)``; the two
            members of each batch are zipped element-wise.
        num_samples: Number of pairs to draw.

    Returns:
        A pair ``(inputs, labels)`` of equally long tuples holding the sampled
        inputs and their labels. Both tuples are empty when ``num_samples`` is 0.

    Raises:
        ValueError: If ``num_samples`` is negative or larger than the number of
            available pairs (raised by ``random.sample``).
    """
    all_samples = [
        pair
        for batch_inputs, batch_labels in dataloader
        for pair in zip(batch_inputs, batch_labels)
    ]

    random_samples = random.sample(all_samples, num_samples)
    if not random_samples:
        # zip(*[]) yields nothing, so unpacking it into two names would raise.
        return (), ()
    inputs, labels = zip(*random_samples)
    return inputs, labels

# Draw 200 random reviews from the separated (rating 2 / 4) loader.
random_inputs, random_labels = select_random_samples(separated_dataloader, 200)

# Stack the sampled sequences into one batch-first tensor and collect their labels.
inputs_tensor = pad_sequence(random_inputs, batch_first=True)
labels_tensor = torch.tensor(random_labels)

# Forward pass in eval mode without gradient tracking.
model4.eval()
with torch.no_grad():
    inputs_tensor = inputs_tensor.to(device)
    outputs = model4(inputs_tensor)

# Index of the largest logit per row (zero-based class index).
predicted_labels = outputs.argmax(dim=1)


def decode_text(numerical_text):
    """Turn token ids back into a space-joined string, skipping ids equal to ``vocab['<unk>']``."""
    kept_tokens = []
    for token in numerical_text:
        if token != vocab['<unk>']:
            kept_tokens.append(vocab.lookup_token(token))
    return ' '.join(kept_tokens)


# Show each sampled review next to its actual and predicted rating;
# the +1 undoes the `labels - 1` shift applied during training.
for i, sample in enumerate(random_inputs):
    decoded_input = decode_text(sample)
    print(f"Sample {i+1}:")
    print(f"Text: {decoded_input}")
    print(f"Expected output (Actual label): {labels_tensor[i]}")
    print(f"Current output (Predicted label): {predicted_labels[i]+1}")
    print("-" * 30)

Sample 1:
Text: good shoulder rest
Expected output (Actual label): 4
Current output (Predicted label): 5
------------------------------
Sample 2:
Text: fun pedal . little bit fiddly because the pedal is pretty ambitious for the size of the knobs .
Expected output (Actual label): 4
Current output (Predicted label): 5
------------------------------
Sample 3:
Text: well , its chinese made which quality wise raises a red flag with me ( no pun intended ) . besides that it makes a good starter harmonica for someone who wants to learn how to play one .
Expected output (Actual label): 4
Current output (Predicted label): 5
------------------------------
Sample 4:
Text: smooth playability . i strung a second guitar and they did not last very long sitting idle on a stand . picked it up about 3 months later and they seemed harsh . if these are going on your main axe and you will be restringing at least once a month . go for it !
Expected output (Actual label): 4
Current output (Predicted label): 5

# Filtered & Balanced

In [None]:
# Class distribution of the filtered pool before balancing.
class_counts = Counter(filtered_labels)
print(class_counts)

# Every class is capped at the size of the rarest one.
min_class_size = min(class_counts.values())
max_class_size = min_class_size

# Down-sample each over-represented class to the cap; smaller classes are kept whole.
balanced_indices = []
for label in class_counts:
    indices = [i for i in range(len(filtered_labels)) if filtered_labels[i] == label]
    if len(indices) > max_class_size:
        indices = random.sample(indices, max_class_size)
    balanced_indices.extend(indices)

# Materialise the balanced texts/labels and wrap them in a dataset.
filtered_balanced_texts = []
filtered_balanced_labels = []
for i in balanced_indices:
    filtered_balanced_texts.append(filtered_texts[i])
    filtered_balanced_labels.append(filtered_labels[i])
filtered_balanced_dataset = ReviewsDataset(filtered_balanced_texts, filtered_balanced_labels)

# 80/20 random train/test split of the balanced dataset.
filtered_train_balanced_size = int(0.8 * len(filtered_balanced_dataset))
filtered_test_balanced_size = len(filtered_balanced_dataset) - filtered_train_balanced_size
filtered_train_balanced, filtered_test_balanced = random_split(
    filtered_balanced_dataset,
    [filtered_train_balanced_size, filtered_test_balanced_size],
)

filtered_train_balancedloader = DataLoader(
    filtered_train_balanced, batch_size=32, shuffle=True, collate_fn=collate_fn
)
filtered_test_balancedloader = DataLoader(
    filtered_test_balanced, batch_size=32, shuffle=False, collate_fn=collate_fn
)

Counter({5.0: 161798, 3.0: 16029, 1.0: 7465})


In [None]:
# Sanity check: count the labels of the balanced selection to confirm the classes are equal in size.
class_counts = Counter(filtered_balanced_labels)
print(class_counts)

Counter({5.0: 7465, 1.0: 7465, 3.0: 7465})


In [None]:
# Loss and optimiser (with L2 weight decay) for the filtered, balanced run.
# NOTE(review): model5 is not re-created in this cell; if the instance trained in an
# earlier cell is still in the kernel, optimisation continues from those weights — confirm.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model5.parameters(), lr=0.001, weight_decay = 1e-5)

num_epochs = 20
model5.train()
for epoch in range(num_epochs):
    # Fresh accumulators each epoch: summed batch loss plus predictions/targets for accuracy.
    train_loss = 0
    all_preds, all_labels = [], []
    for batch in filtered_train_balancedloader:
        texts = batch[0].to(device)
        # Targets are shifted down by one before being used as class indices.
        labels = batch[1].to(device) - 1

        optimizer.zero_grad()
        outputs = model5(texts)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        preds = torch.max(outputs, dim=1).indices
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

    # Epoch summary: mean of the per-batch losses and accuracy over every batch seen.
    avg_loss = train_loss / len(filtered_train_balancedloader)
    train_accuracy = accuracy_score(all_labels, all_preds)
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}, Train Accuracy: {train_accuracy:.4f}')

# Held-out evaluation: eval mode, no gradient tracking.
model5.eval()
test_loss = 0
all_preds, all_labels = [], []
with torch.no_grad():
    for batch in filtered_test_balancedloader:
        texts = batch[0].to(device)
        labels = batch[1].to(device) - 1
        outputs = model5(texts)
        loss = criterion(outputs, labels)
        test_loss += loss.item()
        preds = torch.max(outputs, dim=1).indices
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

test_loss = test_loss / len(filtered_test_balancedloader)
accuracy = accuracy_score(all_labels, all_preds)
print(f'Test Loss: {test_loss:.4f}, Test Accuracy: {accuracy:.4f}')

Epoch 1/20, Loss: 0.9396, Train Accuracy: 0.5049
Epoch 2/20, Loss: 0.6077, Train Accuracy: 0.7386
Epoch 3/20, Loss: 0.4586, Train Accuracy: 0.8171
Epoch 4/20, Loss: 0.3519, Train Accuracy: 0.8667
Epoch 5/20, Loss: 0.2520, Train Accuracy: 0.9121
Epoch 6/20, Loss: 0.1828, Train Accuracy: 0.9388
Epoch 7/20, Loss: 0.1355, Train Accuracy: 0.9545
Epoch 8/20, Loss: 0.1150, Train Accuracy: 0.9602
Epoch 9/20, Loss: 0.0997, Train Accuracy: 0.9665
Epoch 10/20, Loss: 0.0856, Train Accuracy: 0.9712
Epoch 11/20, Loss: 0.0786, Train Accuracy: 0.9734
Epoch 12/20, Loss: 0.0731, Train Accuracy: 0.9746
Epoch 13/20, Loss: 0.0705, Train Accuracy: 0.9762
Epoch 14/20, Loss: 0.0660, Train Accuracy: 0.9772
Epoch 15/20, Loss: 0.0645, Train Accuracy: 0.9772
Epoch 16/20, Loss: 0.0648, Train Accuracy: 0.9778
Epoch 17/20, Loss: 0.0589, Train Accuracy: 0.9783
Epoch 18/20, Loss: 0.0521, Train Accuracy: 0.9824
Epoch 19/20, Loss: 0.0517, Train Accuracy: 0.9814
Epoch 20/20, Loss: 0.0555, Train Accuracy: 0.9805
Test Loss

In [None]:
random_inputs, random_labels = select_random_samples(separated_dataloader, 200)

# Step 3: Prepare the inputs for the model
inputs_tensor = pad_sequence(random_inputs, batch_first=True)
labels_tensor = torch.tensor(random_labels)

# Step 4: Pass the inputs through the model
model5.eval()
with torch.no_grad():
    inputs_tensor = inputs_tensor.to(device)
    outputs = model5(inputs_tensor)

# Step 5: Evaluate the model's performance
predicted_labels = torch.argmax(outputs, dim=1)
def decode_text(numerical_text, vocabulary=None):
    """Turn a sequence of token ids back into a space-separated string.

    Ids equal to the vocabulary's ``<unk>`` index are dropped, so unknown
    words simply disappear from the decoded text.

    Args:
        numerical_text: Iterable of token ids (plain ints or 0-dim tensors).
        vocabulary: Object supporting ``vocabulary['<unk>']`` and
            ``vocabulary.lookup_token(index)``. Defaults to the notebook-level
            ``vocab`` when omitted.

    Returns:
        str: The decoded tokens joined by single spaces ('' for empty input).
    """
    if vocabulary is None:
        vocabulary = vocab
    # The <unk> index never changes during the loop, so resolve it once.
    unk_index = vocabulary['<unk>']
    # Normalize each id to a plain int so tensor elements and ints behave alike.
    token_ids = (int(token) for token in numerical_text)
    return ' '.join(
        vocabulary.lookup_token(index) for index in token_ids if index != unk_index
    )

# Report every sampled review next to its true label and the model's prediction.
divider = "-" * 30
for position, token_ids in enumerate(random_inputs):
    review_text = decode_text(token_ids)
    expected = labels_tensor[position]
    # The printed prediction is the argmax index shifted up by one.
    predicted = predicted_labels[position] + 1
    print(f"Sample {position + 1}:")
    print(f"Text: {review_text}")
    print(f"Expected output (Actual label): {expected}")
    print(f"Current output (Predicted label): {predicted}")
    print(divider)

Sample 1:
Text: good solid feel . price was right and serves the purpose .
Expected output (Actual label): 4
Current output (Predicted label): 3
------------------------------
Sample 2:
Text: good
Expected output (Actual label): 4
Current output (Predicted label): 5
------------------------------
Sample 3:
Text: serves the purpose very well and looks awesome in my home studio
Expected output (Actual label): 4
Current output (Predicted label): 5
------------------------------
Sample 4:
Text: i like it a lot . i do wish the it was little longer . i could add my light tripod if it was . i do like the durable quality of the material .
Expected output (Actual label): 4
Current output (Predicted label): 5
------------------------------
Sample 5:
Text: it ain ' t fancy , but it does its job well for the price . folk who said it felt flimsy must either be new to mandolin straps or yank pretty hard whether at play or rest . i have an a style mandolin & had to put it on when re-stringing because

# IMDB Loading

In [None]:
# Load the IMDB reviews CSV into a DataFrame.
# NOTE(review): the download cell hands 'imdb.csv' to wget as a second URL (it fails to
# resolve) rather than as an output name; confirm the file really lands as 'IMDB Dataset.csv'.
imdb = pd.read_csv('IMDB Dataset.csv')

In [None]:
# Preview the first rows to check the two columns: review text and sentiment label.
imdb.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
# Summarize column dtypes and non-null counts.
imdb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [None]:
# Encode every IMDB review with `text_pipeline` and map the sentiment strings to 0/1.
label_map = {"negative": 0, "positive": 1}
texts = list(map(text_pipeline, imdb['review'].tolist()))
labels = [label_map[sentiment] for sentiment in imdb['sentiment'].tolist()]

# Wrap the encoded reviews in a shuffled loader and pull 200 samples from it.
imdb_dataset = ReviewsDataset(texts, labels)
imdb_dataloader = DataLoader(
    imdb_dataset,
    batch_size=64,
    shuffle=True,
    collate_fn=collate_fn,
)
random_inputs, random_labels = select_random_samples(imdb_dataloader, 200)

# Collect the sampled labels into a tensor and pad the sequences into one batch.
labels_tensor = torch.tensor(random_labels)
inputs_tensor = pad_sequence(random_inputs, batch_first=True)


# IMDB predictions: model trained on unbalanced data (model4)

In [None]:
# Run model4 on the sampled IMDB batch (eval mode, autograd disabled).
model4.eval()
with torch.no_grad():
    inputs_tensor = inputs_tensor.to(device)
    outputs = model4(inputs_tensor)

# Step 5: Evaluate the model's performance
predicted_labels = torch.argmax(outputs, dim=1)

# argmax yields a 0-based class index. The Amazon sample printout reports
# `predicted_labels[i]+1`, so apply the same shift here to keep every
# "Predicted label" line in the notebook on one scale.
# NOTE(review): assumes model4 uses the same 0-based rating classes as model5 — confirm.
predicted_ratings = (predicted_labels + 1).tolist()
# IMDB ground truth is binary (label_map: negative=0, positive=1), so the
# expected value is a sentiment word while the prediction is a rating number.
actual_flags = labels_tensor.tolist()
for i in range(len(random_inputs)):
    decoded_input = decode_text(random_inputs[i])
    actual_label = "positive" if actual_flags[i] == 1 else "negative"
    print(f"Sample {i+1}:")
    print(f"Text: {decoded_input}")
    print(f"Expected output (Actual label): {actual_label}")
    print(f"Current output (Predicted label): {predicted_ratings[i]}")
    print("-" * 30)


Sample 1:
Text: the film is about the battle of . for those of you who don ' t know anything about it , it was the worst battle in the second world war . over 1 million people died in the course of the battle . this is the only film that i ' ve seen that seems to have actually captured how bad things were in the war between russia and germany . what i really liked about it is that the two ( and communism ) were nowhere in the film . unlike most american films , the germans are not seen as blood thirsty , but what the average german foot soldier was , a person . the film around four soldiers fighting in . they were transferred there to try and take the city . the film follows these men from august of to early 1943 . during this time , they learn about the horrors of war and try to find a way out of the battle . through the entire film , one feels the desperation of the entire battle . unlike enemy at the gates the film makers didn ' t try to put some sappy love story or dress up occurre

# IMDB predictions: model trained on balanced data (model5)

In [None]:
# Run model5 on the sampled IMDB batch (eval mode, autograd disabled).
model5.eval()
with torch.no_grad():
    inputs_tensor = inputs_tensor.to(device)
    outputs = model5(inputs_tensor)

# Step 5: Evaluate the model's performance
predicted_labels = torch.argmax(outputs, dim=1)

# model5 is scored against `labels - 1`, so argmax yields a 0-based class
# index. The Amazon sample printout reports `predicted_labels[i]+1`; apply the
# same shift here so every "Predicted label" line is on the same rating scale.
predicted_ratings = (predicted_labels + 1).tolist()
# IMDB ground truth is binary (label_map: negative=0, positive=1), so the
# expected value is a sentiment word while the prediction is a rating number.
actual_flags = labels_tensor.tolist()
for i in range(len(random_inputs)):
    decoded_input = decode_text(random_inputs[i])
    actual_label = "positive" if actual_flags[i] == 1 else "negative"
    print(f"Sample {i+1}:")
    print(f"Text: {decoded_input}")
    print(f"Expected output (Actual label): {actual_label}")
    print(f"Current output (Predicted label): {predicted_ratings[i]}")
    print("-" * 30)


Sample 1:
Text: the film is about the battle of . for those of you who don ' t know anything about it , it was the worst battle in the second world war . over 1 million people died in the course of the battle . this is the only film that i ' ve seen that seems to have actually captured how bad things were in the war between russia and germany . what i really liked about it is that the two ( and communism ) were nowhere in the film . unlike most american films , the germans are not seen as blood thirsty , but what the average german foot soldier was , a person . the film around four soldiers fighting in . they were transferred there to try and take the city . the film follows these men from august of to early 1943 . during this time , they learn about the horrors of war and try to find a way out of the battle . through the entire film , one feels the desperation of the entire battle . unlike enemy at the gates the film makers didn ' t try to put some sappy love story or dress up occurre