In [1]:
# Fetch the SST-2 training split into the current working directory.
# -nc (no-clobber) skips the download when train.tsv already exists, so re-running
# this cell does not leave a train.tsv.1 duplicate next to the file the notebook reads.
!wget -nc https://raw.githubusercontent.com/clairett/pytorch-sentiment-classification/master/data/SST2/train.tsv


--2025-03-15 17:44:32--  https://raw.githubusercontent.com/clairett/pytorch-sentiment-classification/master/data/SST2/train.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 720259 (703K) [text/plain]
Saving to: ‘train.tsv’


2025-03-15 17:44:32 (18.0 MB/s) - ‘train.tsv’ saved [720259/720259]



In [None]:
# Fetch the SST-2 test split into the current working directory.
# -nc (no-clobber) skips the download when test.tsv already exists, so re-running
# this cell does not leave a test.tsv.1 duplicate next to the file the notebook reads.
!wget -nc https://raw.githubusercontent.com/clairett/pytorch-sentiment-classification/master/data/SST2/test.tsv

--2025-03-15 17:21:09--  https://raw.githubusercontent.com/clairett/pytorch-sentiment-classification/master/data/SST2/test.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 188749 (184K) [text/plain]
Saving to: ‘test.tsv’


2025-03-15 17:21:09 (7.66 MB/s) - ‘test.tsv’ saved [188749/188749]



Dataset Preparation

Task 1

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load Dataset 1 (SST-2). The TSV has no header row: one sentence and one label per line.
# The path is relative because wget saved the file into the current working directory;
# a hard-coded /content/ prefix only resolves on Colab.
df = pd.read_csv("train.tsv", sep="\t", header=None, names=["text", "label"])

# Drop incomplete rows by rebinding rather than mutating in place
df = df.dropna()

# Split into training (80%) and validation (20%), keeping the label ratio in both parts
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["text"], df["label"], test_size=0.2, random_state=42, stratify=df["label"]
)

# Load test dataset and clean it the same way as the training data, so a missing
# text/label cannot reach the vectorizer or the label tensors further down
test_df = pd.read_csv("test.tsv", sep="\t", header=None, names=["text", "label"]).dropna()
test_texts, test_labels = test_df["text"], test_df["label"]

print(f"Training samples: {len(train_texts)}")
print(f"Validation samples: {len(val_texts)}")
print(f"Test samples: {len(test_texts)}")


Training samples: 5536
Validation samples: 1384
Test samples: 1821


In [None]:
import pandas as pd

# Source of Dataset 2: a CSV hosted on GitHub, read over the network by pd.read_csv below
imdb_url = "https://raw.githubusercontent.com/Ankit152/IMDB-sentiment-analysis/master/IMDB-Dataset.csv"

# Load IMDB dataset
imdb_df = pd.read_csv(imdb_url)

# Preview the first rows (the printed output shows a `review` and a `sentiment` column)
print(imdb_df.head())


                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


In [None]:
# Convert sentiment to binary labels (0 = negative, 1 = positive)
sentiment_to_label = {"positive": 1, "negative": 0}

# Guarded so the cell can be re-run: once `sentiment` has been replaced by `label`
# there is nothing left to convert (the unguarded version raised KeyError on re-run).
if "sentiment" in imdb_df.columns:
    mapped_labels = imdb_df["sentiment"].map(sentiment_to_label)

    # .map() turns any value missing from the dict into NaN; fail loudly instead of
    # letting NaN labels flow into the stratified split.
    unmapped = mapped_labels.isna()
    if unmapped.any():
        unexpected = sorted(imdb_df.loc[unmapped, "sentiment"].astype(str).unique())
        raise ValueError(f"Unexpected sentiment values: {unexpected}")

    # Resulting columns: review, label
    imdb_df = imdb_df.drop(columns=["sentiment"]).assign(label=mapped_labels.astype("int64"))

print(imdb_df.head())


                                              review  label
0  One of the other reviewers has mentioned that ...      1
1  A wonderful little production. <br /><br />The...      1
2  I thought this was a wonderful way to spend ti...      1
3  Basically there's a family where a little boy ...      0
4  Petter Mattei's "Love in the Time of Money" is...      1


In [None]:
from sklearn.model_selection import train_test_split

# Stage 1: hold out a stratified 20% of all reviews as the test set
imdb_reviews, imdb_targets = imdb_df["review"], imdb_df["label"]
imdb_train_texts, imdb_test_texts, imdb_train_labels, imdb_test_labels = train_test_split(
    imdb_reviews,
    imdb_targets,
    test_size=0.2,
    random_state=42,
    stratify=imdb_targets,
)

# Stage 2: carve 25% of the remaining 80% off as validation (60/20/20 overall)
imdb_train_texts, imdb_val_texts, imdb_train_labels, imdb_val_labels = train_test_split(
    imdb_train_texts,
    imdb_train_labels,
    test_size=0.25,
    random_state=42,
    stratify=imdb_train_labels,
)

# Report the size of each split
for split_name, split_texts in (
    ("training", imdb_train_texts),
    ("validation", imdb_val_texts),
    ("test", imdb_test_texts),
):
    print(f"IMDB {split_name} samples: {len(split_texts)}")


IMDB training samples: 30000
IMDB validation samples: 10000
IMDB test samples: 10000


Construct a Multi-Layer Perceptron (MLP) model.

Task 2

In [None]:
import torch.nn as nn
from torch.utils.tensorboard import SummaryWriter

# Initialize TensorBoard writer
writer = SummaryWriter("runs/Text_Classification")

# Define MLP Model
class MLP(nn.Module):
    """Fully connected classifier: repeated (Linear -> ReLU -> Dropout) stages plus an output Linear.

    Args:
        input_size: Number of features per sample.
        hidden_sizes: Width of each hidden layer, in order. Any iterable of ints;
            the default is a tuple so no mutable object is shared between instances.
        output_size: Number of output logits (classes).
        dropout: Dropout probability applied after every hidden activation.

    The layers live in ``self.model`` (an ``nn.Sequential``), so state-dict keys are
    ``model.<index>.weight`` / ``model.<index>.bias``.
    """

    def __init__(self, input_size=768, hidden_sizes=(512, 256, 128, 64), output_size=2, dropout=0.3):
        super().__init__()
        layers = []
        in_features = input_size

        # Each hidden stage consumes the previous stage's width
        for hidden in hidden_sizes:
            layers.extend([nn.Linear(in_features, hidden), nn.ReLU(), nn.Dropout(dropout)])
            in_features = hidden

        # Final projection to raw logits; no softmax here
        layers.append(nn.Linear(in_features, output_size))
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return the output logits for a batch `x` whose last dimension is `input_size`."""
        return self.model(x)


model = MLP()

# Log Model Architecture and Hyperparameters
writer.add_text("Model Architecture", str(model))

# The logged values describe the run that is actually executed:
#  - learning_rate matches the Adam learning rate used by train_model (0.001)
#  - hidden_layers is derived from the model (all Linear layers except the output one)
#    instead of repeating the layer-size list by hand
writer.add_hparams({
    "learning_rate": 0.001,
    "hidden_layers": sum(isinstance(layer, nn.Linear) for layer in model.model) - 1,
    "batch_size": 16,
    "optimizer": "Adam"
}, {})

# The name `writer` is rebound to a different run directory in the training cell,
# so push pending events for this run to disk now.
writer.flush()


In [None]:
def count_parameters(model):
    """Return the total number of scalar entries across parameters that require gradients."""
    trainable_total = 0
    for param in model.parameters():
        # Frozen parameters (requires_grad=False) are not counted
        if param.requires_grad:
            trainable_total += param.numel()
    return trainable_total

print(f"Trainable Parameters: {count_parameters(model)}")


Trainable Parameters: 566338


Task 3:  Implement case 1: Bag-of-words

In [None]:

from sklearn.feature_extraction.text import CountVectorizer

import numpy as np

# Initialize CountVectorizer with max_features=10000.
# dtype=np.float32 makes the dense matrices below float32 from the start; the default
# int64 arrays are twice as large and get converted to float32 tensors afterwards anyway.
vectorizer = CountVectorizer(max_features=10000, dtype=np.float32)

# Fit the vocabulary on the training texts only, then reuse it for validation and test
train_vectors = vectorizer.fit_transform(train_texts).toarray()
val_vectors = vectorizer.transform(val_texts).toarray()
test_vectors = vectorizer.transform(test_texts).toarray()

# Convert labels to numpy arrays (positional indexing from here on)
train_labels = np.array(train_labels)
val_labels = np.array(val_labels)
test_labels = np.array(test_labels)

print(f"Feature shape (train): {train_vectors.shape}")
print(f"Feature shape (validation): {val_vectors.shape}")
print(f"Feature shape (test): {test_vectors.shape}")


Feature shape (train): (5536, 10000)
Feature shape (validation): (1384, 10000)
Feature shape (test): (1821, 10000)


In [None]:
import torch
from torch.utils.data import DataLoader, TensorDataset

batch_size = 16

# BoW feature matrices become float32 tensors
X_train, X_val, X_test = (
    torch.tensor(features, dtype=torch.float32)
    for features in (train_vectors, val_vectors, test_vectors)
)

# Labels become int64 class indices
y_train, y_val, y_test = (
    torch.tensor(targets, dtype=torch.long)
    for targets in (train_labels, val_labels, test_labels)
)

# Only the training loader shuffles; validation and test keep their order
train_loader = DataLoader(TensorDataset(X_train, y_train), batch_size=batch_size, shuffle=True)
val_loader = DataLoader(TensorDataset(X_val, y_val), batch_size=batch_size, shuffle=False)
test_loader = DataLoader(TensorDataset(X_test, y_test), batch_size=batch_size, shuffle=False)

print("BOW DataLoaders created successfully!")


BOW DataLoaders created successfully!


Task 4: Implement case 2: construct a function that computes text embeddings to use as input to the same model.

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np

class TextEmbeddings:
    """Sentence embeddings from a pretrained Hugging Face encoder, mean-pooled over real tokens.

    Attributes set by ``__init__``:
        device: CUDA when available, otherwise CPU.
        tokenizer: Tokenizer loaded for ``model_name``.
        model: Encoder loaded for ``model_name``, moved to ``device``.
        embedding_size: ``hidden_size`` from the encoder's config.
        model_loaded: Set to True once loading finished.
    """

    def __init__(self, model_name="bert-base-uncased"):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Load tokenizer and model
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        encoder = AutoModel.from_pretrained(model_name).to(self.device)

        # Use float16 for efficiency, but only on GPU: half-precision kernels on CPU
        # are slow, and missing for some ops in some PyTorch versions.
        if self.device.type == "cuda":
            encoder = encoder.half()
        self.model = encoder.eval()  # inference only: keep dropout disabled

        self.embedding_size = self.model.config.hidden_size
        self.model_loaded = True

    def get_embedding(self, texts, max_length=128, batch_size=32):
        """Embed `texts` and return one vector per text.

        Args:
            texts: A string, or a list / numpy array / pandas Series of strings.
            max_length: Token limit per text; longer inputs are truncated.
            batch_size: Number of texts sent through the encoder at once, so memory use
                does not grow with the size of the whole dataset.

        Returns:
            numpy float16 array of shape (len(texts), embedding_size). Each row is the
            mean of the last hidden states over that text's non-padding tokens.

        Raises:
            TypeError: if any element of `texts` is not a string.
            ValueError: if `batch_size` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError(f"batch_size must be >= 1, got {batch_size}")

        if isinstance(texts, str):
            texts = [texts]
        elif hasattr(texts, "tolist"):
            # numpy arrays and pandas Series both expose .tolist(); checking for it avoids
            # depending on `pd` being imported in some other cell
            texts = texts.tolist()
        else:
            texts = list(texts)

        for item in texts:
            if not isinstance(item, str):
                raise TypeError(f"Expected a list of strings, but got: {type(item)}")

        if not texts:
            return np.empty((0, self.embedding_size), dtype="float16")

        pooled_batches = []
        for start in range(0, len(texts), batch_size):
            # Tokenize input texts (padded to the longest text in this batch)
            tokens = self.tokenizer(
                texts[start:start + batch_size],
                padding=True, truncation=True, max_length=max_length, return_tensors="pt"
            ).to(self.device)

            # Get embeddings from the model
            with torch.no_grad():
                outputs = self.model(**tokens)

            # Masked mean: padding positions carry attention_mask == 0 and must not
            # contribute, otherwise a text's vector depends on how long its batch
            # neighbours are. Accumulate in float32 so the sum does not lose precision.
            mask = tokens["attention_mask"].unsqueeze(-1).to(torch.float32)
            hidden = outputs.last_hidden_state.to(torch.float32)
            token_counts = mask.sum(dim=1).clamp(min=1.0)
            pooled = (hidden * mask).sum(dim=1) / token_counts

            pooled_batches.append(pooled.cpu().numpy().astype("float16"))

        return np.concatenate(pooled_batches, axis=0)


# Initialize model with BERT base
embedding_model = TextEmbeddings(model_name="bert-base-uncased")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
import torch
import numpy as np

# Ensure train_texts is a list
train_texts = list(train_texts)

# Sample 200 texts for embedding. A seeded generator makes the subset (and therefore
# every number reported for the embedding model) reproducible across kernel restarts.
num_samples = 200
subset_rng = np.random.default_rng(42)
train_subset = subset_rng.choice(len(train_texts), size=num_samples, replace=False)

# Index labels positionally. np.asarray makes this correct whether train_labels is
# still a pandas Series (label-indexed) or already a numpy array.
train_subset_labels = np.asarray(train_labels)[train_subset]

# Convert text datasets to embeddings.
# NOTE: y_train / y_val / y_test rebind the names used for the BoW label tensors; the
# BoW DataLoaders built earlier keep their own tensors and are not affected.
X_train_embed = torch.tensor(embedding_model.get_embedding([train_texts[i] for i in train_subset]), dtype=torch.float32)
y_train = torch.tensor(train_subset_labels, dtype=torch.long)

X_val_embed = torch.tensor(embedding_model.get_embedding(val_texts[:num_samples]), dtype=torch.float32)
y_val = torch.tensor(np.asarray(val_labels)[:num_samples], dtype=torch.long)

X_test_embed = torch.tensor(embedding_model.get_embedding(test_texts[:num_samples]), dtype=torch.float32)
y_test = torch.tensor(np.asarray(test_labels)[:num_samples], dtype=torch.long)

# Check shapes before creating DataLoader
print(f"Train: {X_train_embed.shape}, {y_train.shape}")
print(f"Val: {X_val_embed.shape}, {y_val.shape}")
print(f"Test: {X_test_embed.shape}, {y_test.shape}")

# Create DataLoaders
batch_size = 16
train_loader_embed = DataLoader(TensorDataset(X_train_embed, y_train), batch_size=batch_size, shuffle=True)
val_loader_embed = DataLoader(TensorDataset(X_val_embed, y_val), batch_size=batch_size, shuffle=False)
test_loader_embed = DataLoader(TensorDataset(X_test_embed, y_test), batch_size=batch_size, shuffle=False)

print("Embedding DataLoaders created successfully!")


Train: torch.Size([200, 768]), torch.Size([200])
Val: torch.Size([200, 768]), torch.Size([200])
Test: torch.Size([200, 768]), torch.Size([200])
Embedding DataLoaders created successfully!


Train the model for 10 epochs on Dataset 1 and save the best-performing model (checkpoint.pt).

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
from tabulate import tabulate
from torch.utils.tensorboard import SummaryWriter

# Initialize TensorBoard writer
writer = SummaryWriter("runs/Training")


def _run_epoch(model, loader, criterion, device, optimizer=None):
    """Run one pass over `loader`; weights are updated only when `optimizer` is given.

    Returns:
        (mean of the per-batch losses, fraction of correctly classified samples).
    """
    training = optimizer is not None
    model.train(training)  # train(False) is the same as eval()

    correct, total, loss_sum = 0, 0, 0.0
    with torch.set_grad_enabled(training):
        for inputs, labels in loader:
            inputs, labels = inputs.to(device), labels.to(device)

            if training:
                optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            if training:
                loss.backward()
                optimizer.step()

            loss_sum += loss.item()
            predicted = torch.argmax(outputs, dim=1)
            correct += (predicted == labels).sum().item()
            total += labels.size(0)

    return loss_sum / len(loader), correct / total


def train_model(model, train_loader, val_loader, checkpoint_path, dataset_name, num_epochs=10, learning_rate=0.001):
    """Train `model` with Adam and cross-entropy, checkpointing the best epoch.

    Per-epoch loss and accuracy are written to the module-level TensorBoard `writer`
    under tags suffixed with `dataset_name`; a per-epoch accuracy table is printed
    after the last epoch.

    Args:
        model: Module producing class logits.
        train_loader: Batches of (inputs, labels) used for optimisation.
        val_loader: Batches of (inputs, labels) used for model selection.
        checkpoint_path: Where the best ``state_dict`` is saved.
        dataset_name: Suffix for the TensorBoard tags.
        num_epochs: Number of passes over `train_loader`.
        learning_rate: Adam learning rate (default unchanged from the former constant).

    Returns:
        The best validation accuracy seen (0.0 when `num_epochs` is 0).

    Raises:
        ValueError: if either loader yields no batches.
    """
    if len(train_loader) == 0 or len(val_loader) == 0:
        # Without this guard the accuracy division below fails with ZeroDivisionError
        raise ValueError("train_loader and val_loader must each yield at least one batch")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    best_val_acc = 0.0  # Store the best validation accuracy
    checkpoint_saved = False
    results = []

    for epoch in range(num_epochs):
        print(f"\nEpoch {epoch+1}/{num_epochs}")

        train_loss, train_acc = _run_epoch(model, train_loader, criterion, device, optimizer)
        val_loss, val_acc = _run_epoch(model, val_loader, criterion, device)

        # Log metrics to TensorBoard
        writer.add_scalar(f"Loss/Train_{dataset_name}", train_loss, epoch)
        writer.add_scalar(f"Loss/Validation_{dataset_name}", val_loss, epoch)
        writer.add_scalar(f"Accuracy/Train_{dataset_name}", train_acc, epoch)
        writer.add_scalar(f"Accuracy/Validation_{dataset_name}", val_acc, epoch)

        # Save best model checkpoint. The first epoch always saves, so a checkpoint
        # exists even if validation accuracy never rises above 0.0.
        if not checkpoint_saved or val_acc > best_val_acc:
            best_val_acc = val_acc
            torch.save(model.state_dict(), checkpoint_path)
            checkpoint_saved = True

        # Store results
        results.append([epoch+1, train_acc, val_acc])

    headers = ["Epoch", "Train Acc", "Val Acc"]
    print(tabulate(results, headers=headers, tablefmt="grid"))

    print(f"\nBest Validation Accuracy: {best_val_acc:.4f}")
    if checkpoint_saved:
        print(f"Best model saved at '{checkpoint_path}'")

    writer.flush()
    return best_val_acc


# --- Bag-of-words run: the MLP input width equals the 10000 BoW features ---
checkpoint_bow = "checkpoint_bow.pt"
model_bow = MLP(input_size=10000)
print("\nTraining model for BoW...")
best_val_acc_bow = train_model(
    model_bow, train_loader, val_loader, checkpoint_bow, "SST2_BoW", num_epochs=10
)

# --- Embedding run: the MLP input width equals the 768-dimensional embeddings ---
checkpoint_embed = "checkpoint_embed.pt"
model_embed = MLP(input_size=768)
print("\nTraining model for Embeddings...")
best_val_acc_embed = train_model(
    model_embed, train_loader_embed, val_loader_embed, checkpoint_embed, "SST2_Embeddings", num_epochs=10
)

# --- Pick the winner by best validation accuracy (ties go to BoW) ---
# NOTE(review): the two runs are validated on different data (the BoW loaders cover the
# full split, the embedding loaders a 200-sample subset), so the comparison is not
# like-for-like.
final_best_checkpoint = "best_checkpoint.pt"
bow_wins = best_val_acc_bow >= best_val_acc_embed

if bow_wins:
    print(f"\nBoW model performed better! ({best_val_acc_bow:.4f} vs. {best_val_acc_embed:.4f})")
else:
    print(f"\nEmbedding model performed better! ({best_val_acc_embed:.4f} vs. {best_val_acc_bow:.4f})")
best_model_state = torch.load(checkpoint_bow if bow_wins else checkpoint_embed)

# Re-save the winning weights under the final checkpoint name
torch.save(best_model_state, final_best_checkpoint)
print(f"Final best model saved as '{final_best_checkpoint}'")

print("\nFinal best model selection complete!")



Training model for BoW...

Epoch 1/10

Epoch 2/10

Epoch 3/10

Epoch 4/10

Epoch 5/10

Epoch 6/10

Epoch 7/10

Epoch 8/10

Epoch 9/10

Epoch 10/10
+---------+-------------+-----------+
|   Epoch |   Train Acc |   Val Acc |
|       1 |    0.702673 |  0.760116 |
+---------+-------------+-----------+
|       2 |    0.901553 |  0.765896 |
+---------+-------------+-----------+
|       3 |    0.979769 |  0.763728 |
+---------+-------------+-----------+
|       4 |    0.994942 |  0.770954 |
+---------+-------------+-----------+
|       5 |    0.997471 |  0.768064 |
+---------+-------------+-----------+
|       6 |    0.997832 |  0.765173 |
+---------+-------------+-----------+
|       7 |    0.997832 |  0.776734 |
+---------+-------------+-----------+
|       8 |    0.995303 |  0.767341 |
+---------+-------------+-----------+
|       9 |    0.998736 |  0.760116 |
+---------+-------------+-----------+
|      10 |    0.998916 |  0.754335 |
+---------+-------------+-----------+

Best Validation