<b> Import Libraries </b>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split

In [2]:
import torch #PyTorch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW 
from torch.utils.data import DataLoader, Dataset
from datasets import load_dataset #Used to load pre-defined datasets (e.g., IMDB) from the Hugging Face datasets library
import evaluate #Calculate metrics like accuracy

In [3]:
# Load the IMDB dataset through the Hugging Face `datasets` loader.
# Later cells index the result by split name ('train' / 'test').
imdb_data = load_dataset("imdb")

<b>IMDB Dataset Overview</b>:

- The IMDB dataset contains 50,000 movie reviews, with each review labeled as positive (1) or negative (0).

- It's split into a training set (25,000 examples) and a test set (25,000 examples).



In [None]:
# Display the dataset structure by printing the object returned by load_dataset
print(imdb_data)

In [None]:
# Sample of the training data: the first five records of the 'train' split
print(imdb_data['train'][:5])

<b> BERT Tokenizer and Model </b>

In [None]:
# Load the pre-trained BERT tokenizer (bert-base-uncased).
# It splits text into tokens compatible with the BERT model and is used further
# down to encode the IMDB reviews.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
# Loads a pre-trained BERT model for binary classification (num_labels=2).
# This is the `model` that the optimizer, training loop, evaluation and save cells below operate on.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

<b> Start of Example Code. Ignore when working on the real data. </b>

In [4]:
# Toy example for walking through one forward/backward pass:
# a single review and its ground-truth label.
label = torch.tensor([1])  # 1 = positive sentiment
text = "The movie was absolutely fantastic, with brilliant performances!"

In [5]:
# Load the pre-trained BERT tokenizer (bert-base-uncased), which splits text into
# tokens compatible with the BERT model.
# Same call as the tokenizer cell in the "BERT Tokenizer and Model" section; it re-binds `tokenizer`.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [6]:
# Encode the example sentence as PyTorch tensors; the bare name on the last
# line displays the resulting encoding as the cell output.
inputs = tokenizer(
    text,
    max_length=128,
    truncation=True,
    padding=True,
    return_tensors='pt',
)
inputs

{'input_ids': tensor([[  101,  1996,  3185,  2001,  7078, 10392,  1010,  2007,  8235,  4616,
           999,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

input_ids: Tokenized IDs of the text, including special tokens:
- [101] = [CLS] (start of the sentence)
- [102] = [SEP] (end of the sentence)

attention_mask: Indicates which tokens are real (1) and which are padding (0)

In [7]:
# Loads a pre-trained BERT model for binary classification (num_labels=2).
# Bound to `model1`, a separate name from the `model` used by the real training cells.
model1 = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Forward pass on the example. Because `labels` is passed, the returned object
# carries a loss in addition to the logits (see the displayed output).
outputs = model1(**inputs,labels=label)
outputs

SequenceClassifierOutput(loss=tensor(1.2709, grad_fn=<NllLossBackward0>), logits=tensor([[ 0.3429, -0.5987]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

bert-base-uncased:

- Pre-trained BERT model with 12 transformer layers.
- Processes input_ids and computes contextual embeddings for each token.
- The [CLS] token’s embedding is used as the representation of the entire input sequence.

Classification Head:

- A fully connected layer is applied to the [CLS] token’s embedding.
- Outputs logits: raw scores for each class (positive and negative sentiment).
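- [ 0.3429, -0.5987 ]: Higher score for the first class (0 → negative sentiment). The exact values change from run to run because the classification head is newly (randomly) initialized.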

In [9]:
# Turn the raw logits into class probabilities, then pick the most likely class.
predictions = torch.softmax(outputs.logits, dim=1)
print(predictions)

predicted_label = predictions.argmax(dim=1).item()
print(predicted_label)

tensor([[0.7194, 0.2806]], grad_fn=<SoftmaxBackward0>)
0


- 0.7194 --> Probability of Class 0 (negative class)
- 0.2806 --> Probability of Class 1 (positive class)

- 0 : Predicted Label 

In [11]:
# Configures the optimizer with model parameters and a learning rate of 2e-5.
# Uses torch.optim.AdamW instead of transformers.AdamW: the transformers class is
# deprecated in favour of the PyTorch implementation.
# NOTE(review): eps=1e-6 and weight_decay=0.0 are meant to mirror the defaults of the
# deprecated transformers class so the update rule is unchanged; confirm against the
# installed transformers version.
optimizer = torch.optim.AdamW(model1.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

In [12]:
# Loss calculation: during training, the model compares logits with the ground truth
# (label = 1) using cross-entropy loss. The value is read from the forward-pass output
# computed earlier; nothing is recomputed here.
loss = outputs.loss
loss

tensor(1.2709, grad_fn=<NllLossBackward0>)

In [13]:
# Backpropagation: compute gradients and update model weights.
# NOTE: `outputs` (and therefore `loss` / `outputs.logits`) was computed before this
# update, so the values inspected in the following cells still reflect the
# pre-update weights.
loss.backward()
optimizer.step()

In [14]:
# Logits from the forward pass above (taken from the same `outputs` object, not recomputed)
logits = outputs.logits

In [17]:
# Print results: the scalar loss value and the raw logits tensor of the example
print(f"Loss: {loss.item()}")
print(f"Logits: {logits}")

Loss: 1.2709351778030396
Logits: tensor([[ 0.3429, -0.5987]], grad_fn=<AddmmBackward0>)


For evaluation:
- Use the trained model to predict sentiments for test data.
- Compare predictions with ground truth labels and compute metrics like accuracy:

In [20]:
# The predicted class is the index of the largest logit in each row
predictions = logits.argmax(dim=1)

In [21]:
# Detach both tensors from the autograd graph, move them to the CPU and convert
# them to NumPy arrays for the metric computation.
references = label.detach().cpu().numpy()
predictions = predictions.detach().cpu().numpy()

In [22]:
# Accuracy of the single example prediction; the bare name on the last line
# displays the resulting dict as the cell output.
accuracy_metric = evaluate.load("accuracy")
accuracy = accuracy_metric.compute(
    references=references,
    predictions=predictions,
)
accuracy

{'accuracy': 0.0}

<b> End of Example Code </b>

<b> Custom Dataset Class </b>

In [None]:
class IMDbDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels."""

    def __init__(self, encodings, labels):
        """Store the tokenized encodings and the corresponding labels.

        Args:
            encodings: mapping from field name to a per-example sequence of values;
                it must support `.items()` and positional indexing of each value.
            labels: sequence of labels, one per example.
        """
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Return the number of examples, taken from the number of labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example `idx` as a dict of tensors.

        The dict holds one tensor per field of the encodings plus the example's
        label under the key 'labels'.
        """
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

<b> Tokenization </b>

In [None]:
# Pull the review texts and their labels out of the train and test splits
train_texts, train_labels = imdb_data['train']['text'], imdb_data['train']['label']
test_texts, test_labels = imdb_data['test']['text'], imdb_data['test']['label']

In [None]:
# Tokenize both splits with identical settings: truncate to at most 128 tokens
# and pad the shorter sequences.
train_encodings = tokenizer(
    train_texts,
    max_length=128,
    truncation=True,
    padding=True,
)
test_encodings = tokenizer(
    test_texts,
    max_length=128,
    truncation=True,
    padding=True,
)

truncation=True: Truncates text longer than 128 tokens

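padding=True: Pads shorter texts to the length of the longest sequence in the tokenizer call (at most 128 tokens here, because of the truncation)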

<b> Dataset and DataLoader </b>

In [None]:
# Wrap the encodings and labels of each split in an IMDbDataset
train_dataset = IMDbDataset(encodings=train_encodings, labels=train_labels)
test_dataset = IMDbDataset(encodings=test_encodings, labels=test_labels)

In [None]:
# Batch both datasets (16 examples per batch); only the training data is shuffled
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=16)

batch_size=16: Each batch contains 16 examples

shuffle=True: Shuffles training data

<b> Optimizer </b>

In [None]:
# Configures the optimizer with model parameters and a learning rate of 2e-5.
# Uses torch.optim.AdamW instead of transformers.AdamW: the transformers class is
# deprecated in favour of the PyTorch implementation.
# NOTE(review): eps=1e-6 and weight_decay=0.0 are meant to mirror the defaults of the
# deprecated transformers class so the update rule is unchanged; confirm against the
# installed transformers version.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

<b> Device Configuration </b>

In [None]:
# Pick the GPU when one is available, otherwise fall back to the CPU, and move
# the model there. `model.to(device)` stays the last expression of the cell.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

<b> Training Loop </b>

In [None]:
# Trains the model for NUM_EPOCHS epochs.
# The epoch loop and the batch loop must live in the same cell: when they are split,
# the epoch loop only resets `total_loss` and the training data is iterated once.
NUM_EPOCHS = 3

for epoch in range(NUM_EPOCHS):
    model.train()      # training mode for this epoch
    total_loss = 0.0   # running sum of the per-batch losses

    for batch in train_loader:
        optimizer.zero_grad()  # clear gradients left over from the previous batch
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        # Passing `labels` makes the model return the loss along with the logits
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()

    # Average loss over the batches of this epoch
    print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_loader)}")

For each batch:

- Clears gradients: optimizer.zero_grad()

- Processes inputs: input_ids, attention_mask, and labels

- Computes loss

- Backpropagates gradients: loss.backward()

- Updates model parameters: optimizer.step()

In [None]:
# Prints the average per-batch loss, using whatever values `epoch` and `total_loss`
# hold after the preceding cells have run (i.e. those of the last epoch processed).
print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_loader)}")

<b> Evaluation </b>

In [None]:
# Loads the accuracy metric from the `evaluate` library
accuracy_metric = evaluate.load("accuracy")

In [None]:
# Put the model in evaluation mode and start with empty buffers for the
# predicted and the ground-truth labels.
model.eval()
predictions, references = [], []

In [None]:
# Runs inference over the whole test set and collects predictions and ground truth.
# Evaluation mode is set and the buffers are reset here, so re-running this cell
# never appends to results left over from a previous run.
model.eval()
predictions = []
references = []

with torch.no_grad():  # inference only: no gradients are needed
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        logits = model(input_ids, attention_mask=attention_mask).logits
        # The predicted class is the index of the largest logit of each example
        predictions.extend(torch.argmax(logits, dim=-1).cpu().tolist())
        # Labels are only compared on the host, so they are not moved to the device
        references.extend(batch['labels'].tolist())

For each batch in the test set:

- Moves inputs to the appropriate device.

- Predicts logits without computing gradients: torch.no_grad().

- Converts logits to predictions: torch.argmax().

- Stores predictions and labels.

In [None]:
# Compare the collected predictions with the ground-truth labels and report accuracy
accuracy = accuracy_metric.compute(
    references=references,
    predictions=predictions,
)
print(f"Test Accuracy: {accuracy['accuracy']}")

<b> Save Model </b>

In [None]:
# Save the model and the tokenizer into the same 'sentiment_model' directory.
# The return value of the last call is shown as the cell output.
model.save_pretrained('sentiment_model')
tokenizer.save_pretrained('sentiment_model')