In [1]:
#Bert is transformer based neural network
#Attention Mechanism
#Self-Attention: In a transformer-based model, self-attention enables the network to capture the relationships between different words 
#Parallelization
#Transformer Blocks: The transformer architecture consists of a stack of multiple layers called "transformer blocks." Each transformer block 
# - typically contains multiple self-attention mechanisms and feed-forward neural networks.

#Versatility: Transformer-based models, like BERT, GPT, and others, have shown versatility in various natural language processing (NLP)



import torch
from torch.utils.data import DataLoader, TensorDataset
from datasets import load_dataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW

# Load dataset
dataset = load_dataset("imdb")
train_data = dataset['train'].select(range(len(dataset['train']) // 10))  # Take one-tenth of the training data
val_data = dataset['test'].select(range(len(dataset['test']) // 10))  # Take one-tenth of the test data


# Initialize BERT tokenizer. Here "tokenizer" is a function with its specific parameters
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
#This model is one variant of the BERT architecture and has been pre-trained 
# - on a large corpus of text, learning a contextualized representation of words and their relationships within sentences.

# Tokenize and convert texts to token IDs
train_encodings = tokenizer(train_data['text'], truncation=True, padding=True)
val_encodings = tokenizer(val_data['text'], truncation=True, padding=True)
#Setting truncation=True truncates the sequences to a maximum length accepted by the model
#the process of shortening or limiting the length of sequences to fit within a maximum length accepted by the model during tokenization.


# Convert token IDs and labels to PyTorch tensors
#TensorDataset is used to create a PyTorch dataset by combining tensors that represent the input features and labels for
# - training a machine learning model, likely in a PyTorch-based framework.

train_dataset = TensorDataset(
    torch.tensor(train_encodings['input_ids']),#torch.tensor() function to convert the tokenized input data and labels into PyTorch tensors.
    torch.tensor(train_encodings['attention_mask']),
    torch.tensor(train_data['label'])
)

# Like if you have x = torch.tensor([[1, 2, 3], [4, 5, 6]]) it will be a matrix of 
#([1,2,3]
#  [4,5,6]) and if another set, it will for to a 3D dimension.

val_dataset = TensorDataset(
    torch.tensor(val_encodings['input_ids']),
    torch.tensor(val_encodings['attention_mask']),
    torch.tensor(val_data['label'])
)

# Create DataLoaders
batch_size = 16
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)


In [2]:
# Instantiate the model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
#num_labels=2: says there are two output classes for the pertrained sequence catogorizing: positive or negative
 

# Define optimizer as a "AdamW" optimizer a variant of the Adam optimizer, which is an adaptive learning rate optimization algorithm
#AdamW includes a modification to the original Adam algorithm that incorporates weight decay (also known as L2 regularization) directly 
# - into the update step of the optimizer. This helps in preventing overfitting by penalizing large weights.

optimizer = AdamW(model.parameters(), lr=2e-5)
# lr=2e-5: The lr parameter stands for learning rate, which controls the step size taken during the optimization process. 
# A learning rate of 2e-5 (or 0.00002) determines how much the optimizer adjusts the model's parameters based on the computed gradients.



# Define number of epochs
num_epochs = 3

for epoch in range(num_epochs):
    model.train()
    total_train_loss = 0.0
    
    for step, batch in enumerate(train_dataloader):
        batch_inputs, batch_attention_masks, batch_labels = batch 
# here batch is to remind that the data comes in 16 data batches otherwise the data we get is the train_dataset which here it is 3D
        #you can write something else instead of batch.... like output
        
        optimizer.zero_grad()##This function sets the gradients of all the model parameters to zero.
#During the training of a neural network, gradients are computed with respect to the loss function
# - , and these gradients are used to update the model parameters using an optimization algorithm
        
        outputs = model(input_ids=batch_inputs, attention_mask=batch_attention_masks, labels=batch_labels)
        loss = outputs.loss
        total_train_loss += loss.item()
        ## item assing a scalar value to the loss. we want the loss for one iteration for one batch
        
        loss.backward()
#is a method in PyTorch that is used to compute gradients with respect to the loss. In the context 
# - of training a neural network, it plays a crucial role in the backpropagation algorithm.
        
        optimizer.step()
#After calling loss.backward(), which computes the gradients of the loss with respect 
# - to the model parameters, optimizer.step() is used to update the model parameters based on these computed gradients.

    avg_train_loss = total_train_loss / len(train_dataloader)
#it represents the number of batches required to cover the entire training dataset. like here if you have 1000 training data it equals 1000/16

    
### Validation after each epoch
    
    model.eval() #model.eval(), it switches the model to evaluation mode rather than training mode.
    total_eval_accuracy = 0.0 #initializes a variable to zero
    #This variable is commonly used to accumulate the total accuracy computed during the evaluation phase.

    
    for batch in val_dataloader:
        batch_inputs, batch_attention_masks, batch_labels = batch
        
        with torch.no_grad(): #It temporarily disables gradient calculation during the execution of the code block 
                                #when you're not interested in updating the model's parameters (as in training)
            outputs = model(input_ids=batch_inputs, attention_mask=batch_attention_masks, labels=batch_labels)
            
        logits = outputs.logits
#In PyTorch, when working with models from the Transformers library (such as BERT), 
        # - outputs.logits typically refers to the raw, unnormalized predictions made by the model.
        
        predictions = torch.argmax(logits, dim=-1)# will give which tensor (tensor 0,1,2, or....) has the highest number
        
        correct = (predictions == batch_labels).float().sum() #.float(): Converts the boolean values (True and False) to float values
                                                            #.sum(): Calculates the sum of all elements in the resulting tensor afterwards
        total_eval_accuracy += correct / batch_labels.size(0)

    avg_val_accuracy = total_eval_accuracy / len(val_dataloader)
    
    print(f"Epoch {epoch + 1}/{num_epochs} - Avg Train Loss: {avg_train_loss:.4f} | Avg Val Accuracy: {avg_val_accuracy:.4f}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3 - Avg Train Loss: 0.0468 | Avg Val Accuracy: 1.0000
Epoch 2/3 - Avg Train Loss: 0.0011 | Avg Val Accuracy: 1.0000
Epoch 3/3 - Avg Train Loss: 0.0005 | Avg Val Accuracy: 1.0000
