# Finetune Bert

## Model

In [None]:
%%capture
!pip install torch transformers datasets

In [None]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Load BERT model and tokenizer
model_name = "bert-base-uncased"
num_labels = 2  # Binary classification (positive/negative)

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
tokenizer = AutoTokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
model.base_model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

## Dataset

In [None]:
from datasets import load_dataset

# Load dataset
dataset = load_dataset("imdb")
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [None]:
dataset["train"] = dataset["train"].select(range(100))
dataset["test"] = dataset["test"].select(range(100))
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 100
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 100
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

## Tokenizer data

In [None]:
# Tokenization function
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=100)

# Apply tokenization
tokenized_datasets = dataset.map(tokenize_function, batched=True)


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [None]:
import pandas as pd
pd.DataFrame(dataset['unsupervised']).head(2)

Unnamed: 0,text,label
0,This is just a precious little diamond. The pl...,-1
1,When I say this is my favourite film of all ti...,-1


## Example: From normal data --> DatasetDict

In [None]:
import pandas as pd
import random
from datasets import Dataset, DatasetDict

# Step 1: Create Random Data
data = {
    "text": [
        "I love this movie!", "This is terrible.", "Amazing experience!",
        "Worst ever.", "I would watch it again!", "Not my taste.", "Incredible!",
        "Bad storyline.", "Superb acting!", "Disappointing."
    ],
    "label": [1, 0, 1, 0, 1, 0, 1, 0, 1, 0]  # 1 = Positive, 0 = Negative
}

# Convert to DataFrame
df = pd.DataFrame(data)
df.head(4)

Unnamed: 0,text,label
0,I love this movie!,1
1,This is terrible.,0
2,Amazing experience!,1
3,Worst ever.,0


In [None]:
# Step 2: Split into Train and Test (80% train, 20% test)
train_df = df.sample(frac=0.8, random_state=42)
test_df = df.drop(train_df.index)

# Step 3: Convert to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

# Step 4: Create DatasetDict
dataset_dict = DatasetDict({
    "train": train_dataset,
    "test": test_dataset
})

# Print DatasetDict
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 8
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2
    })
})

## Continue Finetuning

In [None]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 100
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 100
    })
    unsupervised: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 50000
    })
})

In [None]:
from torch.utils.data import DataLoader

# Remove unnecessary columns
tokenized_datasets = tokenized_datasets.remove_columns(["text"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")

# Create dataloaders
train_dataloader = DataLoader(tokenized_datasets["train"], batch_size=8, shuffle=True)
test_dataloader = DataLoader(tokenized_datasets["test"], batch_size=8, shuffle=False)


In [None]:
from torch.optim import AdamW
optimizer = AdamW(model.parameters(), lr=5e-5)

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
model.to(device)

epochs = 3  # Number of training epochs

for epoch in range(epochs):
    model.train()
    total_loss = 0

    for batch in train_dataloader:
        # Move batch to device
        batch = {k: v.to(device) for k, v in batch.items()}

        # Forward pass
        outputs = model(**batch)
        loss = outputs.loss
        total_loss += loss.item()

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1} - Loss: {total_loss / len(train_dataloader)}")


cuda
Epoch 1 - Loss: 0.14316068933560297
Epoch 2 - Loss: 0.007690496241243986
Epoch 3 - Loss: 0.002582362727620281


In [None]:
for param in model.base_model.parameters():
    param.requires_grad = False  # Freeze BERT weights
for name, param in model.named_parameters():
    print(name, param.requires_grad)


bert.embeddings.word_embeddings.weight False
bert.embeddings.position_embeddings.weight False
bert.embeddings.token_type_embeddings.weight False
bert.embeddings.LayerNorm.weight False
bert.embeddings.LayerNorm.bias False
bert.encoder.layer.0.attention.self.query.weight False
bert.encoder.layer.0.attention.self.query.bias False
bert.encoder.layer.0.attention.self.key.weight False
bert.encoder.layer.0.attention.self.key.bias False
bert.encoder.layer.0.attention.self.value.weight False
bert.encoder.layer.0.attention.self.value.bias False
bert.encoder.layer.0.attention.output.dense.weight False
bert.encoder.layer.0.attention.output.dense.bias False
bert.encoder.layer.0.attention.output.LayerNorm.weight False
bert.encoder.layer.0.attention.output.LayerNorm.bias False
bert.encoder.layer.0.intermediate.dense.weight False
bert.encoder.layer.0.intermediate.dense.bias False
bert.encoder.layer.0.output.dense.weight False
bert.encoder.layer.0.output.dense.bias False
bert.encoder.layer.0.output.Lay

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
model.to(device)

epochs = 3  # Number of training epochs

for epoch in range(epochs):
    model.train()
    total_loss = 0

    for batch in train_dataloader:
        # Move batch to device
        batch = {k: v.to(device) for k, v in batch.items()}

        # Forward pass
        outputs = model(**batch)
        loss = outputs.loss
        total_loss += loss.item()

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1} - Loss: {total_loss / len(train_dataloader)}")


cuda
Epoch 1 - Loss: 0.0015897694079635234
Epoch 2 - Loss: 0.001525132952687832
Epoch 3 - Loss: 0.0015223398625564117


In [None]:
pd.DataFrame(tokenized_datasets['train']).head(3)

Unnamed: 0,labels,input_ids,token_type_ids,attention_mask
0,tensor(0),"[tensor(101), tensor(1045), tensor(12524), ten...","[tensor(0), tensor(0), tensor(0), tensor(0), t...","[tensor(1), tensor(1), tensor(1), tensor(1), t..."
1,tensor(0),"[tensor(101), tensor(1000), tensor(1045), tens...","[tensor(0), tensor(0), tensor(0), tensor(0), t...","[tensor(1), tensor(1), tensor(1), tensor(1), t..."
2,tensor(0),"[tensor(101), tensor(2065), tensor(2069), tens...","[tensor(0), tensor(0), tensor(0), tensor(0), t...","[tensor(1), tensor(1), tensor(1), tensor(1), t..."


In [None]:
for batch in train_dataloader:
  batch = {k: v.to(device) for k, v in batch.items()}
  print(batch['labels'])
  outputs = model(**batch)
  print(outputs)
  # print(batch['input_ids'])
  break

tensor([0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')
SequenceClassifierOutput(loss=tensor(0.0002, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 4.0097, -4.6042],
        [ 4.0435, -4.5907],
        [ 4.0267, -4.5969],
        [ 4.0357, -4.5838],
        [ 4.0310, -4.5928],
        [ 4.0279, -4.5981],
        [ 4.0259, -4.6012],
        [ 3.9955, -4.5660]], device='cuda:0', grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)


In [None]:
for batch in train_dataloader:
  # print(batch.items())
  for k, v in batch.items():
    print(k)
    print(v)
    break
  break

labels
tensor([0, 0, 0, 0, 0, 0, 0, 0])


In [None]:
from torch.nn.functional import softmax

model.eval()
correct = 0
total = 0

with torch.no_grad():
    for batch in test_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)

        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)

        correct += (predictions == batch["labels"]).sum().item()
        total += batch["labels"].size(0)

print(f"Accuracy: {correct / total:.4f}")


Accuracy: 1.0000


# Test

In [None]:
from torch.utils.data import DataLoader
from datasets import load_dataset

# Load dataset
dataset = load_dataset("imdb", split="train")
dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 25000
})

In [None]:
# Convert Hugging Face dataset to PyTorch
dataset.set_format(type="torch", columns=["text", "label"])

# Create DataLoader
train_loader = DataLoader(dataset, batch_size=8, shuffle=True)

# Iterate through batches
for batch in train_loader:
    print(batch)  # Dictionary with keys: 'text', 'label'
    break  # Just to show one batch


{'text': ["This movie is good for TV. I like it because I'm a HUGE fan of disaster films even though this is a family film. Accuracy on the film from the book is half-and-half They got the characters names right but in the book there was no storm chaser, the the car scene involving the Hatch family running away from the tornado wasn't in the book instead it involved Dan hatch and his friend riding with a police officer on their way to the police station for safety. and in the book Dan and his friend are both 12-years old. Thats all i can think of. Overall this was a good movie even though it could of have been a little more accurate to the book. Did you know the book was based on a true story of a series of tornadoes devastating a small Nebraska town in 1980?", "Kudos to Fawcett to taking on roles that, at the time were considered controversial. To my recollection, rape was still a taboo subject in the 1980's, and women's rights and emotions were rarely so deeply examined during that t

In [None]:
len(train_loader)

3125

# Testing model

In [None]:
def predict_sentiment(sentence, model, tokenizer, device):
    # Tokenize input sentence
    inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True, max_length=512)

    # Move tensors to the same device as the model (GPU or CPU)
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Get model predictions
    model.eval()  # Set the model to evaluation mode
    with torch.no_grad():  # No gradient calculation needed during inference
        outputs = model(**inputs)

    # Extract logits and apply softmax to get probabilities
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)  # The predicted class (0 or 1)

    # Convert the prediction to the sentiment label (positive/negative)
    sentiment = "positive" if predictions.item() == 1 else "negative"

    return sentiment


In [None]:
sentence = "I love this movie, it was fantastic!"
sentiment = predict_sentiment(sentence, model, tokenizer, device)
print(f"Sentiment: {sentiment}")
# Because we finetune in 100% negative (just for testing)

Sentiment: negative
