### BERT - TASK: SENTENCE SIMILARITY

In [31]:
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
    EarlyStoppingCallback
)
from sklearn.metrics import mean_squared_error
import numpy as np
import torch



### DATA PREPROCESSING

In [32]:
dataset = load_dataset("glue", "stsb")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def preprocess(example):
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True)

dataset = dataset.map(preprocess, batched=True)
dataset = dataset.rename_column("label", "labels")
dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])


Map:   0%|          | 0/1379 [00:00<?, ? examples/s]

In [33]:
def compute_metrics(eval_pred):
    preds, labels = eval_pred
    return {
        "pearson": np.corrcoef(preds[:, 0], labels)[0, 1],
        # "mse": mean_squared_error(labels, preds[:, 0]),
    }

# TrainingArguments factory
def get_args(run_name):
    return TrainingArguments(
        output_dir=f"./{run_name}",
        eval_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=2,
        load_best_model_at_end=True,
        metric_for_best_model="eval_pearson",
        greater_is_better=True,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=64,
        learning_rate=2e-5,
        num_train_epochs=10,
        weight_decay=0.01,
        logging_dir=f"./logs/{run_name}",
        report_to="none",
        run_name=run_name
    )

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


### Standard Fine-Tuning

In [34]:
# ================================
# 1. Standard Fine-Tuning
# ================================
model_std = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=1)

trainer_std = Trainer(
    model=model_std,
    args=get_args("standard"),
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback()]
)

trainer_std.train()
result_std = trainer_std.evaluate()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer_std = Trainer(


Epoch,Training Loss,Validation Loss,Pearson
1,No log,0.834692,0.854996
2,0.911900,0.640639,0.865546
3,0.367500,0.556502,0.869597
4,0.367500,0.62585,0.866678


### Layer-wise Learning Rate Decay

In [35]:
# ================================
# 2. Layer-wise Learning Rate Decay
# ================================
model_llrd = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=1)

# LLRD Optimizer Setup
def get_llrd_optimizer(model, base_lr=2e-5, decay=0.9):
    layers = [model.bert.embeddings] + list(model.bert.encoder.layer)
    opt_params = []
    for i, layer in enumerate(layers):
        lr = base_lr * (decay ** (len(layers) - i))
        opt_params.append({
            "params": layer.parameters(),
            "lr": lr
        })
    opt_params.append({
        "params": model.classifier.parameters(),
        "lr": base_lr
    })
    return torch.optim.AdamW(opt_params)

optimizer_llrd = get_llrd_optimizer(model_llrd)

trainer_llrd = Trainer(
    model=model_llrd,
    args=get_args("llrd"),
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback()],
    optimizers=(optimizer_llrd, None)
)

trainer_llrd.train()
result_llrd = trainer_llrd.evaluate()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer_llrd = Trainer(


Epoch,Training Loss,Validation Loss,Pearson
1,No log,0.681254,0.83625
2,1.407500,0.630779,0.85209
3,0.528400,0.613193,0.858501
4,0.528400,0.594928,0.859213
5,0.369200,0.617551,0.856618


### Freeze Lower BERT Layers

In [36]:
# ================================
# 3. Freeze Lower BERT Layers
# ================================
def freeze_bert_layers(model, freeze_until=6):
    for name, param in model.bert.named_parameters():
        if any(f"encoder.layer.{i}." in name for i in range(freeze_until)):
            param.requires_grad = False

model_freeze = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=1)
freeze_bert_layers(model_freeze)

trainer_freeze = Trainer(
    model=model_freeze,
    args=get_args("freeze"),
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback()]
)

trainer_freeze.train()
result_freeze = trainer_freeze.evaluate()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer_freeze = Trainer(


Epoch,Training Loss,Validation Loss,Pearson
1,No log,0.79886,0.829452
2,1.213400,0.834619,0.842361
3,0.536200,0.638539,0.851647
4,0.536200,0.715231,0.840921


### Inference: Predict Similarity

In [37]:
# ================================
# Inference: Predict Similarity
# ================================
def predict_similarity(sentence1, sentence2, model, tokenizer):
    device = next(model.parameters()).device  # get model's device (cuda or cpu)

    # Tokenize and move inputs to the same device as model
    inputs = tokenizer(sentence1, sentence2, return_tensors="pt", truncation=True, padding=True)
    inputs = {key: val.to(device) for key, val in inputs.items()}

    model.eval()
    with torch.no_grad():
        logits = model(**inputs).logits

    return logits.item()

# Example
s1 = "A man is eating food."
s2 = "A person is consuming a meal."
similarity_score = predict_similarity(s1, s2, model_std, tokenizer)
print(f"Similarity between: '{s1}' and '{s2}' -> {similarity_score:.4f}")


Similarity between: 'A man is eating food.' and 'A person is consuming a meal.' -> 2.1137


### Summary table

In [39]:
# Summary table with tabulate
from tabulate import tabulate

results = [
    ["Standard", result_std["eval_pearson"]],
    ["LLRD", result_llrd["eval_pearson"]],
    ["Freeze Layers", result_freeze["eval_pearson"]],
]

print("\\nFine-tuning techniques comparison:")
print(tabulate(results, headers=["Technique", "Pearson Correlation"], floatfmt=".4f"))


\nFine-tuning techniques comparison:
Technique        Pearson Correlation
-------------  ---------------------
Standard                      0.8696
LLRD                          0.8592
Freeze Layers                 0.8516


### CONCLUSION

We compare three fine-tuning strategies for BERT on a sentence similarity task using Pearson Correlation as the evaluation metric.

- **Standard fine-tuning** achieves the best performance with a Pearson correlation of **0.8696**.
- **LLRD (Layer-wise Learning Rate Decay)** comes slightly behind at **0.8592**, showing competitive results while potentially improving training stability and efficiency.
- **Freeze Layers** yields the lowest correlation, **0.8516**, likely due to limited capacity to adapt the model to task-specific features.

### Conclusion:
Full fine-tuning (Standard) is the most effective approach for sentence similarity tasks using BERT.  
LLRD is a reasonable alternative when balancing performance with training efficiency.  
Freezing layers may be suitable in low-resource settings, but comes at the cost of reduced accuracy.