<a href="https://colab.research.google.com/github/manuelrucci7/deep-learning-course/blob/main/colab/LLMFineTune.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%capture
!pip install transformers datasets evaluate

In [None]:
#  here is how we would train a sequence classifier on one batch in PyTorch:
import torch
from transformers import AdamW, AutoTokenizer, AutoModelForSequenceClassification

# Same as before
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
sequences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "This course is amazing!",
]
batch = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")

# This is new
batch["labels"] = torch.tensor([1, 1])

optimizer = AdamW(model.parameters())
loss = model(**batch).loss
loss.backward()
optimizer.step()

# Of course, just training the model on two sentences is not going to yield very good results. To get better results, you will need to prepare a bigger dataset.

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# In this section we will use as an example the MRPC (Microsoft Research Paraphrase Corpus) dataset,
# The dataset consists of 5,801 pairs of sentences, with a label indicating if they are paraphrases or not (i.e., if both sentences mean the same thing).
# https://gluebenchmark.com/

In [None]:
# Browse dataset here: https://huggingface.co/datasets
# Dataset loading docs: https://huggingface.co/docs/datasets/loading
from datasets import load_dataset
raw_datasets = load_dataset("glue", "mrpc")
# This command downloads and caches the dataset, by default in ~/.cache/huggingface/datasets.
raw_datasets


README.md:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/649k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/75.7k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/308k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3668 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/408 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1725 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [None]:
raw_train_dataset = raw_datasets["train"]
print(raw_train_dataset[0])
print(raw_train_dataset.features)

# Behind the scenes, label is of type ClassLabel, and the mapping of integers to label name is stored in the names folder. 0 corresponds to not_equivalent, and 1 corresponds to equivalent.

{'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .', 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .', 'label': 1, 'idx': 0}
{'sentence1': Value(dtype='string', id=None), 'sentence2': Value(dtype='string', id=None), 'label': ClassLabel(names=['not_equivalent', 'equivalent'], id=None), 'idx': Value(dtype='int32', id=None)}


In [None]:
# Tokenize
from transformers import AutoTokenizer

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenized_sentences_1 = tokenizer(raw_datasets["train"]["sentence1"])
tokenized_sentences_2 = tokenizer(raw_datasets["train"]["sentence2"])


In [None]:
# We discussed the input_ids and attention_mask keys in Chapter 2, but we put off talking about token_type_ids. In this example, this is what tells the model which part of the input is the first sentence and which is the second sentence.
inputs = tokenizer("This is the first sentence.", "This is the second one.")
inputs

{'input_ids': [101, 2023, 2003, 1996, 2034, 6251, 1012, 102, 2023, 2003, 1996, 2117, 2028, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
# So we see the model expects the inputs to be of the form [CLS] sentence1 [SEP] sentence2 [SEP] when there are two sentences. Aligning this with the token_type_ids gives us:
#['[CLS]', 'this', 'is', 'the', 'first', 'sentence', '.', '[SEP]', 'this', 'is', 'the', 'second', 'one', '.', '[SEP]']
#[      0,      0,    0,     0,       0,          0,   0,       0,      1,    1,     1,        1,     1,   1,       1]
# Note that if you select a different checkpoint, you won’t necessarily have the token_type_ids in your tokenized inputs (for instance, they’re not returned if you use a DistilBERT model). They are only returned when the model will know what to do with them, because it has seen them during its pretraining.
tokenizer.convert_ids_to_tokens(inputs["input_ids"])

['[CLS]',
 'this',
 'is',
 'the',
 'first',
 'sentence',
 '.',
 '[SEP]',
 'this',
 'is',
 'the',
 'second',
 'one',
 '.',
 '[SEP]']

In [None]:
# tokenize our whole dataset:
tokenized_dataset = tokenizer(
    raw_datasets["train"]["sentence1"],
    raw_datasets["train"]["sentence2"],
    padding=True,
    truncation=True,
)
# This works well, but it has the disadvantage of returning a dictionary (with our keys, input_ids, attention_mask, and token_type_ids, and values that are lists of lists). It will also only work if you have enough RAM to store your whole dataset during the tokenization (whereas the datasets from the 🤗 Datasets library are Apache Arrow files stored on the disk, so you only keep the samples you ask for loaded in memory).

In [None]:
def tokenize_function(example):
    # We do not do padding becase is not really efficient
    # it’s better to pad the samples when we’re building a batch, as then we only need to pad to the maximum length in that batch, and not the maximum length in the entire dataset
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True)

In [None]:
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
tokenized_datasets

Map:   0%|          | 0/3668 [00:00<?, ? examples/s]

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})

In [None]:
# The last thing we will need to do is pad all the examples to the length of the longest element when we batch elements together — a technique we refer to as dynamic padding.
# The 🤗 Transformers library provides us with such a function via DataCollatorWithPadding. It takes a tokenizer when you instantiate it (to know which padding token to use, and whether the model expects padding to be on the left or on the right of the inputs) and will do everything you need:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
samples = tokenized_datasets["train"][:8]
samples = {k: v for k, v in samples.items() if k not in ["idx", "sentence1", "sentence2"]}
[len(x) for x in samples["input_ids"]]

[50, 59, 47, 67, 59, 50, 62, 32]

In [None]:
batch = data_collator(samples)
{k: v.shape for k, v in batch.items()}

{'input_ids': torch.Size([8, 67]),
 'token_type_ids': torch.Size([8, 67]),
 'attention_mask': torch.Size([8, 67]),
 'labels': torch.Size([8])}

## Fine Tuning

In [None]:
# --> Load dataset
# Browse dataset here: https://huggingface.co/datasets
# Dataset loading docs: https://huggingface.co/docs/datasets/loading
from datasets import load_dataset
raw_datasets = load_dataset("glue", "mrpc")
# This command downloads and caches the dataset, by default in ~/.cache/huggingface/datasets.
raw_datasets

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

raw_datasets = load_dataset("glue", "mrpc")
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True)

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# The first step before we can define our Trainer is to define a TrainingArguments class that will contain all the hyperparameters the Trainer will use for training and evaluation. The only argument you have to provide is a directory where the trained model will be saved, as well as the checkpoints along the way. For all the rest, you can leave the defaults, which should work pretty well for a basic fine-tuning.
from transformers import TrainingArguments
training_args = TrainingArguments("test-trainer")
training_args

In [None]:
# This is because BERT has not been pretrained on classifying pairs of sentences, so the head of the pretrained model has been discarded and a new head suitable for sequence classification has been added instead. The warnings indicate that some weights were not used (the ones corresponding to the dropped pretraining head) and that some others were randomly initialized (the ones for the new head). It concludes by encouraging you to train the model, which is exactly what we are going to do now.
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Complete example


In [1]:
%%capture
!pip install transformers datasets evaluate

In [3]:
from datasets import load_dataset
import evaluate
from transformers import AutoTokenizer, DataCollatorWithPadding, Trainer, TrainingArguments, AutoModelForSequenceClassification
import numpy as np
import os

In [4]:
# In this section we will use as an example the MRPC (Microsoft Research Paraphrase Corpus) dataset,
# The dataset consists of 5,801 pairs of sentences, with a label indicating if they are paraphrases or not (i.e., if both sentences mean the same thing).
# https://gluebenchmark.com/

# https://huggingface.co/datasets/nyu-mll/glue

raw_datasets = load_dataset("glue", "mrpc")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/649k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/75.7k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/308k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3668 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/408 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1725 [00:00<?, ? examples/s]

In [5]:
print(raw_datasets)

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})


In [7]:
import torch
from datasets import DatasetDict, Dataset

# Define some example data
data = {
    "sentence1": ["I love programming.", "The weather is nice.", "AI is the future.", "Python is great."],
    "sentence2": ["Coding is fun.", "Sunny days are lovely.", "Technology is evolving fast.", "Python has many libraries."],
    "label":     [1, 0, 1, 0],  # Example labels
    "idx":       [0, 1, 2, 3]      # Index for each sample
}

# Convert data into a Dataset
full_dataset = Dataset.from_dict(data)
full_dataset

Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx'],
    num_rows: 4
})

In [None]:
for i in range(0,len(raw_datasets["train"])):
    sentence1 = raw_datasets["train"]["sentence1"][i]
    sentence2 = raw_datasets["train"]["sentence2"][i]
    label = raw_datasets["train"]["label"][i]
    idx = raw_datasets["train"]["idx"][i]
    print("--------")
    print(sentence1)
    print(sentence2)
    print(label)
    print(idx)

In [None]:
def compute_metrics(eval_preds):
    metric = evaluate.load("glue", "mrpc")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

def tokenize_function(example):
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True)


In [None]:
raw_datasets = load_dataset("glue", "mrpc")
checkpoint = "bert-base-uncased" # bert-base-uncased or bert-base-cased
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

training_args = TrainingArguments("test-trainer", evaluation_strategy="epoch")
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)


Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


In [None]:
import wandb
wandb.init(mode="disabled")
# https://huggingface.co/docs/datasets/v1.13.0/quickstart.html
# https://huggingface.co/docs/transformers/main_classes/trainer
# https://huggingface.co/learn/nlp-course/chapter7/3
#os.environ["WANDB_DISABLED"] = "true"
trainer.train()
# The Trainer will work out of the box on multiple GPUs or TPUs and provides lots of options, like mixed-precision training (use fp16 = True in your training arguments). We will go over everything it supports in Chapter 10.

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.514786,0.776961,0.858034
2,0.504300,0.514033,0.845588,0.892675
3,0.269700,0.780903,0.838235,0.886986


Downloading builder script:   0%|          | 0.00/5.75k [00:00<?, ?B/s]

TrainOutput(global_step=1377, training_loss=0.31747924095491786, metrics={'train_runtime': 235.0023, 'train_samples_per_second': 46.825, 'train_steps_per_second': 5.86, 'total_flos': 405114969714960.0, 'train_loss': 0.31747924095491786, 'epoch': 3.0})

In [None]:
predictions = trainer.predict(tokenized_datasets["test"])
preds = np.argmax(predictions.predictions, axis=-1)
print(predictions.predictions.shape, predictions.label_ids.shape)

(1725, 2) (1725,)


In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
import torch.nn.functional as F

model_path = "/content/test-trainer/checkpoint-1377"  # Update this path to where your model is saved
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)
torch.set_grad_enabled(False)  # Disable gradients for inference

#sentence1 = "A man is walking in the city" # I hate you
#sentence2 = "In the city a man was walking"
sentence1 = "I hate you"
sentence2 = "Disapper please"

encoding = tokenizer(sentence1, sentence2, return_tensors="pt", padding=True, truncation=True)  # Ensure tensor shapes are correct

inputs = {k: v.to(model.device) for k, v in encoding.items()}
with torch.no_grad():
    outputs = model(**inputs)

logits = outputs.logits
probabilities = torch.softmax(logits, dim=-1)
predicted_class = torch.argmax(probabilities, dim=-1)
print(f"Predicted class: {predicted_class.item()}")

# 0 means phrase are different, 1 means they are similar
# 0 means the sentences are not paraphrases, and 1 means they are paraphrases, assuming the datasets library has this mapping correctly set up. This is the standard interpretation for the MRPC dataset based on the task definition from GLUE benchmarks.

Predicted class: 0


## Dataset Custom

In [12]:
# Dataset quanto due frasi sono simili
from datasets import DatasetDict, Dataset

# Creare il dataset di esempio in italiano
data = {
    "sentence1": [
        "Mi piace programmare.",
        "Il sole splende oggi.",
        "L'intelligenza artificiale cambierà il mondo."
    ],
    "sentence2": [
        "Scrivere codice è creativo.",
        "Le giornate estive sono luminose.",
        "Le macchine diventeranno più intelligenti."
        ],
    "label": [1, 0, 1],
    "idx": [list(range(3)) ] # 50 indici
}

# Creare il Dataset da addestramento e validazione
dataset = Dataset.from_dict(data)

# Dividere il dataset (50 train, 30 validation)
split = dataset.train_test_split(test_size=30, seed=42)

# DatasetDict
final_dataset = DatasetDict({
    "train": split["train"],
    "validation": split["test"]
})

# Visualizza la struttura finale
print(final_dataset)


ArrowInvalid: Column 3 named idx expected length 3 but got length 1

In [None]:
import torch
from datasets import DatasetDict, Dataset

# Define some example data
data = {
    "sentence1": ["I love programming.", "The weather is nice.", "AI is the future.", "Python is great."],
    "sentence2": ["Coding is fun.", "Sunny days are lovely.", "Technology is evolving fast.", "Python has many libraries."],
    "label":     [1, 0, 1, 0],  # Example labels
    "idx":       [0, 1, 2, 3]      # Index for each sample
}

# Convert data into a Dataset
full_dataset = Dataset.from_dict(data)

# Define splits (70% train, 15% validation, 15% test)
split = full_dataset.train_test_split(test_size=0.3, seed=42)
validation_test_split = split["test"].train_test_split(test_size=0.5, seed=42)

# Create the final DatasetDict
dataset = DatasetDict({
    "train": split["train"],
    "validation": validation_test_split["train"],
    "test": validation_test_split["test"]
})

# Display the dataset structure
print(dataset)