<a href="https://colab.research.google.com/github/moreira-matheus/llm-studies/blob/main/Fine_tune_a_pre_trained_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Fine-tune a pre-trained model:

References:
- [Tutorial: Fine-tune a pretrained model](https://huggingface.co/docs/transformers/training).
- [Datasets](https://huggingface.co/docs/datasets/index).
- [BERT paper](https://arxiv.org/pdf/1810.04805.pdf).
- [Virtual Env](https://stackoverflow.com/a/77222570).

In [None]:
def create_venv(venv_name, drive_path='/content/drive/', libs_list=None, install_venv=True):
  if install_venv:
    print('Installing virtualenv...')
    !pip install virtualenv

  print('Mounting drive...')
  from google.colab import drive
  drive.mount(drive_path)

  print('Activating venv...')
  !source /content/drive/MyDrive/bert/bin/activate

  if libs_list:
    print('Installing libs...')
    for lib in libs_list:
      print(f"Library: {lib}")
      !pip install $lib

  print('Venv created and activated successfully.')

# Set CREATE_VENV to True for the one-off run that builds the venv on Drive.
CREATE_VENV = False
VENV_NAME = 'bert'
LIBS = ['transformers', 'datasets', 'transformers[torch]', 'evaluate', 'tensorflow==2.14']

if CREATE_VENV:
  # Pass LIBS explicitly; without it create_venv installs no libraries at all.
  create_venv(VENV_NAME, libs_list=LIBS)

import sys

# Derive the lib/pythonX.Y segment from the running interpreter rather than
# hard-coding 3.10. NOTE(review): this assumes the venv was created with the
# same Python version as the current kernel — confirm for pre-existing venvs.
SITE_PACKAGES = (
    f"/content/drive/MyDrive/{VENV_NAME}/lib/"
    f"python{sys.version_info.major}.{sys.version_info.minor}/site-packages"
)

# Guard against duplicate entries when the cell is re-run.
if SITE_PACKAGES not in sys.path:
  sys.path.append(SITE_PACKAGES)


In [None]:
# Manual, step-by-step environment setup, kept commented out for reference:
# install virtualenv, mount Drive, create the venv at /content/drive/MyDrive/bert,
# then install each library with the venv activated in the same shell command.

# !pip install virtualenv

#from google.colab import drive
#drive.mount("/content/drive")

#!virtualenv /content/drive/MyDrive/bert

#LIBS = ['transformers', 'datasets', 'transformers[torch]', 'evaluate']
#for lib in LIBS:
#  !source /content/drive/MyDrive/bert/bin/activate; pip install $lib

## Prepare dataset:

In [None]:
from datasets import load_dataset

In [None]:
# Load the "yelp_review_full" dataset; later cells use its "train" and "test" splits.
dataset = load_dataset("yelp_review_full")

In [None]:
# Peek at one training record (index 95) to see its fields ('label' and 'text').
dataset["train"][95]

{'label': 2,
 'text': 'Talk about overpriced. $18 for a fairly basic pasta with some obviously frozen chicken chopped up over it. The latter was terrible, thin and flabby and rather unappealing. The pasta itself was ok, as was the sauce. The desserts are pretty good. But honestly, that is a $10 dish whose price has been inflated.'}

In [None]:
from transformers import AutoTokenizer

In [None]:
# Tokenizer for "bert-base-cased", the same checkpoint name the model is loaded from below.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

In [None]:
def tokenize_func(examples):
  """Tokenize the "text" field of `examples` with the notebook-level tokenizer.

  The tokenizer is called with padding="max_length" and truncation=True, and
  its result is returned unchanged.
  """
  texts = examples["text"]
  encoded = tokenizer(texts, padding="max_length", truncation=True)
  return encoded

In [None]:
# Apply tokenize_func across the dataset; batched=True hands it batches of examples
# rather than one example at a time.
tokenized_datasets = dataset.map(tokenize_func, batched=True)

In [None]:
# Take 1000-example subsets of the train and test splits, shuffled with a fixed
# seed (42) so the same subset is selected on every run.
# NOTE(review): presumably sized this small to keep the demo fast — confirm.
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))

## Train (with PyTorch Trainer):

In [None]:
from transformers import AutoModelForSequenceClassification

In [None]:
# Load "bert-base-cased" for sequence classification with 5 output labels.
# The warning in the output is expected: the classifier weights are newly initialized
# and only become useful after the fine-tuning run below.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


#### Training hyperparameters:

In [None]:
from transformers import TrainingArguments

In [None]:
# Training arguments with only output_dir set.
# NOTE: training_args is reassigned in a later cell (adding evaluation_strategy="epoch")
# before the Trainer is constructed, so this first value is never actually used.
training_args = TrainingArguments(output_dir="test_trainer")

#### Evaluate:

In [None]:
import numpy as np
import evaluate

In [None]:
# Accuracy metric loaded through the `evaluate` library; compute_metrics calls it.
metric = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [None]:
def compute_metrics(eval_pred):
  """Score an evaluation pass with the notebook-level `metric`.

  Args:
    eval_pred: A pair that unpacks into (logits, labels).

  Returns:
    Whatever `metric.compute` returns for the argmax-over-last-axis
    predictions compared against the labels.
  """
  scores, references = eval_pred
  # The predicted class is the index of the highest score along the last axis.
  predicted_classes = np.argmax(scores, axis=-1)
  result = metric.compute(predictions=predicted_classes, references=references)
  return result


In [None]:
import inspect

# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`.
# The notebook does not pin transformers, so pick whichever keyword the installed
# TrainingArguments accepts instead of failing with a TypeError.
_init_params = inspect.signature(TrainingArguments).parameters
_strategy_kwarg = "eval_strategy" if "eval_strategy" in _init_params else "evaluation_strategy"

# Same output directory as before, with the evaluation strategy set to "epoch".
training_args = TrainingArguments(output_dir="test_trainer", **{_strategy_kwarg: "epoch"})

In [None]:
from transformers import Trainer

In [None]:
# Bundle the model, training arguments, the small train/eval subsets, and the
# compute_metrics callback into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)

In [None]:
# Start fine-tuning with the Trainer configured above.
trainer.train()

Epoch,Training Loss,Validation Loss


## Train (with TensorFlow + Keras):