# Fine-tune the LLM

## Install or Upgrade Required Python Packages

In [None]:
!pip install --upgrade --quiet \
    pip \
    transformers[torch]==4.41.2 \
    datasets==2.20.0 \
    evaluate==0.4.2 \
    accelerate==0.31.0 \
    bottleneck==1.3.6 \
    mlflow==2.13.2 \
    oracle-ads==2.11.12 \
    oci-cli==3.43.1

## Load Dataset for Model Training

Load the [Yelp review dataset](https://huggingface.co/datasets/Yelp/yelp_review_full) containing user-submitted reviews on Yelp's website. Each row contains the review content and a label corresponding to the number of stars (from 1 to 5) awarded by the reviewer. The dataset is split into 650K rows for training, and 50K rows for testing.

In [None]:
from datasets import load_dataset

# Use the namespaced Hub repository id (the one the dataset page lives under)
# rather than the legacy un-namespaced alias.
dataset = load_dataset("Yelp/yelp_review_full")

# Show one training row (keys: "text" and "label") as the cell output.
dataset["train"][1]

## Set Variables

Reduce `number_of_train_records` and `number_of_test_records` for faster training but lower accuracy. Increasing these values will improve the model accuracy, but the training will take longer to complete. You may also modify the number of epochs (training cycles). Training speed will be greatly improved if performed on a compute instance with GPU resources.

In [None]:
import os
# Base checkpoint that will be fine-tuned.
pretrained_model_name = "google-bert/bert-base-uncased"

# Size of the sampled subsets and the number of passes over the training subset.
number_of_train_records = 50
number_of_test_records = 50
number_of_epochs = 3

# Ask the tokenizers library not to use its internal parallelism.
os.environ.update({"TOKENIZERS_PARALLELISM": "false"})

## Prepare the Training and Testing Datasets

The `text` element in the dataset must first be tokenized.

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name)


def tokenize_function(examples):
    """Tokenize the ``"text"`` column of a batch of dataset rows.

    Args:
        examples: Batch of rows as handed over by ``Dataset.map`` with
            ``batched=True``; only ``examples["text"]`` is read.

    Returns:
        The tokenizer output for the batch, padded to the maximum length and
        truncated where a review is too long.
    """
    return tokenizer(examples["text"], padding="max_length", truncation=True)


# Sample first, tokenize second: only the selected rows are ever tokenized,
# instead of padding and tokenizing every review in both splits just to keep a
# handful of them. The seed and split sizes are unchanged, so the same rows are
# selected as when shuffling after tokenization.
small_train_dataset = (
    dataset["train"]
    .shuffle(seed=42)
    .select(range(number_of_train_records))
    .map(tokenize_function, batched=True)
)
small_eval_dataset = (
    dataset["test"]
    .shuffle(seed=42)
    .select(range(number_of_test_records))
    .map(tokenize_function, batched=True)
)

## Set Up the Trainer

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np
import evaluate

# Pretrained encoder with a 5-way classification head, one class per star rating.
model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model_name,
    num_labels=5,
)

# Checkpoints go to "review_trainer"; evaluation runs at the end of every epoch.
training_args = TrainingArguments(
    output_dir="review_trainer",
    num_train_epochs=number_of_epochs,
    eval_strategy="epoch",
)

# Accuracy metric used by compute_metrics.
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Score one evaluation pass with the accuracy metric.

    Args:
        eval_pred: Pair that unpacks into ``(logits, labels)``.

    Returns:
        The result of ``metric.compute`` for the predicted classes against
        the reference labels.
    """
    scores, references = eval_pred
    # The predicted class is the index of the largest logit on the last axis.
    predicted_classes = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_classes, references=references)

# Tie together the model, the run configuration, the sampled datasets and the
# metric callback.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=small_eval_dataset,
    train_dataset=small_train_dataset,
)

## Perform the Training

In [None]:
# Start fine-tuning with the configured Trainer; `model` is updated in place.
trainer.train()

## Use the Model to Predict a Label

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name)

# Wrap the in-memory fine-tuned model and its tokenizer in a classification
# pipeline so raw text can be scored directly.
inference = pipeline(task="sentiment-analysis", model=model, tokenizer=tokenizer)

# Sample review to classify.
review = """
I have been through other brands of individual pods coffee brewing, like Keurig or Kienna, not Tassimo. 
This time I would like to try Nespresso which uses up to 19-bar high pressure to extract concentrated 
coffee of two sizes, espresso and lungo. The delivery was Prime and quick, the product is brand new 
and made by well known quality Breville brand. I have only used it for about 10 times in 4 days and so 
far, the temperature is hotter (preferred) than my Keurig or even the regular Hamilton Beach brew 
station. The only downside I may add would be when I will have to descale the machine using only the 
proprietary Nespresso descaling liquid sold separately. I usually only used white vinegar to clean 
my coffee drip machines or Keurig. This is part of the only Nespresso model ( Original Line, not 
Virtuo) that may be able to use cheaper compatible pods that may go as low as almost half the price of 
original Nespresso pods.
"""

# truncation=True is forwarded to the tokenizer, so a review longer than the
# model's maximum input length is cut to fit instead of raising an error.
inference(review, truncation=True)

## Save the Model to Storage for the Next Lab

In [None]:
# Persist the fine-tuned weights and config together with the tokenizer files,
# so the directory can be loaded on its own without fetching the base checkpoint.
model.save_pretrained('./review_model')
tokenizer.save_pretrained('./review_model')