# Lightweight Fine-Tuning Project

TODO: In this cell, describe your choices for each of the following

* PEFT technique: 
* Model: 
* Evaluation approach: 
* Fine-tuning dataset: 

## Loading and Evaluating a Foundation Model

TODO: In the cells below, load your chosen pre-trained Hugging Face model and evaluate its performance prior to fine-tuning. This step includes loading an appropriate tokenizer and dataset.

In [1]:
#!pip install scikit-learn
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import Trainer
from transformers import TrainingArguments
from transformers import DataCollatorWithPadding
from peft import LoraConfig
from peft import get_peft_model
from peft import AutoPeftModelForSequenceClassification
from transformers import AutoModelForCausalLM
import numpy as np
from datasets import load_dataset
import torch
from sklearn.metrics import accuracy_score

In [2]:
# https://huggingface.co/docs/transformers/training
# or load the separate splits if the dataset has train/validation/test splits
train_dataset = load_dataset("cmotions/NL_restaurant_reviews", split="train")
valid_dataset = load_dataset("cmotions/NL_restaurant_reviews", split="validation")
test_dataset  = load_dataset("cmotions/NL_restaurant_reviews", split="test")

#commented out code used for education and debug
#look at full dataset
#print("look at full dataset")
dataset = load_dataset("cmotions/NL_restaurant_reviews")
#print(dataset)
#print("\n")

#length of train split of dataset
#print("length of train split of dataset")
#print(len(train_dataset))
#print("\n")

#looks at characteristics of train
#print("Characteristics of train")
#print(train_dataset)
#print("\n")

# Inspect the first example. See what a restaurant review looks like
#print("Train first example")
#print(train_dataset[0])
#print("\n")

train_dataset
print("\n")
test_dataset





Dataset({
    features: ['restaurant_ID', 'restaurant_review_ID', 'michelin_label', 'score_total', 'score_food', 'score_service', 'score_decor', 'fame_reviewer', 'reviewscore_food', 'reviewscore_service', 'reviewscore_ambiance', 'reviewscore_waiting', 'reviewscore_value', 'reviewscore_noise', 'review_text', 'review_length'],
    num_rows: 14587
})

In [3]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

splits = ["train", "validation", "test"]

tokenized_dataset = {}
for split in splits:
    tokenized_dataset[split] = dataset[split].map(
        lambda x: tokenizer(x["review_text"], truncation=True), batched=True
    )
    
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
    id2label={0: "negative review", 1: "positive review"},
    label2id={"negative review": 0, "positive review": 1},
)

print(model)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'classifier.weight', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [4]:
# For Education
# I run into issues with untimeError: The size of tensor a (591) must match the size of tensor b (512) 
# at non-singleton dimension 1. 
# max_len is created to truncate input sent will making it < 512 tokens avoiding the runtime indexing error.
max_len = model.config.max_position_embeddings 
predictions = []
labels = []

for example in test_dataset:    
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")    
    model.to(device)

    # Prepare the input text
    # max_len acquired above is used here
    inputs = tokenizer(
    example["review_text"],
    return_tensors="pt",
    truncation=True,       # chop anything beyond max_len
    max_length=max_len,    # explicit cap
    padding="max_length"   # optional padding up to max_len
).to(device)

    # Get predictions
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits        

    probabilities = torch.nn.functional.softmax(logits, dim=1)    
    predicted_class_id = probabilities.argmax().item()
    
    # Here are the lists for the predicted output and the ground truth
    predictions.append(predicted_class_id)
    labels.append(example["michelin_label"])

In [5]:
def compute_metrics(labels, preds):
    acc = accuracy_score(labels, preds)
    #precision = precision_score(labels, preds)
    #recall = recall_score(labels, preds)
    #f1 = f1_score(labels, preds)
    return {"accuracy": acc}

# Compute evaluation metrics
evaluation_metrics = compute_metrics(labels, predictions)
print(evaluation_metrics)

{'accuracy': 0.02968396517447042}


## Performing Parameter-Efficient Fine-Tuning

TODO: In the cells below, create a PEFT model from your loaded model, run a training loop, and save the PEFT model weights.

In [6]:
# adapter configuration for your parameter-efficient fine-tuning process.
# arbitrary string identifier for this LoRA adapter. "lora" used.
config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_lin", "v_lin"], # found in printed model above
    task_type="SEQ_CLS",
    # installed version of PEFT, LoraConfig doesn’t accept adapter_name in its constructor hence no adapter_name here
)
lora_model = get_peft_model(model, config)
lora_model.print_trainable_parameters()

trainable params: 1,331,716 || all params: 67,694,596 || trainable%: 1.967241225577297


In [7]:
lora_model.save_pretrained("distilbert-base-uncased-lora")
lora_model = AutoPeftModelForSequenceClassification.from_pretrained("distilbert-base-uncased-lora")


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'classifier.weight', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return {"accuracy": (predictions == labels).mean()}

training_args = TrainingArguments(
    output_dir="review_classifier",
    evaluation_strategy="epoch",
    num_train_epochs=3,            
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    push_to_hub=True,
)

In [9]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
)
trainer.train()

ValueError: Token is required (write-access action) but no token found. You need to provide a token or be logged in to Hugging Face with `huggingface-cli login` or `huggingface_hub.login`. See https://huggingface.co/settings/tokens.

###  ⚠️ IMPORTANT ⚠️

Due to workspace storage constraints, you should not store the model weights in the same directory but rather use `/tmp` to avoid workspace crashes which are irrecoverable.
Ensure you save it in /tmp always.

In [None]:
# Saving the model
#model.save("/tmp/your_model_name")

## Performing Inference with a PEFT Model

TODO: In the cells below, load the saved PEFT model weights and evaluate the performance of the trained PEFT model. Be sure to compare the results to the results from prior to fine-tuning.