# Exercise 2: Finetuning a Pretrained Model for Binary Text Classification
Tên : Nguyễn Văn Thương

Mssv : 077205005581

# Load dataset (Yelp Reviews)

In [30]:
from datasets import load_dataset

# Fetch the Yelp full-review corpus from the Hugging Face Hub.
# The printed summary lists the available splits and their columns.
dataset = load_dataset(path="yelp_review_full")
print(dataset)


DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 650000
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 50000
    })
})


In [31]:
def to_binary(example):
    """Collapse a 5-class Yelp star label into a binary sentiment label.

    Args:
        example: A mapping with an integer ``"label"`` entry in the range 0..4.

    Returns:
        A dict ``{"label": 0}`` (negative) for original labels 0 and 1,
        and ``{"label": 1}`` (positive) for original labels 2, 3 and 4.
    """
    # Anything above label 1 counts as positive; the comparison yields a bool,
    # which is converted to the integer class id.
    is_positive = example["label"] > 1
    return {"label": int(is_positive)}

# Binarize labels only while the original 5-class labels (0..4) are still present.
# to_binary sends label 1 to 0, so re-running this cell on already-binarized data
# would silently collapse every label to the negative class. The guard makes the
# cell safe to execute more than once.
if max(dataset["train"].unique("label")) > 1:
    dataset = dataset.map(to_binary)

# Sanity check: show the first training example with its binary label.
print(dataset["train"][0])


{'label': 1, 'text': "dr. goldberg offers everything i look for in a general practitioner.  he's nice and easy to talk to without being patronizing; he's always on time in seeing his patients; he's affiliated with a top-notch hospital (nyu) which my parents have explained to me is very important in case something happens and you need surgery; and you can get referrals to see specialists without having to see him first.  really, what more do you need?  i'm sitting here trying to think of any complaints i have about him, but i'm really drawing a blank."}


# Load pretrained model và tokenizer

In [32]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Checkpoint shared by the tokenizer and the classifier.
model_name = "google-bert/bert-base-cased"

# num_labels=2 attaches a two-class classification head. Loading prints a notice
# that classifier.* weights are newly initialized; this is expected, because the
# head still has to be fine-tuned.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
tokenizer = AutoTokenizer.from_pretrained(model_name)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Preprocess dataset (tokenize, pad, truncate)

In [40]:
def tokenize_function(examples):
    """Tokenize the ``"text"`` field of a batch of examples.

    Every sequence is truncated and padded to exactly 128 tokens, so all
    encoded examples have the same length.

    Args:
        examples: A batch mapping that contains a ``"text"`` entry.

    Returns:
        The tokenizer output for the batch.
    """
    encoded = tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=128,
    )
    return encoded

# Apply the tokenizer to every split, processing examples in batches.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Optional (recommended by the docs): carve out small shuffled subsets for quick experiments.
shuffled_train = tokenized_datasets["train"].shuffle(seed=42)
shuffled_test = tokenized_datasets["test"].shuffle(seed=42)
small_train = shuffled_train.select(range(1000))
small_eval = shuffled_test.select(range(1000))

# To train on the full data instead, use tokenized_datasets["train"].


# Training hyperparameters

In [43]:
from transformers import TrainingArguments

# Fine-tuning settings gathered in one mapping so each value is easy to scan and tweak.
training_config = {
    # Where outputs and logs are written.
    "output_dir": "yelp_binary_classifier",
    "logging_dir": "./logs",
    "logging_steps": 100,
    # Evaluate and save at the end of every epoch.
    "eval_strategy": "epoch",
    "save_strategy": "epoch",
    # Batch sizes and schedule length.
    "per_device_train_batch_size": 8,
    "per_device_eval_batch_size": 8,
    "num_train_epochs": 3,
    # Optimizer settings.
    "learning_rate": 2e-5,
    "weight_decay": 0.01,
    # Set to True to push to the HF Hub (requires being logged in).
    "push_to_hub": False,
}

training_args = TrainingArguments(**training_config)

# Tạo Trainer và finetune model

In [None]:
import numpy as np
import evaluate
from transformers import Trainer

# Accuracy metric from the `evaluate` library, as in the docs.
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Compute accuracy from an evaluation prediction.

    Args:
        eval_pred: A pair that unpacks into ``(logits, labels)``.

    Returns:
        The result of ``metric.compute`` for the arg-max class predictions
        against the reference labels.
    """
    scores, references = eval_pred
    # The predicted class is the index of the largest logit along the last axis.
    predicted_classes = np.argmax(scores, axis=-1)
    return metric.compute(references=references, predictions=predicted_classes)

# Build the Trainer from the model, the training configuration, the data and the metric callback.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=small_eval,
    train_dataset=small_train,  # swap in tokenized_datasets["train"] for a full run
)

# Run fine-tuning.
trainer.train()

# Evaluate the finetuned model

In [None]:
import torch

# Score the fine-tuned model on the evaluation dataset given to the Trainer.
eval_results = trainer.evaluate()
print("Evaluation results:", eval_results)

# Hand-written sentences to sanity-check the classifier on new text.
predict_texts = [
    "The food was awful and service was terrible.",
    "Excellent place! I loved the pizza and the staff were very friendly."
]
# Tokenize the sentences into PyTorch tensors.
encodings = tokenizer(predict_texts, padding=True, truncation=True, return_tensors="pt")

# Feed the tokenized sentences to the fine-tuned model so they are actually classified.
# Tensors are copied to the model's device without mutating `encodings`.
finetuned_model = trainer.model
finetuned_model.eval()
with torch.no_grad():
    sentence_inputs = {
        key: tensor.to(finetuned_model.device) for key, tensor in encodings.items()
    }
    sentence_logits = finetuned_model(**sentence_inputs).logits
sentence_labels = sentence_logits.argmax(dim=-1).tolist()

# Class ids follow the to_binary mapping: 0 = negative, 1 = positive.
label_names = {0: "negative", 1: "positive"}
for text, label_id in zip(predict_texts, sentence_labels):
    print(f"{label_names[label_id]}: {text}")

# Trainer predictions for 10 shuffled test reviews.
predictions = trainer.predict(tokenized_datasets["test"].shuffle(seed=42).select(range(10)))
