# Starter Notebook

Install and import the required libraries.

In [None]:
!pip uninstall transformers -y

In [None]:
!pip install transformers datasets evaluate accelerate peft trl bitsandbytes
!pip install nvidia-ml-py3

In [None]:
import re
import json
import evaluate
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import os
import pandas as pd
import torch
from transformers import RobertaModel, RobertaTokenizer, TrainingArguments, Trainer, DataCollatorWithPadding, RobertaForSequenceClassification, get_cosine_schedule_with_warmup
from peft import LoraConfig, get_peft_model, PeftModel
from datasets import load_dataset, Dataset, ClassLabel
import pickle

In [None]:
# import re
# import json
# import evaluate
# import numpy as np
# import matplotlib.pyplot as plt
# from tqdm import tqdm
# from torch.utils.data import DataLoader
# from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

## Load Tokenizer and Preprocess Data

In [None]:
# Hugging Face checkpoint name; reused below for the tokenizer and the classifier.
base_model = 'roberta-base'

# Added: load the AG News training split (cleaning and filtering happen further down)
dataset = load_dataset('ag_news', split='train')
# Extract the number of classes and their names from the label feature
num_labels = dataset.features['label'].num_classes
class_names = dataset.features["label"].names
print(f"number of labels: {num_labels}")
print(f"the labels: {class_names}")

def clean_text(example):
    """Normalize ``example["text"]`` in place and return the same example.

    Characters that are neither word characters nor whitespace are removed,
    then every run of whitespace is collapsed to a single space and the
    result is stripped at both ends.
    """
    without_symbols = re.sub(r'[^\w\s]', '', example["text"])  # drop special characters
    example["text"] = re.sub(r'\s+', ' ', without_symbols).strip()
    return example

def filter_fn(example):
    """Keep an example only if its text has more than 5 and fewer than 200 whitespace-separated words."""
    word_count = len(example['text'].split())
    too_short = word_count <= 5
    too_long = word_count >= 200
    # Drop samples that are too short or too long
    return not (too_short or too_long)

# Apply the cleaning step, then filter out examples rejected by filter_fn
dataset = dataset.map(clean_text).filter(filter_fn)

# Tokenizer loaded from the same checkpoint name as the model
tokenizer = RobertaTokenizer.from_pretrained(base_model)

# def preprocess(examples):
#     tokenized = tokenizer(examples['text'], truncation=True, padding=True)
#     return tokenized
def preprocess(examples):
    """Tokenize ``examples['text']``, truncating and padding every entry to 128 tokens."""
    texts = examples['text']
    encoded = tokenizer(
        texts,
        max_length=128,
        padding="max_length",
        truncation=True,
    )
    return encoded
    
# Tokenize in batches and drop the raw "text" column from the result
tokenized_dataset = dataset.map(preprocess, batched=True,  remove_columns=["text"])
# Rename the target column from "label" to "labels"
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")

In [None]:
# Map each integer class id to its class name.
# The classifier is given this mapping when it is loaded.
id2label = dict(enumerate(class_names))

# Collator that pads each batch with the tokenizer and returns PyTorch tensors.
data_collator = DataCollatorWithPadding(return_tensors="pt", tokenizer=tokenizer)

## Load Pre-trained Model
Set up the config for the pretrained model and download it from Hugging Face.

In [None]:
# Load RoBERTa with a sequence-classification head; id2label supplies the class names.
model = RobertaForSequenceClassification.from_pretrained(
    base_model,
    id2label=id2label)
# Bare expression: the notebook shows the model's repr as the cell output.
model

## Anything from here on can be modified

In [None]:
# Split the original training set: hold out 640 examples for evaluation.
# The fixed seed keeps the split identical across runs.
split_datasets = tokenized_dataset.train_test_split(test_size=640, seed=42)
train_dataset = split_datasets['train']
eval_dataset = split_datasets['test']

## Setup LoRA Config
Set up the PEFT config and wrap the base model with it for fine-tuning.

In [None]:
# Changed LoRA settings: higher rank for more expressive adapters
peft_config = LoraConfig(
    r=16, 
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    # NOTE(review): "dense" presumably matches several linear layers per transformer block,
    # which adds many adapter weights — confirm the trainable-parameter count stays within budget.
    target_modules=["query", "key", "value", "dense"],  # remember to check the parameter count
    task_type="SEQ_CLS"
)

In [None]:
# Wrap the base model with the adapters described by peft_config
peft_model = get_peft_model(model, peft_config)
# Bare expression: display the wrapped model's structure as the cell output
peft_model

In [None]:
# Added: report the trainable-parameter count so it can be checked against the 1M limit
def count_trainable_params(model):
    """Return the total element count of all parameters that have ``requires_grad`` set."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

print(f"Total trainable parameters: {count_trainable_params(peft_model):,}")

# List the name of every parameter that has requires_grad set
print("Trainable parameters:")
for name, param in peft_model.named_parameters():
    if param.requires_grad:
        print(name)

In [None]:
# Print the parameter summary reported by the PEFT model itself
print('PEFT Model')
peft_model.print_trainable_parameters()

## Training Setup

In [None]:
# To track evaluation accuracy during training
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Changed: report more metrics than accuracy alone
def compute_metrics(pred):
    """Compute accuracy plus weighted precision, recall and F1.

    ``pred`` must expose ``label_ids`` and ``predictions``; the predicted class
    is the argmax over the last axis of ``predictions``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    # Shared settings for the class-averaged metrics
    weighted = {"average": 'weighted', "zero_division": 0}

    scores = {'accuracy': accuracy_score(y_true, y_pred)}
    scores['precision'] = precision_score(y_true, y_pred, **weighted)
    scores['recall'] = recall_score(y_true, y_pred, **weighted)
    scores['f1'] = f1_score(y_true, y_pred, **weighted)
    return scores

### Start Training

In [None]:
# Set up training arguments
output_dir = "results"
# Changed: optimizer, scheduler and checkpoint-saving strategy
training_args = TrainingArguments(
    output_dir=output_dir,
    # Evaluate every 250 steps and keep the checkpoint with the best weighted F1.
    eval_strategy='steps',
    eval_steps=250,
    save_strategy='steps',
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,

    lr_scheduler_type="cosine",
    warmup_steps=200,
    weight_decay=0.01,

    learning_rate=1.5e-5,
    # A positive max_steps takes precedence over num_train_epochs,
    # so training stops after 2500 optimizer steps.
    num_train_epochs=4,
    max_steps=2500,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    dataloader_num_workers=4,
    logging_steps=100,
    logging_dir="./logs",
    report_to="none",
    optim="adamw_torch",  # replaces the original sgd optimizer
    gradient_checkpointing=True,
    # Non-reentrant checkpointing is required here: the base model is frozen, so the
    # inputs to the checkpointed layers do not require grad, and the reentrant variant
    # would then produce no gradients for the LoRA weights inside those layers.
    gradient_checkpointing_kwargs={"use_reentrant": False}
)
# training_args = TrainingArguments(
#     output_dir=output_dir,
#     report_to=None,
#     eval_strategy='steps',
#     logging_steps=100,
#     learning_rate=5e-6,
#     num_train_epochs=1,
#     max_steps=1200,
#     use_cpu=False,
#     dataloader_num_workers=4,
#     per_device_train_batch_size=16,
#     per_device_eval_batch_size=64,
#     optim="sgd",
#     gradient_checkpointing=False,
#     gradient_checkpointing_kwargs={'use_reentrant':True}
# )

def get_trainer(model):
    """Build a Trainer for ``model`` from the notebook-level arguments, datasets, collator and metrics."""
    trainer_kwargs = dict(
        model=model,
        args=training_args,
        compute_metrics=compute_metrics,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=data_collator,
    )
    return Trainer(**trainer_kwargs)

In [None]:
# peft_lora_finetuning_trainer = get_trainer(peft_model)

# # result = peft_lora_finetuning_trainer.train()
# peft_lora_finetuning_trainer = get_trainer(peft_model)

# result = peft_lora_finetuning_trainer.train()
# Build the Trainer around the LoRA-wrapped model
trainer = get_trainer(peft_model)

from transformers import TrainerCallback

class LogHistoryCallback(TrainerCallback):
    """Trainer callback that stores a copy of every logged dict in the module-level ``history`` list."""

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Append a shallow copy of ``logs`` to ``history``; do nothing when ``logs`` is None."""
        if logs is None:
            return
        snapshot = logs.copy()
        history.append(snapshot)

# Shared list that LogHistoryCallback appends to; it must exist before training starts.
history = []
trainer.add_callback(LogHistoryCallback())

# ========== 9. Train ==========
train_result = trainer.train()
trainer.save_model()

# ========== 10. Evaluate ==========
metrics = trainer.evaluate()
print("Final Evaluation Metrics:", metrics)
# Persist the final metrics as JSON next to the saved model
with open("results/final_metrics.json", "w") as f:
    json.dump(metrics, f, indent=2)

## Evaluate Finetuned Model


### Performing Inference on Custom Input
Use the following function to run inference on custom inputs.

In [None]:
# Changed: show the prediction more concisely
def classify(model, tokenizer, text):
    """Predict the class name for a single text, print it and return it.

    Args:
        model: sequence-classification model whose output exposes ``.logits``.
        tokenizer: tokenizer matching ``model``.
        text: raw input string.

    Returns:
        The predicted class name, looked up in the notebook-level ``id2label``.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()
    # The model was fine-tuned on text passed through clean_text, so apply the
    # same normalization here to avoid a train/inference preprocessing mismatch.
    cleaned = clean_text({"text": text})["text"]
    inputs = tokenizer(cleaned, return_tensors="pt", truncation=True, padding=True).to(device)
    with torch.no_grad():
        output = model(**inputs)
        pred = output.logits.argmax(-1).item()
    label = id2label[pred]
    # Show the original (uncleaned) text, truncated to 80 characters
    print(f"\n[Prediction] Label: {label}, Text: {text[:80]}...")
    return label

In [None]:
# Two sample inputs. The backslash in the second one is escaped so the string keeps a
# literal backslash; an unescaped "\b" in a normal string literal is a backspace character.
classify( peft_model, tokenizer, "Kederis proclaims innocence Olympic champion Kostas Kederis today left hospital ahead of his date with IOC inquisitors claiming his ...")
classify( peft_model, tokenizer, "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.")

### Run Inference on eval_dataset

In [None]:
from torch.utils.data import DataLoader
import evaluate
from tqdm import tqdm

def evaluate_model(inference_model, dataset, labelled=True, batch_size=8, data_collator=None):
    """
    Run a model over a dataset and collect its argmax predictions.

    Args:
        inference_model: model whose forward output exposes ``.logits``.
        dataset: dataset fed to a DataLoader; each batch is passed to the model as keyword arguments.
        labelled: when True, batches must contain "labels" and metrics are computed.
        batch_size: DataLoader batch size.
        data_collator: optional collate function for the DataLoader.

    Returns:
        If labelled is True, returns a tuple (metrics_dict, predictions)
        If labelled is False, returns predictions only.
    """
    loader = DataLoader(dataset, batch_size=batch_size, collate_fn=data_collator)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    inference_model.to(device)
    inference_model.eval()

    # Result key -> (Hugging Face metric object, keyword arguments for compute())
    scorers = {}
    if labelled:
        scorers = {
            "accuracy": (evaluate.load("accuracy"), {}),
            "f1": (evaluate.load("f1"), {"average": "weighted"}),
            "precision": (evaluate.load("precision"), {"average": "weighted"}),
            "recall": (evaluate.load("recall"), {"average": "weighted"}),
        }

    pred_chunks = []
    label_chunks = []
    for batch in tqdm(loader, desc="Evaluating"):
        batch = {key: tensor.to(device) for key, tensor in batch.items()}
        with torch.no_grad():
            logits = inference_model(**batch).logits
        pred_chunks.append(logits.argmax(dim=-1).cpu())
        if labelled:
            label_chunks.append(batch["labels"].cpu())

    all_predictions = torch.cat(pred_chunks, dim=0)
    if not labelled:
        return all_predictions

    pred_array = all_predictions.numpy()
    label_array = torch.cat(label_chunks, dim=0).numpy()
    # Feed the full prediction/reference arrays to each metric, then compute them all
    for metric, _ in scorers.values():
        metric.add_batch(predictions=pred_array, references=label_array)
    results = {
        key: metric.compute(**compute_kwargs)[key]
        for key, (metric, compute_kwargs) in scorers.items()
    }
    print("Evaluation Metrics:", results)
    return results, all_predictions

In [None]:
# # Check evaluation accuracy
# _, _ = evaluate_model(peft_model, eval_dataset, True, 8, data_collator)

In [None]:
# Step 1: evaluate on the validation set
# Reports all metrics: accuracy, f1, precision, recall
eval_metrics, _ = evaluate_model(peft_model, eval_dataset, labelled=True, batch_size=8, data_collator=data_collator)


### Run Inference on unlabelled dataset

In [None]:
# #Load your unlabelled data
# unlabelled_dataset = pd.read_pickle("test_unlabelled.pkl")
# test_dataset = unlabelled_dataset.map(preprocess, batched=True, remove_columns=["text"])
# unlabelled_dataset
# Step 2: load the unlabelled test data
# NOTE: unpickling can execute arbitrary code — only load this file from a trusted source.
unlabelled_dataset = pd.read_pickle("/kaggle/input/deep-learning-spring-2025-project-2/test_unlabelled.pkl")
# Apply the same clean_text normalization used on the training data before tokenizing,
# so the model sees test text in the form it was fine-tuned on.
test_dataset = unlabelled_dataset.map(clean_text).map(preprocess, batched=True, remove_columns=["text"])


In [None]:
# Run inference on the unlabelled test set and save the predictions
preds = evaluate_model(peft_model, test_dataset, False, 8, data_collator)
# One row per prediction; ID is the row's position in the prediction tensor
df_output = pd.DataFrame({
    'ID': range(len(preds)),
    'Label': preds.numpy()  # or preds.tolist()
})
df_output.to_csv(os.path.join(output_dir,"inference_output.csv"), index=False)
print("Inference complete. Predictions saved to inference_output.csv")

In [None]:
# # # Run inference and save predictions
# # preds = evaluate_model(peft_model, test_dataset, False, 8, data_collator)
# # df_output = pd.DataFrame({
# #     'ID': range(len(preds)),
# #     'Label': preds.numpy()  # or preds.tolist()
# # })
# # df_output.to_csv(os.path.join(output_dir,"inference_output.csv"), index=False)
# # print("Inference complete. Predictions saved to inference_output.csv")

# # ✅ 第 3 步：模型推理并保存结果
# preds = evaluate_model(peft_model, test_dataset, labelled=False, batch_size=8, data_collator=data_collator)

# # ✅ 建议转换为 list 避免 numpy 版本兼容问题
# df_output = pd.DataFrame({
#     'ID': range(len(preds)),
#     'Label': preds.tolist()  # ✅ 更推荐 than .numpy()
# })

# output_csv = os.path.join(output_dir, "inference_output.csv")
# df_output.to_csv(output_csv, index=False)

# print(f"Inference complete. Predictions saved to {output_csv}")

In [None]:
import matplotlib.pyplot as plt

# Read the curves from the trainer's own log history: every entry there records the
# global step it was logged at, so the x-axes show real training steps. Evaluation
# runs on a different cadence than loss logging, so steps cannot be derived from the index.
log_history = trainer.state.log_history

train_logs = [log for log in log_history if "loss" in log]
train_steps = [log["step"] for log in train_logs]
train_loss = [log["loss"] for log in train_logs]

eval_logs = [log for log in log_history if "eval_accuracy" in log]
eval_steps = [log["step"] for log in eval_logs]
eval_acc = [log["eval_accuracy"] for log in eval_logs]

fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(10, 4))

# Left panel: training loss at each logging step
loss_ax.plot(train_steps, train_loss, label='Train Loss')
loss_ax.set_xlabel("Steps")
loss_ax.set_ylabel("Loss")
loss_ax.set_title("Training Loss")
loss_ax.grid(True)

# Right panel: evaluation accuracy at each evaluation step
acc_ax.plot(eval_steps, eval_acc, label='Eval Accuracy', color='green')
acc_ax.set_xlabel("Steps")
acc_ax.set_ylabel("Accuracy")
acc_ax.set_title("Eval Accuracy")
acc_ax.grid(True)

fig.tight_layout()
fig.savefig("results/training_metrics_plot.png")
plt.show()
