# Text Classification – DistilBERT (EN/VN)
**Objective/Mục tiêu**: Fine-tune DistilBERT on AG News for a fast baseline.
**Inputs**: `datasets`, `transformers`, `evaluate`.
**Outputs**: Accuracy/F1, confusion matrix, saved model/metrics.


In [None]:
# !pip install -q transformers datasets evaluate accelerate scikit-learn torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu121

In [None]:

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, TrainingArguments, Trainer
import numpy as np
import evaluate

# Load AG News; the "train" and "test" entries are indexed further down.
dataset = load_dataset("ag_news")
# Class names are read from the "label" feature of the train split; their
# count is passed to the model as `num_labels`.
label_names = dataset["train"].features["label"].names
num_labels = len(label_names)

# One checkpoint name is used for the tokenizer here and for the model later.
model_name = "distilbert/distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(batch):
    """Run the module-level tokenizer over a batch's ``"text"`` entries.

    Args:
        batch: Mapping with a ``"text"`` key holding the raw inputs.

    Returns:
        Whatever the tokenizer returns for those texts with truncation
        enabled. No padding option is passed here.
    """
    texts = batch["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

# Apply tokenize() to the dataset in batches; the raw "text" column is
# removed from the result.
tokenized = dataset.map(tokenize, batched=True, remove_columns=["text"])
# Padding is left to this collator (tokenize() passes no padding option).
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Metric objects from the `evaluate` library, consumed by compute_metrics().
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")

def compute_metrics(p):
    """Compute accuracy and macro-averaged F1 for a prediction object.

    Args:
        p: Object exposing ``predictions`` and ``label_ids``.
            Assumes ``predictions`` is a 2-D array of per-class scores
            (one row per example) -- TODO confirm against the Trainer output.

    Returns:
        Dict with keys ``"accuracy"`` and ``"f1_macro"``.
    """
    # Predicted class = index of the highest score along axis 1.
    predicted = np.argmax(p.predictions, axis=1)

    accuracy_result = accuracy.compute(predictions=predicted, references=p.label_ids)
    f1_result = f1.compute(predictions=predicted, references=p.label_ids, average="macro")

    scores = {}
    scores["accuracy"] = accuracy_result["accuracy"]
    scores["f1_macro"] = f1_result["f1"]
    return scores

# Load the same checkpoint as the tokenizer for sequence classification, with
# `num_labels` set to the number of label names read from the dataset.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

import inspect

# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
# releases, and the old keyword is eventually rejected with a TypeError.
# Pick whichever name the installed TrainingArguments actually accepts so the
# notebook runs on both older and current versions.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

args = TrainingArguments(
    output_dir="./outputs",
    # Checkpoints are saved on the same schedule as evaluation, which
    # load_best_model_at_end requires.
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=2,
    weight_decay=0.01,
    load_best_model_at_end=True,
    # Matches the "accuracy" key returned by compute_metrics().
    metric_for_best_model="accuracy",
    logging_steps=50,
    # Evaluate once per epoch, under the version-appropriate keyword.
    **{_eval_strategy_key: "epoch"},
)

import inspect

# Newer transformers releases receive the tokenizer through `processing_class`
# and deprecate (later drop) the `tokenizer` keyword. Use whichever keyword
# the installed Trainer accepts so the cell works across versions.
_trainer_arg_names = inspect.signature(Trainer.__init__).parameters
_tokenizer_key = (
    "processing_class" if "processing_class" in _trainer_arg_names else "tokenizer"
)

trainer = Trainer(
    model=model,
    args=args,
    # Fixed-seed shuffle followed by a small subset: 5000 training and
    # 2000 evaluation examples.
    train_dataset=tokenized["train"].shuffle(seed=42).select(range(5000)),
    eval_dataset=tokenized["test"].shuffle(seed=42).select(range(2000)),
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    **{_tokenizer_key: tokenizer},
)

# Fine-tune, then evaluate on the eval subset that was passed to the Trainer.
trainer.train()
metrics = trainer.evaluate()
# Bare expression so the notebook displays the metrics as the cell output;
# `metrics` is also written to metrics.json in the save cell.
metrics



# Confusion matrix (optional)


In [None]:
import numpy as np
from sklearn.metrics import confusion_matrix
# Predict on exactly the subset the Trainer evaluated on (the seed-42 shuffled
# 2000 test examples) so the confusion matrix corresponds to the reported
# accuracy/F1. The previous version used the first 2000 *unshuffled* test
# rows, a different sample, and built that subset twice.
eval_output = trainer.predict(trainer.eval_dataset)
preds = np.argmax(eval_output.predictions, axis=1)
# Reference labels come from the same prediction pass, so rows of the matrix
# (true classes) and columns (predicted classes) stay aligned.
cm = confusion_matrix(eval_output.label_ids, preds)
cm


# Save artifacts


In [None]:
# `json` is not imported anywhere else in the notebook; without this import
# the json.dump call below raises NameError after the model is already saved.
import json

# Write the model and the tokenizer files into the same directory.
trainer.save_model("./model_distilbert_agnews")
tokenizer.save_pretrained("./model_distilbert_agnews")
# Persist the evaluation metrics dict produced by trainer.evaluate().
with open("metrics.json", "w", encoding="utf-8") as f:
    json.dump(metrics, f, indent=2)
print("Saved to ./model_distilbert_agnews and metrics.json")