In [None]:
!pip install transformers torch scikit-learn --quiet

In [None]:
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, get_scheduler
from sklearn.metrics import classification_report
from tqdm import tqdm
import os

In [None]:
from data_loaders import FactCheckDataset, create_dataloaders

In [None]:
# Hugging Face checkpoint id; the tokenizer and the classifier are both loaded from it.
model_name = "vinai/phobert-base"
# CSV inputs, relative to the notebook's working directory. Both are handed to
# create_dataloaders; test_path is also read directly when building the submission.
train_path = "../data/vihallu-train.csv"
test_path  = "../data/vihallu-public-test.csv"

In [None]:
# Training hyperparameters, read by the data-loading, optimizer and training cells.
batch_size = 8    # examples per batch
max_len = 128     # maximum sequence length handed to the tokenization step
epochs = 10       # full passes over the training loader
lr = 2e-5         # initial learning rate for the optimizer

# Seed torch's random number generators so repeated runs start from the same state.
seed = 42
torch.manual_seed(seed)

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Load the tokenizer for the chosen checkpoint (use_fast=False is requested explicitly).
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)

# create_dataloaders is a project helper: it returns the train loader, the test
# loader and the label-name -> class-id mapping.
train_loader, test_loader, label2id = create_dataloaders(
    train_path,
    test_path,
    tokenizer,
    batch_size=batch_size,
    max_len=max_len,
)

# Inverse mapping (class id -> label name), needed to decode predictions later.
id2label = dict(zip(label2id.values(), label2id.keys()))
print("Label mapping:", label2id)

In [None]:
# Build the classifier from the pretrained checkpoint with one output per label,
# and store both label mappings in the model config.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id
)
# Trailing semicolon: .to() returns the module, and without it the cell would
# print the entire network architecture as its output.
model.to(device);

In [None]:
# Optimizer and learning-rate schedule for fine-tuning.
#
# torch.optim.AdamW is used instead of the AdamW class exported by transformers,
# which is deprecated upstream in favour of the PyTorch implementation. eps and
# weight_decay are set explicitly to the defaults of the deprecated class, so
# the update rule matches what this notebook used before.
# NOTE: the import cell still lists AdamW in its transformers import; drop it
# there when upgrading transformers.
optimizer = torch.optim.AdamW(model.parameters(), lr=lr, eps=1e-6, weight_decay=0.0)

# The scheduler is stepped once per batch, so its horizon is
# (number of epochs) * (batches per epoch).
num_training_steps = epochs * len(train_loader)
lr_scheduler = get_scheduler(
    "linear", optimizer=optimizer,
    num_warmup_steps=0,  # linear decay starts from the first step
    num_training_steps=num_training_steps
)

In [None]:
from sklearn.metrics import accuracy_score

# Fine-tuning loop: one pass over train_loader per epoch. After each epoch it
# reports the mean batch loss and the accuracy of the predictions made while training.
for epoch in range(epochs):
    print(f"\n===== Epoch {epoch+1}/{epochs} =====")
    model.train()
    total_loss = 0
    total_preds = []
    total_labels = []

    progress = tqdm(train_loader, desc="Training", leave=True)
    for batch in progress:
        optimizer.zero_grad()

        # Move the three tensors the model consumes onto the training device.
        input_ids, attention_mask, labels = (
            batch[key].to(device) for key in ("input_ids", "attention_mask", "labels")
        )

        # Passing labels makes the model return the loss alongside the logits.
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
        )
        loss = outputs.loss
        batch_loss = loss.item()
        total_loss += batch_loss

        # Backward pass, gradient-norm clipping at 1.0, then one optimizer and
        # one scheduler step per batch.
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        lr_scheduler.step()

        # Record predictions and gold labels for the epoch-level accuracy.
        batch_preds = outputs.logits.argmax(dim=-1)
        total_preds.extend(batch_preds.cpu().numpy())
        total_labels.extend(labels.cpu().numpy())

        progress.set_postfix(loss=batch_loss)

    avg_loss = total_loss / len(train_loader)
    acc = accuracy_score(total_labels, total_preds)
    print(f"Train Loss: {avg_loss:.4f}, Train Acc: {acc:.4f}")


In [None]:
# Evaluate the model on test_loader and print a per-class report.
model.eval()
preds, trues = [], []
with torch.no_grad():  # inference only, so no gradients are tracked
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        # Labels are not passed to the model here: only the logits are needed.
        outputs = model(input_ids=input_ids,
                        attention_mask=attention_mask)
        predictions = torch.argmax(outputs.logits, dim=-1)

        preds.extend(predictions.cpu().tolist())
        # The gold labels never feed the model, so they are read straight from
        # the batch without a device round-trip.
        trues.extend(batch["labels"].tolist())

# Pass the class ids explicitly and order the names by id. Without `labels=`,
# target_names are matched positionally against the sorted ids that happen to
# occur in the data, which mislabels rows if label2id is not in id order and
# fails when a class is missing from both trues and preds.
ordered_labels = sorted(label2id.items(), key=lambda item: item[1])
print("Classification Report:")
print(classification_report(
    trues,
    preds,
    labels=[class_id for _, class_id in ordered_labels],
    target_names=[label_name for label_name, _ in ordered_labels],
))

In [None]:
# Persist the fine-tuned weights together with the tokenizer files in one
# directory (created if absent); the model is written first, then the tokenizer.
save_dir = "./checkpoints"
os.makedirs(save_dir, exist_ok=True)
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_dir)
print(f"Model saved to {save_dir}")

In [None]:
import pandas as pd

# Run after the model has been trained.
def predict_and_save(model, tokenizer, test_path, save_path, label2id, max_len=128, batch_size=8):
    """Predict a label for every row of a test CSV and write a submission CSV.

    Args:
        model: Sequence-classification model. It is called with ``input_ids``
            and ``attention_mask`` and must return an object exposing ``.logits``.
        tokenizer: Callable tokenizer returning ``input_ids`` and
            ``attention_mask`` tensors.
        test_path: Path of the test CSV; it must contain ``id`` and ``text`` columns.
        save_path: Destination CSV. Missing parent directories are created.
        label2id: Mapping from label name to class index; inverted here to
            decode the predicted indices.
        max_len: Truncation length handed to the tokenizer.
        batch_size: Number of rows per inference batch.

    Returns:
        DataFrame with columns ``id`` and ``predict_label`` (the same content
        that is written to ``save_path``).

    Raises:
        KeyError: If the test CSV lacks the ``id`` or ``text`` column.
    """
    # Load the test file.
    df_test = pd.read_csv(test_path)
    missing_cols = [col for col in ("id", "text") if col not in df_test.columns]
    if missing_cols:
        raise KeyError(f"{test_path} is missing required column(s): {missing_cols}")

    # Invert the mapping: class index -> label name.
    id2label = {v: k for k, v in label2id.items()}

    # NOTE(review): the model input is taken from a single "text" column here,
    # while the training data goes through create_dataloaders. Confirm that both
    # paths build the input text the same way.
    # Empty cells are read by pandas as NaN; turn them into empty strings so the
    # tokenizer only ever receives str values.
    texts = df_test["text"].fillna("").astype(str).tolist()

    # Tokenize the test data (it carries no labels).
    encodings = tokenizer(
        texts,
        truncation=True,
        padding=True,
        max_length=max_len,
        return_tensors="pt"
    )

    test_dataset = torch.utils.data.TensorDataset(
        encodings["input_ids"],
        encodings["attention_mask"],
    )
    loader = DataLoader(test_dataset, batch_size=batch_size)

    # Run on whatever device the model already lives on instead of relying on
    # a notebook-level `device` variable.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    preds = []

    with torch.no_grad():
        for batch in loader:
            input_ids, attention_mask = [x.to(run_device) for x in batch]
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            batch_preds = torch.argmax(outputs.logits, dim=-1)
            # tolist() yields plain Python ints, matching the keys of id2label.
            preds.extend(batch_preds.cpu().tolist())

    # Map predicted class indices back to label names.
    pred_labels = [id2label[p] for p in preds]

    # Build the submission file.
    submit_df = pd.DataFrame({
        "id": df_test["id"],
        "predict_label": pred_labels
    })
    # to_csv does not create directories, so make sure the target folder exists.
    os.makedirs(os.path.dirname(save_path) or ".", exist_ok=True)
    submit_df.to_csv(save_path, index=False)
    print(f"✅ Submit file saved to {save_path}")
    return submit_df


# Write predictions for test_path to the submission CSV, then preview the first rows.
submit_path = "../result/submit.csv"
submit_df = predict_and_save(
    model=model,
    tokenizer=tokenizer,
    test_path=test_path,
    save_path=submit_path,
    label2id=label2id,
    max_len=max_len,
    batch_size=batch_size,
)

print(submit_df.head())
