In [1]:
# =========================
# IMPORT
# =========================
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score
import os
import shutil
# =========================
# CONFIG
# =========================
PHOBERT_DIR = "/kaggle/input/phobert-ckpt2"  # PhoBERT pretrained
TRAIN_FILE = "/kaggle/input/nlp-final-prj2/train_data1.csv"
VAL_FILE = "/kaggle/input/nlp-final-prj2/val_data1.csv"

BATCH_SIZE = 16
MAX_LEN = 256
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
EPOCHS = 10
LEARNING_RATE = 2e-5
HIDDEN_SIZE = 128
NUM_LAYERS = 1
DROPOUT = 0.3

TEXT_COL = "text"
LABEL_COL = "label"

# =========================
# LOAD DATA
# =========================
df_train = pd.read_csv(TRAIN_FILE)
df_val = pd.read_csv(VAL_FILE)

train_texts = df_train[TEXT_COL].astype(str).tolist()
train_labels = df_train[LABEL_COL].astype(int).to_numpy()

val_texts = df_val[TEXT_COL].astype(str).tolist()
val_labels = df_val[LABEL_COL].astype(int).to_numpy()

# =========================
# TOKENIZER & PHOBERT
# =========================
tokenizer = AutoTokenizer.from_pretrained(PHOBERT_DIR, local_files_only=True)
phobert_model = AutoModel.from_pretrained(PHOBERT_DIR, local_files_only=True)
phobert_model.to(DEVICE)
phobert_model.train()  # allow gradients

# =========================
# DATASET
# =========================
class NewsDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=MAX_LEN):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        enc = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )
        # squeeze to remove batch dim
        item = {k: v.squeeze(0) for k, v in enc.items()}
        label = torch.tensor(self.labels[idx], dtype=torch.long)
        return item, label

train_dataset = NewsDataset(train_texts, train_labels, tokenizer)
val_dataset = NewsDataset(val_texts, val_labels, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

# =========================
# LSTM CLASSIFIER
# =========================
class LSTMClassifier(nn.Module):
    def __init__(self, phobert, hidden_size=HIDDEN_SIZE, num_layers=NUM_LAYERS, dropout=DROPOUT, num_classes=2):
        super().__init__()
        self.phobert = phobert
        self.lstm = nn.LSTM(
            input_size=phobert.config.hidden_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,
            dropout=dropout if num_layers>1 else 0
        )
        self.fc = nn.Linear(hidden_size*2, num_classes)
    
    def forward(self, input_ids, attention_mask):
        outputs = self.phobert(input_ids=input_ids, attention_mask=attention_mask)
        # embeddings: [batch, seq_len, hidden]
        x = outputs.last_hidden_state
        out, (hn, cn) = self.lstm(x)
        out = out[:, -1, :]  # last time step
        logits = self.fc(out)
        return logits

model = LSTMClassifier(phobert_model)
model.to(DEVICE)

# =========================
# LOSS & OPTIMIZER
# =========================
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)

# =========================
# TRAIN LOOP
# =========================
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    for batch, label in train_loader:
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        label = label.to(DEVICE)
        
        optimizer.zero_grad()
        logits = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(logits, label)
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    print(f"Epoch {epoch+1}/{EPOCHS} - Loss: {total_loss / len(train_loader):.4f}")

# =========================
# EVALUATION
# =========================
model.eval()
all_preds, all_labels = [], []

with torch.no_grad():
    for batch, label in val_loader:
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        label = label.to(DEVICE)
        
        logits = model(input_ids=input_ids, attention_mask=attention_mask)
        preds = torch.argmax(logits, dim=-1)
        
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(label.cpu().numpy())

acc = accuracy_score(all_labels, all_preds)
f1_macro = f1_score(all_labels, all_preds, average="macro")
f1_weighted = f1_score(all_labels, all_preds, average="weighted")
precision_macro = precision_score(all_labels, all_preds, average="macro")
recall_macro = recall_score(all_labels, all_preds, average="macro")

print("\n===== VALIDATION METRICS =====")
print(f"Accuracy          : {acc:.4f}")
print(f"F1 (macro)        : {f1_macro:.4f}")
print(f"F1 (weighted)     : {f1_weighted:.4f}")
print(f"Precision (macro) : {precision_macro:.4f}")
print(f"Recall (macro)    : {recall_macro:.4f}")
print("\nClassification Report:")
print(classification_report(all_labels, all_preds, digits=4))


2025-12-26 15:42:38.831011: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1766763759.024817      23 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1766763759.083110      23 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1766763759.555335      23 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1766763759.555372      23 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1766763759.555375      23 computation_placer.cc:177] computation placer alr

Epoch 1/10 - Loss: 0.1839
Epoch 2/10 - Loss: 0.0907
Epoch 3/10 - Loss: 0.0543
Epoch 4/10 - Loss: 0.0413
Epoch 5/10 - Loss: 0.0305
Epoch 6/10 - Loss: 0.0206
Epoch 7/10 - Loss: 0.0335
Epoch 8/10 - Loss: 0.0183
Epoch 9/10 - Loss: 0.0132
Epoch 10/10 - Loss: 0.0138

===== VALIDATION METRICS =====
Accuracy          : 0.9709
F1 (macro)        : 0.9651
F1 (weighted)     : 0.9706
Precision (macro) : 0.9776
Recall (macro)    : 0.9547

Classification Report:
              precision    recall  f1-score   support

           0     0.9622    0.9971    0.9793       689
           1     0.9929    0.9123    0.9509       308

    accuracy                         0.9709       997
   macro avg     0.9776    0.9547    0.9651       997
weighted avg     0.9717    0.9709    0.9706       997



In [2]:
SAVE_DIR = "/kaggle/working/phobert_lstm_finetuned"
ZIP_PATH = "/kaggle/working/phobert_lstm_finetuned.zip"

os.makedirs(SAVE_DIR, exist_ok=True)

# Lưu state_dict
torch.save(model.state_dict(), os.path.join(SAVE_DIR, "model.pt"))

# Lưu config cần thiết để load lại
save_config = {
    "hidden_size": HIDDEN_SIZE,
    "num_layers": NUM_LAYERS,
    "dropout": DROPOUT,
    "num_classes": 2
}
torch.save(save_config, os.path.join(SAVE_DIR, "config.pt"))

# Zip toàn bộ thư mục
shutil.make_archive(
    base_name=ZIP_PATH.replace(".zip", ""),
    format="zip",
    root_dir=SAVE_DIR
)

print("Model saved to:", ZIP_PATH)


Model saved to: /kaggle/working/phobert_lstm_finetuned.zip
