In [1]:
# =========================
# IMPORT
# =========================
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics import classification_report, accuracy_score
import os
import zipfile

# =========================
# CONFIG
# =========================
PHOBERT_DIR = "/kaggle/input/phobert-ckpt2"  # pretrained/fine-tuned PhoBERT
TRAIN_FILE = "/kaggle/input/nlp-final-prj2/train_data1.csv"
VAL_FILE = "/kaggle/input/nlp-final-prj2/val_data1.csv"

BATCH_SIZE = 16
MAX_LEN = 256
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
EPOCHS = 5
LEARNING_RATE = 2e-5
HIDDEN_SIZE = 128
NUM_LAYERS = 1
DROPOUT = 0.3

TEXT_COL = "text"
LABEL_COL = "label"

SAVE_DIR = "/kaggle/working/lstm_model"
SAVE_FILE = "lstm_phobert.pt"
ZIP_FILE = "lstm_phobert.zip"

os.makedirs(SAVE_DIR, exist_ok=True)

# =========================
# LOAD DATA
# =========================
df_train = pd.read_csv(TRAIN_FILE)
df_val = pd.read_csv(VAL_FILE)

train_texts = df_train[TEXT_COL].astype(str).tolist()
train_labels = df_train[LABEL_COL].astype(int).to_numpy()

val_texts = df_val[TEXT_COL].astype(str).tolist()
val_labels = df_val[LABEL_COL].astype(int).to_numpy()

# =========================
# TOKENIZER & PHOBERT
# =========================
tokenizer = AutoTokenizer.from_pretrained(PHOBERT_DIR, local_files_only=True)
phobert_model = AutoModel.from_pretrained(PHOBERT_DIR, local_files_only=True)
phobert_model.to(DEVICE)
phobert_model.eval()  # frozen

def encode_texts(texts, max_len=MAX_LEN):
    enc = tokenizer(
        texts,
        truncation=True,
        padding="max_length",
        max_length=max_len,
        return_tensors="pt"
    )
    with torch.no_grad():
        outputs = phobert_model(**{k: v.to(DEVICE) for k, v in enc.items()})
        embeddings = outputs.last_hidden_state.mean(dim=1)  # mean pooling
    return embeddings.cpu()

# =========================
# DATASET CLASS
# =========================
class NewsDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        emb = encode_texts([self.texts[idx]])[0]  # vector embedding
        label = torch.tensor(self.labels[idx], dtype=torch.long)
        return emb, label

train_dataset = NewsDataset(train_texts, train_labels)
val_dataset = NewsDataset(val_texts, val_labels)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

# =========================
# LSTM MODEL
# =========================
class LSTMClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, dropout):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,
            dropout=dropout if num_layers > 1 else 0
        )
        self.fc = nn.Linear(hidden_size*2, 2)
    
    def forward(self, x):
        x = x.unsqueeze(1)  # fake seq_len dim = 1
        out, (hn, cn) = self.lstm(x)
        out = out[:, -1, :]
        logits = self.fc(out)
        return logits

input_size = train_dataset[0][0].shape[0]
model = LSTMClassifier(input_size, HIDDEN_SIZE, NUM_LAYERS, DROPOUT)
model.to(DEVICE)

# =========================
# TRAINING SETUP
# =========================
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)

# =========================
# TRAINING LOOP
# =========================
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    for emb, label in train_loader:
        emb = emb.to(DEVICE)
        label = label.to(DEVICE)
        
        optimizer.zero_grad()
        logits = model(emb)
        loss = criterion(logits, label)
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    print(f"Epoch {epoch+1}/{EPOCHS} - Loss: {total_loss / len(train_loader):.4f}")

# =========================
# EVALUATION
# =========================
model.eval()
all_preds, all_labels = [], []

with torch.no_grad():
    for emb, label in val_loader:
        emb = emb.to(DEVICE)
        logits = model(emb)
        preds = torch.argmax(logits, dim=-1).cpu().numpy()
        all_preds.extend(preds)
        all_labels.extend(label.numpy())

print("Validation Accuracy:", accuracy_score(all_labels, all_preds))
print(classification_report(all_labels, all_preds, digits=4))

# =========================
# SAVE MODEL
# =========================
save_path = os.path.join(SAVE_DIR, SAVE_FILE)
torch.save(model.state_dict(), save_path)
print(f"Model weights saved to: {save_path}")

# ZIP MODEL
zip_path = os.path.join(SAVE_DIR, ZIP_FILE)
with zipfile.ZipFile(zip_path, 'w') as zipf:
    zipf.write(save_path, arcname=SAVE_FILE)
print(f"Model weights zipped to: {zip_path}")


2025-12-26 09:15:26.262536: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1766740526.751193      24 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1766740526.874914      24 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1766740527.930994      24 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1766740527.931034      24 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1766740527.931037      24 computation_placer.cc:177] computation placer alr

Epoch 1/5 - Loss: 0.3185
Epoch 2/5 - Loss: 0.1633
Epoch 3/5 - Loss: 0.1550
Epoch 4/5 - Loss: 0.1518
Epoch 5/5 - Loss: 0.1483
Validation Accuracy: 0.950852557673019
              precision    recall  f1-score   support

           0     0.9420    0.9898    0.9653       689
           1     0.9744    0.8636    0.9157       308

    accuracy                         0.9509       997
   macro avg     0.9582    0.9267    0.9405       997
weighted avg     0.9520    0.9509    0.9500       997

Model weights saved to: /kaggle/working/lstm_model/lstm_phobert.pt
Model weights zipped to: /kaggle/working/lstm_model/lstm_phobert.zip
