# Set up

In [1]:
!pip install huggingface_hub[hf_xet] --quiet

In [2]:
!pip install scikit-learn==1.6.1

Collecting scikit-learn==1.6.1
  Downloading scikit_learn-1.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Downloading scikit_learn-1.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.5/13.5 MB[0m [31m88.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.2.2
    Uninstalling scikit-learn-1.2.2:
      Successfully uninstalled scikit-learn-1.2.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
category-encoders 2.7.0 requires scikit-learn<1.6.0,>=1.0.0, but you have scikit-learn 1.6.1 which is incompatible.
cesium 0.12.4 requires numpy<3.0,>=2.0, but you have numpy 1.26.4 which is incompatible.[0m[31m
[0mSucce

In [3]:
import kagglehub
import pickle
import os
import random
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader, DataLoader, TensorDataset
from transformers import (
    BertTokenizer, BertForSequenceClassification,
    XLNetTokenizer, XLNetForSequenceClassification,
    get_scheduler
)
from transformers import (
    BertTokenizer, BertForSequenceClassification,
    XLNetTokenizer, XLNetForSequenceClassification,
    get_scheduler, PreTrainedModel, PreTrainedTokenizer
)
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score
from tqdm import tqdm

from torch import nn, optim
from typing import Tuple, Type, Optional, Dict,List

import warnings
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.exceptions import ConvergenceWarning
from sklearn.model_selection import GridSearchCV, PredefinedSplit

def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    os.environ["PYTHONHASHSEED"] = str(seed)

set_seed()
warnings.filterwarnings("ignore")

2025-06-08 15:24:28.198735: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749396268.394972      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749396268.454352      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


# LLMs approach

In [None]:
#Load & prepare data
def load_and_prepare_data():
    path = kagglehub.dataset_download("tisdang/pps-data")
    llm_path = os.path.join(path, 'dataset/llm')

    with open(os.path.join(llm_path, 'train_data.pkl'), 'rb') as f:
        train_data = pickle.load(f)

    with open(os.path.join(llm_path, 'val_data.pkl'), 'rb') as f:
        val_data = pickle.load(f)

    with open(os.path.join(llm_path, 'test_data.pkl'), 'rb') as f:
        test_data = pickle.load(f)

    train_data = train_data.dropna(subset=['text'])
    val_data = val_data.dropna(subset=['text'])
    test_data = test_data.dropna(subset=['text'])

    label_map = {0: "Fake", 1: "True"}

    return train_data, val_data, test_data, label_map

#Tokenize
def tokenize(df, tokenizer, max_len):
    encodings = tokenizer(
        df["text"].tolist(),
        padding="max_length",
        truncation=True,
        max_length=max_len,
        return_tensors="pt"
    )
    labels = torch.tensor(df["labels"].values)
    return encodings, labels

#Dataset & DataLoader
class FakeNewsDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            "input_ids": self.encodings["input_ids"][idx],
            "attention_mask": self.encodings["attention_mask"][idx],
            "labels": self.labels[idx]
        }

def create_loader(encodings, labels, batch_size=16, shuffle=False, num_workers=0, seed=42):
    def worker_init_fn(worker_id):
        worker_seed = seed + worker_id
        np.random.seed(worker_seed)
        random.seed(worker_seed)

    dataset = FakeNewsDataset(encodings, labels)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle,
                      num_workers=num_workers, worker_init_fn=worker_init_fn)

#Trainer
class Trainer:
    def __init__(self, model, train_loader, val_loader, epochs=3, lr=2e-5):
        self.model = model.to(device)
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.epochs = epochs
        self.optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
        total_steps = len(train_loader) * epochs
        self.scheduler = get_scheduler("linear", self.optimizer, 0, total_steps)

    def train(self):
        for epoch in range(self.epochs):
            self.model.train()
            total_loss = 0
            all_preds, all_labels = [], []
            loop = tqdm(self.train_loader, desc=f"Epoch {epoch+1}", leave=False)

            for batch in loop:
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)
                labels = batch["labels"].to(device)

                self.optimizer.zero_grad()
                outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss
                loss.backward()
                self.optimizer.step()
                self.scheduler.step()

                total_loss += loss.item()
                preds = torch.argmax(outputs.logits, dim=1)
                all_preds.extend(preds.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())
                loop.set_postfix(loss=loss.item())

            avg_loss = total_loss / len(self.train_loader)
            acc = accuracy_score(all_labels, all_preds)
            tqdm.write(f"\nEpoch {epoch+1} - Train Loss: {avg_loss:.4f} | Accuracy: {acc:.4f}")
            self.evaluate(self.val_loader, name="Validation")

    def evaluate(self, loader, name="Validation"):
        self.model.eval()
        all_preds, all_labels = [], []
        total_loss = 0
        with torch.no_grad():
            for batch in tqdm(loader, desc=f"Evaluating {name}", leave=True):
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)
                labels = batch["labels"].to(device)
                outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                total_loss += outputs.loss.item()
                preds = torch.argmax(outputs.logits, dim=1)
                all_preds.extend(preds.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())

        avg_loss = total_loss / len(loader)
        acc = accuracy_score(all_labels, all_preds)
        precision = precision_score(all_labels, all_preds)
        recall = recall_score(all_labels, all_preds)
        f1 = f1_score(all_labels, all_preds)

        tqdm.write(f"{name} Loss: {avg_loss:.4f}")
        tqdm.write(f"{name} Accuracy: {acc:.4f} | Precision: {precision:.4f} | Recall: {recall:.4f} | F1-score: {f1:.4f}")
        tqdm.write("\n" + classification_report(all_labels, all_preds, digits=2))

        cm = confusion_matrix(all_labels, all_preds)
        cm_normalized = cm.astype('float') / cm.sum(axis=1, keepdims=True)
        tqdm.write("Confusion Matrix (normalized):")
        for i, row in enumerate(cm_normalized):
            tqdm.write(f"{i}: [{', '.join(f'{val:.2f}' for val in row)}]")


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_data, val_data, test_data, label_map = load_and_prepare_data()

In [None]:
def llm_pipeline(
    model_name: str,
    model_class: Type[PreTrainedModel],
    tokenizer_class: Type[PreTrainedTokenizer],
    train_data: pd.DataFrame,
    val_data: pd.DataFrame,
    test_data: pd.DataFrame,
    max_len: int = 64,
    batch_size: int = 8,
    epochs: int = 3,
    lr: float = 2e-5,
    num_workers: int = 2,
    label_map: Optional[Dict[int, str]] = None
) -> None:

    if label_map is None:
        label_map = {0: "Fake", 1: "True"}

    print(f"\n======== Running {model_name.upper()} ========\n")

    # Tokenizer
    tokenizer = tokenizer_class.from_pretrained(model_name)
    if "xlnet-base-cased" in model_name.lower():
        tokenizer.pad_token = tokenizer.eos_token

    # Tokenize
    train_enc, train_labels = tokenize(train_data, tokenizer, max_len)
    val_enc, val_labels = tokenize(val_data, tokenizer, max_len)
    test_enc, test_labels = tokenize(test_data, tokenizer, max_len)

    # DataLoader
    train_loader = create_loader(train_enc, train_labels, batch_size=batch_size, shuffle=True, num_workers=num_workers)
    val_loader = create_loader(val_enc, val_labels, batch_size=batch_size, num_workers=num_workers)
    test_loader = create_loader(test_enc, test_labels, batch_size=batch_size, num_workers=num_workers)

    # Model
    model = model_class.from_pretrained(model_name, num_labels=2).to(device)

    # Thông tin model
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"Total parameters: {total_params:,}")
    print(f"Trainable parameters: {trainable_params:,}")
    print("\nModel architecture:\n")
    print(model)

    # Training
    trainer = Trainer(model, train_loader, val_loader, epochs, lr)
    trainer.train()

    # Evaluation
    print(f"\n----- {model_name.upper()} Evaluation on Test Set -----")
    trainer.evaluate(test_loader, name="Test")


## BERT Model

In [6]:
llm_pipeline("bert-base-uncased", BertForSequenceClassification, BertTokenizer, train_data, val_data, test_data, label_map=label_map)





tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Total parameters: 109,483,778
Trainable parameters: 109,483,778

Model architecture:

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features

                                                                           


Epoch 1 - Train Loss: 0.1001 | Accuracy: 0.9622


Evaluating Validation: 100%|██████████| 1965/1965 [01:06<00:00, 29.75it/s]


Validation Loss: 0.0561
Validation Accuracy: 0.9796 | Precision: 0.9739 | Recall: 0.9806 | F1-score: 0.9772

              precision    recall  f1-score   support

           0       0.98      0.98      0.98      8715
           1       0.97      0.98      0.98      6999

    accuracy                           0.98     15714
   macro avg       0.98      0.98      0.98     15714
weighted avg       0.98      0.98      0.98     15714

Confusion Matrix (normalized):
0: [0.98, 0.02]
1: [0.02, 0.98]


                                                                           


Epoch 2 - Train Loss: 0.0275 | Accuracy: 0.9909


Evaluating Validation: 100%|██████████| 1965/1965 [01:05<00:00, 29.92it/s]


Validation Loss: 0.0525
Validation Accuracy: 0.9833 | Precision: 0.9800 | Recall: 0.9826 | F1-score: 0.9813

              precision    recall  f1-score   support

           0       0.99      0.98      0.98      8715
           1       0.98      0.98      0.98      6999

    accuracy                           0.98     15714
   macro avg       0.98      0.98      0.98     15714
weighted avg       0.98      0.98      0.98     15714

Confusion Matrix (normalized):
0: [0.98, 0.02]
1: [0.02, 0.98]


                                                                          


Epoch 3 - Train Loss: 0.0053 | Accuracy: 0.9984


Evaluating Validation: 100%|██████████| 1965/1965 [01:05<00:00, 29.85it/s]


Validation Loss: 0.0643
Validation Accuracy: 0.9840 | Precision: 0.9809 | Recall: 0.9833 | F1-score: 0.9821

              precision    recall  f1-score   support

           0       0.99      0.98      0.99      8715
           1       0.98      0.98      0.98      6999

    accuracy                           0.98     15714
   macro avg       0.98      0.98      0.98     15714
weighted avg       0.98      0.98      0.98     15714

Confusion Matrix (normalized):
0: [0.98, 0.02]
1: [0.02, 0.98]

----- BERT-BASE-UNCASED Evaluation on Test Set -----


Evaluating Test: 100%|██████████| 1965/1965 [01:05<00:00, 29.93it/s]


Test Loss: 0.0799
Test Accuracy: 0.9829 | Precision: 0.9794 | Recall: 0.9818 | F1-score: 0.9806

              precision    recall  f1-score   support

           0       0.99      0.98      0.98      8782
           1       0.98      0.98      0.98      6938

    accuracy                           0.98     15720
   macro avg       0.98      0.98      0.98     15720
weighted avg       0.98      0.98      0.98     15720

Confusion Matrix (normalized):
0: [0.98, 0.02]
1: [0.02, 0.98]


## XLNet Model

In [7]:
llm_pipeline("xlnet-base-cased", XLNetForSequenceClassification, XLNetTokenizer, train_data, val_data, test_data)





spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/760 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/467M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/467M [00:00<?, ?B/s]

Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Total parameters: 117,310,466
Trainable parameters: 117,310,466

Model architecture:

XLNetForSequenceClassification(
  (transformer): XLNetModel(
    (word_embedding): Embedding(32000, 768)
    (layer): ModuleList(
      (0-11): 12 x XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ff): XLNetFeedForward(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (layer_1): Linear(in_features=768, out_features=3072, bias=True)
          (layer_2): Linear(in_features=3072, out_features=768, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (activation_function): GELUActivation()
        )
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (sequence_summary): SequenceSummary(
    (summary): Linear(in_features=768, out_featu


Epoch 1:   0%|          | 0/5895 [00:00<?, ?it/s][A
Epoch 1:   0%|          | 0/5895 [00:00<?, ?it/s, loss=0.705][A
Epoch 1:   0%|          | 1/5895 [00:00<36:01,  2.73it/s, loss=0.705][A
Epoch 1:   0%|          | 1/5895 [00:00<36:01,  2.73it/s, loss=0.584][A
Epoch 1:   0%|          | 2/5895 [00:00<26:01,  3.77it/s, loss=0.584][A
Epoch 1:   0%|          | 2/5895 [00:00<26:01,  3.77it/s, loss=0.663][A
Epoch 1:   0%|          | 3/5895 [00:00<26:23,  3.72it/s, loss=0.663][A
Epoch 1:   0%|          | 3/5895 [00:01<26:23,  3.72it/s, loss=0.861][A
Epoch 1:   0%|          | 4/5895 [00:01<24:42,  3.97it/s, loss=0.861][A
Epoch 1:   0%|          | 4/5895 [00:01<24:42,  3.97it/s, loss=0.71] [A
Epoch 1:   0%|          | 5/5895 [00:01<24:34,  3.99it/s, loss=0.71][A
Epoch 1:   0%|          | 5/5895 [00:01<24:34,  3.99it/s, loss=0.554][A
Epoch 1:   0%|          | 6/5895 [00:01<27:02,  3.63it/s, loss=0.554][A
Epoch 1:   0%|          | 6/5895 [00:01<27:02,  3.63it/s, loss=0.87] [A
Epoch 


Epoch 1 - Train Loss: 0.0890 | Accuracy: 0.9666


Evaluating Validation: 100%|██████████| 1965/1965 [01:19<00:00, 24.68it/s]


Validation Loss: 0.0560
Validation Accuracy: 0.9835 | Precision: 0.9814 | Recall: 0.9816 | F1-score: 0.9815

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      8715
           1       0.98      0.98      0.98      6999

    accuracy                           0.98     15714
   macro avg       0.98      0.98      0.98     15714
weighted avg       0.98      0.98      0.98     15714

Confusion Matrix (normalized):
0: [0.99, 0.01]
1: [0.02, 0.98]


                                                                          


Epoch 2 - Train Loss: 0.0288 | Accuracy: 0.9896


Evaluating Validation: 100%|██████████| 1965/1965 [01:19<00:00, 24.69it/s]


Validation Loss: 0.0525
Validation Accuracy: 0.9858 | Precision: 0.9843 | Recall: 0.9839 | F1-score: 0.9841

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      8715
           1       0.98      0.98      0.98      6999

    accuracy                           0.99     15714
   macro avg       0.99      0.99      0.99     15714
weighted avg       0.99      0.99      0.99     15714

Confusion Matrix (normalized):
0: [0.99, 0.01]
1: [0.02, 0.98]


                                                                          


Epoch 3 - Train Loss: 0.0092 | Accuracy: 0.9969


Evaluating Validation: 100%|██████████| 1965/1965 [01:19<00:00, 24.69it/s]


Validation Loss: 0.0544
Validation Accuracy: 0.9868 | Precision: 0.9822 | Recall: 0.9881 | F1-score: 0.9852

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      8715
           1       0.98      0.99      0.99      6999

    accuracy                           0.99     15714
   macro avg       0.99      0.99      0.99     15714
weighted avg       0.99      0.99      0.99     15714

Confusion Matrix (normalized):
0: [0.99, 0.01]
1: [0.01, 0.99]

----- XLNET-BASE-CASED Evaluation on Test Set -----


Evaluating Test: 100%|██████████| 1965/1965 [01:19<00:00, 24.68it/s]


Test Loss: 0.0623
Test Accuracy: 0.9856 | Precision: 0.9830 | Recall: 0.9844 | F1-score: 0.9837

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      8782
           1       0.98      0.98      0.98      6938

    accuracy                           0.99     15720
   macro avg       0.99      0.99      0.99     15720
weighted avg       0.99      0.99      0.99     15720

Confusion Matrix (normalized):
0: [0.99, 0.01]
1: [0.02, 0.98]


# Deep Learning Approaches

In [8]:
def load_data(X_path: str, y_path: str, batch_size: int) -> torch.utils.data.DataLoader:
    with open(X_path, 'rb') as f:
        text_vector = pickle.load(f)
    with open(y_path, 'rb') as f:
        labels = pickle.load(f)
        
    return DataLoader(TensorDataset(torch.tensor(text_vector, dtype=torch.long), torch.tensor(labels)),
                      batch_size = batch_size, shuffle=True)
    

train_loader = load_data(X_path = '/kaggle/input/pps-data/dataset/dl/train_text.pkl',
                        y_path = '/kaggle/input/pps-data/dataset/dl/train_labels.pkl',
                        batch_size = 64)
test_loader = load_data(X_path = '/kaggle/input/pps-data/dataset/dl/test_text.pkl',
                        y_path = '/kaggle/input/pps-data/dataset/dl/test_labels.pkl',
                        batch_size = 64)
val_loader = load_data(X_path = '/kaggle/input/pps-data/dataset/dl/val_text.pkl',
                        y_path = '/kaggle/input/pps-data/dataset/dl/val_labels.pkl',
                        batch_size = 64)

## GRU

In [10]:
class GRUTextClassifier(nn.Module):
    def __init__(
        self,
        vocab_size: int = 1821,
        embedding_dim: int = 128,
        hidden_dim: int = 128,
        output_dim: int = 1,
        device: str = 'cpu'
    ) -> None:
        super(GRUTextClassifier, self).__init__()
        
        self.embedding: nn.Embedding = nn.Embedding(vocab_size, embedding_dim)
        self.gru: nn.GRU = nn.GRU(embedding_dim, hidden_dim, batch_first=True)
        self.fc: nn.Linear = nn.Linear(hidden_dim, output_dim)
        self.sigmoid: nn.Sigmoid = nn.Sigmoid()
        self.device: str = device
        self.to(device)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x (Tensor): Tensor of shape (batch_size, sequence_length), type torch.LongTensor
        
        Returns:
            Tensor: Output probabilities of shape (batch_size, 1), values in [0,1]
        """
        x = x.long().to(self.device)
        embedded = self.embedding(x)                         # (B, T, E)
        gru_out, _ = self.gru(embedded)                      # (B, T, H)
        last_hidden = gru_out[:, -1, :]                      # (B, H)
        output = self.fc(last_hidden)                        # (B, 1)
        return self.sigmoid(output)


In [11]:
class Trainer:
    def __init__(self, 
                 model: nn.Module, 
                 device: str ='cpu', 
                 lr: float = 1e-3,
                 epochs: int = 10,
                 patience: int = 3) -> None:
        self.model = model.to(device)
        self.device = device
        self.criterion = nn.BCEWithLogitsLoss()
        self.optimizer = optim.RMSprop(self.model.parameters(), lr=lr, alpha=0.9)
        self.patience = patience
        self.epochs = epochs
    
    def evaluate(self, 
                 data_loader: torch.utils.data.DataLoader, 
                 print_confusion_matrix: bool = False
                )-> Tuple[float]:
        self.model.eval()
        total_loss = 0.0
        all_labels = []
        all_preds = []

        with torch.no_grad():
            for inputs, labels in data_loader:
                inputs = inputs.to(self.device)
                labels = labels.to(self.device).float().unsqueeze(1)

                outputs = self.model(inputs)
                loss = self.criterion(outputs, labels)
                total_loss += loss.item() * inputs.size(0)

                preds = (outputs > 0.5).int()
                all_preds.extend(preds.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())

        avg_loss = total_loss / len(data_loader.dataset)
        accuracy = accuracy_score(all_labels, all_preds)
        precision = precision_score(all_labels, all_preds, zero_division=0)
        recall = recall_score(all_labels, all_preds, zero_division=0)
        f1 = f1_score(all_labels, all_preds, zero_division=0)

        if print_confusion_matrix:
            cm = confusion_matrix(all_labels, all_preds)
            cm_normalized = cm.astype('float') / cm.sum(axis=1, keepdims=True)
            print(f"Confusion Matrix (normalized):\n{cm_normalized}")

        return avg_loss, accuracy, precision, recall, f1

    def train(self, 
              train_loader: torch.utils.data.DataLoader, 
              val_loader: torch.utils.data.DataLoader
             ) -> None:
        best_val_loss = float('inf')
        best_model_state = None
        epochs_no_improve = 0

        for epoch in range(self.epochs):
            self.model.train()
            train_loss = 0.0
            loop = tqdm(train_loader, desc=f"Epoch {epoch+1}/{self.epochs}", leave=True)

            for inputs, labels in loop:
                inputs = inputs.to(self.device)
                labels = labels.to(self.device).float().unsqueeze(1)

                self.optimizer.zero_grad()
                outputs = self.model(inputs)
                loss = self.criterion(outputs, labels)
                loss.backward()
                self.optimizer.step()

                train_loss += loss.item() * inputs.size(0)
                loop.set_postfix(loss=loss.item())

            # Đánh giá trên tập huấn luyện và validation
            avg_train_loss, train_acc, train_prec, train_rec, train_f1 = self.evaluate(train_loader)
            val_loss, val_acc, val_prec, val_rec, val_f1 = self.evaluate(val_loader, print_confusion_matrix = True)

            print(f"\nEpoch {epoch+1}")
            print(f"  Train: loss={avg_train_loss:.4f}, acc={train_acc:.4f}, precision={train_prec:.4f}, recall={train_rec:.4f}, f1={train_f1:.4f}")
            print(f"  Val:   loss={val_loss:.4f}, acc={val_acc:.4f}, precision={val_prec:.4f}, recall={val_rec:.4f}, f1={val_f1:.4f}")

            if val_loss < best_val_loss:
                best_val_loss = val_loss
                best_model_state = self.model.state_dict()
                epochs_no_improve = 0
            else:
                epochs_no_improve += 1

            if epochs_no_improve >= self.patience:
                print(f"Early stopping at epoch {epoch+1}")
                break

        if best_model_state:
            self.model.load_state_dict(best_model_state)

    def inference(self, inputs):
        """
        Dự đoán đầu ra cho một batch tensor input.
        Đầu vào:
            - Nếu inputs có shape [seq_len]: sẽ được unsqueeze thành [1, seq_len]
            - Nếu inputs có shape [batch_size, seq_len]: dùng trực tiếp
        Đầu ra:
            - Nếu batch_size = 1: trả về 1 số (int, float)
            - Nếu batch_size > 1: trả về mảng numpy (preds, probs)
        """
        self.model.eval()
    
        # Nếu inputs là [seq_len], thêm batch dim
        if inputs.dim() == 1:
            inputs = inputs.unsqueeze(0)
    
        inputs = inputs.to(self.device)
    
        with torch.no_grad():
            outputs = self.model(inputs)
            logits = outputs.squeeze(-1) if outputs.shape[-1] == 1 else outputs
            probs = torch.sigmoid(logits)
    
        # Trả về 1 số nếu chỉ có 1 mẫu
        if preds.shape[0] == 1:
            return  probs.item()
        else:
            return probs.cpu().numpy()

In [12]:
gru_config = {
    "embedding_dim": 128, 
    "hidden_dim": 256, 
    "output_dim": 1, 
    "device":'cuda' if torch.cuda.is_available() else 'cpu', 
}

gru = GRUTextClassifier(**gru_config)

gru_trainer_config = {
    "model": gru,
    "lr": 5e-4,
    "epochs": 50,
    "device":'cuda' if torch.cuda.is_available() else 'cpu'
}


gru_trainer = Trainer(**gru_trainer_config)
gru_trainer.train(train_loader, val_loader)

print("=========== Evaluation ==============")
metrics = gru_trainer.evaluate(test_loader, print_confusion_matrix=True)
print(f"Accuracy: {metrics[1]} - Precision: {metrics[2]} - Recall: {metrics[3]} - F1: {metrics[4]}")

Epoch 1/50: 100%|██████████| 644/644 [00:12<00:00, 50.90it/s, loss=0.635]


Confusion Matrix (normalized):
[[0.67851917 0.32148083]
 [0.3025991  0.6974009 ]]

Epoch 1
  Train: loss=0.6195, acc=0.7643, precision=0.7684, recall=0.7593, f1=0.7638
  Val:   loss=0.6591, acc=0.6880, precision=0.6861, recall=0.6974, f1=0.6917


Epoch 2/50: 100%|██████████| 644/644 [00:13<00:00, 49.51it/s, loss=0.527]


Confusion Matrix (normalized):
[[0.67500732 0.32499268]
 [0.12806737 0.87193263]]

Epoch 2
  Train: loss=0.5948, acc=0.8319, precision=0.7925, recall=0.9011, f1=0.8433
  Val:   loss=0.6270, acc=0.7739, precision=0.7300, recall=0.8719, f1=0.7947


Epoch 3/50: 100%|██████████| 644/644 [00:12<00:00, 50.38it/s, loss=0.503]


Confusion Matrix (normalized):
[[0.66052092 0.33947908]
 [0.105416   0.894584  ]]

Epoch 3
  Train: loss=0.5848, acc=0.8523, precision=0.8088, recall=0.9244, f1=0.8627
  Val:   loss=0.6273, acc=0.7780, precision=0.7264, recall=0.8946, f1=0.8018


Epoch 4/50: 100%|██████████| 644/644 [00:12<00:00, 53.37it/s, loss=0.504]


Confusion Matrix (normalized):
[[0.75695054 0.24304946]
 [0.18353419 0.81646581]]

Epoch 4
  Train: loss=0.5674, acc=0.8720, precision=0.8663, recall=0.8808, f1=0.8735
  Val:   loss=0.6122, acc=0.7868, precision=0.7720, recall=0.8165, f1=0.7936


Epoch 5/50: 100%|██████████| 644/644 [00:12<00:00, 53.03it/s, loss=0.591]


Confusion Matrix (normalized):
[[0.78914252 0.21085748]
 [0.18556701 0.81443299]]

Epoch 5
  Train: loss=0.5603, acc=0.8818, precision=0.8905, recall=0.8716, f1=0.8810
  Val:   loss=0.6026, acc=0.8018, precision=0.7956, recall=0.8144, f1=0.8049


Epoch 6/50: 100%|██████████| 644/644 [00:12<00:00, 52.16it/s, loss=0.503]


Confusion Matrix (normalized):
[[0.74802458 0.25197542]
 [0.14360389 0.85639611]]

Epoch 6
  Train: loss=0.5586, acc=0.8911, precision=0.8810, recall=0.9054, f1=0.8930
  Val:   loss=0.6078, acc=0.8024, precision=0.7740, recall=0.8564, f1=0.8131


Epoch 7/50: 100%|██████████| 644/644 [00:12<00:00, 51.25it/s, loss=0.541]


Confusion Matrix (normalized):
[[0.86376939 0.13623061]
 [0.30927835 0.69072165]]

Epoch 7
  Train: loss=0.5646, acc=0.8556, precision=0.9272, recall=0.7729, f1=0.8431
  Val:   loss=0.6032, acc=0.7769, precision=0.8363, recall=0.6907, f1=0.7566


Epoch 8/50: 100%|██████████| 644/644 [00:12<00:00, 51.92it/s, loss=0.541]


Confusion Matrix (normalized):
[[0.74261048 0.25738952]
 [0.13721504 0.86278496]]

Epoch 8
  Train: loss=0.5535, acc=0.9018, precision=0.8907, recall=0.9169, f1=0.9036
  Val:   loss=0.6076, acc=0.8029, precision=0.7716, recall=0.8628, f1=0.8146
Early stopping at epoch 8
Confusion Matrix (normalized):
[[0.72182681 0.27817319]
 [0.18646983 0.81353017]]
Accuracy: 0.768457109540121 - Precision: 0.7515889830508474 - Recall: 0.8135301705604128 - F1: 0.7813338839562255


## LSTM

In [14]:
class LSTMTextClassifier(nn.Module):
    def __init__(
        self,
        vocab_size: int = 1821,
        embedding_dim: int = 128,
        hidden_dim: int = 128,
        output_dim: int = 1,
        device: str = 'cpu'
    ) -> None:
        super(LSTMTextClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.sigmoid = nn.Sigmoid()

        self.device = device
        self.to(device)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        embedded = self.embedding(x)
        lstm_out, (hidden, _) = self.lstm(embedded)
        hidden = hidden[-1]
        out = self.fc(hidden)
        return self.sigmoid(out)

In [15]:
lstm_config = {
    "embedding_dim": 128, 
    "hidden_dim": 512, 
    "output_dim": 1, 
    "device":'cuda' if torch.cuda.is_available() else 'cpu', 
}
lstm = LSTMTextClassifier(**lstm_config)

lstm_trainer_config = {
    "model": lstm,
    "lr": 5e-4,
    "epochs": 50,
    "device":'cuda' if torch.cuda.is_available() else 'cpu'
}


lstm_trainer = Trainer(**lstm_trainer_config, patience=4)
lstm_trainer.train(train_loader, val_loader)
lstm_trainer.evaluate(test_loader, print_confusion_matrix=True)

print("=========== Evaluation ==============")
metrics = lstm_trainer.evaluate(test_loader, print_confusion_matrix=True)
print(f"Accuracy: {metrics[1]} - Precision: {metrics[2]} - Recall: {metrics[3]} - F1: {metrics[4]}")

Epoch 1/50: 100%|██████████| 644/644 [00:39<00:00, 16.49it/s, loss=0.628]


Confusion Matrix (normalized):
[[0.64427861 0.35572139]
 [0.16494845 0.83505155]]

Epoch 1
  Train: loss=0.6012, acc=0.8187, precision=0.7823, recall=0.8850, f1=0.8305
  Val:   loss=0.6429, acc=0.7400, precision=0.7029, recall=0.8351, f1=0.7633


Epoch 2/50: 100%|██████████| 644/644 [00:39<00:00, 16.50it/s, loss=0.558]


Confusion Matrix (normalized):
[[0.59789289 0.40210711]
 [0.12763177 0.87236823]]

Epoch 2
  Train: loss=0.5966, acc=0.8311, precision=0.7853, recall=0.9132, f1=0.8445
  Val:   loss=0.6506, acc=0.7357, precision=0.6862, recall=0.8724, f1=0.7681


Epoch 3/50: 100%|██████████| 644/644 [00:39<00:00, 16.51it/s, loss=0.641]


Confusion Matrix (normalized):
[[0.84108867 0.15891133]
 [0.23987222 0.76012778]]

Epoch 3
  Train: loss=0.5689, acc=0.8509, precision=0.9021, recall=0.7885, f1=0.8415
  Val:   loss=0.5968, acc=0.8005, precision=0.8282, recall=0.7601, f1=0.7927


Epoch 4/50: 100%|██████████| 644/644 [00:39<00:00, 16.50it/s, loss=0.469]


Confusion Matrix (normalized):
[[0.70032192 0.29967808]
 [0.11340206 0.88659794]]

Epoch 4
  Train: loss=0.5644, acc=0.8841, precision=0.8598, recall=0.9190, f1=0.8884
  Val:   loss=0.6169, acc=0.7938, precision=0.7488, recall=0.8866, f1=0.8119


Epoch 5/50: 100%|██████████| 644/644 [00:39<00:00, 16.50it/s, loss=0.432]


Confusion Matrix (normalized):
[[0.67486099 0.32513901]
 [0.1446203  0.8553797 ]]

Epoch 5
  Train: loss=0.5600, acc=0.8911, precision=0.8726, recall=0.9169, f1=0.8942
  Val:   loss=0.6295, acc=0.7655, precision=0.7261, recall=0.8554, f1=0.7855


Epoch 6/50: 100%|██████████| 644/644 [00:39<00:00, 16.49it/s, loss=0.512]


Confusion Matrix (normalized):
[[0.8309921  0.1690079 ]
 [0.34311021 0.65688979]]

Epoch 6
  Train: loss=0.5642, acc=0.8566, precision=0.9303, recall=0.7722, f1=0.8439
  Val:   loss=0.6190, acc=0.7436, precision=0.7966, recall=0.6569, f1=0.7200


Epoch 7/50: 100%|██████████| 644/644 [00:39<00:00, 16.49it/s, loss=0.541]


Confusion Matrix (normalized):
[[0.75841381 0.24158619]
 [0.17874256 0.82125744]]

Epoch 7
  Train: loss=0.5475, acc=0.9075, precision=0.9152, recall=0.8989, f1=0.9070
  Val:   loss=0.6114, acc=0.7900, precision=0.7741, recall=0.8213, f1=0.7970
Early stopping at epoch 7
Confusion Matrix (normalized):
[[0.77387307 0.22612693]
 [0.20796904 0.79203096]]
Confusion Matrix (normalized):
[[0.77387307 0.22612693]
 [0.20796904 0.79203096]]
Accuracy: 0.7831061875956563 - Precision: 0.7837186214721316 - Recall: 0.7920309588648416 - F1: 0.787852865697177


# Traditional Machine Learning approach

In [17]:
path = kagglehub.dataset_download("tisdang/pps-data")
print("Path to dataset files:", path)
data_path = os.path.join(path, 'dataset/ml')

# Load data
with open(os.path.join(data_path, 'train_text.pkl'), 'rb') as f:
    train_data = pickle.load(f)

with open(os.path.join(data_path, 'val_text.pkl'), 'rb') as f:
    val_data = pickle.load(f)

with open(os.path.join(data_path, 'test_text.pkl'), 'rb') as f:
    test_data = pickle.load(f)

with open(os.path.join(data_path, 'train_labels.pkl'), 'rb') as f:
    train_labels = pickle.load(f)

with open(os.path.join(data_path, 'val_labels.pkl'), 'rb') as f:
    val_labels = pickle.load(f)

with open(os.path.join(data_path, 'test_labels.pkl'), 'rb') as f:
    test_labels = pickle.load(f)

Path to dataset files: /kaggle/input/pps-data


In [18]:
train_data = np.array(train_data)
val_data = np.array(val_data)
test_data = np.array(test_data)

In [19]:
# Combine training and validation data
X_all = np.concatenate([train_data, val_data])
y_all = np.concatenate([train_labels, val_labels])

# Create test_fold: -1 = train, 0 = validation
training_fold = np.concatenate([
    -1 * np.ones(len(train_data), dtype = int),
    np.zeros(len(val_data), dtype = int)
])

In [20]:
def calculate_performance(y_true, y_pred):
    print("Classification report: \n", classification_report(y_true, y_pred))

    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average = 'macro')
    recall = recall_score(y_true, y_pred, average = 'macro')
    f1 = f1_score(y_true, y_pred, average = 'macro')

    cm = confusion_matrix(y_true, y_pred)
    cm_normalized = cm.astype('float') / cm.sum(axis=1, keepdims=True)
    print(f"Confusion Matrix (normalized):\n{cm_normalized}")

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-score: {f1:.4f}")

## Logistic Regression

In [21]:
warnings.simplefilter("ignore", ConvergenceWarning)

def log_model(X, y, training_fold = None):
    
    model = LogisticRegression(max_iter = 500)

    param_grid = [
        # L1 penalty
    {
        'penalty': ['l1'],
        'solver': ['liblinear', 'saga'],
        'C': [0.001, 0.01, 0.1, 1, 10, 100],
        'class_weight': [None, 'balanced']
    },
        # L2 penalty
    {
        'penalty': ['l2'],
        'solver': ['liblinear', 'newton-cg', 'lbfgs', 'sag', 'saga'],
        'C': [0.001, 0.01, 0.1, 1, 10, 100],
        'class_weight': [None, 'balanced']
    },
        # Elasticnet penalty
    {
        'penalty': ['elasticnet'],
        'solver': ['saga'],
        'C': [0.001, 0.01, 0.1, 1, 10, 100],
        'class_weight': [None, 'balanced'],
        'l1_ratio': [0.05, 0.1, 0.15, 0.2, 0.25]
    }
    ]

    cv_strategy = PredefinedSplit(training_fold) if training_fold is not None else 5
    
    search = GridSearchCV(model, param_grid, cv = cv_strategy, scoring = "f1", n_jobs = -1)
    search.fit(X, y)

    print("Best parameter (CV score = %0.3f):" % search.best_score_)
    print(search.best_params_)

    return search.best_estimator_

In [22]:
log_reg = log_model(X_all, y_all, training_fold)
test_preds = log_reg.predict(test_data)
calculate_performance(test_labels, test_preds)

Best parameter (CV score = 0.929):
{'C': 10, 'class_weight': None, 'penalty': 'l2', 'solver': 'lbfgs'}
Classification report: 
               precision    recall  f1-score   support

           0       0.93      0.93      0.93      6744
           1       0.94      0.93      0.93      6977

    accuracy                           0.93     13721
   macro avg       0.93      0.93      0.93     13721
weighted avg       0.93      0.93      0.93     13721

Confusion Matrix (normalized):
[[0.9334223  0.0665777 ]
 [0.07152071 0.92847929]]
Accuracy: 0.9309
Precision: 0.9309
Recall: 0.9310
F1-score: 0.9309


## SVM

In [23]:


warnings.simplefilter("ignore", ConvergenceWarning)

def svm_model(X, y, training_fold = None):
    model =  LinearSVC(max_iter = 10000)

    param_grid = [
    {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'penalty': ['l1'],
    'dual': [False],
    'loss': ['squared_hinge'],
    'class_weight': [None, 'balanced']
    }, 
    {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'penalty': ['l2'],
    'dual': [False, True],
    'loss': ['hinge', 'squared_hinge'],
    'class_weight': [None, 'balanced']
    }
    ]
    
    cv_strategy = PredefinedSplit(training_fold) if training_fold is not None else 5
    
    search = GridSearchCV(model, param_grid, cv = cv_strategy, scoring = "f1", n_jobs = -1)
    search.fit(X, y)

    print("Best parameter (CV score = %0.3f):" % search.best_score_)
    print(search.best_params_)

    return search.best_estimator_

In [24]:
model = svm_model(X_all, y_all, training_fold)



Best parameter (CV score = 0.929):
{'C': 1, 'class_weight': None, 'dual': True, 'loss': 'hinge', 'penalty': 'l2'}


In [25]:
test_preds = model.predict(test_data)
calculate_performance(test_labels, test_preds)

Classification report: 
               precision    recall  f1-score   support

           0       0.93      0.93      0.93      6744
           1       0.93      0.93      0.93      6977

    accuracy                           0.93     13721
   macro avg       0.93      0.93      0.93     13721
weighted avg       0.93      0.93      0.93     13721

Confusion Matrix (normalized):
[[0.9304567  0.0695433 ]
 [0.06808084 0.93191916]]
Accuracy: 0.9312
Precision: 0.9312
Recall: 0.9312
F1-score: 0.9312
