In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import classification_report, roc_auc_score
import numpy as np
from sklearn.model_selection import train_test_split

#=============================
# 1. VAE MODEL
#=============================
class VAE(nn.Module):
    """Fully connected variational autoencoder with one hidden layer per side.

    The encoder maps an input vector to the mean and log-variance of a
    diagonal Gaussian over the latent space. The decoder maps a latent vector
    back to input space through a linear output layer (no output activation).

    Args:
        input_dim: Number of features of each input vector.
        hidden_dim: Width of the hidden layer in both encoder and decoder.
        latent_dim: Dimensionality of the latent space.
    """

    def __init__(self, input_dim, hidden_dim=128, latent_dim=32):
        super().__init__()
        # Encoder: a shared hidden layer feeding two parallel linear heads.
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2_mean = nn.Linear(hidden_dim, latent_dim)
        self.fc2_logvar = nn.Linear(hidden_dim, latent_dim)
        # Decoder: latent -> hidden -> reconstruction.
        self.fc3 = nn.Linear(latent_dim, hidden_dim)
        self.fc4 = nn.Linear(hidden_dim, input_dim)

    def encode(self, x):
        """Return ``(mean, logvar)`` of the approximate posterior for ``x``."""
        hidden = torch.relu(self.fc1(x))
        return self.fc2_mean(hidden), self.fc2_logvar(hidden)

    def reparameterize(self, mean, logvar):
        """Draw ``mean + eps * std`` with ``eps ~ N(0, I)`` and ``std = exp(logvar / 2)``."""
        std = (0.5 * logvar).exp()
        noise = torch.randn_like(std)
        return mean + noise * std

    def decode(self, z):
        """Map latent vectors ``z`` back to input space."""
        return self.fc4(torch.relu(self.fc3(z)))

    def forward(self, x):
        """Encode ``x``, sample a latent vector, and return ``(x_recon, mean, logvar)``."""
        mean, logvar = self.encode(x)
        x_recon = self.decode(self.reparameterize(mean, logvar))
        return x_recon, mean, logvar

def vae_loss_fn(x, x_recon, mean, logvar):
    """Return the VAE training loss: summed reconstruction error plus KL term.

    Args:
        x: Original inputs.
        x_recon: Reconstructions produced by the decoder, same shape as ``x``.
        mean: Latent means produced by the encoder.
        logvar: Latent log-variances produced by the encoder.

    Returns:
        Scalar tensor equal to the sum-reduced squared error between
        ``x_recon`` and ``x`` plus ``-0.5 * sum(1 + logvar - mean^2 - exp(logvar))``.
    """
    reconstruction_term = F.mse_loss(x_recon, x, reduction='sum')
    # Closed-form KL divergence between N(mean, exp(logvar)) and N(0, I).
    kl_summand = 1 + logvar - mean.pow(2) - logvar.exp()
    kl_term = -0.5 * torch.sum(kl_summand)
    return reconstruction_term + kl_term


#=============================
# 2. POSITIONAL ENCODING & TRANSFORMER MODULES (UNCHANGED)
#=============================
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal positional encodings to a batch-first sequence.

    Even feature indices hold ``sin(position * div_term)`` and odd indices hold
    ``cos(position * div_term)``. The table is precomputed once for
    ``max_len`` positions.

    Args:
        d_model: Feature dimension of the sequence (may be odd or even).
        max_len: Number of positions to precompute.
    """

    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model))
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer cosine column than sine column,
        # so trim the angles to the number of odd-indexed features.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        # Registered as a buffer so the table follows model.to(device);
        # non-persistent so the module's state_dict keys stay unchanged.
        self.register_buffer("pe", pe.unsqueeze(0), persistent=False)

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.size(1)`` positions.

        ``x`` is indexed as (batch, sequence, feature): the sequence length is
        read from dimension 1.
        """
        seq_len = x.size(1)
        # .to() is a no-op once the buffer already lives on x's device.
        return x + self.pe[:, :seq_len, :].to(x.device)

class TransformerDetector(nn.Module):
    """Transformer-encoder binary detector that outputs a probability per sample.

    Inputs are linearly embedded to ``d_model`` features, given positional
    encodings, passed through a stack of Transformer encoder layers,
    mean-pooled over the sequence dimension, and scored by a small MLP ending
    in a sigmoid.

    Args:
        input_size: Number of features per time step / per sample.
        d_model: Embedding width used inside the Transformer.
        nhead: Number of attention heads.
        num_layers: Number of stacked encoder layers.
        dim_feedforward: Width of each encoder layer's feed-forward sublayer.
        dropout: Dropout probability inside the encoder layers.
    """

    def __init__(self, input_size, d_model=128, nhead=8, num_layers=2, dim_feedforward=256, dropout=0.1):
        super().__init__()
        self.embedding = nn.Linear(input_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model)
        layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            batch_first=True,
        )
        self.transformer_encoder = nn.TransformerEncoder(layer, num_layers=num_layers)
        # Classification head: d_model -> 128 -> 1, squashed to (0, 1).
        self.fc = nn.Sequential(
            nn.Linear(d_model, 128),
            nn.ReLU(),
            nn.Linear(128, 1),
            nn.Sigmoid(),
        )

    def forward(self, x):
        """Score a batch; 2-D input is treated as sequences of length one."""
        # A 2-D (batch, feature) input gets a singleton sequence dimension.
        tokens = x.unsqueeze(1) if x.dim() == 2 else x
        encoded = self.transformer_encoder(
            self.positional_encoding(self.embedding(tokens))
        )
        pooled = encoded.mean(dim=1)  # average over the sequence dimension
        return self.fc(pooled)

In [2]:

#=============================
# 3. UTILITY FUNCTIONS
#=============================
def load_adbench_data(dataset_path):
    """Load features and labels from an ``.npz`` archive as float32 tensors.

    Args:
        dataset_path: Path to an ``.npz`` file containing arrays under the
            keys ``'X'`` and ``'y'``.

    Returns:
        Tuple ``(X, y)`` of ``torch.float32`` tensors built from those arrays.

    Raises:
        KeyError: If the archive lacks an ``'X'`` or ``'y'`` entry.
    """
    # np.load on an .npz returns a lazy archive holding an open file handle;
    # the context manager closes it once both arrays have been read.
    with np.load(dataset_path) as data:
        X = data['X']
        y = data['y']
    return torch.tensor(X, dtype=torch.float32), torch.tensor(y, dtype=torch.float32)

def evaluate_with_classification_report_and_auc(model, test_loader, device, threshold=0.5):
    """Evaluate a binary scorer on a loader; print and return report and AUC-ROC.

    Args:
        model: Module whose output for a batch holds one score per sample.
        test_loader: Iterable yielding ``(X_batch, y_batch)`` pairs.
        device: Device the input batches are moved to before scoring.
        threshold: Scores strictly greater than this are predicted as class 1.

    Returns:
        Tuple ``(report, aucroc)``: the text classification report and the
        AUC-ROC, or ``None`` for the AUC when the labels contain one class.
    """
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            scores = model(X_batch.to(device))
            # reshape(-1) rather than squeeze(): squeeze() collapses a batch of
            # one sample to a 0-dim tensor, which torch.cat cannot concatenate.
            all_preds.append(scores.reshape(-1).cpu())
            all_labels.append(y_batch.reshape(-1).cpu())

    preds = torch.cat(all_preds).numpy()
    labels = torch.cat(all_labels).numpy()

    binary_preds = (preds > threshold).astype(int)

    # Passing labels=[0, 1] keeps the two target names valid even when only
    # one class occurs in the data; zero_division=0 reports 0 for metrics of
    # an absent class instead of emitting warnings.
    report = classification_report(
        labels,
        binary_preds,
        labels=[0, 1],
        target_names=['Class 0', 'Class 1'],
        zero_division=0,
    )
    print(report)

    # AUC-ROC is only defined when both classes are present in the labels.
    if np.unique(labels).size > 1:
        aucroc = roc_auc_score(labels, preds)
        print(f"AUC-ROC: {aucroc:.4f}")
    else:
        aucroc = None
        print("AUC-ROC: Undefined (only one class present in labels)")

    return report, aucroc

def train_detector(model, train_loader, optimizer, criterion, device):
    """Train ``model`` for one pass over ``train_loader``.

    Args:
        model: Module whose output for a batch holds one score per sample.
        train_loader: Sized iterable yielding ``(X_batch, y_batch)`` pairs.
        optimizer: Optimizer updating ``model``'s parameters.
        criterion: Loss called as ``criterion(predictions, targets)``.
        device: Device the batches are moved to.

    Returns:
        Mean of the per-batch loss values over the pass.
    """
    model.train()
    total_loss = 0.0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        # reshape(-1) rather than squeeze(): squeeze() turns a final batch of
        # one sample into a 0-dim tensor whose shape no longer matches the
        # (1,) target, which makes BCELoss raise.
        y_pred = model(X_batch).reshape(-1)
        loss = criterion(y_pred, y_batch.reshape(-1))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(train_loader)

In [5]:

#=============================
# 4. MAIN: REPLACE SMOTE WITH A VAE
#=============================
if __name__ == "__main__":
    # Pipeline: load the dataset, split it, train a VAE on the minority class
    # (label 1), sample synthetic minority rows to balance the training set,
    # and build the train/test DataLoaders used by the cells that follow.
    dataset_path = "Classical/20_letter.npz"  # Replace with the path to your dataset
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Seed torch so VAE initialisation, batch shuffling and latent sampling are
    # reproducible; the same value is reused for the train/test split.
    seed = 42
    torch.manual_seed(seed)

    # Load the data
    X, y = load_adbench_data(dataset_path)

    # Train/test split, stratified on the labels so both splits keep the
    # original class ratio.
    y_np = y.numpy()
    X_train_np, X_test_np, y_train_np, y_test_np = train_test_split(
        X.numpy(), y_np, test_size=0.2, random_state=seed, stratify=y_np
    )

    # Convert back to tensors
    X_train_all = torch.tensor(X_train_np, dtype=torch.float32)
    y_train_all = torch.tensor(y_train_np, dtype=torch.float32)
    X_test = torch.tensor(X_test_np, dtype=torch.float32)
    y_test = torch.tensor(y_test_np, dtype=torch.float32)

    #-------------
    # Step A: SEPARATE THE MINORITY-CLASS DATA
    #-------------
    # Class 1 is assumed to be the minority class; adjust if your data differs.
    minority_mask = (y_train_all == 1)
    X_minority = X_train_all[minority_mask]
    y_minority = y_train_all[minority_mask]

    majority_mask = (y_train_all == 0)
    X_majority = X_train_all[majority_mask]
    y_majority = y_train_all[majority_mask]

    print("Trước khi oversampling bằng VAE:")
    print("Số lượng majority:", len(X_majority))
    print("Số lượng minority:", len(X_minority))

    # Without minority rows there is nothing to train the VAE on.
    if len(X_minority) == 0:
        raise ValueError(
            "No minority-class (label 1) samples in the training split; cannot train the VAE."
        )

    #-------------
    # Step B: DEFINE & TRAIN THE VAE ON THE MINORITY DATA
    #-------------
    input_dim = X_train_all.shape[1]
    hidden_dim = 128
    latent_dim = 32
    vae = VAE(input_dim=input_dim, hidden_dim=hidden_dim, latent_dim=latent_dim).to(device)

    optimizer_vae = Adam(vae.parameters(), lr=1e-3)
    vae_epochs = 200

    # DataLoader over the minority class only
    minority_dataset = TensorDataset(X_minority)
    minority_loader = DataLoader(minority_dataset, batch_size=64, shuffle=True)

    vae.train()
    for epoch in range(vae_epochs):
        total_vae_loss = 0
        for (x_batch,) in minority_loader:
            x_batch = x_batch.to(device)
            x_recon, mean, logvar = vae(x_batch)
            loss_vae = vae_loss_fn(x_batch, x_recon, mean, logvar)
            optimizer_vae.zero_grad()
            loss_vae.backward()
            optimizer_vae.step()
            total_vae_loss += loss_vae.item()
        # Report the mean per-batch loss every 10 epochs.
        if (epoch+1) % 10 == 0:
            print(f"Epoch {epoch+1}/{vae_epochs}, VAE Loss = {total_vae_loss/len(minority_loader):.2f}")

    #-------------
    # Step C: GENERATE ADDITIONAL DATA FROM THE VAE
    #-------------
    vae.eval()
    # Generate enough samples to bring the minority class up to the majority count.
    num_generate = len(X_majority) - len(X_minority)
    if num_generate <= 0:
        # Already balanced: fall back to one synthetic sample per minority sample.
        num_generate = len(X_minority)

    with torch.no_grad():
        # Sample latent vectors z ~ N(0, I); the width must match the VAE's
        # latent_dim, so reuse the variable instead of a literal.
        z = torch.randn(num_generate, latent_dim).to(device)
        X_synthetic = vae.decode(z)

    # Synthetic samples are labelled as class 1
    y_synthetic = torch.ones(num_generate, dtype=torch.float32)

    # Move to CPU so they can be concatenated with the original CPU tensors
    X_synthetic = X_synthetic.cpu()
    y_synthetic = y_synthetic.cpu()

    print("Đã sinh thêm:", len(X_synthetic), "mẫu minority bằng VAE")

    #-------------
    # Step D: COMBINE THE SYNTHETIC MINORITY DATA WITH THE ORIGINAL DATA
    #-------------
    X_train_final = torch.cat([X_majority, X_minority, X_synthetic], dim=0)
    y_train_final = torch.cat([y_majority, y_minority, y_synthetic], dim=0)

    print("Sau khi oversampling bằng VAE:")
    unique, counts = np.unique(y_train_final.numpy(), return_counts=True)
    print("Phân phối lớp trong tập train:", dict(zip(unique, counts)))

    # Build the train & test DataLoaders
    train_dataset = TensorDataset(X_train_final, y_train_final)
    test_dataset = TensorDataset(X_test, y_test)

    train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=64)

    

Trước khi oversampling bằng VAE:
Số lượng majority: 1200
Số lượng minority: 80
Epoch 10/200, VAE Loss = 27306.62
Epoch 20/200, VAE Loss = 11886.79
Epoch 30/200, VAE Loss = 9161.32
Epoch 40/200, VAE Loss = 8260.95
Epoch 50/200, VAE Loss = 7684.43
Epoch 60/200, VAE Loss = 7215.01
Epoch 70/200, VAE Loss = 6611.28
Epoch 80/200, VAE Loss = 6188.85
Epoch 90/200, VAE Loss = 5602.25
Epoch 100/200, VAE Loss = 5209.08
Epoch 110/200, VAE Loss = 4778.91
Epoch 120/200, VAE Loss = 4531.77
Epoch 130/200, VAE Loss = 4506.56
Epoch 140/200, VAE Loss = 4117.24
Epoch 150/200, VAE Loss = 3954.61
Epoch 160/200, VAE Loss = 3739.48
Epoch 170/200, VAE Loss = 3696.18
Epoch 180/200, VAE Loss = 3597.91
Epoch 190/200, VAE Loss = 3582.14
Epoch 200/200, VAE Loss = 3358.71
Đã sinh thêm: 1120 mẫu minority bằng VAE
Sau khi oversampling bằng VAE:
Phân phối lớp trong tập train: {0.0: 1200, 1.0: 1200}


In [7]:
#-------------
# Step E: TRAIN THE TRANSFORMER DETECTOR AS USUAL
#-------------
# Build the detector on the selected device, with a binary cross-entropy loss
# and an Adam optimizer; the training loop itself lives in a later cell.
num_epochs = 20

model = TransformerDetector(input_size=input_dim)
model = model.to(device)
criterion = nn.BCELoss()
optimizer = Adam(params=model.parameters(), lr=1e-3)


Epoch 1/20, Loss: 0.2071
Classification Report on Test Set:
              precision    recall  f1-score   support

     Class 0       0.94      1.00      0.97       300
     Class 1       1.00      0.05      0.10        20

    accuracy                           0.94       320
   macro avg       0.97      0.53      0.53       320
weighted avg       0.94      0.94      0.91       320

AUC-ROC: 0.6415
Epoch 2/20, Loss: 0.1278
Classification Report on Test Set:
              precision    recall  f1-score   support

     Class 0       0.94      1.00      0.97       300
     Class 1       1.00      0.05      0.10        20

    accuracy                           0.94       320
   macro avg       0.97      0.53      0.53       320
weighted avg       0.94      0.94      0.91       320

AUC-ROC: 0.7367
Epoch 3/20, Loss: 0.1187
Classification Report on Test Set:
              precision    recall  f1-score   support

     Class 0       0.94      1.00      0.97       300
     Class 1       1.00  

In [8]:
# Train for num_epochs epochs, printing the mean training loss and the
# test-set classification report / AUC-ROC after every epoch.
for epoch in range(num_epochs):
    train_loss = train_detector(
        model=model,
        train_loader=train_loader,
        optimizer=optimizer,
        criterion=criterion,
        device=device,
    )
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {train_loss:.4f}")

    print("Classification Report on Test Set:")
    evaluate_with_classification_report_and_auc(model, test_loader, device)

Epoch 1/20, Loss: 0.0370
Classification Report on Test Set:
              precision    recall  f1-score   support

     Class 0       0.97      1.00      0.99       300
     Class 1       0.92      0.60      0.73        20

    accuracy                           0.97       320
   macro avg       0.95      0.80      0.86       320
weighted avg       0.97      0.97      0.97       320

AUC-ROC: 0.9707
Epoch 2/20, Loss: 0.0499
Classification Report on Test Set:
              precision    recall  f1-score   support

     Class 0       0.98      0.99      0.99       300
     Class 1       0.87      0.65      0.74        20

    accuracy                           0.97       320
   macro avg       0.92      0.82      0.86       320
weighted avg       0.97      0.97      0.97       320

AUC-ROC: 0.9753
Epoch 3/20, Loss: 0.0343
Classification Report on Test Set:
              precision    recall  f1-score   support

     Class 0       0.97      0.99      0.98       300
     Class 1       0.86  