In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
import numpy as np


# =========================
# 1. CÁC HÀM TIỆN ÍCH
# =========================
def load_adbench_data(dataset_path):
    """Load an ADBench-style ``.npz`` archive as float32 tensors.

    The archive is assumed to store the feature matrix under ``'X'``
    (expected shape ``(N, d)``) and the labels under ``'y'`` (expected
    shape ``(N,)``).

    Args:
        dataset_path: Path or file object accepted by ``numpy.load``.

    Returns:
        Tuple ``(X, y)`` of ``torch.float32`` tensors.

    Raises:
        KeyError: If the archive has no ``'X'`` or ``'y'`` entry.
    """
    # np.load on an .npz returns a lazy NpzFile that keeps the zip file open;
    # the context manager closes the handle once both arrays have been read.
    with np.load(dataset_path) as data:
        X = data['X']
        y = data['y']
    return torch.tensor(X, dtype=torch.float32), torch.tensor(y, dtype=torch.float32)

def evaluate_with_classification_report_and_auc(model, test_loader, device, threshold=0.5):
    """Score a test loader with ``model`` and print a report plus AUC-ROC.

    Args:
        model: Module whose output for a batch can be flattened to one
            score per sample.
        test_loader: Iterable yielding ``(X_batch, y_batch)`` tensor pairs.
        device: Device the batches are moved to before the forward pass.
        threshold: Scores strictly greater than this are predicted as 1.

    Returns:
        Tuple ``(report, aucroc)``: the text classification report and the
        AUC-ROC, or ``None`` for the AUC when the labels contain one class.
    """
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            # reshape(-1) rather than squeeze(): a final batch of size 1 would
            # otherwise collapse to a 0-dim tensor, which torch.cat rejects.
            y_pred = model(X_batch).reshape(-1)  # (B,)
            all_preds.append(y_pred.cpu())
            all_labels.append(y_batch.reshape(-1).cpu())

    preds = torch.cat(all_preds).numpy()      # (N_test,)
    labels = torch.cat(all_labels).numpy()    # (N_test,)

    # Binary predictions
    binary_preds = (preds > threshold).astype(int)

    # Print the classification report. The label set is fixed explicitly so
    # the two target names still match when only one class is present;
    # otherwise the call raises and the single-class AUC branch below could
    # never be reached.
    report = classification_report(
        labels, binary_preds, labels=[0, 1], target_names=['Class 0', 'Class 1']
    )
    print(report)

    # AUC-ROC is only defined when both classes occur in the labels.
    if np.unique(labels).size > 1:
        aucroc = roc_auc_score(labels, preds)
        print(f"AUC-ROC: {aucroc:.4f}")
    else:
        aucroc = None
        print("AUC-ROC: Undefined (only one class present in labels)")
    return report, aucroc


# =========================
# 2. ĐỊNH NGHĨA C-VAE
# =========================
class CVAE(nn.Module):
    """Conditional VAE for a binary-labelled dataset.

    The label is handled as one extra continuous feature: it is concatenated
    to ``x`` in front of the encoder and to ``z`` in front of the decoder.
    """

    def __init__(self, input_dim, hidden_dim=128, latent_dim=32):
        super().__init__()
        self.input_dim = input_dim

        # Encoder: the "+ 1" accounts for the concatenated label column.
        self.fc1 = nn.Linear(input_dim + 1, hidden_dim)
        self.fc2_mean = nn.Linear(hidden_dim, latent_dim)
        self.fc2_logvar = nn.Linear(hidden_dim, latent_dim)

        # Decoder: the latent code is also extended by the label column.
        self.fc3 = nn.Linear(latent_dim + 1, hidden_dim)
        self.fc4 = nn.Linear(hidden_dim, input_dim)

    def encode(self, x, y):
        """Map ``x`` (B, input_dim) and ``y`` (B, 1) to ``(mean, logvar)``."""
        hidden = F.relu(self.fc1(torch.cat((x, y), dim=1)))  # input is (B, input_dim+1)
        return self.fc2_mean(hidden), self.fc2_logvar(hidden)

    def reparameterize(self, mean, logvar):
        """Draw ``z = mean + eps * std`` with ``eps ~ N(0, I)``."""
        std = (0.5 * logvar).exp()
        noise = torch.randn_like(std)
        return mean + noise * std

    def decode(self, z, y):
        """Map ``z`` (B, latent_dim) and ``y`` (B, 1) to a reconstruction of ``x``."""
        hidden = F.relu(self.fc3(torch.cat((z, y), dim=1)))  # input is (B, latent_dim+1)
        return self.fc4(hidden)

    def forward(self, x, y):
        """Encode, sample a latent code, decode; return ``(x_recon, mean, logvar)``."""
        mean, logvar = self.encode(x, y)
        latent = self.reparameterize(mean, logvar)
        return self.decode(latent, y), mean, logvar

def cvae_loss_fn(x, x_recon, mean, logvar):
    """Return the summed reconstruction error plus the KL divergence.

    The reconstruction term is a sum-reduced MSE between ``x_recon`` and
    ``x`` (BCE could be substituted depending on the data). The KL term is
    the closed form against a standard normal, summed over every element.
    """
    reconstruction = F.mse_loss(x_recon, x, reduction='sum')
    kl_elements = 1 + logvar - mean.pow(2) - logvar.exp()
    divergence = -0.5 * kl_elements.sum()
    return reconstruction + divergence

def train_cvae(cvae, data_loader, optimizer, device):
    """Train the cVAE for one pass over ``data_loader``.

    Returns the sum of the per-batch losses divided by the number of batches.
    """
    cvae.train()
    running = 0
    for features, labels in data_loader:
        features = features.to(device)
        # Labels arrive as (B,); the model concatenates them as a (B, 1) column.
        label_column = labels.to(device).unsqueeze(1)

        recon, mu, log_var = cvae(features, label_column)
        batch_loss = cvae_loss_fn(features, recon, mu, log_var)

        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        running += batch_loss.item()
    return running / len(data_loader)


# =========================
# 3. MÔ HÌNH TRANSFORMER
# =========================
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a batch-first sequence.

    Even feature columns hold ``sin(pos * w_k)`` and odd columns hold
    ``cos(pos * w_k)``, with ``w_k = exp(-2k * ln(10000) / d_model)``.

    Args:
        d_model: Feature size of the sequence elements (even or odd).
        max_len: Number of positions for which the table is precomputed.
    """

    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        # For odd d_model there is one fewer cosine column than sine column,
        # so trim the frequencies to avoid a shape mismatch.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        # Register as a buffer so the table follows the module across
        # .to()/.cuda(); persistent=False keeps it out of state_dict, so
        # existing checkpoints (which never contained it) still load.
        self.register_buffer("pe", pe.unsqueeze(0), persistent=False)

    def forward(self, x):
        """Return ``x + pe[:, :L]`` for ``x`` of shape ``(B, L, d_model)``."""
        L = x.size(1)
        # .to() is a no-op once the buffer already lives on x's device; it is
        # kept so the module still works when it was never moved itself.
        return x + self.pe[:, :L, :].to(x.device)

class TransformerDetector(nn.Module):
    """Binary scorer: linear embedding, Transformer encoder, sigmoid MLP head."""

    def __init__(self, input_size, d_model=128, nhead=8, num_layers=2, dim_feedforward=256, dropout=0.1):
        super().__init__()
        self.embedding = nn.Linear(input_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model)
        layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            batch_first=True,
        )
        self.transformer_encoder = nn.TransformerEncoder(layer, num_layers=num_layers)
        # Head producing one probability per sample.
        self.fc = nn.Sequential(
            nn.Linear(d_model, 128),
            nn.ReLU(),
            nn.Linear(128, 1),
            nn.Sigmoid(),
        )

    def forward(self, x):
        """Return one score per sample, shape ``(B,)``.

        A 2-D input ``(B, d_input)`` is treated as a length-1 sequence and
        expanded to ``(B, 1, d_input)``; other inputs are used as given.
        """
        tokens = x.unsqueeze(1) if x.dim() == 2 else x
        embedded = self.positional_encoding(self.embedding(tokens))
        encoded = self.transformer_encoder(embedded)
        pooled = encoded.mean(dim=1)  # mean-pool over the sequence axis
        return self.fc(pooled).squeeze(1)


def train_detector(model, train_loader, optimizer, criterion, device):
    """Train ``model`` for one pass over ``train_loader``.

    Each batch is moved to ``device``, scored, and used for one optimizer
    step. Returns the sum of the per-batch losses divided by the number of
    batches.
    """
    model.train()
    running = 0
    for features, targets in train_loader:
        features = features.to(device)
        targets = targets.to(device)

        scores = model(features)  # (B,)
        batch_loss = criterion(scores, targets)

        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        running += batch_loss.item()
    return running / len(train_loader)




In [18]:

# =========================
# 4. DEMO MAIN
# =========================
if __name__ == "__main__":
    # Example path; replace it with the location of your own dataset.
    dataset_path = "Classical/20_letter.npz"
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # 4.1: Load the full dataset.
    X_all, y_all = load_adbench_data(dataset_path)
    input_dim = X_all.shape[1]

    # 4.2: Stratified 80/20 train/test split, performed on NumPy arrays.
    X_train_np, X_test_np, y_train_np, y_test_np = train_test_split(
        X_all.numpy(),
        y_all.numpy(),
        test_size=0.2,
        random_state=42,
        stratify=y_all,
    )
    # Convert the four splits back to float32 tensors.
    X_train, y_train, X_test, y_test = (
        torch.tensor(split_array, dtype=torch.float32)
        for split_array in (X_train_np, y_train_np, X_test_np, y_test_np)
    )

    # DataLoader for the cVAE, which trains on the whole training split.
    train_dataset = TensorDataset(X_train, y_train)
    train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

    # 4.3: Build the cVAE and its optimizer; training runs in a later cell.
    cvae = CVAE(input_dim=input_dim, hidden_dim=128, latent_dim=32).to(device)
    optimizer_cvae = Adam(cvae.parameters(), lr=1e-3)
    cvae_epochs = 300

    

    

In [21]:
# Train the cVAE, logging the mean per-batch loss every 5 epochs.
for epoch in range(cvae_epochs):
    loss_cvae = train_cvae(cvae, train_loader, optimizer_cvae, device)
    if (epoch+1) % 5 == 0:
        print(f"[cVAE] Epoch {epoch+1}/{cvae_epochs}, loss = {loss_cvae:.2f}")

# 4.4: Generate "hybrid" or "pure minority" synthetic data
# ----------------------------------------------------------------
# The number of generated samples (num_generate) is arbitrary, e.g. enough
# to double the minority class.
# ----------------------------------------------------------------
# Two options for the conditioning value passed to the decoder:
# A) y = 1          => pure minority samples
# B) y in 0.3..0.7  => a "hybrid" between normal and minority samples
# This demo uses option B with y = 0.7.
num_generate = 1020  # example value

cvae.eval()
with torch.no_grad():
    # Sample z ~ N(0, I). The latent size is read from the encoder's mean
    # head instead of being hard-coded, so it cannot drift from the
    # latent_dim the model was built with.
    latent_dim = cvae.fc2_mean.out_features
    z = torch.randn(num_generate, latent_dim).to(device)

    # Conditioning column of shape (num_generate, 1). Use 1.0 for pure
    # minority samples, or an intermediate value such as 0.5 or 0.7 for
    # hybrid samples.
    y_synthetic = torch.full((num_generate, 1), 0.7, device=device)

    # Decode the latent codes into synthetic feature vectors.
    X_synthetic = cvae.decode(z, y_synthetic)
    X_synthetic = X_synthetic.cpu()

[cVAE] Epoch 5/300, loss = 11317.94
[cVAE] Epoch 10/300, loss = 7319.40
[cVAE] Epoch 15/300, loss = 6119.08
[cVAE] Epoch 20/300, loss = 5300.07
[cVAE] Epoch 25/300, loss = 4785.09
[cVAE] Epoch 30/300, loss = 4488.60
[cVAE] Epoch 35/300, loss = 4212.90
[cVAE] Epoch 40/300, loss = 3989.47
[cVAE] Epoch 45/300, loss = 3843.13
[cVAE] Epoch 50/300, loss = 3654.31
[cVAE] Epoch 55/300, loss = 3561.08
[cVAE] Epoch 60/300, loss = 3507.12
[cVAE] Epoch 65/300, loss = 3355.66
[cVAE] Epoch 70/300, loss = 3261.92
[cVAE] Epoch 75/300, loss = 3192.61
[cVAE] Epoch 80/300, loss = 3117.20
[cVAE] Epoch 85/300, loss = 3053.98
[cVAE] Epoch 90/300, loss = 3016.30
[cVAE] Epoch 95/300, loss = 2967.25
[cVAE] Epoch 100/300, loss = 2890.04
[cVAE] Epoch 105/300, loss = 2868.46
[cVAE] Epoch 110/300, loss = 2846.27
[cVAE] Epoch 115/300, loss = 2809.09
[cVAE] Epoch 120/300, loss = 2771.46
[cVAE] Epoch 125/300, loss = 2743.19
[cVAE] Epoch 130/300, loss = 2717.42
[cVAE] Epoch 135/300, loss = 2694.27
[cVAE] Epoch 140/300

In [23]:
# Merge the synthetic samples into the original training split; every
# synthetic sample is labelled as class 1.
y_synthetic_labels = torch.ones(num_generate)

X_train_final = torch.cat((X_train, X_synthetic))  # (N + num_generate, d)
y_train_final = torch.cat((y_train, y_synthetic_labels))  # (N + num_generate,)

# 4.5: Compare the class distribution before and after augmentation.
unique_orig, counts_orig = np.unique(y_train.numpy(), return_counts=True)
print("Trước sinh:", dict(zip(unique_orig, counts_orig)))
unique_final, counts_final = np.unique(y_train_final.numpy(), return_counts=True)
print("Sau sinh:", dict(zip(unique_final, counts_final)))

# 4.6: Train the TransformerDetector on the oversampled training data.
train_dataset_final = TensorDataset(X_train_final, y_train_final)
train_loader_final = DataLoader(train_dataset_final, batch_size=64, shuffle=True)
test_dataset = TensorDataset(X_test, y_test)
test_loader = DataLoader(test_dataset, batch_size=64)

model = TransformerDetector(input_size=input_dim).to(device)
optimizer_tf = Adam(model.parameters(), lr=1e-3)
criterion = nn.BCELoss()

num_epochs = 50
for epoch in range(num_epochs):
    train_loss = train_detector(
        model, train_loader_final, optimizer_tf, criterion, device
    )
    print(f"Epoch {epoch+1}/{num_epochs}, Loss={train_loss:.4f}")

    # Evaluate on the held-out test split after every epoch.
    print("Test set evaluation:")
    evaluate_with_classification_report_and_auc(
        model, test_loader, device, threshold=0.3
    )
    print("-" * 40)

Trước sinh: {0.0: 1200, 1.0: 80}
Sau sinh: {0.0: 1200, 1.0: 1100}
Epoch 1/50, Loss=0.6387
Test set evaluation:
              precision    recall  f1-score   support

     Class 0       0.99      0.71      0.83       300
     Class 1       0.17      0.85      0.28        20

    accuracy                           0.72       320
   macro avg       0.58      0.78      0.55       320
weighted avg       0.93      0.72      0.79       320

AUC-ROC: 0.9165
----------------------------------------
Epoch 2/50, Loss=0.5149
Test set evaluation:
              precision    recall  f1-score   support

     Class 0       0.99      0.53      0.69       300
     Class 1       0.11      0.90      0.20        20

    accuracy                           0.55       320
   macro avg       0.55      0.72      0.45       320
weighted avg       0.93      0.55      0.66       320

AUC-ROC: 0.8540
----------------------------------------
Epoch 3/50, Loss=0.4548
Test set evaluation:
              precision    reca