In [1]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import MinMaxScaler
import logging
from torch.utils.data import DataLoader, TensorDataset, random_split
from sklearn.preprocessing import RobustScaler
import multiprocessing

In [2]:

# Configure the root logger at INFO level; the logging.info() calls in the
# functions below (data shapes, per-epoch losses) are issued at that level.
logging.basicConfig(level=logging.INFO)
def load_data(data_dir, label_column='Label', benign_label='BENIGN'):
    """Load every CSV in ``data_dir`` and return the cleaned, non-benign rows.

    For each ``*.csv`` file the rows whose label equals ``benign_label`` are
    removed and the label column is dropped. The per-file frames are then
    concatenated, every column is coerced to numeric (unparseable cells
    become NaN), +/-inf is replaced by NaN, and rows containing any NaN are
    dropped. The shape before and after cleaning is logged at INFO level.

    Args:
        data_dir: Directory that contains the CSV files.
        label_column: Name of the label column. Surrounding whitespace is
            ignored on both sides, so the default matches ' Label' as well
            as 'Label'.
        benign_label: Label value identifying the rows to discard.

    Returns:
        pandas.DataFrame holding the remaining rows, without the label column.

    Raises:
        ValueError: If ``data_dir`` contains no ``.csv`` file.
        KeyError: If a file has no column matching ``label_column``.
    """
    # os.listdir gives no ordering guarantee; sorting makes the row order of
    # the combined frame independent of the filesystem.
    csv_files = sorted(name for name in os.listdir(data_dir) if name.endswith(".csv"))
    if not csv_files:
        raise ValueError(f"No .csv files found in {data_dir!r}")

    wanted_label = label_column.strip()
    all_data = []
    for file in csv_files:
        file_path = os.path.join(data_dir, file)
        df = pd.read_csv(file_path, skipinitialspace=False)

        # Headers are read verbatim (skipinitialspace=False), so the label
        # column may carry padding such as ' Label'; match on the stripped name.
        matches = [col for col in df.columns if str(col).strip() == wanted_label]
        if not matches:
            raise KeyError(f"Label column {label_column!r} not found in {file_path}")
        label_col = matches[0]

        # Keep only the non-benign rows, then discard the label itself.
        df = df[df[label_col] != benign_label]
        df = df.drop(label_col, axis=1)
        all_data.append(df)

    combined_data = pd.concat(all_data, ignore_index=True)
    logging.info(f"Initial data shape: {combined_data.shape}")

    # Non-numeric cells become NaN, infinities become NaN, and any row that
    # ends up with a NaN is removed.
    combined_data = combined_data.apply(pd.to_numeric, errors='coerce')
    combined_data = combined_data.replace([np.inf, -np.inf], np.nan)
    combined_data = combined_data.dropna()
    logging.info(f"Data shape after cleaning: {combined_data.shape}")

    return combined_data

In [3]:
def preprocess_data(data):
    """Clip each column to its 1st-99th percentile range and min-max scale it.

    Args:
        data: DataFrame of features to prepare.

    Returns:
        A 4-tuple ``(scaled_features, scaler, column_names, column_types)``:
        the output of ``MinMaxScaler.fit_transform`` on the clipped frame, the
        fitted scaler, and the column index and dtypes of ``data`` as they
        were before clipping.
    """
    # Captured up front so callers can restore names and dtypes later.
    original_columns, original_dtypes = data.columns, data.dtypes

    # Per-column percentile bounds used to tame extreme values.
    low_bounds = data.quantile(0.01)
    high_bounds = data.quantile(0.99)
    clipped = data.clip(lower=low_bounds, upper=high_bounds, axis=1)

    # Fit the min-max scaler on the clipped frame and transform it in one go.
    scaler = MinMaxScaler()
    return scaler.fit_transform(clipped), scaler, original_columns, original_dtypes

In [4]:
class DiffusionModel(nn.Module):
    """Feed-forward network mapping an ``input_dim`` vector to ``input_dim`` outputs.

    Three hidden stages of width 512, 256 and 128 are each built as
    Linear -> ReLU -> BatchNorm1d -> Dropout(0.3). A final Linear layer
    projects back to ``input_dim`` and a Sigmoid keeps every output in
    [0, 1], matching min-max scaled targets.
    """

    def __init__(self, input_dim):
        super().__init__()
        hidden_widths = (512, 256, 128)

        stack = []
        in_features = input_dim
        for width in hidden_widths:
            stack += [
                nn.Linear(in_features, width),
                nn.ReLU(),
                nn.BatchNorm1d(width),
                nn.Dropout(0.3),
            ]
            in_features = width

        # Output head: back to the feature dimension, squashed into [0, 1].
        stack += [nn.Linear(in_features, input_dim), nn.Sigmoid()]
        self.model = nn.Sequential(*stack)

    def forward(self, x):
        """Run ``x`` through the sequential stack and return the result."""
        return self.model(x)

In [5]:
#to prevent overfitting
class EarlyStopping:
    """Signal when a monitored loss has stopped improving.

    Call the instance once per epoch with the validation loss. A call counts
    as an improvement when the loss is at least ``min_delta`` below the best
    value seen so far; otherwise an internal counter is incremented. Once the
    counter reaches ``patience``, ``early_stop`` is set to True.

    A NaN loss is always treated as "no improvement" and never replaces
    ``best_loss``, so a diverged run still trips the stop condition.

    Attributes:
        patience: Number of consecutive non-improving calls tolerated.
        min_delta: Minimum decrease that counts as an improvement.
        counter: Consecutive non-improving calls so far.
        best_loss: Lowest loss recorded, or None before the first valid call.
        early_stop: True once ``counter`` has reached ``patience``.
    """

    def __init__(self, patience=10, min_delta=0.0001):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.best_loss = None
        self.early_stop = False

    def __call__(self, val_loss):
        """Record ``val_loss`` for one epoch and update the stop state."""
        # NaN is the only value that compares unequal to itself. Without this
        # guard a NaN would fail the "worse than best" comparison, be stored
        # as the new best and reset the counter, after which no later value
        # could ever register as worse.
        is_nan = val_loss != val_loss

        improved = not is_nan and (
            self.best_loss is None
            or val_loss <= self.best_loss - self.min_delta
        )
        if improved:
            self.best_loss = val_loss
            self.counter = 0
            return

        self.counter += 1
        if self.counter >= self.patience:
            self.early_stop = True

In [6]:
def add_noise(inputs, noise_factor=0.05):
    """Return ``inputs`` perturbed by scaled Gaussian noise, limited to [0, 1].

    Args:
        inputs: Tensor to perturb.
        noise_factor: Multiplier applied to the standard-normal noise.

    Returns:
        A new tensor shaped like ``inputs`` with every value clamped to [0, 1].
    """
    perturbation = torch.randn_like(inputs) * noise_factor
    # Clamp so the noisy values stay inside the min-max scaled range.
    return (inputs + perturbation).clamp(min=0.0, max=1.0)

In [7]:
def train_model(model, train_loader, val_loader, epochs=50, device='cpu',
                save_path='diffusion_model.pth'):
    """Train ``model`` to reconstruct clean batches from noise-corrupted ones.

    Each training step feeds ``add_noise(batch)`` to the model and minimises
    the MSE against the uncorrupted batch. After every epoch the model is
    evaluated on ``val_loader`` (without added noise), the learning rate
    scheduler advances, and early stopping is checked on the average
    validation loss.

    The weights from the epoch with the lowest validation loss are kept;
    when training ends they are loaded back into ``model`` and written to
    ``save_path``. Previously the last epoch's weights were saved, which
    after early stopping are ``patience`` epochs past the best ones.

    Args:
        model: Network to train; expected to already live on ``device``.
        train_loader: Iterable of batches whose first element is the feature
            tensor; must support ``len()``.
        val_loader: Same structure as ``train_loader``, used for validation.
        epochs: Maximum number of epochs.
        device: Device each batch is moved to.
        save_path: File the final ``state_dict`` is saved to.

    Raises:
        ValueError: If either loader yields no batches.
    """
    # Both averages below divide by the number of batches; fail with a clear
    # message instead of a ZeroDivisionError deep inside the loop.
    if len(train_loader) == 0 or len(val_loader) == 0:
        raise ValueError("train_loader and val_loader must each yield at least one batch")

    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)
    # Halve the learning rate every 10 epochs.
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)
    early_stopping = EarlyStopping(patience=10, min_delta=0.0001)

    best_val_loss = float('inf')
    best_state = None

    for epoch in range(epochs):
        model.train()
        total_train_loss = 0.0

        for batch in train_loader:
            batch_x = batch[0].to(device)  # load batch to device

            # Corrupt the input; the clean batch remains the target.
            noisy_batch_x = add_noise(batch_x)

            # forward pass
            outputs = model(noisy_batch_x)
            loss = criterion(outputs, batch_x)

            # backward pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_train_loss += loss.item()

        avg_train_loss = total_train_loss / len(train_loader)

        # validation phase
        model.eval()
        total_val_loss = 0.0

        with torch.no_grad():
            for batch in val_loader:
                batch_x = batch[0].to(device)
                outputs = model(batch_x)
                loss = criterion(outputs, batch_x)
                total_val_loss += loss.item()

        avg_val_loss = total_val_loss / len(val_loader)
        logging.info(f"Epoch [{epoch+1}/{epochs}], Avg Train Loss: {avg_train_loss:.4f}, Avg Val Loss: {avg_val_loss:.4f}")

        # Snapshot the best weights. state_dict() returns references to the
        # live tensors, so they must be cloned to survive further updates.
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            best_state = {
                name: tensor.detach().clone()
                for name, tensor in model.state_dict().items()
            }

        scheduler.step()

        # check early stopping condition
        early_stopping(avg_val_loss)
        if early_stopping.early_stop:
            logging.info("Early stopping triggered")
            break

    # Restore the best-validation weights (if any epoch produced a finite
    # improvement) before persisting the model.
    if best_state is not None:
        model.load_state_dict(best_state)

    # saving model
    torch.save(model.state_dict(), save_path)


In [8]:
#Function to generate new samples
def generate_samples(model, scaler, column_names, column_types, num_samples=1000, device='cpu'):
    """Generate ``num_samples`` rows by passing random noise through ``model``.

    The model outputs are mapped back to the original feature scale with
    ``scaler.inverse_transform`` and each column is cast to its original
    dtype. Integer columns are rounded to the nearest whole number first:
    a plain ``astype(int)`` truncates toward zero, which would turn an
    inverse-scaled value such as 2.9999 into 2.

    Args:
        model: Trained network; switched to eval mode here.
        scaler: Object providing ``inverse_transform`` for the model outputs.
        column_names: Column labels of the generated frame; their count is
            used as the model input width.
        column_types: Dtypes to restore, in the same order as ``column_names``.
        num_samples: Number of rows to generate.
        device: Device the noise is moved to before the forward pass.

    Returns:
        pandas.DataFrame with ``num_samples`` rows and the requested dtypes.
    """
    model.eval()
    input_dim = len(column_names)
    with torch.no_grad():
        # NOTE(review): these are unbounded standard-normal draws, whereas the
        # training loop in this file only feeds inputs clamped to [0, 1]
        # (see add_noise) - confirm this sampling scheme is intended.
        samples = torch.randn(num_samples, input_dim).to(device)
        generated_samples = model(samples).cpu().numpy()

    # reverse scaling using inverse transform
    generated_samples = scaler.inverse_transform(generated_samples)

    # preserve column types
    generated_df = pd.DataFrame(generated_samples, columns=column_names)
    for col, col_type in zip(column_names, column_types):
        values = generated_df[col]
        if pd.api.types.is_integer_dtype(col_type):
            # Round before the integer cast so values are not truncated.
            values = values.round()
        generated_df[col] = values.astype(col_type)

    return generated_df

In [9]:
def main(data_dir, output_file, epochs=50, batch_size=64, num_samples=1000, seed=None):
    """Run the full pipeline: load, preprocess, train, generate and save.

    Args:
        data_dir: Directory containing the input CSV files.
        output_file: Path of the CSV the generated samples are written to.
        epochs: Maximum number of training epochs.
        batch_size: Mini-batch size for both data loaders.
        num_samples: Number of rows to generate after training.
        seed: Optional integer. When given, PyTorch's global RNG is seeded
            before the train/validation split is drawn. The default (None)
            leaves the RNG untouched.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    if seed is not None:
        torch.manual_seed(seed)

    data = load_data(data_dir)
    scaled_features, scaler, column_names, column_types = preprocess_data(data)

    # split data into train and validation sets (80/20)
    dataset = TensorDataset(torch.tensor(scaled_features, dtype=torch.float32))
    train_size = int(0.8 * len(dataset))
    val_size = len(dataset) - train_size
    train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

    # Create DataLoader for batch processing
    num_workers = min(4, multiprocessing.cpu_count() - 1)  # Use a safe number of workers
    # Pinned host memory only helps host-to-GPU copies; skip it on CPU runs.
    pin_memory = device.type == "cuda"
    # BatchNorm1d raises in training mode when a batch holds a single sample,
    # so drop the trailing batch only in the case where it would be size 1.
    drop_last = train_size % batch_size == 1
    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_workers,
        pin_memory=pin_memory,
        drop_last=drop_last,
    )
    val_loader = DataLoader(
        val_dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers,
        pin_memory=pin_memory,
    )

    input_dim = scaled_features.shape[1]
    model = DiffusionModel(input_dim).to(device)

    train_model(model, train_loader, val_loader, epochs, device)

    # generate samples and save them to a CSV file
    generated_samples = generate_samples(model, scaler, column_names, column_types, num_samples, device)
    generated_samples.to_csv(output_file, index=False)
    logging.info(f"Generated data saved to {output_file}")

# Run configuration passed to main() below.
data_dir = 'gan_dataset'                     # directory containing all CSV files
output_file = 'diffusion_model_results.csv'  # CSV the generated samples are written to
epochs = 50                                  # maximum number of training epochs
batch_size = 64                              # mini-batch size for the data loaders
num_samples = 1000                           # number of generated samples

if __name__ == "__main__":
    main(
        data_dir=data_dir,
        output_file=output_file,
        epochs=epochs,
        batch_size=batch_size,
        num_samples=num_samples,
    )

INFO:root:Initial data shape: (557646, 78)
INFO:root:Data shape after cleaning: (556556, 78)
INFO:root:Epoch [1/50], Avg Train Loss: 0.0051, Avg Val Loss: 0.0006
INFO:root:Epoch [2/50], Avg Train Loss: 0.0017, Avg Val Loss: 0.0005
INFO:root:Epoch [3/50], Avg Train Loss: 0.0016, Avg Val Loss: 0.0006
INFO:root:Epoch [4/50], Avg Train Loss: 0.0016, Avg Val Loss: 0.0005
INFO:root:Epoch [5/50], Avg Train Loss: 0.0016, Avg Val Loss: 0.0005
INFO:root:Epoch [6/50], Avg Train Loss: 0.0016, Avg Val Loss: 0.0005
INFO:root:Epoch [7/50], Avg Train Loss: 0.0016, Avg Val Loss: 0.0005
INFO:root:Epoch [8/50], Avg Train Loss: 0.0016, Avg Val Loss: 0.0005
INFO:root:Epoch [9/50], Avg Train Loss: 0.0016, Avg Val Loss: 0.0005
INFO:root:Epoch [10/50], Avg Train Loss: 0.0016, Avg Val Loss: 0.0006
INFO:root:Epoch [11/50], Avg Train Loss: 0.0014, Avg Val Loss: 0.0004
INFO:root:Epoch [12/50], Avg Train Loss: 0.0014, Avg Val Loss: 0.0004
INFO:root:Early stopping triggered
INFO:root:Generated data saved to diffusi