In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/kddcup-data-gz/kddcup.data.corrected


In [3]:
import gzip
import pandas as pd

In [4]:
# Define column names
columns = [
    'duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes',
    'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins',
    'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root',
    'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds',
    'is_host_login', 'is_guest_login', 'count', 'srv_count', 'serror_rate',
    'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
    'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count',
    'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
    'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate',
    'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'label'
]

In [5]:
# Load the dataset
df = pd.read_csv('/kaggle/input/kddcup-data-gz/kddcup.data.corrected', header=None, names=columns)

In [6]:



# # Convert to DataFrame
# columns = [
#     'duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes',
#     'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins',
#     'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root',
#     'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds',
#     'is_host_login', 'is_guest_login', 'count', 'srv_count', 'serror_rate',
#     'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
#     'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count',
#     'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
#     'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate',
#     'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'label'
# ]
# df = pd.DataFrame([line.split(',') for line in data], columns=columns)

In [7]:
# Encode categorical variables
cat_cols = df.select_dtypes(include=['object']).columns

In [8]:
print(cat_cols)

Index(['protocol_type', 'service', 'flag', 'label'], dtype='object')


In [9]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
encoder = LabelEncoder()
for col in cat_cols:
    df[col] = encoder.fit_transform(df[col])

# Normalize continuous features
continuous_cols = df.columns.drop(cat_cols)
scaler = MinMaxScaler()
df[continuous_cols] = scaler.fit_transform(df[continuous_cols])

In [10]:
continuous_cols

Index(['duration', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment',
       'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised',
       'root_shell', 'su_attempted', 'num_root', 'num_file_creations',
       'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login',
       'is_guest_login', 'count', 'srv_count', 'serror_rate',
       'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
       'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
       'dst_host_srv_count', 'dst_host_same_srv_rate',
       'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
       'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
       'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
       'dst_host_srv_rerror_rate'],
      dtype='object')

In [11]:
import pickle


In [12]:
with open("encoder.pkl", "wb") as file_obj:
    pickle.dump(encoder, file_obj)

In [13]:
with open("scaler.pkl", "wb") as file_obj:
    pickle.dump(scaler, file_obj)

In [14]:
import torch

data_tensor = torch.tensor(df.values, dtype=torch.float32)

In [15]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

# Define the diffusion model
class DiffusionModel(nn.Module):
    def __init__(self, input_dim, hidden_dim=128):
        super(DiffusionModel, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, input_dim)
        )

    def forward(self, x):
        return self.net(x)

    def training_step(self, x, noise):
        predicted_noise = self(x)
        loss = nn.MSELoss()(predicted_noise, noise)
        return loss
    def sample(self, batch_size, num_steps=1000):
        with torch.no_grad():
            # Start with random noise
            x = torch.randn(batch_size, self.net[-1].out_features, device=next(self.parameters()).device)
            for _ in range(num_steps):
                predicted_noise = self(x)
                x = x - 0.01 * predicted_noise  # Reverse diffusion step
            return x

In [16]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [17]:
input_dim = data_tensor.shape[1]
print(input_dim)

42


In [20]:
from tqdm import tqdm

In [24]:
# Initialize the model

model = DiffusionModel(input_dim=input_dim).to(device)
# Set up optimizer
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Create DataLoader
dataset = TensorDataset(data_tensor)
dataloader = DataLoader(dataset, batch_size=128, shuffle=True)

# Training loop
num_epochs = 50
for epoch in range(num_epochs):
    for batch in tqdm(dataloader, desc=f"the epoch {epoch+1}/{num_epochs} of training" ) :
        x = batch[0].to(device)  # Move data to GPU
        noise = torch.randn_like(x).to(device)  # Move noise to GPU
        noisy_x = x + 0.1 * noise  # Corrupt the data with noise
        loss = model.training_step(noisy_x, noise)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

the epoch 1/50 of training: 100%|██████████| 38269/38269 [01:37<00:00, 393.21it/s]


Epoch [1/50], Loss: 0.0594


the epoch 2/50 of training: 100%|██████████| 38269/38269 [01:34<00:00, 404.55it/s]


Epoch [2/50], Loss: 0.0542


the epoch 3/50 of training: 100%|██████████| 38269/38269 [01:33<00:00, 409.00it/s]


Epoch [3/50], Loss: 0.0350


the epoch 4/50 of training: 100%|██████████| 38269/38269 [01:33<00:00, 408.32it/s]


Epoch [4/50], Loss: 0.0409


the epoch 5/50 of training: 100%|██████████| 38269/38269 [01:39<00:00, 385.50it/s]


Epoch [5/50], Loss: 0.0291


the epoch 6/50 of training: 100%|██████████| 38269/38269 [01:33<00:00, 410.37it/s]


Epoch [6/50], Loss: 0.0314


the epoch 7/50 of training: 100%|██████████| 38269/38269 [01:33<00:00, 411.46it/s]


Epoch [7/50], Loss: 0.0399


the epoch 8/50 of training: 100%|██████████| 38269/38269 [01:29<00:00, 425.35it/s]


Epoch [8/50], Loss: 0.0261


the epoch 9/50 of training: 100%|██████████| 38269/38269 [01:29<00:00, 428.21it/s]


Epoch [9/50], Loss: 0.0427


the epoch 10/50 of training: 100%|██████████| 38269/38269 [01:28<00:00, 430.70it/s]


Epoch [10/50], Loss: 0.0256


the epoch 11/50 of training: 100%|██████████| 38269/38269 [01:28<00:00, 431.86it/s]


Epoch [11/50], Loss: 0.0259


the epoch 12/50 of training: 100%|██████████| 38269/38269 [01:25<00:00, 449.98it/s]


Epoch [12/50], Loss: 0.0392


the epoch 13/50 of training: 100%|██████████| 38269/38269 [01:24<00:00, 452.58it/s]


Epoch [13/50], Loss: 0.0280


the epoch 14/50 of training: 100%|██████████| 38269/38269 [01:25<00:00, 449.80it/s]


Epoch [14/50], Loss: 0.0308


the epoch 15/50 of training: 100%|██████████| 38269/38269 [01:25<00:00, 449.70it/s]


Epoch [15/50], Loss: 0.0253


the epoch 16/50 of training: 100%|██████████| 38269/38269 [01:24<00:00, 453.43it/s]


Epoch [16/50], Loss: 0.0314


the epoch 17/50 of training: 100%|██████████| 38269/38269 [01:24<00:00, 452.15it/s]


Epoch [17/50], Loss: 0.0264


the epoch 18/50 of training: 100%|██████████| 38269/38269 [01:25<00:00, 449.10it/s]


Epoch [18/50], Loss: 0.0265


the epoch 19/50 of training: 100%|██████████| 38269/38269 [01:25<00:00, 445.56it/s]


Epoch [19/50], Loss: 0.0272


the epoch 20/50 of training: 100%|██████████| 38269/38269 [01:25<00:00, 445.61it/s]


Epoch [20/50], Loss: 0.0258


the epoch 21/50 of training: 100%|██████████| 38269/38269 [01:25<00:00, 446.51it/s]


Epoch [21/50], Loss: 0.0344


the epoch 22/50 of training: 100%|██████████| 38269/38269 [01:25<00:00, 449.35it/s]


Epoch [22/50], Loss: 0.0263


the epoch 23/50 of training: 100%|██████████| 38269/38269 [01:25<00:00, 446.03it/s]


Epoch [23/50], Loss: 0.0283


the epoch 24/50 of training: 100%|██████████| 38269/38269 [01:25<00:00, 448.97it/s]


Epoch [24/50], Loss: 0.0302


the epoch 25/50 of training: 100%|██████████| 38269/38269 [01:25<00:00, 447.02it/s]


Epoch [25/50], Loss: 0.0334


the epoch 26/50 of training: 100%|██████████| 38269/38269 [01:23<00:00, 460.24it/s]


Epoch [26/50], Loss: 0.0282


the epoch 27/50 of training: 100%|██████████| 38269/38269 [01:21<00:00, 467.25it/s]


Epoch [27/50], Loss: 0.0246


the epoch 28/50 of training: 100%|██████████| 38269/38269 [01:22<00:00, 463.16it/s]


Epoch [28/50], Loss: 0.0219


the epoch 29/50 of training: 100%|██████████| 38269/38269 [01:22<00:00, 463.01it/s]


Epoch [29/50], Loss: 0.0252


the epoch 30/50 of training: 100%|██████████| 38269/38269 [01:22<00:00, 464.24it/s]


Epoch [30/50], Loss: 0.0366


the epoch 31/50 of training: 100%|██████████| 38269/38269 [01:23<00:00, 459.90it/s]


Epoch [31/50], Loss: 0.0266


the epoch 32/50 of training: 100%|██████████| 38269/38269 [01:22<00:00, 461.13it/s]


Epoch [32/50], Loss: 0.0177


the epoch 33/50 of training: 100%|██████████| 38269/38269 [01:23<00:00, 457.15it/s]


Epoch [33/50], Loss: 0.0295


the epoch 34/50 of training: 100%|██████████| 38269/38269 [01:23<00:00, 458.25it/s]


Epoch [34/50], Loss: 0.0298


the epoch 35/50 of training: 100%|██████████| 38269/38269 [01:22<00:00, 462.63it/s]


Epoch [35/50], Loss: 0.0231


the epoch 36/50 of training: 100%|██████████| 38269/38269 [01:23<00:00, 458.15it/s]


Epoch [36/50], Loss: 0.0247


the epoch 37/50 of training: 100%|██████████| 38269/38269 [01:23<00:00, 456.46it/s]


Epoch [37/50], Loss: 0.0208


the epoch 38/50 of training: 100%|██████████| 38269/38269 [01:23<00:00, 456.14it/s]


Epoch [38/50], Loss: 0.0231


the epoch 39/50 of training: 100%|██████████| 38269/38269 [01:22<00:00, 461.26it/s]


Epoch [39/50], Loss: 0.0323


the epoch 40/50 of training: 100%|██████████| 38269/38269 [01:23<00:00, 458.46it/s]


Epoch [40/50], Loss: 0.0249


the epoch 41/50 of training: 100%|██████████| 38269/38269 [01:23<00:00, 458.54it/s]


Epoch [41/50], Loss: 0.0296


the epoch 42/50 of training: 100%|██████████| 38269/38269 [01:24<00:00, 455.49it/s]


Epoch [42/50], Loss: 0.0298


the epoch 43/50 of training: 100%|██████████| 38269/38269 [01:23<00:00, 459.50it/s]


Epoch [43/50], Loss: 0.0282


the epoch 44/50 of training: 100%|██████████| 38269/38269 [01:23<00:00, 457.50it/s]


Epoch [44/50], Loss: 0.0280


the epoch 45/50 of training: 100%|██████████| 38269/38269 [01:23<00:00, 455.61it/s]


Epoch [45/50], Loss: 0.0273


the epoch 46/50 of training: 100%|██████████| 38269/38269 [01:23<00:00, 456.08it/s]


Epoch [46/50], Loss: 0.0264


the epoch 47/50 of training: 100%|██████████| 38269/38269 [01:23<00:00, 459.28it/s]


Epoch [47/50], Loss: 0.0251


the epoch 48/50 of training: 100%|██████████| 38269/38269 [01:23<00:00, 456.44it/s]


Epoch [48/50], Loss: 0.0221


the epoch 49/50 of training: 100%|██████████| 38269/38269 [01:23<00:00, 460.97it/s]


Epoch [49/50], Loss: 0.0265


the epoch 50/50 of training: 100%|██████████| 38269/38269 [01:22<00:00, 461.53it/s]

Epoch [50/50], Loss: 0.0276





In [27]:
torch.save(model.state_dict(), "diffusion_model.pth")

In [29]:
# Generate new samples
batch_size = 128
generated_data = model.sample(batch_size=batch_size)

# Convertir les données générées en DataFrame
generated_data_np = generated_data.cpu().numpy()
generated_df = pd.DataFrame(generated_data_np, columns=columns)

# Inverse scaling pour les caractéristiques continues
generated_df[continuous_cols] = scaler.inverse_transform(generated_df[continuous_cols])

# Décodage des caractéristiques catégorielles
for col in cat_cols:
    # Limiter les valeurs générées aux classes valides
    valid_classes = np.arange(len(encoder.classes_))
    generated_df[col] = np.round(generated_df[col]).astype(int)
    generated_df[col] = np.clip(generated_df[col], valid_classes.min(), valid_classes.max())
    
    # Décodage des valeurs
    generated_df[col] = encoder.inverse_transform(generated_df[col])

# Afficher les données générées
print(generated_df.head())

        duration     protocol_type        service              flag  \
0   11999.377930     guess_passwd.          back.  buffer_overflow.   
1   17900.613281     guess_passwd.          back.          neptune.   
2   28404.175781        ftp_write.          back.             imap.   
3  274230.812500  buffer_overflow.          back.     guess_passwd.   
4     102.607765  buffer_overflow.  guess_passwd.  buffer_overflow.   

      src_bytes    dst_bytes      land  wrong_fragment     urgent         hot  \
0  9.035079e+08 -246210208.0 -0.340797        0.353344   0.851186  -34.626610   
1 -2.342755e+08  -19597360.0  0.025844        0.035002  -0.215348    7.869659   
2 -2.363161e+08 -368414688.0  0.056273       -0.114926  -2.371714   15.306267   
3  3.817758e+09 -422722464.0 -1.324064      -15.232164  62.066299 -141.397888   
4 -2.459759e+07  -23518676.0  0.002679       -0.034897   0.011986    0.677723   

   ...  dst_host_srv_count  dst_host_same_srv_rate  dst_host_diff_srv_rate  \
0  ...  