In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam

In [2]:
df = pd.read_csv("final_benign_dataset.csv")  # replace with your filename
original_rows = df.shape[0]

In [4]:
original_rows

27519

In [9]:
label_column = 'Label'  # Change this to your actual column name
features_df = df.drop(columns=[label_column])
labels = df[label_column]

# Ensure only numeric columns go into autoencoder
numeric_features = features_df.select_dtypes(include=[np.number])
numeric_features.replace([np.inf, -np.inf], np.nan, inplace=True)
numeric_features.dropna(inplace=True)
labels = labels.loc[numeric_features.index]
non_numeric_features = features_df.select_dtypes(exclude=[np.number])

In [10]:
scaler = MinMaxScaler()
data_scaled = scaler.fit_transform(numeric_features)

In [11]:
input_dim = data_scaled.shape[1]
encoding_dim = input_dim // 2

In [12]:
input_layer = Input(shape=(input_dim,))
encoded = Dense(encoding_dim, activation='relu')(input_layer)
decoded = Dense(input_dim, activation='sigmoid')(encoded)

In [13]:
autoencoder = Model(inputs=input_layer, outputs=decoded)
encoder = Model(inputs=input_layer, outputs=encoded)

In [14]:
autoencoder.compile(optimizer=Adam(learning_rate=0.001), loss='mse')

In [15]:
autoencoder.fit(data_scaled, data_scaled, epochs=50, batch_size=64, verbose=1)

Epoch 1/50
[1m427/427[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 0.1488
Epoch 2/50
[1m427/427[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.0157
Epoch 3/50
[1m427/427[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.0075
Epoch 4/50
[1m427/427[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.0045
Epoch 5/50
[1m427/427[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.0024
Epoch 6/50
[1m427/427[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.0013
Epoch 7/50
[1m427/427[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.0010
Epoch 8/50
[1m427/427[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 8.2073e-04
Epoch 9/50
[1m427/427[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 7.4534e-04
Epoch 10/50
[1m427/427[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/s

<keras.src.callbacks.history.History at 0x173aa1b2390>

In [16]:
encoded_data = encoder.predict(data_scaled)

[1m853/853[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step


In [28]:
samples_to_generate = 25000 - data_scaled.shape[0]

if samples_to_generate <= 0:
    print(f"✅ Your dataset already has {data_scaled.shape[0]} rows. No need to generate synthetic data.")

    # Save the original dataset as final output
    final_df = numeric_features.copy()
    final_df[label_column] = labels
    final_df.to_csv("augmented_data_25000.csv", index=False)
    print("📁 File saved as 'augmented_data_25000.csv'")
    # If you're in a script, exit here. If in notebook, use return or skip rest of code
    import sys
    sys.exit()


✅ Your dataset already has 27265 rows. No need to generate synthetic data.
📁 File saved as 'augmented_data_25000.csv'


SystemExit: 

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [29]:
samples_to_generate = 200000 - data_scaled.shape[0]
synthetic_encoded = []

for _ in range(samples_to_generate):
    idx = np.random.randint(0, data_scaled.shape[0])
    noise = np.random.normal(loc=0.0, scale=0.05, size=encoding_dim)
    new_encoded = encoded_data[idx] + noise

    # 🔥 FIX: sanitize latent space values
    new_encoded = np.nan_to_num(new_encoded, nan=0.0, posinf=1.0, neginf=0.0)

    synthetic_encoded.append(new_encoded)

synthetic_encoded = np.array(synthetic_encoded)

In [30]:
print("Shape of synthetic_encoded:", synthetic_encoded.shape)
print("Any NaN:", np.any(np.isnan(synthetic_encoded)))
print("Any Inf:", np.any(np.isinf(synthetic_encoded)))
print("Max:", np.max(synthetic_encoded))
print("Min:", np.min(synthetic_encoded))

Shape of synthetic_encoded: (172735, 16)
Any NaN: False
Any Inf: False
Max: 7.625360511795518
Min: -0.22759519110750281


In [32]:
encoded_input = Input(shape=(encoding_dim,))
decoder_layer = autoencoder.layers[-1]
decoder = Model(encoded_input, decoder_layer(encoded_input))

synthetic_decoded = decoder.predict(synthetic_encoded)
synthetic_decoded = np.clip(synthetic_decoded, 0.0, 1.0)
synthetic_data = scaler.inverse_transform(synthetic_decoded)
synthetic_df = pd.DataFrame(synthetic_data, columns=numeric_features.columns)

[1m5398/5398[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 1ms/step


In [37]:
synthetic_df[label_column] = 'BENIGN'

In [38]:
original_df = numeric_features.copy()
original_df[label_column] = labels

final_df = pd.concat([original_df, synthetic_df], ignore_index=True)

In [39]:
final_df.to_csv("augmented_data_200000.csv", index=False)
print("✅ Final augmented dataset saved as 'augmented_data_200000.csv'")

✅ Final augmented dataset saved as 'augmented_data_200000.csv'


In [40]:
import pandas as pd

# 📥 Load the augmented dataset
df_augmented = pd.read_csv("augmented_data_200000.csv")

# 📐 Check the shape (rows, columns)
print("🔍 Shape of the dataset:", df_augmented.shape)

# 👀 Preview first few rows
print("\n📄 Sample Data:")
print(df_augmented.head())

# 📊 Check if there are any missing/null values
print("\n🔎 Null values per column:")
print(df_augmented.isnull().sum())

# 🔎 Check unique values in the label column
print("\n🏷️ Unique labels and their counts:")
print(df_augmented['Label'].value_counts())


🔍 Shape of the dataset: (200000, 33)

📄 Sample Data:
   Flow Duration  Flow Bytes/s  Flow Packets/s  Total Fwd Packets  \
0    110861755.0     12.484017        0.469053               26.0   
1     40335006.0      1.735465        0.471055                9.0   
2    113244633.0      0.000000        0.494505               56.0   
3     95628949.0      0.209142        0.428740               21.0   
4     95613243.0      0.209176        0.428811               21.0   

   Total Backward Packets  Total Length of Fwd Packets  \
0                    26.0                        616.0   
1                    10.0                          8.0   
2                     0.0                          0.0   
3                    20.0                         20.0   
4                    20.0                         20.0   

   Total Length of Bwd Packets  Fwd Packet Length Mean  \
0                        768.0               23.692308   
1                         62.0                0.888889   
2        