In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from scipy.io import arff
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from keras import models, layers, regularizers
from keras.layers import BatchNormalization, Dropout
from keras.callbacks import EarlyStopping, ReduceLROnPlateau

In [None]:
# Load the labelled CSV, one-hot encode the categorical features and build a
# train/validation split whose non-binary numeric columns are standardized.
file_path = r'/kaggle/input/traindataset/Train_data.csv'
X = pd.read_csv(file_path)

# Remove the two columns the notebook treats as constant, then show the class
# balance and the per-column missing-value counts.
always_zero_features = ['num_outbound_cmds', 'is_host_login']
X.drop(always_zero_features, axis=1, inplace=True)
print(X['class'].value_counts())
print(X.isnull().sum())

# Separate the target column from the feature columns.
y = X['class']
X.drop('class', axis=1, inplace=True)
categorical_cols = X.select_dtypes(include=['object']).columns
print(categorical_cols)

# `sparse_output` is the current name of the dense-output switch; the old
# `sparse` keyword is rejected by newer scikit-learn releases, and the ARFF
# preprocessing cell further down already uses `sparse_output`.
nominal_features = ['protocol_type', 'service', 'flag']
feature_encoder = OneHotEncoder(sparse_output=False)
encoded_categories = feature_encoder.fit_transform(X[nominal_features])

# Carry X's index over so the concat below aligns row-for-row even if X's
# index is ever not the default 0..n-1 range.
encoded_df = pd.DataFrame(
    encoded_categories,
    columns=feature_encoder.get_feature_names_out(nominal_features),
    index=X.index,
)

X_copy = X.copy()

# Drop the columns and concatenate
X_copy.drop(columns=nominal_features, inplace=True)
X_full = pd.concat([X_copy, encoded_df], axis=1)

# Two-valued columns (including the one-hot indicators) are left unscaled;
# every other int64/float64 column is standardized.
binary_cols = [col for col in X_full.columns if X_full[col].dropna().nunique() == 2]
non_binary_cols = [col for col in X_full.columns if col not in binary_cols and X_full[col].dtype in ['int64', 'float64']]

# Encode the class labels as integers, kept as a one-column frame that shares
# y's index.
encoder = LabelEncoder()
y_encoded = encoder.fit_transform(y)
y_encoded = pd.DataFrame(y_encoded, index=y.index, columns=['class'])

X_train, X_valid, y_train, y_valid = train_test_split(X_full, y_encoded, train_size=0.8, test_size=0.2,
                                                      random_state=0)

# Scale only non-binary numerical columns.
# The scaler is fit on the training rows only and then applied to both splits,
# so validation statistics do not leak into training. Previously the scaled
# frame was computed but the unscaled X_full was what got split.
scaler = StandardScaler()
X_train = X_train.copy()  # own the data before assigning scaled columns
X_valid = X_valid.copy()
X_train[non_binary_cols] = scaler.fit_transform(X_train[non_binary_cols])
X_valid[non_binary_cols] = scaler.transform(X_valid[non_binary_cols])

In [None]:
# Number of feature columns in the training split.
X_train.shape[1]

In [None]:
from tensorflow.keras.layers import BatchNormalization, Dropout
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras import regularizers 
#Model Definition
# Fully connected binary classifier: three L2-regularized ReLU layers
# (128 -> 64 -> 32 units), each followed by batch normalization and dropout,
# ending in a single sigmoid output unit.

# Take the input width from the training frame instead of hard-coding it, so
# the model still builds if the one-hot encoding yields a different number of
# columns.
n_input_features = X_train.shape[1]

model = models.Sequential([

    BatchNormalization(input_shape=(n_input_features,)),


    layers.Dense(128, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
    BatchNormalization(),
    Dropout(0.4),


    layers.Dense(64, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
    BatchNormalization(),
    Dropout(0.3),


    layers.Dense(32, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
    BatchNormalization(),
    Dropout(0.4),

    layers.Dense(1, activation='sigmoid')
])

In [None]:
# Learning-rate schedule: watch the validation loss and multiply the learning
# rate by 0.5 after 10 epochs without improvement, never going below 1e-5.
lr_plateau_options = {
    'monitor': 'val_loss',
    'factor': 0.5,
    'patience': 10,
    'min_lr': 0.00001,
    'verbose': 1,
}
reduce_lr = ReduceLROnPlateau(**lr_plateau_options)


In [None]:
# Stop once the validation loss has not improved for 15 epochs and roll the
# model back to the best weights seen.
early_stopping = EarlyStopping(
    restore_best_weights=True,
    patience=15,
    monitor='val_loss',
)

In [None]:
# Binary classification setup: Adam optimizer, binary cross-entropy loss,
# accuracy reported per epoch.
compile_options = dict(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.compile(**compile_options)

In [None]:
# Train on the CSV split for up to 500 epochs; the callbacks end the run early
# and lower the learning rate based on the validation loss.
training_callbacks = [early_stopping, reduce_lr]
held_out = (X_valid, y_valid)

history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=256,
    epochs=500,
    validation_data=held_out,
    callbacks=training_callbacks,
)

In [None]:
# Per-epoch accuracy curves recorded by the first training run:
# training accuracy and validation accuracy, one value per epoch.
epoch_metrics = history.history
train_acc, val_acc = epoch_metrics['accuracy'], epoch_metrics['val_accuracy']

In [None]:
import matplotlib.pyplot as plt

# Draw the training and validation accuracy curves of the first run on one
# set of axes.
for curve, curve_label in ((train_acc, 'Training Accuracy'),
                           (val_acc, 'Validation Accuracy')):
    plt.plot(curve, label=curve_label)

plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

In [None]:
#Data preprocessing for train modified data
# Builds df_scaled_train / y_encoded_train from the training ARFF file and
# df_scaled_test / y_encoded_test from the test ARFF file. Every transformer
# (one-hot encoder, scaler, label encoder) is fit on the training data only
# and then applied unchanged to the test data.


def _load_arff_frame(path):
    """Load an ARFF file and return ``(records, meta, frame)``.

    ``frame`` is a DataFrame built from the records in which every
    object-dtype column has been decoded from bytes to ``str`` using UTF-8.
    """
    records, meta = arff.loadarff(path)
    frame = pd.DataFrame(records)
    for col in frame.select_dtypes([object]).columns:
        frame[col] = frame[col].str.decode('utf-8')
    return records, meta, frame


def _coerce_object_columns_to_numeric(frame):
    """Return a copy of ``frame`` with each object-dtype column parsed as numbers.

    The dtype filters used below only accept int64/float64 columns, so an
    object column would be neither scaled nor treated as binary and would
    reach the model as strings. Raises ``ValueError`` (from ``pd.to_numeric``)
    if a column holds text that is not numeric.
    """
    converted = frame.copy()
    for col in converted.select_dtypes(include=['object']).columns:
        converted[col] = pd.to_numeric(converted[col])
    return converted


categorical_cols_to_encode = ['protocol_type', 'service', 'flag']
cols_to_drop_after_splitting = ['protocol_type', 'service', 'flag']
# Columns excluded from the feature matrix. The same list is applied to the
# training and the test frame so both start from the same schema.
excluded_feature_cols = ['num_outbound_cmds', 'is_host_login', 'is_guest_login', 'class']

train_path = '/kaggle/input/modified-dataset/KDDTrain.arff'
data_train, meta_train, df_train = _load_arff_frame(train_path)

y_train = df_train['class']
X_train = df_train.drop(excluded_feature_cols, axis=1)

# handle_unknown='ignore' encodes categories that only occur in the test file
# as all-zero indicator rows instead of raising.
onehot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

encoded_features_train = onehot_encoder.fit_transform(X_train[categorical_cols_to_encode])
encoded_feature_names = onehot_encoder.get_feature_names_out(categorical_cols_to_encode)
encoded_df_train = pd.DataFrame(encoded_features_train, columns=encoded_feature_names, index=X_train.index)

X_train_numerical = _coerce_object_columns_to_numeric(
    X_train.drop(columns=cols_to_drop_after_splitting)
)
X_train_preprocessed = pd.concat([X_train_numerical, encoded_df_train], axis=1)


# Two-valued numeric columns (including the one-hot indicators) stay as they
# are; all remaining int64/float64 columns are standardized.
binary_cols_train = [col for col in X_train_preprocessed.columns if X_train_preprocessed[col].dropna().nunique() == 2 and X_train_preprocessed[col].dtype in ['int64', 'float64']]
non_binary_cols_train = [col for col in X_train_preprocessed.columns if col not in binary_cols_train and X_train_preprocessed[col].dtype in ['int64', 'float64']]


scaler = StandardScaler()
df_scaled_train = X_train_preprocessed.copy()
df_scaled_train[non_binary_cols_train] = scaler.fit_transform(X_train_preprocessed[non_binary_cols_train])


label_encoder = LabelEncoder()
y_encoded_train = label_encoder.fit_transform(y_train)
y_encoded_train = pd.DataFrame(y_encoded_train, index=y_train.index, columns=['class'])

print(f"Shape of df_scaled_train (training features): {df_scaled_train.shape}")


# new test data preprocessing
test_path = r'/kaggle/input/testnew/KDDTest.arff'
data_test, meta_test, df_test = _load_arff_frame(test_path)

y_test = df_test['class']
# errors='ignore' keeps this working for a test file that lacks one of the
# excluded feature columns.
X_test = df_test.drop(columns=excluded_feature_cols, errors='ignore')

encoded_features_test = onehot_encoder.transform(X_test[categorical_cols_to_encode])
encoded_df_test = pd.DataFrame(encoded_features_test, columns=encoded_feature_names, index=X_test.index)

X_test_numerical = _coerce_object_columns_to_numeric(
    X_test.drop(columns=cols_to_drop_after_splitting)
)
X_test_preprocessed = pd.concat([X_test_numerical, encoded_df_test], axis=1)

# Match the training schema exactly: same columns in the same order, with any
# column absent from the test frame filled with 0 and any extra column dropped.
X_test_preprocessed = X_test_preprocessed.reindex(
    columns=X_train_preprocessed.columns, fill_value=0
)

df_scaled_test = X_test_preprocessed.copy()
df_scaled_test[non_binary_cols_train] = scaler.transform(X_test_preprocessed[non_binary_cols_train])


y_encoded_test = label_encoder.transform(y_test)
y_encoded_test = pd.DataFrame(y_encoded_test, index=y_test.index, columns=['class'])

print(f"Shape of df_scaled_test (test features): {df_scaled_test.shape}")

# Sanity checks on the final shapes before they are handed to the model.
assert df_scaled_train.shape[1] == df_scaled_test.shape[1], \
    f"Number of features in training ({df_scaled_train.shape[1]}) and test sets ({df_scaled_test.shape[1]}) do not match!"
assert df_scaled_train.shape[0] == y_encoded_train.shape[0], \
    "Number of rows in training features and labels do not match!"
assert df_scaled_test.shape[0] == y_encoded_test.shape[0], \
    "Number of rows in test features and labels do not match!"


input_dim = df_scaled_train.shape[1]
print(f"Detected input dimension for the model: {input_dim}")

In [None]:
#Model Definition
# Smaller, more heavily regularized binary classifier for the ARFF data:
# two L2-regularized ReLU layers (32 -> 16 units), each followed by batch
# normalization and 50% dropout, ending in a single sigmoid output unit.
model = models.Sequential([
    BatchNormalization(input_shape=(input_dim,)),
    layers.Dense(32, activation='relu', kernel_regularizer=regularizers.l2(0.05)),
    BatchNormalization(),
    Dropout(0.5),
    layers.Dense(16, activation='relu', kernel_regularizer=regularizers.l2(0.05)),
    BatchNormalization(),
    Dropout(0.5),
    layers.Dense(1, activation='sigmoid')
])



model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0005),
              loss='binary_crossentropy',
              metrics=['accuracy'])

early_stopping_2 = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)
reduce_lr_2 = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=10,
    min_lr=0.00001,
    verbose=1
)

# Both callbacks pick the stopping epoch, the restored weights and the
# learning rate from `val_loss`. Feeding the test set in as validation data
# would let it steer model selection and make the reported test score
# optimistic, so a validation split is carved out of the training data and
# the test set is scored only once, after training.
X_fit_2, X_val_2, y_fit_2, y_val_2 = train_test_split(
    df_scaled_train, y_encoded_train,
    test_size=0.2,
    random_state=0,
    stratify=y_encoded_train['class'],  # keep the class ratio in both parts
)

history_2 = model.fit(
    X_fit_2, y_fit_2,
    epochs=500,
    batch_size=64,
    validation_data=(X_val_2, y_val_2),
    callbacks=[early_stopping_2, reduce_lr_2]
)

# Single evaluation on the untouched test set; with metrics=['accuracy'] the
# result is [loss, accuracy].
test_loss_2, test_accuracy_2 = model.evaluate(df_scaled_test, y_encoded_test, verbose=0)
print(f"Test loss: {test_loss_2:.4f} - Test accuracy: {test_accuracy_2:.4f}")


# One figure per metric, each with the training and validation curve.
for metric_key, metric_label in (('accuracy', 'Accuracy'), ('loss', 'Loss')):
    plt.figure(figsize=(12, 6))
    plt.plot(history_2.history[metric_key], label=f'Training {metric_label}')
    plt.plot(history_2.history[f'val_{metric_key}'], label=f'Validation {metric_label}')
    plt.xlabel('Epoch')
    plt.ylabel(metric_label)
    plt.title(f'Training and Validation {metric_label}')
    plt.legend()
    plt.grid(True)
    plt.show()

In [None]:
# Per-epoch accuracy curves of the second run. Some older Keras versions
# store these under 'acc' / 'val_acc' instead.
run2_metrics = history_2.history
train_acc = run2_metrics['accuracy']
val_acc = run2_metrics['val_accuracy']

In [None]:


# Plot the accuracy curves currently held in train_acc / val_acc.
accuracy_curves = {
    'Training Accuracy': train_acc,
    'Validation Accuracy': val_acc,
}
for curve_label, curve in accuracy_curves.items():
    plt.plot(curve, label=curve_label)

plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

In [None]:
# Same accuracy curves of the second run, stored under run-specific names.
# Some older Keras versions use the keys 'acc' / 'val_acc' instead.
train_acc_2, val_acc_2 = (
    history_2.history[key] for key in ('accuracy', 'val_accuracy')
)

In [None]:
import matplotlib.pyplot as plt

# Training vs. validation accuracy of the second run.
for curve, curve_label in zip((train_acc_2, val_acc_2),
                              ('Training Accuracy', 'Validation Accuracy')):
    plt.plot(curve, label=curve_label)

plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.show()