In [9]:
#XGB
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Load the dataset
filenames = [
    '1-All-Together-Update-Single-26-8-24.csv',
    '2-All-Together-Update-Single-26-8-24.csv',
    '3-All-Together-Update-Single-26-8-24.csv',
    '4-All-Together-Update-Single-26-8-24.csv',
    '5-All-Together-Update-Single-26-8-24.csv'
]

# Concatenate all chunks into a single DataFrame
dataframes = [pd.read_csv(filename) for filename in filenames]
df = pd.concat(dataframes, ignore_index=True)

# Apply Label Encoding to the "Protocol" and "Label" columns
label_encoder = LabelEncoder()
df['Protocol'] = label_encoder.fit_transform(df['Protocol'])
df['Label'] = label_encoder.fit_transform(df['Label'])

# Drop unnecessary columns
df = df.drop(columns=['Source IP', 'Destination IP'])

# Define features (X) and target (y)
X = df.drop(columns=['Label'])
y = df['Label']

# Step 1: Split the dataset into 80% training and 20% unseen testing
X_train, X_unseen_test, y_train, y_unseen_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 2: Further split the 80% training set into 80% training and 20% validation
X_train_split, X_validation_split, y_train_split, y_validation_split = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Step 3: Train the XGBoost model on the 80% of the training split
model = XGBClassifier(random_state=42)
model.fit(X_train_split, y_train_split)

# Step 4: Evaluate the model on the 20% validation split
y_pred_validation = model.predict(X_validation_split)

# Calculate validation metrics
accuracy_validation = accuracy_score(y_validation_split, y_pred_validation)
precision_validation = precision_score(y_validation_split, y_pred_validation, average='weighted')
recall_validation = recall_score(y_validation_split, y_pred_validation, average='weighted')
f1_validation = f1_score(y_validation_split, y_pred_validation, average='weighted')


y_pred_unseen = model.predict(X_unseen_test)

# Calculate unseen test metrics
accuracy_unseen = accuracy_score(y_unseen_test, y_pred_unseen)
precision_unseen = precision_score(y_unseen_test, y_pred_unseen, average='weighted')
recall_unseen = recall_score(y_unseen_test, y_pred_unseen, average='weighted')
f1_unseen = f1_score(y_unseen_test, y_pred_unseen, average='weighted')
# Output unseen test results
print(f"Accuracy: {accuracy_unseen:.2f}")
print(f"Precision: {precision_unseen:.2f}")
print(f"Recall: {recall_unseen:.2f}")
print(f"F1-Score: {f1_unseen:.2f}")
# Calculate confusion matrix
conf_matrix = confusion_matrix(y_unseen_test, y_pred_unseen)

# Aggregate TP, TN, FP, FN for all classes
TP = np.sum(np.diag(conf_matrix))  # True Positives
FP = np.sum(np.sum(conf_matrix, axis=0) - np.diag(conf_matrix))  # False Positives
FN = np.sum(np.sum(conf_matrix, axis=1) - np.diag(conf_matrix))  # False Negatives
TN = np.sum(conf_matrix) - (TP + FP + FN)  # True Negatives

# Calculate percentages
total_instances = np.sum(conf_matrix)
TP_percentage = (TP / total_instances) * 100
FP_percentage = (FP / total_instances) * 100
FN_percentage = (FN / total_instances) * 100
TN_percentage = (TN / total_instances) * 100

# Print TP, TN, FP, FN as percentages
print(f"Overall TP: {TP} ({TP_percentage:.2f}%)")
print(f"Overall TN: {TN} ({TN_percentage:.2f}%)")
print(f"Overall FP: {FP} ({FP_percentage:.2f}%)")
print(f"Overall FN: {FN} ({FN_percentage:.2f}%)")




Accuracy: 98.63
Precision: 98.63
Recall: 98.63
F1-Score: 98.63
Overall TP: 98.58
Overall TN: 97.86
Overall FP: 0.71
Overall FN: 0.71


In [10]:
#XGB-B
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Load the dataset
filenames = [
    '1-Update-3-All_together-Bi.csv',
    '2-Update-3-All_together-Bi.csv',
    '3-Update-3-All_together-Bi.csv' 
]

# Concatenate all chunks into a single DataFrame
dataframes = [pd.read_csv(filename) for filename in filenames]
df = pd.concat(dataframes, ignore_index=True)

# Apply Label Encoding to the "Protocol" and "Label" columns
label_encoder = LabelEncoder()
df['Protocol'] = label_encoder.fit_transform(df['Protocol'])
df['Label'] = label_encoder.fit_transform(df['Label'])

# Drop unnecessary columns
df = df.drop(columns=['Source IP', 'Destination IP'])

# Define features (X) and target (y)
X = df.drop(columns=['Label'])
y = df['Label']

# Step 1: Split the dataset into 80% training and 20% unseen testing
X_train, X_unseen_test, y_train, y_unseen_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 2: Further split the 80% training set into 80% training and 20% validation
X_train_split, X_validation_split, y_train_split, y_validation_split = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Step 3: Train the XGBoost model on the 80% of the training split
model = XGBClassifier(random_state=42)
model.fit(X_train_split, y_train_split)

# Step 4: Evaluate the model on the 20% validation split
y_pred_validation = model.predict(X_validation_split)

# Calculate validation metrics
accuracy_validation = accuracy_score(y_validation_split, y_pred_validation)
precision_validation = precision_score(y_validation_split, y_pred_validation, average='weighted')
recall_validation = recall_score(y_validation_split, y_pred_validation, average='weighted')
f1_validation = f1_score(y_validation_split, y_pred_validation, average='weighted')


y_pred_unseen = model.predict(X_unseen_test)

# Calculate unseen test metrics
accuracy_unseen = accuracy_score(y_unseen_test, y_pred_unseen)
precision_unseen = precision_score(y_unseen_test, y_pred_unseen, average='weighted')
recall_unseen = recall_score(y_unseen_test, y_pred_unseen, average='weighted')
f1_unseen = f1_score(y_unseen_test, y_pred_unseen, average='weighted')
# Output unseen test results
print(f"Accuracy: {accuracy_unseen:.2f}")
print(f"Precision: {precision_unseen:.2f}")
print(f"Recall: {recall_unseen:.2f}")
print(f"F1-Score: {f1_unseen:.2f}")
# Calculate confusion matrix
conf_matrix = confusion_matrix(y_unseen_test, y_pred_unseen)

# Aggregate TP, TN, FP, FN for all classes
TP = np.sum(np.diag(conf_matrix))  # True Positives
FP = np.sum(np.sum(conf_matrix, axis=0) - np.diag(conf_matrix))  # False Positives
FN = np.sum(np.sum(conf_matrix, axis=1) - np.diag(conf_matrix))  # False Negatives
TN = np.sum(conf_matrix) - (TP + FP + FN)  # True Negatives

# Calculate percentages
total_instances = np.sum(conf_matrix)
TP_percentage = (TP / total_instances) * 100
FP_percentage = (FP / total_instances) * 100
FN_percentage = (FN / total_instances) * 100
TN_percentage = (TN / total_instances) * 100

# Print TP, TN, FP, FN as percentages
print(f"Overall TP: {TP} ({TP_percentage:.2f}%)")
print(f"Overall TN: {TN} ({TN_percentage:.2f}%)")
print(f"Overall FP: {FP} ({FP_percentage:.2f}%)")
print(f"Overall FN: {FN} ({FN_percentage:.2f}%)")




Accuracy: 98.66
Precision: 98.66
Recall: 98.66
F1-Score: 98.66
Overall TP: 98.73
Overall TN: 98.10
Overall FP: 0.51
Overall FN: 0.51


In [11]:
#LGB-A
#!pip install lightgbm
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from lightgbm import LGBMClassifier  # Import LightGBM
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

# Load the dataset
filenames = [
          # '1-Update-3-All_together-Bi.csv',
           #'2-Update-3-All_together-Bi.csv',
        # '3-Update-3-All_together-Bi.csv'
      '1-All-Together-Update-Single-26-8-24.csv',
      '2-All-Together-Update-Single-26-8-24.csv',
      '3-All-Together-Update-Single-26-8-24.csv',
      '4-All-Together-Update-Single-26-8-24.csv',
       '5-All-Together-Update-Single-26-8-24.csv'
]

# Concatenate all chunks into a single DataFrame
dataframes = [pd.read_csv(filename) for filename in filenames]
df = pd.concat(dataframes, ignore_index=True)

# Apply Label Encoding to the "Protocol" and "Label" columns
label_encoder = LabelEncoder()
df['Protocol'] = label_encoder.fit_transform(df['Protocol'])
df['Label'] = label_encoder.fit_transform(df['Label'])

# Drop the unnecessary columns
df = df.drop(columns=['Source IP', 'Destination IP'])

# Define features (X) and target (y)
X = df.drop(columns=['Label'])
y = df['Label']

# Step 1: Split the dataset into 80% training and 20% unseen testing
X_train, X_unseen_test, y_train, y_unseen_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 2: Further split the 80% training set into 80% training and 20% validation
X_train_split, X_validation_split, y_train_split, y_validation_split = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Step 3: Train the LightGBM model on the 80% of the training split
model = LGBMClassifier(random_state=42)
model.fit(X_train_split, y_train_split)

# Step 4: Evaluate the model on the 20% validation split
y_pred_validation = model.predict(X_validation_split)
accuracy_validation = accuracy_score(y_validation_split, y_pred_validation)
precision_validation = precision_score(y_validation_split, y_pred_validation, average='weighted')
recall_validation = recall_score(y_validation_split, y_pred_validation, average='weighted')
f1_validation = f1_score(y_validation_split, y_pred_validation, average='weighted')


# Step 5: Evaluate the model on the 20% unseen testing data
y_pred_unseen = model.predict(X_unseen_test)

# Calculate metrics for unseen test data
accuracy_unseen = accuracy_score(y_unseen_test, y_pred_unseen)
precision_unseen = precision_score(y_unseen_test, y_pred_unseen, average='weighted')
recall_unseen = recall_score(y_unseen_test, y_pred_unseen, average='weighted')
f1_unseen = f1_score(y_unseen_test, y_pred_unseen, average='weighted')

# Output results for unseen test data
print(f"Accuracy: {accuracy_unseen:.2f}")
print(f"Precision: {precision_unseen:.2f}")
print(f"Recall: {recall_unseen:.2f}")
print(f"F1-Score: {f1_unseen:.2f}")

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_unseen_test, y_pred_unseen)

# Aggregate TP, TN, FP, FN for all classes
TP = np.sum(np.diag(conf_matrix))  # True Positives
FP = np.sum(np.sum(conf_matrix, axis=0) - np.diag(conf_matrix))  # False Positives
FN = np.sum(np.sum(conf_matrix, axis=1) - np.diag(conf_matrix))  # False Negatives
TN = np.sum(conf_matrix) - (TP + FP + FN)  # True Negatives

# Calculate percentages
total_instances = np.sum(conf_matrix)
TP_percentage = (TP / total_instances) * 100
FP_percentage = (FP / total_instances) * 100
FN_percentage = (FN / total_instances) * 100
TN_percentage = (TN / total_instances) * 100

# Print TP, TN, FP, FN as percentages
print(f"Overall TP: {TP} ({TP_percentage:.2f}%)")
print(f"Overall TN: {TN} ({TN_percentage:.2f}%)")
print(f"Overall FP: {FP} ({FP_percentage:.2f}%)")
print(f"Overall FN: {FN} ({FN_percentage:.2f}%)")



Accuracy: 98.51
Precision: 98.51
Recall: 98.50
F1-Score: 98.50
Overall TP: 98.44
Overall TN: 97.75
Overall FP: 0.75
Overall FN: 0.75


In [12]:
#LGB-B
#!pip install lightgbm
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from lightgbm import LGBMClassifier  # Import LightGBM
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

# Load the dataset
filenames = [
           '1-Update-3-All_together-Bi.csv',
           '2-Update-3-All_together-Bi.csv',
        '3-Update-3-All_together-Bi.csv'
      
]

# Concatenate all chunks into a single DataFrame
dataframes = [pd.read_csv(filename) for filename in filenames]
df = pd.concat(dataframes, ignore_index=True)

# Apply Label Encoding to the "Protocol" and "Label" columns
label_encoder = LabelEncoder()
df['Protocol'] = label_encoder.fit_transform(df['Protocol'])
df['Label'] = label_encoder.fit_transform(df['Label'])

# Drop the unnecessary columns
df = df.drop(columns=['Source IP', 'Destination IP'])

# Define features (X) and target (y)
X = df.drop(columns=['Label'])
y = df['Label']

# Step 1: Split the dataset into 80% training and 20% unseen testing
X_train, X_unseen_test, y_train, y_unseen_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 2: Further split the 80% training set into 80% training and 20% validation
X_train_split, X_validation_split, y_train_split, y_validation_split = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Step 3: Train the LightGBM model on the 80% of the training split
model = LGBMClassifier(random_state=42)
model.fit(X_train_split, y_train_split)

# Step 4: Evaluate the model on the 20% validation split
y_pred_validation = model.predict(X_validation_split)
accuracy_validation = accuracy_score(y_validation_split, y_pred_validation)
precision_validation = precision_score(y_validation_split, y_pred_validation, average='weighted')
recall_validation = recall_score(y_validation_split, y_pred_validation, average='weighted')
f1_validation = f1_score(y_validation_split, y_pred_validation, average='weighted')


# Step 5: Evaluate the model on the 20% unseen testing data
y_pred_unseen = model.predict(X_unseen_test)

# Calculate metrics for unseen test data
accuracy_unseen = accuracy_score(y_unseen_test, y_pred_unseen)
precision_unseen = precision_score(y_unseen_test, y_pred_unseen, average='weighted')
recall_unseen = recall_score(y_unseen_test, y_pred_unseen, average='weighted')
f1_unseen = f1_score(y_unseen_test, y_pred_unseen, average='weighted')

# Output results for unseen test data
print(f"Accuracy: {accuracy_unseen:.2f}")
print(f"Precision: {precision_unseen:.2f}")
print(f"Recall: {recall_unseen:.2f}")
print(f"F1-Score: {f1_unseen:.2f}")

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_unseen_test, y_pred_unseen)

# Aggregate TP, TN, FP, FN for all classes
TP = np.sum(np.diag(conf_matrix))  # True Positives
FP = np.sum(np.sum(conf_matrix, axis=0) - np.diag(conf_matrix))  # False Positives
FN = np.sum(np.sum(conf_matrix, axis=1) - np.diag(conf_matrix))  # False Negatives
TN = np.sum(conf_matrix) - (TP + FP + FN)  # True Negatives

# Calculate percentages
total_instances = np.sum(conf_matrix)
TP_percentage = (TP / total_instances) * 100
FP_percentage = (FP / total_instances) * 100
FN_percentage = (FN / total_instances) * 100
TN_percentage = (TN / total_instances) * 100

# Print TP, TN, FP, FN as percentages
print(f"Overall TP: {TP} ({TP_percentage:.2f}%)")
print(f"Overall TN: {TN} ({TN_percentage:.2f}%)")
print(f"Overall FP: {FP} ({FP_percentage:.2f}%)")
print(f"Overall FN: {FN} ({FN_percentage:.2f}%)")



Accuracy: 98.57
Precision: 98.57
Recall: 98.57
F1-Score: 98.56
Overall TP: 98.60
Overall TN: 97.95
Overall FP: 0.52
Overall FN: 0.52


In [13]:
#CatBoost-A
#!pip install catboost
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from catboost import CatBoostClassifier  # Import CatBoostClassifier from CatBoost
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Load the dataset
filenames = [
   '1-All-Together-Update-Single-26-8-24.csv',
      '2-All-Together-Update-Single-26-8-24.csv',
      '3-All-Together-Update-Single-26-8-24.csv',
      '4-All-Together-Update-Single-26-8-24.csv',
       '5-All-Together-Update-Single-26-8-24.csv' 
]

# Concatenate all chunks into a single DataFrame
dataframes = [pd.read_csv(filename) for filename in filenames]
df = pd.concat(dataframes, ignore_index=True)

# Apply Label Encoding to the "Protocol" and "Label" columns
label_encoder = LabelEncoder()
df['Protocol'] = label_encoder.fit_transform(df['Protocol'])
df['Label'] = label_encoder.fit_transform(df['Label'])

# Drop unnecessary columns
df = df.drop(columns=['Source IP', 'Destination IP'])

# Define features (X) and target (y)
X = df.drop(columns=['Label'])
y = df['Label']

# Step 1: Split the dataset into 80% training and 20% unseen testing
X_train, X_unseen_test, y_train, y_unseen_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 2: Further split the 80% training set into 80% training and 20% validation
X_train_split, X_validation_split, y_train_split, y_validation_split = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Step 3: Train the CatBoost model on the 80% of the training split
model = CatBoostClassifier(random_state=42, verbose=0)  # Use CatBoostClassifier
model.fit(X_train_split, y_train_split)

# Step 4: Evaluate the model on the 20% validation split
y_pred_validation = model.predict(X_validation_split)

# Calculate validation metrics
accuracy_validation = accuracy_score(y_validation_split, y_pred_validation)
precision_validation = precision_score(y_validation_split, y_pred_validation, average='weighted')
recall_validation = recall_score(y_validation_split, y_pred_validation, average='weighted')
f1_validation = f1_score(y_validation_split, y_pred_validation, average='weighted')


# Step 5: Evaluate the model on the 20% unseen testing data
y_pred_unseen = model.predict(X_unseen_test)

# Calculate unseen test metrics
accuracy_unseen = accuracy_score(y_unseen_test, y_pred_unseen)
precision_unseen = precision_score(y_unseen_test, y_pred_unseen, average='weighted')
recall_unseen = recall_score(y_unseen_test, y_pred_unseen, average='weighted')
f1_unseen = f1_score(y_unseen_test, y_pred_unseen, average='weighted')

# Output unseen test results
print(f"Accuracy: {accuracy_unseen:.2f}")
print(f"Precision: {precision_unseen:.2f}")
print(f"Recall: {recall_unseen:.2f}")
print(f"F1-Score: {f1_unseen:.42}")
# Calculate confusion matrix
conf_matrix = confusion_matrix(y_unseen_test, y_pred_unseen)

# Aggregate TP, TN, FP, FN for all classes
TP = np.sum(np.diag(conf_matrix))  # True Positives
FP = np.sum(np.sum(conf_matrix, axis=0) - np.diag(conf_matrix))  # False Positives
FN = np.sum(np.sum(conf_matrix, axis=1) - np.diag(conf_matrix))  # False Negatives
TN = np.sum(conf_matrix) - (TP + FP + FN)  # True Negatives

# Calculate percentages
total_instances = np.sum(conf_matrix)
TP_percentage = (TP / total_instances) * 100
FP_percentage = (FP / total_instances) * 100
FN_percentage = (FN / total_instances) * 100
TN_percentage = (TN / total_instances) * 100

# Print TP, TN, FP, FN as percentages
print(f"Overall TP: {TP} ({TP_percentage:.2f}%)")
print(f"Overall TN: {TN} ({TN_percentage:.2f}%)")
print(f"Overall FP: {FP} ({FP_percentage:.2f}%)")
print(f"Overall FN: {FN} ({FN_percentage:.2f}%)")




Accuracy: 98.22
Precision: 98.23
Recall: 98.22
F1-Score: 98.22
Overall TP: 98.17
Overall TN: 97.47
Overall FP: 0.91
Overall FN: 0.91


In [14]:
#CatBoost-B
#!pip install catboost
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from catboost import CatBoostClassifier  # Import CatBoostClassifier from CatBoost
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Load the dataset
filenames = [
    '1-Update-3-All_together-Bi.csv',
    '2-Update-3-All_together-Bi.csv',
    '3-Update-3-All_together-Bi.csv'  
]

# Concatenate all chunks into a single DataFrame
dataframes = [pd.read_csv(filename) for filename in filenames]
df = pd.concat(dataframes, ignore_index=True)

# Apply Label Encoding to the "Protocol" and "Label" columns
label_encoder = LabelEncoder()
df['Protocol'] = label_encoder.fit_transform(df['Protocol'])
df['Label'] = label_encoder.fit_transform(df['Label'])

# Drop unnecessary columns
df = df.drop(columns=['Source IP', 'Destination IP'])

# Define features (X) and target (y)
X = df.drop(columns=['Label'])
y = df['Label']

# Step 1: Split the dataset into 80% training and 20% unseen testing
X_train, X_unseen_test, y_train, y_unseen_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 2: Further split the 80% training set into 80% training and 20% validation
X_train_split, X_validation_split, y_train_split, y_validation_split = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Step 3: Train the CatBoost model on the 80% of the training split
model = CatBoostClassifier(random_state=42, verbose=0)  # Use CatBoostClassifier
model.fit(X_train_split, y_train_split)

# Step 4: Evaluate the model on the 20% validation split
y_pred_validation = model.predict(X_validation_split)

# Calculate validation metrics
accuracy_validation = accuracy_score(y_validation_split, y_pred_validation)
precision_validation = precision_score(y_validation_split, y_pred_validation, average='weighted')
recall_validation = recall_score(y_validation_split, y_pred_validation, average='weighted')
f1_validation = f1_score(y_validation_split, y_pred_validation, average='weighted')


# Step 5: Evaluate the model on the 20% unseen testing data
y_pred_unseen = model.predict(X_unseen_test)

# Calculate unseen test metrics
accuracy_unseen = accuracy_score(y_unseen_test, y_pred_unseen)
precision_unseen = precision_score(y_unseen_test, y_pred_unseen, average='weighted')
recall_unseen = recall_score(y_unseen_test, y_pred_unseen, average='weighted')
f1_unseen = f1_score(y_unseen_test, y_pred_unseen, average='weighted')

# Output unseen test results
print(f"Accuracy: {accuracy_unseen:.2f}")
print(f"Precision: {precision_unseen:.2f}")
print(f"Recall: {recall_unseen:.2f}")
print(f"F1-Score: {f1_unseen:.42}")
# Calculate confusion matrix
conf_matrix = confusion_matrix(y_unseen_test, y_pred_unseen)

# Aggregate TP, TN, FP, FN for all classes
TP = np.sum(np.diag(conf_matrix))  # True Positives
FP = np.sum(np.sum(conf_matrix, axis=0) - np.diag(conf_matrix))  # False Positives
FN = np.sum(np.sum(conf_matrix, axis=1) - np.diag(conf_matrix))  # False Negatives
TN = np.sum(conf_matrix) - (TP + FP + FN)  # True Negatives

# Calculate percentages
total_instances = np.sum(conf_matrix)
TP_percentage = (TP / total_instances) * 100
FP_percentage = (FP / total_instances) * 100
FN_percentage = (FN / total_instances) * 100
TN_percentage = (TN / total_instances) * 100

# Print TP, TN, FP, FN as percentages
print(f"Overall TP: {TP} ({TP_percentage:.2f}%)")
print(f"Overall TN: {TN} ({TN_percentage:.2f}%)")
print(f"Overall FP: {FP} ({FP_percentage:.2f}%)")
print(f"Overall FN: {FN} ({FN_percentage:.2f}%)")




Accuracy: 98.21
Precision: 98.21
Recall: 98.21
F1-Score: 98.20
Overall TP: 98.39
Overall TN: 97.82
Overall FP: 0.89
Overall FN: 0.89


In [15]:
#RF-A
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier  # Import Random Forest
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Load the dataset
filenames = [
    '1-All-Together-Update-Single-26-8-24.csv',
    '2-All-Together-Update-Single-26-8-24.csv',
    '3-All-Together-Update-Single-26-8-24.csv',
    '4-All-Together-Update-Single-26-8-24.csv',
    '5-All-Together-Update-Single-26-8-24.csv'
    #'1-Update-3-All_together-Bi.csv',
    #'2-Update-3-All_together-Bi.csv',
    #'3-Update-3-All_together-Bi.csv'
]

# Concatenate all chunks into a single DataFrame
dataframes = [pd.read_csv(filename) for filename in filenames]
df = pd.concat(dataframes, ignore_index=True)

# Apply Label Encoding to the "Protocol" and "Label" columns
label_encoder = LabelEncoder()
df['Protocol'] = label_encoder.fit_transform(df['Protocol'])
df['Label'] = label_encoder.fit_transform(df['Label'])

# Drop unnecessary columns
df = df.drop(columns=['Source IP', 'Destination IP'])

# Define features (X) and target (y)
X = df.drop(columns=['Label'])
y = df['Label']

# Step 1: Split the dataset into 80% training and 20% unseen testing
X_train, X_unseen_test, y_train, y_unseen_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 2: Further split the 80% training set into 80% training and 20% validation
X_train_split, X_validation_split, y_train_split, y_validation_split = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Step 3: Train the Random Forest model on the 80% of the training split
model = RandomForestClassifier(random_state=42)  # Replace with Random Forest
model.fit(X_train_split, y_train_split)

# Step 4: Evaluate the model on the 20% validation split
y_pred_validation = model.predict(X_validation_split)

# Calculate validation metrics
accuracy_validation = accuracy_score(y_validation_split, y_pred_validation)
precision_validation = precision_score(y_validation_split, y_pred_validation, average='weighted')
recall_validation = recall_score(y_validation_split, y_pred_validation, average='weighted')
f1_validation = f1_score(y_validation_split, y_pred_validation, average='weighted')


# Step 5: Evaluate the model on the 20% unseen testing data
y_pred_unseen = model.predict(X_unseen_test)

# Calculate unseen test metrics
accuracy_unseen = accuracy_score(y_unseen_test, y_pred_unseen)
precision_unseen = precision_score(y_unseen_test, y_pred_unseen, average='weighted')
recall_unseen = recall_score(y_unseen_test, y_pred_unseen, average='weighted')
f1_unseen = f1_score(y_unseen_test, y_pred_unseen, average='weighted')

# Output unseen test results
print(f"Accuracy: {accuracy_unseen:.2f}")
print(f"Precision: {precision_unseen:.2f}")
print(f"Recall: {recall_unseen:.2f}")
print(f"F1-Score: {f1_unseen:.2f}")
# Calculate confusion matrix
conf_matrix = confusion_matrix(y_unseen_test, y_pred_unseen)

# Aggregate TP, TN, FP, FN for all classes
TP = np.sum(np.diag(conf_matrix))  # True Positives
FP = np.sum(np.sum(conf_matrix, axis=0) - np.diag(conf_matrix))  # False Positives
FN = np.sum(np.sum(conf_matrix, axis=1) - np.diag(conf_matrix))  # False Negatives
TN = np.sum(conf_matrix) - (TP + FP + FN)  # True Negatives

# Calculate percentages
total_instances = np.sum(conf_matrix)
TP_percentage = (TP / total_instances) * 100
FP_percentage = (FP / total_instances) * 100
FN_percentage = (FN / total_instances) * 100
TN_percentage = (TN / total_instances) * 100

# Print TP, TN, FP, FN as percentages
print(f"Overall TP: {TP} ({TP_percentage:.2f}%)")
print(f"Overall TN: {TN} ({TN_percentage:.2f}%)")
print(f"Overall FP: {FP} ({FP_percentage:.2f}%)")
print(f"Overall FN: {FN} ({FN_percentage:.2f}%)")




Accuracy: 93.33
Precision: 93.36
Recall: 93.33
F1-Score: 93.33
Overall TP: 93.42
Overall TN: 92.88
Overall FP: 6.57
Overall FN: 6.57


In [16]:
#RF-B
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier  # Import Random Forest
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Load the dataset
filenames = [
    
    '1-Update-3-All_together-Bi.csv',
    '2-Update-3-All_together-Bi.csv',
    '3-Update-3-All_together-Bi.csv'
]

# Concatenate all chunks into a single DataFrame
dataframes = [pd.read_csv(filename) for filename in filenames]
df = pd.concat(dataframes, ignore_index=True)

# Apply Label Encoding to the "Protocol" and "Label" columns
label_encoder = LabelEncoder()
df['Protocol'] = label_encoder.fit_transform(df['Protocol'])
df['Label'] = label_encoder.fit_transform(df['Label'])

# Drop unnecessary columns
df = df.drop(columns=['Source IP', 'Destination IP'])

# Define features (X) and target (y)
X = df.drop(columns=['Label'])
y = df['Label']

# Step 1: Split the dataset into 80% training and 20% unseen testing
X_train, X_unseen_test, y_train, y_unseen_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 2: Further split the 80% training set into 80% training and 20% validation
X_train_split, X_validation_split, y_train_split, y_validation_split = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Step 3: Train the Random Forest model on the 80% of the training split
model = RandomForestClassifier(random_state=42)  # Replace with Random Forest
model.fit(X_train_split, y_train_split)

# Step 4: Evaluate the model on the 20% validation split
y_pred_validation = model.predict(X_validation_split)

# Calculate validation metrics
accuracy_validation = accuracy_score(y_validation_split, y_pred_validation)
precision_validation = precision_score(y_validation_split, y_pred_validation, average='weighted')
recall_validation = recall_score(y_validation_split, y_pred_validation, average='weighted')
f1_validation = f1_score(y_validation_split, y_pred_validation, average='weighted')


# Step 5: Evaluate the model on the 20% unseen testing data
y_pred_unseen = model.predict(X_unseen_test)

# Calculate unseen test metrics
accuracy_unseen = accuracy_score(y_unseen_test, y_pred_unseen)
precision_unseen = precision_score(y_unseen_test, y_pred_unseen, average='weighted')
recall_unseen = recall_score(y_unseen_test, y_pred_unseen, average='weighted')
f1_unseen = f1_score(y_unseen_test, y_pred_unseen, average='weighted')

# Output unseen test results
print(f"Accuracy: {accuracy_unseen:.2f}")
print(f"Precision: {precision_unseen:.2f}")
print(f"Recall: {recall_unseen:.2f}")
print(f"F1-Score: {f1_unseen:.2f}")
# Calculate confusion matrix
conf_matrix = confusion_matrix(y_unseen_test, y_pred_unseen)

# Aggregate TP, TN, FP, FN for all classes
TP = np.sum(np.diag(conf_matrix))  # True Positives
FP = np.sum(np.sum(conf_matrix, axis=0) - np.diag(conf_matrix))  # False Positives
FN = np.sum(np.sum(conf_matrix, axis=1) - np.diag(conf_matrix))  # False Negatives
TN = np.sum(conf_matrix) - (TP + FP + FN)  # True Negatives

# Calculate percentages
total_instances = np.sum(conf_matrix)
TP_percentage = (TP / total_instances) * 100
FP_percentage = (FP / total_instances) * 100
FN_percentage = (FN / total_instances) * 100
TN_percentage = (TN / total_instances) * 100

# Print TP, TN, FP, FN as percentages
print(f"Overall TP: {TP} ({TP_percentage:.2f}%)")
print(f"Overall TN: {TN} ({TN_percentage:.2f}%)")
print(f"Overall FP: {FP} ({FP_percentage:.2f}%)")
print(f"Overall FN: {FN} ({FN_percentage:.2f}%)")




Accuracy: 87.82
Precision: 87.82
Recall: 87.82
F1-Score: 87.78
Overall TP: 89.08
Overall TN: 87.91
Overall FP: 5.55
Overall FN: 5.55


In [24]:
#CNN-A

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import tensorflow as tf
from tensorflow import keras
import tensorflow as tf
from tensorflow.keras import layers, models  # Use from tensorflow.keras

from keras import layers
from tensorflow.keras import layers

# Load the dataset
filenames = [
    '1-All-Together-Update-Single-26-8-24.csv',
    '2-All-Together-Update-Single-26-8-24.csv',
    '3-All-Together-Update-Single-26-8-24.csv',
    '4-All-Together-Update-Single-26-8-24.csv',
    '5-All-Together-Update-Single-26-8-24.csv'
]

# Concatenate all chunks into a single DataFrame
dataframes = [pd.read_csv(filename) for filename in filenames]
data = pd.concat(dataframes, ignore_index=True)

# Apply Label Encoding to the "Protocol" and "Label" columns
label_encoder_protocol = LabelEncoder()
data['Protocol'] = label_encoder_protocol.fit_transform(data['Protocol'])

label_encoder_label = LabelEncoder()
data['Label'] = label_encoder_label.fit_transform(data['Label'])

# Store the labels and drop unnecessary columns
labels = data['Label'].values
data.drop(columns=['Source IP', 'Destination IP', 'Label'], inplace=True)

# Split the data into 80-20 training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=42)

# Further split the training data into 80-20 training-validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Initialize the scaler
scaler = StandardScaler()

# Scale the training, validation, and testing data
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Reshape the data for Conv1D layers
X_train_scaled = X_train_scaled.reshape((X_train_scaled.shape[0], X_train_scaled.shape[1], 1))
X_val_scaled = X_val_scaled.reshape((X_val_scaled.shape[0], X_val_scaled.shape[1], 1))
X_test_scaled = X_test_scaled.reshape((X_test_scaled.shape[0], X_test_scaled.shape[1], 1))

# Define the CNN model
model = keras.Sequential([
    layers.Input(shape=(X_train_scaled.shape[1], 1)),
    layers.Conv1D(filters=64, kernel_size=3, activation='relu'),
    layers.Conv1D(filters=64, kernel_size=3, activation='relu'),
    layers.GlobalMaxPooling1D(),
    layers.Dense(128, activation='relu'),
    layers.Dense(len(np.unique(labels)), activation='softmax')  # Output layer
])

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model with validation data
history = model.fit(
    X_train_scaled, y_train,
    epochs=40,
    batch_size=64,
    validation_data=(X_val_scaled, y_val)
)

# Evaluate the model on the validation set
val_predictions = model.predict(X_val_scaled)
val_predicted_labels = np.argmax(val_predictions, axis=1)

# Calculate accuracy, precision, recall, and F1-score on the validation set
val_accuracy = accuracy_score(y_val, val_predicted_labels)
val_precision = precision_score(y_val, val_predicted_labels, average='weighted', zero_division=0)
val_recall = recall_score(y_val, val_predicted_labels, average='weighted', zero_division=0)
val_f1_score = f1_score(y_val, val_predicted_labels, average='weighted', zero_division=0)



# Evaluate the model on the test set
test_loss, test_accuracy = model.evaluate(X_test_scaled, y_test)
print(f'Test Accuracy: {test_accuracy * 100:.4f}%')

# Make predictions on the test set
test_predictions = model.predict(X_test_scaled)
test_predicted_labels = np.argmax(test_predictions, axis=1)

# Calculate accuracy, precision, recall, and F1-score on the test set
test_accuracy = accuracy_score(y_test, test_predicted_labels)
test_precision = precision_score(y_test, test_predicted_labels, average='weighted', zero_division=0)
test_recall = recall_score(y_test, test_predicted_labels, average='weighted', zero_division=0)
test_f1_score = f1_score(y_test, test_predicted_labels, average='weighted', zero_division=0)

print(f'Accuracy: {test_accuracy * 100:.2f}%')
print(f'Precision: {test_precision:.2f}')
print(f'Recall: {test_recall:.2f}')
print(f'F1-Score: {test_f1_score:.2f}')
# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, test_predicted_labels)

# Aggregate TP, TN, FP, FN for all classes
TP = np.sum(np.diag(conf_matrix))  # True Positives
FP = np.sum(np.sum(conf_matrix, axis=0) - np.diag(conf_matrix))  # False Positives
FN = np.sum(np.sum(conf_matrix, axis=1) - np.diag(conf_matrix))  # False Negatives
TN = np.sum(conf_matrix) - (TP + FP + FN)  # True Negatives

# Calculate percentages
total_instances = np.sum(conf_matrix)
TP_percentage = (TP / total_instances) * 100
FP_percentage = (FP / total_instances) * 100
FN_percentage = (FN / total_instances) * 100
TN_percentage = (TN / total_instances) * 100

# Print TP, TN, FP, FN as percentages
print(f"Overall TP: {TP} ({TP_percentage:.2f}%)")
print(f"Overall TN: {TN} ({TN_percentage:.2f}%)")
print(f"Overall FP: {FP} ({FP_percentage:.2f}%)")
print(f"Overall FN: {FN} ({FN_percentage:.2f}%)")



Epoch 1/40
42178/42178 ━━━━━━━━━━━━━━━━━━━━ 40s 945us/step - accuracy: 0.3285 - loss: 1.9579 - val_accuracy: 0.4997 - val_loss: 1.4701
Epoch 2/40
42178/42178 ━━━━━━━━━━━━━━━━━━━━ 40s 939us/step - accuracy: 0.5212 - loss: 1.4066 - val_accuracy: 0.5870 - val_loss: 1.2128
Epoch 3/40
42178/42178 ━━━━━━━━━━━━━━━━━━━━ 40s 949us/step - accuracy: 0.6010 - loss: 1.1726 - val_accuracy: 0.6376 - val_loss: 1.0738
Epoch 4/40
42178/42178 ━━━━━━━━━━━━━━━━━━━━ 40s 958us/step - accuracy: 0.6481 - loss: 1.0367 - val_accuracy: 0.6757 - val_loss: 0.9585
Epoch 5/40
42178/42178 ━━━━━━━━━━━━━━━━━━━━ 40s 957us/step - accuracy: 0.6792 - loss: 0.9515 - val_accuracy: 0.6796 - val_loss: 0.9381
Epoch 6/40
42178/42178 ━━━━━━━━━━━━━━━━━━━━ 40s 959us/step - accuracy: 0.6997 - loss: 0.8931 - val_accuracy: 0.7054 - val_loss: 0.8718
Epoch 7/40
42178/42178 ━━━━━━━━━━━━━━━━━━━━ 40s 956us/step - accuracy: 0.7163 - loss: 0.8454 - val_accuracy: 0.7235 - val_loss: 0.8206
Epoch 8/40
42178/42178 ━━━━━━━━━━━━━━━━━━━━ 40s 953us/s

In [26]:
#CNN-B

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import tensorflow as tf
from tensorflow import keras
import tensorflow as tf
from tensorflow.keras import layers, models  # Use from tensorflow.keras

from keras import layers
from tensorflow.keras import layers

# Load the dataset
filenames = [
     '1-Update-3-All_together-Bi.csv',
    '2-Update-3-All_together-Bi.csv',
    '3-Update-3-All_together-Bi.csv'
]

# Concatenate all chunks into a single DataFrame
dataframes = [pd.read_csv(filename) for filename in filenames]
data = pd.concat(dataframes, ignore_index=True)

# Apply Label Encoding to the "Protocol" and "Label" columns
label_encoder_protocol = LabelEncoder()
data['Protocol'] = label_encoder_protocol.fit_transform(data['Protocol'])

label_encoder_label = LabelEncoder()
data['Label'] = label_encoder_label.fit_transform(data['Label'])

# Store the labels and drop unnecessary columns
labels = data['Label'].values
data.drop(columns=['Source IP', 'Destination IP', 'Label'], inplace=True)

# Split the data into 80-20 training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=42)

# Further split the training data into 80-20 training-validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Initialize the scaler
scaler = StandardScaler()

# Scale the training, validation, and testing data
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Reshape the data for Conv1D layers
X_train_scaled = X_train_scaled.reshape((X_train_scaled.shape[0], X_train_scaled.shape[1], 1))
X_val_scaled = X_val_scaled.reshape((X_val_scaled.shape[0], X_val_scaled.shape[1], 1))
X_test_scaled = X_test_scaled.reshape((X_test_scaled.shape[0], X_test_scaled.shape[1], 1))

# Define the CNN model
model = keras.Sequential([
    layers.Input(shape=(X_train_scaled.shape[1], 1)),
    layers.Conv1D(filters=64, kernel_size=3, activation='relu'),
    layers.Conv1D(filters=64, kernel_size=3, activation='relu'),
    layers.GlobalMaxPooling1D(),
    layers.Dense(128, activation='relu'),
    layers.Dense(len(np.unique(labels)), activation='softmax')  # Output layer
])

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model with validation data
history = model.fit(
    X_train_scaled, y_train,
    epochs=40,
    batch_size=64,
    validation_data=(X_val_scaled, y_val)
)

# Evaluate the model on the validation set
val_predictions = model.predict(X_val_scaled)
val_predicted_labels = np.argmax(val_predictions, axis=1)

# Calculate accuracy, precision, recall, and F1-score on the validation set
val_accuracy = accuracy_score(y_val, val_predicted_labels)
val_precision = precision_score(y_val, val_predicted_labels, average='weighted', zero_division=0)
val_recall = recall_score(y_val, val_predicted_labels, average='weighted', zero_division=0)
val_f1_score = f1_score(y_val, val_predicted_labels, average='weighted', zero_division=0)



# Evaluate the model on the test set
test_loss, test_accuracy = model.evaluate(X_test_scaled, y_test)
print(f'Test Accuracy: {test_accuracy * 100:.4f}%')

# Make predictions on the test set
test_predictions = model.predict(X_test_scaled)
test_predicted_labels = np.argmax(test_predictions, axis=1)

# Calculate accuracy, precision, recall, and F1-score on the test set
test_accuracy = accuracy_score(y_test, test_predicted_labels)
test_precision = precision_score(y_test, test_predicted_labels, average='weighted', zero_division=0)
test_recall = recall_score(y_test, test_predicted_labels, average='weighted', zero_division=0)
test_f1_score = f1_score(y_test, test_predicted_labels, average='weighted', zero_division=0)

print(f'Accuracy: {test_accuracy * 100:.2f}%')
print(f'Precision: {test_precision:.2f}')
print(f'Recall: {test_recall:.2f}')
print(f'F1-Score: {test_f1_score:.2f}')
# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, test_predicted_labels)

# Aggregate TP, TN, FP, FN for all classes
TP = np.sum(np.diag(conf_matrix))  # True Positives
FP = np.sum(np.sum(conf_matrix, axis=0) - np.diag(conf_matrix))  # False Positives
FN = np.sum(np.sum(conf_matrix, axis=1) - np.diag(conf_matrix))  # False Negatives
TN = np.sum(conf_matrix) - (TP + FP + FN)  # True Negatives

# Calculate percentages
total_instances = np.sum(conf_matrix)
TP_percentage = (TP / total_instances) * 100
FP_percentage = (FP / total_instances) * 100
FN_percentage = (FN / total_instances) * 100
TN_percentage = (TN / total_instances) * 100

# Print TP, TN, FP, FN as percentages
print(f"Overall TP: {TP} ({TP_percentage:.2f}%)")
print(f"Overall TN: {TN} ({TN_percentage:.2f}%)")
print(f"Overall FP: {FP} ({FP_percentage:.2f}%)")
print(f"Overall FN: {FN} ({FN_percentage:.2f}%)")



Epoch 1/40
20411/20411 ━━━━━━━━━━━━━━━━━━━━ 20s 959us/step - accuracy: 0.3023 - loss: 2.0403 - val_accuracy: 0.4829 - val_loss: 1.5355
Epoch 2/40
20411/20411 ━━━━━━━━━━━━━━━━━━━━ 19s 948us/step - accuracy: 0.5018 - loss: 1.4638 - val_accuracy: 0.5793 - val_loss: 1.2527
Epoch 3/40
20411/20411 ━━━━━━━━━━━━━━━━━━━━ 19s 951us/step - accuracy: 0.5966 - loss: 1.1916 - val_accuracy: 0.6430 - val_loss: 1.0462
Epoch 4/40
20411/20411 ━━━━━━━━━━━━━━━━━━━━ 19s 950us/step - accuracy: 0.6545 - loss: 1.0209 - val_accuracy: 0.6949 - val_loss: 0.9146
Epoch 5/40
20411/20411 ━━━━━━━━━━━━━━━━━━━━ 19s 947us/step - accuracy: 0.6914 - loss: 0.9116 - val_accuracy: 0.7069 - val_loss: 0.8621
Epoch 6/40
20411/20411 ━━━━━━━━━━━━━━━━━━━━ 20s 953us/step - accuracy: 0.7149 - loss: 0.8404 - val_accuracy: 0.7158 - val_loss: 0.8368
Epoch 7/40
20411/20411 ━━━━━━━━━━━━━━━━━━━━ 20s 955us/step - accuracy: 0.7333 - loss: 0.7861 - val_accuracy: 0.7494 - val_loss: 0.7419
Epoch 8/40
20411/20411 ━━━━━━━━━━━━━━━━━━━━ 20s 959us/s

In [27]:
#LSTM-A
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import tensorflow as tf
from tensorflow.keras import layers, Model

# Load the dataset
filenames = [
    '1-All-Together-Update-Single-26-8-24.csv',
    '2-All-Together-Update-Single-26-8-24.csv',
    '3-All-Together-Update-Single-26-8-24.csv',
    '4-All-Together-Update-Single-26-8-24.csv',
    '5-All-Together-Update-Single-26-8-24.csv'
]

# Concatenate all chunks into a single DataFrame
dataframes = [pd.read_csv(filename) for filename in filenames]
data = pd.concat(dataframes, ignore_index=True)

# Apply Label Encoding to the "Protocol" and "Label" columns
label_encoder = LabelEncoder()
data['Protocol'] = label_encoder.fit_transform(data['Protocol'])
data['Label'] = label_encoder.fit_transform(data['Label'])

# Separate features and target
X = data.drop(columns=['Source IP', 'Destination IP', 'Label'])
y = data['Label']

# Step 1: Split into Train, Validation, and Test sets
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.2, random_state=42)

# Step 2: Initialize the scaler and fit it on the training data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Step 3: Reshape the data for LSTM input (LSTM expects 3D input: [samples, timesteps, features])
X_train_reshaped = X_train_scaled.reshape((X_train_scaled.shape[0], 1, X_train_scaled.shape[1]))
X_val_reshaped = X_val_scaled.reshape((X_val_scaled.shape[0], 1, X_val_scaled.shape[1]))
X_test_reshaped = X_test_scaled.reshape((X_test_scaled.shape[0], 1, X_test_scaled.shape[1]))

# Step 4: Define the LSTM model using Functional API
inputs = layers.Input(shape=(1, X_train_reshaped.shape[2]))
x = layers.LSTM(128, activation='tanh', return_sequences=True)(inputs)
x = layers.LSTM(64, activation='tanh')(x)
outputs = layers.Dense(len(np.unique(y)), activation='softmax')(x)
lstm_model = Model(inputs, outputs)

# Compile and train the LSTM model
lstm_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
lstm_model.fit(X_train_reshaped, y_train, epochs=40, batch_size=64, validation_data=(X_val_reshaped, y_val))

# --- Validation Metrics ---
# Make predictions with the LSTM model on validation set
y_pred_val = np.argmax(lstm_model.predict(X_val_reshaped), axis=-1)

# Calculate accuracy, precision, recall, and F1-score on the validation set
val_accuracy = accuracy_score(y_val, y_pred_val)
val_precision = precision_score(y_val, y_pred_val, average='weighted')
val_recall = recall_score(y_val, y_pred_val, average='weighted')
val_f1_score = f1_score(y_val, y_pred_val, average='weighted')

# --- Test Metrics ---
# Make predictions with the LSTM model on the test set
y_pred_test = np.argmax(lstm_model.predict(X_test_reshaped), axis=-1)

# Calculate accuracy, precision, recall, and F1-score on the test set
test_accuracy = accuracy_score(y_test, y_pred_test)
test_precision = precision_score(y_test, y_pred_test, average='weighted')
test_recall = recall_score(y_test, y_pred_test, average='weighted')
test_f1_score = f1_score(y_test, y_pred_test, average='weighted')

print(f'\nAccuracy:{test_accuracy * 100:.2f}%')
print(f'Precision: {test_precision:.2f}')
print(f'Recall: {test_recall:.2f}')
print(f'F1-Score: {test_f1_score:.2f}')
# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred_test)

# Aggregate TP, TN, FP, FN for all classes
TP = np.sum(np.diag(conf_matrix))  # True Positives
FP = np.sum(np.sum(conf_matrix, axis=0) - np.diag(conf_matrix))  # False Positives
FN = np.sum(np.sum(conf_matrix, axis=1) - np.diag(conf_matrix))  # False Negatives
TN = np.sum(conf_matrix) - (TP + FP + FN)  # True Negatives

# Calculate percentages
total_instances = np.sum(conf_matrix)
TP_percentage = (TP / total_instances) * 100
FP_percentage = (FP / total_instances) * 100
FN_percentage = (FN / total_instances) * 100
TN_percentage = (TN / total_instances) * 100

# Print TP, TN, FP, FN as percentages
print(f"Overall TP: {TP} ({TP_percentage:.2f}%)")
print(f"Overall TN: {TN} ({TN_percentage:.2f}%)")
print(f"Overall FP: {FP} ({FP_percentage:.2f}%)")
print(f"Overall FN: {FN} ({FN_percentage:.2f}%)")



Epoch 1/40
42178/42178 ━━━━━━━━━━━━━━━━━━━━ 46s 1ms/step - accuracy: 0.3973 - loss: 1.7740 - val_accuracy: 0.6370 - val_loss: 1.1144
Epoch 2/40
42178/42178 ━━━━━━━━━━━━━━━━━━━━ 45s 1ms/step - accuracy: 0.6588 - loss: 1.0388 - val_accuracy: 0.7163 - val_loss: 0.8656
Epoch 3/40
42178/42178 ━━━━━━━━━━━━━━━━━━━━ 44s 1ms/step - accuracy: 0.7297 - loss: 0.8373 - val_accuracy: 0.7609 - val_loss: 0.7501
Epoch 4/40
42178/42178 ━━━━━━━━━━━━━━━━━━━━ 45s 1ms/step - accuracy: 0.7685 - loss: 0.7257 - val_accuracy: 0.7892 - val_loss: 0.6670
Epoch 5/40
42178/42178 ━━━━━━━━━━━━━━━━━━━━ 44s 1ms/step - accuracy: 0.7927 - loss: 0.6556 - val_accuracy: 0.8091 - val_loss: 0.6069
Epoch 6/40
42178/42178 ━━━━━━━━━━━━━━━━━━━━ 45s 1ms/step - accuracy: 0.8120 - loss: 0.6031 - val_accuracy: 0.8203 - val_loss: 0.5719
Epoch 7/40
42178/42178 ━━━━━━━━━━━━━━━━━━━━ 45s 1ms/step - accuracy: 0.8257 - loss: 0.5623 - val_accuracy: 0.8325 - val_loss: 0.5408
Epoch 8/40
42178/42178 ━━━━━━━━━━━━━━━━━━━━ 44s 1ms/step - accuracy: 

In [28]:
#LSTM-B
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import tensorflow as tf
from tensorflow.keras import layers, Model

# Load the dataset
filenames = [
    '1-Update-3-All_together-Bi.csv',
    '2-Update-3-All_together-Bi.csv',
    '3-Update-3-All_together-Bi.csv'
]

# Concatenate all chunks into a single DataFrame
dataframes = [pd.read_csv(filename) for filename in filenames]
data = pd.concat(dataframes, ignore_index=True)

# Apply Label Encoding to the "Protocol" and "Label" columns
label_encoder = LabelEncoder()
data['Protocol'] = label_encoder.fit_transform(data['Protocol'])
data['Label'] = label_encoder.fit_transform(data['Label'])

# Separate features and target
X = data.drop(columns=['Source IP', 'Destination IP', 'Label'])
y = data['Label']

# Step 1: Split into Train, Validation, and Test sets
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.2, random_state=42)

# Step 2: Initialize the scaler and fit it on the training data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Step 3: Reshape the data for LSTM input (LSTM expects 3D input: [samples, timesteps, features])
X_train_reshaped = X_train_scaled.reshape((X_train_scaled.shape[0], 1, X_train_scaled.shape[1]))
X_val_reshaped = X_val_scaled.reshape((X_val_scaled.shape[0], 1, X_val_scaled.shape[1]))
X_test_reshaped = X_test_scaled.reshape((X_test_scaled.shape[0], 1, X_test_scaled.shape[1]))

# Step 4: Define the LSTM model using Functional API
inputs = layers.Input(shape=(1, X_train_reshaped.shape[2]))
x = layers.LSTM(128, activation='tanh', return_sequences=True)(inputs)
x = layers.LSTM(64, activation='tanh')(x)
outputs = layers.Dense(len(np.unique(y)), activation='softmax')(x)
lstm_model = Model(inputs, outputs)

# Compile and train the LSTM model
lstm_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
lstm_model.fit(X_train_reshaped, y_train, epochs=40, batch_size=64, validation_data=(X_val_reshaped, y_val))

# --- Validation Metrics ---
# Make predictions with the LSTM model on validation set
y_pred_val = np.argmax(lstm_model.predict(X_val_reshaped), axis=-1)

# Calculate accuracy, precision, recall, and F1-score on the validation set
val_accuracy = accuracy_score(y_val, y_pred_val)
val_precision = precision_score(y_val, y_pred_val, average='weighted')
val_recall = recall_score(y_val, y_pred_val, average='weighted')
val_f1_score = f1_score(y_val, y_pred_val, average='weighted')

# --- Test Metrics ---
# Make predictions with the LSTM model on the test set
y_pred_test = np.argmax(lstm_model.predict(X_test_reshaped), axis=-1)

# Calculate accuracy, precision, recall, and F1-score on the test set
test_accuracy = accuracy_score(y_test, y_pred_test)
test_precision = precision_score(y_test, y_pred_test, average='weighted')
test_recall = recall_score(y_test, y_pred_test, average='weighted')
test_f1_score = f1_score(y_test, y_pred_test, average='weighted')

print(f'\nAccuracy:{test_accuracy * 100:.2f}%')
print(f'Precision: {test_precision:.2f}')
print(f'Recall: {test_recall:.2f}')
print(f'F1-Score: {test_f1_score:.2f}')
# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred_test)

# Aggregate TP, TN, FP, FN for all classes
TP = np.sum(np.diag(conf_matrix))  # True Positives
FP = np.sum(np.sum(conf_matrix, axis=0) - np.diag(conf_matrix))  # False Positives
FN = np.sum(np.sum(conf_matrix, axis=1) - np.diag(conf_matrix))  # False Negatives
TN = np.sum(conf_matrix) - (TP + FP + FN)  # True Negatives

# Calculate percentages
total_instances = np.sum(conf_matrix)
TP_percentage = (TP / total_instances) * 100
FP_percentage = (FP / total_instances) * 100
FN_percentage = (FN / total_instances) * 100
TN_percentage = (TN / total_instances) * 100

# Print TP, TN, FP, FN as percentages
print(f"Overall TP: {TP} ({TP_percentage:.2f}%)")
print(f"Overall TN: {TN} ({TN_percentage:.2f}%)")
print(f"Overall FP: {FP} ({FP_percentage:.2f}%)")
print(f"Overall FN: {FN} ({FN_percentage:.2f}%)")



Epoch 1/40
20411/20411 ━━━━━━━━━━━━━━━━━━━━ 23s 1ms/step - accuracy: 0.3281 - loss: 1.9645 - val_accuracy: 0.5234 - val_loss: 1.4337
Epoch 2/40
20411/20411 ━━━━━━━━━━━━━━━━━━━━ 22s 1ms/step - accuracy: 0.5584 - loss: 1.3279 - val_accuracy: 0.6272 - val_loss: 1.1195
Epoch 3/40
20411/20411 ━━━━━━━━━━━━━━━━━━━━ 21s 1ms/step - accuracy: 0.6442 - loss: 1.0774 - val_accuracy: 0.6809 - val_loss: 0.9649
Epoch 4/40
20411/20411 ━━━━━━━━━━━━━━━━━━━━ 22s 1ms/step - accuracy: 0.6915 - loss: 0.9435 - val_accuracy: 0.7135 - val_loss: 0.8741
Epoch 5/40
20411/20411 ━━━━━━━━━━━━━━━━━━━━ 22s 1ms/step - accuracy: 0.7219 - loss: 0.8560 - val_accuracy: 0.7403 - val_loss: 0.8070
Epoch 6/40
20411/20411 ━━━━━━━━━━━━━━━━━━━━ 21s 1ms/step - accuracy: 0.7456 - loss: 0.7899 - val_accuracy: 0.7620 - val_loss: 0.7457
Epoch 7/40
20411/20411 ━━━━━━━━━━━━━━━━━━━━ 22s 1ms/step - accuracy: 0.7647 - loss: 0.7358 - val_accuracy: 0.7751 - val_loss: 0.7071
Epoch 8/40
20411/20411 ━━━━━━━━━━━━━━━━━━━━ 22s 1ms/step - accuracy: 

In [29]:
#FFNN-A
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Load the dataset
filenames = [
    '1-All-Together-Update-Single-26-8-24.csv',
    '2-All-Together-Update-Single-26-8-24.csv',
    '3-All-Together-Update-Single-26-8-24.csv',
    '4-All-Together-Update-Single-26-8-24.csv',
    '5-All-Together-Update-Single-26-8-24.csv'
    
]

# Concatenate all chunks into a single DataFrame
dataframes = [pd.read_csv(filename) for filename in filenames]
data = pd.concat(dataframes, ignore_index=True)

# Apply Label Encoding to the "Protocol" and "Label" columns
label_encoder = LabelEncoder()
data['Protocol'] = label_encoder.fit_transform(data['Protocol'])
data['Label'] = label_encoder.fit_transform(data['Label'])

# Separate features and target
X = data.drop(columns=['Source IP', 'Destination IP', 'Label'])
y = data['Label']

# Step 1: Split into Train, Validation, and Test sets
# First, split into Train+Validation and Test
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Then, split Train+Validation into Train and Validation
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.2, random_state=42)  # 0.25 x 0.8 = 0.2 of total data

# Step 2: Initialize the scaler and fit it on the training data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Scale the validation and test sets using the scaler fitted on the training data
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Define the feedforward neural network model
model = keras.Sequential([
    layers.Input(shape=(X_train_scaled.shape[1],)),
    layers.Dense(128, activation='relu'),
    layers.Dense(64, activation='relu'),
    layers.Dense(len(np.unique(y)), activation='softmax')  # Adjust based on the number of target classes
])

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model with validation data
model.fit(X_train_scaled, y_train, epochs=40, batch_size=64, validation_data=(X_val_scaled, y_val))

# Evaluate the model on the test set
test_loss, test_accuracy = model.evaluate(X_test_scaled, y_test)
print(f'Test Accuracy: {test_accuracy * 100:.4f}%')

# Make predictions on the test set
predictions_test = model.predict(X_test_scaled)
predicted_labels_test = np.argmax(predictions_test, axis=1)

# Calculate accuracy, precision, recall, and F1-score on the test set
test_accuracy = accuracy_score(y_test, predicted_labels_test)
test_precision = precision_score(y_test, predicted_labels_test, average='weighted')
test_recall = recall_score(y_test, predicted_labels_test, average='weighted')
test_f1_score = f1_score(y_test, predicted_labels_test, average='weighted')

print(f'Accuracy: {test_accuracy * 100:.2f}%')
print(f'Precision: {test_precision:.2f}')
print(f'Recall: {test_recall:.2f}')
print(f'F1-Score: {test_f1_score:.2f}')

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, predicted_labels_test)

# Aggregate TP, TN, FP, FN for all classes
TP = np.sum(np.diag(conf_matrix))  # True Positives
FP = np.sum(np.sum(conf_matrix, axis=0) - np.diag(conf_matrix))  # False Positives
FN = np.sum(np.sum(conf_matrix, axis=1) - np.diag(conf_matrix))  # False Negatives
TN = np.sum(conf_matrix) - (TP + FP + FN)  # True Negatives

# Calculate percentages
total_instances = np.sum(conf_matrix)
TP_percentage = (TP / total_instances) * 100
FP_percentage = (FP / total_instances) * 100
FN_percentage = (FN / total_instances) * 100
TN_percentage = (TN / total_instances) * 100

# Print TP, TN, FP, FN as percentages
print(f"Overall TP: {TP} ({TP_percentage:.2f}%)")
print(f"Overall TN: {TN} ({TN_percentage:.2f}%)")
print(f"Overall FP: {FP} ({FP_percentage:.2f}%)")
print(f"Overall FN: {FN} ({FN_percentage:.2f}%)")



Epoch 1/40
42178/42178 ━━━━━━━━━━━━━━━━━━━━ 20s 456us/step - accuracy: 0.3273 - loss: 1.9555 - val_accuracy: 0.4501 - val_loss: 1.5869
Epoch 2/40
42178/42178 ━━━━━━━━━━━━━━━━━━━━ 20s 472us/step - accuracy: 0.4807 - loss: 1.5248 - val_accuracy: 0.5258 - val_loss: 1.3970
Epoch 3/40
42178/42178 ━━━━━━━━━━━━━━━━━━━━ 20s 475us/step - accuracy: 0.5373 - loss: 1.3556 - val_accuracy: 0.5593 - val_loss: 1.2849
Epoch 4/40
42178/42178 ━━━━━━━━━━━━━━━━━━━━ 20s 474us/step - accuracy: 0.5719 - loss: 1.2581 - val_accuracy: 0.5971 - val_loss: 1.1893
Epoch 5/40
42178/42178 ━━━━━━━━━━━━━━━━━━━━ 20s 477us/step - accuracy: 0.5972 - loss: 1.1847 - val_accuracy: 0.6006 - val_loss: 1.1671
Epoch 6/40
42178/42178 ━━━━━━━━━━━━━━━━━━━━ 21s 485us/step - accuracy: 0.6144 - loss: 1.1353 - val_accuracy: 0.6229 - val_loss: 1.1097
Epoch 7/40
42178/42178 ━━━━━━━━━━━━━━━━━━━━ 20s 474us/step - accuracy: 0.6295 - loss: 1.0940 - val_accuracy: 0.6509 - val_loss: 1.0480
Epoch 8/40
42178/42178 ━━━━━━━━━━━━━━━━━━━━ 20s 478us/s

In [30]:
#FFNN-B
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Load the dataset
filenames = [
    '1-Update-3-All_together-Bi.csv',
    '2-Update-3-All_together-Bi.csv',
    '3-Update-3-All_together-Bi.csv' 
    
]

# Concatenate all chunks into a single DataFrame
dataframes = [pd.read_csv(filename) for filename in filenames]
data = pd.concat(dataframes, ignore_index=True)

# Apply Label Encoding to the "Protocol" and "Label" columns
label_encoder = LabelEncoder()
data['Protocol'] = label_encoder.fit_transform(data['Protocol'])
data['Label'] = label_encoder.fit_transform(data['Label'])

# Separate features and target
X = data.drop(columns=['Source IP', 'Destination IP', 'Label'])
y = data['Label']

# Step 1: Split into Train, Validation, and Test sets
# First, split into Train+Validation and Test
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Then, split Train+Validation into Train and Validation
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.2, random_state=42)  # 0.25 x 0.8 = 0.2 of total data

# Step 2: Initialize the scaler and fit it on the training data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Scale the validation and test sets using the scaler fitted on the training data
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Define the feedforward neural network model
model = keras.Sequential([
    layers.Input(shape=(X_train_scaled.shape[1],)),
    layers.Dense(128, activation='relu'),
    layers.Dense(64, activation='relu'),
    layers.Dense(len(np.unique(y)), activation='softmax')  # Adjust based on the number of target classes
])

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model with validation data
model.fit(X_train_scaled, y_train, epochs=40, batch_size=64, validation_data=(X_val_scaled, y_val))

# Evaluate the model on the test set
test_loss, test_accuracy = model.evaluate(X_test_scaled, y_test)
print(f'Test Accuracy: {test_accuracy * 100:.4f}%')

# Make predictions on the test set
predictions_test = model.predict(X_test_scaled)
predicted_labels_test = np.argmax(predictions_test, axis=1)

# Calculate accuracy, precision, recall, and F1-score on the test set
test_accuracy = accuracy_score(y_test, predicted_labels_test)
test_precision = precision_score(y_test, predicted_labels_test, average='weighted')
test_recall = recall_score(y_test, predicted_labels_test, average='weighted')
test_f1_score = f1_score(y_test, predicted_labels_test, average='weighted')

print(f'Accuracy: {test_accuracy * 100:.2f}%')
print(f'Precision: {test_precision:.2f}')
print(f'Recall: {test_recall:.2f}')
print(f'F1-Score: {test_f1_score:.2f}')

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, predicted_labels_test)

# Aggregate TP, TN, FP, FN for all classes
TP = np.sum(np.diag(conf_matrix))  # True Positives
FP = np.sum(np.sum(conf_matrix, axis=0) - np.diag(conf_matrix))  # False Positives
FN = np.sum(np.sum(conf_matrix, axis=1) - np.diag(conf_matrix))  # False Negatives
TN = np.sum(conf_matrix) - (TP + FP + FN)  # True Negatives

# Calculate percentages
total_instances = np.sum(conf_matrix)
TP_percentage = (TP / total_instances) * 100
FP_percentage = (FP / total_instances) * 100
FN_percentage = (FN / total_instances) * 100
TN_percentage = (TN / total_instances) * 100

# Print TP, TN, FP, FN as percentages
print(f"Overall TP: {TP} ({TP_percentage:.2f}%)")
print(f"Overall TN: {TN} ({TN_percentage:.2f}%)")
print(f"Overall FP: {FP} ({FP_percentage:.2f}%)")
print(f"Overall FN: {FN} ({FN_percentage:.2f}%)")



Epoch 1/40
17860/17860 ━━━━━━━━━━━━━━━━━━━━ 10s 528us/step - accuracy: 0.2873 - loss: 2.0690 - val_accuracy: 0.3874 - val_loss: 1.7782
Epoch 2/40
17860/17860 ━━━━━━━━━━━━━━━━━━━━ 9s 521us/step - accuracy: 0.4084 - loss: 1.7221 - val_accuracy: 0.4560 - val_loss: 1.5946
Epoch 3/40
17860/17860 ━━━━━━━━━━━━━━━━━━━━ 10s 541us/step - accuracy: 0.4646 - loss: 1.5658 - val_accuracy: 0.4956 - val_loss: 1.4947
Epoch 4/40
17860/17860 ━━━━━━━━━━━━━━━━━━━━ 10s 530us/step - accuracy: 0.5028 - loss: 1.4662 - val_accuracy: 0.5215 - val_loss: 1.4068
Epoch 5/40
17860/17860 ━━━━━━━━━━━━━━━━━━━━ 9s 526us/step - accuracy: 0.5295 - loss: 1.3926 - val_accuracy: 0.5312 - val_loss: 1.3713
Epoch 6/40
17860/17860 ━━━━━━━━━━━━━━━━━━━━ 9s 527us/step - accuracy: 0.5502 - loss: 1.3345 - val_accuracy: 0.5670 - val_loss: 1.2976
Epoch 7/40
17860/17860 ━━━━━━━━━━━━━━━━━━━━ 9s 528us/step - accuracy: 0.5677 - loss: 1.2854 - val_accuracy: 0.5592 - val_loss: 1.2843
Epoch 8/40
17860/17860 ━━━━━━━━━━━━━━━━━━━━ 10s 532us/step 

In [20]:
#TabNet-A
# Import necessary libraries
import pandas as pd
import numpy as np
import torch
from sklearn.preprocessing import LabelEncoder
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Load the dataset
filenames = [
     '1-All-Together-Update-Single-26-8-24.csv',
   '2-All-Together-Update-Single-26-8-24.csv',
    '3-All-Together-Update-Single-26-8-24.csv',
    '4-All-Together-Update-Single-26-8-24.csv',
   '5-All-Together-Update-Single-26-8-24.csv'
   
]

# Concatenate all chunks into a single DataFrame
dataframes = [pd.read_csv(filename) for filename in filenames]
data = pd.concat(dataframes, ignore_index=True)

# Drop 'Source IP' and 'Destination IP' columns
data = data.drop(columns=['Source IP', 'Destination IP'])

# Convert categorical 'Protocol' column to numerical using Label Encoding
label_encoder = LabelEncoder()
data['Protocol'] = label_encoder.fit_transform(data['Protocol'])

# Split the data into features (X) and labels (y)
X = data.drop(columns=['Label'])  # Assuming 'Label' is your target column
y = data['Label']

# Step 1: Split into Training (80%) and Testing (20%)
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Step 2: Further split Training into Training (80%) and Validation (20%)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.2, random_state=42, stratify=y_train_val)

# Define the TabNet model with fine-tuned hyperparameters
tabnet_model = TabNetClassifier(
    n_d=32,  # Reduced Decision layer output size
    n_a=32,  # Reduced Attention layer size
    n_steps=3,  # Reduced number of hidden layers (decision steps)
    gamma=1.0,  # Adjusted coefficient for feature reusage in decision step
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=1e-2),  # Reduced learning rate
    scheduler_params={"step_size":20, "gamma":0.8},  # Adjusted learning rate scheduler
    mask_type='entmax',  # Changed mask type in TabNet
)

# Train the TabNet model
tabnet_model.fit(
    X_train=X_train.values, y_train=y_train.values,
    eval_set=[(X_val.values, y_val.values), (X_test.values, y_test.values)],
    eval_name=['validation', 'test'],
    eval_metric=['accuracy'],
    max_epochs=40,  # Increase number of epochs
    patience=10,  # Patience for early stopping
    batch_size=1024, virtual_batch_size=128,
    num_workers=1, drop_last=False
)

# Evaluate the model on Validation and Test sets
val_preds = tabnet_model.predict(X_val.values)
test_preds = tabnet_model.predict(X_test.values)

# Calculate metrics for Validation set
val_accuracy = accuracy_score(y_val, val_preds)
val_precision = precision_score(y_val, val_preds, average='weighted')
val_recall = recall_score(y_val, val_preds, average='weighted')
val_f1 = f1_score(y_val, val_preds, average='weighted')



# Calculate metrics for Test set
test_accuracy = accuracy_score(y_test, test_preds)
test_precision = precision_score(y_test, test_preds, average='weighted')
test_recall = recall_score(y_test, test_preds, average='weighted')
test_f1 = f1_score(y_test, test_preds, average='weighted')

print(f"Accuracy: {test_accuracy * 100:.2f}%")
print(f"Precision: {test_precision * 100:.2f}%")
print(f"Recall: {test_recall * 100:.2f}%")
print(f"F1-Score: {test_f1 * 100:.2f}%")

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_val, val_preds)

# Aggregate TP, TN, FP, FN for all classes
TP = np.sum(np.diag(conf_matrix))  # True Positives
FP = np.sum(np.sum(conf_matrix, axis=0) - np.diag(conf_matrix))  # False Positives
FN = np.sum(np.sum(conf_matrix, axis=1) - np.diag(conf_matrix))  # False Negatives
TN = np.sum(conf_matrix) - (TP + FP + FN)  # True Negatives

# Calculate percentages
total_instances = np.sum(conf_matrix)
TP_percentage = (TP / total_instances) * 100
FP_percentage = (FP / total_instances) * 100
FN_percentage = (FN / total_instances) * 100
TN_percentage = (TN / total_instances) * 100

# Print TP, TN, FP, FN as percentages
print(f"Overall TP: {TP} ({TP_percentage:.2f}%)")
print(f"Overall TN: {TN} ({TN_percentage:.2f}%)")
print(f"Overall FP: {FP} ({FP_percentage:.2f}%)")
print(f"Overall FN: {FN} ({FN_percentage:.2f}%)")



epoch 0  | loss: 1.97642 | validation_accuracy: 0.40161 | test_accuracy: 0.40212 |  0:02:05s
epoch 1  | loss: 1.62754 | validation_accuracy: 0.46586 | test_accuracy: 0.46533 |  0:04:10s
epoch 2  | loss: 1.49843 | validation_accuracy: 0.50719 | test_accuracy: 0.50751 |  0:06:16s
epoch 3  | loss: 1.45283 | validation_accuracy: 0.57678 | test_accuracy: 0.57787 |  0:08:20s
epoch 4  | loss: 1.40475 | validation_accuracy: 0.52407 | test_accuracy: 0.52488 |  0:10:22s
epoch 5  | loss: 1.35478 | validation_accuracy: 0.49926 | test_accuracy: 0.49917 |  0:12:27s
epoch 6  | loss: 1.33864 | validation_accuracy: 0.60885 | test_accuracy: 0.60972 |  0:14:36s
epoch 7  | loss: 1.31064 | validation_accuracy: 0.5449 | test_accuracy: 0.54486 |  0:16:45s
epoch 8  | loss: 1.29692 | validation_accuracy: 0.51509 | test_accuracy: 0.51513 |  0:18:54s
epoch 9  | loss: 1.28205 | validation_accuracy: 0.55115 | test_accuracy: 0.55073 |  0:21:03s
epoch 10  | loss: 1.21888 | validation_accuracy: 0.59068 | test_accurac

In [21]:
#TabNet-B
# Import necessary libraries
import pandas as pd
import numpy as np
import torch
from sklearn.preprocessing import LabelEncoder
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Load the dataset
filenames = [
     '1-Update-3-All_together-Bi.csv',
    '2-Update-3-All_together-Bi.csv',
    '3-Update-3-All_together-Bi.csv'
   
]

# Concatenate all chunks into a single DataFrame
dataframes = [pd.read_csv(filename) for filename in filenames]
data = pd.concat(dataframes, ignore_index=True)

# Drop 'Source IP' and 'Destination IP' columns
data = data.drop(columns=['Source IP', 'Destination IP'])

# Convert categorical 'Protocol' column to numerical using Label Encoding
label_encoder = LabelEncoder()
data['Protocol'] = label_encoder.fit_transform(data['Protocol'])

# Split the data into features (X) and labels (y)
X = data.drop(columns=['Label'])  # Assuming 'Label' is your target column
y = data['Label']

# Step 1: Split into Training (80%) and Testing (20%)
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Step 2: Further split Training into Training (80%) and Validation (20%)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.2, random_state=42, stratify=y_train_val)

# Define the TabNet model with fine-tuned hyperparameters
tabnet_model = TabNetClassifier(
    n_d=32,  # Reduced Decision layer output size
    n_a=32,  # Reduced Attention layer size
    n_steps=3,  # Reduced number of hidden layers (decision steps)
    gamma=1.0,  # Adjusted coefficient for feature reusage in decision step
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=1e-2),  # Reduced learning rate
    scheduler_params={"step_size":20, "gamma":0.8},  # Adjusted learning rate scheduler
    mask_type='entmax',  # Changed mask type in TabNet
)

# Train the TabNet model
tabnet_model.fit(
    X_train=X_train.values, y_train=y_train.values,
    eval_set=[(X_val.values, y_val.values), (X_test.values, y_test.values)],
    eval_name=['validation', 'test'],
    eval_metric=['accuracy'],
    max_epochs=40,  # Increase number of epochs
    patience=10,  # Patience for early stopping
    batch_size=1024, virtual_batch_size=128,
    num_workers=1, drop_last=False
)

# Evaluate the model on Validation and Test sets
val_preds = tabnet_model.predict(X_val.values)
test_preds = tabnet_model.predict(X_test.values)

# Calculate metrics for Validation set
val_accuracy = accuracy_score(y_val, val_preds)
val_precision = precision_score(y_val, val_preds, average='weighted')
val_recall = recall_score(y_val, val_preds, average='weighted')
val_f1 = f1_score(y_val, val_preds, average='weighted')



# Calculate metrics for Test set
test_accuracy = accuracy_score(y_test, test_preds)
test_precision = precision_score(y_test, test_preds, average='weighted')
test_recall = recall_score(y_test, test_preds, average='weighted')
test_f1 = f1_score(y_test, test_preds, average='weighted')

print(f"Accuracy: {test_accuracy * 100:.2f}%")
print(f"Precision: {test_precision * 100:.2f}%")
print(f"Recall: {test_recall * 100:.2f}%")
print(f"F1-Score: {test_f1 * 100:.2f}%")

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_val, val_preds)

# Aggregate TP, TN, FP, FN for all classes
TP = np.sum(np.diag(conf_matrix))  # True Positives
FP = np.sum(np.sum(conf_matrix, axis=0) - np.diag(conf_matrix))  # False Positives
FN = np.sum(np.sum(conf_matrix, axis=1) - np.diag(conf_matrix))  # False Negatives
TN = np.sum(conf_matrix) - (TP + FP + FN)  # True Negatives

# Calculate percentages
total_instances = np.sum(conf_matrix)
TP_percentage = (TP / total_instances) * 100
FP_percentage = (FP / total_instances) * 100
FN_percentage = (FN / total_instances) * 100
TN_percentage = (TN / total_instances) * 100

# Print TP, TN, FP, FN as percentages
print(f"Overall TP: {TP} ({TP_percentage:.2f}%)")
print(f"Overall TN: {TN} ({TN_percentage:.2f}%)")
print(f"Overall FP: {FP} ({FP_percentage:.2f}%)")
print(f"Overall FN: {FN} ({FN_percentage:.2f}%)")



epoch 0  | loss: 2.1253 | validation_accuracy: 0.34256 | test_accuracy: 0.34316 |  0:01:03s
epoch 1  | loss: 1.81025 | validation_accuracy: 0.42152 | test_accuracy: 0.4219 |  0:02:06s
epoch 2  | loss: 1.64385 | validation_accuracy: 0.4915 | test_accuracy: 0.49245 |  0:03:09s
epoch 3  | loss: 1.55703 | validation_accuracy: 0.47667 | test_accuracy: 0.47763 |  0:04:12s
epoch 4  | loss: 1.49588 | validation_accuracy: 0.47285 | test_accuracy: 0.47316 |  0:05:15s
epoch 5  | loss: 1.45646 | validation_accuracy: 0.55168 | test_accuracy: 0.55193 |  0:06:18s
epoch 6  | loss: 1.42892 | validation_accuracy: 0.55157 | test_accuracy: 0.5521 |  0:07:21s
epoch 7  | loss: 1.39888 | validation_accuracy: 0.57106 | test_accuracy: 0.57288 |  0:08:24s
epoch 8  | loss: 1.35797 | validation_accuracy: 0.57327 | test_accuracy: 0.57382 |  0:09:29s
epoch 9  | loss: 1.33848 | validation_accuracy: 0.56052 | test_accuracy: 0.56245 |  0:10:32s
epoch 10  | loss: 1.29554 | validation_accuracy: 0.6163 | test_accuracy: 0

In [22]:
##Original-TABNET-XGB-----------------------80-20
#Tabnet-XGB-A

import pandas as pd
import numpy as np
import torch
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import xgboost as xgb

# Load the dataset
filenames = [
    '1-All-Together-Update-Single-26-8-24.csv',
    '2-All-Together-Update-Single-26-8-24.csv',
    '3-All-Together-Update-Single-26-8-24.csv',
    '4-All-Together-Update-Single-26-8-24.csv',
    '5-All-Together-Update-Single-26-8-24.csv'
   # '1-Update-3-All_together-Bi.csv',
   # '2-Update-3-All_together-Bi.csv',
    #'3-Update-3-All_together-Bi.csv'
]

# Concatenate all chunks into a single DataFrame
dataframes = [pd.read_csv(filename) for filename in filenames]
data = pd.concat(dataframes, ignore_index=True)

# Drop 'Source IP' and 'Destination IP' columns
data = data.drop(columns=['Source IP', 'Destination IP'])

# Convert categorical 'Protocol' column to numerical using Label Encoding
label_encoder = LabelEncoder()
data['Protocol'] = label_encoder.fit_transform(data['Protocol'])
data['Label'] = label_encoder.fit_transform(data['Label'])

# Split the data into features (X) and labels (y)
X = data.drop(columns=['Label'])  # Assuming 'Label' is your target column
y = data['Label']

# First, split the dataset into 80% training and 20% testing sets
X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Then, split the 80% training set into 80% actual training and 20% validation
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.2, random_state=42)

# Define the TabNet model with 4 decision steps (hidden layers)
tabnet_model = TabNetClassifier(
     n_d=32,  # Reduced Decision layer output size
    n_a=32,  # Reduced Attention layer size
    n_steps=3,  # Reduced number of hidden layers (decision steps)
    gamma=1.0,  # Adjusted coefficient for feature reusage in decision step
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=1e-2),  # Reduced learning rate
    scheduler_params={"step_size":20, "gamma":0.8},  # Adjusted learning rate scheduler
    mask_type='entmax',  # Changed mask type in TabNet
)

# Train the TabNet model with the validation set
tabnet_model.fit(
    X_train=X_train.values, y_train=y_train.values,
    eval_set=[(X_val.values, y_val.values)],
    eval_name=['validation'],
    eval_metric=['accuracy'],
    max_epochs=40,  # Set the number of epochs
    patience=10,  # Stop if no improvement after 10 epochs
    batch_size=1024, virtual_batch_size=128,
    num_workers=1, drop_last=False
)

# Get the probability scores from TabNet
tabnet_train_output = tabnet_model.predict_proba(X_train.values)
tabnet_val_output = tabnet_model.predict_proba(X_val.values)
tabnet_test_output = tabnet_model.predict_proba(X_test.values)

# Train an XGBoost model on TabNet's training output
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
xgb_model.fit(tabnet_train_output, y_train)

# Predict with the XGBoost model on validation and test sets
xgb_val_preds = xgb_model.predict(tabnet_val_output)
xgb_test_preds = xgb_model.predict(tabnet_test_output)

# Evaluate XGBoost model performance on the testing set
xgb_test_accuracy = accuracy_score(y_test, xgb_test_preds)
xgb_test_precision = precision_score(y_test, xgb_test_preds, average='weighted')
xgb_test_recall = recall_score(y_test, xgb_test_preds, average='weighted')
xgb_test_f1 = f1_score(y_test, xgb_test_preds, average='weighted')


print(f"Accuracy: {xgb_test_accuracy * 100:.2f}%")
print(f"Precision: {xgb_test_precisionn * 100:.2f}%")
print(f"Recall: {xgb_test_recall * 100:.2f}%")
print(f"F1-Score: {xgb_test_f1 * 100:.2f}%")

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, xgb_test_preds)

# Aggregate TP, TN, FP, FN for all classes
TP = np.sum(np.diag(conf_matrix))  # True Positives
FP = np.sum(np.sum(conf_matrix, axis=0) - np.diag(conf_matrix))  # False Positives
FN = np.sum(np.sum(conf_matrix, axis=1) - np.diag(conf_matrix))  # False Negatives
TN = np.sum(conf_matrix) - (TP + FP + FN)  # True Negatives

# Calculate percentages
total_instances = np.sum(conf_matrix)
TP_percentage = (TP / total_instances) * 100
FP_percentage = (FP / total_instances) * 100
FN_percentage = (FN / total_instances) * 100
TN_percentage = (TN / total_instances) * 100

# Print TP, TN, FP, FN as percentages
print(f"Overall TP: {TP} ({TP_percentage:.2f}%)")
print(f"Overall TN: {TN} ({TN_percentage:.2f}%)")
print(f"Overall FP: {FP} ({FP_percentage:.2f}%)")
print(f"Overall FN: {FN} ({FN_percentage:.2f}%)")



epoch 0  | loss: 1.93248 | validation_accuracy: 0.1773 |  0:01:04s
epoch 1  | loss: 1.64357 | validation_accuracy: 0.1681 |  0:02:09s
epoch 2  | loss: 1.51891 | validation_accuracy: 0.1606 |  0:03:18s
epoch 3  | loss: 1.44765 | validation_accuracy: 0.16881 |  0:04:32s
epoch 4  | loss: 1.39207 | validation_accuracy: 0.15698 |  0:05:36s
epoch 5  | loss: 1.35884 | validation_accuracy: 0.16609 |  0:06:41s
epoch 6  | loss: 1.31611 | validation_accuracy: 0.14968 |  0:07:45s
epoch 7  | loss: 1.30148 | validation_accuracy: 0.1519 |  0:08:49s
epoch 8  | loss: 1.26737 | validation_accuracy: 0.14716 |  0:09:54s
epoch 9  | loss: 1.23795 | validation_accuracy: 0.15809 |  0:10:58s
epoch 10  | loss: 1.21772 | validation_accuracy: 0.1775 |  0:12:02s
epoch 11  | loss: 1.19136 | validation_accuracy: 0.16083 |  0:13:07s
epoch 12  | loss: 1.17117 | validation_accuracy: 0.16557 |  0:14:12s
epoch 13  | loss: 1.15543 | validation_accuracy: 0.14526 |  0:15:17s
epoch 14  | loss: 1.12842 | validation_accuracy: 

In [23]:
##Original-TABNET-XGB-----------------------80-20
#Tabnet-XGB-B

import pandas as pd
import numpy as np
import torch
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import xgboost as xgb

# Load the dataset
filenames = [
    
    '1-Update-3-All_together-Bi.csv',
    '2-Update-3-All_together-Bi.csv',
    '3-Update-3-All_together-Bi.csv'
]

# Concatenate all chunks into a single DataFrame
dataframes = [pd.read_csv(filename) for filename in filenames]
data = pd.concat(dataframes, ignore_index=True)

# Drop 'Source IP' and 'Destination IP' columns
data = data.drop(columns=['Source IP', 'Destination IP'])

# Convert categorical 'Protocol' column to numerical using Label Encoding
label_encoder = LabelEncoder()
data['Protocol'] = label_encoder.fit_transform(data['Protocol'])
data['Label'] = label_encoder.fit_transform(data['Label'])

# Split the data into features (X) and labels (y)
X = data.drop(columns=['Label'])  # Assuming 'Label' is your target column
y = data['Label']

# First, split the dataset into 80% training and 20% testing sets
X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Then, split the 80% training set into 80% actual training and 20% validation
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.2, random_state=42)

# Define the TabNet model with 4 decision steps (hidden layers)
tabnet_model = TabNetClassifier(
     n_d=32,  # Reduced Decision layer output size
    n_a=32,  # Reduced Attention layer size
    n_steps=3,  # Reduced number of hidden layers (decision steps)
    gamma=1.0,  # Adjusted coefficient for feature reusage in decision step
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=1e-2),  # Reduced learning rate
    scheduler_params={"step_size":20, "gamma":0.8},  # Adjusted learning rate scheduler
    mask_type='entmax',  # Changed mask type in TabNet
)

# Train the TabNet model with the validation set
tabnet_model.fit(
    X_train=X_train.values, y_train=y_train.values,
    eval_set=[(X_val.values, y_val.values)],
    eval_name=['validation'],
    eval_metric=['accuracy'],
    max_epochs=40,  # Set the number of epochs
    patience=10,  # Stop if no improvement after 10 epochs
    batch_size=1024, virtual_batch_size=128,
    num_workers=1, drop_last=False
)

# Get the probability scores from TabNet
tabnet_train_output = tabnet_model.predict_proba(X_train.values)
tabnet_val_output = tabnet_model.predict_proba(X_val.values)
tabnet_test_output = tabnet_model.predict_proba(X_test.values)

# Train an XGBoost model on TabNet's training output
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
xgb_model.fit(tabnet_train_output, y_train)

# Predict with the XGBoost model on validation and test sets
xgb_val_preds = xgb_model.predict(tabnet_val_output)
xgb_test_preds = xgb_model.predict(tabnet_test_output)

# Evaluate XGBoost model performance on the testing set
xgb_test_accuracy = accuracy_score(y_test, xgb_test_preds)
xgb_test_precision = precision_score(y_test, xgb_test_preds, average='weighted')
xgb_test_recall = recall_score(y_test, xgb_test_preds, average='weighted')
xgb_test_f1 = f1_score(y_test, xgb_test_preds, average='weighted')


print(f"Accuracy: {xgb_test_accuracy * 100:.2f}%")
print(f"Precision: {xgb_test_precisionn * 100:.2f}%")
print(f"Recall: {xgb_test_recall * 100:.2f}%")
print(f"F1-Score: {xgb_test_f1 * 100:.2f}%")

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, xgb_test_preds)

# Aggregate TP, TN, FP, FN for all classes
TP = np.sum(np.diag(conf_matrix))  # True Positives
FP = np.sum(np.sum(conf_matrix, axis=0) - np.diag(conf_matrix))  # False Positives
FN = np.sum(np.sum(conf_matrix, axis=1) - np.diag(conf_matrix))  # False Negatives
TN = np.sum(conf_matrix) - (TP + FP + FN)  # True Negatives

# Calculate percentages
total_instances = np.sum(conf_matrix)
TP_percentage = (TP / total_instances) * 100
FP_percentage = (FP / total_instances) * 100
FN_percentage = (FN / total_instances) * 100
TN_percentage = (TN / total_instances) * 100

# Print TP, TN, FP, FN as percentages
print(f"Overall TP: {TP} ({TP_percentage:.2f}%)")
print(f"Overall TN: {TN} ({TN_percentage:.2f}%)")
print(f"Overall FP: {FP} ({FP_percentage:.2f}%)")
print(f"Overall FN: {FN} ({FN_percentage:.2f}%)")



epoch 0  | loss: 2.13592 | validation_accuracy: 0.18737 |  0:00:32s
epoch 1  | loss: 1.84975 | validation_accuracy: 0.18929 |  0:01:03s
epoch 2  | loss: 1.70977 | validation_accuracy: 0.17264 |  0:01:35s
epoch 3  | loss: 1.60026 | validation_accuracy: 0.17847 |  0:02:07s
epoch 4  | loss: 1.53786 | validation_accuracy: 0.18306 |  0:02:38s
epoch 5  | loss: 1.49343 | validation_accuracy: 0.15583 |  0:03:10s
epoch 6  | loss: 1.46453 | validation_accuracy: 0.14998 |  0:03:42s
epoch 7  | loss: 1.44388 | validation_accuracy: 0.14958 |  0:04:13s
epoch 8  | loss: 1.39924 | validation_accuracy: 0.16248 |  0:04:45s
epoch 9  | loss: 1.39855 | validation_accuracy: 0.1412 |  0:05:16s
epoch 10  | loss: 1.37862 | validation_accuracy: 0.15747 |  0:05:48s
epoch 11  | loss: 1.34189 | validation_accuracy: 0.13419 |  0:06:20s
Early stopping occurred at epoch 11 with best_epoch = 1 and best_validation_accuracy = 0.18929
Accuracy: 96.58
Precision: 96.58
Recall: 96.58
F1-Score: 96.58
Overall TP: 96.56
Overall

In [34]:
#CNN-LSTM-A
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Load the dataset
filenames = [
    #'1-Update-3-All_together-Bi.csv',
   # '2-Update-3-All_together-Bi.csv',
    #'3-Update-3-All_together-Bi.csv',
    '1-All-Together-Update-Single-26-8-24.csv',
    '2-All-Together-Update-Single-26-8-24.csv',
    '3-All-Together-Update-Single-26-8-24.csv',
    '4-All-Together-Update-Single-26-8-24.csv',
    '5-All-Together-Update-Single-26-8-24.csv'
]

# Concatenate all chunks into a single DataFrame
dataframes = [pd.read_csv(filename) for filename in filenames]
data = pd.concat(dataframes, ignore_index=True)

# Apply Label Encoding to the "Protocol" and "Label" columns
label_encoder_protocol = LabelEncoder()
data['Protocol'] = label_encoder_protocol.fit_transform(data['Protocol'])

label_encoder_label = LabelEncoder()
data['Label'] = label_encoder_label.fit_transform(data['Label'])

# Store the labels and drop unnecessary columns
labels = data['Label'].values
data.drop(columns=['Source IP', 'Destination IP', 'Label'], inplace=True)

# Split the data into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=42)

# Further split the 80% training data into training (80%) and validation (20%) sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Initialize the scaler
scaler = StandardScaler()

# Scale the training, validation, and testing data
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Reshape the data for Conv1D layers
X_train_final = X_train_scaled.reshape((X_train_scaled.shape[0], X_train_scaled.shape[1], 1))
X_val_final = X_val_scaled.reshape((X_val_scaled.shape[0], X_val_scaled.shape[1], 1))
X_test_final = X_test_scaled.reshape((X_test_scaled.shape[0], X_test_scaled.shape[1], 1))

# Define the CNN base model
cnn_model = keras.Sequential([
    layers.Input(shape=(X_train_final.shape[1], 1)),
    layers.Conv1D(filters=64, kernel_size=3, activation='relu'),
    layers.Conv1D(filters=64, kernel_size=3, activation='relu'),
    layers.GlobalMaxPooling1D(),
    layers.Dense(128, activation='relu'),
    layers.Dense(len(np.unique(labels)), activation='softmax')  # Output layer
])

# Compile the CNN model
cnn_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the CNN model with validation data
cnn_model.fit(
    X_train_final, y_train,
    epochs=40,
    batch_size=64,
    validation_data=(X_val_final, y_val)
)

# Generate predictions for stacking
cnn_train_predictions = cnn_model.predict(X_train_final)
cnn_val_predictions = cnn_model.predict(X_val_final)
cnn_test_predictions = cnn_model.predict(X_test_final)

# Use the predicted probabilities as features for the LSTM model
cnn_train_preds_for_lstm = cnn_train_predictions.reshape(cnn_train_predictions.shape[0], cnn_train_predictions.shape[1], 1)
cnn_val_preds_for_lstm = cnn_val_predictions.reshape(cnn_val_predictions.shape[0], cnn_val_predictions.shape[1], 1)
cnn_test_preds_for_lstm = cnn_test_predictions.reshape(cnn_test_predictions.shape[0], cnn_test_predictions.shape[1], 1)

# Define the LSTM meta model
lstm_model = keras.Sequential([
    layers.LSTM(64, input_shape=(cnn_train_preds_for_lstm.shape[1], cnn_train_preds_for_lstm.shape[2])),
    layers.Dense(64, activation='relu'),
    layers.Dense(len(np.unique(labels)), activation='softmax')  # Output layer
])

# Compile the LSTM meta model
lstm_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the LSTM meta model with validation data
lstm_model.fit(
    cnn_train_preds_for_lstm, y_train,
    epochs=40,
    batch_size=64,
    validation_data=(cnn_val_preds_for_lstm, y_val)
)

# Evaluate the LSTM model on the validation and test sets
val_loss, val_accuracy = lstm_model.evaluate(cnn_val_preds_for_lstm, y_val)
test_loss, test_accuracy = lstm_model.evaluate(cnn_test_preds_for_lstm, y_test)

print(f'Validation Accuracy: {val_accuracy * 100:.4f}%')
print(f'Test Accuracy: {test_accuracy * 100:.4f}%')

# Make predictions on the validation and test sets using the stacked model
val_predictions = lstm_model.predict(cnn_val_preds_for_lstm)
test_predictions = lstm_model.predict(cnn_test_preds_for_lstm)

# Calculate predicted labels for validation and test sets
val_predicted_labels = np.argmax(val_predictions, axis=1)
test_predicted_labels = np.argmax(test_predictions, axis=1)

# Calculate metrics for test set
test_accuracy = accuracy_score(y_test, test_predicted_labels)
test_precision = precision_score(y_test, test_predicted_labels, average='weighted', zero_division=0)
test_recall = recall_score(y_test, test_predicted_labels, average='weighted', zero_division=0)
test_f1_score = f1_score(y_test, test_predicted_labels, average='weighted', zero_division=0)

print(f'Accuracy: {test_accuracy * 100:.2f}%')
print(f'Precision: {test_precision:.2f}')
print(f'Recall: {test_recall:.2f}')
print(f'F1-Score: {test_f1_score:.2f}')

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, test_predicted_labels)

# Aggregate TP, TN, FP, FN for all classes
TP = np.sum(np.diag(conf_matrix))  # True Positives
FP = np.sum(np.sum(conf_matrix, axis=0) - np.diag(conf_matrix))  # False Positives
FN = np.sum(np.sum(conf_matrix, axis=1) - np.diag(conf_matrix))  # False Negatives
TN = np.sum(conf_matrix) - (TP + FP + FN)  # True Negatives

# Calculate percentages
total_instances = np.sum(conf_matrix)
TP_percentage = (TP / total_instances) * 100
FP_percentage = (FP / total_instances) * 100
FN_percentage = (FN / total_instances) * 100
TN_percentage = (TN / total_instances) * 100

# Print TP, TN, FP, FN as percentages
print(f"Overall TP: {TP} ({TP_percentage:.2f}%)")
print(f"Overall TN: {TN} ({TN_percentage:.2f}%)")
print(f"Overall FP: {FP} ({FP_percentage:.2f}%)")
print(f"Overall FN: {FN} ({FN_percentage:.2f}%)")



Epoch 1/40
42178/42178 ━━━━━━━━━━━━━━━━━━━━ 41s 965us/step - accuracy: 0.3306 - loss: 1.9533 - val_accuracy: 0.4981 - val_loss: 1.4610
Epoch 2/40
42178/42178 ━━━━━━━━━━━━━━━━━━━━ 41s 962us/step - accuracy: 0.5270 - loss: 1.3876 - val_accuracy: 0.5793 - val_loss: 1.2307
Epoch 3/40
42178/42178 ━━━━━━━━━━━━━━━━━━━━ 41s 965us/step - accuracy: 0.5973 - loss: 1.1830 - val_accuracy: 0.6269 - val_loss: 1.0949
Epoch 4/40
42178/42178 ━━━━━━━━━━━━━━━━━━━━ 41s 962us/step - accuracy: 0.6396 - loss: 1.0609 - val_accuracy: 0.6512 - val_loss: 1.0155
Epoch 5/40
42178/42178 ━━━━━━━━━━━━━━━━━━━━ 41s 967us/step - accuracy: 0.6684 - loss: 0.9764 - val_accuracy: 0.6842 - val_loss: 0.9356
Epoch 6/40
42178/42178 ━━━━━━━━━━━━━━━━━━━━ 41s 971us/step - accuracy: 0.6901 - loss: 0.9134 - val_accuracy: 0.7027 - val_loss: 0.8849
Epoch 7/40
42178/42178 ━━━━━━━━━━━━━━━━━━━━ 41s 972us/step - accuracy: 0.7082 - loss: 0.8635 - val_accuracy: 0.7167 - val_loss: 0.8325
Epoch 8/40
42178/42178 ━━━━━━━━━━━━━━━━━━━━ 41s 967us/s

In [35]:
#CNN-LSTM-B
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Load the dataset
filenames = [
    '1-Update-3-All_together-Bi.csv',
    '2-Update-3-All_together-Bi.csv',
    '3-Update-3-All_together-Bi.csv',
    
]

# Concatenate all chunks into a single DataFrame
dataframes = [pd.read_csv(filename) for filename in filenames]
data = pd.concat(dataframes, ignore_index=True)

# Apply Label Encoding to the "Protocol" and "Label" columns
label_encoder_protocol = LabelEncoder()
data['Protocol'] = label_encoder_protocol.fit_transform(data['Protocol'])

label_encoder_label = LabelEncoder()
data['Label'] = label_encoder_label.fit_transform(data['Label'])

# Store the labels and drop unnecessary columns
labels = data['Label'].values
data.drop(columns=['Source IP', 'Destination IP', 'Label'], inplace=True)

# Split the data into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=42)

# Further split the 80% training data into training (80%) and validation (20%) sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Initialize the scaler
scaler = StandardScaler()

# Scale the training, validation, and testing data
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Reshape the data for Conv1D layers
X_train_final = X_train_scaled.reshape((X_train_scaled.shape[0], X_train_scaled.shape[1], 1))
X_val_final = X_val_scaled.reshape((X_val_scaled.shape[0], X_val_scaled.shape[1], 1))
X_test_final = X_test_scaled.reshape((X_test_scaled.shape[0], X_test_scaled.shape[1], 1))

# Define the CNN base model
cnn_model = keras.Sequential([
    layers.Input(shape=(X_train_final.shape[1], 1)),
    layers.Conv1D(filters=64, kernel_size=3, activation='relu'),
    layers.Conv1D(filters=64, kernel_size=3, activation='relu'),
    layers.GlobalMaxPooling1D(),
    layers.Dense(128, activation='relu'),
    layers.Dense(len(np.unique(labels)), activation='softmax')  # Output layer
])

# Compile the CNN model
cnn_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the CNN model with validation data
cnn_model.fit(
    X_train_final, y_train,
    epochs=40,
    batch_size=64,
    validation_data=(X_val_final, y_val)
)

# Generate predictions for stacking
cnn_train_predictions = cnn_model.predict(X_train_final)
cnn_val_predictions = cnn_model.predict(X_val_final)
cnn_test_predictions = cnn_model.predict(X_test_final)

# Use the predicted probabilities as features for the LSTM model
cnn_train_preds_for_lstm = cnn_train_predictions.reshape(cnn_train_predictions.shape[0], cnn_train_predictions.shape[1], 1)
cnn_val_preds_for_lstm = cnn_val_predictions.reshape(cnn_val_predictions.shape[0], cnn_val_predictions.shape[1], 1)
cnn_test_preds_for_lstm = cnn_test_predictions.reshape(cnn_test_predictions.shape[0], cnn_test_predictions.shape[1], 1)

# Define the LSTM meta model
lstm_model = keras.Sequential([
    layers.LSTM(64, input_shape=(cnn_train_preds_for_lstm.shape[1], cnn_train_preds_for_lstm.shape[2])),
    layers.Dense(64, activation='relu'),
    layers.Dense(len(np.unique(labels)), activation='softmax')  # Output layer
])

# Compile the LSTM meta model
lstm_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the LSTM meta model with validation data
lstm_model.fit(
    cnn_train_preds_for_lstm, y_train,
    epochs=40,
    batch_size=64,
    validation_data=(cnn_val_preds_for_lstm, y_val)
)

# Evaluate the LSTM model on the validation and test sets
val_loss, val_accuracy = lstm_model.evaluate(cnn_val_preds_for_lstm, y_val)
test_loss, test_accuracy = lstm_model.evaluate(cnn_test_preds_for_lstm, y_test)

print(f'Validation Accuracy: {val_accuracy * 100:.4f}%')
print(f'Test Accuracy: {test_accuracy * 100:.4f}%')

# Make predictions on the validation and test sets using the stacked model
val_predictions = lstm_model.predict(cnn_val_preds_for_lstm)
test_predictions = lstm_model.predict(cnn_test_preds_for_lstm)

# Calculate predicted labels for validation and test sets
val_predicted_labels = np.argmax(val_predictions, axis=1)
test_predicted_labels = np.argmax(test_predictions, axis=1)

# Calculate metrics for test set
test_accuracy = accuracy_score(y_test, test_predicted_labels)
test_precision = precision_score(y_test, test_predicted_labels, average='weighted', zero_division=0)
test_recall = recall_score(y_test, test_predicted_labels, average='weighted', zero_division=0)
test_f1_score = f1_score(y_test, test_predicted_labels, average='weighted', zero_division=0)

print(f'Accuracy: {test_accuracy * 100:.2f}%')
print(f'Precision: {test_precision:.2f}')
print(f'Recall: {test_recall:.2f}')
print(f'F1-Score: {test_f1_score:.2f}')

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, test_predicted_labels)

# Aggregate TP, TN, FP, FN for all classes
TP = np.sum(np.diag(conf_matrix))  # True Positives
FP = np.sum(np.sum(conf_matrix, axis=0) - np.diag(conf_matrix))  # False Positives
FN = np.sum(np.sum(conf_matrix, axis=1) - np.diag(conf_matrix))  # False Negatives
TN = np.sum(conf_matrix) - (TP + FP + FN)  # True Negatives

# Calculate percentages
total_instances = np.sum(conf_matrix)
TP_percentage = (TP / total_instances) * 100
FP_percentage = (FP / total_instances) * 100
FN_percentage = (FN / total_instances) * 100
TN_percentage = (TN / total_instances) * 100

# Print TP, TN, FP, FN as percentages
print(f"Overall TP: {TP} ({TP_percentage:.2f}%)")
print(f"Overall TN: {TN} ({TN_percentage:.2f}%)")
print(f"Overall FP: {FP} ({FP_percentage:.2f}%)")
print(f"Overall FN: {FN} ({FN_percentage:.2f}%)")



Epoch 1/40
20411/20411 ━━━━━━━━━━━━━━━━━━━━ 19s 898us/step - accuracy: 0.2944 - loss: 2.0591 - val_accuracy: 0.4690 - val_loss: 1.5592
Epoch 2/40
20411/20411 ━━━━━━━━━━━━━━━━━━━━ 18s 893us/step - accuracy: 0.5079 - loss: 1.4457 - val_accuracy: 0.5957 - val_loss: 1.1980
Epoch 3/40
20411/20411 ━━━━━━━━━━━━━━━━━━━━ 18s 897us/step - accuracy: 0.6036 - loss: 1.1606 - val_accuracy: 0.6470 - val_loss: 1.0274
Epoch 4/40
20411/20411 ━━━━━━━━━━━━━━━━━━━━ 18s 882us/step - accuracy: 0.6591 - loss: 0.9982 - val_accuracy: 0.6870 - val_loss: 0.9128
Epoch 5/40
20411/20411 ━━━━━━━━━━━━━━━━━━━━ 18s 884us/step - accuracy: 0.6951 - loss: 0.8992 - val_accuracy: 0.7190 - val_loss: 0.8310
Epoch 6/40
20411/20411 ━━━━━━━━━━━━━━━━━━━━ 18s 894us/step - accuracy: 0.7219 - loss: 0.8219 - val_accuracy: 0.7359 - val_loss: 0.7833
Epoch 7/40
20411/20411 ━━━━━━━━━━━━━━━━━━━━ 18s 889us/step - accuracy: 0.7428 - loss: 0.7628 - val_accuracy: 0.7426 - val_loss: 0.7492
Epoch 8/40
20411/20411 ━━━━━━━━━━━━━━━━━━━━ 18s 896us/s

In [36]:
#FFNN-XGB-A
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import xgboost as xgb

# Load the dataset
filenames = [
    '1-All-Together-Update-Single-26-8-24.csv',
    '2-All-Together-Update-Single-26-8-24.csv',
    '3-All-Together-Update-Single-26-8-24.csv',
    '4-All-Together-Update-Single-26-8-24.csv',
    '5-All-Together-Update-Single-26-8-24.csv'
]

# Concatenate all chunks into a single DataFrame
dataframes = [pd.read_csv(filename) for filename in filenames]
data = pd.concat(dataframes, ignore_index=True)

# Apply Label Encoding to the "Protocol" and "Label" columns
label_encoder = LabelEncoder()
data['Protocol'] = label_encoder.fit_transform(data['Protocol'])
data['Label'] = label_encoder.fit_transform(data['Label'])

# Separate features and target
X = data.drop(columns=['Source IP', 'Destination IP', 'Label'])
y = data['Label']

# Step 1: Split into Train, Validation, and Test sets
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.2, random_state=42)

# Step 2: Initialize the scaler and fit it on the training, testing and validation data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Define the feedforward neural network model using Functional API
inputs = keras.Input(shape=(X_train_scaled.shape[1],))
x = layers.Dense(128, activation='relu')(inputs)
x = layers.Dense(64, activation='relu')(x)
outputs = layers.Dense(len(np.unique(y)), activation='softmax')(x)
ffnn_model = keras.Model(inputs=inputs, outputs=outputs)

# Compile and train the FFNN model
ffnn_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
ffnn_model.fit(X_train_scaled, y_train, epochs=40, batch_size=64, validation_data=(X_val_scaled, y_val))

# Extract intermediate output from the penultimate layer
intermediate_layer_model = keras.Model(inputs=ffnn_model.input, outputs=ffnn_model.layers[-2].output)
train_intermediate_output = intermediate_layer_model.predict(X_train_scaled)
val_intermediate_output = intermediate_layer_model.predict(X_val_scaled)
test_intermediate_output = intermediate_layer_model.predict(X_test_scaled)

# Train XGBoost as the meta model on the FFNN's intermediate output
xgb_model = xgb.XGBClassifier(objective='multi:softmax', num_class=len(np.unique(y)), eval_metric='mlogloss')
xgb_model.fit(train_intermediate_output, y_train)

# --- Validation Metrics ---
# Make predictions with the XGBoost model on validation set
y_pred_val = xgb_model.predict(val_intermediate_output)

# Calculate accuracy, precision, recall, and F1-score on the validation set
val_accuracy = accuracy_score(y_val, y_pred_val)
val_precision = precision_score(y_val, y_pred_val, average='weighted')
val_recall = recall_score(y_val, y_pred_val, average='weighted')
val_f1_score = f1_score(y_val, y_pred_val, average='weighted')

# --- Test Metrics ---
# Make predictions with the XGBoost model on the test set
y_pred_test = xgb_model.predict(test_intermediate_output)

# Calculate accuracy, precision, recall, and F1-score on the test set
test_accuracy = accuracy_score(y_test, y_pred_test)
test_precision = precision_score(y_test, y_pred_test, average='weighted')
test_recall = recall_score(y_test, y_pred_test, average='weighted')
test_f1_score = f1_score(y_test, y_pred_test, average='weighted')

print(f'\nAccuracy: {test_accuracy * 100:.2f}%')
print(f'Precision: {test_precision:.2f}')
print(f'Recall: {test_recall:.2f}')
print(f'F1-Score: {test_f1_score:.2f}')
# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred_test)

# Aggregate TP, TN, FP, FN for all classes
TP = np.sum(np.diag(conf_matrix))  # True Positives
FP = np.sum(np.sum(conf_matrix, axis=0) - np.diag(conf_matrix))  # False Positives
FN = np.sum(np.sum(conf_matrix, axis=1) - np.diag(conf_matrix))  # False Negatives
TN = np.sum(conf_matrix) - (TP + FP + FN)  # True Negatives

# Calculate percentages
total_instances = np.sum(conf_matrix)
TP_percentage = (TP / total_instances) * 100
FP_percentage = (FP / total_instances) * 100
FN_percentage = (FN / total_instances) * 100
TN_percentage = (TN / total_instances) * 100

# Print TP, TN, FP, FN as percentages
print(f"Overall TP: {TP} ({TP_percentage:.2f}%)")
print(f"Overall TN: {TN} ({TN_percentage:.2f}%)")
print(f"Overall FP: {FP} ({FP_percentage:.2f}%)")
print(f"Overall FN: {FN} ({FN_percentage:.2f}%)")



Epoch 1/40
36906/36906 ━━━━━━━━━━━━━━━━━━━━ 20s 530us/step - accuracy: 0.3120 - loss: 1.9937 - val_accuracy: 0.4350 - val_loss: 1.6618
Epoch 2/40
36906/36906 ━━━━━━━━━━━━━━━━━━━━ 20s 527us/step - accuracy: 0.4553 - loss: 1.5939 - val_accuracy: 0.4958 - val_loss: 1.4838
Epoch 3/40
36906/36906 ━━━━━━━━━━━━━━━━━━━━ 20s 536us/step - accuracy: 0.5038 - loss: 1.4525 - val_accuracy: 0.5279 - val_loss: 1.3923
Epoch 4/40
36906/36906 ━━━━━━━━━━━━━━━━━━━━ 20s 537us/step - accuracy: 0.5398 - loss: 1.3617 - val_accuracy: 0.5465 - val_loss: 1.3551
Epoch 5/40
36906/36906 ━━━━━━━━━━━━━━━━━━━━ 19s 522us/step - accuracy: 0.5622 - loss: 1.2975 - val_accuracy: 0.5711 - val_loss: 1.2530
Epoch 6/40
36906/36906 ━━━━━━━━━━━━━━━━━━━━ 19s 524us/step - accuracy: 0.5797 - loss: 1.2462 - val_accuracy: 0.5790 - val_loss: 1.2399
Epoch 7/40
36906/36906 ━━━━━━━━━━━━━━━━━━━━ 19s 525us/step - accuracy: 0.5932 - loss: 1.2071 - val_accuracy: 0.5958 - val_loss: 1.2081
Epoch 8/40
36906/36906 ━━━━━━━━━━━━━━━━━━━━ 19s 527us/s

In [37]:
#FFNN-XGB-A
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import xgboost as xgb

# Load the dataset
filenames = [
    '1-Update-3-All_together-Bi.csv',
    '2-Update-3-All_together-Bi.csv',
    '3-Update-3-All_together-Bi.csv',
]

# Concatenate all chunks into a single DataFrame
dataframes = [pd.read_csv(filename) for filename in filenames]
data = pd.concat(dataframes, ignore_index=True)

# Apply Label Encoding to the "Protocol" and "Label" columns
label_encoder = LabelEncoder()
data['Protocol'] = label_encoder.fit_transform(data['Protocol'])
data['Label'] = label_encoder.fit_transform(data['Label'])

# Separate features and target
X = data.drop(columns=['Source IP', 'Destination IP', 'Label'])
y = data['Label']

# Step 1: Split into Train, Validation, and Test sets
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.2, random_state=42)

# Step 2: Initialize the scaler and fit it on the training, testing and validation data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Define the feedforward neural network model using Functional API
inputs = keras.Input(shape=(X_train_scaled.shape[1],))
x = layers.Dense(128, activation='relu')(inputs)
x = layers.Dense(64, activation='relu')(x)
outputs = layers.Dense(len(np.unique(y)), activation='softmax')(x)
ffnn_model = keras.Model(inputs=inputs, outputs=outputs)

# Compile and train the FFNN model
ffnn_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
ffnn_model.fit(X_train_scaled, y_train, epochs=40, batch_size=64, validation_data=(X_val_scaled, y_val))

# Extract intermediate output from the penultimate layer
intermediate_layer_model = keras.Model(inputs=ffnn_model.input, outputs=ffnn_model.layers[-2].output)
train_intermediate_output = intermediate_layer_model.predict(X_train_scaled)
val_intermediate_output = intermediate_layer_model.predict(X_val_scaled)
test_intermediate_output = intermediate_layer_model.predict(X_test_scaled)

# Train XGBoost as the meta model on the FFNN's intermediate output
xgb_model = xgb.XGBClassifier(objective='multi:softmax', num_class=len(np.unique(y)), eval_metric='mlogloss')
xgb_model.fit(train_intermediate_output, y_train)

# --- Validation Metrics ---
# Make predictions with the XGBoost model on validation set
y_pred_val = xgb_model.predict(val_intermediate_output)

# Calculate accuracy, precision, recall, and F1-score on the validation set
val_accuracy = accuracy_score(y_val, y_pred_val)
val_precision = precision_score(y_val, y_pred_val, average='weighted')
val_recall = recall_score(y_val, y_pred_val, average='weighted')
val_f1_score = f1_score(y_val, y_pred_val, average='weighted')

# --- Test Metrics ---
# Make predictions with the XGBoost model on the test set
y_pred_test = xgb_model.predict(test_intermediate_output)

# Calculate accuracy, precision, recall, and F1-score on the test set
test_accuracy = accuracy_score(y_test, y_pred_test)
test_precision = precision_score(y_test, y_pred_test, average='weighted')
test_recall = recall_score(y_test, y_pred_test, average='weighted')
test_f1_score = f1_score(y_test, y_pred_test, average='weighted')

print(f'\nAccuracy: {test_accuracy * 100:.2f}%')
print(f'Precision: {test_precision:.2f}')
print(f'Recall: {test_recall:.2f}')
print(f'F1-Score: {test_f1_score:.2f}')
# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred_test)

# Aggregate TP, TN, FP, FN for all classes
TP = np.sum(np.diag(conf_matrix))  # True Positives
FP = np.sum(np.sum(conf_matrix, axis=0) - np.diag(conf_matrix))  # False Positives
FN = np.sum(np.sum(conf_matrix, axis=1) - np.diag(conf_matrix))  # False Negatives
TN = np.sum(conf_matrix) - (TP + FP + FN)  # True Negatives

# Calculate percentages
total_instances = np.sum(conf_matrix)
TP_percentage = (TP / total_instances) * 100
FP_percentage = (FP / total_instances) * 100
FN_percentage = (FN / total_instances) * 100
TN_percentage = (TN / total_instances) * 100

# Print TP, TN, FP, FN as percentages
print(f"Overall TP: {TP} ({TP_percentage:.2f}%)")
print(f"Overall TN: {TN} ({TN_percentage:.2f}%)")
print(f"Overall FP: {FP} ({FP_percentage:.2f}%)")
print(f"Overall FN: {FN} ({FN_percentage:.2f}%)")



Epoch 1/40
17860/17860 ━━━━━━━━━━━━━━━━━━━━ 13s 683us/step - accuracy: 0.2863 - loss: 2.0679 - val_accuracy: 0.3970 - val_loss: 1.7643
Epoch 2/40
17860/17860 ━━━━━━━━━━━━━━━━━━━━ 12s 692us/step - accuracy: 0.4193 - loss: 1.7010 - val_accuracy: 0.4693 - val_loss: 1.5630
Epoch 3/40
17860/17860 ━━━━━━━━━━━━━━━━━━━━ 12s 683us/step - accuracy: 0.4789 - loss: 1.5289 - val_accuracy: 0.5006 - val_loss: 1.4782
Epoch 4/40
17860/17860 ━━━━━━━━━━━━━━━━━━━━ 12s 678us/step - accuracy: 0.5149 - loss: 1.4264 - val_accuracy: 0.5308 - val_loss: 1.3886
Epoch 5/40
17860/17860 ━━━━━━━━━━━━━━━━━━━━ 12s 677us/step - accuracy: 0.5403 - loss: 1.3553 - val_accuracy: 0.5500 - val_loss: 1.3215
Epoch 6/40
17860/17860 ━━━━━━━━━━━━━━━━━━━━ 12s 676us/step - accuracy: 0.5584 - loss: 1.3037 - val_accuracy: 0.5837 - val_loss: 1.2600
Epoch 7/40
17860/17860 ━━━━━━━━━━━━━━━━━━━━ 12s 676us/step - accuracy: 0.5750 - loss: 1.2589 - val_accuracy: 0.5873 - val_loss: 1.2242
Epoch 8/40
17860/17860 ━━━━━━━━━━━━━━━━━━━━ 12s 676us/s

In [38]:
#LSTM-XGB-A
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import tensorflow as tf
from tensorflow.keras import layers, Model
import xgboost as xgb

# Load the dataset
filenames = [
    '1-All-Together-Update-Single-26-8-24.csv',
    '2-All-Together-Update-Single-26-8-24.csv',
    '3-All-Together-Update-Single-26-8-24.csv',
    '4-All-Together-Update-Single-26-8-24.csv',
    '5-All-Together-Update-Single-26-8-24.csv'
]

# Concatenate all chunks into a single DataFrame
dataframes = [pd.read_csv(filename) for filename in filenames]
data = pd.concat(dataframes, ignore_index=True)

# Apply Label Encoding to the "Protocol" and "Label" columns
label_encoder = LabelEncoder()
data['Protocol'] = label_encoder.fit_transform(data['Protocol'])
data['Label'] = label_encoder.fit_transform(data['Label'])

# Separate features and target
X = data.drop(columns=['Source IP', 'Destination IP', 'Label'])
y = data['Label']

# Step 1: Split into Train, Validation, and Test sets
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.2, random_state=42)

# Step 2: Initialize the scaler and fit it on the training data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Step 3: Reshape the data for LSTM input (LSTM expects 3D input: [samples, timesteps, features])
X_train_reshaped = X_train_scaled.reshape((X_train_scaled.shape[0], 1, X_train_scaled.shape[1]))
X_val_reshaped = X_val_scaled.reshape((X_val_scaled.shape[0], 1, X_val_scaled.shape[1]))
X_test_reshaped = X_test_scaled.reshape((X_test_scaled.shape[0], 1, X_test_scaled.shape[1]))

# Step 4: Define the LSTM model using Functional API
inputs = layers.Input(shape=(1, X_train_reshaped.shape[2]))
x = layers.LSTM(128, activation='tanh', return_sequences=True)(inputs)
x = layers.LSTM(64, activation='tanh')(x)
outputs = layers.Dense(len(np.unique(y)), activation='softmax')(x)
lstm_model = Model(inputs, outputs)

# Compile and train the LSTM model
lstm_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
lstm_model.fit(X_train_reshaped, y_train, epochs=40, batch_size=64, validation_data=(X_val_reshaped, y_val))

# Extract intermediate output from the penultimate layer
intermediate_layer_model = Model(inputs=lstm_model.input, outputs=lstm_model.layers[-2].output)
train_intermediate_output = intermediate_layer_model.predict(X_train_reshaped)
val_intermediate_output = intermediate_layer_model.predict(X_val_reshaped)
test_intermediate_output = intermediate_layer_model.predict(X_test_reshaped)

# Train XGBoost as the meta model on the LSTM's intermediate output
xgb_model = xgb.XGBClassifier(objective='multi:softmax', num_class=len(np.unique(y)), eval_metric='mlogloss')
xgb_model.fit(train_intermediate_output, y_train)

# --- Validation Metrics ---
# Make predictions with the XGBoost model on validation set
y_pred_val = xgb_model.predict(val_intermediate_output)

# Calculate accuracy, precision, recall, and F1-score on the validation set
val_accuracy = accuracy_score(y_val, y_pred_val)
val_precision = precision_score(y_val, y_pred_val, average='weighted')
val_recall = recall_score(y_val, y_pred_val, average='weighted')
val_f1_score = f1_score(y_val, y_pred_val, average='weighted')

# --- Test Metrics ---
# Make predictions with the XGBoost model on the test set
y_pred_test = xgb_model.predict(test_intermediate_output)

# Calculate accuracy, precision, recall, and F1-score on the test set
test_accuracy = accuracy_score(y_test, y_pred_test)
test_precision = precision_score(y_test, y_pred_test, average='weighted')
test_recall = recall_score(y_test, y_pred_test, average='weighted')
test_f1_score = f1_score(y_test, y_pred_test, average='weighted')

print(f'\nAccuracy: {test_accuracy * 100:.2f}%')
print(f'Precision: {test_precision:.2f}')
print(f'Recall: {test_recall:.2f}')
print(f'F1-Score: {test_f1_score:.2f}')
# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred_test)

# Aggregate TP, TN, FP, FN for all classes
TP = np.sum(np.diag(conf_matrix))  # True Positives
FP = np.sum(np.sum(conf_matrix, axis=0) - np.diag(conf_matrix))  # False Positives
FN = np.sum(np.sum(conf_matrix, axis=1) - np.diag(conf_matrix))  # False Negatives
TN = np.sum(conf_matrix) - (TP + FP + FN)  # True Negatives

# Calculate percentages
total_instances = np.sum(conf_matrix)
TP_percentage = (TP / total_instances) * 100
FP_percentage = (FP / total_instances) * 100
FN_percentage = (FN / total_instances) * 100
TN_percentage = (TN / total_instances) * 100

# Print TP, TN, FP, FN as percentages
print(f"Overall TP: {TP} ({TP_percentage:.2f}%)")
print(f"Overall TN: {TN} ({TN_percentage:.2f}%)")
print(f"Overall FP: {FP} ({FP_percentage:.2f}%)")
print(f"Overall FN: {FN} ({FN_percentage:.2f}%)")



Epoch 1/40
42178/42178 ━━━━━━━━━━━━━━━━━━━━ 51s 1ms/step - accuracy: 0.4002 - loss: 1.7764 - val_accuracy: 0.6400 - val_loss: 1.1161
Epoch 2/40
42178/42178 ━━━━━━━━━━━━━━━━━━━━ 49s 1ms/step - accuracy: 0.6648 - loss: 1.0372 - val_accuracy: 0.7289 - val_loss: 0.8667
Epoch 3/40
42178/42178 ━━━━━━━━━━━━━━━━━━━━ 49s 1ms/step - accuracy: 0.7344 - loss: 0.8348 - val_accuracy: 0.7621 - val_loss: 0.7387
Epoch 4/40
42178/42178 ━━━━━━━━━━━━━━━━━━━━ 50s 1ms/step - accuracy: 0.7714 - loss: 0.7162 - val_accuracy: 0.7954 - val_loss: 0.6497
Epoch 5/40
42178/42178 ━━━━━━━━━━━━━━━━━━━━ 50s 1ms/step - accuracy: 0.7976 - loss: 0.6381 - val_accuracy: 0.8084 - val_loss: 0.6032
Epoch 6/40
42178/42178 ━━━━━━━━━━━━━━━━━━━━ 50s 1ms/step - accuracy: 0.8169 - loss: 0.5820 - val_accuracy: 0.8261 - val_loss: 0.5520
Epoch 7/40
42178/42178 ━━━━━━━━━━━━━━━━━━━━ 50s 1ms/step - accuracy: 0.8306 - loss: 0.5404 - val_accuracy: 0.8410 - val_loss: 0.5081
Epoch 8/40
42178/42178 ━━━━━━━━━━━━━━━━━━━━ 50s 1ms/step - accuracy: 

In [39]:
#LSTM-XGB-A
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import tensorflow as tf
from tensorflow.keras import layers, Model
import xgboost as xgb

# Load the dataset
filenames = [
    '1-Update-3-All_together-Bi.csv',
    '2-Update-3-All_together-Bi.csv',
    '3-Update-3-All_together-Bi.csv'
]

# Concatenate all chunks into a single DataFrame
dataframes = [pd.read_csv(filename) for filename in filenames]
data = pd.concat(dataframes, ignore_index=True)

# Apply Label Encoding to the "Protocol" and "Label" columns
label_encoder = LabelEncoder()
data['Protocol'] = label_encoder.fit_transform(data['Protocol'])
data['Label'] = label_encoder.fit_transform(data['Label'])

# Separate features and target
X = data.drop(columns=['Source IP', 'Destination IP', 'Label'])
y = data['Label']

# Step 1: Split into Train, Validation, and Test sets
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.2, random_state=42)

# Step 2: Initialize the scaler and fit it on the training data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Step 3: Reshape the data for LSTM input (LSTM expects 3D input: [samples, timesteps, features])
X_train_reshaped = X_train_scaled.reshape((X_train_scaled.shape[0], 1, X_train_scaled.shape[1]))
X_val_reshaped = X_val_scaled.reshape((X_val_scaled.shape[0], 1, X_val_scaled.shape[1]))
X_test_reshaped = X_test_scaled.reshape((X_test_scaled.shape[0], 1, X_test_scaled.shape[1]))

# Step 4: Define the LSTM model using Functional API
inputs = layers.Input(shape=(1, X_train_reshaped.shape[2]))
x = layers.LSTM(128, activation='tanh', return_sequences=True)(inputs)
x = layers.LSTM(64, activation='tanh')(x)
outputs = layers.Dense(len(np.unique(y)), activation='softmax')(x)
lstm_model = Model(inputs, outputs)

# Compile and train the LSTM model
lstm_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
lstm_model.fit(X_train_reshaped, y_train, epochs=40, batch_size=64, validation_data=(X_val_reshaped, y_val))

# Extract intermediate output from the penultimate layer
intermediate_layer_model = Model(inputs=lstm_model.input, outputs=lstm_model.layers[-2].output)
train_intermediate_output = intermediate_layer_model.predict(X_train_reshaped)
val_intermediate_output = intermediate_layer_model.predict(X_val_reshaped)
test_intermediate_output = intermediate_layer_model.predict(X_test_reshaped)

# Train XGBoost as the meta model on the LSTM's intermediate output
xgb_model = xgb.XGBClassifier(objective='multi:softmax', num_class=len(np.unique(y)), eval_metric='mlogloss')
xgb_model.fit(train_intermediate_output, y_train)

# --- Validation Metrics ---
# Make predictions with the XGBoost model on validation set
y_pred_val = xgb_model.predict(val_intermediate_output)

# Calculate accuracy, precision, recall, and F1-score on the validation set
val_accuracy = accuracy_score(y_val, y_pred_val)
val_precision = precision_score(y_val, y_pred_val, average='weighted')
val_recall = recall_score(y_val, y_pred_val, average='weighted')
val_f1_score = f1_score(y_val, y_pred_val, average='weighted')

# --- Test Metrics ---
# Make predictions with the XGBoost model on the test set
y_pred_test = xgb_model.predict(test_intermediate_output)

# Calculate accuracy, precision, recall, and F1-score on the test set
test_accuracy = accuracy_score(y_test, y_pred_test)
test_precision = precision_score(y_test, y_pred_test, average='weighted')
test_recall = recall_score(y_test, y_pred_test, average='weighted')
test_f1_score = f1_score(y_test, y_pred_test, average='weighted')

print(f'\nAccuracy: {test_accuracy * 100:.2f}%')
print(f'Precision: {test_precision:.2f}')
print(f'Recall: {test_recall:.2f}')
print(f'F1-Score: {test_f1_score:.2f}')
# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred_test)

# Aggregate TP, TN, FP, FN for all classes
TP = np.sum(np.diag(conf_matrix))  # True Positives
FP = np.sum(np.sum(conf_matrix, axis=0) - np.diag(conf_matrix))  # False Positives
FN = np.sum(np.sum(conf_matrix, axis=1) - np.diag(conf_matrix))  # False Negatives
TN = np.sum(conf_matrix) - (TP + FP + FN)  # True Negatives

# Calculate percentages
total_instances = np.sum(conf_matrix)
TP_percentage = (TP / total_instances) * 100
FP_percentage = (FP / total_instances) * 100
FN_percentage = (FN / total_instances) * 100
TN_percentage = (TN / total_instances) * 100

# Print TP, TN, FP, FN as percentages
print(f"Overall TP: {TP} ({TP_percentage:.2f}%)")
print(f"Overall TN: {TN} ({TN_percentage:.2f}%)")
print(f"Overall FP: {FP} ({FP_percentage:.2f}%)")
print(f"Overall FN: {FN} ({FN_percentage:.2f}%)")



Epoch 1/40
20411/20411 ━━━━━━━━━━━━━━━━━━━━ 36s 2ms/step - accuracy: 0.3310 - loss: 1.9546 - val_accuracy: 0.5299 - val_loss: 1.4124
Epoch 2/40
20411/20411 ━━━━━━━━━━━━━━━━━━━━ 34s 2ms/step - accuracy: 0.5591 - loss: 1.3175 - val_accuracy: 0.6262 - val_loss: 1.1161
Epoch 3/40
20411/20411 ━━━━━━━━━━━━━━━━━━━━ 33s 2ms/step - accuracy: 0.6463 - loss: 1.0737 - val_accuracy: 0.6853 - val_loss: 0.9717
Epoch 4/40
20411/20411 ━━━━━━━━━━━━━━━━━━━━ 34s 2ms/step - accuracy: 0.6943 - loss: 0.9423 - val_accuracy: 0.7156 - val_loss: 0.8838
Epoch 5/40
20411/20411 ━━━━━━━━━━━━━━━━━━━━ 33s 2ms/step - accuracy: 0.7252 - loss: 0.8528 - val_accuracy: 0.7427 - val_loss: 0.8022
Epoch 6/40
20411/20411 ━━━━━━━━━━━━━━━━━━━━ 34s 2ms/step - accuracy: 0.7462 - loss: 0.7911 - val_accuracy: 0.7588 - val_loss: 0.7547
Epoch 7/40
20411/20411 ━━━━━━━━━━━━━━━━━━━━ 33s 2ms/step - accuracy: 0.7616 - loss: 0.7412 - val_accuracy: 0.7739 - val_loss: 0.7082
Epoch 8/40
20411/20411 ━━━━━━━━━━━━━━━━━━━━ 33s 2ms/step - accuracy: 