# Combine the data files into a single df containing attack and benign data

In [1]:
from autogluon.tabular import TabularDataset, TabularPredictor
import pandas as pd
import numpy as np
from df_helpers import non_persistent_preprocessing

np.random.seed(29)

non_persistent_file_names = ['Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv', 
           'Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv',
           'Friday-WorkingHours-Morning.pcap_ISCX.csv',
           'Monday-WorkingHours.pcap_ISCX.csv',
           'Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv',
           'Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv',
           'Tuesday-WorkingHours.pcap_ISCX.csv',
           'Wednesday-workingHours.pcap_ISCX.csv'
]

df = pd.DataFrame()

for file in non_persistent_file_names:
    print(file)
    df_temp = TabularDataset('non-persistent/'+file)
    df_temp.replace([np.inf, -np.inf], np.nan, inplace=True) 
    df_temp.dropna(inplace=True)
    df = pd.concat([df_temp, df])

df = df.sample(frac=1)

# Remove duplicate column and whitespace from all columns
df = non_persistent_preprocessing(df)
    
df.reset_index(inplace=True, drop=True)


Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv
Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv
Friday-WorkingHours-Morning.pcap_ISCX.csv
Monday-WorkingHours.pcap_ISCX.csv
Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv
Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv
Tuesday-WorkingHours.pcap_ISCX.csv
Wednesday-workingHours.pcap_ISCX.csv


# Drop data for some uncommon threat types

In [2]:
benign_df = df.loc[df['Label'] == 'BENIGN']
malicious_df = df.loc[df['Label'] != 'BENIGN'].copy(deep=True)

malicious_df.drop(malicious_df[malicious_df['Label'] == 'Heartbleed'].index, inplace=True)
malicious_df.drop(malicious_df[malicious_df['Label'] == 'Web Attack � Sql Injection'].index, inplace=True)
malicious_df.drop(malicious_df[malicious_df['Label'] == 'Infiltration'].index, inplace=True)
malicious_df.drop(malicious_df[malicious_df['Label'] == 'Web Attack � XSS'].index, inplace=True)
malicious_df.drop(malicious_df[malicious_df['Label'] == 'Web Attack � Brute Force'].index, inplace=True)
malicious_df.drop(malicious_df[malicious_df['Label'] == 'Bot'].index, inplace=True)

merged_df = pd.concat([benign_df, malicious_df])
merged_df = merged_df.sample(frac=1)



In [3]:
merged_df['Label'].value_counts()

Label
BENIGN              2271320
DoS Hulk             230124
PortScan             158804
DDoS                 128025
DoS GoldenEye         10293
FTP-Patator            7935
SSH-Patator            5897
DoS slowloris          5796
DoS Slowhttptest       5499
Name: count, dtype: int64

# Remove the whitespace and duplicate column

In [17]:
# # some column names have trailing white space!!!
# column_names_without_space = []
# for x in list(merged_df):
#     column_names_without_space.append(x.strip())

# column_name_updates = {}
# for old_column_name, new_column_name in zip(list(merged_df), column_names_without_space):
#     column_name_updates[old_column_name] = new_column_name

# merged_df.rename(columns=column_name_updates, inplace=True)

# # This dataset also has a duplicate column, dropping...
# merged_df.drop(['Fwd Header Length.1'], axis=1, inplace=True)

# merged_df = merged_df.sample(frac=1, random_state=29)
    
# merged_df.reset_index(inplace=True, drop=True)


In [18]:
list(merged_df)

['Flow ID',
 'Source IP',
 'Source Port',
 'Destination IP',
 'Destination Port',
 'Protocol',
 'Timestamp',
 'Flow Duration',
 'Total Fwd Packets',
 'Total Backward Packets',
 'Total Length of Fwd Packets',
 'Total Length of Bwd Packets',
 'Fwd Packet Length Max',
 'Fwd Packet Length Min',
 'Fwd Packet Length Mean',
 'Fwd Packet Length Std',
 'Bwd Packet Length Max',
 'Bwd Packet Length Min',
 'Bwd Packet Length Mean',
 'Bwd Packet Length Std',
 'Flow Bytes/s',
 'Flow Packets/s',
 'Flow IAT Mean',
 'Flow IAT Std',
 'Flow IAT Max',
 'Flow IAT Min',
 'Fwd IAT Total',
 'Fwd IAT Mean',
 'Fwd IAT Std',
 'Fwd IAT Max',
 'Fwd IAT Min',
 'Bwd IAT Total',
 'Bwd IAT Mean',
 'Bwd IAT Std',
 'Bwd IAT Max',
 'Bwd IAT Min',
 'Fwd PSH Flags',
 'Bwd PSH Flags',
 'Fwd URG Flags',
 'Bwd URG Flags',
 'Fwd Header Length',
 'Bwd Header Length',
 'Fwd Packets/s',
 'Bwd Packets/s',
 'Min Packet Length',
 'Max Packet Length',
 'Packet Length Mean',
 'Packet Length Std',
 'Packet Length Variance',
 'FIN Flag 

# Split the df into Train and Test dataframes

In [19]:
slice_size = int(0.75 * len(merged_df))
combined_train_df = merged_df[:slice_size].copy(deep=True)
combined_test_df = merged_df[slice_size:].copy(deep=True)


In [20]:
# Training
all_results = []
# for train_data_size in range(2000, 6000, 2000):
train_data_size = 2000
combined_train_df_slice = pd.DataFrame()
for malicious_label_name in list(malicious_df['Label'].unique()):
    malicious_slice = malicious_df.loc[malicious_df['Label'] == malicious_label_name][:int(train_data_size/8)]
    combined_train_df_slice = pd.concat([combined_train_df_slice, malicious_slice])
combined_train_df_slice = pd.concat([combined_train_df_slice, benign_df[:train_data_size]])

print(len(combined_train_df_slice))

predictor = TabularPredictor(label='Label', eval_metric='mcc').fit(combined_train_df_slice, time_limit=60, presets='medium_quality', fit_weighted_ensemble=False)

y_pred = predictor.predict(combined_test_df.drop(columns=['Label']))
all_results.append(predictor.evaluate(combined_test_df, silent=True))

    

No path specified. Models will be saved in: "AutogluonModels/ag-20250409_050526"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.2
Python Version:     3.11.8
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 24.3.0: Thu Jan  2 20:22:58 PST 2025; root:xnu-11215.81.4~3/RELEASE_ARM64_T8132
CPU Count:          10
Memory Avail:       3.01 GB / 16.00 GB (18.8%)
Disk Space Avail:   280.16 GB / 460.43 GB (60.8%)
Presets specified: ['medium_quality']
Beginning AutoGluon training ... Time limit = 60s
AutoGluon will save models to "/Users/navya/Documents/Uni/Repos/autogluon/AutogluonModels/ag-20250409_050526"
Train Data Rows:    4000
Train Data Columns: 83
Label Column:       Label
AutoGluon infers your prediction problem is: 'multiclass' (because dtype of label-column == object).
	9 unique label values:  ['DoS Hulk', 'PortScan', 'DDoS', 'FTP-Patator', 'DoS GoldenEye', 'DoS Slowhttptest', 'DoS slowloris', 'SSH-Patator', 'BENIGN']
	If 'multiclass'

4000


	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator...
			Note: Converting 6 features to boolean dtype as they only contain 2 unique values.
	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	Stage 3 Generators:
		Fitting IdentityFeatureGenerator...
		Fitting CategoryFeatureGenerator...
			Fitting CategoryMemoryMinimizeFeatureGenerator...
		Fitting DatetimeFeatureGenerator...
	Stage 4 Generators:
		Fitting DropUniqueFeatureGenerator...
	Stage 5 Generators:
		Fitting DropDuplicatesFeatureGenerator...
	Useless Original Features (Count: 12): ['Bwd PSH Flags', 'Fwd URG Flags', 'Bwd URG Flags', 'RST Flag Count', 'CWE Flag Count', 'ECE Flag Count', 'Fwd Avg Bytes/Bulk', 'Fwd Avg Packets/Bulk', 'Fwd Avg Bulk Rate', 'Bwd Avg Bytes/Bulk', 'Bwd Avg Packets/Bulk', 'Bwd Avg Bulk Rate']
		These features carry no predictive signal and should be manually investigated.
		This is typically a feature which has the same value for all rows.
		These features do not need to be present at inference 

In [23]:
all_results

[{'mcc': 0.9938331690493879,
  'accuracy': 0.9979006238631921,
  'balanced_accuracy': 0.9951110602012148}]

# Model Export Non Persistent Multiclass classification

In [25]:
save_path_clone_opt = "non-persistent-multiclass-classifier"
# will return the path to the cloned predictor, identical to save_path_clone_opt
path_clone_opt = predictor.clone_for_deployment(path=save_path_clone_opt)

Cloned TabularPredictor located in '/Users/navya/Documents/Uni/Repos/autogluon/AutogluonModels/ag-20250409_050526' to 'non-persistent-multiclass-classifier'.
	To load the cloned predictor: predictor_clone = TabularPredictor.load(path="non-persistent-multiclass-classifier")
Clone: Keeping minimum set of models required to predict with best model 'LightGBM'...
Deleting model KNeighborsUnif. All files under /Users/navya/Documents/Uni/Repos/autogluon/non-persistent-multiclass-classifier/models/KNeighborsUnif will be removed.
Deleting model KNeighborsDist. All files under /Users/navya/Documents/Uni/Repos/autogluon/non-persistent-multiclass-classifier/models/KNeighborsDist will be removed.
Deleting model NeuralNetFastAI. All files under /Users/navya/Documents/Uni/Repos/autogluon/non-persistent-multiclass-classifier/models/NeuralNetFastAI will be removed.
Deleting model LightGBMXT. All files under /Users/navya/Documents/Uni/Repos/autogluon/non-persistent-multiclass-classifier/models/LightGBMX