In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from datetime import datetime, timezone
import ipaddress
from collections import Counter

from tqdm import tqdm
import plotly.express as px

from sentence_transformers import SentenceTransformer
import hdbscan

from sklearn.cluster import KMeans, DBSCAN
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.utils import resample
from sklearn.metrics import (
    silhouette_score as sil_,
    silhouette_samples,
    calinski_harabasz_score as calinski_,
)

In [14]:
import os
def load_data(directory):
    """Read every ``.csv`` file in ``directory`` and stack them into one frame.

    Files are read in sorted filename order so that the row order of the
    result is reproducible; ``os.listdir`` makes no ordering guarantee.

    Parameters
    ----------
    directory : str or path-like
        Folder scanned (non-recursively) for entries whose name ends in
        ``.csv``.

    Returns
    -------
    pandas.DataFrame
        Concatenation of all CSV files with a fresh ``0..n-1`` index.

    Raises
    ------
    ValueError
        If ``directory`` contains no ``.csv`` file.
    """
    # Filter first so the progress bar counts only the files actually read.
    csv_files = sorted(
        name for name in os.listdir(directory) if name.endswith('.csv')
    )
    if not csv_files:
        # pd.concat([]) would also raise ValueError, but with an opaque message.
        raise ValueError(f"No .csv files found in {directory!r}")

    frames = [
        pd.read_csv(os.path.join(directory, name))
        for name in tqdm(csv_files)
    ]
    return pd.concat(frames, ignore_index=True)

# Read all CSV files under data/cic into a single frame.
df = load_data('data/cic')
# Column headers may carry stray whitespace; normalise them.
df.columns = df.columns.str.strip()

# Keep only rows whose label is not BENIGN.
df = df[df['Label'] != 'BENIGN']

# Numeric columns only; treat +/-inf as missing and drop incomplete rows.
features = (
    df.select_dtypes(include=[np.number])
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
)
# Restrict df to the rows that survived the numeric clean-up.
df = df.loc[features.index].copy()

100%|██████████| 8/8 [00:12<00:00,  1.55s/it]


In [15]:
# Renumber rows 0..n-1; the old index is discarded (drop=True).
df = df.reset_index(drop=True)

In [16]:
def group_attack_label(label):
    """Map a raw ``Label`` value to a coarse attack family.

    Parameters
    ----------
    label : object
        Raw label value. Anything that is not a ``str`` (for example the
        NaN/None of a missing label) is treated as unrecognised.

    Returns
    -------
    str
        One of ``"Benign"``, ``"DoS/DDoS"``, ``"Scan"``, ``"BruteForce"``,
        ``"WebAttack"``, ``"Other"`` or ``"Unknown"``.
    """
    # Substring tests on a non-string (e.g. float NaN) raise TypeError.
    if not isinstance(label, str):
        return "Unknown"
    if label == "BENIGN":
        return "Benign"
    # "DDoS" contains "DoS", so one substring test covers both spellings.
    if "DoS" in label:
        return "DoS/DDoS"
    if "PortScan" in label:
        return "Scan"
    if "Patator" in label:
        return "BruteForce"
    if "Web Attack" in label:
        return "WebAttack"
    if label in ("Bot", "Infiltration", "Heartbleed"):
        return "Other"
    return "Unknown"

# Attach the coarse attack family of each row as a new column.
df['label_group'] = df['Label'].map(group_attack_label)

In [17]:
# Number of rows in each label_group value.
df["label_group"].value_counts()

label_group
DoS/DDoS      379737
Scan          158804
BruteForce     13832
WebAttack       2180
Other           2003
Name: count, dtype: int64

In [18]:
# Display the frame, now including the label_group column.
df

Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label,label_group
0,80,1293792,3,7,26,11607,20,0,8.666667,10.263203,...,0.0,0.0,0,0,0.0,0.0,0,0,DDoS,DoS/DDoS
1,80,4421382,4,0,24,0,6,6,6.000000,0.000000,...,0.0,0.0,0,0,0.0,0.0,0,0,DDoS,DoS/DDoS
2,80,1083538,3,6,26,11601,20,0,8.666667,10.263203,...,0.0,0.0,0,0,0.0,0.0,0,0,DDoS,DoS/DDoS
3,80,80034360,8,4,56,11601,20,0,7.000000,5.656854,...,939.0,0.0,939,939,39300000.0,44200000.0,70600000,8072664,DDoS,DoS/DDoS
4,80,642654,3,6,26,11607,20,0,8.666667,10.263203,...,0.0,0.0,0,0,0.0,0.0,0,0,DDoS,DoS/DDoS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
556551,80,11512204,8,5,326,11632,326,0,40.750000,115.258405,...,892.0,0.0,892,892,6507197.0,0.0,6507197,6507197,DoS GoldenEye,DoS/DDoS
556552,80,11513325,5,5,471,3525,471,0,94.200000,210.637604,...,918.0,0.0,918,918,6508582.0,0.0,6508582,6508582,DoS GoldenEye,DoS/DDoS
556553,80,11509201,7,6,314,11632,314,0,44.857143,118.680845,...,899.0,0.0,899,899,6503248.0,0.0,6503248,6503248,DoS GoldenEye,DoS/DDoS
556554,80,11509095,8,5,369,11632,369,0,46.125000,130.461201,...,914.0,0.0,914,914,6504954.0,0.0,6504954,6504954,DoS GoldenEye,DoS/DDoS


In [23]:
# Remove every column whose name contains "bwd" or "backward" (case-insensitive).
df = df.loc[
    :,
    ~(
        df.columns.str.contains('bwd', case=False)
        | df.columns.str.contains('backward', case=False)
    ),
]


In [26]:
# List the column names currently present in df.
df.columns

Index(['Destination Port', 'Flow Duration', 'Total Fwd Packets',
       'Total Length of Fwd Packets', 'Fwd Packet Length Max',
       'Fwd Packet Length Min', 'Fwd Packet Length Mean',
       'Fwd Packet Length Std', 'Flow Bytes/s', 'Flow Packets/s',
       'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min',
       'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max',
       'Fwd IAT Min', 'Fwd PSH Flags', 'Fwd URG Flags', 'Fwd Header Length',
       'Fwd Packets/s', 'Min Packet Length', 'Max Packet Length',
       'Packet Length Mean', 'Packet Length Std', 'Packet Length Variance',
       'FIN Flag Count', 'SYN Flag Count', 'RST Flag Count', 'PSH Flag Count',
       'ACK Flag Count', 'URG Flag Count', 'CWE Flag Count', 'ECE Flag Count',
       'Down/Up Ratio', 'Average Packet Size', 'Avg Fwd Segment Size',
       'Fwd Header Length.1', 'Fwd Avg Bytes/Bulk', 'Fwd Avg Packets/Bulk',
       'Fwd Avg Bulk Rate', 'Subflow Fwd Packets', 'Subflow Fwd Bytes',
       'In

In [25]:
import pickle
# Load grouped_df from its pickle file and display it.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that come from a trusted source.
with open('data/grouped_df.pkl', 'rb') as f:
    grouped_df = pickle.load(f)
grouped_df

Unnamed: 0,time_seconds_min,time_seconds_max,hour_first,minute_first,second_first,src_oct1_first,src_oct2_first,src_oct3_first,src_oct4_first,dst_oct1_first,...,dst_country_emb_3_first,dst_country_emb_4_first,dst_country_emb_5_first,dst_country_emb_6_first,dst_country_emb_7_first,dst_country_emb_8_first,dst_country_emb_9_first,burst_duration,attacks_in_burst,attack_rate
0,37407,37407,10,23,27,1,0,104,204,139,...,0.100905,-0.065801,0.297281,0.305936,-0.159620,0.116160,-0.044771,0,1,1.00
1,58010,58010,16,6,50,1,0,104,204,45,...,-0.011996,0.005260,-0.025826,-0.004693,0.013444,0.013675,-0.007392,0,1,1.00
2,18350,18353,5,5,50,1,0,138,115,139,...,0.331180,-0.106147,-0.040069,-0.212324,0.055434,0.027325,0.007878,3,3,0.75
3,55896,55896,15,31,36,1,0,152,1,67,...,0.100905,-0.065801,0.297281,0.305936,-0.159620,0.116160,-0.044771,0,1,1.00
4,29705,29705,8,15,5,1,0,155,6,141,...,-0.107222,0.117179,-0.348651,0.276392,0.075724,-0.134629,0.061312,0,1,1.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
339707,29576,29576,8,12,56,99,61,5,160,155,...,0.056062,0.007198,0.258058,0.090996,-0.054872,0.113645,-0.099819,0,1,1.00
339708,37291,37291,10,21,31,99,63,230,56,65,...,0.331180,-0.106147,-0.040069,-0.212324,0.055434,0.027325,0.007878,0,1,1.00
339709,24838,24838,6,53,58,99,69,247,39,70,...,0.063331,-0.045459,-0.107533,-0.084309,-0.155936,-0.095196,-0.385889,0,1,1.00
339710,76666,76666,21,17,46,99,88,86,106,70,...,0.063331,-0.045459,-0.107533,-0.084309,-0.155936,-0.095196,-0.385889,0,1,1.00


In [29]:
# Display grouped_df.
grouped_df

Unnamed: 0,time_seconds_min,time_seconds_max,hour_first,minute_first,second_first,src_oct1_first,src_oct2_first,src_oct3_first,src_oct4_first,dst_oct1_first,...,dst_country_emb_3_first,dst_country_emb_4_first,dst_country_emb_5_first,dst_country_emb_6_first,dst_country_emb_7_first,dst_country_emb_8_first,dst_country_emb_9_first,burst_duration,attacks_in_burst,attack_rate
0,37407,37407,10,23,27,1,0,104,204,139,...,0.100905,-0.065801,0.297281,0.305936,-0.159620,0.116160,-0.044771,0,1,1.00
1,58010,58010,16,6,50,1,0,104,204,45,...,-0.011996,0.005260,-0.025826,-0.004693,0.013444,0.013675,-0.007392,0,1,1.00
2,18350,18353,5,5,50,1,0,138,115,139,...,0.331180,-0.106147,-0.040069,-0.212324,0.055434,0.027325,0.007878,3,3,0.75
3,55896,55896,15,31,36,1,0,152,1,67,...,0.100905,-0.065801,0.297281,0.305936,-0.159620,0.116160,-0.044771,0,1,1.00
4,29705,29705,8,15,5,1,0,155,6,141,...,-0.107222,0.117179,-0.348651,0.276392,0.075724,-0.134629,0.061312,0,1,1.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
339707,29576,29576,8,12,56,99,61,5,160,155,...,0.056062,0.007198,0.258058,0.090996,-0.054872,0.113645,-0.099819,0,1,1.00
339708,37291,37291,10,21,31,99,63,230,56,65,...,0.331180,-0.106147,-0.040069,-0.212324,0.055434,0.027325,0.007878,0,1,1.00
339709,24838,24838,6,53,58,99,69,247,39,70,...,0.063331,-0.045459,-0.107533,-0.084309,-0.155936,-0.095196,-0.385889,0,1,1.00
339710,76666,76666,21,17,46,99,88,86,106,70,...,0.063331,-0.045459,-0.107533,-0.084309,-0.155936,-0.095196,-0.385889,0,1,1.00


In [28]:
# Full column list as a plain Python list (the frame display above elides columns with "...").
list(grouped_df.columns)

['time_seconds_min',
 'time_seconds_max',
 'hour_first',
 'minute_first',
 'second_first',
 'src_oct1_first',
 'src_oct2_first',
 'src_oct3_first',
 'src_oct4_first',
 'dst_oct1_first',
 'dst_oct2_first',
 'dst_oct3_first',
 'dst_oct4_first',
 'src_lat_first',
 'src_lon_first',
 'dst_lat_first',
 'dst_lon_first',
 'src_lat_sin_first',
 'src_lat_cos_first',
 'src_lon_sin_first',
 'src_lon_cos_first',
 'dst_lat_sin_first',
 'dst_lat_cos_first',
 'dst_lon_sin_first',
 'dst_lon_cos_first',
 'src_proxy_first',
 'dst_proxy_first',
 'sequential_ports_max',
 'attack_entropy_mean',
 'is_weekend_first',
 'dst_port_nunique',
 'protocol_category_email_first',
 'protocol_category_file_transfer_first',
 'protocol_category_iot_first',
 'protocol_category_messaging_first',
 'protocol_category_remote_access_first',
 'protocol_category_web_first',
 'pl_entropy_min',
 'pl_entropy_max',
 'pl_entropy_mean',
 'pl_length_sum',
 'pl_length_mean',
 'src_as_encoded_first',
 'src_city_emb_0_first',
 'src_city_em