In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.mixture import GaussianMixture
from sklearn.cluster import KMeans
import ipaddress
import gdown
from sklearn.metrics import silhouette_score, silhouette_samples
import joblib


# **Load DATA**

In [None]:

# Google Drive file ID of the training CSV
file_id = "1UY_OmZj1fo-MAYWK7hjrp5u4NEzGamzz"

# Create download URL
download_url = f"https://drive.google.com/uc?id={file_id}"

# Download into the current working directory. A relative path keeps this cell
# consistent with the training cell, which reads "training.csv", and avoids
# depending on the Colab-only "/content" directory.
output = "training.csv"
downloaded_path = gdown.download(download_url, output, quiet=False)
if downloaded_path is None:
    # Fail loudly instead of silently reading a stale or missing file below.
    raise RuntimeError(f"Download failed for {download_url}")

# Load CSV into pandas DataFrame
df = pd.read_csv(output)
# Display first 5 rows
df.head()


Downloading...
From (original): https://drive.google.com/uc?id=1UY_OmZj1fo-MAYWK7hjrp5u4NEzGamzz
From (redirected): https://drive.google.com/uc?id=1UY_OmZj1fo-MAYWK7hjrp5u4NEzGamzz&confirm=t&uuid=df22a242-c5f5-4e6e-aead-097b2d1b63b5
To: /content/training.csv
100%|██████████| 253M/253M [00:01<00:00, 139MB/s]


Unnamed: 0,pkSeqID,min,max,mean,stddev,N_IN_Conn_P_SrcIP,N_IN_Conn_P_DstIP,drate,srate,sport,dport,proto,state_number,category
0,3142762,0.0,4.031619,2.687519,1.900363,100,100,0.0,0.494549,6551,80,udp,4,DDoS UDP
1,2432264,3.85693,4.012924,3.934927,0.078003,38,100,0.0,0.256493,5532,80,tcp,3,DDoS TCP
2,1976315,2.9741,3.609205,3.341429,0.268666,100,100,0.0,0.29488,27165,80,tcp,3,DDoS TCP
3,1240757,0.0,4.942302,3.222832,1.823185,63,63,0.0,0.461435,48719,80,udp,4,DoS UDP
4,3257991,2.979995,4.994452,3.983222,0.822418,100,100,0.0,1.002999,22461,80,udp,4,DDoS UDP


# **Model**

In [None]:


# Only the first 100,000 rows are used for fitting. Reading them with `nrows`
# avoids parsing the whole file and then slicing it; assigning a column on
# such a slice is what can raise SettingWithCopyWarning.
df = pd.read_csv("training.csv", nrows=100_000)

# Encode proto (TCP=0, UDP=1). Any other protocol value maps to NaN and is
# therefore removed by the dropna() below.
df["proto"] = df["proto"].map({"tcp": 0, "udp": 1})

# Select numeric features for GMM
features = [
    "proto",
    "sport", "dport",
    "min", "max", "mean", "stddev",
    "state_number",
    "N_IN_Conn_P_SrcIP", "N_IN_Conn_P_DstIP",
    "srate", "drate"
]

# Drop rows with missing feature values (including unmapped protocols) and
# report how many were lost so the exclusion is not silent.
X = df[features].dropna()
n_dropped = len(df) - len(X)
if n_dropped:
    print(f"Dropped {n_dropped} of {len(df)} rows with missing or unmapped feature values")

# Scale features to zero mean / unit variance; the fitted scaler is reused at
# prediction time.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# --------------------------
# Model selection: pick n_components with the lowest BIC
# --------------------------
# NOTE(review): in the recorded run the lowest BIC was at n=19, the upper edge
# of this range, so the range may be too narrow -- confirm.
n_components_range = range(8, 20)  # candidate component counts 8..19
best_bic, best_gmm, best_n = np.inf, None, None
print("Start Tuning ..." )
for n in n_components_range:
    # KMeans centroids are used as the starting means of the mixture
    kmeans = KMeans(n_clusters=n, random_state=42, n_init='auto')
    kmeans.fit(X_scaled)
    initial_means = kmeans.cluster_centers_

    gmm = GaussianMixture(
        n_components=n,
        covariance_type='full',
        means_init=initial_means,
        reg_covar=1e-4,
        random_state=42
    ).fit(X_scaled)
    bic = gmm.bic(X_scaled)

    print(f"n_components={n}, BIC={bic}")

    # Keep the current best unless this candidate is strictly better
    if not bic < best_bic:
        continue
    best_bic, best_gmm, best_n = bic, gmm, n

print(f"\nBest number of clusters according to BIC: {best_n}")
print(f"Best BIC: {best_bic}")

# --------------------------
# Assign cluster labels safely
# --------------------------
# Predict once and reuse the labels for both the DataFrame column and the
# silhouette analysis.
labels = best_gmm.predict(X_scaled)

# Rows that were dropped from X keep NaN; the others are aligned through
# X.index.
df["GMM_cluster"] = np.nan
df.loc[X.index, "GMM_cluster"] = labels

# --------------------------
# Compute Silhouette score
# --------------------------
print("Computing Silhouette score ...")
# The overall silhouette score is the mean of the per-sample coefficients, so
# the expensive pairwise-distance computation is done only once.
sample_silhouettes = silhouette_samples(X_scaled, labels)
overall_silhouette = np.mean(sample_silhouettes)

print(f"\nOverall Silhouette Score: {overall_silhouette:.4f}")

# Optional: average silhouette per cluster
for cluster in range(best_n):
    in_cluster = labels == cluster
    # A component may end up with no assigned samples; report nan for it
    # without taking the mean of an empty slice.
    cluster_silhouette = (
        sample_silhouettes[in_cluster].mean() if in_cluster.any() else float("nan")
    )
    print(f"Cluster {cluster} average Silhouette: {cluster_silhouette:.4f}")

# --------------------------
# Persist the selected model together with its scaler and feature list
# --------------------------
# This bundle has no "top_category_per_cluster" entry; the file is written
# again with that key by the later save cell.
joblib.dump(
    dict(model=best_gmm, scaler=scaler, features=features),
    "gmm_model.pkl",
)

print("\nBest GMM model saved as gmm_model.pkl")

# --------------------------
# Category distribution per cluster
# --------------------------
def cluster_category_distribution(df, cluster_col, category_col):
    """
    Build a per-cluster percentage breakdown of categories.

    Parameters:
        df (pd.DataFrame): frame containing both columns; rows whose cluster
            or category is NaN are ignored by the groupby.
        cluster_col (str): name of the column holding cluster labels.
        category_col (str): name of the column holding category labels.

    Returns:
        pd.DataFrame: one row per cluster (single-level index of cluster
        labels), one column per category holding the percentage of the
        cluster's rows in that category (0 where absent), plus a
        'Top_category' column naming the category with the highest share.
    """
    # Count rows per (cluster, category) and spread categories into columns.
    counts = (
        df
        .groupby([cluster_col, category_col])
        .size()
        .unstack(fill_value=0)
    )
    # Normalise each row to percentages. Doing this with row-wise division
    # (instead of groupby(level=0).apply) keeps the index a plain cluster
    # index; on pandas >= 2.0 apply() prepends the group key, which turned the
    # index into (cluster, cluster) tuples and broke lookups by cluster id.
    pivot = counts.mul(100).div(counts.sum(axis=1), axis=0)
    pivot['Top_category'] = pivot.idxmax(axis=1)
    return pivot

# The breakdown needs ground-truth labels, so skip it when there is no
# 'category' column.
if "category" in df:
    category_pivot = cluster_category_distribution(df, "GMM_cluster", "category")
    print("\nCategory distribution per cluster:", category_pivot, sep="\n")


n_components=8, BIC=-2838274.1296256986
n_components=9, BIC=-3214903.6090609836
n_components=10, BIC=-3405924.7832311685
n_components=11, BIC=-3671012.04728512
n_components=12, BIC=-3804931.3427558294
n_components=13, BIC=-3800819.3196423976
n_components=14, BIC=-3864112.227460732
n_components=15, BIC=-3865142.152281284
n_components=16, BIC=-4128906.788141535
n_components=17, BIC=-4150506.659640347
n_components=18, BIC=-4221817.58533901
n_components=19, BIC=-4258872.855966527

Best number of clusters according to BIC: 19
Best BIC: -4258872.855966527

Overall Silhouette Score: 0.1490
Cluster 0 average Silhouette: 0.3099
Cluster 1 average Silhouette: -0.0852
Cluster 2 average Silhouette: -0.1596
Cluster 3 average Silhouette: 0.1723
Cluster 4 average Silhouette: 0.6995
Cluster 5 average Silhouette: 0.0000
Cluster 6 average Silhouette: 0.0213
Cluster 7 average Silhouette: -0.4346
Cluster 8 average Silhouette: 0.1765
Cluster 9 average Silhouette: 0.1981
Cluster 10 average Silhouette: -0.176

In [None]:
# Compute Top_category per cluster
if "category" in df.columns:
    category_pivot = cluster_category_distribution(df, "GMM_cluster", "category")
    # Store plain-int cluster ids as keys so lookups with the model's predicted
    # label always match. The pivot index holds float labels (the column was
    # initialised with NaN) and, depending on how pandas builds the grouped
    # index, may hold tuples repeating the cluster label; both are normalised.
    top_category_per_cluster = {
        int(key[0] if isinstance(key, tuple) else key): top_category
        for key, top_category in category_pivot["Top_category"].items()
    }
else:
    top_category_per_cluster = {}

# Save model, scaler, features, and top categories
joblib.dump({
    "model": best_gmm,
    "scaler": scaler,
    "features": features,
    "top_category_per_cluster": top_category_per_cluster
}, "gmm_model.pkl")

['gmm_model.pkl']

# **Predict**

In [None]:


# Restore the persisted bundle and unpack its entries.
# NOTE: joblib.load unpickles the file -- only load bundles you produced yourself.
data = joblib.load("gmm_model.pkl")
model, scaler, features, top_category_per_cluster = (
    data[key]
    for key in ("model", "scaler", "features", "top_category_per_cluster")
)

def predict_category(new_sample: pd.DataFrame) -> str:
    """
    Map a sample to the dominant category of the GMM cluster it falls into.

    Uses the module-level `features`, `scaler`, `model` and
    `top_category_per_cluster` loaded from the saved bundle.

    Parameters:
        new_sample (pd.DataFrame): frame containing every column listed in
            `features`; only the prediction for its first row is used.

    Returns:
        str: the cluster's top category, or "Unknown" when the predicted
        cluster has no entry in `top_category_per_cluster`.
    """
    # Pick the training features in training order and standardise them
    scaled_sample = scaler.transform(new_sample[features])

    # Cluster id of the first (expected: only) row
    cluster = model.predict(scaled_sample)[0]

    return top_category_per_cluster.get(cluster, "Unknown")
# Example usage
# The values are the first row shown by df.head() (labelled "DDoS UDP"), each
# assigned to its matching feature. `proto` uses the training encoding
# (tcp=0, udp=1).
sample_row = pd.DataFrame([{
    "proto": 1, "sport": 6551, "dport": 80,
    "min": 0.0, "max": 4.031619, "mean": 2.687519, "stddev": 1.900363,
    "state_number": 4,
    "N_IN_Conn_P_SrcIP": 100, "N_IN_Conn_P_DstIP": 100,
    "srate": 0.494549, "drate": 0.0
}])

predicted_category = predict_category(sample_row)
print("Predicted category:", predicted_category)


Predicted category: Unknown
