In [1]:
from google.colab import drive

# Directory inside the Colab runtime at which Google Drive is attached;
# the dataset and model paths used further down all live beneath it.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import os

import joblib
import numpy as np
import pandas as pd

from sklearn.cluster import KMeans
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import silhouette_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# -----------------------------------
# LOAD DATA
# -----------------------------------

# Each CSV split is read into a DataFrame right after its path is declared:
# training split first, then the test split.
train_path = "/content/drive/My Drive/Datasets/ML Project/data/recommended/training/training_cleaned.csv"
train = pd.read_csv(train_path)

test_path = "/content/drive/My Drive/Datasets/ML Project/data/recommended/test/test_cleaned.csv"
test = pd.read_csv(test_path)

In [3]:
# Feature frames: every column of each split except "category"
# (presumably the label column -- confirm against the dataset description).
X_train = train.drop("category", axis=1)
X_test = test.drop("category", axis=1)

In [4]:
import gc

# Unbind the raw frames now that X_train / X_test have been derived from them,
# then run a collection pass; the cell displays gc.collect()'s return value.
del train, test
gc.collect()

0

In [6]:
# Names of all numeric columns of X_train, minus 'pkSeqID', which is kept out
# of the clustering features (presumably a row identifier -- confirm).
num_cols = X_train.select_dtypes(include=[np.number]).columns.tolist()
num_cols.remove('pkSeqID')
print(num_cols)

# Numeric preprocessing: fill missing values with the column median, then
# standardise each column.
numeric_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Only the numeric columns listed in num_cols are routed through preprocessing.
preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_pipeline, num_cols),
])

# n_clusters=8  <-- change if needed
kmeans = KMeans(n_clusters=8, random_state=42, n_init=5)

# Full model: preprocessing followed by KMeans clustering.
kmeans_pipeline = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("kmeans", kmeans),
])

# Fit on the selected training columns and keep the cluster label assigned to
# each training row; the distinct labels are printed as a quick check.
labels = kmeans_pipeline.fit_predict(X_train[num_cols])
print(np.unique(labels))


['min', 'max', 'mean', 'stddev', 'N_IN_Conn_P_SrcIP', 'N_IN_Conn_P_DstIP', 'drate', 'srate', 'sport', 'dport', 'state_number']
[0 1 2 3 4 5 6 7]


In [7]:
# Persist the fitted preprocessing + KMeans pipeline to Drive so it can be
# reloaded later with joblib.load.
save_path = "/content/drive/My Drive/Datasets/ML Project/models/without/kmeans.joblib"

# joblib.dump does not create missing parent folders; make sure the target
# directory exists so the save also works when the folder has not been
# created on the Drive yet.
os.makedirs(os.path.dirname(save_path), exist_ok=True)
joblib.dump(kmeans_pipeline, save_path)

print("Pipeline saved to:", save_path)


Pipeline saved to: /content/drive/My Drive/Datasets/ML Project/models/without/kmeans.joblib


In [8]:
# Print the column names recorded by the fitted pipeline, followed by the
# num_cols list, so the two can be compared by eye.
for column_names in (kmeans_pipeline.feature_names_in_, num_cols):
    print(column_names)

['min' 'max' 'mean' 'stddev' 'N_IN_Conn_P_SrcIP' 'N_IN_Conn_P_DstIP'
 'drate' 'srate' 'sport' 'dport' 'state_number']
['min', 'max', 'mean', 'stddev', 'N_IN_Conn_P_SrcIP', 'N_IN_Conn_P_DstIP', 'drate', 'srate', 'sport', 'dport', 'state_number']


In [None]:
# Cluster-quality check: silhouette score of the KMeans labels in the
# preprocessed (imputed + scaled) feature space.

# Transform exactly the columns the pipeline was fitted on.
X_transformed = kmeans_pipeline.named_steps['preprocess'].transform(X_train[num_cols])

# The silhouette needs distances between every pair of evaluated rows, so its
# cost grows quadratically with the number of rows. Score a fixed-seed random
# subsample instead of the whole training set; the result is therefore an
# estimate. Raise SILHOUETTE_SAMPLE_SIZE for a tighter estimate at higher cost.
SILHOUETTE_SAMPLE_SIZE = 10_000
sil_score = silhouette_score(
    X_transformed,
    labels,
    sample_size=min(SILHOUETTE_SAMPLE_SIZE, X_transformed.shape[0]),
    random_state=42,  # same seed as KMeans, so the subsample is reproducible
    n_jobs=-1,
)

print("Silhouette score:", sil_score)