In [1]:
import pandas as pd
import numpy as np

from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import get_scorer_names

from summarytools import dfSummary

# Load features dataframe

In [2]:
# define a variable that allows you to read prior saved pkl files
READ_FROM_PKL = True

In [3]:
# read features saved, add new ones
if READ_FROM_PKL:
    mirai_features_df = pd.read_pickle("../data/blog_fe/mirai_features.pkl")
    benign_features_df = pd.read_pickle("../data/blog_fe/benign_features.pkl")
else:
    print(
        "Error! Feature pkl not saved. Please run blog_fe_2.ipynb, blog_fe_3.ipynb, blog_fe_4.ipynb or download from here: https://drive.google.com/drive/folders/1dBQhbQtIk_fbbb80G5pSVV3hbWYJY7fv?usp=sharing"
    )

# K-means

In [4]:
mirai_features_df = mirai_features_df.drop("payload_embedding", axis=1)
mirai_features_df_values = mirai_features_df.values.astype(np.float64)

# Find rows with infinite values
inf_rows = np.isinf(mirai_features_df_values).any(axis=1)

# Remove rows with infinite values
mirai_features_df = mirai_features_df[~inf_rows]

In [5]:
# Find rows with infinite values
inf_rows = mirai_features_df.index[np.isinf(mirai_features_df).any(1)]

# Remove rows with infinite values
mirai_features_df = mirai_features_df.drop(inf_rows)

# Display the DataFrame after removing rows with infinite values
print("\nDataFrame after removing rows with infinite values:")
print(mirai_features_df)


DataFrame after removing rows with infinite values:
           Timestamp  Source Port  Destination Port  Packet Length  Protocol  \
0       1.540446e+09      21074.0              80.0           26.0       6.0   
1       1.540446e+09      20532.0            8280.0           26.0       6.0   
2       1.540446e+09       2440.0              53.0           83.0      17.0   
3       1.540446e+09         53.0            2440.0           83.0      17.0   
4       1.540446e+09      21074.0              80.0           26.0       6.0   
...              ...          ...               ...            ...       ...   
764121           NaN          NaN               NaN            NaN       NaN   
764122           NaN          NaN               NaN            NaN       NaN   
764124           NaN          NaN               NaN            NaN       NaN   
764125           NaN          NaN               NaN            NaN       NaN   
764136           NaN          NaN               NaN            NaN 

  inf_rows = mirai_features_df.index[np.isinf(mirai_features_df).any(1)]


In [6]:
# Scale the data
scaler = RobustScaler()
mirai_features_df_scaled = pd.DataFrame(
    scaler.fit_transform(mirai_features_df),
    columns=mirai_features_df.columns,
)

In [7]:
# Handle missing values
imputer = SimpleImputer(
    strategy="mean"
)  # You can use 'median' or 'most_frequent' as well
mirai_features_df_imputed = pd.DataFrame(
    imputer.fit_transform(mirai_features_df_scaled), columns=mirai_features_df_scaled.columns
)

In [8]:
# Initialize and train KMeans model
kmeans = KMeans(n_clusters=2, random_state=42)
kmeans.fit(mirai_features_df_imputed)



In [9]:
# Predict cluster labels
cluster_labels = kmeans.labels_

cluster_labels

array([0, 0, 0, ..., 0, 0, 0], dtype=int32)

In [10]:
# print the rows that are marked malicious
indices = np.where(cluster_labels == 1)[0]
mirai_features_df_imputed.iloc[indices]

Unnamed: 0,Timestamp,Source Port,Destination Port,Packet Length,Protocol,incoming_traffic,outgoing_traffic,traffic_ratio,cumulative_length,interarrival,...,SNMP_trap,HTTPS,Syslog,LDAPS,FTPS,IMAPS,POP3S,SOCKS_proxy,length_risk,dst_port_freq_encoded
3842,-1.036472,-1.128746,-0.806224,14.75,0.0,0.0,-0.719906,0.0,0.195708,0.138769,...,0.0,0.019855,0.0,0.0,0.0,0.0,0.0,7e-06,0.069418,0.560415
3859,-1.036208,-1.128746,-0.806224,14.75,0.0,0.0,-0.719906,0.0,-0.455439,2.027287,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.560415
4459,-1.027217,-1.128746,-0.806224,14.75,0.0,0.0,-0.719906,0.0,-0.520991,0.201128,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.560415
4497,-1.026665,-1.128746,-0.806224,14.75,0.0,0.0,-0.719906,0.0,0.195708,0.793498,...,0.0,0.019855,0.0,0.0,0.0,0.0,0.0,7e-06,0.069418,0.560415
4509,-1.026607,-1.128746,-0.806224,14.75,0.0,0.0,-0.719906,0.0,0.195708,1.667205,...,0.0,0.019855,0.0,0.0,0.0,0.0,0.0,7e-06,0.069418,0.560415
4570,-1.026055,-1.128746,-0.806224,14.75,0.0,0.0,-0.719906,0.0,-0.443272,1.699296,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.560415
4791,-1.023142,-1.128746,-0.806224,14.75,0.0,0.0,-0.719906,0.0,-0.459573,0.163834,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.560415
4806,-1.022587,-1.128746,-0.806224,14.75,0.0,0.0,-0.719906,0.0,-0.45903,3.799973,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.560415
4860,-1.022035,-1.128746,-0.806224,14.75,0.0,0.0,-0.719906,0.0,0.195708,0.751799,...,0.0,0.019855,0.0,0.0,0.0,0.0,0.0,7e-06,0.069418,0.560415
4884,-1.021781,-1.128746,-0.806224,14.75,0.0,0.0,-0.719906,0.0,-0.457209,1.465944,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [11]:
dfSummary(mirai_features_df_imputed.iloc[indices])

No,Variable,Stats / Values,Freqs / (% of Valid),Graph,Missing
1,Timestamp [float64],Mean (sd) : -0.5 (0.5) min < med < max: -1.0 < -0.4 < 0.7 IQR (CV) : 0.7 (-1.0),27 distinct values,,0 (0.0%)
2,Source Port [float64],1. -1.128746029553929,27 (100.0%),,0 (0.0%)
3,Destination Port [float64],1. -0.8062236183370963,27 (100.0%),,0 (0.0%)
4,Packet Length [float64],1. 14.75,27 (100.0%),,0 (0.0%)
5,Protocol [float64],1. 0.0,27 (100.0%),,0 (0.0%)
6,incoming_traffic [float64],1. 0.0,27 (100.0%),,0 (0.0%)
7,outgoing_traffic [float64],1. -0.7199058507061197,27 (100.0%),,0 (0.0%)
8,traffic_ratio [float64],1. 0.0,27 (100.0%),,0 (0.0%)
9,cumulative_length [float64],Mean (sd) : -0.1 (0.3) min < med < max: -0.5 < 0.1 < 0.5 IQR (CV) : 0.6 (-0.2),16 distinct values,,0 (0.0%)
10,interarrival [float64],Mean (sd) : 1.4 (1.3) min < med < max: 0.0 < 0.9 < 4.9 IQR (CV) : 1.5 (1.1),27 distinct values,,0 (0.0%)


# Hyperparameter tuning

In [12]:
# we set the cluster number to 2 for malicious and benign classes
n_clusters = 2
# Hyperparameter grid for KMeans
param_grid = {
    'init': ['k-means++', 'random'],
    'n_init': [10, 20, 30],
    'max_iter': [100, 200, 300],
    'tol': [1e-3, 1e-4, 1e-5]
}

In [14]:
# Initialize KMeans model
kmeans = KMeans(n_clusters=n_clusters, random_state=42)

# Grid search with cross-validation
grid_search = GridSearchCV(kmeans, param_grid, cv=5, scoring="neg_mean_squared_error")

# Fit the grid search to the data
grid_search.fit(mirai_features_df_imputed)

# Print the best hyperparameters found
print("Best hyperparameters:", grid_search.best_params_)

Traceback (most recent call last):
  File "/home/drx/sandbox/cyber-ml/.venv/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 765, in _score
    scores = scorer(estimator, X_test)
TypeError: _BaseScorer.__call__() missing 1 required positional argument: 'y_true'

Traceback (most recent call last):
  File "/home/drx/sandbox/cyber-ml/.venv/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 765, in _score
    scores = scorer(estimator, X_test)
TypeError: _BaseScorer.__call__() missing 1 required positional argument: 'y_true'

Traceback (most recent call last):
  File "/home/drx/sandbox/cyber-ml/.venv/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 765, in _score
    scores = scorer(estimator, X_test)
TypeError: _BaseScorer.__call__() missing 1 required positional argument: 'y_true'

Traceback (most recent call last):
  File "/home/drx/sandbox/cyber-ml/.venv/lib/python3.10/site-packages/sklearn/model_selection/_val

Best hyperparameters: {'init': 'k-means++', 'max_iter': 100, 'n_init': 10, 'tol': 0.001}
