In [3]:
import pandas as pd
import numpy as np

# Load NSL-KDD (adjust path to your dataset file).
# The file is read without a header row, so pandas labels the columns 0, 1, 2, ...
data = pd.read_csv('KDDTrain+.txt', header=None)

# Feature names and their column positions in the raw file (based on NSL-KDD documentation)
feature_names = ['duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes']
columns = [0, 1, 2, 3, 4, 5]  # Corresponding indices
X = data[columns].copy()  # .copy() so the encoding below never modifies `data`

# Convert categorical columns to integer codes.
# pd.factorize numbers values in order of first appearance, so the codes are
# specific to this file. The code -> value tables are kept in `category_levels`
# (keyed by feature name) so that new samples can be encoded the same way.
categorical_cols = [1, 2, 3]  # protocol_type, service, flag
category_levels = {}
for col in categorical_cols:
    codes, levels = pd.factorize(X[col])
    X[col] = codes
    category_levels[feature_names[columns.index(col)]] = levels

# Convert to a float numpy array (one row per record, one column per feature)
X = X.values.astype(float)

# Normalize each feature with a z-score: (value - mean) / std
mean = np.mean(X, axis=0)
std = np.std(X, axis=0)
# A constant feature has std == 0; dividing by it would turn the whole column
# into NaN (0 / 0). Use 1.0 for such features so they normalize to 0 instead.
# The adjusted std is what gets saved, so later code that reuses mean/std
# applies exactly the same transform.
std[std == 0] = 1.0
X_normalized = (X - mean) / std

# Save for later use
np.save('mean.npy', mean)
np.save('std.npy', std)
np.save('X_normalized.npy', X_normalized)

# Verify
print("Mean:", mean)
print("Std:", std)
print("First normalized sample:", X_normalized[0])

Mean: [2.87144650e+02 2.50648949e-01 1.01071103e+01 6.22196820e-01
 4.55667430e+04 1.97791144e+04]
Std: [2.60450497e+03 5.65203888e-01 1.27602402e+01 1.02519294e+00
 5.87030788e+06 4.02125319e+06]
First normalized sample: [-0.11024922 -0.44346643 -0.79207837 -0.60690705 -0.0076786  -0.00491864]


In [2]:
import numpy as np  # required by np.load, np.save, np.cov and np.linalg below
from sklearn.cluster import KMeans

# Load the normalized feature matrix from disk
X_normalized = np.load('X_normalized.npy')

# Fit K-Means and persist the cluster centres.
# n_clusters is named once so the model and the per-cluster loop below cannot drift apart.
n_clusters = 2
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
kmeans.fit(X_normalized)
centroids = kmeans.cluster_centers_
np.save('centroids.npy', centroids)

# Per-cluster inverse covariance matrices.
# NOTE: despite the names `cov_matrices` / 'cov.npy', what is stored is the
# INVERSE of each cluster's covariance matrix, stacked in cluster-label order.
labels = kmeans.labels_
cov_matrices = []
for i in range(n_clusters):
    cluster_data = X_normalized[labels == i]
    # np.cov treats rows as variables by default, hence the transpose
    cov = np.cov(cluster_data.T)
    try:
        inv_cov = np.linalg.inv(cov)
    except np.linalg.LinAlgError:
        # Singular covariance (e.g. a feature that is constant within this
        # cluster): fall back to the Moore-Penrose pseudo-inverse rather
        # than aborting the whole cell.
        inv_cov = np.linalg.pinv(cov)
    cov_matrices.append(inv_cov)
np.save('cov.npy', np.array(cov_matrices))