In [22]:
# cluster the data using k-means
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

# normalize the data using sklearn
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, MaxAbsScaler
from sklearn.model_selection import train_test_split
from scipy.stats import mode


In [32]:
# load the data as df
breast_wisc_data = pd.read_csv('breast_wisc_dataset.csv')
cyber_data = pd.read_csv('cybersecurity_data.csv')

In [55]:
breast_X_train, breast_X_test, breast_y_train, breast_y_test = train_test_split(breast_wisc_data.iloc[:, :-1], breast_wisc_data.iloc[:, -1], test_size=0.2, random_state=42)
cyber_X_train, cyber_X_test, cyber_y_train, cyber_y_test = train_test_split(cyber_data.iloc[:, :-1], cyber_data.iloc[:, -1], test_size=0.2, random_state=42)
# turn y into 0 and 1
breast_y_train = breast_y_train.map({'Benign': 0, 'Malignant': 1})
breast_y_test = breast_y_test.map({'Benign': 0, 'Malignant': 1})

cyber_y_train = cyber_y_train.map({-1: 0, 1: 1})
cyber_y_test = cyber_y_test.map({-1: 0, 1: 1})
print(cyber_y_train.value_counts())

0    395
1     13
Name: class, dtype: int64


In [46]:
def do_kmeans(data, k):
    kmeans = KMeans(n_clusters=k, random_state=0).fit(data)
    return kmeans

In [27]:
bwd_ss = StandardScaler().fit_transform(breast_X_train)
bwd_mm = MinMaxScaler().fit_transform(breast_X_train)
bwd_rs = RobustScaler().fit_transform(breast_X_train)
bwd_mas = MaxAbsScaler().fit_transform(breast_X_train)

kmeans_ss = do_kmeans(bwd_ss, 2)
kmeans_mm = do_kmeans(bwd_mm, 2)
kmeans_rs = do_kmeans(bwd_rs, 2)
kmeans_mas = do_kmeans(bwd_mas, 2)


In [28]:
def resolve_masking(y_pred, y_true):
    pred_labels = np.zeros_like(y_true)
    for i in range(len(np.unique(y_pred))):
        mask = (y_pred == i)
        pred_labels[mask] = mode(y_true[mask])[0]
    return pred_labels

In [29]:
y_pred_ss = resolve_masking(kmeans_ss.labels_, breast_y_train)
y_pred_mm = resolve_masking(kmeans_mm.labels_, breast_y_train)
y_pred_rs = resolve_masking(kmeans_rs.labels_, breast_y_train)
y_pred_mas = resolve_masking(kmeans_mas.labels_, breast_y_train)

  pred_labels[mask] = mode(y_true[mask])[0]


In [47]:
# find the accuracy of the kmeans
from sklearn.metrics import accuracy_score
acc_ss = accuracy_score(breast_y_train, kmeans_ss.labels_)
acc_mm = accuracy_score(breast_y_train, kmeans_mm.labels_)
acc_rs = accuracy_score(breast_y_train, kmeans_rs.labels_)
acc_mas = accuracy_score(breast_y_train, kmeans_mas.labels_)



In [31]:
print(f"Accuracy for StandardScaler: {acc_ss}")
print(f"Accuracy for MinMaxScaler: {acc_mm}")
print(f"Accuracy for RobustScaler: {acc_rs}")
print(f"Accuracy for MaxAbsScaler: {acc_mas}")


Accuracy for StandardScaler: 0.9010989010989011
Accuracy for MinMaxScaler: 0.9208791208791208
Accuracy for RobustScaler: 0.8505494505494505
Accuracy for MaxAbsScaler: 0.9142857142857143


## Kmeans Accuracy
We get the best accuracy with the MinMaxScalar and the worst accuracy with the RobustScalar

Accuracy for StandardScaler: `0.9010989010989011`

Accuracy for MinMaxScaler: `0.9208791208791208`

Accuracy for RobustScaler: `0.8505494505494505`

Accuracy for MaxAbsScaler: `0.9142857142857143`

In [34]:
from sklearn.metrics import adjusted_mutual_info_score

In [48]:
cyber_ss = StandardScaler().fit_transform(cyber_X_train)
cyber_mm = MinMaxScaler().fit_transform(cyber_X_train)
cyber_rs = RobustScaler().fit_transform(cyber_X_train)
cyber_mas = MaxAbsScaler().fit_transform(cyber_X_train)

kmeans_cyber_ss = do_kmeans(cyber_ss, 2)
kmeans_cyber_mm = do_kmeans(cyber_mm, 2)
kmeans_cyber_rs = do_kmeans(cyber_rs, 2)
kmeans_cyber_mas = do_kmeans(cyber_mas, 2)

In [49]:
y_pred_cyber_ss = resolve_masking(do_kmeans(cyber_ss, 2).labels_, cyber_y_train)
y_pred_cyber_mm = resolve_masking(do_kmeans(cyber_mm, 2).labels_, cyber_y_train)
y_pred_cyber_rs = resolve_masking(do_kmeans(cyber_rs, 2).labels_, cyber_y_train)
y_pred_cyber_mas = resolve_masking(do_kmeans(cyber_mas, 2).labels_, cyber_y_train)

  pred_labels[mask] = mode(y_true[mask])[0]
  pred_labels[mask] = mode(y_true[mask])[0]
  pred_labels[mask] = mode(y_true[mask])[0]
  pred_labels[mask] = mode(y_true[mask])[0]


In [58]:
def get_LHI(pred_labels, true_labels):
    ami = adjusted_mutual_info_score(pred_labels, true_labels)
    return 1-ami



In [59]:
print(f"LHI for StandardScaler: {get_LHI(y_pred_cyber_ss, cyber_y_train)}")
print(f"LHI for MinMaxScaler: {get_LHI(y_pred_cyber_mm, cyber_y_train)}")
print(f"LHI for RobustScaler: {get_LHI(y_pred_cyber_rs, cyber_y_train)}")
print(f"LHI for MaxAbsScaler: {get_LHI(y_pred_cyber_mas, cyber_y_train)}")


LHI for StandardScaler: 1.0
LHI for MinMaxScaler: 1.0
LHI for RobustScaler: 1.0
LHI for MaxAbsScaler: 1.0


### Due to the sparsity of the cyber data, kmeans (as ordered by the problem) has a very hard time clustering the data.

LHI for StandardScaler: `1.0`

LHI for MinMaxScaler: `1.0`

LHI for RobustScaler: `1.0`

LHI for MaxAbsScaler: `1.0`

Instead let's try DBSCAN for better results

In [56]:
from sklearn.cluster import DBSCAN

def do_dbscan(data, eps, min_samples):
    dbscan = DBSCAN(eps=eps, min_samples=min_samples).fit(data)
    return dbscan

In [57]:
dbscan_ss = do_dbscan(cyber_ss, 0.5, 5)
dbscan_mm = do_dbscan(cyber_mm, 0.5, 5)
dbscan_rs = do_dbscan(cyber_rs, 0.5, 5)
dbscan_mas = do_dbscan(cyber_mas, 0.5, 5)

In [60]:
print(f"LHI for StandardScaler: {get_LHI(dbscan_ss.labels_, cyber_y_train)}")
print(f"LHI for MinMaxScaler: {get_LHI(dbscan_mm.labels_, cyber_y_train)}")
print(f"LHI for RobustScaler: {get_LHI(dbscan_rs.labels_, cyber_y_train)}")
print(f"LHI for MaxAbsScaler: {get_LHI(dbscan_mas.labels_, cyber_y_train)}")

LHI for StandardScaler: 1.0014210924665643
LHI for MinMaxScaler: 0.9344306863676856
LHI for RobustScaler: 1.0066579493467414
LHI for MaxAbsScaler: 0.9839699002616406


We get much better results for AMI (and LHI by extension) by using DBSCAN for clustering because it is far more sensitive to outliers

LHI for StandardScaler: `1.0014210924665643`

LHI for MinMaxScaler: `0.9344306863676856`

LHI for RobustScaler: `1.0066579493467414`

LHI for MaxAbsScaler: `0.9839699002616406`