In [1]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.cluster import KMeans, DBSCAN, OPTICS
from sklearn.metrics import silhouette_score, v_measure_score, adjusted_rand_score, normalized_mutual_info_score, adjusted_mutual_info_score
from sklearn.preprocessing import StandardScaler
from scipy.cluster.hierarchy import linkage, fcluster
import hdbscan

In [2]:
iris = datasets.load_iris()
X_iris = iris.data
y_iris = iris.target

#last column is the target for the following dataset
breast = pd.read_csv('project1/breast_wisc_dataset.csv', header=0)
X_breast_wisc = breast.iloc[:, :-1]
y_breast_wisc = breast.iloc[:, -1]
print(y_breast_wisc)

cybersecurity = pd.read_csv('project1/cybersecurity_data.csv', header=0)
X_cybersecurity = cybersecurity.iloc[:, :-1]
y_cybersecurity = cybersecurity.iloc[:, -1]

aapl = pd.read_csv('project1/HFT_AAPL_data.csv', header=0)
# drop the Date column and first column
aapl = aapl.drop(columns=['Date'])
aapl = aapl.drop(columns=[aapl.columns[0]])
X_aapl = aapl.iloc[:, 1:]
y_aapl = aapl.iloc[:, 0]



FileNotFoundError: [Errno 2] No such file or directory: 'project1/breast_wisc_dataset.csv'

In [3]:
def SSScaleData(X):
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    return X

In [4]:
def do_kmeans(X, k):
    kmeans = KMeans(n_clusters=k, random_state=0)
    kmeans.fit(X)
    y_pred = kmeans.predict(X)
    return y_pred

def do_hdbscan(X, min_cluster_size):
    clusterer = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size)
    clusterer.fit(X)
    y_pred = clusterer.labels_
    return y_pred

def do_dbscan(X, eps, min_samples):
    dbscan = DBSCAN(eps=eps, min_samples=min_samples)
    dbscan.fit(X)
    y_pred = dbscan.labels_
    return y_pred

def do_optics(X, min_samples, xi):
    optics = OPTICS(min_samples=min_samples, xi=xi)
    optics.fit(X)
    y_pred = optics.labels_
    return y_pred

def do_bisecting_kmeans(X, k):
    Z = linkage(X, 'ward')
    y_pred = fcluster(Z, k, criterion='maxclust')
    return y_pred

In [5]:
X_iris_scaled = SSScaleData(X_iris)
X_breast_wisc_scaled = SSScaleData(X_breast_wisc)
X_cybersecurity_scaled = SSScaleData(X_cybersecurity)
X_aapl_scaled = SSScaleData(X_aapl)


In [6]:
def evaluate_clustering(X, labels_true, labels_pred):
    silhouette = silhouette_score(X, labels_pred)
    v_measure = v_measure_score(labels_true, labels_pred)
    ari = adjusted_rand_score(labels_true, labels_pred)
    nmi = normalized_mutual_info_score(labels_true, labels_pred)
    ami = adjusted_mutual_info_score(labels_true, labels_pred)
    return silhouette, v_measure, ari, nmi, ami

In [7]:
labels_kmeans_iris = do_kmeans(X_iris_scaled, 3)
silhouette, v_measure, ari, nmi, ami = evaluate_clustering(X_iris_scaled, y_iris, labels_kmeans_iris)
print(f"KMeans - Iris: Silhouette={silhouette}, V-measure={v_measure}, ARI={ari}, NMI={nmi}, AMI={ami}")

labels_hdbscan_iris = do_hdbscan(X_iris_scaled, 5)
silhouette, v_measure, ari, nmi, ami = evaluate_clustering(X_iris_scaled, y_iris, labels_hdbscan_iris)
print(f"HDBSCAN - Iris: Silhouette={silhouette}, V-measure={v_measure}, ARI={ari}, NMI={nmi}, AMI={ami}")

labels_dbscan_iris = do_dbscan(X_iris_scaled, 0.5, 5)
silhouette, v_measure, ari, nmi, ami = evaluate_clustering(X_iris_scaled, y_iris, labels_dbscan_iris)
print(f"DBSCAN - Iris: Silhouette={silhouette}, V-measure={v_measure}, ARI={ari}, NMI={nmi}, AMI={ami}")

labels_optics_iris = do_optics(X_iris_scaled, 5, 0.05)
silhouette, v_measure, ari, nmi, ami = evaluate_clustering(X_iris_scaled, y_iris, labels_optics_iris)
print(f"OPTICS - Iris: Silhouette={silhouette}, V-measure={v_measure}, ARI={ari}, NMI={nmi}, AMI={ami}")

labels_bisecting_kmeans_iris = do_bisecting_kmeans(X_iris_scaled, 3)
silhouette, v_measure, ari, nmi, ami = evaluate_clustering(X_iris_scaled, y_iris, labels_bisecting_kmeans_iris)
print(f"Bisecting KMeans - Iris: Silhouette={silhouette}, V-measure={v_measure}, ARI={ari}, NMI={nmi}, AMI={ami}")

# write to csv
iris_results = pd.DataFrame({'KMeans': labels_kmeans_iris, 'HDBSCAN': labels_hdbscan_iris, 'DBSCAN': labels_dbscan_iris, 'OPTICS': labels_optics_iris, 'BisectingKMeans': labels_bisecting_kmeans_iris})
iris_results.to_csv('project1/iris_results.csv', index=False)



KMeans - Iris: Silhouette=0.45994823920518635, V-measure=0.659486892724918, ARI=0.6201351808870379, NMI=0.659486892724918, AMI=0.6552228479234864
HDBSCAN - Iris: Silhouette=0.49486290250095627, V-measure=0.7174643320814477, ARI=0.5637510205230709, NMI=0.7174643320814476, AMI=0.7125764811325077
DBSCAN - Iris: Silhouette=0.35651648142700726, V-measure=0.5114298559522713, ARI=0.4420986685885924, NMI=0.5114298559522713, AMI=0.5051666404374137
OPTICS - Iris: Silhouette=-0.300864934206103, V-measure=0.2923566511548631, ARI=0.05141642728695205, NMI=0.29235665115486303, AMI=0.2656899926556021
Bisecting KMeans - Iris: Silhouette=0.4466890410285909, V-measure=0.6754701853436886, ARI=0.6153229932145449, NMI=0.6754701853436886, AMI=0.6712861348071291


In [8]:
labels_kmeans_breast = do_kmeans(X_breast_wisc_scaled, 2)
silhouette, v_measure, ari, nmi, ami = evaluate_clustering(X_breast_wisc_scaled, y_breast_wisc, labels_kmeans_breast)
print(f"KMeans - Breast: Silhouette={silhouette}, V-measure={v_measure}, ARI={ari}, NMI={nmi}, AMI={ami}")

labels_hdbscan_breast = do_hdbscan(X_breast_wisc_scaled, 5)
print(labels_hdbscan_breast)
silhouette, v_measure, ari, nmi, ami = evaluate_clustering(X_breast_wisc_scaled, y_breast_wisc, labels_hdbscan_breast)
print(f"HDBSCAN - Breast: Silhouette={silhouette}, V-measure={v_measure}, ARI={ari}, NMI={nmi}, AMI={ami}")

labels_dbscan_breast = do_dbscan(X_breast_wisc_scaled, 0.5, 5)
silhouette, v_measure, ari, nmi, ami = evaluate_clustering(X_breast_wisc_scaled, y_breast_wisc, labels_dbscan_breast)
print(f"DBSCAN - Breast: Silhouette={silhouette}, V-measure={v_measure}, ARI={ari}, NMI={nmi}, AMI={ami}")

labels_optics_breast = do_optics(X_breast_wisc_scaled, 5, 0.05)
silhouette, v_measure, ari, nmi, ami = evaluate_clustering(X_breast_wisc_scaled, y_breast_wisc, labels_optics_breast)
print(f"OPTICS - Breast: Silhouette={silhouette}, V-measure={v_measure}, ARI={ari}, NMI={nmi}, AMI={ami}")

labels_bisecting_kmeans_breast = do_bisecting_kmeans(X_breast_wisc_scaled, 2)
silhouette, v_measure, ari, nmi, ami = evaluate_clustering(X_breast_wisc_scaled, y_breast_wisc, labels_bisecting_kmeans_breast)
print(f"Bisecting KMeans - Breast: Silhouette={silhouette}, V-measure={v_measure}, ARI={ari}, NMI={nmi}, AMI={ami}")



KMeans - Breast: Silhouette=0.34338224069077805, V-measure=0.5324078598532422, ARI=0.6536246043910179, NMI=0.5324078598532422, AMI=0.5317737219826534
[-1 -1 -1 -1 -1 -1  0 -1 -1 -1  1 -1 -1 -1 -1 -1  1 -1 -1  1  1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1  1 -1 -1  1 -1 -1 -1 -1
  1  1  1  1  1 -1  1  1 -1 -1  1 -1 -1 -1 -1 -1 -1 -1 -1  1 -1  1 -1 -1
 -1  1  1 -1 -1 -1 -1  1 -1 -1 -1 -1  1 -1 -1 -1  1 -1  1 -1  1  1 -1 -1
 -1 -1  1 -1  1 -1  1  1 -1 -1 -1  1 -1 -1 -1 -1 -1 -1 -1  1 -1 -1 -1 -1
  1 -1 -1  1 -1  1 -1 -1 -1 -1  1  0 -1  1 -1  1 -1  1 -1 -1 -1 -1  1  1
  1 -1 -1 -1  1  1 -1 -1 -1  1  1  1 -1 -1  1  1 -1 -1 -1 -1 -1  1 -1 -1
 -1  1  1  1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1  1 -1  1 -1 -1
 -1 -1 -1  1 -1 -1 -1 -1  1 -1 -1 -1  1 -1  1 -1 -1 -1 -1  1 -1 -1 -1 -1
 -1 -1 -1 -1  1  1  1  1  1 -1  1  1 -1 -1 -1 -1 -1 -1 -1  1 -1 -1 -1 -1
  1  1 -1 -1 -1 -1  1 -1 -1  1 -1  1 -1  0  0  1 -1 -1 -1 -1 -1 -1 -1  1
 -1 -1 -1  1  1 -1  1  1 -1 -1 -1 -1  1 -1  1  

ValueError: Number of labels is 1. Valid values are 2 to n_samples - 1 (inclusive)

In [42]:
labels_kmeans_cybersecurity = do_kmeans(X_cybersecurity_scaled, 2)
silhouette, v_measure, ari, nmi, ami = evaluate_clustering(X_cybersecurity_scaled, y_cybersecurity, labels_kmeans_cybersecurity)
print(f"KMeans - Cybersecurity: Silhouette={silhouette}, V-measure={v_measure}, ARI={ari}, NMI={nmi}, AMI={ami}")

labels_hdbscan_cybersecurity = do_hdbscan(X_cybersecurity_scaled, 5)
silhouette, v_measure, ari, nmi, ami = evaluate_clustering(X_cybersecurity_scaled, y_cybersecurity, labels_hdbscan_cybersecurity)
print(f"HDBSCAN - Cybersecurity: Silhouette={silhouette}, V-measure={v_measure}, ARI={ari}, NMI={nmi}, AMI={ami}")

labels_dbscan_cybersecurity = do_dbscan(X_cybersecurity_scaled, 0.5, 5)
silhouette, v_measure, ari, nmi, ami = evaluate_clustering(X_cybersecurity_scaled, y_cybersecurity, labels_dbscan_cybersecurity)
print(f"DBSCAN - Cybersecurity: Silhouette={silhouette}, V-measure={v_measure}, ARI={ari}, NMI={nmi}, AMI={ami}")

labels_optics_cybersecurity = do_optics(X_cybersecurity_scaled, 5, 0.05)
silhouette, v_measure, ari, nmi, ami = evaluate_clustering(X_cybersecurity_scaled, y_cybersecurity, labels_optics_cybersecurity)
print(f"OPTICS - Cybersecurity: Silhouette={silhouette}, V-measure={v_measure}, ARI={ari}, NMI={nmi}, AMI={ami}")

labels_bisecting_kmeans_cybersecurity = do_bisecting_kmeans(X_cybersecurity_scaled, 2)
silhouette, v_measure, ari, nmi, ami = evaluate_clustering(X_cybersecurity_scaled, y_cybersecurity, labels_bisecting_kmeans_cybersecurity)
print(f"Bisecting KMeans - Cybersecurity: Silhouette={silhouette}, V-measure={v_measure}, ARI={ari}, NMI={nmi}, AMI={ami}")



KMeans - Cybersecurity: Silhouette=0.36880311644677655, V-measure=0.04828531952311552, ARI=0.10892228965242054, NMI=0.04828531952311551, AMI=0.04494870903810518
HDBSCAN - Cybersecurity: Silhouette=0.4052320336110424, V-measure=0.0600217407888711, ARI=-0.0020362306540487804, NMI=0.06002174078887109, AMI=0.04570107458966869
DBSCAN - Cybersecurity: Silhouette=-0.02173099826981661, V-measure=0.01762159104372252, ARI=-0.046486687946188085, NMI=0.01762159104372252, AMI=0.001222600333807796
OPTICS - Cybersecurity: Silhouette=0.19911699539283256, V-measure=0.05304008957427311, ARI=-0.0050578223203026264, NMI=0.0530400895742731, AMI=0.0345166160593443
Bisecting KMeans - Cybersecurity: Silhouette=0.3810439339941886, V-measure=0.07725057757991001, ARI=0.16930721934402113, NMI=0.07725057757991002, AMI=0.07338669881295742


In [43]:
labels_kmeans_aapl = do_kmeans(X_aapl_scaled, 2)
silhouette, v_measure, ari, nmi, ami = evaluate_clustering(X_aapl_scaled, y_aapl, labels_kmeans_aapl)
print(f"KMeans - AAPL: Silhouette={silhouette}, V-measure={v_measure}, ARI={ari}, NMI={nmi}, AMI={ami}")

labels_hdbscan_aapl = do_hdbscan(X_aapl_scaled, 5)
silhouette, v_measure, ari, nmi, ami = evaluate_clustering(X_aapl_scaled, y_aapl, labels_hdbscan_aapl)
print(f"HDBSCAN - AAPL: Silhouette={silhouette}, V-measure={v_measure}, ARI={ari}, NMI={nmi}, AMI={ami}")

labels_dbscan_aapl = do_dbscan(X_aapl_scaled, 0.5, 5)
silhouette, v_measure, ari, nmi, ami = evaluate_clustering(X_aapl_scaled, y_aapl, labels_dbscan_aapl)
print(f"DBSCAN - AAPL: Silhouette={silhouette}, V-measure={v_measure}, ARI={ari}, NMI={nmi}, AMI={ami}")

labels_optics_aapl = do_optics(X_aapl_scaled, 5, 0.05)
silhouette, v_measure, ari, nmi, ami = evaluate_clustering(X_aapl_scaled, y_aapl, labels_optics_aapl)
print(f"OPTICS - AAPL: Silhouette={silhouette}, V-measure={v_measure}, ARI={ari}, NMI={nmi}, AMI={ami}")

labels_bisecting_kmeans_aapl = do_bisecting_kmeans(X_aapl_scaled, 2)
silhouette, v_measure, ari, nmi, ami = evaluate_clustering(X_aapl_scaled, y_aapl, labels_bisecting_kmeans_aapl)
print(f"Bisecting KMeans - AAPL: Silhouette={silhouette}, V-measure={v_measure}, ARI={ari}, NMI={nmi}, AMI={ami}")





KMeans - AAPL: Silhouette=0.3267231356065447, V-measure=0.1367675603434691, ARI=0.00033288845819545254, NMI=0.13676756034346912, AMI=0.056339709679159494




HDBSCAN - AAPL: Silhouette=-0.036876328744232745, V-measure=0.41052435371244195, ARI=0.000577192883894681, NMI=0.41052435371244195, AMI=0.03975411144147575




DBSCAN - AAPL: Silhouette=0.16776610791023275, V-measure=0.24188226746672153, ARI=0.0002435294815260172, NMI=0.2418822674667215, AMI=0.0321459427889916




OPTICS - AAPL: Silhouette=-0.4621664904552028, V-measure=0.36939630006505064, ARI=-1.611485946750071e-06, NMI=0.3693963000650507, AMI=-2.9669746050205473e-05
Bisecting KMeans - AAPL: Silhouette=0.3174148100277985, V-measure=0.12970170819897975, ARI=0.00028472057800784935, NMI=0.12970170819897975, AMI=0.04652114021469082


