# Clustering

In [None]:
from collections import Counter
import numpy as np
import pandas as pd

from sklearn import metrics
from sklearn.cluster import KMeans

In [None]:
# Seed NumPy's global random number generator so that anything drawing from
# it in later cells produces the same values on a fresh top-to-bottom run.
np.random.seed(42)

In [None]:
# Load the CSV sample into a DataFrame (path is relative to the working directory)
filename = "data/two-hour-sample.csv"
data = pd.read_csv(filepath_or_buffer=filename)

In [None]:
# Set up the clustering parameters
# Note: most clustering algorithms need the number of clusters chosen up front.
n_samples, n_features = data.shape
n_clusters = 10
sample_size = 300

# Echo the configuration and the size of the loaded data.
print(
    "n_clusters: %d, \t n_samples %d, \t n_features %d"
    % (n_clusters, n_samples, n_features)
)

In [None]:
# Run K-means
# Cluster on a subset of the columns only.
columns = ["Dur", "TotBytes", "TotPkts", "TcpRtt", "PCRatio"]
df = data[columns]

# NOTE(review): K-means is distance-based, so if these columns are on very
# different scales the largest one will dominate the result — consider
# standardizing the features first; confirm against the data.

# Pin the seed on the estimator itself so the clustering is reproducible even
# when cells are re-run or executed out of order, instead of depending on the
# current state of NumPy's global random number generator.
kmeans = KMeans(init='k-means++', n_clusters=n_clusters, n_init=10,
                random_state=42)
kmeans.fit(df)

In [None]:
# A fitted KMeans model exposes several outputs as attributes.
# First, the cluster centers: one row per cluster, one column per feature
kmeans.cluster_centers_

In [None]:
# Next, the labels: the index of the cluster assigned to each instance
print(kmeans.labels_)

In [None]:
# Finally, inertia: the sum of squared distances from each sample to its
# closest cluster center. Lower means tighter clusters, but inertia tends to
# fall as n_clusters grows, so only compare it between runs with the same
# number of clusters.
print(kmeans.inertia_)

In [None]:
# We can also examine the cluster sizes.
# Notice the single mega cluster; this is typical of many clustering algorithms.
c = Counter(kmeans.labels_)

# most_common() yields (cluster id, count) pairs from largest to smallest, so
# the dominant cluster is printed first.
for cluster_id, count in c.most_common():
    print(f"Cluster ID: {cluster_id} Count: {count}")