# Experiments with WCDS
This notebook contains all experiments conducted with WCDS.

In [None]:
import matplotlib.pyplot as plt
from wcds.wcds import WCDS
from wcds.clusterers import AgglomerativeClustering
from sklearn.preprocessing import minmax_scale, MinMaxScaler
from sklearn.metrics import *
from scipy.io import arff
import pandas as pd
import numpy as np
import random
import math
import time

## Datasets
In the following sections, the variable `datastream` holds the current data stream as a pandas DataFrame.

By executing one of the following cells, the chosen dataset/-stream will be loaded.

In [None]:
# Complex8: read the (X, Y, Class) rows, min-max scale each coordinate
# column on its own, then return all rows in random order with a fresh index.
url = "http://www2.cs.uh.edu/~ml_kdd/restored/Complex&Diamond/Complex8.data"
datastream = (
    pd.read_csv(url, header=None, names=["X", "Y", "Class"])
    .assign(
        X=lambda frame: minmax_scale(frame["X"]),
        Y=lambda frame: minmax_scale(frame["Y"]),
    )
    .sample(frac=1)
    .reset_index(drop=True)
)

In [None]:
# Complex9: read the (X, Y, Class) rows, min-max scale each coordinate
# column on its own, then return all rows in random order with a fresh index.
url = "http://www2.cs.uh.edu/~ml_kdd/restored/Complex&Diamond/Complex9.txt"
datastream = (
    pd.read_csv(url, header=None, names=["X", "Y", "Class"])
    .assign(
        X=lambda frame: minmax_scale(frame["X"]),
        Y=lambda frame: minmax_scale(frame["Y"]),
    )
    .sample(frac=1)
    .reset_index(drop=True)
)

In [None]:
# D31: read the tab-separated (X, Y, Class) rows, min-max scale each coordinate
# column on its own, then return all rows in random order with a fresh index.
url = "http://cs.joensuu.fi/sipu/datasets/D31.txt"
datastream = (
    pd.read_csv(url, sep="\t", header=None, names=["X", "Y", "Class"])
    .assign(
        X=lambda frame: minmax_scale(frame["X"]),
        Y=lambda frame: minmax_scale(frame["Y"]),
    )
    .sample(frac=1)
    .reset_index(drop=True)
)

In [None]:
# Jain: read the tab-separated (X, Y, Class) rows, min-max scale each coordinate
# column on its own, then return all rows in random order with a fresh index.
url = "http://cs.joensuu.fi/sipu/datasets/jain.txt"
datastream = (
    pd.read_csv(url, sep="\t", header=None, names=["X", "Y", "Class"])
    .assign(
        X=lambda frame: minmax_scale(frame["X"]),
        Y=lambda frame: minmax_scale(frame["Y"]),
    )
    .sample(frac=1)
    .reset_index(drop=True)
)

In [None]:
# Aggregation: read the tab-separated (X, Y, Class) rows, min-max scale each
# coordinate column on its own, then return all rows in random order with a
# fresh index.
url = "http://cs.joensuu.fi/sipu/datasets/Aggregation.txt"
datastream = (
    pd.read_csv(url, sep="\t", header=None, names=["X", "Y", "Class"])
    .assign(
        X=lambda frame: minmax_scale(frame["X"]),
        Y=lambda frame: minmax_scale(frame["Y"]),
    )
    .sample(frac=1)
    .reset_index(drop=True)
)

In [None]:
# Forest Cover Type: read the local ARFF file, keep the non-object columns as
# features and append the integer-cast "class" attribute as the last column,
# named "Class".
data = arff.loadarff('../Datasets/covtypeNorm.arff')
records = pd.DataFrame(data[0])
labels = records["class"].astype(int)
datastream = records.select_dtypes(exclude="O").assign(Class=labels)

In [None]:
# 10% of Network Intrusion Detection (KDD Cup 1999)
url = "http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data_10_percent.gz"
# Column names in file order: the 41 connection features first, the label last.
# The label column is called "Class" because that is the column name the
# clustering and evaluation cells of this notebook read, and they expect it to
# be the last column.
header = [
    "duration",
    "protocol_type",
    "service",
    "flag",
    "src_bytes",
    "dst_bytes",
    "land",
    "wrong_fragment",
    "urgent",
    "hot",
    "num_failed_logins",
    "logged_in",
    "num_compromised",
    "root_shell",
    "su_attempted",
    "num_root",
    "num_file_creations",
    "num_shells",
    "num_access_files",
    "num_outbound_cmds",
    "is_host_login",
    "is_guest_login",
    "count",
    "srv_count",
    "serror_rate",
    "srv_serror_rate",
    "rerror_rate",
    "srv_rerror_rate",
    "same_srv_rate",
    "diff_srv_rate",
    "srv_diff_host_rate",
    "dst_host_count",
    "dst_host_srv_count",
    "dst_host_same_srv_rate",
    "dst_host_diff_srv_rate",
    "dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate",
    "dst_host_serror_rate",
    "dst_host_srv_serror_rate",
    "dst_host_rerror_rate",
    "dst_host_srv_rerror_rate",
    "Class",
]
# The file has no header row, so the names above are supplied explicitly.
datastream = pd.read_csv(url, header=None, names=header)
# NOTE(review): the scaling steps below were left disabled by the author; as
# written they would also drop the (string) label column — confirm before enabling.
#datastream = datastream.select_dtypes(exclude=["object"])
#scaler = MinMaxScaler()
#datastream = pd.DataFrame(scaler.fit_transform(datastream), columns=datastream.columns)

In [None]:
# `import sklearn` alone does not guarantee that the `datasets` submodule is
# loaded, so import the submodule explicitly (this still binds the name `sklearn`).
import sklearn.datasets

# fetch_kddcup99 returns a dict-like Bunch whose `data` entry is a 2-D array and
# whose `target` entry is 1-D, so the Bunch itself cannot be turned into a
# DataFrame directly; build the frame from its parts instead.
kddcup = sklearn.datasets.fetch_kddcup99()
datastream = pd.DataFrame(
    kddcup.data,
    # Older scikit-learn releases do not expose feature names; fall back to
    # pandas' default integer column labels in that case.
    columns=getattr(kddcup, "feature_names", None),
).infer_objects()  # the raw array has object dtype; recover numeric dtypes where possible
# Keep the label as the last column, named like in the other dataset cells.
datastream["Class"] = kddcup.target

In [None]:
# Keep only the non-object columns as features. A non-numeric "Class" column
# would be removed by the dtype filter, so it is set aside first and re-attached
# afterwards (as the last column when it had been filtered out).
class_column = datastream["Class"] if "Class" in datastream.columns else None
datastream = datastream.select_dtypes(exclude="object")
if class_column is not None:
    datastream["Class"] = class_column
# NOTE(review): the original cell ended with `datastream.drop(columns=[""])`,
# which raises KeyError for the placeholder name and discarded its result. If a
# specific column was meant to be removed, drop it here by its real name.
datastream

In [None]:
# Gas mixture dataset CO2
# TODO

In [None]:
# Gas mixture dataset Ethylene
# TODO

In [None]:
# SAM KNN DATASETS
# TODO

Take a look at the first rows of the dataset and its description.

In [None]:
# Display the first rows of the currently loaded dataset
datastream.head()

In [None]:
# Display pandas' summary statistics for the currently loaded dataset
datastream.describe()

## Online clustering
The next step is to perform the online step of stream clustering with WCDS on the previously selected `datastream`.

In [None]:
%%time

# Parameters
OMEGA = 1000
DELTA = 100
GAMMA = 100
EPSILON = 0.7
DIM = len(datastream.iloc[0])-1
µ = 0.5

c_online = WCDS(
    omega=OMEGA,
    delta=DELTA,
    gamma=GAMMA,
    epsilon=EPSILON,
    dimension=DIM,
    µ=µ)

c_offline = AgglomerativeClustering()

assigned_discriminators = []
time_ = 0
for i in range(len(datastream)):
    if i > 0 and i % 200 == 0:
        print("Instance: {} Number of discriminators: {}".format(i, len(c_online.discriminators)))
        print(homogeneity_completeness_v_measure(datastream["Class"][max(0,i-OMEGA):i], assigned_discriminators[max(0,i-OMEGA):i]))
        actual_clusters = c_offline.fit(c_online.discriminators, n_clusters=7)
        print(homogeneity_completeness_v_measure(datastream["Class"][max(0,i-OMEGA):i], [return_correct(cluster, actual_clusters) for cluster in assigned_discriminators[max(0,i-OMEGA):i]]))
    k = c_online.record(list(datastream.iloc[i])[:-1], time_)
    time_ += 1
    assigned_discriminators.append(k)

In [None]:
# Optional save
# Calls the online model's `save` method with the file name "wcds.json"
c_online.save("wcds.json")

## Offline clustering
Now we perform offline clustering on the current configuration of WCDS.

In [None]:
%%time

N_CLUSTERS = None
THRESHOLD = 0.5

c_offline = AgglomerativeClustering()
actual_clusters = c_offline.fit(c_online.discriminators, n_clusters=N_CLUSTERS, distance_threshold=THRESHOLD)

## Plot results and evaluate clustering

In [None]:
# Feed every (X, Y) pair to the online model again, with a time stamp that
# restarts at 0, and collect the value returned for each point.
predicted_discriminators = []
time_ = 0

for point in zip(datastream["X"], datastream["Y"]):
    predicted_discriminators.append(c_online.record(point, time_))
    time_ += 1

In [None]:
# Initialize color list: one random "#RRGGBB" string per discriminator,
# looked up by position through `color_dict`
number_of_colors = len(c_online.discriminators)
colors = [
    "#" + "".join(random.choice('0123456789ABCDEF') for digit in range(6))
    for index in range(number_of_colors)
]
color_dict = dict(enumerate(colors))

In [None]:
def return_correct(index, clustering):
    """Return the position of the first cluster in `clustering` that contains `index`.

    `clustering` is indexed by position (0 .. len(clustering) - 1) and each
    entry must support the `in` operator. Returns None when no entry contains
    `index`.
    """
    matching_positions = (
        position
        for position in range(len(clustering))
        if index in clustering[position]
    )
    return next(matching_positions, None)

In [None]:
# Plot results of online vs offline clustering
%matplotlib inline

# Online
plt.figure(1, figsize=(6,6))
for i in range(len(assigned_discriminators)):
    k = assigned_discriminators[i]
    plt.scatter(datastream["X"][i], datastream["Y"][i], marker="o", color=color_dict[int(k)])
    plt.axis('scaled', xlim=[0, 1, 0, 1])
plt.suptitle('Online Clustering', fontsize=18)
    
# Offline
plt.figure(2, figsize=(6,6))        
for i in range(len(assigned_discriminators)):
    k = assigned_discriminators[i]
    correct = return_correct(k, actual_clusters)
    plt.scatter(datastream["X"][i], datastream["Y"][i], marker="o", color=color_dict[correct])
plt.axis('scaled', xlim=[0, 1, 0, 1])
plt.suptitle('Offline Clustering', fontsize=18)
    
print("Found {} Microclusters and formed {} Clusters.".format(len(c_online.discriminators), len(actual_clusters)))
plt.show()

In [None]:
# Calculate Homogeneity, Completeness, V-Measure of clusterings
print("Homogeneity, Completeness, V-Measure")

true_classes = datastream["Class"]

# Online: score the raw discriminator assignment
online_scores = homogeneity_completeness_v_measure(true_classes, assigned_discriminators)
print("Online: ", online_scores)

# Offline: translate every discriminator to its offline cluster, then score
offline_assignment = [return_correct(k, actual_clusters) for k in assigned_discriminators]
offline_scores = homogeneity_completeness_v_measure(true_classes, offline_assignment)
print("Offline: ", offline_scores)

The following cell visualizes the behavior of the online clustering over time. 

In [None]:
# Live online clustering plot
%matplotlib notebook
fig = plt.figure(figsize=(10,10))
ax = fig.add_subplot(111)
plt.ion()
ax.axis('scaled', xlim=[0, 1, 0, 1])
fig.show()
fig.canvas.draw()

for i in range(len(assigned_discriminators)):
    k = assigned_discriminators[i]
    ax.scatter(datastream["X"][i], datastream["Y"][i], marker="o", color=color_dict[int(k)])
    fig.canvas.draw()

In [None]:
# Show discriminator bubble
from matplotlib import cm
cm_subsection = np.linspace(0.0, 1.0, 1000)
# `cm.get_cmap` is no longer available in recent Matplotlib releases;
# `plt.get_cmap` returns the same colormap.
colormap = plt.get_cmap("Greys")
# 1000-entry RGBA lookup table, kept at module level for other cells that read `colors`.
colors = [colormap(x) for x in cm_subsection]

def plot_discriminator(c_id, step=0.01):
    """Plot the matching rate of one discriminator over the unit square.

    The square [0, 1) x [0, 1) is sampled on a regular grid. Every grid point
    is shaded by the matching rate of discriminator `c_id`, grid points whose
    rate lies within 0.015 of `c_online.epsilon` are drawn in black, and the
    recorded instances assigned to `c_id` are overlaid as white markers.

    Args:
        c_id: Key/index of the discriminator in `c_online.discriminators`.
        step: Grid spacing along both axes.
    """
    discriminator = c_online.discriminators[c_id]
    grid = np.arange(0, 1, step)

    grid_x, grid_y, rates = [], [], []
    for i in grid:
        for j in grid:
            grid_x.append(i)
            grid_y.append(j)
            rates.append(discriminator.matching(c_online.addressing((i, j))))

    # Grid points whose matching rate is (almost) exactly the epsilon threshold
    border = [
        (x, y)
        for x, y, rate in zip(grid_x, grid_y, rates)
        if abs(c_online.epsilon - rate) < 0.015
    ]

    # Plot heat map; keeping the returned artist gives the colorbar a real
    # mappable (a plain list of RGBA tuples is not accepted by `plt.colorbar`).
    heat_map = plt.scatter(grid_x, grid_y, marker="s", s=1, c=rates,
                           cmap=colormap, vmin=0.0, vmax=1.0)
    # Plot epsilon border
    plt.scatter([point[0] for point in border], [point[1] for point in border],
                marker="o", s=1, color="black")
    # Plot points assigned to discriminator
    member_rows = [row for row, k in enumerate(assigned_discriminators) if k == c_id]
    plt.scatter(datastream["X"].iloc[member_rows], datastream["Y"].iloc[member_rows],
                marker="X", s=2, color="white")
    # `axis` has no `xlim` keyword; set the aspect mode, then the explicit limits.
    plt.axis('scaled')
    plt.axis([0, 1, 0, 1])
    plt.colorbar(heat_map)
    plt.show()

In [None]:
def plot_descision_boundaries(step=0.01):
    """Plot the epsilon border of every discriminator over the unit square.

    The original body referenced the undefined names `c_id` and `border` and
    never drew what it computed. This version samples [0, 1) x [0, 1) on a
    regular grid and, for every discriminator, marks the grid points whose
    matching rate lies within 0.015 of `c_online.epsilon`. All recorded
    instances are overlaid for orientation.

    NOTE(review): plotting the borders of all discriminators is the assumed
    intent of this unfinished function — confirm with the author.

    Args:
        step: Grid spacing along both axes.
    """
    grid = np.arange(0, 1, step)
    border_x, border_y = [], []

    # Discriminators are addressed by position, like in the other cells.
    for c_id in range(len(c_online.discriminators)):
        discriminator = c_online.discriminators[c_id]
        for i in grid:
            for j in grid:
                matching_rate = discriminator.matching(c_online.addressing((i, j)))
                if abs(c_online.epsilon - matching_rate) < 0.015:
                    border_x.append(i)
                    border_y.append(j)

    # Plot epsilon borders of all discriminators
    plt.scatter(border_x, border_y, marker="o", s=1, color="black")
    # Plot all recorded points; grey instead of white because there is no
    # heat map behind them in this figure.
    n_plotted = len(assigned_discriminators)
    plt.scatter(datastream["X"].iloc[:n_plotted], datastream["Y"].iloc[:n_plotted],
                marker="X", s=2, color="grey")
    # `axis` has no `xlim` keyword; set the aspect mode, then the explicit limits.
    plt.axis('scaled')
    plt.axis([0, 1, 0, 1])
    plt.show()

In [None]:
# Plot the matching-rate map of the discriminator stored under index 0
plot_discriminator(0)

In [None]:
# Plot the matching-rate map of every discriminator, addressed by position
for i in range(len(c_online.discriminators)):
    plot_discriminator(i)

## Results