# Experiments with WCDS
This notebook contains all experiments that are done using WCDS.

In [1]:
%matplotlib notebook
import matplotlib.pyplot as plt
from wcds.wcds import WCDS
from wcds.clusterers import AgglomerativeClustering
import time
from sklearn.datasets import *
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
import numpy as np
import random
import math

## Datasets
In the following sections `datastream` will be the variable storing the current datastream.

In [None]:
# Complex8
url = "http://www2.cs.uh.edu/~ml_kdd/restored/Complex&Diamond/Complex8.data"
complex8 = pd.read_csv(url)
data = complex8
x = data["563.225"]
y = data["56.748"]
x = list(x / 675) # Normalizing between [0:1]
y = list(y / 401) # Normalizing between [0:1]
data = list(zip(x,y))
np.random.shuffle(data)
datastream = list(zip(data, list(range(len(data)))))

In [14]:
url = "http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data_10_percent.gz"
kddcup = pd.read_csv(url, sep=",")

In [15]:
kddcup.dropna()
kddcupo.drop(columns=["tcp", "http"])

Unnamed: 0,0,tcp,http,SF,181,5450,0.1,0.2,0.3,0.4,...,9.1,1.00.1,0.00.6,0.11.1,0.00.7,0.00.8,0.00.9,0.00.10,0.00.11,normal.
0,0,tcp,http,SF,239,486,0,0,0,0,...,19,1.0,0.0,0.05,0.00,0.00,0.00,0.00,0.0,normal.
1,0,tcp,http,SF,235,1337,0,0,0,0,...,29,1.0,0.0,0.03,0.00,0.00,0.00,0.00,0.0,normal.
2,0,tcp,http,SF,219,1337,0,0,0,0,...,39,1.0,0.0,0.03,0.00,0.00,0.00,0.00,0.0,normal.
3,0,tcp,http,SF,217,2032,0,0,0,0,...,49,1.0,0.0,0.02,0.00,0.00,0.00,0.00,0.0,normal.
4,0,tcp,http,SF,217,2032,0,0,0,0,...,59,1.0,0.0,0.02,0.00,0.00,0.00,0.00,0.0,normal.
5,0,tcp,http,SF,212,1940,0,0,0,0,...,69,1.0,0.0,1.00,0.04,0.00,0.00,0.00,0.0,normal.
6,0,tcp,http,SF,159,4087,0,0,0,0,...,79,1.0,0.0,0.09,0.04,0.00,0.00,0.00,0.0,normal.
7,0,tcp,http,SF,210,151,0,0,0,0,...,89,1.0,0.0,0.12,0.04,0.00,0.00,0.00,0.0,normal.
8,0,tcp,http,SF,212,786,0,0,0,1,...,99,1.0,0.0,0.12,0.05,0.00,0.00,0.00,0.0,normal.
9,0,tcp,http,SF,210,624,0,0,0,0,...,109,1.0,0.0,0.06,0.05,0.00,0.00,0.00,0.0,normal.


## Online clustering
In this case we are using the sliding window model with the sw discriminators and sw neurons.

In [None]:
%%time
# Stream clustering (online clustering only)

OMEGA = 3000 # For batch clustering has to be over 2500
DELTA = 200
GAMMA = 500
EPSILON = 0.8
µ = 1

clusterer = WCDS(
    omega=OMEGA,
    delta=DELTA,
    gamma=GAMMA,
    epsilon=EPSILON,
    dimension=len(datastream[0][0]),
    µ=µ,
    seed=123456)

predictions = []
for i in datastream:
    obs, time_ = i
    k = clusterer.record(obs, time_)
    predictions.append(k)

In [None]:
%%time
from wcds.clusterers import AgglomerativeClustering

agg = AgglomerativeClustering()
actual_clusters = agg.fit(clusterer.discriminators, n_clusters=8)
print(len(actual_clusters))
print(actual_clusters)

In [None]:
clusterer.save("wcds.json")

## Plot results

In [None]:
# Plot results

number_of_colors = max(predictions)+1
colors = ["#"+''.join([random.choice('0123456789ABCDEF') for j in range(6)])
             for i in range(number_of_colors)]
color_dict = {key: value for (key, value) in zip(list(range(len(colors))), colors)}

for i in range(len(predictions)):
    k = predictions[i]
    plt.scatter(datastream[i][0][0], datastream[i][0][1], marker="o", color=color_dict[int(k)])

print("Found {} Clusters.".format(max(predictions)+1))
plt.show()

In [None]:
# Plot results after agglomerative clustering
def return_correct(index, clustering):
    for i in range(len(clustering)):
        if index in clustering[i]:
            return i
        
for i in range(len(predictions)):
    k = predictions[i]
    C = return_correct(k, actual_clusters)
    plt.scatter(datastream[i][0][0], datastream[i][0][1], marker="o", color=color_dict[C])

plt.show()

In [None]:
# Live clustering plot

number_of_colors = max(predictions)+1
colors = ["#"+''.join([random.choice('0123456789ABCDEF') for j in range(6)])
             for i in range(number_of_colors)]
color_dict = {key: value for (key, value) in zip(list(range(len(colors))), colors)}

fig = plt.figure()
ax = fig.add_subplot(111)
plt.ion()

fig.show()
fig.canvas.draw()

for i in range(len(predictions)):
    k = predictions[i]
    #ax.clear()
    ax.scatter(datastream[i][0][0], datastream[i][0][1], marker="o", color=color_dict[int(k)])
    fig.canvas.draw()
    time.sleep(.01)

#fig.savefig('plotcircles.png')

In [None]:
# Show Discriminator Bubble
def plot_discriminator(c_id, step):
    plt.close()
    step = 0.01 # Density of scatter plot
    c_id = 6 # Cluster
    points = []

    for i in np.arange(0,1,step):
        for j in np.arange(0,1,step):
            if clusterer.discriminators[c_id].matching(clusterer.addressing((i,j))) > clusterer.epsilon:
                points.append(((i,j), 1))
            else:
                points.append(((i,j), 0))

    for i in range(len(points)):
        k = points[i][1]
        if k == 0:
            c = "white"
        else:
            c = color_dict[c_id]
        plt.scatter(points[i][0][0], points[i][0][1], marker="s", s=1, color=c)
    plt.show()