# Experiments with WCDS
This notebook contains all experiments that are done using WCDS.

## 1. Importing the required packages

In [None]:
import matplotlib.pyplot as plt
from wcds.wcds import WCDS
from wcds.clusterers import *
from sklearn.preprocessing import minmax_scale, MinMaxScaler, LabelEncoder
from sklearn.metrics import *
from scipy.io import arff
import pandas as pd
import urllib.request
import io
import csv
import numpy as np
import random
import math
import time

## 2. Clustering of 2D data sets
### 2.1 Loading data sets
In the following sections `datastream` will be the variable storing the current data set. It uses a pandas dataframe for that. The last column contains the instance's class for evaluation and should not be passed to the algorithm.

By executing one of the following cells, the chosen data set will be loaded.

In [None]:
# Complex8: download the labelled 2D points, min-max scale both coordinates
# and shuffle the rows so that they arrive in random order.
url = "http://www2.cs.uh.edu/~ml_kdd/restored/Complex&Diamond/Complex8.data"
datastream = (
    pd.read_csv(url, names=["X", "Y", "Class"], header=None)
    .assign(X=lambda frame: minmax_scale(frame["X"]),
            Y=lambda frame: minmax_scale(frame["Y"]))
    .sample(frac=1)
    .reset_index(drop=True)
)

In [None]:
# Complex9: download the labelled 2D points, min-max scale both coordinates
# and shuffle the rows so that they arrive in random order.
url = "http://www2.cs.uh.edu/~ml_kdd/restored/Complex&Diamond/Complex9.txt"
datastream = (
    pd.read_csv(url, names=["X", "Y", "Class"], header=None)
    .assign(X=lambda frame: minmax_scale(frame["X"]),
            Y=lambda frame: minmax_scale(frame["Y"]))
    .sample(frac=1)
    .reset_index(drop=True)
)

In [None]:
# D31: download the tab-separated labelled 2D points, min-max scale both
# coordinates and shuffle the rows so that they arrive in random order.
url = "http://cs.joensuu.fi/sipu/datasets/D31.txt"
datastream = (
    pd.read_csv(url, names=["X", "Y", "Class"], header=None, sep="\t")
    .assign(X=lambda frame: minmax_scale(frame["X"]),
            Y=lambda frame: minmax_scale(frame["Y"]))
    .sample(frac=1)
    .reset_index(drop=True)
)

In [None]:
# Jain: download the tab-separated labelled 2D points, min-max scale both
# coordinates to (0, 1) and shuffle the rows so that they arrive in random order.
url = "http://cs.joensuu.fi/sipu/datasets/jain.txt"
datastream = (
    pd.read_csv(url, names=["X", "Y", "Class"], header=None, sep="\t")
    .assign(X=lambda frame: minmax_scale(frame["X"], feature_range=(0,1)),
            Y=lambda frame: minmax_scale(frame["Y"], feature_range=(0,1)))
    .sample(frac=1)
    .reset_index(drop=True)
)

In [None]:
# Aggregation: download the tab-separated labelled 2D points, min-max scale
# both coordinates and shuffle the rows so that they arrive in random order.
url = "http://cs.joensuu.fi/sipu/datasets/Aggregation.txt"
datastream = (
    pd.read_csv(url, names=["X", "Y", "Class"], header=None, sep="\t")
    .assign(X=lambda frame: minmax_scale(frame["X"]),
            Y=lambda frame: minmax_scale(frame["Y"]))
    .sample(frac=1)
    .reset_index(drop=True)
)

We can take a look at the first rows of the data set and its description.

In [None]:
# Preview the first rows of the currently loaded data set
datastream.head()

In [None]:
# Summary statistics of every numeric column of the currently loaded data set
datastream.describe()

### 2.2 Online clustering
The next step is to perform the online step of stream clustering with WCDS on the previously selected `datastream`.

In [None]:
%%time

# Parameters
OMEGA = math.inf
DELTA = 200
GAMMA = 200
BETA = 60
EPSILON = 0.1
µ = 0
DIM = 2
print_step = 100

# Clusterer instance
c_online = WCDS(
    omega=OMEGA,
    delta=DELTA,
    gamma=GAMMA,
    epsilon=EPSILON,
    dimension=DIM,
    beta=BETA,
    µ=µ)

# Results
# Rows are gathered in plain lists and turned into a DataFrame once at the end:
# appending to a DataFrame with .loc inside the loop copies the frame on every
# observation and makes the run quadratic in the length of the data set.
result_columns = ["Time Stamp", "Class", "Assigned Discriminator", "Number of Discriminators"]
result_rows = []
assigned = []  # discriminator id returned by record() for every processed observation
true_classes = datastream["Class"]

# Online step
try:
    for time_ in range(len(datastream)):
        # The last column holds the class label and is not passed to the algorithm.
        k, _ = c_online.record(list(datastream.iloc[time_])[:-1], time_)
        assigned.append(k)
        result_rows.append([time_, true_classes[time_], k, len(c_online.discriminators)])
        if time_ % print_step == 0 and time_ > 0:
            # With OMEGA = math.inf the window start collapses to 0, i.e. the
            # measures cover every observation seen before the current one.
            window_start = max(time_-OMEGA, 0)
            print("Observation: {} #Discriminators: {} Cluster Measures: {}".format(
                time_,
                len(c_online),
                homogeneity_completeness_v_measure(true_classes[window_start:time_],
                                                   assigned[window_start:time_])))
finally:
    # Built in a finally block so that an interrupted run still leaves the
    # partial results available to the following cells.
    results = pd.DataFrame(result_rows, columns=result_columns)

In [None]:
# Optional save: calls the clusterer's own save() method.
# NOTE(review): what is written and where is defined inside wcds and cannot be seen from this notebook.
c_online.save()

### 2.3 Offline clustering
Now we perform offline clustering on the current configuration of WCDS. There are three variants, but the first one is the one presented in the original paper and therefore performs best most of the time.

In [None]:
# Settings shared by the three offline clustering variants
N_CLUSTERS = 2  # passed to the offline clusterers as n_clusters
THRESHOLD = None  # passed to the offline clusterers as distance_threshold

In [None]:
%%time
# MergeClustering
# Fits the MergeClustering variant on the discriminators gathered by the online
# step. The value returned by fit() is kept as actual_clusters1; the result cells
# index it by discriminator id to obtain a cluster label.

c_offline = MergeClustering(n_clusters=N_CLUSTERS, distance_threshold=THRESHOLD)
actual_clusters1 = c_offline.fit(c_online.discriminators)

In [None]:
%%time
# MinDistanceClustering
# Fits the MinDistanceClustering variant on the discriminators gathered by the
# online step. The value returned by fit() is kept as actual_clusters2; the result
# cells index it by discriminator id to obtain a cluster label.

c_offline = MinDistanceClustering(n_clusters=N_CLUSTERS, distance_threshold=THRESHOLD)
actual_clusters2 = c_offline.fit(c_online.discriminators)

In [None]:
%%time
# CentroidClustering
# This variant additionally receives one centroid per discriminator, computed
# by the online clusterer in the iteration order of its discriminators.

c_offline = CentroidClustering(
    n_clusters=N_CLUSTERS,
    distance_threshold=THRESHOLD,
)
centroids = list(map(c_online.centroid, c_online.discriminators.values()))
actual_clusters3 = c_offline.fit(c_online.discriminators, centroids)

### 2.4 Results
The following section summarizes and visualizes the obtained results of one exemplary run.

| Data set | #Instances | Dimensions | Homogeneity | Completeness | V-Measure | #Microclusters | Omega | Delta | Gamma | Beta | Epsilon | Mu | Runtime Online |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| Complex8 | 2551 | 2 | 0.9053588976743742 | 0.9335710366851351 | 0.9192485574153881 | 91 | Inf | 200 | 200 | 70 | 0.1 | 0 | 24min 46s |
| Complex9 | 3031 | 2 | 0.9050691859653084 | 0.8300021530934564 | 0.8659117998655447 | 98 | Inf | 200 | 200 | 80 | 0.1 | 0 | 21min 22s |
| D31 | 3100 | 2 | 0.9055106871046765 | 0.9517232533653724 | 0.9280420288574567 | 44 | Inf | 200 | 200 | 30 | 0.3 | 0 | 5min 24s |
| Jain | 373 | 2 | 1.0 | 1.0 | 1.0 | 33 | Inf | 200 | 200 | 60 | 0.1 | 0 | 35.7 s |
| Aggregation | 788 | 2 | 0.9888391642302382 | 0.9876953626611313 | 0.9882669324921871 | 38 | Inf | 200 | 200 | 50 | 0.1 | 0 | 1min 43s |

Plot results of actual vs online vs offline clustering.

In [None]:
def color_dict(i, shift=0):
    """Return a reproducible pseudo-random hex colour for the label ``i``.

    The six hex digits are drawn from a random generator seeded with
    ``i + shift``, so equal labels always map to the same colour.

    Parameters
    ----------
    i : int
        Label to colour (class, discriminator or cluster id). NumPy integers
        are accepted as well.
    shift : int, optional
        Offset added to ``i`` before seeding; a different shift gives a
        different palette.

    Returns
    -------
    str
        Colour in the form ``"#RRGGBB"``.
    """
    seed = i + shift
    # NumPy integers are not valid seeds for the random module on
    # Python >= 3.11, so integer-like values are converted to a plain int.
    if hasattr(seed, "__index__"):
        seed = int(seed)
    # A private generator draws the same digits as seeding the module-level
    # generator would, but leaves the global random state untouched.
    generator = random.Random(seed)
    return "#" + "".join(generator.choices("0123456789ABCDEF", k=6))

In [None]:
%matplotlib inline

# Actual Clustering: every point is coloured by its ground-truth class
plt.figure(figsize=(10,10))
plt.scatter(datastream["X"],
            datastream["Y"],
            marker="o",
            color=[color_dict(c) for c in datastream["Class"]])
# axis() has no "xlim" keyword (older Matplotlib ignored it, current releases
# raise a TypeError), so the unit square is requested through xlim()/ylim()
# after fixing an equal aspect ratio.
plt.axis("scaled")
plt.xlim(0, 1)
plt.ylim(0, 1)
plt.suptitle("Actual Clustering", fontsize=18)
print("{} Classes".format(len(np.unique(datastream["Class"]))))

In [None]:
%matplotlib inline

# Online: every point is coloured by the discriminator it was assigned to
plt.figure(figsize=(10,10))
plt.scatter(datastream["X"],
            datastream["Y"],
            marker="o",
            color=[color_dict(ass) for ass in results["Assigned Discriminator"]])
# axis() has no "xlim" keyword (older Matplotlib ignored it, current releases
# raise a TypeError), so the unit square is requested through xlim()/ylim()
# after fixing an equal aspect ratio.
plt.axis("scaled")
plt.xlim(0, 1)
plt.ylim(0, 1)
plt.suptitle("Online Clustering", fontsize=18)
print("Found {} Microclusters".format(len(c_online)))
print("Homogenity/Completeness/V-Measure: ",
      homogeneity_completeness_v_measure(datastream["Class"], results["Assigned Discriminator"]))

In [None]:
%matplotlib inline

# Offline1: every point is coloured by the offline cluster of its discriminator
# The cluster label of every observation is looked up once and reused for the
# colours and for the evaluation measures.
offline_labels1 = [actual_clusters1[ass] for ass in results["Assigned Discriminator"]]

plt.figure(figsize=(10,10))
plt.scatter(datastream["X"],
            datastream["Y"],
            marker="o",
            color=[color_dict(label) for label in offline_labels1])
# axis() has no "xlim" keyword (older Matplotlib ignored it, current releases
# raise a TypeError), so the unit square is requested through xlim()/ylim()
# after fixing an equal aspect ratio.
plt.axis("scaled")
plt.xlim(0, 1)
plt.ylim(0, 1)
plt.suptitle("1. Offline Clustering", fontsize=18)
print("Formed {} Clusters.".format(len(np.unique(list(actual_clusters1.values())))))
print("Homogenity/Completeness/V-Measure: ", 
      homogeneity_completeness_v_measure(datastream["Class"], offline_labels1))

In [None]:
%matplotlib inline

# Offline2: every point is coloured by the offline cluster of its discriminator
# The cluster label of every observation is looked up once and reused for the
# colours and for the evaluation measures.
offline_labels2 = [actual_clusters2[ass] for ass in results["Assigned Discriminator"]]

plt.figure(figsize=(10,10))
plt.scatter(datastream["X"],
            datastream["Y"],
            marker="o",
            color=[color_dict(label) for label in offline_labels2])
# axis() has no "xlim" keyword (older Matplotlib ignored it, current releases
# raise a TypeError), so the unit square is requested through xlim()/ylim()
# after fixing an equal aspect ratio.
plt.axis("scaled")
plt.xlim(0, 1)
plt.ylim(0, 1)
plt.suptitle("2. Offline Clustering", fontsize=18)
print("Formed {} Clusters.".format(len(np.unique(list(actual_clusters2.values())))))
print("Homogenity/Completeness/V-Measure: ",
      homogeneity_completeness_v_measure(datastream["Class"], offline_labels2))

In [None]:
%matplotlib inline

# Offline3: every point is coloured by the offline cluster of its discriminator
# The cluster label of every observation is looked up once and reused for the
# colours and for the evaluation measures.
offline_labels3 = [actual_clusters3[ass] for ass in results["Assigned Discriminator"]]

plt.figure(figsize=(10,10))
plt.scatter(datastream["X"],
            datastream["Y"],
            marker="o",
            color=[color_dict(label) for label in offline_labels3])
# axis() has no "xlim" keyword (older Matplotlib ignored it, current releases
# raise a TypeError), so the unit square is requested through xlim()/ylim()
# after fixing an equal aspect ratio.
plt.axis("scaled")
plt.xlim(0, 1)
plt.ylim(0, 1)
plt.suptitle("3. Offline Clustering", fontsize=18)
print("Formed {} Clusters.".format(len(np.unique(list(actual_clusters3.values())))))
print("Homogenity/Completeness/V-Measure: ",
      homogeneity_completeness_v_measure(datastream["Class"], offline_labels3))

The following cell visualizes the behavior of the online clustering over time. 

In [None]:
# Live online clustering plot
%matplotlib notebook

fig = plt.figure(figsize=(10,10))
ax = fig.add_subplot(111)
plt.ion()
# Axes.axis() has no "xlim" keyword (older Matplotlib ignored it, current
# releases raise a TypeError), so the unit square is set explicitly after
# fixing an equal aspect ratio.
ax.axis("scaled")
ax.set_xlim(0, 1)
ax.set_ylim(0, 1)
fig.show()
fig.canvas.draw()

# Add the observations one by one, coloured by their assigned discriminator,
# and redraw after each so the clustering can be watched as it evolves.
for i in range(len(results["Assigned Discriminator"])):
    ax.scatter(datastream["X"][i],
               datastream["Y"][i],
               marker="o",
               color=color_dict(int(results["Assigned Discriminator"][i])))
    fig.canvas.draw()

## 3. Stream clustering
### 3.1 Loading data streams
In the following sections `datastream` will be the variable storing the current data stream. It uses a pandas dataframe for that. The last column contains the instance's class for evaluation and should not be passed to the algorithm.

By executing one of the following cells, the chosen data stream will be loaded.

In [None]:
# Transient Chess Board: space-separated 2D points, min-max scaled per
# coordinate, with the class labels read from a separate file.
url_data = "https://raw.githubusercontent.com/vlosing/driftDatasets/master/artificial/chess/transientChessboard.data"
url_labels = "https://raw.githubusercontent.com/vlosing/driftDatasets/master/artificial/chess/transientChessboard.labels"
datastream = (
    pd.read_csv(url_data, names=["X", "Y"], header=None, sep=" ")
    .assign(X=lambda frame: minmax_scale(frame["X"]),
            Y=lambda frame: minmax_scale(frame["Y"]))
)
datastream["Class"] = pd.read_csv(url_labels, names=["Class"], header=None)

In [None]:
# Moving Squares: space-separated 2D points, min-max scaled per coordinate,
# with the class labels read from a separate file.
url_data = "https://raw.githubusercontent.com/vlosing/driftDatasets/master/artificial/movingSquares/movingSquares.data"
url_labels = "https://raw.githubusercontent.com/vlosing/driftDatasets/master/artificial/movingSquares/movingSquares.labels"
datastream = (
    pd.read_csv(url_data, names=["X", "Y"], header=None, sep=" ")
    .assign(X=lambda frame: minmax_scale(frame["X"]),
            Y=lambda frame: minmax_scale(frame["Y"]))
)
datastream["Class"] = pd.read_csv(url_labels, names=["Class"], header=None)

In [None]:
# Interchanging RBF: space-separated 2D points, min-max scaled per coordinate,
# with the class labels read from a separate file.
url_data = "https://raw.githubusercontent.com/vlosing/driftDatasets/master/artificial/rbf/interchangingRBF.data"
url_labels = "https://raw.githubusercontent.com/vlosing/driftDatasets/master/artificial/rbf/interchangingRBF.labels"
datastream = (
    pd.read_csv(url_data, names=["X", "Y"], header=None, sep=" ")
    .assign(X=lambda frame: minmax_scale(frame["X"]),
            Y=lambda frame: minmax_scale(frame["Y"]))
)
datastream["Class"] = pd.read_csv(url_labels, names=["Class"], header=None)

In [None]:
# Moving RBF: all feature columns are min-max scaled together by one scaler,
# then the class labels are read from a separate file.
url_data = "https://raw.githubusercontent.com/vlosing/driftDatasets/master/artificial/rbf/movingRBF.data"
url_labels = "https://raw.githubusercontent.com/vlosing/driftDatasets/master/artificial/rbf/movingRBF.labels"
datastream = pd.read_csv(url_data, header=None, sep=" ")
scaler = MinMaxScaler().fit(datastream)
datastream = pd.DataFrame(scaler.transform(datastream), columns=datastream.columns)
datastream["Class"] = pd.read_csv(url_labels, names=["Class"], header=None)

In [None]:
# Mixed Drift: space-separated 2D points, min-max scaled per coordinate,
# with the class labels read from a separate file.
url_data = "https://raw.githubusercontent.com/vlosing/driftDatasets/master/artificial/mixedDrift/mixedDrift.data"
url_labels = "https://raw.githubusercontent.com/vlosing/driftDatasets/master/artificial/mixedDrift/mixedDrift.labels"
datastream = (
    pd.read_csv(url_data, names=["X", "Y"], header=None, sep=" ")
    .assign(X=lambda frame: minmax_scale(frame["X"]),
            Y=lambda frame: minmax_scale(frame["Y"]))
)
datastream["Class"] = pd.read_csv(url_labels, names=["Class"], header=None)

In [None]:
# SEA Concepts: all feature columns are min-max scaled together by one scaler,
# then the class labels are read from a separate file.
url_data = "https://raw.githubusercontent.com/vlosing/driftDatasets/master/artificial/sea/SEA_training_data.csv"
url_labels = "https://raw.githubusercontent.com/vlosing/driftDatasets/master/artificial/sea/SEA_training_class.csv"
datastream = pd.read_csv(url_data, header=None)
scaler = MinMaxScaler().fit(datastream)
datastream = pd.DataFrame(scaler.transform(datastream), columns=datastream.columns)
datastream["Class"] = pd.read_csv(url_labels, names=["Class"], header=None)

In [None]:
# 10% of Network Intrusion Detection (KDD Cup 1999)
# The class column is label-encoded, the object-typed columns and four named
# columns are dropped, and the remaining features are min-max scaled.
url = "http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data_10_percent.gz"
header = [
    "duration", "protocol_type", "service", "flag", "src_bytes", "dst_bytes", "land",
    "wrong_fragment", "urgent", "hot", "num_failed_logins", "logged_in", "num_compromised",
    "root_shell", "su_attempted", "num_root", "num_file_creations", "num_shells", "num_access_files",
    "num_outbound_cmds", "is_host_login", "is_guest_login", "count", "srv_count", "serror_rate",
    "srv_serror_rate", "rerror_rate", "srv_rerror_rate", "same_srv_rate", "diff_srv_rate",
    "srv_diff_host_rate", "dst_host_count", "dst_host_srv_count", "dst_host_same_srv_rate",
    "dst_host_diff_srv_rate", "dst_host_same_src_port_rate", "dst_host_srv_diff_host_rate",
    "dst_host_serror_rate", "dst_host_srv_serror_rate", "dst_host_rerror_rate",
    "dst_host_srv_rerror_rate", "Class",
]
datastream = pd.read_csv(url, names=header, header=None)

# Encode the labels before the object-typed columns (the label among them) are removed.
label_encoder = LabelEncoder()
classes = label_encoder.fit_transform(datastream["Class"])

datastream = (
    datastream
    .select_dtypes(exclude=["object"])
    .drop(columns=["land", "logged_in", "is_host_login", "is_guest_login"])
)
scaler = MinMaxScaler().fit(datastream)
datastream = pd.DataFrame(scaler.transform(datastream), columns=datastream.columns)
datastream["Class"] = classes

In [None]:
# Power Supply Data Stream
# The ARFF file is downloaded, parsed into a DataFrame, both attributes are
# min-max scaled and the nominal "class" column is replaced by an integer
# "Class" column.
url = "http://www.cse.fau.edu/~xqzhu/Stream/powersupply.arff"
# The response is used as a context manager so the connection is closed as
# soon as the payload has been read, even if decoding fails.
with urllib.request.urlopen(url) as ftpstream:
    arff_text = ftpstream.read().decode('utf-8')
data, meta = arff.loadarff(io.StringIO(arff_text))
datastream = pd.DataFrame(data)
datastream["attribute0"] = minmax_scale(datastream["attribute0"])
datastream["attribute1"] = minmax_scale(datastream["attribute1"])
datastream["Class"] = [int(obs) for obs in datastream["class"]]
datastream = datastream.drop(columns=["class"])

In [None]:
# Forest Cover Type (From: https://moa.cms.waikato.ac.nz/datasets/)
# Read from a local ARFF file; the "class" column is converted to integers,
# all object-typed columns are dropped and the labels are re-attached as "Class".
data = arff.loadarff("./datasets/covtypeNorm.arff")
datastream = pd.DataFrame(data[0])
labels = datastream["class"].astype(int)
datastream = datastream.select_dtypes(exclude="O").assign(Class=labels)

We can take a look at the first rows of the data stream and its description.

In [None]:
# Preview the first rows of the currently loaded data stream
datastream.head()

In [None]:
# Summary statistics of every numeric column of the currently loaded data stream
datastream.describe()

### 3.2 Clustering
This part contains the online and offline clustering of the loaded data stream.

In [None]:
%%time

# Parameters
OMEGA = 250
DELTA = 50
GAMMA = 50
BETA = 40
EPSILON = 0.2
µ = 0.5
DIM = len(datastream.iloc[0])-1  # every column except the trailing class label
print_step = 200

# Clusterer instance
c_online = WCDS(
    omega=OMEGA,
    delta=DELTA,
    gamma=GAMMA,
    epsilon=EPSILON,
    dimension=DIM,
    beta=BETA,
    µ=µ)

# Lists
# Rows are gathered in plain lists and turned into a DataFrame once at the end:
# appending to a DataFrame with .loc inside the loop copies the frame on every
# observation, which is quadratic in the stream length.
result_columns = ["Time Stamp", "Class", "Assigned Discriminator", "Number of Discriminators",
                  "Homogenity", "Completeness", "V-Measure"]
result_rows = []
assigned = []  # discriminator id returned by record() for every processed observation
discriminator_lifespan = dict()  # discriminator id -> [creation time, deletion time or None]
true_classes = datastream["Class"]

# Online clustering
try:
    for time_ in range(len(datastream)):
        # The last column holds the class label and is not passed to the algorithm.
        k, deleted_discriminators = c_online.record(list(datastream.iloc[time_])[:-1], time_)
        if k not in discriminator_lifespan:
            discriminator_lifespan[k] = [c_online.discriminators[k].creation_time, None]
        for id_ in deleted_discriminators:
            discriminator_lifespan[id_][1] = time_
        # Evaluate the up to OMEGA observations that precede the current one.
        window_start = max(time_-OMEGA, 0)
        current_evaluation = homogeneity_completeness_v_measure(true_classes[window_start:time_],
                                                                assigned[window_start:time_])
        assigned.append(k)
        result_rows.append([time_, true_classes[time_], k, len(c_online.discriminators),
                            current_evaluation[0], current_evaluation[1], current_evaluation[2]])
        if time_ % print_step == 0 and time_ > 0:
            print("Observation: {} #Discriminators: {} Cluster Measures: {}".format(time_, len(c_online), current_evaluation))
finally:
    # Built in a finally block so that an interrupted run still leaves the
    # partial results available to the saving and plotting cells.
    results = pd.DataFrame(result_rows, columns=result_columns)

In [None]:
# Prefix of the result CSV files and of the saved figure file names
NAME = "TCB"

In [None]:
# Save the results CSV files
# One file holds the per-observation results, the other one row per
# discriminator with its birth and death time.
results.to_csv("./results/" + NAME + "_results.csv", index_label=False)

discr_lifespan = pd.DataFrame(columns=["Discriminator ID", "Birth", "Death"])
for discr_id, (birth, death) in discriminator_lifespan.items():
    discr_lifespan.loc[len(discr_lifespan)] = [discr_id, birth, death]
discr_lifespan.to_csv("./results/" + NAME + "_discriminator_life_spans.csv", index_label=False)

In [None]:
# Load the results stored in the CSV files and rebuild the life span dictionary
results = pd.read_csv("./results/" + NAME + "_results.csv")
discr_lifespan = pd.read_csv("./results/" + NAME + "_discriminator_life_spans.csv")
discriminator_lifespan = dict()
for row in range(len(discr_lifespan)):
    recorded_death = discr_lifespan.at[row, "Death"]
    # A missing death time comes back from the CSV as NaN; it is mapped to None again.
    if np.isnan(recorded_death):
        recorded_death = None
    discriminator_lifespan[discr_lifespan.at[row, "Discriminator ID"]] = [discr_lifespan.at[row, "Birth"], recorded_death]

### 3.3 Results
Visualisations and results of the experiments.

| Data stream | #Instances | #Numerical Features | Omega | Delta | Gamma | Beta | Epsilon | Mu | Runtime Online |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| TCB | 200000 | 2 | 250 | 50 | 50 | 50 | 0.1 | 1 | 1h 36min 48s |
| Moving RBF | 200000 | 10 | 500 | 50 | 50 | 50 | 0.1 | 1 | 5h 30min 15s |
| NID | 494021 | 34 | 1000 | 50 | 50 | 68 | 0.1 | 1 | 57min 45s |
| FCT | 581012 | 10 | 1000 | 50 | 50 | 70 | 0.1 | 1 | ~28h |

In [None]:
# Set active discriminator's death to current time for plotting
# (time_ is the loop variable left behind by the online clustering cell)
for lifespan in discriminator_lifespan.values():
    death_time = lifespan[1]
    still_active = death_time is None or np.isnan(death_time)
    if still_active:
        lifespan[1] = time_

In [None]:
# Color management
def color_dict(i, shift=8765):
    """Return a reproducible pseudo-random hex colour for the label ``i``.

    The six hex digits are drawn from a random generator seeded with
    ``i + shift``, so equal labels always map to the same colour.

    Parameters
    ----------
    i : int
        Label to colour (class or discriminator id). NumPy integers are
        accepted as well.
    shift : int, optional
        Offset added to ``i`` before seeding; a different shift gives a
        different palette.

    Returns
    -------
    str
        Colour in the form ``"#RRGGBB"``.
    """
    seed = i + shift
    # NumPy integers are not valid seeds for the random module on
    # Python >= 3.11, so integer-like values are converted to a plain int.
    if hasattr(seed, "__index__"):
        seed = int(seed)
    # A private generator draws the same digits as seeding the module-level
    # generator would, but leaves the global random state untouched.
    generator = random.Random(seed)
    return "#" + "".join(generator.choices("0123456789ABCDEF", k=6))

def most_common(lst):
    """Return the element of ``lst`` with the highest number of occurrences.

    Raises ``ValueError`` for an empty list, because ``max`` is called on an
    empty set.
    """
    distinct_elements = set(lst)
    return max(distinct_elements, key=lambda element: lst.count(element))

In [None]:
# Calculate most absorbed class and thus color for every discriminator
# Both columns are viewed as arrays so the observations of one discriminator
# can be picked with a boolean mask instead of a Python-level scan over every
# row for every discriminator.
assigned_ids = results["Assigned Discriminator"].to_numpy()
observed_classes = datastream["Class"].to_numpy()[:len(results)]

discr_colors_by_id = dict()
for discr_id in np.unique(assigned_ids):
    # tolist() yields plain Python scalars, which color_dict can use as a seed
    # on every Python version.
    classes = observed_classes[assigned_ids == discr_id].tolist()
    discr_colors_by_id[discr_id] = color_dict(most_common(classes))

Single plots:

In [None]:
# Discriminator life spans
%matplotlib inline

# One horizontal line per discriminator, spanning its birth to its death and
# drawn in the colour of the class it absorbed most often.
plt.figure(dpi=200)
for discr_id, lifespan in discriminator_lifespan.items():
    alive = range(int(lifespan[0]), int(lifespan[1]))
    plt.plot(alive, [discr_id] * len(alive), color=discr_colors_by_id[discr_id])
plt.xlabel("Time")
plt.ylabel("Discriminator ID")
plt.savefig("{}_{}_{}_DLS.png".format(NAME, OMEGA, time_))

In [None]:
# Class life spans
%matplotlib inline

# One row of dots per class, marking the time steps (up to time_) at which an
# observation of that class arrived.
plt.figure(dpi=200)
seen_classes = datastream["Class"][:time_]
for label in np.unique(datastream["Class"]):
    arrival_times = [step for step, value in enumerate(seen_classes) if value == label]
    plt.scatter(arrival_times, [label] * len(arrival_times), s=1, color=color_dict(label))
plt.xlabel("Time")
plt.ylabel("Class")
plt.savefig("{}_{}_{}_CLS.png".format(NAME, OMEGA, time_))

In [None]:
# Purity
%matplotlib inline

# The curve labelled "Purity" is the "Homogenity" column of the results.
plt.figure(dpi=200)
purity_curve = list(results["Homogenity"][:time_])
plt.plot(range(time_), purity_curve)
plt.xlabel("Time")
plt.ylabel("Purity")
plt.savefig("{}_{}_{}_P.png".format(NAME, OMEGA, time_))

In [None]:
# #Discriminators vs #Classes in sliding window
%matplotlib inline

# Left axis: number of discriminators; right axis: number of distinct classes
# among the up to OMEGA observations preceding each time step.
fig, ax1 = plt.subplots(dpi=200)
ax1.plot(results["Number of Discriminators"][:time_], color="blue", alpha=.5)
ax2 = ax1.twinx()
classes_in_window = [
    len(np.unique(datastream["Class"][max(0, end - OMEGA):end]))
    for end in range(time_)
]
ax2.plot(classes_in_window, color="orange", alpha=.5)
ax1.set_xlabel("Time")
ax1.set_ylabel("# Discriminators", color="blue", alpha=.5)
ax2.set_ylabel("# Classes in SW", color="orange", alpha=.5)
plt.savefig("{}_{}_{}_DC.png".format(NAME, OMEGA, time_))

Joint plots:

In [None]:
# Class life spans and discriminator life spans
%matplotlib inline

# Upper panel: discriminator life spans; lower panel: class life spans.
fig, axs = plt.subplots(2, dpi=200)
lifespan_ax, class_ax = axs[0], axs[1]
class_ax.set_xlabel("Time")
class_ax.set_ylabel("Class")
lifespan_ax.set_ylabel("Discriminator ID")

for discr_id, lifespan in discriminator_lifespan.items():
    alive = range(int(lifespan[0]), int(lifespan[1]))
    lifespan_ax.plot(alive, [discr_id] * len(alive), color=discr_colors_by_id[discr_id])

seen_classes = datastream["Class"][:time_]
for label in np.unique(datastream["Class"]):
    arrival_times = [step for step, value in enumerate(seen_classes) if value == label]
    class_ax.scatter(arrival_times, [label] * len(arrival_times), s=1, color=color_dict(label))

fig.savefig("{}_{}_{}_CLSDLS.png".format(NAME, OMEGA, time_))

In [None]:
# #Discriminators, #Classes, Purity
%matplotlib inline

# Upper panel: number of discriminators (left axis) and number of classes in
# the sliding window (right axis); lower panel: the "Homogenity" column,
# labelled as purity.
fig, axs = plt.subplots(2, dpi=200)
ax3 = axs[0].twinx()

axs[0].set_ylabel("# Discriminators", color="blue", alpha=.5)
ax3.set_ylabel("# Classes in SW", color="orange", alpha=.5)
axs[1].set_xlabel("Time")
axs[1].set_ylabel("Purity")

discriminator_counts = results["Number of Discriminators"][:time_]
classes_in_window = [
    len(np.unique(datastream["Class"][max(0, end - OMEGA):end]))
    for end in range(time_)
]
axs[0].plot(discriminator_counts, color="blue", alpha=.5)
ax3.plot(classes_in_window, color="orange", alpha=.5)
axs[1].plot(range(0, len(discriminator_counts)), list(results["Homogenity"][:time_]))

fig.savefig("{}_{}_{}_DCP.png".format(NAME, OMEGA, time_))

Live Plot:

In [None]:
# Live plot of the sliding windows, only works for 2D data streams like TCB
%matplotlib notebook

# Upper bound on the observations that are animated
MAX = 10000

fig = plt.figure(figsize=(10,10))
ax = fig.add_subplot(111)
plt.ion()
ax.set_xlim(0,1)
ax.set_ylim(0,1)
fig.show()
fig.canvas.draw()

# Show one window of OMEGA consecutive observations at a time.
for window in range(0, MAX-OMEGA, OMEGA):
    current = datastream.iloc[window:window+OMEGA]
    if current.empty:
        # The stream is shorter than MAX; there is nothing left to draw.
        break
    ax.clear()
    # clear() also resets the limits, so the unit square is restored each time.
    ax.set_xlim(0,1)
    ax.set_ylim(0,1)
    # Iterating over the column yields plain Python scalars, which color_dict
    # can use as a seed on every Python version, and keeps the colours aligned
    # with the points when the last window is shorter than OMEGA.
    ax.scatter(current["X"], current["Y"], color=[color_dict(label) for label in current["Class"]])
    fig.canvas.draw()
    time.sleep(.5)