# HST benchmark using `skmultiflow`

In this notebook, we perform the same analysis as in the [previous notebook](./07_creme044_benchmark.ipynb), this time using the library `scikit-multiflow`. We find nothing different.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from skmultiflow.evaluation import EvaluatePrequential

import h5py
import time

from skmultiflow.data import DataStream
from skmultiflow.anomaly_detection import HalfSpaceTrees

from sklearn.metrics import roc_auc_score

In [2]:
# Read in the http data
arrays = {}
f = h5py.File("http.mat")
for k,v in f.items():
    arrays[k] = np.array(v)

In [3]:
X = pd.DataFrame(arrays["X"]).T
y = pd.DataFrame(arrays["y"]).astype(int).T

In [4]:
X.dtypes

0    float64
1    float64
2    float64
dtype: object

In [5]:
y.dtypes

0    int32
dtype: object

In [6]:
X.shape

(567498, 3)

In [7]:
stream = DataStream(data=X, y=y)
stream.prepare_for_use()

In [8]:
hst = HalfSpaceTrees(n_features=3)

In [9]:
counter = 0
y_true = []
y_pred = []
start_time = time.time()
while stream.has_more_samples():
    if counter % 10000 == 0 and counter > 1:
        print(f"iteration: {counter}; time elapsed: {time.time()-start_time}")
        if np.mean(y_true) != 0:
            print(f"rocauc: {roc_auc_score(y_true, y_pred)}")
        else:
            print(f"rocauc: have only encountered one class")
    X, y = stream.next_sample()
    if counter > 250:
        yhat = hst.predict_proba(X)[0][0]
#         print(yhat)
        y_pred.append(yhat)
        y_true.append(y)
    hst.partial_fit(X, y)
    counter += 1

iteration: 10000; time elapsed: 46.37686800956726
rocauc: have only encountered one class
iteration: 20000; time elapsed: 83.83153009414673
rocauc: have only encountered one class
iteration: 30000; time elapsed: 119.3799216747284
rocauc: have only encountered one class
iteration: 40000; time elapsed: 151.83430290222168
rocauc: have only encountered one class
iteration: 50000; time elapsed: 188.51997900009155
rocauc: have only encountered one class
iteration: 60000; time elapsed: 224.9317181110382
rocauc: have only encountered one class
iteration: 70000; time elapsed: 261.61531949043274
rocauc: have only encountered one class
iteration: 80000; time elapsed: 297.1365633010864
rocauc: have only encountered one class
iteration: 90000; time elapsed: 333.0256862640381
rocauc: have only encountered one class
iteration: 100000; time elapsed: 369.0213453769684
rocauc: have only encountered one class
iteration: 110000; time elapsed: 406.15833926200867
rocauc: have only encountered one class
iter

In [10]:
roc_auc_score(y_true, y_pred)

0.4643603341096978

# With improper preprocessing:
We repeat the analysis, but this time we will rescale and shuffle the dataset before streaming it to our model:

In [14]:
X = pd.DataFrame(arrays["X"]).T
y = pd.DataFrame(arrays["y"]).astype(int).T

In [15]:
X["label"] = y

In [16]:
X

Unnamed: 0,0,1,2,label
0,-2.302585,5.371103,10.716107,0
1,-2.302585,5.088213,8.418058,0
2,-2.302585,5.464255,7.113224,0
3,-2.302585,5.451468,7.616825,0
4,-2.302585,5.476882,6.186414,0
...,...,...,...,...
567493,-2.302585,5.357058,7.735477,0
567494,-2.302585,5.389528,5.464255,0
567495,-2.302585,5.384954,8.191491,0
567496,-2.302585,5.389528,7.118097,0


In [19]:
X_shuffled = X.sample(frac=1).reset_index(drop=True)

In [20]:
from sklearn.preprocessing import StandardScaler

In [21]:
ss = StandardScaler()

In [22]:
X_shuffled[[0,1,2]] = ss.fit_transform(X_shuffled[[0,1,2]])

In [23]:
stream = DataStream(data=X_shuffled[[0,1,2]], y=X_shuffled[["label"]])
stream.prepare_for_use()
hst = HalfSpaceTrees(n_features=3, size_limit=25)
counter = 0
y_true = []
y_pred = []
start_time = time.time()
while stream.has_more_samples():
    if counter % 10000 == 0 and counter > 0:
        print(f"iteration: {counter}; time elapsed: {time.time()-start_time}; roc_auc: {roc_auc_score(y_true, y_pred)}")
    X, y = stream.next_sample()
    if counter > 250:
#         print(hst.predict_proba(X)[0][0])
        yhat = hst.predict_proba(X)[0][0]
        y_pred.append(yhat)
        y_true.append(y)
    hst.partial_fit(X, y)
    counter += 1

iteration: 10000; time elapsed: 42.48803210258484; roc_auc: 0.026033550024489066
iteration: 20000; time elapsed: 80.49550080299377; roc_auc: 0.013527088957079839
iteration: 30000; time elapsed: 116.57068657875061; roc_auc: 0.009280169371912497
iteration: 40000; time elapsed: 156.4720811843872; roc_auc: 0.007274025197702383
iteration: 50000; time elapsed: 193.39114546775818; roc_auc: 0.006152062647445824
iteration: 60000; time elapsed: 228.73001670837402; roc_auc: 0.005323667463357042
iteration: 70000; time elapsed: 266.105925321579; roc_auc: 0.008033486297920993
iteration: 80000; time elapsed: 308.31938648223877; roc_auc: 0.009734251574895134
iteration: 90000; time elapsed: 351.24908232688904; roc_auc: 0.008788051341061209
iteration: 100000; time elapsed: 390.1844992637634; roc_auc: 0.007956055548902038
iteration: 110000; time elapsed: 430.8786463737488; roc_auc: 0.00718191438055675
iteration: 120000; time elapsed: 467.90228056907654; roc_auc: 0.006698625044817612
iteration: 130000; ti

In [24]:
roc_auc_score(y_true, y_pred)

0.003648572598435957