In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from skmultiflow.data import DataStream
from skmultiflow.anomaly_detection import HalfSpaceTrees
from skmultiflow.evaluation import EvaluatePrequential

import h5py
import time

In [2]:
# Read in the http data
arrays = {}
f = h5py.File("http.mat")
for k,v in f.items():
    arrays[k] = np.array(v)

In [3]:
X = pd.DataFrame(arrays["X"]).T
y = pd.DataFrame(arrays["y"]).astype(int).T

In [4]:
X.dtypes

0    float64
1    float64
2    float64
dtype: object

In [5]:
y.dtypes

0    int32
dtype: object

In [6]:
X.shape

(567498, 3)

In [7]:
stream = DataStream(data=X, y=y)
stream.prepare_for_use()

In [8]:
hst = HalfSpaceTrees(n_features=3)

In [9]:
counter = 0
y_true = []
y_pred = []
start_time = time.time()
while stream.has_more_samples():
    if counter % 10000 == 0:
        print(f"iteration: {counter}; time elapsed: {time.time()-start_time}")
    X, y = stream.next_sample()
    if counter > 250:
        yhat = hst.predict(X)
        y_pred.append(yhat)
        y_true.append(y)
    hst.partial_fit(X, y)
    counter += 1

iteration: 0; time elapsed: 0.0
iteration: 10000; time elapsed: 25.80697250366211
iteration: 20000; time elapsed: 47.38527989387512
iteration: 30000; time elapsed: 68.49681162834167
iteration: 40000; time elapsed: 89.55150318145752
iteration: 50000; time elapsed: 110.79966402053833
iteration: 60000; time elapsed: 131.84537029266357
iteration: 70000; time elapsed: 153.3149447441101
iteration: 80000; time elapsed: 174.35140323638916
iteration: 90000; time elapsed: 195.24612712860107
iteration: 100000; time elapsed: 216.37684059143066
iteration: 110000; time elapsed: 237.93442678451538
iteration: 120000; time elapsed: 261.2869610786438
iteration: 130000; time elapsed: 283.9838194847107
iteration: 140000; time elapsed: 305.0800642967224
iteration: 150000; time elapsed: 326.8689315319061
iteration: 160000; time elapsed: 347.88043117523193
iteration: 170000; time elapsed: 369.1109838485718
iteration: 180000; time elapsed: 390.21863174438477
iteration: 190000; time elapsed: 411.49627113342285

In [11]:
from sklearn.metrics import roc_auc_score

In [12]:
roc_auc_score(y_true, y_pred)

0.5042477467020117