# Libraries

In [1]:
# Libraries
import numpy as np
import pandas as pd
from tqdm import tqdm

import torch

import scipy.io

from sklearn.model_selection import train_test_split

# Single seed shared by every source of randomness in the notebook
# (NumPy, PyTorch and the stratified sampling step).
SEED = 0

np.random.seed(SEED)
torch.manual_seed(SEED)

# Datasets

In [2]:
def get_data(dataset):
    """Load one benchmark dataset as a DataFrame with the label in column 'y'.

    'KDD99', 'NSL', 'UNSW', 'DoS' and 'Syn' are read from a comma-separated
    feature file plus a separate label file under ./data. Any other name is
    treated as the stem of ./data/<dataset>.mat, which must contain the
    arrays 'X' (features) and 'y' (labels).

    Parameters
    ----------
    dataset : str
        Dataset identifier.

    Returns
    -------
    pandas.DataFrame
        Feature columns followed by the label column 'y'.
    """
    # dataset name -> (feature file, label file)
    text_sources = {
        'KDD99': ('./data/kdd.txt', './data/kddlabel.txt'),
        'NSL': ('./data/nsl.txt', './data/nsllabel.txt'),
        'UNSW': ('./data/unsw.txt', './data/unswlabel.txt'),
        'DoS': ('./data/dos.txt', './data/doslabel.txt'),
        'Syn': ('./data/syn.txt', './data/synlabel.txt'),
    }

    if dataset not in text_sources:
        # Everything else is expected to be a MATLAB file.
        mat = scipy.io.loadmat(f"./data/{dataset}.mat")
        frame = pd.DataFrame(mat['X'])
        frame['y'] = mat['y']
        return frame

    feature_path, label_path = text_sources[dataset]
    frame = pd.read_table(feature_path, delimiter=',', header=None)
    frame['y'] = np.loadtxt(label_path, delimiter=',')
    if dataset == 'KDD99':
        # The KDD99 label file is stored inverted relative to the other sets.
        frame['y'] = 1 - frame['y']
    return frame

In [3]:
# Dataset to evaluate. Supported names:
# ['KDD99','NSL','UNSW','DoS','Syn','ionosphere','cardio','satellite','satimage-2','mammography','pima','cover']
dataset = 'cover'
# Maximum number of rows fed to the models; larger datasets are down-sampled.
sample_size = 10000

df = get_data(dataset)
if len(df) <= sample_size:
    # Small enough already: use every row, in file order.
    random_sample = df
else:
    # An integer test_size requests exactly `sample_size` rows. A fractional
    # test_size is rounded up by scikit-learn, so floating-point error in
    # sample_size/len(df) could return one row too many.
    # The split is stratified on 'y' so the sample keeps the anomaly rate.
    _, random_sample = train_test_split(df, test_size=sample_size, stratify=df['y'], random_state=SEED)

In [4]:
# Full dataset: dimensions, a blank line, then the share of each label value.
print(df.shape, end="\n\n")
print(df['y'].value_counts() / len(df))

(286048, 11)

y
0    0.990397
1    0.009603
Name: count, dtype: float64


In [5]:
# Sampled subset: dimensions, a blank line, then the share of each label value.
print(random_sample.shape, end="\n\n")
print(random_sample['y'].value_counts() / len(random_sample))

(10000, 11)

y
0    0.9904
1    0.0096
Name: count, dtype: float64


In [6]:
def preprocess(df):
    """Split a labelled frame into a feature matrix and a label vector.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the label in column 'y'; every other column is a feature.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X, y)`` where ``X`` has one column per non-label column of ``df``
        and ``y`` holds the values of column 'y'.
    """
    n_features = df.shape[1] - 1
    features = df.drop(columns=['y']).to_numpy()
    labels = df['y'].to_numpy()
    return features.reshape(-1, n_features), labels

# Feature matrix and label vector of the sampled rows; every model below is
# evaluated on these two arrays.
X_all, y_all = preprocess(random_sample)

# Models

In [7]:
# Accumulates one dict per evaluated model ('model', 'metric', <dataset>: AUC);
# the Results section turns this list into a DataFrame.
rslt = []

## MemStream

In [8]:
import torch.nn as nn
import numpy as np
from torch.utils.data import DataLoader
from sklearn.metrics import roc_auc_score
from torch.autograd import Variable

In [9]:
# Device on which MemStream places its tensors and sub-modules (CPU only here).
device = torch.device('cpu')

In [10]:
# from https://github.com/Stream-AD/MemStream?tab=readme-ov-file
# file: memstream.py
class MemStream(nn.Module):
    """Memory-based streaming anomaly scorer built on a small autoencoder.

    A one-layer encoder (Linear + Tanh) and a linear decoder are trained on a
    warm-up batch. The encodings of that batch form a fixed-size memory. Each
    incoming record is scored by the smallest L1 distance between its encoding
    and any memory row; records scoring at or below ``beta`` overwrite memory
    rows cyclically, so the memory follows the stream.

    The class reads the notebook-level ``device`` variable.

    Expected call order: set ``mem_data`` to the warm-up batch, then
    ``train_autoencoder``, then ``initialize_memory``, then ``forward`` per
    record. ``forward`` needs the ``mean``/``std`` set by ``train_autoencoder``.
    """

    def __init__(self, in_dim, params):
        """Build the encoder/decoder and allocate placeholder memory.

        Parameters
        ----------
        in_dim : int
            Number of input features.
        params : dict
            Must contain 'memory_len' (number of memory rows), 'beta'
            (memory-update threshold) and 'lr' (Adam learning rate).
        """
        super(MemStream, self).__init__()
        self.params = params
        self.in_dim = in_dim
        self.out_dim = in_dim*2  # encoding is twice as wide as the input
        self.memory_len = params['memory_len']
        self.max_thres = torch.tensor(params['beta']).to(device)
        # Random placeholders; both are replaced by initialize_memory().
        self.memory = torch.randn(self.memory_len, self.out_dim).to(device)
        self.mem_data = torch.randn(self.memory_len, self.in_dim).to(device)
        self.memory.requires_grad = False
        self.mem_data.requires_grad = False
        # Set from 'memory_len' (not 'batch_size'); not read elsewhere in this class.
        self.batch_size = params['memory_len']
        self.num_mem_update = 0
        self.encoder = nn.Sequential(
            nn.Linear(self.in_dim, self.out_dim),
            nn.Tanh(),
        ).to(device)
        self.decoder = nn.Sequential(
            nn.Linear(self.out_dim, self.in_dim)
        ).to(device)
        self.clock = 0
        self.last_update = -1
        self.optimizer = torch.optim.Adam(self.parameters(), lr=params['lr'])
        self.loss_fn = nn.MSELoss()
        self.count = 0  # number of memory writes so far; drives the cyclic slot index

    def train_autoencoder(self, data, epochs):
        """Fit the autoencoder to ``data`` with full-batch Adam steps.

        ``data`` is standardised with the column statistics of the current
        ``mem_data``; those statistics are stored in ``self.mean`` and
        ``self.std`` for later use by ``forward``. Autograd must be enabled
        when this is called.

        Parameters
        ----------
        data : torch.Tensor
            Warm-up records, one row per record.
        epochs : int
            Number of optimisation steps.
        """
        self.mean, self.std = self.mem_data.mean(0), self.mem_data.std(0)
        new = (data - self.mean) / self.std
        # Constant columns divide by zero above; pin them to 0 instead of NaN/inf.
        new[:, self.std == 0] = 0
        for epoch in range(epochs):
            self.optimizer.zero_grad()
            # Denoising objective: reconstruct the clean input from a slightly
            # perturbed copy.
            output = self.decoder(self.encoder(new + 0.001*torch.randn_like(new).to(device)))
            loss = self.loss_fn(output, new)
            loss.backward()
            self.optimizer.step()

    def update_memory(self, output_loss, encoder_output, data):
        """Write a record into memory if its score is within the threshold.

        Parameters
        ----------
        output_loss : torch.Tensor
            Score of the record as computed by ``forward``.
        encoder_output : torch.Tensor
            Encoding of the record.
        data : torch.Tensor
            The raw (unstandardised) record.

        Returns
        -------
        int
            1 if the memory was updated, 0 otherwise.
        """
        if output_loss <= self.max_thres:
            # Slots are overwritten in write order (cyclic index).
            least_used_pos = self.count%self.memory_len
            self.memory[least_used_pos] = encoder_output
            self.mem_data[least_used_pos] = data
            # Refresh the normalisation statistics from the updated raw memory.
            self.mean, self.std = self.mem_data.mean(0), self.mem_data.std(0)
            self.count += 1
            return 1
        return 0

    def initialize_memory(self, x):
        """Fill the memory with the encodings of ``x`` and keep ``x`` as raw memory.

        ``x`` is standardised with the column statistics of this instance's
        current ``mem_data``.

        Parameters
        ----------
        x : torch.Tensor
            Warm-up records, one row per record. Stored by reference as
            ``mem_data``, so later memory updates write into this tensor.
        """
        # Use this instance's statistics, not those of a notebook global.
        mean, std = self.mem_data.mean(0), self.mem_data.std(0)
        new = (x - mean) / std
        new[:, std == 0] = 0
        # Detach so the memory is a plain tensor even when autograd is enabled;
        # requires_grad cannot be cleared on a non-leaf tensor.
        self.memory = self.encoder(new).detach()
        self.memory.requires_grad = False
        self.mem_data = x

    def forward(self, x):
        """Score one incoming record and possibly absorb it into memory.

        Parameters
        ----------
        x : torch.Tensor
            The record to score. Assumes a single record per call (the
            notebook uses batch_size 1) — TODO confirm for larger batches.

        Returns
        -------
        torch.Tensor
            Smallest L1 distance between the record's encoding and any memory
            row; larger means more anomalous.
        """
        new = (x - self.mean) / self.std
        new[:, self.std == 0] = 0
        encoder_output = self.encoder(new)
        loss_values = torch.norm(self.memory - encoder_output, dim=1, p=1).min()
        self.update_memory(loss_values, encoder_output, x)
        return loss_values

In [11]:
# Autoencoder training schedule, shared by all datasets.
epochs = 5000
lr = 1e-2

# Per-dataset MemStream settings: dataset name -> (beta, memory length).
memstream_settings = {
    'KDD99': (1.0, 256),
    'NSL': (0.1, 2048),
    'UNSW': (0.1, 2048),
    'DoS': (0.1, 2048),
    'Syn': (1.0, 16),
    'ionosphere': (0.001, 4),
    'cardio': (1.0, 64),
    'satellite': (0.01, 32),
    'satimage-2': (10.0, 256),
    'mammography': (0.1, 128),
    'pima': (0.001, 64),
    'cover': (0.0001, 2048),
}

# Fail loudly for an unknown dataset instead of leaving beta/memlen undefined
# (or silently reusing values left in the kernel by an earlier run).
if dataset not in memstream_settings:
    raise ValueError(f"No MemStream hyperparameters defined for dataset {dataset!r}")
beta, memlen = memstream_settings[dataset]

params = {
    'beta': beta,
    'memory_len': memlen,  # N
    'batch_size': 1,
    'lr': lr,
}

In [12]:
X = torch.FloatTensor(X_all)
y = y_all

model = MemStream(X.shape[1], params).to(device)
model.max_thres = model.max_thres.float()
batch_size = params['batch_size']

data_loader = DataLoader(X, batch_size=batch_size)

# Warm-up set: the first `memlen` normal (label 0) records of the stream.
init_data = X[y == 0][:memlen].to(device)
model.mem_data = init_data

# Scope the autograd mode with context managers so the kernel-wide setting is
# not left switched off after this cell.
with torch.enable_grad():
    model.train_autoencoder(init_data, epochs=epochs)

with torch.no_grad():
    model.initialize_memory(init_data)
    # Score the whole stream record by record; the model updates its memory
    # as it goes, so the order of `data_loader` matters.
    err = [model(data.to(device)) for data in data_loader]

In [13]:
# Gather the per-record MemStream scores and compare them with the labels.
scores = np.array([record_score.cpu() for record_score in err])
auc = np.round(roc_auc_score(y, scores), 4)

# Log the score under the current dataset's column name.
rslt.append({'model': 'MemStream', 'metric': 'AUC', dataset: auc})

## PySad Models

In [14]:
from pysad.models import ExactStorm
from pysad.models import HalfSpaceTrees
from pysad.models import IForestASD
from pysad.models import RSHash
from pysad.models import RobustRandomCutForest
from pysad.models import LODA
from pysad.models import KitNet

from pysad.utils import ArrayStreamer
from pysad.evaluation import AUROCMetric

In [15]:
def run_pysad_model(X_all, y_all, model):
    """Stream the data through a PySAD model and return its AUROC.

    Records are visited in their given order (no shuffling). For each record
    the model is first updated with it and then asked to score it. Records
    whose score is NaN are left out of the metric.

    Parameters
    ----------
    X_all : array-like
        Feature matrix, one row per record.
    y_all : array-like
        Labels aligned with ``X_all``.
    model
        Object exposing ``fit_partial(X)`` and ``score_partial(X)``.

    Returns
    -------
    float
        AUROC over the scored records, rounded to three decimals.
    """
    iterator = ArrayStreamer(shuffle=False)  # simulates a stream over the arrays
    auroc = AUROCMetric()  # running area under the ROC curve

    for features, label in tqdm(iterator.iter(X_all, y_all)):
        # Fit & score.
        model.fit_partial(features)
        score = model.score_partial(features)

        # `not` (rather than bitwise `~`) gives a logical negation for numpy
        # and plain Python booleans alike.
        if not np.isnan(score):
            auroc.update(label, score)

    return np.round(auroc.get(), 3)

In [16]:
# STORM
model = ExactStorm(
    window_size=10000,
    max_radius=0.1,
)
auc = run_pysad_model(X_all, y_all, model)

# Log the score under the current dataset's column name.
rslt.append({'model': 'ExactStorm', 'metric': 'AUC', dataset: auc})

10000it [00:23, 424.45it/s]


In [17]:
# HS-Tree
# Feature bounds are taken per column from the sampled data.
model = HalfSpaceTrees(
    window_size=100,
    num_trees=25,
    max_depth=15,
    initial_window_X=None,
    feature_mins=np.min(X_all, axis=0),
    feature_maxes=np.max(X_all, axis=0),
)
auc = run_pysad_model(X_all, y_all, model)

# Log the score under the current dataset's column name.
rslt.append({'model': 'HalfSpaceTrees', 'metric': 'AUC', dataset: auc})

10000it [01:25, 117.24it/s]


In [18]:
# iForestASD
model = IForestASD(
    window_size=100,
    initial_window_X=None,
)
auc = run_pysad_model(X_all, y_all, model)

# Log the score under the current dataset's column name.
rslt.append({'model': 'IForestASD', 'metric': 'AUC', dataset: auc})

10000it [00:47, 209.11it/s]


In [19]:
# RSHash
# Feature bounds are taken per column from the sampled data.
model = RSHash(
    sampling_points=1000,
    decay=0.015,
    num_components=100,
    num_hash_fns=1,
    feature_mins=np.min(X_all, axis=0),
    feature_maxes=np.max(X_all, axis=0),
)
auc = run_pysad_model(X_all, y_all, model)

# Log the score under the current dataset's column name.
rslt.append({'model': 'RSHash', 'metric': 'AUC', dataset: auc})

10000it [00:47, 210.47it/s]


In [20]:
# RCF
model = RobustRandomCutForest(
    num_trees=4,
    shingle_size=4,
    tree_size=256,
)
auc = run_pysad_model(X_all, y_all, model)

# Log the score under the current dataset's column name.
rslt.append({'model': 'RobustRandomCutForest', 'metric': 'AUC', dataset: auc})

10000it [00:26, 371.81it/s]


In [21]:
# LODA
model = LODA(
    num_bins=10,
    num_random_cuts=100,
)
auc = run_pysad_model(X_all, y_all, model)

# Log the score under the current dataset's column name.
rslt.append({'model': 'LODA', 'metric': 'AUC', dataset: auc})

10000it [02:09, 77.09it/s]


In [22]:
# Kitsune
# Both grace periods are set to 10% of the stream length.
model = KitNet(
    max_size_ae=10,
    learning_rate=0.1,
    hidden_ratio=0.75,
    grace_feature_mapping=np.round(0.1*len(y_all)),
    grace_anomaly_detector=np.round(0.1*len(y_all)),
)

try:
    auc = run_pysad_model(X_all, y_all, model)
except Exception as exc:
    # Keep the notebook running and record a missing score, but report why.
    # Catching Exception (not a bare except) lets KeyboardInterrupt and
    # SystemExit stop the run as usual.
    print(f"Kitsune failed: {type(exc).__name__}: {exc}")
    auc = None

rslt.append({
    'model': 'Kitsune',
    'metric': 'AUC',
    dataset: auc,
})

  x = (x - self.norm_min) / (self.norm_max - self.norm_min + 0.0000000000000001)
1274it [00:00, 12723.57it/s]

Feature-Mapper: train-mode, Anomaly-Detector: off-mode
The Feature-Mapper found a mapping: 10 features to 1 autoencoders.
Feature-Mapper: execute-mode, Anomaly-Detector: train-mode


3378it [00:00, 6900.77it/s] 

Feature-Mapper: execute-mode, Anomaly-Detector: exeute-mode


10000it [00:01, 7215.98it/s]


# Results

In [23]:
# One row per model; the AUC column is named after the dataset and shown
# with three decimals.
rslt_df = pd.DataFrame(rslt)
rslt_df[dataset] = rslt_df[dataset].round(3)
rslt_df

Unnamed: 0,model,metric,cover
0,MemStream,AUC,0.97
1,ExactStorm,AUC,0.5
2,HalfSpaceTrees,AUC,0.745
3,IForestASD,AUC,0.848
4,RSHash,AUC,0.384
5,RobustRandomCutForest,AUC,0.787
6,LODA,AUC,0.5
7,Kitsune,AUC,0.701
