# Libraries

In [34]:
# Libraries
import numpy as np
import pandas as pd
from tqdm import tqdm

import torch

import scipy.io

from sklearn.model_selection import train_test_split

SEED = 0

torch.manual_seed(SEED)
np.random.seed(SEED)

# Datasets

In [35]:
def get_data(dataset):
    """Load the requested dataset as a DataFrame with its labels in column ``'y'``.

    Only ``'NSL'`` is recognised: the features are read from ``./data/nsl.txt``
    (comma-separated, no header row) and the labels from
    ``./data/nsllabel.txt``. Any other name yields ``None``.
    """
    # Guard clause: unknown names produce no frame at all.
    if dataset != 'NSL':
        return None

    frame = pd.read_table('./data/nsl.txt', sep=',', header=None)
    frame['y'] = np.loadtxt('./data/nsllabel.txt', delimiter=',')
    return frame

In [36]:
# Key passed to get_data(); the same string is later used as the name of the
# AUC column in the result records.
dataset = 'NSL'
df = get_data(dataset)

# get_data() returns None for names it does not recognise. Stop here with a
# clear message instead of failing further down with an AttributeError.
if df is None:
    raise ValueError(f"Unknown dataset: {dataset!r}")

In [37]:
# Frame dimensions, a blank separator line, then the fraction of rows per label value.
print(df.shape, '', df['y'].value_counts() / len(df), sep='\n')

(125973, 127)

y
0.0    0.534583
1.0    0.465417
Name: count, dtype: float64


In [38]:
def preprocess(df):
    """Split a labelled frame into a feature matrix and a label vector.

    Returns ``(X, y)``: ``X`` holds every column except ``'y'`` as a 2-D array
    with ``df.shape[1] - 1`` columns, and ``y`` is the ``'y'`` column as a 1-D
    array. The input frame is not modified.
    """
    n_features = df.shape[1] - 1
    features = df.drop(columns=['y']).to_numpy()
    labels = df['y'].to_numpy()
    return features.reshape(-1, n_features), labels

# Feature matrix and label vector for the whole dataset.
X_all, y_all = preprocess(df)

# Models

In [39]:
# Collects one result record (a dict) per evaluated memory length.
rslt: list = []

## MemStream

In [40]:
import torch.nn as nn
import numpy as np
from torch.utils.data import DataLoader
from sklearn.metrics import roc_auc_score
from torch.autograd import Variable

In [41]:
# Target of the `.to(device)` calls in this notebook: the CPU.
device = torch.device("cpu")

In [42]:
# Source: https://github.com/Stream-AD/MemStream
# (file: memstream.py)
class MemStream(nn.Module):
    """Memory-based streaming anomaly scorer built on a one-layer autoencoder.

    The encoder is ``Linear(in_dim, 2*in_dim)`` followed by ``Tanh``; the
    decoder is ``Linear(2*in_dim, in_dim)``. Two buffers are kept side by side:

    * ``memory``   -- ``(memory_len, out_dim)`` encoded records,
    * ``mem_data`` -- ``(memory_len, in_dim)`` the matching raw records, used to
      derive the per-feature ``mean`` / ``std`` for input normalisation.

    A record's score is the smallest L1 distance between its encoding and any
    row of ``memory``. Records scoring at most ``params['beta']`` overwrite
    memory slots in round-robin order.

    The class places its tensors on the module-level ``device``.
    """

    def __init__(self, in_dim, params):
        """Build the autoencoder, its optimizer, and placeholder memory buffers.

        Args:
            in_dim: Number of input features per record.
            params: Mapping with keys ``'memory_len'`` (number of memory
                slots), ``'beta'`` (memory-update threshold) and ``'lr'``
                (Adam learning rate).
        """
        super().__init__()
        self.params = params
        self.in_dim = in_dim
        self.out_dim = in_dim*2
        self.memory_len = params['memory_len']
        self.max_thres = torch.tensor(params['beta']).to(device)
        # Random placeholders (freshly created tensors never require grad).
        # The creation order is kept so the sequence of RNG draws is unchanged.
        self.memory = torch.randn(self.memory_len, self.out_dim).to(device)
        self.mem_data = torch.randn(self.memory_len, self.in_dim).to(device)
        self.batch_size = params['memory_len']
        self.num_mem_update = 0
        self.encoder = nn.Sequential(
            nn.Linear(self.in_dim, self.out_dim),
            nn.Tanh(),
        ).to(device)
        self.decoder = nn.Sequential(
            nn.Linear(self.out_dim, self.in_dim)
        ).to(device)
        self.clock = 0
        self.last_update = -1
        self.optimizer = torch.optim.Adam(self.parameters(), lr=params['lr'])
        self.loss_fn = nn.MSELoss()
        # Number of accepted memory updates; selects the next slot to overwrite.
        self.count = 0

    @staticmethod
    def _standardize(x, mean, std):
        """Z-score ``x`` column-wise; columns whose ``std`` is exactly 0 become 0."""
        scaled = (x - mean) / std
        # Dividing by a zero std yields nan/inf; zero those columns out.
        scaled[:, std == 0] = 0
        return scaled

    def train_autoencoder(self, data, epochs):
        """Fit the encoder/decoder to reconstruct ``data`` (denoising objective).

        ``mean`` / ``std`` are refreshed from the current ``mem_data`` first, so
        the caller is expected to have stored the warm-up records there.

        Args:
            data: 2-D tensor of records to train on.
            epochs: Number of full-batch optimisation steps.
        """
        self.mean, self.std = self.mem_data.mean(0), self.mem_data.std(0)
        new = self._standardize(data, self.mean, self.std)
        # Training needs autograd even if the caller disabled it globally.
        with torch.enable_grad():
            for _ in range(epochs):
                self.optimizer.zero_grad()
                # Small Gaussian noise on the input, clean target.
                noisy = new + 0.001*torch.randn_like(new)
                output = self.decoder(self.encoder(noisy))
                loss = self.loss_fn(output, new)
                loss.backward()
                self.optimizer.step()

    def update_memory(self, output_loss, encoder_output, data):
        """Store a record in memory when its score does not exceed the threshold.

        Args:
            output_loss: Score of the record (scalar tensor).
            encoder_output: Encoding of the record.
            data: The raw record.

        Returns:
            1 if the memory was updated, 0 otherwise.
        """
        if output_loss <= self.max_thres:
            # Round-robin slot selection: overwrite the oldest written slot.
            least_used_pos = self.count%self.memory_len
            # Detach so the memory never drags an autograd graph along.
            self.memory[least_used_pos] = encoder_output.detach()
            self.mem_data[least_used_pos] = data
            self.mean, self.std = self.mem_data.mean(0), self.mem_data.std(0)
            self.count += 1
            return 1
        return 0

    def initialize_memory(self, x):
        """Fill the memory with the encodings of ``x`` and keep ``x`` as ``mem_data``.

        ``x`` is normalised with the statistics of the current ``mem_data``.
        Presumably ``x`` should have ``memory_len`` rows, since slots are later
        addressed modulo ``memory_len`` -- TODO confirm against callers.

        Args:
            x: 2-D tensor of warm-up records.
        """
        # Use this instance's statistics, not whatever global `model` exists.
        mean, std = self.mem_data.mean(0), self.mem_data.std(0)
        # no_grad yields a plain tensor regardless of the global grad mode.
        with torch.no_grad():
            self.memory = self.encoder(self._standardize(x, mean, std))
        # Own copy: update_memory writes in place and must not touch the
        # caller's tensor.
        self.mem_data = x.clone()
        # Keep the statistics used by forward() in sync with the stored records.
        self.mean, self.std = self.mem_data.mean(0), self.mem_data.std(0)

    def forward(self, x):
        """Score ``x`` and, if it is accepted, fold it into the memory.

        Args:
            x: 2-D tensor holding the record(s) to score.

        Returns:
            Scalar tensor: the minimum L1 distance between the encoding of
            ``x`` and the rows of ``memory``.
        """
        new = self._standardize(x, self.mean, self.std)
        encoder_output = self.encoder(new)
        loss_values = torch.norm(self.memory - encoder_output, dim=1, p=1).min()
        self.update_memory(loss_values, encoder_output, x)
        return loss_values

In [43]:
# Features as a float32 tensor; the labels are used as returned by preprocess().
X = torch.tensor(X_all, dtype=torch.float32)
y = y_all

# Autoencoder training settings shared by every run below.
epochs, lr = 5000, 1e-2

In [44]:
# Sweep the memory length N = 2**k for k = 3..14. For each N: warm up the model
# on the first N records labelled 0, score the whole stream one record at a
# time, and record the resulting AUC in `rslt`.
for k in tqdm(range(3, 15)):

    memlen = 2**k

    params = {
        'beta': 0.1,            # memory-update threshold
        'memory_len': memlen,   # N
        'batch_size': 1,        # records per scoring step
        'lr': lr,
    }

    model = MemStream(X[0].shape[0], params).to(device)
    model.max_thres = model.max_thres.float()
    batch_size = params['batch_size']

    data_loader = DataLoader(X, batch_size=batch_size)

    # Warm-up set: the first `memlen` records whose label is 0.
    init_data = X[y == 0][:memlen].to(device)
    model.mem_data = init_data

    # Scoped grad modes instead of toggling the global switch: the previous
    # mode is restored afterwards, even if a step raises.
    with torch.enable_grad():
        model.train_autoencoder(init_data, epochs=epochs)

    with torch.no_grad():
        model.initialize_memory(init_data)

        err = []
        for data in data_loader:
            output = model(data.to(device))
            err.append(output)

    # One stack + one transfer instead of converting every score separately.
    scores = torch.stack(err).cpu().numpy()
    auc = np.round(roc_auc_score(y, scores), 4)

    rslt.append({
        'memlen': memlen,
        'metric': 'AUC',
        dataset: auc,
    })

100%|██████████| 12/12 [1:00:12<00:00, 301.02s/it]


# Results

In [45]:
# One row per evaluated memory length; the AUC column is shown to three decimals.
rslt_df = pd.DataFrame(rslt)
rslt_df[dataset] = rslt_df[dataset].round(3)
rslt_df

Unnamed: 0,memlen,metric,NSL
0,8,AUC,0.92
1,16,AUC,0.653
2,32,AUC,0.603
3,64,AUC,0.889
4,128,AUC,0.934
5,256,AUC,0.934
6,512,AUC,0.955
7,1024,AUC,0.972
8,2048,AUC,0.978
9,4096,AUC,0.986
