In [1]:
! export CUDA_VISIBLE_DEVICES=""

In [2]:
# force torch to use CPU
import torch
# torch.cuda.empty_cache()
torch.cuda.is_available = lambda : False
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
import pathlib

import numpy as np
import pandas as pd

from sops_anomaly.datasets import KddCup, SupervisedDataset
from sops_anomaly.detectors import AutoEncoder, VariationalAutoEncoder, LSTM_AD
from sops_anomaly.evaluation import Result

In [4]:
RESULTS_ROOT = pathlib.Path("../results/kdd_cup")
N_TRAIN_SAMPLES = 77000
N_TEST_SAMPLES = 10000

if not RESULTS_ROOT.exists():
    RESULTS_ROOT.mkdir(parents=True)

In [5]:
def best_result(predictions, targets, max_error=None):
    """Try various threshold levels to get best scores."""
    best_f1 = -1
    best_result = None
    if max_error is None:
        threshold_range = np.linspace(np.min(predictions), 3*np.mean(predictions), 100)
    else:
        threshold_range = np.linspace(0, max_error, 100)
    for threshold in threshold_range:
        labels = (predictions > threshold).astype(np.int32)
        result = Result(labels, targets)
        if result.f1 > best_f1:
            best_f1 = result.f1
            best_result = result
    return best_result

In [6]:
def save_to_file(result, filename, root_folder=None):
    """Save tesults to csv file."""
    columns = ['accuracy', 'f1_score', 'precision', 'recall', 'roc_auc']
    scores = [[result.accuracy, result.f1, result.precision, result.recall, result.roc_auc]]
        
    if root_folder is None:
        file_path = RESULTS_ROOT / filename
    else:
        file_path = root_folder / filename

    pd.DataFrame(data=scores, columns=columns).to_csv(file_path)

In [7]:
kdd = KddCup()
kdd = SupervisedDataset(kdd)

def get_dataset(anomaly_percentage=None):
    
    if anomaly_percentage is None:
        train_data = kdd.get_train_samples(n_samples=N_TRAIN_SAMPLES)
    else:
        train_data = kdd.get_train_samples(n_samples=N_TRAIN_SAMPLES, anomaly_percentage=(anomaly_percentage/100))
    test_data, test_targets = kdd.get_test_samples(n_samples=N_TEST_SAMPLES)
    
    return train_data, test_data, test_targets

## Auto-Encoder collect best results 

In [8]:
shapes = ((100, 50, 20), (50, ), (50, 20, ), (80, 40,))
# shapes = ((500, 300, 300,),)
# latent_sizes = (10, 50, 100)
latent_sizes = (20, 10)
for shape in shapes:
    for latent in latent_sizes:
        train_data, test_data, test_targets = get_dataset()
        print(f"Testing for shape={shape}, latent={latent}, train={len(train_data)}, test={len(test_data)}")

        model = AutoEncoder(window_size=1, latent_size=latent, layers=shape)
        model.train(train_data, epochs=30, verbose=False, learning_rate=1e-6)
        predictions = model.predict(test_data)
        result = best_result(predictions, test_targets)
        print(np.min(predictions), np.mean(predictions), np.max(predictions))
        print(result)


        save_to_file(result, f"autoencoder{'_'.join(str(x) for x in shape)}_{latent}.csv")

Testing for shape=(100, 50, 20), latent=20, train=77000, test=10000
0.012025554664433002 0.32631135311294346 366.4491271972656
Result(accuracy=0.94,
	precision=0.93,
	recall=1.0,
	f1=0.97,
	roc_auc=0.85,
	y_pred%=0.8609,
	y_label%=0.8031,
)
Testing for shape=(100, 50, 20), latent=10, train=77000, test=10000
0.010279115289449692 0.27023307687211784 175.03125
Result(accuracy=0.94,
	precision=0.94,
	recall=1.0,
	f1=0.97,
	roc_auc=0.86,
	y_pred%=0.8553,
	y_label%=0.8019,
)
Testing for shape=(50,), latent=20, train=77000, test=10000
0.00925627164542675 0.2517364828764461 198.85528564453125
Result(accuracy=0.94,
	precision=0.93,
	recall=1.0,
	f1=0.97,
	roc_auc=0.86,
	y_pred%=0.8565,
	y_label%=0.8017,
)
Testing for shape=(50,), latent=10, train=77000, test=10000
0.019776316359639168 0.2493967878671363 199.3540496826172
Result(accuracy=0.94,
	precision=0.93,
	recall=1.0,
	f1=0.96,
	roc_auc=0.84,
	y_pred%=0.8609,
	y_label%=0.7977,
)
Testing for shape=(50, 20), latent=20, train=77000, test=10000

## VariationalAutoEncoder

In [9]:
shapes = ((100, 50, 20), (50, ), (50, 20, ), (80, 40,))
# shapes = ((500, 300, 300,),)
# latent_sizes = (10, 50, 100)
latent_sizes = (20, 10)
for shape in shapes:
    for latent in latent_sizes:
        train_data, test_data, test_targets = get_dataset()
        print(f"Testing for shape={shape}, latent={latent}, train={len(train_data)}, test={len(test_data)}")

        model = VariationalAutoEncoder(window_size=1, latent_size=latent, layers=shape, l_samples=30)
        model.train(train_data, epochs=30, verbose=False, learning_rate=1e-6)
        predictions = model.predict(test_data)
        result = best_result(predictions, test_targets)
#         print(np.min(predictions), np.mean(predictions), np.max(predictions))
        print(result)


        save_to_file(result, f"variationalautoencoder{'_'.join(str(x) for x in shape)}_{latent}.csv")

Testing for shape=(100, 50, 20), latent=20, train=77000, test=10000
Result(accuracy=0.94,
	precision=0.93,
	recall=1.0,
	f1=0.97,
	roc_auc=0.86,
	y_pred%=0.8585,
	y_label%=0.8023,
)
Testing for shape=(100, 50, 20), latent=10, train=77000, test=10000
Result(accuracy=0.95,
	precision=0.94,
	recall=1.0,
	f1=0.97,
	roc_auc=0.86,
	y_pred%=0.8548,
	y_label%=0.8035,
)
Testing for shape=(50,), latent=20, train=77000, test=10000
Result(accuracy=0.91,
	precision=0.9,
	recall=1.0,
	f1=0.95,
	roc_auc=0.77,
	y_pred%=0.8906,
	y_label%=0.8017,
)
Testing for shape=(50,), latent=10, train=77000, test=10000
Result(accuracy=0.8,
	precision=0.8,
	recall=1.0,
	f1=0.89,
	roc_auc=0.5,
	y_pred%=0.9999,
	y_label%=0.8019,
)
Testing for shape=(50, 20), latent=20, train=77000, test=10000
Result(accuracy=0.89,
	precision=0.88,
	recall=1.0,
	f1=0.94,
	roc_auc=0.74,
	y_pred%=0.9035,
	y_label%=0.7965,
)
Testing for shape=(50, 20), latent=10, train=77000, test=10000
Result(accuracy=0.93,
	precision=0.92,
	recall=1.0,


## Check impact of anomalous samples in trainign set for the best models

In [10]:
RESULTS_ROOT = pathlib.Path("../results/kdd_cup")
N_TRAIN_SAMPLES = 77000
N_TEST_SAMPLES = 10000

if not RESULTS_ROOT.exists():
    RESULTS_ROOT.mkdir(parents=True)

In [14]:
# AutoEncoder
for percent in (0, 1, 3, 5, 10, 15, 20):
    result_folder = RESULTS_ROOT / f"{percent}percent"
    if not result_folder.exists():
        result_folder.mkdir()
    
    train_data, test_data, test_targets = get_dataset(percent)
    print(f"Testing for {percent}%; train={len(train_data)}, test={len(test_data)}")
    
    # TODO: insert best model ----------------------------
    model = AutoEncoder(window_size=1, latent_size=20, layers=(50,))
    model.train(train_data, epochs=30, verbose=False, learning_rate=1e-6)
    # --------------------------------

    predictions = model.predict(test_data)

    result = best_result(predictions, test_targets)
    print(result)


    save_to_file(result, "autoencoder", root_folder=result_folder)

Testing for 0%; train=77000, test=10000


KeyboardInterrupt: 

In [12]:
# VariationalAutoEncoder
for percent in (0, 1, 3, 5, 10, 15, 20):
    result_folder = RESULTS_ROOT / f"{percent}percent"
    if not result_folder.exists():
        result_folder.mkdir()
    
    train_data, test_data, test_targets = get_dataset(percent)
    print(f"Testing for {percent}%; train={len(train_data)}, test={len(test_data)}")
    # TODO: insert best model ----------------------------
    model = VariationalAutoEncoder(window_size=1, latent_size=10, layers=(80, 40), l_samples=30)
    model.train(train_data, epochs=30, verbose=False, learning_rate=1e-6)
    # --------------------------------

    predictions = model.predict(test_data)

    result = best_result(predictions, test_targets)
    print(result)


    save_to_file(result, "variationalautoencoder", root_folder=result_folder)

Testing for 0%; train=77000, test=10000
Result(accuracy=0.95,
	precision=0.94,
	recall=1.0,
	f1=0.97,
	roc_auc=0.87,
	y_pred%=0.8544,
	y_label%=0.8021,
)
Testing for 1%; train=77000, test=10000
Result(accuracy=0.94,
	precision=0.93,
	recall=1.0,
	f1=0.97,
	roc_auc=0.86,
	y_pred%=0.853,
	y_label%=0.7982,
)
Testing for 3%; train=77000, test=10000
Result(accuracy=0.94,
	precision=0.94,
	recall=1.0,
	f1=0.97,
	roc_auc=0.86,
	y_pred%=0.8494,
	y_label%=0.7999,
)
Testing for 5%; train=77000, test=10000
Result(accuracy=0.94,
	precision=0.94,
	recall=0.99,
	f1=0.97,
	roc_auc=0.87,
	y_pred%=0.8485,
	y_label%=0.8007,
)
Testing for 10%; train=77000, test=10000
Result(accuracy=0.94,
	precision=0.94,
	recall=1.0,
	f1=0.97,
	roc_auc=0.86,
	y_pred%=0.8525,
	y_label%=0.7985,
)
Testing for 15%; train=77000, test=10000
Result(accuracy=0.94,
	precision=0.94,
	recall=1.0,
	f1=0.97,
	roc_auc=0.86,
	y_pred%=0.8542,
	y_label%=0.8026,
)
Testing for 20%; train=77000, test=10000
Result(accuracy=0.93,
	precision=