In [11]:
# `! export ...` runs in a throw-away subshell, so the variable never reaches
# this kernel. Set it in the kernel's own environment instead; an empty value
# hides every GPU from CUDA as long as this runs before CUDA is initialised.
import os

os.environ["CUDA_VISIBLE_DEVICES"] = ""

In [12]:
# Make every CUDA availability check report "no GPU" so the notebook stays on
# the CPU, including code that queries torch.cuda.is_available() itself.
import torch
# torch.cuda.empty_cache()
torch.cuda.is_available = lambda: False
# With the patch above this always resolves to the CPU device.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [13]:
import pathlib

import numpy as np
import pandas as pd

from sops_anomaly.datasets import MNIST, SupervisedDataset
from sops_anomaly.detectors import AutoEncoder, VariationalAutoEncoder, LSTM_AD
from sops_anomaly.evaluation import Result

In [14]:
# Output location and sample sizes shared by the experiments below.
RESULTS_ROOT = pathlib.Path("../results")
N_TRAIN_SAMPLES = 5000
N_TEST_SAMPLES = 1000

# exist_ok=True replaces the check-then-create pattern, so re-running the cell
# (or running it concurrently) cannot fail with FileExistsError.
RESULTS_ROOT.mkdir(parents=True, exist_ok=True)

In [15]:
def best_result(predictions, targets, max_error=None):
    """Try various threshold levels and return the best-scoring result.

    Each of 100 evenly spaced thresholds turns the scores into 0/1 labels
    (``score > threshold``), which are wrapped in a ``Result`` together with
    ``targets``. The ``Result`` with the highest ``f1`` is returned.

    Parameters
    ----------
    predictions : array-like supporting ``>`` and ``.astype``
        Per-sample anomaly scores.
    targets :
        Ground-truth labels, passed to ``Result`` unchanged.
    max_error : float, optional
        When given, thresholds span ``[0, max_error]``. Otherwise they span
        ``[min(predictions), 3 * mean(predictions)]``.

    Returns
    -------
    Result
        The result with the highest F1. If no threshold reaches an F1 above
        zero, the result for the first threshold is returned instead of
        ``None``, so callers can always print, store and aggregate it.
    """
    if max_error is None:
        thresholds = np.linspace(np.min(predictions), 3 * np.mean(predictions), 100)
    else:
        thresholds = np.linspace(0, max_error, 100)

    best = None
    best_f1 = 0
    for threshold in thresholds:
        labels = (predictions > threshold).astype(np.int32)
        candidate = Result(labels, targets)
        if best is None:
            # Fallback so the function never returns None when every F1 is 0.
            best = candidate
        if candidate.f1 > best_f1:
            best_f1 = candidate.f1
            best = candidate
    return best

In [16]:
def save_to_file(results, filename, root_folder=None):
    """Save results to a csv file.

    Parameters
    ----------
    results : mapping
        Maps a row label (an anomaly class, or ``'total'``) to an object
        exposing ``accuracy``, ``f1``, ``precision``, ``recall`` and
        ``roc_auc`` attributes. Rows are written in the mapping's iteration
        order.
    filename : str
        Name of the output file, used exactly as given (no extension is
        appended).
    root_folder : str or pathlib.Path, optional
        Directory to write into. Defaults to the module-level ``RESULTS_ROOT``.
    """
    columns = ['accuracy', 'f1_score', 'precision', 'recall', 'roc_auc']
    # Take the row labels from `results` itself instead of assuming that all
    # ten classes plus 'total' are present; a partial sweep then no longer
    # raises KeyError.
    index = list(results)
    scores = [
        [entry.accuracy, entry.f1, entry.precision, entry.recall, entry.roc_auc]
        for entry in results.values()
    ]

    # Path() lets callers pass a plain string as well as a Path object.
    folder = RESULTS_ROOT if root_folder is None else pathlib.Path(root_folder)
    file_path = folder / filename

    pd.DataFrame(data=scores, index=index, columns=columns).to_csv(file_path)

## Auto-Encoder collect best results 

In [20]:
# Hyper-parameter sweep for the plain AutoEncoder: every combination of
# hidden-layer shape and latent size gets a freshly constructed model that is
# trained and scored.
shapes = ((512, 256, 128, 64), (512, 256, 256, 128), (512, 256, 128, 128, 64))
# shapes = ((500, 300, 300,),)
latent_sizes = (32,)
# latent_sizes = (100,)
for shape in shapes:
    for latent in latent_sizes:
        # Scores for this configuration, keyed by anomaly class.
        results = {}
        # NOTE: range(1) restricts the sweep to anomaly class 0 only.
        for anomaly_class in range(1):
            print(f"Testing for class {anomaly_class}; shape={shape}, latent={latent}")
            mnist = MNIST(anomaly_class=anomaly_class)
            mnist = SupervisedDataset(mnist)
            train_data = mnist.get_train_samples(n_samples=N_TRAIN_SAMPLES)
            test_data, test_targets = mnist.get_test_samples(n_samples=N_TEST_SAMPLES)

            model = AutoEncoder(window_size=1, latent_size=latent, layers=shape)
            model.train(train_data, epochs=30, verbose=False, learning_rate=1e-3)
            predictions = model.predict(test_data)
            
            # The decision threshold is tuned against the test targets, so the
            # reported scores are a best case rather than a held-out estimate.
            result = best_result(predictions, test_targets)
            print(result)

            results[anomaly_class] = result

        # Aggregation and CSV export are currently disabled for this sweep.
#         results['total'] = sum(list(results.values()), Result([],[]))
#         save_to_file(results, f"autoencoder2{'_'.join(str(x) for x in shape)}_{latent}_1e6.csv")

Testing for class 0; shape=(512, 256, 128, 64), latent=32
Result(accuracy=0.92,
	precision=0.57,
	recall=0.84,
	f1=0.68,
	roc_auc=0.88,
	y_pred%=0.142,
	y_label%=0.097,
)
Testing for class 0; shape=(512, 256, 256, 128), latent=32
Result(accuracy=0.93,
	precision=0.68,
	recall=0.74,
	f1=0.71,
	roc_auc=0.85,
	y_pred%=0.117,
	y_label%=0.107,
)
Testing for class 0; shape=(512, 256, 128, 128, 64), latent=32
Result(accuracy=0.92,
	precision=0.66,
	recall=0.74,
	f1=0.7,
	roc_auc=0.84,
	y_pred%=0.13,
	y_label%=0.117,
)


In [None]:
# Notes from an earlier run with 1e-4 (presumably the learning rate):
# Testing for class 0; shape=(512, 256, 128, 128, 64), latent=32
# Result(accuracy=0.94,
# 	precision=0.64,
# 	recall=0.78,
# 	f1=0.71,
# 	roc_auc=0.87,
# 	y_pred%=0.114,
# 	y_label%=0.093,
# )

# Testing for class 0; shape=(500, 200), latent=100, 1e-6
# Result(accuracy=0.82,
# 	precision=0.38,
# 	recall=0.64,
# 	f1=0.47,
# 	roc_auc=0.74,
# 	y_pred%=0.218,
# 	y_label%=0.129,
# )

## Variational Auto-Encoder collect best results

In [25]:
# Hyper-parameter sweep for the VariationalAutoEncoder: every combination of
# hidden-layer shape and latent size gets a freshly constructed model that is
# trained and scored.
# shapes = ((600,), (600, 400), (500,))
# shapes = ((600,),)
shapes = ((512, 256, 128,),)

latent_sizes = (128, 64)
for shape in shapes:
    for latent in latent_sizes:
        # Scores for this configuration, keyed by anomaly class.
        results = {}
        # NOTE: range(1) restricts the sweep to anomaly class 0 only.
        for anomaly_class in range(1):
            print(f"Testing for class: {anomaly_class}; shape={shape}, latent={latent}")
            mnist = MNIST(anomaly_class=anomaly_class)
            mnist = SupervisedDataset(mnist)
            train_data = mnist.get_train_samples(n_samples=N_TRAIN_SAMPLES)
            test_data, test_targets = mnist.get_test_samples(n_samples=N_TEST_SAMPLES)

            model = VariationalAutoEncoder(window_size=1, latent_size=latent, layers=shape, l_samples=30)
            model.train(train_data, epochs=30, verbose=False, learning_rate=1e-3)
            predictions = model.predict(test_data)

            # The decision threshold is tuned against the test targets, so the
            # reported scores are a best case rather than a held-out estimate.
            result = best_result(predictions, test_targets)
            print(result)

            results[anomaly_class] = result
            
        # Aggregation and CSV export are currently disabled for this sweep.
#         results['total'] = sum(list(results.values()), Result([],[]))
#         save_to_file(results, f"variationalautoencoder{'_'.join(str(x) for x in shape)}_{latent}_1e4_BEST.csv")

Testing for class: 0; shape=(512, 256, 128), latent=128
Result(accuracy=0.87,
	precision=0.4,
	recall=0.57,
	f1=0.47,
	roc_auc=0.74,
	y_pred%=0.141,
	y_label%=0.1,
)
Testing for class: 0; shape=(512, 256, 128), latent=64
Result(accuracy=0.92,
	precision=0.6,
	recall=0.47,
	f1=0.53,
	roc_auc=0.72,
	y_pred%=0.07,
	y_label%=0.089,
)


In [None]:
# latent 50 = 0.78
# 1e-4
# Testing for class: 0; shape=(512, 256, 128, 128, 64), latent=32
# Result(accuracy=0.91,
# 	precision=0.5,
# 	recall=0.55,
# 	f1=0.53,
# 	roc_auc=0.75,
# 	y_pred%=0.102,
# 	y_label%=0.092,
# )

# Testing for class: 0; shape=(512, 256, 128), latent=128
# Result(accuracy=0.92,
# 	precision=0.69,
# 	recall=0.55,
# 	f1=0.61,
# 	roc_auc=0.76,
# 	y_pred%=0.086,
# 	y_label%=0.108,
# )

# 1e-5
# Testing for class: 0; shape=(512, 256, 128), latent=128
# Result(accuracy=0.85,
# 	precision=0.41,
# 	recall=0.72,
# 	f1=0.52,
# 	roc_auc=0.8,
# 	y_pred%=0.195,
# 	y_label%=0.111,
# )



## Check impact of anomalous samples in the training set for the best models

In [26]:
# Output location and sample sizes for the contamination experiments; repeated
# here so this section can be run without the configuration cell further up.
RESULTS_ROOT = pathlib.Path("../results")
N_TRAIN_SAMPLES = 5000
N_TEST_SAMPLES = 1000

# exist_ok=True replaces the check-then-create pattern, so re-running the cell
# cannot fail with FileExistsError.
RESULTS_ROOT.mkdir(parents=True, exist_ok=True)

In [27]:
def get_dataset(anomaly_class: int, anomaly_percentage):
    """Build the train/test split for one anomaly class.

    Returns a 4-tuple ``(train_data, train_labels, test_data, test_targets)``
    where ``train_labels`` is ``mnist.data[1]`` looked up at the index of the
    sampled training rows.

    FIXME: ``anomaly_percentage`` is accepted but never used in the body, so
    the training sample is drawn the same way whatever value is requested.
    """
    mnist = MNIST(anomaly_class=anomaly_class)
    mnist = SupervisedDataset(mnist)
    train_data = mnist.get_train_samples(n_samples=N_TRAIN_SAMPLES)
    test_data, test_targets = mnist.get_test_samples(n_samples=N_TEST_SAMPLES)
    
    # presumably mnist.data[1] is the label vector aligned with the samples - TODO confirm
    return train_data, mnist.data[1][train_data.index], test_data, test_targets

In [30]:
# Sanity check: request 20% anomalies for class 0 and show the ratio
# sum(labels) / len(labels) of the returned training labels.
x, y, a, b = get_dataset(0, 20)
y.sum() / len(y)

0.0

In [28]:
# AutoEncoder: train and score the best configuration for every anomaly class
# at each requested contamination level, writing one score table per level.
# NOTE(review): get_dataset currently ignores the percentage it is given, so
# every level trains on the same kind of sample until that is fixed.
for percent in (0, 1, 3, 5, 10, 15, 20):
    print(f"Testing for {percent}% ...")
    result_folder = RESULTS_ROOT / f"{percent}percent"
    # Safe to re-run, and does not depend on RESULTS_ROOT already existing.
    result_folder.mkdir(parents=True, exist_ok=True)

    results = {}
    for anomaly_class in range(10):
        train_data, y, test_data, test_targets = get_dataset(anomaly_class, percent)
        # y.sum() / len(y) is a fraction; scale it so the "%" label is correct.
        print(f"anomaly class: {anomaly_class}, {100 * y.sum() / len(y):.1f}%")

        model = AutoEncoder(window_size=1, latent_size=32, layers=(512, 256, 128, 128, 64))
        model.train(train_data, epochs=30, verbose=False, learning_rate=1e-3)
        predictions = model.predict(test_data)

        # The threshold is tuned against the test targets (best-case scores).
        result = best_result(predictions, test_targets)
        results[anomaly_class] = result
        print(result)

    # Results are combined with `+`, starting from an empty Result.
    results['total'] = sum(results.values(), Result([], []))
    # The file name is used as given, so the table is written without an extension.
    save_to_file(results, "autoencoder3", root_folder=result_folder)

Testing for 0% ...
anomaly class: 0, 0.0%
Result(accuracy=0.9,
	precision=0.5,
	recall=0.73,
	f1=0.6,
	roc_auc=0.82,
	y_pred%=0.149,
	y_label%=0.103,
)
anomaly class: 1, 0.0%
Result(accuracy=0.15,
	precision=0.12,
	recall=1.0,
	f1=0.21,
	roc_auc=0.52,
	y_pred%=0.965,
	y_label%=0.111,
)
anomaly class: 2, 0.0%
Result(accuracy=0.93,
	precision=0.63,
	recall=0.75,
	f1=0.69,
	roc_auc=0.85,
	y_pred%=0.114,
	y_label%=0.096,
)
anomaly class: 3, 0.0%
Result(accuracy=0.85,
	precision=0.38,
	recall=0.71,
	f1=0.5,
	roc_auc=0.79,
	y_pred%=0.191,
	y_label%=0.103,
)
anomaly class: 4, 0.0%
Result(accuracy=0.76,
	precision=0.23,
	recall=0.62,
	f1=0.33,
	roc_auc=0.69,
	y_pred%=0.266,
	y_label%=0.099,
)
anomaly class: 5, 0.0%
Result(accuracy=0.82,
	precision=0.3,
	recall=0.78,
	f1=0.44,
	roc_auc=0.8,
	y_pred%=0.23,
	y_label%=0.09,
)
anomaly class: 6, 0.0%
Result(accuracy=0.91,
	precision=0.53,
	recall=0.68,
	f1=0.6,
	roc_auc=0.81,
	y_pred%=0.121,
	y_label%=0.094,
)
anomaly class: 7, 0.0%
Result(accuracy=

anomaly class: 0, 0.0%
Result(accuracy=0.94,
	precision=0.65,
	recall=0.66,
	f1=0.66,
	roc_auc=0.81,
	y_pred%=0.088,
	y_label%=0.086,
)
anomaly class: 1, 0.0%
Result(accuracy=0.18,
	precision=0.11,
	recall=1.0,
	f1=0.2,
	roc_auc=0.55,
	y_pred%=0.914,
	y_label%=0.099,
)
anomaly class: 2, 0.0%
Result(accuracy=0.96,
	precision=0.84,
	recall=0.77,
	f1=0.8,
	roc_auc=0.87,
	y_pred%=0.098,
	y_label%=0.107,
)
anomaly class: 3, 0.0%
Result(accuracy=0.85,
	precision=0.41,
	recall=0.65,
	f1=0.51,
	roc_auc=0.76,
	y_pred%=0.188,
	y_label%=0.12,
)
anomaly class: 4, 0.0%
Result(accuracy=0.75,
	precision=0.18,
	recall=0.52,
	f1=0.27,
	roc_auc=0.65,
	y_pred%=0.252,
	y_label%=0.088,
)
anomaly class: 5, 0.0%
Result(accuracy=0.89,
	precision=0.44,
	recall=0.53,
	f1=0.48,
	roc_auc=0.73,
	y_pred%=0.119,
	y_label%=0.098,
)
anomaly class: 6, 0.0%
Result(accuracy=0.91,
	precision=0.52,
	recall=0.58,
	f1=0.55,
	roc_auc=0.76,
	y_pred%=0.107,
	y_label%=0.096,
)
anomaly class: 7, 0.0%
Result(accuracy=0.83,
	precis

In [29]:
# VariationalAutoEncoder: train and score the best configuration for every
# anomaly class at each requested contamination level, writing one score table
# per level.
# NOTE(review): get_dataset currently ignores the percentage it is given, so
# every level trains on the same kind of sample until that is fixed.
for percent in (0, 1, 3, 5, 10, 15, 20):
    print(f"Testing for {percent}% ...")
    result_folder = RESULTS_ROOT / f"{percent}percent"
    # Safe to re-run, and does not depend on RESULTS_ROOT already existing.
    result_folder.mkdir(parents=True, exist_ok=True)

    results = {}
    for anomaly_class in range(10):
        train_data, y, test_data, test_targets = get_dataset(anomaly_class, percent)
        # y.sum() / len(y) is a fraction; scale it so the "%" label is correct.
        print(f"anomaly class: {anomaly_class}, {100 * y.sum() / len(y):.1f}%")

        model = VariationalAutoEncoder(window_size=1, latent_size=128, layers=(512, 256, 128), l_samples=30)
        model.train(train_data, epochs=30, verbose=False, learning_rate=1e-4)
        predictions = model.predict(test_data)

        # The threshold is tuned against the test targets (best-case scores).
        result = best_result(predictions, test_targets)
        results[anomaly_class] = result
        print(result)

    # Results are combined with `+`, starting from an empty Result.
    results['total'] = sum(results.values(), Result([], []))
    # The file name is used as given, so the table is written without an extension.
    save_to_file(results, "variationalautoencoder3", root_folder=result_folder)

Testing for 0% ...
anomaly class: 0, 0.0%
Result(accuracy=0.89,
	precision=0.49,
	recall=0.75,
	f1=0.59,
	roc_auc=0.82,
	y_pred%=0.167,
	y_label%=0.11,
)
anomaly class: 1, 0.0%
Result(accuracy=0.15,
	precision=0.15,
	recall=1.0,
	f1=0.26,
	roc_auc=0.5,
	y_pred%=0.999,
	y_label%=0.147,
)
anomaly class: 2, 0.0%
Result(accuracy=0.84,
	precision=0.26,
	recall=0.44,
	f1=0.32,
	roc_auc=0.66,
	y_pred%=0.149,
	y_label%=0.087,
)
anomaly class: 3, 0.0%
Result(accuracy=0.51,
	precision=0.15,
	recall=0.83,
	f1=0.25,
	roc_auc=0.65,
	y_pred%=0.554,
	y_label%=0.099,
)
anomaly class: 4, 0.0%
Result(accuracy=0.32,
	precision=0.1,
	recall=0.84,
	f1=0.19,
	roc_auc=0.55,
	y_pred%=0.746,
	y_label%=0.093,
)
anomaly class: 5, 0.0%
Result(accuracy=0.32,
	precision=0.11,
	recall=0.92,
	f1=0.2,
	roc_auc=0.59,
	y_pred%=0.756,
	y_label%=0.092,
)
anomaly class: 6, 0.0%
Result(accuracy=0.45,
	precision=0.13,
	recall=0.85,
	f1=0.23,
	roc_auc=0.62,
	y_pred%=0.622,
	y_label%=0.098,
)
anomaly class: 7, 0.0%
Result(accu

KeyboardInterrupt: 