In [21]:
! export CUDA_VISIBLE_DEVICES=""

In [22]:
# force torch to use CPU
import torch
# torch.cuda.empty_cache()
torch.cuda.is_available = lambda : False
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [23]:
import pathlib

import numpy as np
import pandas as pd

from sops_anomaly.datasets import MNIST, SupervisedDataset
from sops_anomaly.detectors import AutoEncoder, VariationalAutoEncoder, LSTM_AD
from sops_anomaly.evaluation import Result

In [24]:
RESULTS_ROOT = pathlib.Path("../results")
N_TRAIN_SAMPLES = 5000
N_TEST_SAMPLES = 1000

if not RESULTS_ROOT.exists():
    RESULTS_ROOT.mkdir()

In [25]:
def best_result(predictions, targets, max_error=None):
    """Try various threshold levels to get best scores."""
    best_f1 = 0
    best_result = None
    if max_error is None:
        threshold_range = np.linspace(np.min(predictions), 3*np.mean(predictions), 100)
    else:
        threshold_range = np.linspace(0, max_error, 100)
    for threshold in threshold_range:
        labels = (predictions > threshold).astype(np.int32)
        result = Result(labels, targets)
        if result.f1 > best_f1:
            best_f1 = result.f1
            best_result = result
    return best_result

In [26]:
def save_to_file(results, filename, root_folder=None):
    """Save tesults to csv file."""
    index = list(range(10)) + ['total']
    columns = ['accuracy', 'f1_score', 'precision', 'recall', 'roc_auc']
    scores = []
    for i in index:
        result = results[i]
        scores.append([result.accuracy, result.f1, result.precision, result.recall, result.roc_auc])
        
    if root_folder is None:
        file_path = RESULTS_ROOT / filename
    else:
        file_path = root_folder / filename

    pd.DataFrame(data=scores, index=index, columns=columns).to_csv(file_path)

## Auto-Encoder collect best results 

In [56]:
# shapes = ((200,), (400,), (500, 200), (300, 150))
shapes = ((500, 300, 300,),)
# latent_sizes = (10, 50, 100)
latent_sizes = (100,)
for shape in shapes:
    for latent in latent_sizes:
        results = {}
        for anomaly_class in range(10):
            print(f"Testing for class {anomaly_class}; shape={shape}, latent={latent}")
            mnist = MNIST(anomaly_class=anomaly_class)
            mnist = SupervisedDataset(mnist)
            train_data = mnist.get_train_samples(n_samples=N_TRAIN_SAMPLES)
            test_data, test_targets = mnist.get_test_samples(n_samples=N_TEST_SAMPLES)

            model = AutoEncoder(window_size=1, latent_size=latent, layers=shape)
            model.train(train_data, epochs=30, verbose=False, learning_rate=1e-6)
            predictions = model.predict(test_data)
            
            result = best_result(predictions, test_targets, model.max_error)
            print(result)

            results[anomaly_class] = result

        results['total'] = sum(list(results.values()), Result([],[]))
        save_to_file(results, f"autoencoder{'_'.join(str(x) for x in shape)}_{latent}_1e6_BEST.csv")

Testing for class 0; shape=(500, 300, 300), latent=100
Result(accuracy=0.89,
	precision=0.39,
	recall=0.39,
	f1=0.39,
	roc_auc=0.67,
	y_pred%=0.088,
	y_label%=0.087,
)
Testing for class 1; shape=(500, 300, 300), latent=100
Result(accuracy=0.11,
	precision=0.11,
	recall=1.0,
	f1=0.2,
	roc_auc=0.5,
	y_pred%=1.0,
	y_label%=0.109,
)
Testing for class 2; shape=(500, 300, 300), latent=100
Result(accuracy=0.63,
	precision=0.17,
	recall=0.66,
	f1=0.27,
	roc_auc=0.64,
	y_pred%=0.403,
	y_label%=0.102,
)
Testing for class 3; shape=(500, 300, 300), latent=100
Result(accuracy=0.29,
	precision=0.12,
	recall=0.94,
	f1=0.21,
	roc_auc=0.58,
	y_pred%=0.8,
	y_label%=0.099,
)
Testing for class 4; shape=(500, 300, 300), latent=100
Result(accuracy=0.1,
	precision=0.1,
	recall=1.0,
	f1=0.18,
	roc_auc=0.5,
	y_pred%=1.0,
	y_label%=0.099,
)
Testing for class 5; shape=(500, 300, 300), latent=100
Result(accuracy=0.2,
	precision=0.1,
	recall=0.94,
	f1=0.19,
	roc_auc=0.53,
	y_pred%=0.891,
	y_label%=0.098,
)
Testing

In [None]:
# Testing for class 0; shape=(500, 200), latent=100, 1e-6
# Result(accuracy=0.82,
# 	precision=0.38,
# 	recall=0.64,
# 	f1=0.47,
# 	roc_auc=0.74,
# 	y_pred%=0.218,
# 	y_label%=0.129,
# )

## Variational Auto-Encoder collect best results

In [27]:
# shapes = ((600,), (600, 400), (500,)) 
# shapes = ((600,),) 
shapes = ((500, 200,),(600, 400), (500,))

latent_sizes = (100, 200)
for shape in shapes:
    for latent in latent_sizes:
        results = {}
        for anomaly_class in range(10):
            print(f"Testing for class: {anomaly_class}; shape={shape}, latent={latent}")
            mnist = MNIST(anomaly_class=anomaly_class)
            mnist = SupervisedDataset(mnist)
            train_data = mnist.get_train_samples(n_samples=N_TRAIN_SAMPLES)
            test_data, test_targets = mnist.get_test_samples(n_samples=N_TEST_SAMPLES)

            model = VariationalAutoEncoder(window_size=1, latent_size=latent, layers=shape, l_samples=30)
            model.train(train_data, epochs=30, verbose=False, learning_rate=1e-4)
            predictions = model.predict(test_data)

            result = best_result(predictions, test_targets, model.max_error)
            print(result)

            results[anomaly_class] = result
            
        results['total'] = sum(list(results.values()), Result([],[]))
        save_to_file(results, f"variationalautoencoder{'_'.join(str(x) for x in shape)}_{latent}_1e4_BEST.csv")

Testing for class: 0; shape=(500, 200), latent=100
Result(accuracy=0.9,
	precision=0.44,
	recall=0.64,
	f1=0.52,
	roc_auc=0.78,
	y_pred%=0.123,
	y_label%=0.085,
)
Testing for class: 1; shape=(500, 200), latent=100
Result(accuracy=0.11,
	precision=0.11,
	recall=1.0,
	f1=0.2,
	roc_auc=0.5,
	y_pred%=0.998,
	y_label%=0.108,
)
Testing for class: 2; shape=(500, 200), latent=100
Result(accuracy=0.7,
	precision=0.19,
	recall=0.67,
	f1=0.29,
	roc_auc=0.68,
	y_pred%=0.333,
	y_label%=0.093,
)
Testing for class: 3; shape=(500, 200), latent=100
Result(accuracy=0.51,
	precision=0.13,
	recall=0.75,
	f1=0.22,
	roc_auc=0.62,
	y_pred%=0.535,
	y_label%=0.091,
)
Testing for class: 4; shape=(500, 200), latent=100
Result(accuracy=0.11,
	precision=0.11,
	recall=1.0,
	f1=0.2,
	roc_auc=0.5,
	y_pred%=0.999,
	y_label%=0.108,
)
Testing for class: 5; shape=(500, 200), latent=100
Result(accuracy=0.29,
	precision=0.12,
	recall=0.94,
	f1=0.21,
	roc_auc=0.58,
	y_pred%=0.794,
	y_label%=0.098,
)
Testing for class: 6; sh

Result(accuracy=0.13,
	precision=0.13,
	recall=1.0,
	f1=0.23,
	roc_auc=0.5,
	y_pred%=1.0,
	y_label%=0.13,
)
Testing for class: 2; shape=(500,), latent=200
Result(accuracy=0.77,
	precision=0.23,
	recall=0.56,
	f1=0.33,
	roc_auc=0.68,
	y_pred%=0.239,
	y_label%=0.099,
)
Testing for class: 3; shape=(500,), latent=200
Result(accuracy=0.48,
	precision=0.12,
	recall=0.78,
	f1=0.21,
	roc_auc=0.62,
	y_pred%=0.567,
	y_label%=0.086,
)
Testing for class: 4; shape=(500,), latent=200
Result(accuracy=0.27,
	precision=0.12,
	recall=0.96,
	f1=0.22,
	roc_auc=0.57,
	y_pred%=0.833,
	y_label%=0.107,
)
Testing for class: 5; shape=(500,), latent=200
Result(accuracy=0.35,
	precision=0.11,
	recall=0.83,
	f1=0.2,
	roc_auc=0.56,
	y_pred%=0.716,
	y_label%=0.098,
)
Testing for class: 6; shape=(500,), latent=200
Result(accuracy=0.38,
	precision=0.13,
	recall=0.89,
	f1=0.22,
	roc_auc=0.6,
	y_pred%=0.702,
	y_label%=0.099,
)
Testing for class: 7; shape=(500,), latent=200
Result(accuracy=0.2,
	precision=0.12,
	recall=0

In [None]:
# latent 50 = 0.78

## Check impact of anomalous samples in trainign set for the best models

In [13]:
RESULTS_ROOT = pathlib.Path("../results")
N_TRAIN_SAMPLES = 5000
N_TEST_SAMPLES = 1000

if not RESULTS_ROOT.exists():
    
    RESULTS_ROOT.mkdir()

In [14]:
def get_dataset(anomaly_class, anomaly_percentage):
    mnist = MNIST(anomaly_class=anomaly_class)
    mnist = SupervisedDataset(mnist)
    train_data = mnist.get_train_samples(n_samples=N_TRAIN_SAMPLES, anomaly_percentage=(anomaly_percentage/100))
    test_data, test_targets = mnist.get_test_samples(n_samples=N_TEST_SAMPLES)
    
    return train_data, mnist.data[1][train_data.index], test_data, test_targets

In [19]:
# AutoEncoder
for percent in (0, 1, 3, 5, 10, 15, 20):
    print(f"Testing for {percent}% ...")
    result_folder = RESULTS_ROOT / f"{percent}percent"
    if not result_folder.exists():
        result_folder.mkdir()
    
    results = {}
    for anomaly_class in range(10):
        
        train_data, y, test_data, test_targets = get_dataset(anomaly_class, percent)
        print(f"anomaly class: {anomaly_class}, {y.sum()/len(y)}%")
        
        model = AutoEncoder(window_size=1, latent_size=100, layers=(500, 300, 300,))
        model.train(train_data, epochs=30, verbose=False, learning_rate=1e-6)
        predictions = model.predict(test_data)
        
        result = best_result(predictions, test_targets)
        results[anomaly_class] = result
        print(result)

    results['total'] = sum(list(results.values()), Result([],[]))
    save_to_file(results, "autoencoder2", root_folder=result_folder)

Testing for 0% ...
anomaly class: 0, 0.0%
Result(accuracy=0.93,
	precision=0.71,
	recall=0.58,
	f1=0.64,
	roc_auc=0.78,
	y_pred%=0.084,
	y_label%=0.104,
)
anomaly class: 1, 0.0%
Result(accuracy=0.1,
	precision=0.1,
	recall=0.99,
	f1=0.19,
	roc_auc=0.5,
	y_pred%=0.999,
	y_label%=0.104,
)
anomaly class: 2, 0.0%
Result(accuracy=0.65,
	precision=0.19,
	recall=0.75,
	f1=0.3,
	roc_auc=0.69,
	y_pred%=0.402,
	y_label%=0.1,
)
anomaly class: 3, 0.0%
Result(accuracy=0.5,
	precision=0.15,
	recall=0.75,
	f1=0.25,
	roc_auc=0.61,
	y_pred%=0.561,
	y_label%=0.11,
)
anomaly class: 4, 0.0%
Result(accuracy=0.27,
	precision=0.11,
	recall=0.87,
	f1=0.2,
	roc_auc=0.54,
	y_pred%=0.806,
	y_label%=0.102,
)
anomaly class: 5, 0.0%
Result(accuracy=0.32,
	precision=0.09,
	recall=0.78,
	f1=0.16,
	roc_auc=0.53,
	y_pred%=0.731,
	y_label%=0.082,
)
anomaly class: 6, 0.0%
Result(accuracy=0.46,
	precision=0.14,
	recall=0.81,
	f1=0.24,
	roc_auc=0.62,
	y_pred%=0.603,
	y_label%=0.105,
)
anomaly class: 7, 0.0%
Result(accuracy

anomaly class: 0, 0.2%
Result(accuracy=0.93,
	precision=0.65,
	recall=0.51,
	f1=0.57,
	roc_auc=0.74,
	y_pred%=0.075,
	y_label%=0.097,
)
anomaly class: 1, 0.2%
Result(accuracy=0.1,
	precision=0.1,
	recall=0.99,
	f1=0.18,
	roc_auc=0.5,
	y_pred%=0.999,
	y_label%=0.102,
)
anomaly class: 2, 0.2%
Result(accuracy=0.71,
	precision=0.18,
	recall=0.58,
	f1=0.28,
	roc_auc=0.65,
	y_pred%=0.309,
	y_label%=0.097,
)
anomaly class: 3, 0.2%
Result(accuracy=0.45,
	precision=0.11,
	recall=0.76,
	f1=0.2,
	roc_auc=0.59,
	y_pred%=0.599,
	y_label%=0.088,
)
anomaly class: 4, 0.2%
Result(accuracy=0.14,
	precision=0.11,
	recall=0.99,
	f1=0.2,
	roc_auc=0.52,
	y_pred%=0.958,
	y_label%=0.105,
)
anomaly class: 5, 0.2%
Result(accuracy=0.1,
	precision=0.09,
	recall=1.0,
	f1=0.17,
	roc_auc=0.51,
	y_pred%=0.986,
	y_label%=0.089,
)
anomaly class: 6, 0.2%
Result(accuracy=0.42,
	precision=0.1,
	recall=0.8,
	f1=0.18,
	roc_auc=0.59,
	y_pred%=0.624,
	y_label%=0.079,
)
anomaly class: 7, 0.2%
Result(accuracy=0.11,
	precision=0

In [20]:
# VariationalAutoEncoder
for percent in (0, 1, 3, 5, 10, 15, 20):
    print(f"Testing for {percent}% ...")
    result_folder = RESULTS_ROOT / f"{percent}percent"
    if not result_folder.exists():
        result_folder.mkdir()
    
    results = {}
    for anomaly_class in range(10):
        train_data, y, test_data, test_targets = get_dataset(anomaly_class, percent)
        print(f"anomaly class: {anomaly_class}, {y.sum()/len(y)}%")
        
        model = VariationalAutoEncoder(window_size=1, latent_size=100, layers=(500, 200,), l_samples=30)
        model.train(train_data, epochs=30, verbose=False, learning_rate=1e-4)
        predictions = model.predict(test_data)
        
        result = best_result(predictions, test_targets)
        results[anomaly_class] = result
        print(result)

    results['total'] = sum(list(results.values()), Result([],[]))
    save_to_file(results, "variationalautoencoder2", root_folder=result_folder)

Testing for 0% ...
anomaly class: 0, 0.0%
Result(accuracy=0.85,
	precision=0.36,
	recall=0.56,
	f1=0.44,
	roc_auc=0.72,
	y_pred%=0.162,
	y_label%=0.103,
)
anomaly class: 1, 0.0%
Result(accuracy=0.12,
	precision=0.12,
	recall=1.0,
	f1=0.21,
	roc_auc=0.5,
	y_pred%=0.999,
	y_label%=0.115,
)
anomaly class: 2, 0.0%
Result(accuracy=0.61,
	precision=0.17,
	recall=0.81,
	f1=0.28,
	roc_auc=0.7,
	y_pred%=0.447,
	y_label%=0.093,
)
anomaly class: 3, 0.0%
Result(accuracy=0.5,
	precision=0.13,
	recall=0.74,
	f1=0.22,
	roc_auc=0.61,
	y_pred%=0.545,
	y_label%=0.093,
)
anomaly class: 4, 0.0%
Result(accuracy=0.08,
	precision=0.08,
	recall=1.0,
	f1=0.15,
	roc_auc=0.5,
	y_pred%=0.995,
	y_label%=0.078,
)
anomaly class: 5, 0.0%
Result(accuracy=0.21,
	precision=0.09,
	recall=0.98,
	f1=0.17,
	roc_auc=0.56,
	y_pred%=0.866,
	y_label%=0.082,
)
anomaly class: 6, 0.0%
Result(accuracy=0.49,
	precision=0.13,
	recall=0.77,
	f1=0.22,
	roc_auc=0.62,
	y_pred%=0.561,
	y_label%=0.093,
)
anomaly class: 7, 0.0%
Result(accur

anomaly class: 0, 0.2%
Result(accuracy=0.87,
	precision=0.41,
	recall=0.75,
	f1=0.53,
	roc_auc=0.81,
	y_pred%=0.184,
	y_label%=0.1,
)
anomaly class: 1, 0.2%
Result(accuracy=0.12,
	precision=0.12,
	recall=1.0,
	f1=0.22,
	roc_auc=0.5,
	y_pred%=0.999,
	y_label%=0.123,
)
anomaly class: 2, 0.2%
Result(accuracy=0.64,
	precision=0.18,
	recall=0.64,
	f1=0.28,
	roc_auc=0.64,
	y_pred%=0.395,
	y_label%=0.11,
)
anomaly class: 3, 0.2%
Result(accuracy=0.38,
	precision=0.14,
	recall=0.9,
	f1=0.25,
	roc_auc=0.61,
	y_pred%=0.714,
	y_label%=0.114,
)
anomaly class: 4, 0.2%
Result(accuracy=0.24,
	precision=0.1,
	recall=0.9,
	f1=0.18,
	roc_auc=0.54,
	y_pred%=0.837,
	y_label%=0.091,
)
anomaly class: 5, 0.2%
Result(accuracy=0.26,
	precision=0.08,
	recall=0.93,
	f1=0.15,
	roc_auc=0.57,
	y_pred%=0.794,
	y_label%=0.068,
)
anomaly class: 6, 0.2%
Result(accuracy=0.46,
	precision=0.13,
	recall=0.78,
	f1=0.22,
	roc_auc=0.6,
	y_pred%=0.592,
	y_label%=0.096,
)
anomaly class: 7, 0.2%
Result(accuracy=0.1,
	precision=0.