In [1]:
# Move the kernel's working directory up one level before the project-local
# imports below run (presumably to the project root - verify).
# NOTE: relative and not idempotent - re-running this cell moves up again.
cd ..

C:\Users\Nghia\PycharmProjects\ECML\Refactor


In [2]:
from pprint import pprint

import numpy as np
import tensorflow as tf
import time
from collections import Counter

from sklearn.svm import OneClassSVM
from sklearn.ensemble import IsolationForest

from load_datasets import shuttle
from metrics import metrics
from models.AE1SVM import AEOneClassSVM
from models.DEC import DEC
from models.RDA import RobustL21Autoencoder

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


Extracting /tmp/data/train-images-idx3-ubyte.gz
Extracting /tmp/data/train-labels-idx1-ubyte.gz
Extracting /tmp/data/t10k-images-idx3-ubyte.gz
Extracting /tmp/data/t10k-labels-idx1-ubyte.gz


In [3]:
# Seed every random source this notebook controls so that a fresh
# "Restart & Run All" is repeatable.
SEED = 2018
tf.set_random_seed(SEED)
# NumPy's global generator was previously left unseeded. scikit-learn
# estimators built without `random_state` draw from it; the project models
# may as well (NOTE(review): confirm against models/*).
np.random.seed(SEED)

# Shuttle data set, split by the project loader with a fixed split seed.
x_train, y_train, x_test, y_test = shuttle(random_state=1)

# The ratio below counts label -1 as anomalous and label 1 as normal.
counter = Counter(y_train)
print('Anomalies ratio:', 100*counter[-1]/(counter[1]+counter[-1]), '%')

Anomalies ratio: 7.16848383188892 %


In [4]:
# Layer widths of the autoencoder: input width first, bottleneck width last.
autoencoder_layers = [9, 6, 2]
batch_size = 16

# Feature placeholder; its width is the first entry of `autoencoder_layers`.
input_width = autoencoder_layers[0]
data_input = tf.placeholder(tf.float32, shape=[None, input_width])

# Both models receive only the layers after the input layer.
hidden_layers = autoencoder_layers[1:]

# Jointly trained autoencoder + one-class SVM.
# NOTE(review): the four positional hyper-parameters (0.4, 1e3, 3.0, 50) are
# passed through unchanged; their meaning is defined in models/AE1SVM.
ae1svm = AEOneClassSVM(
    data_input, batch_size, 'test', hidden_layers, 0.4, 1e3, 3.0, 50,
    autoencoder_activation='sigmoid',
    full_op=tf.train.AdamOptimizer(1e-3),
    svm_op=tf.train.AdamOptimizer(1e-4),
)

# Second instance, used later only through fit_ae / encode / encode_rff to
# produce features for the conventional baselines.
ae_only = AEOneClassSVM(
    data_input, batch_size, 'test', hidden_layers, 0.3, 1e3, 3.0, 50,
    autoencoder_activation='sigmoid',
    ae_op=tf.train.AdamOptimizer(1e-3),
)

In [5]:
# Autoencoder-OneclassSVM: train and evaluate inside one TensorFlow session.
with tf.Session() as sess:
    init_op = tf.global_variables_initializer()
    sess.run(init_op)

    # Training phase (wall-clock timed).
    start = time.time()
    ae1svm.fit(sess, x_train, epochs_1=7, epochs_2=0)
    train_seconds = time.time() - start
    print('Train time:', train_seconds)

    # Prediction phase on the held-out split (wall-clock timed).
    start = time.time()
    out_y = ae1svm.predict(sess, x_test)
    test_seconds = time.time() - start
    print('Test time:', test_seconds)

    report = metrics(y_test, out_y)
    pprint(report)


Combined train
.......SVM train
Train time: 18.87569284439087
Test time: 0.21557235717773438
{'AUPRC': 0.9498931742185709,
 'AUROC': 0.9755952034726462,
 'Confusion matrix': array([[ 2929,   149],
       [   16, 39831]], dtype=int64),
 'F1': 0.9726050141125686,
 'Precision': 0.9945670628183362,
 'Recall': 0.951591942820013}


In [6]:
# Train Isolation Forest
print('IsolationForest')
# `random_state` is fixed because the forest is built from random sub-samples
# and random splits; without it the metrics printed below differ on every run.
# 2018 is the same value this notebook passes to tf.set_random_seed.
iforest = IsolationForest(contamination=0.09, random_state=2018, verbose=1)

# Fit on the training split (wall-clock timed).
t0 = time.time()
iforest.fit(x_train)
print('Train time:', time.time() - t0)

# Predict labels for the test split (wall-clock timed) and report the metrics.
t0 = time.time()
out_y = iforest.predict(x_test)
print('Test time:', time.time() - t0)
pprint(metrics(y_test, out_y))

IsolationForest


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.3s finished


Train time: 4.605691194534302
Test time: 3.355423927307129
{'AUPRC': 0.7697150055818226,
 'AUROC': 0.982248764128379,
 'Confusion matrix': array([[ 3035,    43],
       [  858, 38989]], dtype=int64),
 'F1': 0.8707502510400229,
 'Precision': 0.7796044181864886,
 'Recall': 0.9860298895386614}


In [7]:
# Baseline: conventional RBF one-class SVM fitted directly on the raw features.
print('OCSVM-RBF')
libsvm = OneClassSVM(shrinking=False, verbose=True, nu=0.15)

# Fit (wall-clock timed).
start = time.time()
libsvm.fit(x_train)
train_seconds = time.time() - start
print('Train time:', train_seconds)

# Predict on the test split (wall-clock timed).
start = time.time()
out_y = libsvm.predict(x_test)
test_seconds = time.time() - start
print('Test time:', test_seconds)

report = metrics(y_test, out_y)
pprint(report)

OCSVM-RBF
[LibSVM]Train time: 40.37992811203003
Test time: 10.66106390953064
{'AUPRC': 0.43839963022686296,
 'AUROC': 0.9338761332271377,
 'Confusion matrix': array([[ 2943,   135],
       [ 3522, 36325]], dtype=int64),
 'F1': 0.6167871738447029,
 'Precision': 0.45522041763341065,
 'Recall': 0.956140350877193}


In [8]:
# Train the standalone autoencoder, then export its features so the
# conventional baselines in the following cells can be fitted on them.
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    # Autoencoder training (wall-clock timed).
    start = time.time()
    ae_only.fit_ae(sess, x_train, epochs=10)
    ae_seconds = time.time() - start
    print('AE time:', ae_seconds)

    # Features from encode(): train split first, then test split.
    x_train_encoded, x_test_encoded = (
        ae_only.encode(sess, split) for split in (x_train, x_test)
    )

    # Features from encode_rff(): same order.
    x_train_rff, x_test_rff = (
        ae_only.encode_rff(sess, split) for split in (x_train, x_test)
    )

Autoencoder train
..........AE time: 16.776585340499878


In [9]:
# Baseline: conventional one-class SVM fitted on the autoencoder's
# bottleneck-layer features instead of the raw input.
print('OCSVM-RBF')
libsvm = OneClassSVM(shrinking=False, verbose=True, nu=0.1)

# Fit (wall-clock timed).
start = time.time()
libsvm.fit(x_train_encoded)
train_seconds = time.time() - start
print('Train time:', train_seconds)

# Predict on the encoded test split (wall-clock timed).
start = time.time()
out_y = libsvm.predict(x_test_encoded)
test_seconds = time.time() - start
print('Test time:', test_seconds)

report = metrics(y_test, out_y)
pprint(report)

OCSVM-RBF
[LibSVM]Train time: 16.79263401031494
Test time: 3.839857339859009
{'AUPRC': 0.4151140132100934,
 'AUROC': 0.8501191725340982,
 'Confusion matrix': array([[ 2315,   763],
       [ 2067, 37780]], dtype=int64),
 'F1': 0.6206434316353887,
 'Precision': 0.528297581013236,
 'Recall': 0.7521117608836907}


In [10]:
# Baseline: linear-kernel one-class SVM fitted on the features returned by
# the autoencoder's encode_rff().
print('OCSVM-Linear on RFF')
libsvm = OneClassSVM(kernel='linear', shrinking=False, verbose=True, nu=0.1)

# Fit (wall-clock timed).
start = time.time()
libsvm.fit(x_train_rff)
train_seconds = time.time() - start
print('Train time:', train_seconds)

# Predict on the RFF test features (wall-clock timed).
start = time.time()
out_y = libsvm.predict(x_test_rff)
test_seconds = time.time() - start
print('Test time:', test_seconds)

report = metrics(y_test, out_y)
pprint(report)

OCSVM-Linear on RFF
[LibSVM]Train time: 97.39589357376099
Test time: 20.373255968093872
{'AUPRC': 0.41352506614499684,
 'AUROC': 0.8472843935069183,
 'Confusion matrix': array([[ 2295,   783],
       [ 2034, 37813]], dtype=int64),
 'F1': 0.6196840826245443,
 'Precision': 0.5301455301455301,
 'Recall': 0.7456140350877193}


In [15]:
    # Deep Embedded Clustering baseline: pretrain, cluster, then label the
    # lowest-scoring test points as anomalies.
    dec = DEC(dims=autoencoder_layers, n_clusters=5)
    t0 = time.time()
    dec.pretrain(x=x_train, epochs=1)
    dec.compile(loss='kld')
    y_pred = dec.fit(x_train, update_interval=10, batch_size=batch_size)
    print('Train time:', time.time() - t0)

    t0 = time.time()
    scores = dec.cluster_score(x_test)
    print('Test time:', time.time() - t0)

    # Work on a 1-D view of the scores so thresholding is one vectorised step.
    flat_scores = np.asarray(scores).flatten()

    # `counter` holds the *training* label counts. Scale the training anomaly
    # ratio to the number of test scores rather than reusing the absolute
    # training count, which is only correct when both splits have the same
    # size and can index out of range when the test split is smaller.
    anomaly_ratio = counter[-1] / (counter[1] + counter[-1])
    n_anomalies = min(int(round(anomaly_ratio * flat_scores.size)), flat_scores.size - 1)

    # The score at sorted position `n_anomalies` is the cut-off: anything
    # strictly above it is labelled normal (1), the rest anomalous (-1).
    threshold = np.partition(flat_scores, n_anomalies)[n_anomalies]
    out_y = np.where(flat_scores > threshold, 1, -1)
    pprint(metrics(y_test, out_y))


...Pretraining...
Epoch 1/1
Pretraining time:  1.295757532119751
Update interval 10
Save interval 13413.75
Initializing cluster centers with k-means.
delta_label  0.0006057217407510949 < tol  0.001
Reached tolerance threshold. Stopping training.
Train time: 6.307149887084961
Test time: 0.9062542915344238
{'AUPRC': 0.4933196996768739,
 'AUROC': 0.8309585578091561,
 'Confusion matrix': array([[ 2112,   966],
       [  966, 38881]], dtype=int64),
 'F1': 0.6861598440545809,
 'Precision': 0.6861598440545809,
 'Recall': 0.6861598440545809}


In [16]:
# Robust Deep Autoencoder baseline, trained and evaluated in its own session.
with tf.Session() as sess:
    # NOTE(review): variables are initialised *before* `rae` is constructed,
    # so this call cannot cover variables the model creates afterwards.
    # Presumably the model initialises its own variables - confirm in models/RDA.
    sess.run(tf.global_variables_initializer())

    rae = RobustL21Autoencoder(
        sess=sess,
        lambda_=0.01,
        layers_sizes=autoencoder_layers,
        learning_rate=1e-2,
    )

    # Training returns two arrays, unpacked here as L and S (wall-clock timed).
    start = time.time()
    L, S = rae.fit(
        x_train,
        sess=sess,
        inner_iteration=20,
        iteration=3,
        verbose=True,
        batch_size=batch_size,
    )
    train_seconds = time.time() - start
    print('Train time:', train_seconds)

    # Same decomposition for the test split (wall-clock timed).
    start = time.time()
    L_test, S_test = rae.predict(x_test, sess=sess)
    test_seconds = time.time() - start
    print('Test time:', test_seconds)

    # Row-wise Euclidean norm of the S component for each split.
    s_sum = np.linalg.norm(S, axis=1)
    s_sum_test = np.linalg.norm(S_test, axis=1)

    # A test row is labelled normal (1) only when its S row is exactly zero;
    # every other row is labelled anomalous (-1).
    out_y = np.where(s_sum_test == 0, 1, -1)
    pprint(metrics(y_test, out_y))

X shape:  (42924, 9)
L shape:  (42924, 9)
S shape:  (42924, 9)
Out iteration:  1
    iteration :  5 , cost :  9.673011e-05
    iteration :  10 , cost :  6.266473e-05
    iteration :  15 , cost :  5.0375333e-05
    iteration :  20 , cost :  3.6201553e-05
Out iteration:  2
    iteration :  5 , cost :  1.4361158e-05
    iteration :  10 , cost :  1.2063044e-05
    iteration :  15 , cost :  1.1468858e-05
    iteration :  20 , cost :  1.1822683e-05
Out iteration:  3
    iteration :  5 , cost :  1.0284e-05
    iteration :  10 , cost :  9.371776e-06
    iteration :  15 , cost :  8.129136e-06
    iteration :  20 , cost :  7.867877e-06
Train time: 62.65993547439575
Test time: 0.19852781295776367
{'AUPRC': 0.1582223599370553,
 'AUROC': 0.7914046365424422,
 'Confusion matrix': array([[ 3019,    59],
       [15860, 23987]], dtype=int64),
 'F1': 0.27499202987657695,
 'Precision': 0.15991313099210763,
 'Recall': 0.9808317089018843}
