In [1]:
cd ..

C:\Users\Nghia\PycharmProjects\ECML\Refactor


In [2]:
from pprint import pprint

import numpy as np
import tensorflow as tf
import time
from collections import Counter

from sklearn.svm import OneClassSVM
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor

from load_datasets import usps
from metrics import metrics
from models.AE1SVM import AEOneClassSVM
from models.DEC import DEC
from models.RDA import RobustL21Autoencoder

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


Extracting /tmp/data/train-images-idx3-ubyte.gz
Extracting /tmp/data/train-labels-idx1-ubyte.gz
Extracting /tmp/data/t10k-images-idx3-ubyte.gz
Extracting /tmp/data/t10k-labels-idx1-ubyte.gz


In [3]:
tf.set_random_seed(2018)

x_train, y_train, x_test, y_test = usps(random_state=3)

counter = Counter(y_train)
print('Anomalies ratio:', 100*counter[-1]/(counter[1]+counter[-1]), '%')

Anomalies ratio: 5.0 %


In [4]:
autoencoder_layers = [256, 128, 64, 32]
batch_size = 16

data_input = tf.placeholder(tf.float32, shape=[None, 256])

In [5]:
# Train conventional OCSVM
print('OCSVM-RBF')
libsvm = OneClassSVM(nu=0.11, verbose=True, shrinking=True, gamma=0.04)
t0 = time.time()
libsvm.fit(x_train)
print('Train time:', time.time() - t0)

t0 = time.time()
out_y = libsvm.predict(x_test)
print('Test time:', time.time() - t0)
pprint(metrics(y_test, out_y))

OCSVM-RBF
[LibSVM]Train time: 0.07118916511535645
Test time: 0.017548322677612305
{'AUPRC': 0.5102040816326531,
 'AUROC': 0.9747368421052632,
 'Confusion matrix': array([[ 25,   0],
       [ 24, 451]], dtype=int64),
 'F1': 0.6756756756756758,
 'Precision': 0.5102040816326531,
 'Recall': 1.0}


In [6]:
# Train Isolation Forest
print('IsolationForest')
iforest = IsolationForest(contamination=0.08, verbose=1)
t0 = time.time()
iforest.fit(x_train)
print('Train time:', time.time() - t0)

t0 = time.time()
out_y = iforest.predict(x_test)
print('Test time:', time.time() - t0)
pprint(metrics(y_test, out_y))

IsolationForest


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s finished


Train time: 0.46273279190063477
Test time: 0.11179733276367188
{'AUPRC': 0.6097560975609756,
 'AUROC': 0.9831578947368421,
 'Confusion matrix': array([[ 25,   0],
       [ 16, 459]], dtype=int64),
 'F1': 0.7575757575757575,
 'Precision': 0.6097560975609756,
 'Recall': 1.0}


In [7]:
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    
    # Robust Deep Autoencoder
    rae = RobustL21Autoencoder(sess=sess, lambda_=1.7, layers_sizes=autoencoder_layers, learning_rate=1e-2)
    t0 = time.time()
    L, S = rae.fit(x_train, sess=sess, inner_iteration=5, iteration=5, verbose=True, batch_size=batch_size)
    print('Train time:', time.time() - t0)

    t0 = time.time()
    L_test, S_test = rae.predict(x_test, sess=sess)
    print('Test time:', time.time() - t0)

    s_sum = np.linalg.norm(S, axis=1)
    s_sum_test = np.linalg.norm(S_test, axis=1)
    out_y = np.array([1 if s == 0 else -1 for s in s_sum_test])
    pprint(metrics(y_test, out_y))


X shape:  (500, 256)
L shape:  (500, 256)
S shape:  (500, 256)
Out iteration:  1
    iteration :  5 , cost :  0.0074171685
Out iteration:  2
    iteration :  5 , cost :  0.005052941
Out iteration:  3
    iteration :  5 , cost :  0.005326478
Out iteration:  4
    iteration :  5 , cost :  0.0056101335
Out iteration:  5
    iteration :  5 , cost :  0.005672169
Train time: 2.7923684120178223
Test time: 0.01905059814453125
{'AUPRC': 0.5681818181818182,
 'AUROC': 0.98,
 'Confusion matrix': array([[ 25,   0],
       [ 19, 456]], dtype=int64),
 'F1': 0.7246376811594203,
 'Precision': 0.5681818181818182,
 'Recall': 1.0}


In [8]:
dec = DEC(dims=autoencoder_layers, n_clusters=5)
t0 = time.time()
dec.pretrain(x=x_train, epochs=1)
dec.compile(loss='kld')
y_pred = dec.fit(x_train, update_interval=10, batch_size=batch_size)
print('Train time:', time.time() - t0)

t0 = time.time()
scores = dec.cluster_score(x_test)
print('Test time:', time.time() - t0)
threshold = np.partition(scores.flatten(), int(counter[-1]))[int(counter[-1])]
out_y = np.array([1 if s > threshold else -1 for s in scores])
pprint(metrics(y_test, out_y))

...Pretraining...
Epoch 1/1
Pretraining time:  2.2835824489593506
Update interval 10
Save interval 156.25
Initializing cluster centers with k-means.
delta_label  0.0 < tol  0.001
Reached tolerance threshold. Stopping training.
Train time: 3.767235517501831
Test time: 0.027072429656982422
{'AUPRC': 0.7506153846153846,
 'AUROC': 0.9357894736842105,
 'Confusion matrix': array([[ 22,   3],
       [  4, 471]], dtype=int64),
 'F1': 0.8627450980392156,
 'Precision': 0.8461538461538461,
 'Recall': 0.88}


In [9]:
ae1svm = AEOneClassSVM(data_input, batch_size, 'test', autoencoder_layers[1:], 0.28, 1e3, 3.0, 500,
                       autoencoder_activation='sigmoid', seed=10,
                       full_op=tf.train.AdamOptimizer(5e-3),
                       svm_op=tf.train.AdamOptimizer(1e-4))

In [10]:
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    
    # Autoencoder-OneclassSVM
    t0 = time.time()
    ae1svm.fit(sess, x_train, epochs_1=120, epochs_2=0)
    print('Train time:', time.time() - t0)

    t0 = time.time()
    out_y = ae1svm.predict(sess, x_test)
    print('Test time:', time.time() - t0)
    pprint(metrics(y_test, out_y))

Combined train
........................................................................................................................SVM train
Train time: 17.582940101623535
Test time: 0.04261422157287598
{'AUPRC': 0.8064516129032258,
 'AUROC': 0.9936842105263157,
 'Confusion matrix': array([[ 25,   0],
       [  6, 469]], dtype=int64),
 'F1': 0.8928571428571428,
 'Precision': 0.8064516129032258,
 'Recall': 1.0}


In [11]:
ae_only = AEOneClassSVM(data_input, batch_size, 'test_ae', autoencoder_layers[1:], 0.28, 1e3, 3.0, 500,
                       autoencoder_activation='sigmoid',
                       ae_op=tf.train.AdamOptimizer(5e-3))
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # Train autoencoder for conventional methods
    t0 = time.time()
    ae_only.fit_ae(sess, x_train, epochs=100)
    print('AE time:', time.time() - t0)

    x_train_encoded = ae_only.encode(sess, x_train)
    x_test_encoded = ae_only.encode(sess, x_test)

    x_train_rff = ae_only.encode_rff(sess, x_train)
    x_test_rff = ae_only.encode_rff(sess, x_test)

Autoencoder train
....................................................................................................AE time: 11.076406955718994


In [12]:
# Train conventional OCSVM on 
print('OCSVM-RBF')
libsvm = OneClassSVM(nu=0.12, verbose=True, shrinking=False)
t0 = time.time()
libsvm.fit(x_train_encoded)
print('Train time:', time.time() - t0)

t0 = time.time()
out_y = libsvm.predict(x_test_encoded)
print('Test time:', time.time() - t0)
pprint(metrics(y_test, out_y))

OCSVM-RBF
[LibSVM]Train time: 0.04612278938293457
Test time: 0.0025072097778320312
{'AUPRC': 0.4716981132075472,
 'AUROC': 0.9705263157894737,
 'Confusion matrix': array([[ 25,   0],
       [ 28, 447]], dtype=int64),
 'F1': 0.641025641025641,
 'Precision': 0.4716981132075472,
 'Recall': 1.0}


In [13]:
print('OCSVM-Linear on RFF')
libsvm = OneClassSVM(nu=0.11, verbose=True, shrinking=True, kernel='linear')
t0 = time.time()
libsvm.fit(x_train_rff)
print('Train time:', time.time() - t0)

t0 = time.time()
out_y = libsvm.predict(x_test_rff)
print('Test time:', time.time() - t0)
pprint(metrics(y_test, out_y))

OCSVM-Linear on RFF
[LibSVM]Train time: 0.11680960655212402
Test time: 0.10628986358642578
{'AUPRC': 0.49019607843137253,
 'AUROC': 0.9726315789473684,
 'Confusion matrix': array([[ 25,   0],
       [ 26, 449]], dtype=int64),
 'F1': 0.6578947368421052,
 'Precision': 0.49019607843137253,
 'Recall': 1.0}
