In [1]:
cd ..

C:\Users\Nghia\PycharmProjects\ECML\Refactor


In [2]:
from pprint import pprint

import numpy as np
import tensorflow as tf
import time
from collections import Counter

from sklearn.svm import OneClassSVM
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor

from load_datasets import musk
from metrics import metrics
from models.AE1SVM import AEOneClassSVM
from models.DEC import DEC
from models.RDA import RobustL21Autoencoder

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


Extracting /tmp/data/train-images-idx3-ubyte.gz
Extracting /tmp/data/train-labels-idx1-ubyte.gz
Extracting /tmp/data/t10k-images-idx3-ubyte.gz
Extracting /tmp/data/t10k-labels-idx1-ubyte.gz


In [3]:
# Seed TensorFlow's graph-level random number generator.
# NOTE(review): only TensorFlow is seeded here; NumPy / scikit-learn
# randomness used by later cells is not fixed by this call.
tf.set_random_seed(2018)

# Load the train/test split of the musk dataset.
x_train, y_train, x_test, y_test = musk(random_state=1)

# The ratio below counts label -1 as anomalous and label 1 as normal.
counter = Counter(y_train)
print('Anomalies ratio:', 100*counter[-1]/(counter[1]+counter[-1]), '%')

# Layer widths of the autoencoder; the first entry is the input width.
autoencoder_layers = [166, 80, 20]
batch_size = 16

# Fail early if the configured input width does not match the data, instead
# of hitting a shape error deep inside a training loop.
assert x_train.shape[1] == autoencoder_layers[0], (
    'autoencoder_layers[0] must equal the number of input features')

# Derive the placeholder width from the layer list so the input dimension is
# defined in exactly one place.
data_input = tf.placeholder(tf.float32, shape=[None, autoencoder_layers[0]])

Anomalies ratio: 3.1372549019607843 %


In [10]:
# Baseline: conventional one-class SVM (default kernel) on the raw features.
print('OCSVM-RBF')
libsvm = OneClassSVM(shrinking=True, verbose=True, nu=0.4)

# Time the fit on the training set.
fit_started = time.time()
libsvm.fit(x_train)
fit_elapsed = time.time() - fit_started
print('Train time:', fit_elapsed)

# Time the prediction on the test set, then report the metrics.
predict_started = time.time()
out_y = libsvm.predict(x_test)
predict_elapsed = time.time() - predict_started
print('Test time:', predict_elapsed)

report = metrics(y_test, out_y)
pprint(report)

OCSVM-RBF
[LibSVM]Train time: 0.3307621479034424
Test time: 0.19251537322998047
{'AUPRC': 0.9873615510159528,
 'AUROC': 0.8024275118004045,
 'Confusion matrix': array([[897, 586],
       [  0,  49]], dtype=int64),
 'F1': 0.753781512605042,
 'P@10': 0.9684625492772667,
 'Precision': 1.0,
 'Recall': 0.6048550236008091}


In [104]:
# Baseline: conventional one-class SVM with a linear kernel on the raw features.
print('OCSVM-Linear')
libsvm = OneClassSVM(kernel='linear', shrinking=True, verbose=True, nu=0.115)

# Time the fit on the training set.
fit_started = time.time()
libsvm.fit(x_train)
fit_elapsed = time.time() - fit_started
print('Train time:', fit_elapsed)

# Time the prediction on the test set, then report the metrics.
predict_started = time.time()
out_y = libsvm.predict(x_test)
predict_elapsed = time.time() - predict_started
print('Test time:', predict_elapsed)

report = metrics(y_test, out_y)
pprint(report)

OCSVM-Linear
[LibSVM]Train time: 0.10411667823791504
Test time: 0.056182861328125
{'AUPRC': 0.9973472197524952,
 'AUROC': 0.9585300067430884,
 'Confusion matrix': array([[1360,  123],
       [   0,   49]], dtype=int64),
 'F1': 0.9567358424199789,
 'P@10': 0.9697766097240473,
 'Precision': 1.0,
 'Recall': 0.9170600134861767}


In [99]:
# Baseline: Isolation Forest on the raw features.
print('IsolationForest')
iforest = IsolationForest(verbose=1, contamination=0.04)

# Time the fit on the training set.
fit_started = time.time()
iforest.fit(x_train)
fit_elapsed = time.time() - fit_started
print('Train time:', fit_elapsed)

# Time the prediction on the test set, then report the metrics.
predict_started = time.time()
out_y = iforest.predict(x_test)
predict_elapsed = time.time() - predict_started
print('Test time:', predict_elapsed)

report = metrics(y_test, out_y)
pprint(report)

IsolationForest


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s finished


Train time: 0.5083460807800293
Test time: 0.14738988876342773
{'AUPRC': 0.9996117882564627,
 'AUROC': 0.9939312204989885,
 'Confusion matrix': array([[1465,   18],
       [   0,   49]], dtype=int64),
 'F1': 0.9938941655359566,
 'P@10': 0.9730617608409987,
 'Precision': 1.0,
 'Recall': 0.987862440997977}


In [86]:
# Baseline: Local Outlier Factor fitted on the raw training features and
# evaluated on the held-out test set.
print('LOF')
lof = LocalOutlierFactor(contamination=0.1)
t0 = time.time()
lof.fit(x_train)
print('Train time:', time.time() - t0)

t0 = time.time()
# NOTE(review): `_predict` is a private scikit-learn method. Newer releases
# expose out-of-sample prediction publicly via
# `LocalOutlierFactor(novelty=True)` + `predict`; switch to it once the
# installed scikit-learn version supports that parameter -- TODO confirm.
out_y = lof._predict(x_test)
print('Test time:', time.time() - t0)
pprint(metrics(y_test, out_y))

LOF
Train time: 0.6838169097900391
Test time: 0.6326839923858643
{'AUPRC': 0.9643362970772152,
 'AUROC': 0.4403236682400539,
 'Confusion matrix': array([[1306,  177],
       [  49,    0]], dtype=int64),
 'F1': 0.9203664552501762,
 'P@10': 0.9678055190538765,
 'Precision': 0.9638376383763838,
 'Recall': 0.8806473364801078}


In [131]:
# Build two AEOneClassSVM instances on the shared `data_input` placeholder.
# `autoencoder_layers[1:]` drops the input width and passes only the
# remaining layer sizes.
# NOTE(review): the positional arguments after the layer list
# (0.28 / 0.25, 1e4, 3.0, 500) are unnamed here; confirm their meaning against
# AEOneClassSVM's signature and consider passing them as keywords.
# NOTE(review): both instances pass 'test' as the third argument; if that is a
# variable-scope or model name, confirm that sharing it is intended.

# Instance used by the combined training cell (receives `full_op` and `svm_op`).
ae1svm = AEOneClassSVM(data_input, batch_size, 'test', autoencoder_layers[1:], 0.28, 1e4, 3.0, 500,
                       autoencoder_activation='sigmoid',
                       full_op=tf.train.AdamOptimizer(5e-3),
                       svm_op=tf.train.AdamOptimizer(1e-5))

# Instance used only through `fit_ae` / `encode` / `encode_rff` later on
# (receives `ae_op`).
ae_only = AEOneClassSVM(data_input, batch_size, 'test', autoencoder_layers[1:], 0.25, 1e4, 3.0, 500,
                        autoencoder_activation='sigmoid', ae_op=tf.train.AdamOptimizer(5e-3))

In [133]:
# Train and evaluate the AE-1SVM model inside a fresh TensorFlow session.
with tf.Session() as sess:
    # Initialise every variable currently in the graph before training.
    sess.run(tf.global_variables_initializer())

    # Autoencoder-OneclassSVM
    t0 = time.time()
    # NOTE(review): `x_train` is passed twice and `y_train` is passed as well;
    # the per-epoch AUROC in the training log presumably comes from these
    # extra arguments -- confirm against AEOneClassSVM.fit.
    # NOTE(review): `epochs_2=0` presumably skips the separate SVM-only phase
    # (the log prints "SVM train" with no epochs after it) -- confirm.
    ae1svm.fit(sess, x_train, x_train, y_train, epochs_1=71, epochs_2=0)
    print('Train time:', time.time() - t0)

    # Time prediction on the held-out test set.
    t0 = time.time()
    out_y = ae1svm.predict(sess, x_test)
    print('Test time:', time.time() - t0)

    pprint(metrics(y_test, out_y))

Combined train
Epoch: 1 Loss: 0.35437215169270836 ( 2.5310294300902123e-05 x 10000.0 + 0.10126920213886335 ) AUROC: 0.4659244264507422
Epoch: 2 Loss: 0.24059498107511235 ( 2.0111786822477975e-05 x 10000.0 + 0.03947711770051445 ) AUROC: 0.3690958164642375
Epoch: 3 Loss: 0.19159184873493668 ( 1.7698292979617525e-05 x 10000.0 + 0.014608917984307981 ) AUROC: 0.36808367071524967
Epoch: 4 Loss: 0.15702077828201594 ( 1.5207761107316983e-05 x 10000.0 + 0.004943164501314849 ) AUROC: 0.510501012145749
Epoch: 5 Loss: 0.133790598351971 ( 1.323814534283931e-05 x 10000.0 + 0.0014091488582636017 ) AUROC: 0.4973009446693657
Epoch: 6 Loss: 0.11680223053576899 ( 1.1658988698245654e-05 x 10000.0 + 0.00021234468307370453 ) AUROC: 0.6688174763832658
Epoch: 7 Loss: 0.10398127736608967 ( 1.0412869354089101e-05 x 10000.0 + -0.00014741173367095148 ) AUROC: 0.5500168690958165
Epoch: 8 Loss: 0.093745432336346 ( 9.399521000245038e-06 x 10000.0 + -0.0002497758740693136 ) AUROC: 0.6306089743589743
Epoch: 9 Loss: 0.

Epoch: 70 Loss: 0.030102932998557496 ( 3.03878564557998e-06 x 10000.0 + -0.00028492202556211184 ) AUROC: 0.9615384615384616
Epoch: 71 Loss: 0.03050560296750536 ( 3.078952431678772e-06 x 10000.0 + -0.00028392066752988527 ) AUROC: 0.9935897435897436
SVM train
Train time: 13.401631355285645
Test time: 0.014037132263183594
{'AUPRC': 0.9998058941282314,
 'AUROC': 0.9969656102494943,
 'Confusion matrix': array([[1474,    9],
       [   0,   49]], dtype=int64),
 'F1': 0.9969563747040919,
 'P@10': 0.973718791064389,
 'Precision': 1.0,
 'Recall': 0.9939312204989885}


In [54]:
# Train the autoencoder-only model and export encoded features for the
# conventional baselines in the following cells.
with tf.Session() as sess:
    # Initialise every variable currently in the graph before training.
    sess.run(tf.global_variables_initializer())
    # Train autoencoder for conventional methods
    t0 = time.time()
    ae_only.fit_ae(sess, x_train, epochs=100)
    print('AE time:', time.time() - t0)

    # Encoded representations of both splits; the results are kept as
    # notebook-level variables because later cells fit on `x_train_encoded`
    # and predict on `x_test_encoded`.
    x_train_encoded = ae_only.encode(sess, x_train)
    x_test_encoded = ae_only.encode(sess, x_test)

    # Second feature set from `encode_rff`, used by the "OCSVM-Linear on RFF"
    # cell. NOTE(review): presumably random Fourier features computed on the
    # encoding -- confirm against AEOneClassSVM.encode_rff.
    x_train_rff = ae_only.encode_rff(sess, x_train)
    x_test_rff = ae_only.encode_rff(sess, x_test)

Autoencoder train
Epoch: 1 Loss: 2.4847946817578832e-05
Epoch: 2 Loss: 1.8565625780158574e-05
Epoch: 3 Loss: 1.549348356871823e-05
Epoch: 4 Loss: 1.3128240757128772e-05
Epoch: 5 Loss: 1.1289648078625499e-05
Epoch: 6 Loss: 9.984816454984004e-06
Epoch: 7 Loss: 9.084189176754234e-06
Epoch: 8 Loss: 8.417192383918887e-06
Epoch: 9 Loss: 7.894887814140007e-06
Epoch: 10 Loss: 7.463475457983079e-06
Epoch: 11 Loss: 7.0988474523319915e-06
Epoch: 12 Loss: 6.785515233192569e-06
Epoch: 13 Loss: 6.510776291096132e-06
Epoch: 14 Loss: 6.2657520174980165e-06
Epoch: 15 Loss: 6.049388130895453e-06
Epoch: 16 Loss: 5.8572921974986205e-06
Epoch: 17 Loss: 5.692568017182008e-06
Epoch: 18 Loss: 5.558453305484423e-06
Epoch: 19 Loss: 5.383723798920127e-06
Epoch: 20 Loss: 5.165726749920377e-06
Epoch: 21 Loss: 5.013275635885258e-06
Epoch: 22 Loss: 4.883406659454302e-06
Epoch: 23 Loss: 4.763764578824729e-06
Epoch: 24 Loss: 4.651054155592825e-06
Epoch: 25 Loss: 4.555836770366999e-06
Epoch: 26 Loss: 4.679041007667585e

In [58]:
# Conventional one-class SVM (default kernel) on the autoencoder-encoded features.
print('OCSVM-RBF')
libsvm = OneClassSVM(shrinking=True, verbose=True, nu=0.21)

# Time the fit on the encoded training set.
fit_started = time.time()
libsvm.fit(x_train_encoded)
fit_elapsed = time.time() - fit_started
print('Train time:', fit_elapsed)

# Time the prediction on the encoded test set, then report the metrics.
predict_started = time.time()
out_y = libsvm.predict(x_test_encoded)
predict_elapsed = time.time() - predict_started
print('Test time:', predict_elapsed)

report = metrics(y_test, out_y)
pprint(report)

OCSVM-RBF
[LibSVM]Train time: 0.04612135887145996
Test time: 0.02406620979309082
{'AUPRC': 0.993874881379745,
 'AUROC': 0.9042481456507081,
 'Confusion matrix': array([[1199,  284],
       [   0,   49]], dtype=int64),
 'F1': 0.8941088739746458,
 'P@10': 0.9684625492772667,
 'Precision': 1.0,
 'Recall': 0.8084962913014161}


In [65]:
# Linear-kernel one-class SVM on the autoencoder-encoded features.
print('OCSVM-Linear')
libsvm = OneClassSVM(kernel='linear', shrinking=True, verbose=True, nu=0.21)

# Time the fit on the encoded training set.
fit_started = time.time()
libsvm.fit(x_train_encoded)
fit_elapsed = time.time() - fit_started
print('Train time:', fit_elapsed)

# Time the prediction on the encoded test set, then report the metrics.
predict_started = time.time()
out_y = libsvm.predict(x_test_encoded)
predict_elapsed = time.time() - predict_started
print('Test time:', predict_elapsed)

report = metrics(y_test, out_y)
pprint(report)


OCSVM-Linear
[LibSVM]Train time: 0.030548810958862305
Test time: 0.009020328521728516
{'AUPRC': 0.9737229631001396,
 'AUROC': 0.5914720574676263,
 'Confusion matrix': array([[1149,  334],
       [  29,   20]], dtype=int64),
 'F1': 0.8635851183765502,
 'P@10': 0.9678055190538765,
 'Precision': 0.9753820033955858,
 'Recall': 0.7747808496291302}


In [70]:
# Linear-kernel one-class SVM on the `encode_rff` features (shrinking disabled).
print('OCSVM-Linear on RFF')
libsvm = OneClassSVM(kernel='linear', shrinking=False, verbose=True, nu=0.25)

# Time the fit on the RFF training features.
fit_started = time.time()
libsvm.fit(x_train_rff)
fit_elapsed = time.time() - fit_started
print('Train time:', fit_elapsed)

# Time the prediction on the RFF test features, then report the metrics.
predict_started = time.time()
out_y = libsvm.predict(x_test_rff)
predict_elapsed = time.time() - predict_started
print('Test time:', predict_elapsed)

report = metrics(y_test, out_y)
pprint(report)

OCSVM-Linear on RFF
[LibSVM]Train time: 1.280402660369873
Test time: 0.7008624076843262
{'AUPRC': 0.9914286758615274,
 'AUROC': 0.8666451071325361,
 'Confusion matrix': array([[1148,  335],
       [   2,   47]], dtype=int64),
 'F1': 0.8720091150778579,
 'P@10': 0.9684625492772667,
 'Precision': 0.9982608695652174,
 'Recall': 0.7741065407956844}


In [108]:
# Train and evaluate the Robust Deep Autoencoder (L2,1 variant) baseline.
with tf.Session() as sess:
    # NOTE(review): this initialiser runs BEFORE the RobustL21Autoencoder is
    # constructed, so it cannot cover variables created by the constructor;
    # presumably the model initialises its own variables -- confirm.
    sess.run(tf.global_variables_initializer())
    
    # Robust Deep Autoencoder
    rae = RobustL21Autoencoder(sess=sess, lambda_=0.8, layers_sizes=autoencoder_layers, learning_rate=5e-3)
    t0 = time.time()
    # `fit` returns two arrays; the training log prints both with the same
    # shape as the input X.
    L, S = rae.fit(x_train, sess=sess, inner_iteration=50, iteration=5, verbose=True, batch_size=batch_size)
    print('Train time:', time.time() - t0)

    t0 = time.time()
    L_test, S_test = rae.predict(x_test, sess=sess)
    print('Test time:', time.time() - t0)

    # Row-wise L2 norms of the S components.
    # NOTE(review): `s_sum` (training norms) is never used below.
    s_sum = np.linalg.norm(S, axis=1)
    s_sum_test = np.linalg.norm(S_test, axis=1)
    # A test row is labelled normal (1) only when its S row norm is exactly
    # zero, otherwise anomalous (-1).
    # NOTE(review): exact float comparison; this relies on the model producing
    # exact zeros in S for normal rows -- confirm.
    out_y = [1 if s == 0 else -1 for s in s_sum_test]
    pprint(metrics(y_test, out_y))



X shape:  (1530, 166)
L shape:  (1530, 166)
S shape:  (1530, 166)
Out iteration:  1
    iteration :  5 , cost :  0.022789463
    iteration :  10 , cost :  0.01646576
    iteration :  15 , cost :  0.011062074
    iteration :  20 , cost :  0.008873121
    iteration :  25 , cost :  0.007635975
    iteration :  30 , cost :  0.0067086588
    iteration :  35 , cost :  0.0059770215
    iteration :  40 , cost :  0.0054218457
    iteration :  45 , cost :  0.0049435017
    iteration :  50 , cost :  0.0045602918
Out iteration:  2
    iteration :  5 , cost :  0.0032973778
    iteration :  10 , cost :  0.003116399
    iteration :  15 , cost :  0.0029618056
    iteration :  20 , cost :  0.0028378225
    iteration :  25 , cost :  0.0027382926
    iteration :  30 , cost :  0.0026529657
    iteration :  35 , cost :  0.0025803691
    iteration :  40 , cost :  0.0025227075
    iteration :  45 , cost :  0.002480416
    iteration :  50 , cost :  0.002445822
Out iteration:  3
    iteration :  5 , cost :  0.

In [114]:
    # Deep Embedded Clustering baseline: pretrain, cluster-fit on the training
    # set, then threshold a per-sample cluster score on the test set.
    # NOTE(review): the whole cell is indented one level although it is not
    # inside any block; IPython tolerates this, but a plain Python script
    # would raise IndentationError.
    dec = DEC(dims=autoencoder_layers, n_clusters=5)
    t0 = time.time()
    dec.pretrain(x=x_train, epochs=10)
    dec.compile(loss='kld')
    # NOTE(review): `y_pred` is not used afterwards.
    y_pred = dec.fit(x_train, update_interval=10, batch_size=batch_size)
    print('Train time:', time.time() - t0)

    t0 = time.time()
    scores = dec.cluster_score(x_test)
    # k-th smallest test score, where k = counter[-1]; `counter` is built from
    # y_train in the configuration cell, so k is the number of training
    # samples labelled -1.
    # NOTE(review): this derives the test threshold from ground-truth training
    # labels -- confirm this is acceptable for the comparison.
    threshold = np.partition(scores.flatten(), int(counter[-1]))[int(counter[-1])]
    print('Test time:', time.time() - t0)
    # Samples scoring above twice the threshold are labelled normal (1), the
    # rest anomalous (-1).
    # NOTE(review): the factor 2 is an unexplained constant.
    out_y = [1 if s > 2*threshold else -1 for s in scores]
    pprint(metrics(y_test, out_y))


...Pretraining...
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Pretraining time:  2.7543230056762695
Update interval 10
Save interval 478.125
Initializing cluster centers with k-means.
delta_label  0.00065359477124183 < tol  0.001
Reached tolerance threshold. Stopping training.
Train time: 9.408772706985474
Test time: 0.07119011878967285
{'AUPRC': 1.0,
 'AUROC': 1.0,
 'Confusion matrix': array([[1483,    0],
       [   0,   49]], dtype=int64),
 'F1': 1.0,
 'P@10': 0.9743758212877792,
 'Precision': 1.0,
 'Recall': 1.0}
