In [1]:
cd ..

C:\Users\Nghia\PycharmProjects\ECML\Refactor


In [2]:
from pprint import pprint

import numpy as np
import tensorflow as tf
import time
from collections import Counter

from sklearn.svm import OneClassSVM
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor

from load_datasets import usps
from metrics import metrics
from models.AE1SVM import AEOneClassSVM
from models.DEC import DEC
from models.RDA import RobustL21Autoencoder

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


Extracting /tmp/data/train-images-idx3-ubyte.gz
Extracting /tmp/data/train-labels-idx1-ubyte.gz
Extracting /tmp/data/t10k-images-idx3-ubyte.gz
Extracting /tmp/data/t10k-labels-idx1-ubyte.gz


In [3]:
tf.set_random_seed(2018)

x_train, y_train, x_test, y_test = usps(random_state=3)

counter = Counter(y_train)
print('Anomalies ratio:', 100*counter[-1]/(counter[1]+counter[-1]), '%')

Anomalies ratio: 5.0 %


In [5]:
autoencoder_layers = [256, 128, 64, 32]
batch_size = 16

data_input = tf.placeholder(tf.float32, shape=[None, 256])

In [50]:
# Train conventional OCSVM
print('OCSVM-RBF')
libsvm = OneClassSVM(nu=0.11, verbose=True, shrinking=True, gamma=0.04)
t0 = time.time()
libsvm.fit(x_train)
print('Train time:', time.time() - t0)

t0 = time.time()
out_y = libsvm.predict(x_test)
print('Test time:', time.time() - t0)
pprint(metrics(y_test, out_y))

OCSVM-RBF
[LibSVM]Train time: 0.021056652069091797
Test time: 0.015059471130371094
{'AUPRC': 0.9974736842105263,
 'AUROC': 0.9747368421052631,
 'Confusion matrix': array([[451,  24],
       [  0,  25]], dtype=int64),
 'F1': 0.9740820734341253,
 'P@10': 0.9612244897959183,
 'Precision': 1.0,
 'Recall': 0.9494736842105264}


In [21]:
# Train conventional OCSVM
print('OCSVM-Linear')
libsvm = OneClassSVM(nu=0.15, verbose=True, shrinking=True, kernel='linear')
t0 = time.time()
libsvm.fit(x_train)
print('Train time:', time.time() - t0)

t0 = time.time()
out_y = libsvm.predict(x_test)
print('Test time:', time.time() - t0)
pprint(metrics(y_test, out_y))

OCSVM-RBF
[LibSVM]Train time: 0.03509020805358887
Test time: 0.018445730209350586
{'AUPRC': 0.9755563207083128,
 'AUROC': 0.7610526315789473,
 'Confusion matrix': array([[419,  56],
       [  9,  16]], dtype=int64),
 'F1': 0.9280177187153932,
 'P@10': 0.9551020408163265,
 'Precision': 0.9789719626168224,
 'Recall': 0.8821052631578947}


In [29]:
# Train Isolation Forest
print('IsolationForest')
iforest = IsolationForest(contamination=0.08, verbose=1)
t0 = time.time()
iforest.fit(x_train)
print('Train time:', time.time() - t0)

t0 = time.time()
out_y = iforest.predict(x_test)
print('Test time:', time.time() - t0)
pprint(metrics(y_test, out_y))

IsolationForest
Train time: 0.25407886505126953
Test time: 0.059157609939575195
{'AUPRC': 0.9986315789473684,
 'AUROC': 0.9863157894736843,
 'Confusion matrix': array([[462,  13],
       [  0,  25]], dtype=int64),
 'F1': 0.9861259338313767,
 'P@10': 0.963265306122449,
 'Precision': 1.0,
 'Recall': 0.9726315789473684}


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s finished


In [64]:
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    
    # Robust Deep Autoencoder
    rae = RobustL21Autoencoder(sess=sess, lambda_=1.7, layers_sizes=autoencoder_layers, learning_rate=1e-2)
    t0 = time.time()
    L, S = rae.fit(x_train, sess=sess, inner_iteration=5, iteration=5, verbose=True, batch_size=batch_size)
    print('Train time:', time.time() - t0)

    t0 = time.time()
    L_test, S_test = rae.predict(x_test, sess=sess)
    print('Test time:', time.time() - t0)

    s_sum = np.linalg.norm(S, axis=1)
    s_sum_test = np.linalg.norm(S_test, axis=1)
    out_y = [1 if s == 0 else -1 for s in s_sum_test]
    pprint(metrics(y_test, out_y))


X shape:  (500, 256)
L shape:  (500, 256)
S shape:  (500, 256)
Out iteration:  1
    iteration :  5 , cost :  0.007713088
Out iteration:  2
    iteration :  5 , cost :  0.0051952614
Out iteration:  3
    iteration :  5 , cost :  0.0051321657
Out iteration:  4
    iteration :  5 , cost :  0.005561769
Out iteration:  5
    iteration :  5 , cost :  0.0056907134
Train time: 1.6166083812713623
Test time: 0.015040874481201172
{'AUPRC': 0.998,
 'AUROC': 0.98,
 'Confusion matrix': array([[456,  19],
       [  0,  25]], dtype=int64),
 'F1': 0.9795918367346939,
 'P@10': 0.9530612244897959,
 'Precision': 1.0,
 'Recall': 0.96}


In [71]:
dec = DEC(dims=autoencoder_layers, n_clusters=5)
t0 = time.time()
dec.pretrain(x=x_train, epochs=1)
dec.compile(loss='kld')
y_pred = dec.fit(x_train, update_interval=10, batch_size=batch_size)
print('Train time:', time.time() - t0)

t0 = time.time()
scores = dec.cluster_score(x_test)
print('Test time:', time.time() - t0)
threshold = np.partition(scores.flatten(), int(counter[-1]))[int(counter[-1])]
out_y = [1 if s > threshold else -1 for s in scores]
pprint(metrics(y_test, out_y))

...Pretraining...
Epoch 1/1
Pretraining time:  1.5691707134246826
Update interval 10
Save interval 156.25
Initializing cluster centers with k-means.
delta_label  0.0 < tol  0.001
Reached tolerance threshold. Stopping training.
Train time: 4.227656841278076
Test time: 0.02710247039794922
{'AUPRC': 0.9924399763453579,
 'AUROC': 0.9263157894736843,
 'Confusion matrix': array([[443,  32],
       [  2,  23]], dtype=int64),
 'F1': 0.9630434782608697,
 'P@10': 0.9571428571428572,
 'Precision': 0.9955056179775281,
 'Recall': 0.9326315789473684}


In [109]:
ae1svm = AEOneClassSVM(data_input, batch_size, 'test', autoencoder_layers[1:], 0.28, 1e3, 3.0, 500,
                       autoencoder_activation='sigmoid',
                       full_op=tf.train.AdamOptimizer(5e-3),
                       svm_op=tf.train.AdamOptimizer(1e-4))

In [111]:
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    
    # Autoencoder-OneclassSVM
    t0 = time.time()
    ae1svm.fit_svm(sess, x_train, x_train, y_train, epochs_1=100, epochs_2=0)
    print('Train time:', time.time() - t0)

    t0 = time.time()
    out_y = ae1svm.predict(sess, x_test)
    print('Test time:', time.time() - t0)
    pprint(metrics(y_test, out_y))

Combined train
Epoch: 1 Loss: 0.6347761840820313 ( 5.934059247374535e-05 x 1000.0 + 0.5754356079101562 ) AUROC: 0.5
Epoch: 2 Loss: 0.46069891357421877 ( 3.311109915375709e-05 x 1000.0 + 0.42758782958984376 ) AUROC: 0.5
Epoch: 3 Loss: 0.3463565673828125 ( 2.874456159770489e-05 x 1000.0 + 0.31761199951171876 ) AUROC: 0.49894736842105264
Epoch: 4 Loss: 0.2627142333984375 ( 2.7297617867588998e-05 x 1000.0 + 0.23541661071777345 ) AUROC: 0.43473684210526314
Epoch: 5 Loss: 0.19983882141113282 ( 2.598492056131363e-05 x 1000.0 + 0.17385389709472657 ) AUROC: 0.5
Epoch: 6 Loss: 0.14975242614746093 ( 2.2064670920372008e-05 x 1000.0 + 0.12768775939941407 ) AUROC: 0.45473684210526316
Epoch: 7 Loss: 0.1108739013671875 ( 1.7661822959780694e-05 x 1000.0 + 0.09321207427978516 ) AUROC: 0.49894736842105264
Epoch: 8 Loss: 0.08266778564453126 ( 1.50770153850317e-05 x 1000.0 + 0.06759077453613281 ) AUROC: 0.9336842105263157
Epoch: 9 Loss: 0.061742645263671875 ( 1.3086799532175063e-05 x 1000.0 + 0.04865584564

Epoch: 70 Loss: 0.001038193941116333 ( 1.8922961317002774e-06 x 1000.0 + -0.0008541021347045899 ) AUROC: 0.7694736842105263
Epoch: 71 Loss: 0.0010958502292633056 ( 1.9794292747974397e-06 x 1000.0 + -0.0008835789561271668 ) AUROC: 0.9178947368421053
Epoch: 72 Loss: 0.0010592107772827149 ( 1.9000484608113765e-06 x 1000.0 + -0.0008408377766609192 ) AUROC: 0.8178947368421052
Epoch: 73 Loss: 0.0010719246864318848 ( 1.959417946636677e-06 x 1000.0 + -0.0008874932527542114 ) AUROC: 0.9326315789473685
Epoch: 74 Loss: 0.0010790703296661376 ( 1.9499320769682527e-06 x 1000.0 + -0.0008708617687225342 ) AUROC: 0.8389473684210527
Epoch: 75 Loss: 0.0010121195316314697 ( 1.868755673058331e-06 x 1000.0 + -0.0008566362261772155 ) AUROC: 0.9157894736842105
Epoch: 76 Loss: 0.0010500468015670777 ( 1.84722279664129e-06 x 1000.0 + -0.0007971760034561157 ) AUROC: 0.6189473684210527
Epoch: 77 Loss: 0.0008812465071678161 ( 1.7474241321906447e-06 x 1000.0 + -0.0008661776185035706 ) AUROC: 0.9147368421052631
Epoch

In [131]:
ae_only = AEOneClassSVM(data_input, batch_size, 'test_ae', autoencoder_layers[1:], 0.28, 1e3, 3.0, 500,
                       autoencoder_activation='sigmoid',
                       ae_op=tf.train.AdamOptimizer(5e-3))
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # Train autoencoder for conventional methods
    t0 = time.time()
    ae_only.fit_ae(sess, x_train, epochs=100)
    print('AE time:', time.time() - t0)

    x_train_encoded = ae_only.encode(sess, x_train)
    x_test_encoded = ae_only.encode(sess, x_test)

    x_train_rff = ae_only.encode_rff(sess, x_train)
    x_test_rff = ae_only.encode_rff(sess, x_test)

Autoencoder train
Epoch: 1 Loss: 0.00010470106452703476
Epoch: 2 Loss: 5.381616204977035e-05
Epoch: 3 Loss: 2.908489853143692e-05
Epoch: 4 Loss: 2.4308552965521813e-05
Epoch: 5 Loss: 2.0358726382255554e-05
Epoch: 6 Loss: 1.6976684331893922e-05
Epoch: 7 Loss: 1.454909611493349e-05
Epoch: 8 Loss: 1.1869112960994243e-05
Epoch: 9 Loss: 1.0397476144134998e-05
Epoch: 10 Loss: 9.539371356368065e-06
Epoch: 11 Loss: 8.81444290280342e-06
Epoch: 12 Loss: 8.34535900503397e-06
Epoch: 13 Loss: 8.289985358715057e-06
Epoch: 14 Loss: 7.725147996097803e-06
Epoch: 15 Loss: 7.5929155573248864e-06
Epoch: 16 Loss: 7.307859603315592e-06
Epoch: 17 Loss: 6.666031666100025e-06
Epoch: 18 Loss: 6.457554176449776e-06
Epoch: 19 Loss: 5.873235873878002e-06
Epoch: 20 Loss: 5.615349858999252e-06
Epoch: 21 Loss: 5.377361085265875e-06
Epoch: 22 Loss: 5.271044559776783e-06
Epoch: 23 Loss: 4.9993726424872875e-06
Epoch: 24 Loss: 5.024246871471405e-06
Epoch: 25 Loss: 4.674649331718683e-06
Epoch: 26 Loss: 4.39245393499732e-0

In [170]:
# Train conventional OCSVM on 
print('OCSVM-RBF')
libsvm = OneClassSVM(nu=0.12, verbose=True, shrinking=False)
t0 = time.time()
libsvm.fit(x_train_encoded)
print('Train time:', time.time() - t0)

t0 = time.time()
out_y = libsvm.predict(x_test_encoded)
print('Test time:', time.time() - t0)
pprint(metrics(y_test, out_y))

OCSVM-RBF
[LibSVM]Train time: 0.008021116256713867
Test time: 0.0030083656311035156
{'AUPRC': 0.9976842105263158,
 'AUROC': 0.976842105263158,
 'Confusion matrix': array([[453,  22],
       [  0,  25]], dtype=int64),
 'F1': 0.9762931034482758,
 'P@10': 0.9612244897959183,
 'Precision': 1.0,
 'Recall': 0.9536842105263158}


In [148]:
print('OCSVM-Linear')
libsvm = OneClassSVM(nu=0.22, verbose=True, shrinking=False, kernel='linear')
t0 = time.time()
libsvm.fit(x_train_encoded)
print('Train time:', time.time() - t0)

t0 = time.time()
out_y = libsvm.predict(x_test_encoded)
print('Test time:', time.time() - t0)
pprint(metrics(y_test, out_y))

OCSVM-Linear
[LibSVM]Train time: 0.021056413650512695
Test time: 0.0030105113983154297
{'AUPRC': 0.985731343283582,
 'AUROC': 0.86,
 'Confusion matrix': array([[399,  76],
       [  3,  22]], dtype=int64),
 'F1': 0.9099201824401367,
 'P@10': 0.9530612244897959,
 'Precision': 0.9925373134328358,
 'Recall': 0.84}


In [168]:
print('OCSVM-Linear on RFF')
libsvm = OneClassSVM(nu=0.115, verbose=True, shrinking=True, kernel='linear')
t0 = time.time()
libsvm.fit(x_train_rff)
print('Train time:', time.time() - t0)

t0 = time.time()
out_y = libsvm.predict(x_test_rff)
print('Test time:', time.time() - t0)
pprint(metrics(y_test, out_y))

OCSVM-Linear on RFF
[LibSVM]Train time: 0.08923578262329102
Test time: 0.05213737487792969
{'AUPRC': 0.9977894736842106,
 'AUROC': 0.9778947368421053,
 'Confusion matrix': array([[454,  21],
       [  0,  25]], dtype=int64),
 'F1': 0.9773950484391819,
 'P@10': 0.9591836734693877,
 'Precision': 1.0,
 'Recall': 0.9557894736842105}
