In [1]:
cd ..

C:\Users\Nghia\PycharmProjects\ECML\Refactor


In [3]:
from pprint import pprint

import numpy as np
import tensorflow as tf
import time
from collections import Counter

from sklearn.svm import OneClassSVM
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor

from load_datasets import shuttle
from metrics import metrics
from models.AE1SVM import AEOneClassSVM
from models.DEC import DEC
from models.RDA import RobustL21Autoencoder

Using TensorFlow backend.


Extracting /tmp/data/train-images-idx3-ubyte.gz
Extracting /tmp/data/train-labels-idx1-ubyte.gz
Extracting /tmp/data/t10k-images-idx3-ubyte.gz
Extracting /tmp/data/t10k-labels-idx1-ubyte.gz


In [4]:
# Seed both random number generators this notebook depends on. The TensorFlow
# graph-level seed alone leaves NumPy's global generator unseeded, and
# scikit-learn estimators created without `random_state` draw from that
# generator, so their results would otherwise change from run to run.
tf.set_random_seed(2018)
np.random.seed(2018)

# `shuttle` is a project-local loader; it returns the train/test features and
# labels used by every experiment below.
x_train, y_train, x_test, y_test = shuttle(random_state=1)

# Label counts of the training split. The ratio below treats label -1 as an
# anomaly and label 1 as a normal sample.
counter = Counter(y_train)
print('Anomalies ratio:', 100*counter[-1]/(counter[1]+counter[-1]), '%')

Anomalies ratio: 7.16848383188892 %


In [46]:
# Train Local Outlier Factor and score the held-out split.
# `novelty=True` (scikit-learn >= 0.20) enables the public `predict` method for
# unseen data. The previous code called the private `_predict`, which is not
# part of the supported API and may change or disappear between releases.
print('LOF')
lof = LocalOutlierFactor(contamination=0.2, novelty=True)
t0 = time.time()
lof.fit(x_train)
print('Train time:', time.time() - t0)

# Prediction is timed separately from fitting.
t0 = time.time()
out_y = lof.predict(x_test)
print('Test time:', time.time() - t0)
pprint(metrics(y_test, out_y))

LOF
Train time: 2.3071341514587402
Test time: 1.9401590824127197
{'AUPRC': 0.9322628066590897,
 'AUROC': 0.5296563407992034,
 'Confusion matrix': array([[31815,  8032],
       [ 2275,   803]], dtype=int64),
 'F1': 0.8605975357398866,
 'P@10': 0.9282768262845159,
 'Precision': 0.9332648870636551,
 'Recall': 0.7984289908901548}


In [64]:
# Train Isolation Forest and score the held-out split.
# Isolation Forest builds its trees from random sub-samples and random splits,
# so `random_state` is pinned to make the reported metrics reproducible across
# kernel restarts (it was previously left unset).
print('IsolationForest')
iforest = IsolationForest(contamination=0.09, verbose=1, random_state=2018)
t0 = time.time()
iforest.fit(x_train)
print('Train time:', time.time() - t0)

# Prediction is timed separately from fitting.
t0 = time.time()
out_y = iforest.predict(x_test)
print('Test time:', time.time() - t0)
pprint(metrics(y_test, out_y))

IsolationForest


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.6s finished


Train time: 2.269031286239624
Test time: 1.5932364463806152
{'AUPRC': 0.9972361976873878,
 'AUROC': 0.9812839300382443,
 'Confusion matrix': array([[38938,   909],
       [   45,  3033]], dtype=int64),
 'F1': 0.9878980083724471,
 'P@10': 0.928486543166725,
 'Precision': 0.99884565066824,
 'Recall': 0.9771877431174242}


In [51]:
# Baseline: conventional one-class SVM (RBF kernel, per the label below)
# fitted directly on the raw input features.
print('OCSVM-RBF')
libsvm = OneClassSVM(
    nu=0.15,
    verbose=True,
    shrinking=False,
)

# Fit and report the wall-clock training time.
t0 = time.time()
libsvm.fit(x_train)
train_seconds = time.time() - t0
print('Train time:', train_seconds)

# Predict on the held-out split and report the wall-clock inference time.
t0 = time.time()
out_y = libsvm.predict(x_test)
test_seconds = time.time() - t0
print('Test time:', test_seconds)

# `metrics` is the project-local scoring helper; its result is pretty-printed.
report = metrics(y_test, out_y)
pprint(report)

OCSVM-RBF
[LibSVM]Train time: 22.13305139541626
Test time: 5.437685489654541
{'AUPRC': 0.9902865885503093,
 'AUROC': 0.9338761332271376,
 'Confusion matrix': array([[36325,  3522],
       [  135,  2943]], dtype=int64),
 'F1': 0.952075170036825,
 'P@10': 0.9284399394151229,
 'Precision': 0.9962973121228744,
 'Recall': 0.9116119155770823}


In [54]:
# Baseline: conventional one-class SVM with a linear kernel, fitted directly
# on the raw input features.
print('OCSVM-Linear')
libsvm = OneClassSVM(
    nu=0.15,
    verbose=True,
    shrinking=False,
    kernel='linear',
)

# Fit and report the wall-clock training time.
t0 = time.time()
libsvm.fit(x_train)
train_seconds = time.time() - t0
print('Train time:', train_seconds)

# Predict on the held-out split and report the wall-clock inference time.
t0 = time.time()
out_y = libsvm.predict(x_test)
test_seconds = time.time() - t0
print('Test time:', test_seconds)

# `metrics` is the project-local scoring helper; its result is pretty-printed.
report = metrics(y_test, out_y)
pprint(report)

OCSVM-Linear
[LibSVM]Train time: 10.76364779472351
Test time: 2.140688180923462
{'AUPRC': 0.9180382811170125,
 'AUROC': 0.4219378482670223,
 'Confusion matrix': array([[33367,  6480],
       [ 3058,    20]], dtype=int64),
 'F1': 0.8749475561149571,
 'P@10': 0.9282768262845159,
 'Precision': 0.9160466712422787,
 'Recall': 0.8373779707380732}


In [5]:
# Shared hyper-parameters. `autoencoder_layers` is also reused by the RDA and
# DEC cells further down; `batch_size` is reused there as well.
autoencoder_layers = [9, 6, 2]
batch_size = 16

# One placeholder feeds both models. Its width is the first entry of
# `autoencoder_layers` (9), and the models receive the remaining layer sizes.
data_input = tf.placeholder(tf.float32, shape=[None, autoencoder_layers[0]])

# Jointly trained model: it receives a `full_op` and an `svm_op` optimizer.
# NOTE(review): the positional values 0.4, 1e3, 3.0, 50 are passed straight to
# AEOneClassSVM; their meaning is defined in models/AE1SVM -- confirm there.
full_optimizer = tf.train.AdamOptimizer(1e-3)
svm_optimizer = tf.train.AdamOptimizer(1e-4)
ae1svm = AEOneClassSVM(
    data_input, batch_size, 'test', autoencoder_layers[1:], 0.4, 1e3, 3.0, 50,
    autoencoder_activation='sigmoid',
    full_op=full_optimizer,
    svm_op=svm_optimizer,
)

# Autoencoder-only model: same positional configuration (including the name
# 'test'), but it is given only an `ae_op` optimizer.
ae_optimizer = tf.train.AdamOptimizer(1e-3)
ae_only = AEOneClassSVM(
    data_input, batch_size, 'test', autoencoder_layers[1:], 0.4, 1e3, 3.0, 50,
    autoencoder_activation='sigmoid',
    ae_op=ae_optimizer,
)

In [67]:
# Train and evaluate the combined autoencoder + one-class SVM model inside its
# own session, which is closed when the block exits.
with tf.Session() as sess:
    init_op = tf.global_variables_initializer()
    sess.run(init_op)

    # Autoencoder-OneclassSVM: x_train is passed twice and y_train once.
    # NOTE(review): the training log prints an AUROC per epoch, so y_train is
    # presumably used for monitoring only -- confirm in models/AE1SVM.
    t0 = time.time()
    ae1svm.fit(sess, x_train, x_train, y_train, epochs_1=10, epochs_2=1)
    train_seconds = time.time() - t0
    print('Train time:', train_seconds)

    # Inference on the held-out split, timed separately.
    t0 = time.time()
    out_y = ae1svm.predict(sess, x_test)
    test_seconds = time.time() - t0
    print('Test time:', test_seconds)

    report = metrics(y_test, out_y)
    pprint(report)


Combined train
Epoch: 1 Loss: 4.994146600150404e-05 ( 5.30493751361962e-08 x 1000.0 + -3.1079064876565995e-06 ) AUROC: 0.6286051785388178
Epoch: 2 Loss: 1.5221874285761835e-05 ( 2.6635494212133167e-08 x 1000.0 + -1.1413620859342879e-05 ) AUROC: 0.8540129433497166
Epoch: 3 Loss: 1.0930831429454222e-05 ( 2.2400683471184025e-08 x 1000.0 + -1.1469851879002208e-05 ) AUROC: 0.9366622097152417
Epoch: 4 Loss: 4.5092141302775475e-06 ( 1.598636033883947e-08 x 1000.0 + -1.1477146241107444e-05 ) AUROC: 0.9020896095912658
Epoch: 5 Loss: -1.0969787461150449e-06 ( 1.0425656676223337e-08 x 1000.0 + -1.1522635677278281e-05 ) AUROC: 0.9134202502342015
Epoch: 6 Loss: -4.078722485579712e-06 ( 7.424053286729316e-09 x 1000.0 + -1.1502775794006041e-05 ) AUROC: 0.9234617055998049
Epoch: 7 Loss: -6.232402684826096e-06 ( 5.251068984040392e-09 x 1000.0 + -1.1483472048564212e-05 ) AUROC: 0.967162644596896
Epoch: 8 Loss: -7.464647204219856e-06 ( 4.034737162271175e-09 x 1000.0 + -1.1499384811279792e-05 ) AUROC: 0.7

In [10]:
# Train the stand-alone autoencoder and materialise the features that the
# conventional OCSVM cells below consume. The arrays are extracted inside the
# session because `encode` / `encode_rff` take the live session as an argument.
with tf.Session() as sess:
    init_op = tf.global_variables_initializer()
    sess.run(init_op)

    # Train autoencoder for conventional methods
    t0 = time.time()
    ae_only.fit_ae(sess, x_train, epochs=10)
    ae_seconds = time.time() - t0
    print('AE time:', ae_seconds)

    # Output of `encode` for the train split, then the test split.
    x_train_encoded, x_test_encoded = (
        ae_only.encode(sess, split) for split in (x_train, x_test)
    )

    # Output of `encode_rff` for the train split, then the test split.
    x_train_rff, x_test_rff = (
        ae_only.encode_rff(sess, split) for split in (x_train, x_test)
    )

Autoencoder train
Epoch: 1 Loss: 8.525534683715074e-08
Epoch: 2 Loss: 3.2790212587109485e-08
Epoch: 3 Loss: 2.5360469258790546e-08
Epoch: 4 Loss: 2.0682815707613876e-08
Epoch: 5 Loss: 1.546947918478207e-08
Epoch: 6 Loss: 9.70241794617474e-09
Epoch: 7 Loss: 7.421525584745747e-09
Epoch: 8 Loss: 6.184349714055159e-09
Epoch: 9 Loss: 5.529441996439791e-09
Epoch: 10 Loss: 5.105569833690338e-09
AE time: 10.449751615524292


In [12]:
# Train conventional OCSVM on the bottleneck layer, i.e. on the encoded
# features produced by the stand-alone autoencoder (x_train_encoded /
# x_test_encoded).
# The label names the feature space so this run cannot be confused with the
# raw-input 'OCSVM-RBF' run, matching the style of the 'OCSVM-Linear on RFF'
# cell.
print('OCSVM-RBF on bottleneck')
libsvm = OneClassSVM(nu=0.1, verbose=True, shrinking=False)
t0 = time.time()
libsvm.fit(x_train_encoded)
print('Train time:', time.time() - t0)

# Prediction is timed separately from fitting.
t0 = time.time()
out_y = libsvm.predict(x_test_encoded)
print('Test time:', time.time() - t0)
pprint(metrics(y_test, out_y))

OCSVM-RBF
[LibSVM]Train time: 12.123375415802002
Test time: 2.5829243659973145
{'AUPRC': 0.9775111586871601,
 'AUROC': 0.8501191725340982,
 'Confusion matrix': array([[37780,  2067],
       [  763,  2315]], dtype=int64),
 'F1': 0.9638984564357699,
 'P@10': 0.9283467319119189,
 'Precision': 0.9802039280803259,
 'Recall': 0.9481265841845057}


In [14]:
# Train a linear-kernel OCSVM on the encoded (bottleneck) features produced by
# the stand-alone autoencoder.
# The label names the feature space so this run cannot be confused with the
# raw-input 'OCSVM-Linear' run, matching the style of the 'OCSVM-Linear on RFF'
# cell.
# NOTE(review): this is the only OCSVM cell using shrinking=True and nu=0.2;
# confirm that is intentional tuning rather than a leftover.
print('OCSVM-Linear on bottleneck')
libsvm = OneClassSVM(nu=0.2, verbose=True, shrinking=True, kernel='linear')
t0 = time.time()
libsvm.fit(x_train_encoded)
print('Train time:', time.time() - t0)

# Prediction is timed separately from fitting.
t0 = time.time()
out_y = libsvm.predict(x_test_encoded)
print('Test time:', time.time() - t0)
pprint(metrics(y_test, out_y))

OCSVM-Linear
[LibSVM]Train time: 9.664997339248657
Test time: 1.8128173351287842
{'AUPRC': 0.9279285291733779,
 'AUROC': 0.4972569094003536,
 'Confusion matrix': array([[31848,  7999],
       [ 2477,   601]], dtype=int64),
 'F1': 0.8587607183303672,
 'P@10': 0.928323430036118,
 'Precision': 0.927836853605244,
 'Recall': 0.7992571586317665}


In [15]:
# Linear-kernel OCSVM fitted on the `encode_rff` features (x_train_rff /
# x_test_rff) extracted from the stand-alone autoencoder model.
print('OCSVM-Linear on RFF')
libsvm = OneClassSVM(
    nu=0.1,
    verbose=True,
    shrinking=False,
    kernel='linear',
)

# Fit and report the wall-clock training time.
t0 = time.time()
libsvm.fit(x_train_rff)
train_seconds = time.time() - t0
print('Train time:', train_seconds)

# Predict on the held-out split and report the wall-clock inference time.
t0 = time.time()
out_y = libsvm.predict(x_test_rff)
test_seconds = time.time() - t0
print('Test time:', test_seconds)

# `metrics` is the project-local scoring helper; its result is pretty-printed.
report = metrics(y_test, out_y)
pprint(report)

OCSVM-Linear on RFF
[LibSVM]Train time: 71.98852777481079
Test time: 22.211646795272827
{'AUPRC': 0.9770882081832025,
 'AUROC': 0.8472843935069183,
 'Confusion matrix': array([[37813,  2034],
       [  783,  2295]], dtype=int64),
 'F1': 0.9640885738689239,
 'P@10': 0.9284399394151229,
 'Precision': 0.9797129236190278,
 'Recall': 0.9489547519261174}


In [42]:
# Robust Deep Autoencoder baseline, trained and evaluated inside its own
# session.
with tf.Session() as sess:
    # NOTE(review): the initializer runs before the RDA model is constructed;
    # if the constructor creates variables they are not covered by this call
    # -- confirm that the model initialises them itself.
    init_op = tf.global_variables_initializer()
    sess.run(init_op)

    # Robust Deep Autoencoder
    rae = RobustL21Autoencoder(
        sess=sess,
        lambda_=0.01,
        layers_sizes=autoencoder_layers,
        learning_rate=1e-2,
    )

    # `fit` returns two arrays, L and S (their shapes are logged by the model).
    t0 = time.time()
    L, S = rae.fit(
        x_train,
        sess=sess,
        inner_iteration=50,
        iteration=5,
        verbose=True,
        batch_size=batch_size,
    )
    train_seconds = time.time() - t0
    print('Train time:', train_seconds)

    t0 = time.time()
    L_test, S_test = rae.predict(x_test, sess=sess)
    test_seconds = time.time() - t0
    print('Test time:', test_seconds)

    # Row-wise L2 norm of S. `s_sum` (train split) is computed but not used
    # again in this cell.
    s_sum = np.linalg.norm(S, axis=1)
    s_sum_test = np.linalg.norm(S_test, axis=1)

    # A test row whose S norm is exactly zero is labelled 1; every other row
    # is labelled -1. `.tolist()` keeps the result a plain list of Python ints.
    out_y = np.where(s_sum_test == 0, 1, -1).tolist()
    report = metrics(y_test, out_y)
    pprint(report)

X shape:  (42924, 9)
L shape:  (42924, 9)
S shape:  (42924, 9)
Out iteration:  1
    iteration :  5 , cost :  8.4616746e-05
    iteration :  10 , cost :  6.342871e-05
    iteration :  15 , cost :  5.0481693e-05
    iteration :  20 , cost :  3.9625564e-05
    iteration :  25 , cost :  3.6279922e-05
    iteration :  30 , cost :  3.4029385e-05
    iteration :  35 , cost :  3.2872354e-05
    iteration :  40 , cost :  3.231164e-05
    iteration :  45 , cost :  3.1909298e-05
    iteration :  50 , cost :  3.1581512e-05
Out iteration:  2
    iteration :  5 , cost :  8.184967e-06
    iteration :  10 , cost :  8.026258e-06
    iteration :  15 , cost :  7.904077e-06
    iteration :  20 , cost :  7.717925e-06
    iteration :  25 , cost :  7.5563435e-06
    iteration :  30 , cost :  7.4317063e-06
    iteration :  35 , cost :  7.3354745e-06
    iteration :  40 , cost :  7.2591897e-06
    iteration :  45 , cost :  7.196308e-06
    iteration :  50 , cost :  7.142439e-06
Out iteration:  3
    iteration

In [39]:
    # DEC baseline (project-local `models.DEC`): pretrain, compile with the
    # 'kld' loss, then fit. The whole sequence is timed as training.
    dec = DEC(dims=autoencoder_layers, n_clusters=5)
    t0 = time.time()
    dec.pretrain(x=x_train, epochs=1)
    dec.compile(loss='kld')
    y_pred = dec.fit(x_train, update_interval=10, batch_size=batch_size)
    train_seconds = time.time() - t0
    print('Train time:', train_seconds)

    # Score the held-out split, timed separately.
    t0 = time.time()
    scores = dec.cluster_score(x_test)
    test_seconds = time.time() - t0
    print('Test time:', test_seconds)

    # `counter` holds the label counts of y_train; its -1 entry is used as the
    # rank k, so `threshold` is the k-th smallest test score (0-indexed).
    n_anomalies = int(counter[-1])
    threshold = np.partition(scores.flatten(), n_anomalies)[n_anomalies]

    # A sample is labelled 1 only when its score exceeds twice the threshold;
    # the cutoff is computed once instead of once per sample.
    cutoff = 2*threshold
    out_y = [1 if s > cutoff else -1 for s in scores]
    report = metrics(y_test, out_y)
    pprint(report)


...Pretraining...
Epoch 1/1
Pretraining time:  1.2793703079223633
Update interval 10
Save interval 13413.75
Initializing cluster centers with k-means.
delta_label  0.00034945485043332404 < tol  0.001
Reached tolerance threshold. Stopping training.
Train time: 6.26967978477478
Test time: 0.9044318199157715
{'AUPRC': 0.9856081054546645,
 'AUROC': 0.9010780073938761,
 'Confusion matrix': array([[33517,  6330],
       [  120,  2958]], dtype=int64),
 'F1': 0.9122257906483043,
 'P@10': 0.9283467319119189,
 'Precision': 0.9964324999256771,
 'Recall': 0.8411423695635807}
