In [3]:
# Step up to the parent directory; the path echoed below is the resulting
# working directory. Presumably this is what lets the project-local imports in
# the next cell (load_datasets, metrics, models.*) resolve -- confirm.
# NOTE(review): not idempotent -- re-running this cell moves up one more level.
cd ..

C:\Users\Nghia\PycharmProjects\ECML\Refactor


In [4]:
from pprint import pprint

import numpy as np
import tensorflow as tf
import time
from collections import Counter

from sklearn.svm import OneClassSVM
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor

from load_datasets import forestcover
from metrics import metrics
from models.AE1SVM import AEOneClassSVM
from models.DEC import DEC
from models.RDA import RobustL21Autoencoder

Using TensorFlow backend.


Extracting /tmp/data/train-images-idx3-ubyte.gz
Extracting /tmp/data/train-labels-idx1-ubyte.gz
Extracting /tmp/data/t10k-images-idx3-ubyte.gz
Extracting /tmp/data/t10k-labels-idx1-ubyte.gz


In [5]:
# Pin the TensorFlow graph-level seed before any model is built.
tf.set_random_seed(2018)

# Train/test split as returned by the project's forestcover loader.
x_train, y_train, x_test, y_test = forestcover(random_state=1)

# Percentage of training labels equal to -1 among those equal to -1 or 1.
# `counter` is kept as a notebook global: a later cell reads counter[-1].
counter = Counter(y_train)
n_anomalous = counter[-1]
n_normal = counter[1]
print('Anomalies ratio:', 100 * n_anomalous / (n_normal + n_anomalous), '%')


Anomalies ratio: 0.959985456884557 %


In [5]:
# Baseline: conventional one-class SVMs fitted directly on the raw input.
# The first run uses the estimator's default kernel (reported as RBF), the
# second overrides it with a linear kernel. After the loop `libsvm` and `out_y`
# hold the linear model and its predictions.
for title, kernel_kwargs in (('OCSVM-RBF', {}), ('OCSVM-Linear', {'kernel': 'linear'})):
    print(title)
    libsvm = OneClassSVM(nu=0.15, verbose=True, shrinking=False, **kernel_kwargs)

    # Wall-clock time spent fitting on the training split.
    t0 = time.time()
    libsvm.fit(x_train)
    print('Train time:', time.time() - t0)

    # Wall-clock time spent predicting labels for the test split.
    t0 = time.time()
    out_y = libsvm.predict(x_test)
    print('Test time:', time.time() - t0)

    pprint(metrics(y_test, out_y))

OCSVM-RBF
[LibSVM]Train time: 650.0681240558624
Test time: 204.22370100021362
{'AUPRC': 0.9986463888919337,
 'AUROC': 0.9295486795010272,
 'Confusion matrix': array([[121692,  19959],
       [     0,   1374]], dtype=int64),
 'F1': 0.9242091113111037,
 'P@10': 0.9903926161591442,
 'Precision': 1.0,
 'Recall': 0.8590973590020543}
OCSVM-Linear
[LibSVM]Train time: 438.180704832077
Test time: 145.50284790992737
{'AUPRC': 0.998651814458778,
 'AUROC': 0.9298310636705707,
 'Confusion matrix': array([[121772,  19879],
       [     0,   1374]], dtype=int64),
 'F1': 0.9245358226123003,
 'P@10': 0.9903996084326819,
 'Precision': 1.0,
 'Recall': 0.8596621273411412}


In [47]:
# Train Isolation Forest
# The forest is built from random feature/split choices, so random_state is
# pinned to make the reported metrics repeatable across kernel restarts.
print('IsolationForest')
iforest = IsolationForest(contamination=0.12, random_state=2018, verbose=1)

# Wall-clock time spent fitting on the training split.
t0 = time.time()
iforest.fit(x_train)
print('Train time:', time.time() - t0)

# Wall-clock time spent predicting labels for the test split.
t0 = time.time()
out_y = iforest.predict(x_test)
print('Test time:', time.time() - t0)
pprint(metrics(y_test, out_y))

IsolationForest


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   17.5s finished


Train time: 27.721354484558105
Test time: 10.483874082565308
{'AUPRC': 0.9988391944940953,
 'AUROC': 0.9396290159475843,
 'Confusion matrix': array([[125888,  15763],
       [    13,   1361]], dtype=int64),
 'F1': 0.941035761272575,
 'P@10': 0.9904066007062197,
 'Precision': 0.9998967442673211,
 'Recall': 0.8887194583871628}


In [38]:
# Train Local outlier factor
# novelty=True is the supported way to label unseen data: it enables the public
# predict(), so this cell no longer depends on the private _predict() helper,
# whose behaviour is not part of the library's API.
# Requires scikit-learn >= 0.20 (the release that introduced `novelty`).
print('LOF')
lof = LocalOutlierFactor(contamination=0.15, novelty=True)

# Wall-clock time spent fitting on the training split.
t0 = time.time()
lof.fit(x_train)
print('Train time:', time.time() - t0)

# Wall-clock time spent predicting labels for the held-out test split.
t0 = time.time()
out_y = lof.predict(x_test)
print('Test time:', time.time() - t0)
pprint(metrics(y_test, out_y))

LOF
Train time: 593.2050950527191
Test time: 583.6405622959137
{'AUPRC': 0.991153879730001,
 'AUROC': 0.539934131117937,
 'Confusion matrix': array([[119562,  22089],
       [  1050,    324]], dtype=int64),
 'F1': 0.9117717710847507,
 'P@10': 0.9903926161591442,
 'Precision': 0.9912943985673067,
 'Recall': 0.8440604019738653}


In [7]:
# Layer sizes and batch size; both are reused by later cells.
autoencoder_layers = [54, 32, 16]
batch_size = 1024

# Settings handed positionally to AEOneClassSVM below.
nu, alpha, sigma = 0.3, 1e3, 3.0
kernel_features = 200

# Input placeholder; its width of 54 matches the first entry of autoencoder_layers.
data_input = tf.placeholder(tf.float32, shape=[None, 54])


def _make_ae1svm(model_name, **optimizers):
    """Build an AEOneClassSVM on `data_input` using the settings above.

    Only the third positional argument (`model_name`) and the optimizer
    keyword arguments differ between the two models created in this cell.
    """
    return AEOneClassSVM(data_input, batch_size, model_name, autoencoder_layers[1:],
                         nu, alpha, sigma, kernel_features,
                         autoencoder_activation='sigmoid',
                         **optimizers)


# Model given both a `full_op` and an `svm_op` optimizer.
ae1svm = _make_ae1svm('ae1svm',
                      full_op=tf.train.AdamOptimizer(1e-2),
                      svm_op=tf.train.AdamOptimizer(1e-4))

# Model given only an `ae_op` optimizer; later cells use it via fit_ae/encode/encode_rff.
ae_only = _make_ae1svm('ae_only', ae_op=tf.train.AdamOptimizer(1e-2))

In [27]:
# Autoencoder-OneclassSVM: train and evaluate inside a fresh session.
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    # x_train is passed twice and y_train once; the training log prints an AUROC
    # per epoch, so the labels are presumably used for monitoring only -- confirm.
    # epochs_2=0 presumably skips the second training stage -- confirm.
    fit_started = time.time()
    ae1svm.fit(sess, x_train, x_train, y_train, epochs_1=5, epochs_2=0)
    fit_seconds = time.time() - fit_started
    print('Train time:', fit_seconds)

    predict_started = time.time()
    out_y = ae1svm.predict(sess, x_test)
    predict_seconds = time.time() - predict_started
    print('Test time:', predict_seconds)

    pprint(metrics(y_test, out_y))

Combined train
Epoch: 1 Loss: 0.00014666299147880716 ( 9.464924214897677e-08 x 1000.0 + 5.201374781911648e-05 ) AUROC: 0.4400035298270385
Epoch: 2 Loss: 4.721038601890115e-05 ( 4.8324804854829927e-08 x 1000.0 + -1.1144178201038983e-06 ) AUROC: 0.38412989763501587
Epoch: 3 Loss: 3.182055450516482e-05 ( 3.4719712484064854e-08 x 1000.0 + -2.8991558691099076e-06 ) AUROC: 0.4556084349754699
Epoch: 4 Loss: 2.601387028620038e-05 ( 2.8948303435186403e-08 x 1000.0 + -2.934431768506056e-06 ) AUROC: 0.39005935148362003
Epoch: 5 Loss: 1.8599943151395517e-05 ( 2.1493998147040324e-08 x 1000.0 + -2.894053615164842e-06 ) AUROC: 0.9459266053064639
SVM train
Train time: 17.82639503479004
Test time: 0.7299408912658691
{'AUPRC': 0.9990065686193581,
 'AUROC': 0.9485999694988103,
 'Confusion matrix': array([[136677,   4974],
       [    93,   1281]], dtype=int64),
 'F1': 0.9818009417393084,
 'P@10': 0.9903926161591442,
 'Precision': 0.9993200263215617,
 'Recall': 0.9648855285172713}


In [7]:
# Train the autoencoder for the conventional methods and export its
# representations of both splits while the session is still open.
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    t0 = time.time()
    ae_only.fit_ae(sess, x_train, epochs=20)
    print('AE time:', time.time() - t0)

    # Outputs of encode() for the train split, then the test split.
    x_train_encoded, x_test_encoded = (
        ae_only.encode(sess, split) for split in (x_train, x_test)
    )

    # Outputs of encode_rff() for the train split, then the test split.
    x_train_rff, x_test_rff = (
        ae_only.encode_rff(sess, split) for split in (x_train, x_test)
    )

Autoencoder train
Epoch: 1 Loss: 1.1125966544705516e-07
Epoch: 2 Loss: 5.911390368346313e-08
Epoch: 3 Loss: 3.6418744710830276e-08
Epoch: 4 Loss: 2.5531345290079753e-08
Epoch: 5 Loss: 1.9358984952182237e-08
Epoch: 6 Loss: 1.6344433460584465e-08
Epoch: 7 Loss: 1.4588433644382216e-08
Epoch: 8 Loss: 1.3104460009665194e-08
Epoch: 9 Loss: 1.192884945886866e-08
Epoch: 10 Loss: 1.056525351417567e-08
Epoch: 11 Loss: 9.617295993207667e-09
Epoch: 12 Loss: 8.761474928293586e-09
Epoch: 13 Loss: 8.368218024354054e-09
Epoch: 14 Loss: 7.783030451528069e-09
Epoch: 15 Loss: 7.4201359671629306e-09
Epoch: 16 Loss: 7.190415473238131e-09
Epoch: 17 Loss: 6.942554202811951e-09
Epoch: 18 Loss: 6.426770341546737e-09
Epoch: 19 Loss: 6.25005913867288e-09
Epoch: 20 Loss: 6.119288778735013e-09
AE time: 18.138222455978394


In [8]:
# Train conventional OCSVM (default kernel, reported as RBF) on the
# autoencoder-encoded features produced by ae_only.encode().
print('OCSVM-RBF')
libsvm = OneClassSVM(nu=0.15, verbose=True, shrinking=False)

# Wall-clock fitting time on the encoded training split.
fit_started = time.time()
libsvm.fit(x_train_encoded)
fit_seconds = time.time() - fit_started
print('Train time:', fit_seconds)

# Wall-clock prediction time on the encoded test split.
predict_started = time.time()
out_y = libsvm.predict(x_test_encoded)
predict_seconds = time.time() - predict_started
print('Test time:', predict_seconds)

pprint(metrics(y_test, out_y))

OCSVM-RBF
[LibSVM]Train time: 272.17249059677124
Test time: 77.8259129524231
{'AUPRC': 0.9959400134759321,
 'AUROC': 0.7895934975064337,
 'Confusion matrix': array([[121115,  20536],
       [   379,    995]], dtype=int64),
 'F1': 0.9205191054361663,
 'P@10': 0.9903926161591442,
 'Precision': 0.9968805043870479,
 'Recall': 0.85502396735639}


In [9]:
# Linear-kernel one-class SVM on the autoencoder-encoded features.
print('OCSVM-Linear')
svm_params = dict(nu=0.15, verbose=True, shrinking=False, kernel='linear')
libsvm = OneClassSVM(**svm_params)

# Time the fit on the encoded training split.
t0 = time.time()
libsvm.fit(x_train_encoded)
train_seconds = time.time() - t0
print('Train time:', train_seconds)

# Time label prediction on the encoded test split.
t0 = time.time()
out_y = libsvm.predict(x_test_encoded)
test_seconds = time.time() - t0
print('Test time:', test_seconds)

scores_report = metrics(y_test, out_y)
pprint(scores_report)

OCSVM-Linear
[LibSVM]Train time: 165.3895058631897
Test time: 38.25267243385315
{'AUPRC': 0.9977500004218368,
 'AUROC': 0.8832912238730289,
 'Confusion matrix': array([[121577,  20074],
       [   126,   1248]], dtype=int64),
 'F1': 0.9232971589571453,
 'P@10': 0.9903926161591442,
 'Precision': 0.9989646927355941,
 'Recall': 0.8582855045146169}


In [10]:
# Linear-kernel one-class SVM on the features returned by ae_only.encode_rff().
# NOTE(review): unlike the other OCSVM cells this one sets tol=0.1; the reason is
# not stated in the notebook -- presumably to bound training time, confirm.
print('OCSVM-Linear on RFF')
libsvm = OneClassSVM(
    nu=0.15,
    verbose=True,
    shrinking=False,
    kernel='linear',
    tol=0.1,
)

# Time the fit on the training split.
started = time.time()
libsvm.fit(x_train_rff)
print('Train time:', time.time() - started)

# Time label prediction on the test split.
started = time.time()
out_y = libsvm.predict(x_test_rff)
print('Test time:', time.time() - started)

pprint(metrics(y_test, out_y))

OCSVM-Linear on RFF
[LibSVM]Train time: 3992.885948896408
Test time: 1428.4482157230377
{'AUPRC': 0.9962071651017566,
 'AUROC': 0.8034464448403371,
 'Confusion matrix': array([[121122,  20529],
       [   341,   1033]], dtype=int64),
 'F1': 0.9206807695523614,
 'P@10': 0.9903926161591442,
 'Precision': 0.9971925606974964,
 'Recall': 0.8550733845860601}


In [31]:
# Robust Deep Autoencoder: fit on the training split, then decompose the test
# split and flag every row whose S component is non-zero.
with tf.Session() as sess:
    # NOTE(review): this initializer runs before the model is constructed, so it
    # cannot cover variables the constructor creates -- presumably the model
    # initialises its own variables; confirm.
    sess.run(tf.global_variables_initializer())
    rae = RobustL21Autoencoder(sess=sess, lambda_=0.1, layers_sizes=autoencoder_layers, learning_rate=1e-2)

    fit_started = time.time()
    L, S = rae.fit(x_train, sess=sess, inner_iteration=50, iteration=3, verbose=True, batch_size=batch_size)
    print('Train time:', time.time() - fit_started)

    predict_started = time.time()
    L_test, S_test = rae.predict(x_test, sess=sess)
    print('Test time:', time.time() - predict_started)

    # Row-wise norms of the S matrices; s_sum (training split) is not used
    # further in this cell.
    s_sum = np.linalg.norm(S, axis=1)
    s_sum_test = np.linalg.norm(S_test, axis=1)

    # Label 1 only when the row norm is exactly zero, otherwise -1.
    out_y = [-1 if row_norm != 0 else 1 for row_norm in s_sum_test]
    pprint(metrics(y_test, out_y))

X shape:  (143023, 54)
L shape:  (143023, 54)
S shape:  (143023, 54)
Out iteration:  1
    iteration :  5 , cost :  0.012827324
    iteration :  10 , cost :  0.0057799467
    iteration :  15 , cost :  0.0033998762
    iteration :  20 , cost :  0.0024248757
    iteration :  25 , cost :  0.0018074346
    iteration :  30 , cost :  0.0012522194
    iteration :  35 , cost :  0.00090303144
    iteration :  40 , cost :  0.0007304976
    iteration :  45 , cost :  0.00053044077
    iteration :  50 , cost :  0.00046837973
Out iteration:  2
    iteration :  5 , cost :  0.00015460704
    iteration :  10 , cost :  0.00014110544
    iteration :  15 , cost :  0.00013211335
    iteration :  20 , cost :  0.00012469363
    iteration :  25 , cost :  0.000118431075
    iteration :  30 , cost :  0.00011327817
    iteration :  35 , cost :  0.00010917027
    iteration :  40 , cost :  0.00010583029
    iteration :  45 , cost :  0.00010290249
    iteration :  50 , cost :  9.987227e-05
Out iteration:  3
    ite

In [12]:
# DEC baseline: pretrain, compile with the 'kld' loss, fit, then score the
# test split and threshold the scores.
dec = DEC(dims=autoencoder_layers, n_clusters=5)

t0 = time.time()
dec.pretrain(x=x_train, epochs=1)
dec.compile(loss='kld')
y_pred = dec.fit(x_train, update_interval=10, batch_size=batch_size)
train_seconds = time.time() - t0
print('Train time:', train_seconds)

t0 = time.time()
scores = dec.cluster_score(x_test)
test_seconds = time.time() - t0
print('Test time:', test_seconds)

# threshold is the value at sorted position `rank` of the flattened test scores,
# where rank is three times the number of -1 labels counted in y_train.
# NOTE(review): the count comes from the training labels but indexes the test
# scores -- confirm this is intended.
rank = int(3*counter[-1])
threshold = np.partition(scores.flatten(), rank)[rank]

# A score must exceed twice the threshold to be labelled 1; everything else is -1.
cutoff = 2*threshold
out_y = [1 if s > cutoff else -1 for s in scores]
pprint(metrics(y_test, out_y))

...Pretraining...
Pretraining time:  0.491274356842041
Update interval 10
Save interval 698.3544921875
Initializing cluster centers with k-means.
delta_label  0.0009998391867042365 < tol  0.001
Reached tolerance threshold. Stopping training.
Train time: 293.00256514549255
Test time: 6.5794923305511475
{'AUPRC': 0.9984293909982355,
 'AUROC': 0.9184462110102142,
 'Confusion matrix': array([[124423,  17228],
       [    57,   1317]], dtype=int64),
 'F1': 0.9350507832608753,
 'P@10': 0.9903926161591442,
 'Precision': 0.9995420951156813,
 'Recall': 0.8783771381776337}
