In [1]:
cd ..

C:\Users\Nghia\PycharmProjects\ECML\Refactor


In [2]:
from pprint import pprint

import numpy as np
import tensorflow as tf
import time
from collections import Counter

from sklearn.svm import OneClassSVM
from sklearn.ensemble import IsolationForest

from load_datasets import forestcover
from metrics import metrics
from models.AE1SVM import AEOneClassSVM
from models.DEC import DEC
from models.RDA import RobustL21Autoencoder

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


Extracting /tmp/data/train-images-idx3-ubyte.gz
Extracting /tmp/data/train-labels-idx1-ubyte.gz
Extracting /tmp/data/t10k-images-idx3-ubyte.gz
Extracting /tmp/data/t10k-labels-idx1-ubyte.gz


In [3]:
# Set TensorFlow's graph-level random seed.
tf.set_random_seed(2018)

# Load the forest-cover train/test split returned by the project loader.
x_train, y_train, x_test, y_test = forestcover(random_state=1)

# Tally the training labels; -1 is counted as the anomaly class and 1 as the
# normal class when computing the percentage printed below.
counter = Counter(y_train)
n_anomalous = counter[-1]
n_labelled = counter[1] + counter[-1]
print('Anomalies ratio:', 100*n_anomalous/n_labelled, '%')


Anomalies ratio: 0.959985456884557 %


In [4]:
# Baseline: one-class SVM fitted directly on the raw input features.
# No kernel argument is passed, so scikit-learn's default kernel is used.
print('OCSVM-RBF')
libsvm = OneClassSVM(shrinking=False, verbose=True, nu=0.15)

# Wall-clock time of the fit.
t0 = time.time()
libsvm.fit(x_train)
fit_seconds = time.time() - t0
print('Train time:', fit_seconds)

# Wall-clock time of predicting the test split.
t0 = time.time()
out_y = libsvm.predict(x_test)
predict_seconds = time.time() - t0
print('Test time:', predict_seconds)

# Score the predicted labels against the ground truth.
score_report = metrics(y_test, out_y)
pprint(score_report)

OCSVM-RBF
[LibSVM]Train time: 667.1862211227417
Test time: 206.36981987953186
{'AUPRC': 0.06440725636338068,
 'AUROC': 0.9295486795010272,
 'Confusion matrix': array([[  1374,      0],
       [ 19959, 121692]], dtype=int64),
 'F1': 0.12101994979521735,
 'Precision': 0.06440725636338068,
 'Recall': 1.0}


In [5]:
# Baseline: Isolation Forest fitted directly on the raw input features.
print('IsolationForest')
# random_state pins the randomised tree construction so that a fresh
# Restart & Run All reproduces the same forest and the same metrics.
# The value mirrors the TensorFlow seed set earlier in the notebook.
iforest = IsolationForest(contamination=0.12, verbose=1, random_state=2018)

# Wall-clock time of the fit.
t0 = time.time()
iforest.fit(x_train)
print('Train time:', time.time() - t0)

# Wall-clock time of predicting the test split.
t0 = time.time()
out_y = iforest.predict(x_test)
print('Test time:', time.time() - t0)

# Score the predicted labels against the ground truth.
pprint(metrics(y_test, out_y))

IsolationForest


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   16.4s finished


Train time: 26.88950824737549
Test time: 10.3134286403656
{'AUPRC': 0.07791594252340163,
 'AUROC': 0.9377957230451286,
 'Confusion matrix': array([[  1357,     17],
       [ 15870, 125781]], dtype=int64),
 'F1': 0.14590613407881298,
 'Precision': 0.0787716955941255,
 'Recall': 0.987627365356623}


In [6]:
# Layer widths: the input dimension first, followed by the hidden sizes that
# are handed to AEOneClassSVM as autoencoder_layers[1:].
autoencoder_layers = [54, 32, 16]
batch_size = 1024

# Hyper-parameters forwarded positionally to AEOneClassSVM below.
nu = 0.3
alpha = 1e3
sigma = 3.0
kernel_features = 200

# The placeholder width is taken from autoencoder_layers[0] instead of
# repeating the literal 54, so the input size is defined in one place only.
data_input = tf.placeholder(tf.float32, shape=[None, autoencoder_layers[0]])

# Model trained with the combined objective (full_op) plus a separate SVM optimizer.
ae1svm = AEOneClassSVM(data_input, batch_size, 'ae1svm', autoencoder_layers[1:], nu, alpha, sigma, kernel_features,
                       autoencoder_activation='sigmoid', seed=4,
                       full_op=tf.train.AdamOptimizer(1e-2),
                       svm_op=tf.train.AdamOptimizer(1e-4))

# Model used only for its autoencoder (ae_op).
# NOTE(review): ae_only is constructed again with identical arguments in a later
# cell before its first use, so this instance looks redundant -- confirm before removing,
# since dropping it may shift TensorFlow's op-level seeds for the later instance.
ae_only = AEOneClassSVM(data_input, batch_size, 'ae_only', autoencoder_layers[1:], nu, alpha, sigma, kernel_features,
                        autoencoder_activation='sigmoid', ae_op=tf.train.AdamOptimizer(1e-2))

In [7]:
with tf.Session() as sess:
    # Initialise every variable currently in the graph.
    sess.run(tf.global_variables_initializer())

    # Autoencoder + one-class SVM: time the fit call.
    t0 = time.time()
    ae1svm.fit(sess, x_train, epochs_1=7, epochs_2=0)
    fit_seconds = time.time() - t0
    print('Train time:', fit_seconds)

    # Time prediction on the held-out split.
    t0 = time.time()
    out_y = ae1svm.predict(sess, x_test)
    predict_seconds = time.time() - t0
    print('Test time:', predict_seconds)

    # Score the predicted labels against the ground truth.
    score_report = metrics(y_test, out_y)
    pprint(score_report)

Combined train
.......SVM train
Train time: 17.757224321365356
Test time: 0.7118942737579346
{'AUPRC': 0.1575109070898659,
 'AUROC': 0.9519615665280303,
 'Confusion matrix': array([[  1306,     68],
       [  6599, 135052]], dtype=int64),
 'F1': 0.2814958508459964,
 'Precision': 0.16521189120809615,
 'Recall': 0.950509461426492}


In [8]:
# Build the autoencoder-only model (trained through ae_op) whose encodings
# feed the conventional baselines in the following cells.
ae_only = AEOneClassSVM(
    data_input, batch_size, 'ae_only', autoencoder_layers[1:],
    nu, alpha, sigma, kernel_features,
    autoencoder_activation='sigmoid',
    ae_op=tf.train.AdamOptimizer(1e-2),
)

In [9]:
with tf.Session() as sess:
    # Initialise every variable currently in the graph.
    sess.run(tf.global_variables_initializer())

    # Train autoencoder for conventional methods
    t0 = time.time()
    ae_only.fit_ae(sess, x_train, epochs=20)
    ae_seconds = time.time() - t0
    print('AE time:', ae_seconds)

    # Encode train first, then test, through the autoencoder's encode() output.
    x_train_encoded, x_test_encoded = [
        ae_only.encode(sess, split) for split in (x_train, x_test)
    ]

    # Same order for the encode_rff() output.
    x_train_rff, x_test_rff = [
        ae_only.encode_rff(sess, split) for split in (x_train, x_test)
    ]

Autoencoder train
....................AE time: 13.412670612335205


In [10]:
# One-class SVM fitted on the autoencoder's encoded representation.
# No kernel argument is passed, so scikit-learn's default kernel is used.
print('OCSVM-RBF')
libsvm = OneClassSVM(shrinking=False, verbose=True, nu=0.15)

# Wall-clock time of the fit.
t0 = time.time()
libsvm.fit(x_train_encoded)
fit_seconds = time.time() - t0
print('Train time:', fit_seconds)

# Wall-clock time of predicting the encoded test split.
t0 = time.time()
out_y = libsvm.predict(x_test_encoded)
predict_seconds = time.time() - t0
print('Test time:', predict_seconds)

# Score the predicted labels against the ground truth.
score_report = metrics(y_test, out_y)
pprint(score_report)

OCSVM-RBF
[LibSVM]Train time: 377.64098834991455
Test time: 130.73426866531372
{'AUPRC': 0.02291982735476698,
 'AUROC': 0.6948486299080782,
 'Confusion matrix': array([[   735,    639],
       [ 20573, 121078]], dtype=int64),
 'F1': 0.0648090997266555,
 'Precision': 0.03449408672798949,
 'Recall': 0.5349344978165939}


In [11]:
# Linear-kernel one-class SVM fitted on the features returned by ae_only.encode_rff.
print('OCSVM-Linear on RFF')
libsvm = OneClassSVM(kernel='linear', tol=0.1, shrinking=False, verbose=True, nu=0.15)

# Wall-clock time of the fit.
t0 = time.time()
libsvm.fit(x_train_rff)
fit_seconds = time.time() - t0
print('Train time:', fit_seconds)

# Wall-clock time of predicting the test split.
t0 = time.time()
out_y = libsvm.predict(x_test_rff)
predict_seconds = time.time() - t0
print('Test time:', predict_seconds)

# Score the predicted labels against the ground truth.
score_report = metrics(y_test, out_y)
pprint(score_report)

OCSVM-Linear on RFF
[LibSVM]Train time: 8277.284163236618
Test time: 1586.538587808609
{'AUPRC': 0.02459405814671357,
 'AUROC': 0.7079519335901489,
 'Confusion matrix': array([[   769,    605],
       [ 20366, 121285]], dtype=int64),
 'F1': 0.06832822426584922,
 'Precision': 0.036385143127513606,
 'Recall': 0.5596797671033479}


In [15]:
# DEC baseline: pretrain, compile with the KLD loss, then fit on the training split.
dec = DEC(dims=autoencoder_layers, n_clusters=5)
t0 = time.time()
dec.pretrain(x=x_train, epochs=5)
dec.compile(loss='kld')
y_pred = dec.fit(x_train, update_interval=10, batch_size=batch_size)
print('Train time:', time.time() - t0)

# Time the scoring of the test split.
t0 = time.time()
scores = dec.cluster_score(x_test)
print('Test time:', time.time() - t0)

# Flatten once and reuse; assumes cluster_score yields one score per test row -- TODO confirm.
flat_scores = scores.flatten()

# NOTE(review): the partition index is the anomaly count of the *training* labels
# (counter was built from y_train) while the scores come from x_test, and the
# factor 3 below is an unexplained constant -- confirm both are intended.
n_train_anomalies = int(counter[-1])
threshold = np.partition(flat_scores, n_train_anomalies)[n_train_anomalies]

# Vectorised labelling: 1 where the score exceeds 3*threshold, otherwise -1.
out_y = np.where(flat_scores > 3*threshold, 1, -1)
pprint(metrics(y_test, out_y))

...Pretraining...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Pretraining time:  7.523931503295898
Update interval 10
Save interval 698.3544921875
Initializing cluster centers with k-means.
delta_label  0.0 < tol  0.001
Reached tolerance threshold. Stopping training.
Train time: 20.789042234420776
Test time: 3.1183419227600098
{'AUPRC': 0.030755040442172407,
 'AUROC': 0.8441889006435923,
 'Confusion matrix': array([[ 1357,    17],
       [42389, 99262]], dtype=int64),
 'F1': 0.06015070921985816,
 'Precision': 0.031019978969505785,
 'Recall': 0.987627365356623}


In [17]:
with tf.Session() as sess:
    # NOTE(review): the initializer runs before RobustL21Autoencoder is constructed, so it
    # cannot cover variables the model creates afterwards -- presumably the model
    # initialises its own variables; confirm against models/RDA.
    sess.run(tf.global_variables_initializer())
    # Robust Deep Autoencoder
    rae = RobustL21Autoencoder(sess=sess, lambda_=0.1, layers_sizes=autoencoder_layers, learning_rate=1e-2)
    t0 = time.time()
    L, S = rae.fit(x_train, sess=sess, inner_iteration=50, iteration=3, verbose=True, batch_size=batch_size)
    print('Train time:', time.time() - t0)

    # Time prediction on the test split; predict returns an (L, S) pair like fit.
    t0 = time.time()
    L_test, S_test = rae.predict(x_test, sess=sess)
    print('Test time:', time.time() - t0)

    # Row-wise Euclidean norms of the S matrices.
    # NOTE(review): s_sum is not read anywhere in the visible cells; kept in case a later cell uses it.
    s_sum = np.linalg.norm(S, axis=1)
    s_sum_test = np.linalg.norm(S_test, axis=1)

    # Vectorised labelling: 1 where a test row's S norm is exactly zero, otherwise -1.
    out_y = np.where(s_sum_test == 0, 1, -1)
    pprint(metrics(y_test, out_y))

X shape:  (143023, 54)
L shape:  (143023, 54)
S shape:  (143023, 54)
Out iteration:  1
    iteration :  5 , cost :  0.012954294
    iteration :  10 , cost :  0.005389897
    iteration :  15 , cost :  0.0034266175
    iteration :  20 , cost :  0.0025623078
    iteration :  25 , cost :  0.0018427641
    iteration :  30 , cost :  0.0013280164
    iteration :  35 , cost :  0.0010048331
    iteration :  40 , cost :  0.00075372943
    iteration :  45 , cost :  0.0006544672
    iteration :  50 , cost :  0.0005267294
Out iteration:  2
    iteration :  5 , cost :  0.0001509908
    iteration :  10 , cost :  0.00013877971
    iteration :  15 , cost :  0.00012848702
    iteration :  20 , cost :  0.00011636284
    iteration :  25 , cost :  0.00010890679
    iteration :  30 , cost :  0.000102827
    iteration :  35 , cost :  9.7523414e-05
    iteration :  40 , cost :  9.3104274e-05
    iteration :  45 , cost :  8.9624824e-05
    iteration :  50 , cost :  8.6396496e-05
Out iteration:  3
    iteration