In [1]:
cd ..

C:\Users\Nghia\PycharmProjects\ECML\Refactor


In [2]:
from pprint import pprint

import numpy as np
import tensorflow as tf
import time
from collections import Counter

from sklearn.svm import OneClassSVM
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor

from load_datasets import kddcup
from metrics import metrics
from models.AE1SVM import AEOneClassSVM
from models.DEC import DEC
from models.RDA import RobustL21Autoencoder

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


Extracting /tmp/data/train-images-idx3-ubyte.gz
Extracting /tmp/data/train-labels-idx1-ubyte.gz
Extracting /tmp/data/t10k-images-idx3-ubyte.gz
Extracting /tmp/data/t10k-labels-idx1-ubyte.gz


In [3]:
# Seed both random sources used further down so repeated runs are comparable.
tf.set_random_seed(2018)  # TensorFlow graph-level seed
# NumPy's global RNG is what scikit-learn estimators fall back to when no
# random_state is passed; without this line their results vary between runs.
np.random.seed(2018)

# Load the train/test split with percent10=True and a fixed split seed.
x_train, y_train, x_test, y_test = kddcup(random_state=3, percent10=True)

# Share of training labels equal to -1 (reported as "anomalies"), in percent.
# `counter` is reused later to size the DEC score threshold.
counter = Counter(y_train)
print('Anomalies ratio:', 100*counter[-1]/(counter[1]+counter[-1]), '%')

Anomalies ratio: 4.998242118832767 %


In [12]:
# Baseline: one-class SVM fitted directly on the raw input features.
# No kernel argument is given, so scikit-learn's default kernel applies.
print('OCSVM-RBF')
libsvm = OneClassSVM(nu=0.05, shrinking=False, verbose=True)

# Fitting and prediction are timed separately with wall-clock timestamps.
fit_started = time.time()
libsvm.fit(x_train)
fit_seconds = time.time() - fit_started
print('Train time:', fit_seconds)

predict_started = time.time()
out_y = libsvm.predict(x_test)
predict_seconds = time.time() - predict_started
print('Test time:', predict_seconds)

# Summarise the test-set predictions against the ground-truth labels.
report = metrics(y_test, out_y)
pprint(report)

OCSVM-RBF
[LibSVM]Train time: 62.47549605369568
Test time: 18.70576310157776
{'AUPRC': 0.9897978499698971,
 'AUROC': 0.9016033221154834,
 'Confusion matrix': array([[47237,  1402],
       [  430,  2130]], dtype=int64),
 'F1': 0.9809773015180778,
 'P@10': 0.9501455390806619,
 'Precision': 0.9909790840623492,
 'Recall': 0.9711753942309669}


In [13]:
# Baseline: one-class SVM with a linear kernel, fitted on the raw input features.
# The printed label now names the linear kernel; it previously read 'OCSVM-RBF',
# which made this cell's log indistinguishable from the RBF baseline.
print('OCSVM-Linear')
libsvm = OneClassSVM(nu=0.05, verbose=True, shrinking=False, kernel='linear')

# Wall-clock time for fitting on the training split.
t0 = time.time()
libsvm.fit(x_train)
print('Train time:', time.time() - t0)

# Wall-clock time for labelling the test split.
t0 = time.time()
out_y = libsvm.predict(x_test)
print('Test time:', time.time() - t0)

# Compare the predicted labels with the ground truth.
pprint(metrics(y_test, out_y))

OCSVM-RBF
[LibSVM]Train time: 66.01519727706909
Test time: 18.03073811531067
{'AUPRC': 0.9583174383815292,
 'AUROC': 0.5867697435121507,
 'Confusion matrix': array([[46725,  1914],
       [ 2015,   545]], dtype=int64),
 'F1': 0.9596524918103492,
 'P@10': 0.950008790951181,
 'Precision': 0.9586581862946245,
 'Recall': 0.9606488620243014}


In [21]:
# Baseline: Isolation Forest fitted on the raw input features.
print('IsolationForest')
# contamination sets the fraction of samples the fitted threshold marks as
# outliers. NOTE(review): 0.12 is well above the ~5% ratio printed after
# loading the data -- presumably tuned by hand; confirm.
# random_state is fixed because the forest is built from random sub-samples and
# random splits; without it the reported metrics change from run to run.
iforest = IsolationForest(contamination=0.12, verbose=1, random_state=2018)

# Wall-clock time for fitting on the training split.
t0 = time.time()
iforest.fit(x_train)
print('Train time:', time.time() - t0)

# Wall-clock time for labelling the test split.
t0 = time.time()
out_y = iforest.predict(x_test)
print('Test time:', time.time() - t0)

# Compare the predicted labels with the ground truth.
pprint(metrics(y_test, out_y))

IsolationForest


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   12.2s finished


Train time: 18.21442174911499
Test time: 5.948817253112793
{'AUPRC': 0.9956962072271266,
 'AUROC': 0.9572772066590083,
 'Confusion matrix': array([[45072,  3567],
       [   31,  2529]], dtype=int64),
 'F1': 0.961618058074289,
 'P@10': 0.9500478618453183,
 'Precision': 0.9993126843003791,
 'Recall': 0.9266637883180164}


In [23]:
# Layer widths shared by the autoencoder-based models in the following cells:
# passed whole to DEC(dims=...) and RobustL21Autoencoder(layers_sizes=...), and
# sliced as [1:] for AEOneClassSVM. The first entry (118) equals the width of
# the `data_input` placeholder created for the 10% dataset.
autoencoder_layers = [118, 80, 40, 20]
# Mini-batch size handed to DEC.fit, RobustL21Autoencoder.fit and AEOneClassSVM.
batch_size = 128

In [28]:
# Deep Embedded Clustering baseline: build the model, pretrain for one epoch,
# compile with the 'kld' loss and fit; all of this is included in the train time.
dec = DEC(dims=autoencoder_layers, n_clusters=5)
t0 = time.time()
dec.pretrain(x=x_train, epochs=1)
dec.compile(loss='kld')
# The value returned by fit is stored but not used again in this cell.
y_pred = dec.fit(x_train, update_interval=10, batch_size=batch_size)
print('Train time:', time.time() - t0)

# Score the test split; only the scoring call is timed.
t0 = time.time()
scores = dec.cluster_score(x_test)
print('Test time:', time.time() - t0)
# np.partition puts the k-th smallest score at index k, where k is the number of
# -1 labels counted in y_train.
# NOTE(review): k comes from the training labels but indexes the test scores --
# presumably acceptable because both splits are similar in size; confirm.
threshold = np.partition(scores.flatten(), int(counter[-1]))[int(counter[-1])]
# Scores above twice the threshold are labelled 1, everything else -1.
# NOTE(review): the factor 2 is not explained anywhere in this notebook -- confirm.
out_y = [1 if s > 2*threshold else -1 for s in scores]
pprint(metrics(y_test, out_y))

...Pretraining...
Epoch 1/1
Pretraining time:  2.312150001525879
Update interval 10
Save interval 1999.921875
Initializing cluster centers with k-means.
delta_label  0.00011719207781553967 < tol  0.001
Reached tolerance threshold. Stopping training.
Train time: 10.319435834884644
Test time: 1.9491820335388184
{'AUPRC': 0.994932380796463,
 'AUROC': 0.9496290190870496,
 'Confusion matrix': array([[44309,  4330],
       [   30,  2530]], dtype=int64),
 'F1': 0.9531071866463035,
 'P@10': 0.950067397292387,
 'Precision': 0.999323394754054,
 'Recall': 0.910976788174099}


In [36]:
with tf.Session() as sess:
    # NOTE(review): the initializer runs before the model below is constructed,
    # so it cannot cover variables created afterwards -- presumably the model
    # initialises its own variables; confirm.
    sess.run(tf.global_variables_initializer())
    # Robust Deep Autoencoder
    rae = RobustL21Autoencoder(sess=sess, lambda_=1, layers_sizes=autoencoder_layers, learning_rate=1e-3)
    # Train time covers the full fit call (5 outer iterations of 20 inner ones).
    t0 = time.time()
    L, S = rae.fit(x_train, sess=sess, inner_iteration=20, iteration=5, verbose=True, batch_size=batch_size)
    print('Train time:', time.time() - t0)

    # Decompose the test split with the trained model; only this call is timed.
    t0 = time.time()
    L_test, S_test = rae.predict(x_test, sess=sess)
    print('Test time:', time.time() - t0)

    # Row-wise L2 norm of the S matrix for the training split; not used below.
    s_sum = np.linalg.norm(S, axis=1)
    # Row-wise L2 norm of the S matrix for the test split.
    s_sum_test = np.linalg.norm(S_test, axis=1)
    # Rows whose norm is exactly 0 are labelled 1, all other rows -1.
    # NOTE(review): exact float comparison -- presumably S rows are zeroed
    # exactly by the model's shrinkage step; confirm.
    out_y = [1 if s == 0 else -1 for s in s_sum_test]
    pprint(metrics(y_test, out_y))

X shape:  (51198, 118)
L shape:  (51198, 118)
S shape:  (51198, 118)
Out iteration:  1
    iteration :  5 , cost :  0.018701918
    iteration :  10 , cost :  0.018692752
    iteration :  15 , cost :  0.018691037
    iteration :  20 , cost :  0.011147634
Out iteration:  2
    iteration :  5 , cost :  0.0019389873
    iteration :  10 , cost :  0.0015561715
    iteration :  15 , cost :  0.0012655357
    iteration :  20 , cost :  0.0011443329
Out iteration:  3
    iteration :  5 , cost :  0.0012811279
    iteration :  10 , cost :  0.0011820417
    iteration :  15 , cost :  0.0011152904
    iteration :  20 , cost :  0.0010681031
Out iteration:  4
    iteration :  5 , cost :  0.0011503145
    iteration :  10 , cost :  0.0010669037
    iteration :  15 , cost :  0.0009611702
    iteration :  20 , cost :  0.0008440685
Out iteration:  5
    iteration :  5 , cost :  0.0008549378
    iteration :  10 , cost :  0.0008220407
    iteration :  15 , cost :  0.00078954484
    iteration :  20 , cost :  0.

In [40]:
# Input placeholder: float32 rows of width 118, any batch size.
data_input = tf.placeholder(tf.float32, shape=[None, 118], name='data_input')
# Build the combined autoencoder / one-class SVM model under the name 'test'.
# The hidden layer widths are autoencoder_layers without its first (input) entry.
# The 1e4 argument shows up as the multiplier ("x 10000.0") in the training log.
# NOTE(review): 0.3, 3.0 and 400 are passed positionally -- presumably nu, the
# kernel bandwidth and the number of random features; confirm against
# AEOneClassSVM's signature.
ae1svm = AEOneClassSVM(data_input, batch_size, 'test', autoencoder_layers[1:], 0.3, 1e4, 3.0, 400,
                       autoencoder_activation='sigmoid',
                       full_op=tf.train.AdamOptimizer(1e-3),
                       svm_op=tf.train.AdamOptimizer(1e-4))

In [42]:
with tf.Session() as sess:
    # Give every graph variable its initial value before training.
    init_op = tf.global_variables_initializer()
    sess.run(init_op)

    # Autoencoder-OneclassSVM: 10 epochs of the first phase, none of the second.
    fit_started = time.time()
    ae1svm.fit(sess, x_train, x_train, y_train, epochs_1=10, epochs_2=0)
    fit_seconds = time.time() - fit_started
    print('Train time:', fit_seconds)

    # Label the test split with the trained model.
    predict_started = time.time()
    out_y = ae1svm.predict(sess, x_test)
    predict_seconds = time.time() - predict_started
    print('Test time:', predict_seconds)

    # Summarise the predictions against the ground-truth labels.
    report = metrics(y_test, out_y)
    pprint(report)

Combined train
Epoch: 1 Loss: 0.005133832961843603 ( 2.0757060618930331e-07 x 10000.0 + 0.0030581268175835117 ) AUROC: 0.5938248382399152
Epoch: 2 Loss: 0.0023592526411897498 ( 9.218054321916911e-08 x 10000.0 + 0.0014374471929903267 ) AUROC: 0.47143266682762475
Epoch: 3 Loss: 0.00111183113751735 ( 4.779282124767853e-08 x 10000.0 + 0.0006339029481790136 ) AUROC: 0.47414711687780303
Epoch: 4 Loss: 0.000636158699570021 ( 3.826987147337059e-08 x 10000.0 + 0.000253459988983773 ) AUROC: 0.5256240758559357
Epoch: 5 Loss: 0.00040789565852277095 ( 3.208330369252605e-08 x 10000.0 + 8.706260628102137e-05 ) AUROC: 0.5983669987083585
Epoch: 6 Loss: 0.00031276110387546037 ( 2.9071924136254864e-08 x 10000.0 + 2.2041857710592103e-05 ) AUROC: 0.597761574151571
Epoch: 7 Loss: 0.0002683487863874821 ( 2.683467986637356e-08 x 10000.0 + 1.9971919561988733e-09 ) AUROC: 0.9386815567580732
Epoch: 8 Loss: 0.00012692300000830207 ( 1.3301545365742023e-08 x 10000.0 + -6.092454340361148e-06 ) AUROC: 0.9533265233465

In [43]:
# Second model instance (name 'test_ae') built on the same placeholder; only its
# autoencoder optimiser is supplied.
ae_only = AEOneClassSVM(
    data_input, batch_size, 'test_ae', autoencoder_layers[1:], 0.3, 1e4, 3.0, 400,
    autoencoder_activation='sigmoid',
    ae_op=tf.train.AdamOptimizer(1e-3),
)

with tf.Session() as sess:
    init_op = tf.global_variables_initializer()
    sess.run(init_op)

    # Train autoencoder for conventional methods
    ae_started = time.time()
    ae_only.fit_ae(sess, x_train, epochs=20)
    ae_seconds = time.time() - ae_started
    print('AE time:', ae_seconds)

    # Encoded views of both splits, consumed by the one-class SVM cells below.
    x_train_encoded = ae_only.encode(sess, x_train)
    x_test_encoded = ae_only.encode(sess, x_test)

    # Second representation of both splits, produced by encode_rff.
    x_train_rff = ae_only.encode_rff(sess, x_train)
    x_test_rff = ae_only.encode_rff(sess, x_test)

Autoencoder train
Epoch: 1 Loss: 2.3752314658983457e-07
Epoch: 2 Loss: 1.0828961513066572e-07
Epoch: 3 Loss: 6.899528518709718e-08
Epoch: 4 Loss: 4.367205356212979e-08
Epoch: 5 Loss: 3.686946095630472e-08
Epoch: 6 Loss: 3.163596261788208e-08
Epoch: 7 Loss: 2.9726613091325343e-08
Epoch: 8 Loss: 2.8324936053413792e-08
Epoch: 9 Loss: 2.6877216321419e-08
Epoch: 10 Loss: 2.545191423577547e-08
Epoch: 11 Loss: 1.6543619306538666e-08
Epoch: 12 Loss: 1.5866660503779354e-08
Epoch: 13 Loss: 1.5412354701396878e-08
Epoch: 14 Loss: 1.505210683086784e-08
Epoch: 15 Loss: 8.93159368299866e-09
Epoch: 16 Loss: 7.769486907747482e-09
Epoch: 17 Loss: 7.503624968411175e-09
Epoch: 18 Loss: 7.317159902072299e-09
Epoch: 19 Loss: 7.170258832294972e-09
Epoch: 20 Loss: 7.052394515119895e-09
AE time: 28.71333956718445


In [51]:
# One-class SVM fitted on the autoencoder's encoded features instead of the raw
# input. No kernel argument is given, so scikit-learn's default kernel applies.
print('OCSVM-RBF')
libsvm = OneClassSVM(nu=0.13, shrinking=True, verbose=True)

# Fitting and prediction are timed separately with wall-clock timestamps.
fit_started = time.time()
libsvm.fit(x_train_encoded)
fit_seconds = time.time() - fit_started
print('Train time:', fit_seconds)

predict_started = time.time()
out_y = libsvm.predict(x_test_encoded)
predict_seconds = time.time() - predict_started
print('Test time:', predict_seconds)

# Summarise the test-set predictions against the ground-truth labels.
report = metrics(y_test, out_y)
pprint(report)

OCSVM-RBF
[LibSVM]Train time: 56.95638394355774
Test time: 11.061755895614624
{'AUPRC': 0.9951433364355408,
 'AUROC': 0.9518083683168342,
 'Confusion matrix': array([[44654,  3985],
       [   37,  2523]], dtype=int64),
 'F1': 0.9569056037715633,
 'P@10': 0.950008790951181,
 'Precision': 0.9991720928151082,
 'Recall': 0.9180698616336684}


In [53]:
# One-class SVM with a linear kernel, fitted on the autoencoder's encoded features.
# The printed label now names the linear kernel; it previously read 'OCSVM-RBF',
# the same text the RBF cell prints, so the two logs could not be told apart.
print('OCSVM-Linear')
libsvm = OneClassSVM(nu=0.14, verbose=True, shrinking=True, kernel='linear')

# Wall-clock time for fitting on the encoded training split.
t0 = time.time()
libsvm.fit(x_train_encoded)
print('Train time:', time.time() - t0)

# Wall-clock time for labelling the encoded test split.
t0 = time.time()
out_y = libsvm.predict(x_test_encoded)
print('Test time:', time.time() - t0)

# Compare the predicted labels with the ground truth.
pprint(metrics(y_test, out_y))

OCSVM-RBF
[LibSVM]Train time: 29.205822706222534
Test time: 5.433445930480957
{'AUPRC': 0.9844829829510572,
 'AUROC': 0.8488579003281831,
 'Confusion matrix': array([[43607,  5032],
       [  509,  2051]], dtype=int64),
 'F1': 0.9402619804862271,
 'P@10': 0.9500283263982496,
 'Precision': 0.9884622359234745,
 'Recall': 0.8965439256563663}


In [54]:
# Linear one-class SVM fitted on the encode_rff representation, with a looser
# stopping tolerance (tol=0.1) than the other SVM cells.
print('OCSVM-Linear on RFF')
libsvm = OneClassSVM(kernel='linear', nu=0.13, tol=0.1, shrinking=False, verbose=True)

# Fitting and prediction are timed separately with wall-clock timestamps.
fit_started = time.time()
libsvm.fit(x_train_rff)
fit_seconds = time.time() - fit_started
print('Train time:', fit_seconds)

predict_started = time.time()
out_y = libsvm.predict(x_test_rff)
predict_seconds = time.time() - predict_started
print('Test time:', predict_seconds)

# Summarise the test-set predictions against the ground-truth labels.
report = metrics(y_test, out_y)
pprint(report)

OCSVM-Linear on RFF
[LibSVM]Train time: 920.3023386001587
Test time: 323.52916526794434
{'AUPRC': 0.9910012569558307,
 'AUROC': 0.9121187633637616,
 'Confusion matrix': array([[44422,  4217],
       [  228,  2332]], dtype=int64),
 'F1': 0.9523523673745029,
 'P@10': 0.9500478618453183,
 'Precision': 0.9948936170212765,
 'Recall': 0.9133000267275232}


In [55]:
# Reload the data with percent10=False (same split seed), rebinding the four
# arrays that the earlier cells used for the percent10=True data.
# NOTE(review): `counter` is not recomputed here and still describes the
# previously loaded labels.
x_train, y_train, x_test, y_test = kddcup(random_state=3, percent10=False)

In [59]:
# Settings for the percent10=False data; these rebind the names used by the
# earlier cells. The input width is 113 here (118 before).
# NOTE(review): presumably the two loads yield different feature counts; confirm.
autoencoder_layers = [113, 80, 40, 20]
# Larger mini-batches than the 128 used for the earlier experiments.
batch_size = 1024
# New float32 placeholder of width 113 under a distinct graph name.
data_input = tf.placeholder(tf.float32, shape=[None, 113], name='data_input_2')

In [78]:
# Build the combined autoencoder / one-class SVM model under the name 'testfull'.
# The hidden layer widths are autoencoder_layers without its first (input) entry.
# The 1e5 argument shows up as the multiplier ("x 100000.0") in the training log.
# NOTE(review): 0.25, 3.0 and 400 are passed positionally -- presumably nu, the
# kernel bandwidth and the number of random features; confirm against
# AEOneClassSVM's signature.
ae1svm = AEOneClassSVM(data_input, batch_size, 'testfull', autoencoder_layers[1:], 0.25, 1e5, 3.0, 400,
                       autoencoder_activation='sigmoid',
                       full_op=tf.train.AdamOptimizer(5e-3),
                       svm_op=tf.train.AdamOptimizer(1e-4))

In [81]:
with tf.Session() as sess:
    # Give every graph variable its initial value before training.
    init_op = tf.global_variables_initializer()
    sess.run(init_op)

    # Autoencoder-OneclassSVM: 5 epochs of the first phase, none of the second.
    fit_started = time.time()
    ae1svm.fit(sess, x_train, x_train, y_train, epochs_1=5, epochs_2=0)
    fit_seconds = time.time() - fit_started
    print('Train time:', fit_seconds)

    # Label the test split with the trained model.
    predict_started = time.time()
    out_y = ae1svm.predict(sess, x_test)
    predict_seconds = time.time() - predict_started
    print('Test time:', predict_seconds)

    # Summarise the predictions against the ground-truth labels.
    report = metrics(y_test, out_y)
    pprint(report)

Combined train
Epoch: 1 Loss: 0.0004017893567186624 ( 3.99424684781271e-09 x 100000.0 + 2.364681148951696e-06 ) AUROC: 0.9072939582793937
Epoch: 2 Loss: 0.00027372131719001166 ( 2.745042451598744e-09 x 100000.0 + -7.829387312352536e-07 ) AUROC: 0.9490486052016421
Epoch: 3 Loss: 0.00023601988392443185 ( 2.367943749345879e-09 x 100000.0 + -7.74488252508887e-07 ) AUROC: 0.6270106519548841
Epoch: 4 Loss: 0.00016080325610709533 ( 1.6159207660144933e-09 x 100000.0 + -7.888224880092238e-07 ) AUROC: 0.6307371892011053
Epoch: 5 Loss: 8.719775410032689e-05 ( 8.799845884620085e-10 x 100000.0 + -8.006993178929529e-07 ) AUROC: 0.9706068972420496
SVM train
Train time: 143.3942165374756
Test time: 4.308180809020996
{'AUPRC': 0.9970054674308171,
 'AUROC': 0.9701269396528461,
 'Confusion matrix': array([[458661,  27730],
       [    70,  25530]], dtype=int64),
 'F1': 0.9705858079697648,
 'P@10': 0.9500098636472838,
 'Precision': 0.999847405124136,
 'Recall': 0.9429882543056923}
