In [1]:
cd ..

C:\Users\Nghia\PycharmProjects\ECML\Refactor


In [2]:
from pprint import pprint

import numpy as np
import tensorflow as tf
import time
from collections import Counter

from sklearn.svm import OneClassSVM
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor

from load_datasets import kddcup
from metrics import metrics
from models.AE1SVM import AEOneClassSVM
from models.DEC import DEC
from models.RDA import RobustL21Autoencoder

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


Extracting /tmp/data/train-images-idx3-ubyte.gz
Extracting /tmp/data/train-labels-idx1-ubyte.gz
Extracting /tmp/data/t10k-images-idx3-ubyte.gz
Extracting /tmp/data/t10k-labels-idx1-ubyte.gz


In [3]:
# Fix TensorFlow's graph-level random seed for the models built below.
tf.set_random_seed(2018)

# Train/test split produced by the project loader.
# percent10=True presumably selects the 10% KDD Cup subset -- confirm in load_datasets.
x_train, y_train, x_test, y_test = kddcup(random_state=3, percent10=True)

# Label histogram of the training split; the ratio below treats label -1 as
# the anomalous class and label 1 as the normal class.
counter = Counter(y_train)
n_anomalous, n_normal = counter[-1], counter[1]
print('Anomalies ratio:', 100*n_anomalous/(n_normal+n_anomalous), '%')

Anomalies ratio: 4.998242118832767 %


In [4]:
# Mini-batch size handed to the models built in the following cells.
batch_size = 128

# Autoencoder layer widths, input layer first. Later cells pass the whole
# list to DEC / RobustL21Autoencoder and only the tail ([1:]) to AEOneClassSVM.
autoencoder_layers = [
    118,  # input width
    80,
    40,
    20,
]

In [5]:
# Placeholder for input batches. Its width is read from the first entry of
# autoencoder_layers instead of repeating the literal, so the placeholder and
# the layer configuration cannot drift apart.
data_input = tf.placeholder(tf.float32, shape=[None, autoencoder_layers[0]], name='data_input')

# Joint autoencoder + one-class SVM model, trained with two Adam optimizers
# (full_op at 1e-3, svm_op at 1e-4).
# NOTE(review): the string 'test' and the positional values 0.3, 1e4, 3.0, 400
# are passed through unchanged; their meaning is defined by AEOneClassSVM in
# models/AE1SVM.py -- confirm there before tuning.
ae1svm = AEOneClassSVM(data_input, batch_size, 'test', autoencoder_layers[1:], 0.3, 1e4, 3.0, 400,
                       autoencoder_activation='sigmoid', seed=3,
                       full_op=tf.train.AdamOptimizer(1e-3),
                       svm_op=tf.train.AdamOptimizer(1e-4))

In [6]:
with tf.Session() as sess:
    # Initialise all global variables of the graph inside this fresh session.
    sess.run(tf.global_variables_initializer())

    # --- Autoencoder + one-class SVM: training ---
    # epochs_1 / epochs_2 presumably correspond to the "Combined train" and
    # "SVM train" phases the model logs -- confirm in models/AE1SVM.py.
    fit_started = time.time()
    ae1svm.fit(sess, x_train, epochs_1=10, epochs_2=0)
    fit_seconds = time.time() - fit_started
    print('Train time:', fit_seconds)

    # --- Prediction on the held-out split ---
    predict_started = time.time()
    out_y = ae1svm.predict(sess, x_test)
    predict_seconds = time.time() - predict_started
    print('Test time:', predict_seconds)

    # Score the predictions against the ground-truth test labels.
    report = metrics(y_test, out_y)
    pprint(report)

Combined train
..........SVM train
Train time: 38.57740807533264
Test time: 1.0728542804718018
{'AUPRC': 0.6432318122293834,
 'AUROC': 0.9736633788921956,
 'Confusion matrix': array([[ 2493,    67],
       [ 1289, 47350]], dtype=int64),
 'F1': 0.7861873226111638,
 'Precision': 0.6591750396615548,
 'Recall': 0.973828125}


In [7]:
# Baseline: scikit-learn's one-class SVM (default kernel) fitted directly on
# the raw, un-encoded training features.
print('OCSVM-RBF')
libsvm = OneClassSVM(nu=0.1, verbose=True, shrinking=False)

fit_started = time.time()
libsvm.fit(x_train)
fit_seconds = time.time() - fit_started
print('Train time:', fit_seconds)

predict_started = time.time()
out_y = libsvm.predict(x_test)
predict_seconds = time.time() - predict_started
print('Test time:', predict_seconds)

# Score the predictions against the ground-truth test labels.
report = metrics(y_test, out_y)
pprint(report)

OCSVM-RBF
[LibSVM]Train time: 150.31826615333557
Test time: 51.525691747665405
{'AUPRC': 0.3400387878100758,
 'AUROC': 0.8881571894788647,
 'Confusion matrix': array([[ 2163,   397],
       [ 3337, 45302]], dtype=int64),
 'F1': 0.5367245657568239,
 'Precision': 0.3932727272727273,
 'Recall': 0.844921875}


In [9]:
# Baseline: Isolation Forest fitted on the raw training features.
# random_state is pinned (same value as the other seeds in this notebook)
# because the forest is built from random sub-samples and random splits;
# without it every run of this cell reports different metrics.
print('IsolationForest')
iforest = IsolationForest(contamination=0.12, verbose=1, random_state=3)
t0 = time.time()
iforest.fit(x_train)
print('Train time:', time.time() - t0)

# Predict on the held-out split and score against the ground-truth labels.
t0 = time.time()
out_y = iforest.predict(x_test)
print('Test time:', time.time() - t0)
pprint(metrics(y_test, out_y))

IsolationForest


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   14.6s finished


Train time: 22.20285129547119
Test time: 7.216731071472168
{'AUPRC': 0.3889741096174527,
 'AUROC': 0.953812908462088,
 'Confusion matrix': array([[ 2529,    31],
       [ 3904, 44735]], dtype=int64),
 'F1': 0.5624374513510508,
 'Precision': 0.39312917767759986,
 'Recall': 0.987890625}


In [10]:
# Second AEOneClassSVM instance (under the string 'test_ae') that is only
# trained through fit_ae; it supplies encoded features to the scikit-learn
# baselines in the cells below.
ae_optimizer = tf.train.AdamOptimizer(1e-3)
ae_only = AEOneClassSVM(
    data_input, batch_size, 'test_ae', autoencoder_layers[1:], 0.3, 1e4, 3.0, 400,
    autoencoder_activation='sigmoid',
    ae_op=ae_optimizer,
)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    # Autoencoder training for the conventional methods, timed.
    ae_started = time.time()
    ae_only.fit_ae(sess, x_train, epochs=20)
    ae_seconds = time.time() - ae_started
    print('AE time:', ae_seconds)

    # Encoded representation of both splits (train first, then test).
    x_train_encoded, x_test_encoded = [
        ae_only.encode(sess, split) for split in (x_train, x_test)
    ]

    # encode_rff output for both splits (train first, then test);
    # presumably random Fourier features of the codes -- confirm in models/AE1SVM.py.
    x_train_rff, x_test_rff = [
        ae_only.encode_rff(sess, split) for split in (x_train, x_test)
    ]

Autoencoder train
....................AE time: 34.415762424468994


In [11]:
# Baseline: scikit-learn's one-class SVM (default kernel) fitted on the
# autoencoder's bottleneck representation instead of the raw features.
print('OCSVM-RBF')
libsvm = OneClassSVM(nu=0.13, verbose=True, shrinking=True)

fit_started = time.time()
libsvm.fit(x_train_encoded)
fit_seconds = time.time() - fit_started
print('Train time:', fit_seconds)

predict_started = time.time()
out_y = libsvm.predict(x_test_encoded)
predict_seconds = time.time() - predict_started
print('Test time:', predict_seconds)

# Score the predictions against the ground-truth test labels.
report = metrics(y_test, out_y)
pprint(report)

OCSVM-RBF
[LibSVM]Train time: 69.54750943183899
Test time: 14.63408374786377
{'AUPRC': 0.3753516777767069,
 'AUROC': 0.9510990449086638,
 'Confusion matrix': array([[ 2527,    33],
       [ 4130, 44509]], dtype=int64),
 'F1': 0.5483345991103397,
 'Precision': 0.37960042060988436,
 'Recall': 0.987109375}


In [13]:
# Baseline: linear-kernel one-class SVM fitted on the encode_rff features,
# with a loosened stopping tolerance (tol=0.1).
print('OCSVM-Linear on RFF')
libsvm = OneClassSVM(nu=0.13, verbose=True, shrinking=False, kernel='linear', tol=0.1)

fit_started = time.time()
libsvm.fit(x_train_rff)
fit_seconds = time.time() - fit_started
print('Train time:', fit_seconds)

predict_started = time.time()
out_y = libsvm.predict(x_test_rff)
predict_seconds = time.time() - predict_started
print('Test time:', predict_seconds)

# Score the predictions against the ground-truth test labels.
report = metrics(y_test, out_y)
pprint(report)

OCSVM-Linear on RFF
[LibSVM]Train time: 1028.5832569599152
Test time: 352.8964011669159
{'AUPRC': 0.35879784026138095,
 'AUROC': 0.9399662123308969,
 'Confusion matrix': array([[ 2475,    85],
       [ 4225, 44414]], dtype=int64),
 'F1': 0.5345572354211663,
 'Precision': 0.3694029850746269,
 'Recall': 0.966796875}


In [19]:
# Baseline: DEC model with 5 clusters, built on the full layer list.
# Training = one pretraining epoch, compile with the 'kld' loss, then fit.
dec = DEC(dims=autoencoder_layers, n_clusters=5)
t0 = time.time()
dec.pretrain(x=x_train, epochs=1)
dec.compile(loss='kld')
y_pred = dec.fit(x_train, update_interval=10, batch_size=batch_size)
print('Train time:', time.time() - t0)

# Per-sample scores for the held-out split.
t0 = time.time()
scores = dec.cluster_score(x_test)
print('Test time:', time.time() - t0)

# Threshold = the k-th smallest test score, where k is the number of samples
# labelled -1 in the *training* split.
# NOTE(review): k is a training-set count used as an index into test-set
# scores; it only lines up if both splits have similar size and anomaly ratio,
# and np.partition raises if k >= number of test scores -- confirm intent.
flat_scores = scores.flatten()
n_train_anomalies = int(counter[-1])
threshold = np.partition(flat_scores, n_train_anomalies)[n_train_anomalies]

# Vectorised labelling: 1 where the score exceeds 1.5 * threshold, -1 otherwise
# (replaces a per-element Python loop; same labels, same 1-D result).
out_y = np.where(flat_scores > 1.5*threshold, 1, -1)
pprint(metrics(y_test, out_y))

...Pretraining...
Epoch 1/1
Pretraining time:  2.645000696182251
Update interval 10
Save interval 1999.921875
Initializing cluster centers with k-means.
delta_label  0.0001757881167233095 < tol  0.001
Reached tolerance threshold. Stopping training.
Train time: 10.97651219367981
Test time: 1.6263179779052734
{'AUPRC': 0.25429772572048365,
 'AUROC': 0.91844006754482,
 'Confusion matrix': array([[ 2527,    33],
       [ 7307, 41332]], dtype=int64),
 'F1': 0.4077779570760045,
 'Precision': 0.25696562944885093,
 'Recall': 0.987109375}


In [42]:
# NOTE(review): the output recorded for this cell reports 113-column inputs,
# while autoencoder_layers is [118, ...] at this point of the notebook and only
# becomes [113, ...] in a later cell -- the cell appears to have been executed
# out of order. Re-run the notebook top to bottom before trusting the numbers.
with tf.Session() as sess:
    # NOTE(review): this initializer runs before the model is constructed, so
    # it cannot cover variables the constructor creates; presumably the model
    # initialises its own variables -- confirm in models/RDA.py.
    sess.run(tf.global_variables_initializer())
    # Robust Deep Autoencoder
    rae = RobustL21Autoencoder(sess=sess, lambda_=0.4, layers_sizes=autoencoder_layers, learning_rate=1e-3)
    t0 = time.time()
    # fit returns two arrays; presumably the reconstructed part (L) and the
    # outlier part (S) of the training data -- confirm in models/RDA.py.
    L, S = rae.fit(x_train, sess=sess, inner_iteration=20, iteration=5, verbose=True, batch_size=batch_size)
    print('Train time:', time.time() - t0)

    t0 = time.time()
    L_test, S_test = rae.predict(x_test, sess=sess)
    print('Test time:', time.time() - t0)

    # Row-wise Euclidean norms of the S matrices. s_sum (training split) is
    # not used by the prediction below; it is kept for inspection.
    s_sum = np.linalg.norm(S, axis=1)
    s_sum_test = np.linalg.norm(S_test, axis=1)

    # Vectorised labelling: a test row whose S part is exactly zero is labelled
    # 1, every other row -1 (replaces a per-element Python loop).
    out_y = np.where(s_sum_test == 0, 1, -1)
    pprint(metrics(y_test, out_y))

X shape:  (511989, 113)
L shape:  (511989, 113)
S shape:  (511989, 113)
Out iteration:  1
    iteration :  5 , cost :  0.020521827
    iteration :  10 , cost :  0.020522472
    iteration :  15 , cost :  0.02052387
    iteration :  20 , cost :  0.0031069887
Out iteration:  2
    iteration :  5 , cost :  0.00046778625
    iteration :  10 , cost :  0.0003619341
    iteration :  15 , cost :  0.0003257283
    iteration :  20 , cost :  0.00029764223
Out iteration:  3
    iteration :  5 , cost :  0.00035094327
    iteration :  10 , cost :  0.00032804316
    iteration :  15 , cost :  0.00030382848
    iteration :  20 , cost :  0.00027941816
Out iteration:  4
    iteration :  5 , cost :  0.00025388974
    iteration :  10 , cost :  0.00022003918
    iteration :  15 , cost :  0.00019523372
    iteration :  20 , cost :  0.00017683947
Out iteration:  5
    iteration :  5 , cost :  0.00018501005
    iteration :  10 , cost :  0.00016405268
    iteration :  15 , cost :  0.00015141712
    iteration :  

In [28]:
# Reload the data with percent10=False (presumably the full KDD Cup set --
# confirm in load_datasets). This overwrites the splits loaded earlier, so
# every cell below operates on the new data.
full_split = kddcup(random_state=3, percent10=False)
x_train, y_train, x_test, y_test = full_split

In [29]:
# Configuration for the run on the reloaded data: layer widths (input layer
# first) and a larger mini-batch size. These overwrite the earlier values.
autoencoder_layers = [113, 80, 40, 20]
batch_size = 1024

# New placeholder under a distinct name. Its width is read from the first
# entry of autoencoder_layers instead of repeating the literal, so the two
# cannot drift apart.
data_input = tf.placeholder(tf.float32, shape=[None, autoencoder_layers[0]], name='data_input_2')

In [37]:
# Joint autoencoder + one-class SVM model for the reloaded data, built under
# the string 'testfull' with its own pair of Adam optimizers.
# NOTE(review): the positional values 0.25, 1e5, 3.0, 400 are passed through
# unchanged; their meaning is defined in models/AE1SVM.py -- confirm there.
full_optimizer = tf.train.AdamOptimizer(5e-3)
svm_optimizer = tf.train.AdamOptimizer(1e-4)
ae1svm = AEOneClassSVM(
    data_input, batch_size, 'testfull', autoencoder_layers[1:], 0.25, 1e5, 3.0, 400,
    autoencoder_activation='sigmoid',
    seed=1,
    full_op=full_optimizer,
    svm_op=svm_optimizer,
)

In [39]:
with tf.Session() as sess:
    # Initialise all global variables of the graph inside this fresh session.
    sess.run(tf.global_variables_initializer())

    # --- Autoencoder + one-class SVM: training (3 / 0 epochs) ---
    fit_started = time.time()
    ae1svm.fit(sess, x_train, epochs_1=3, epochs_2=0)
    fit_seconds = time.time() - fit_started
    print('Train time:', fit_seconds)

    # --- Prediction on the held-out split ---
    predict_started = time.time()
    out_y = ae1svm.predict(sess, x_test)
    predict_seconds = time.time() - predict_started
    print('Test time:', predict_seconds)

    # Score the predictions against the ground-truth test labels.
    report = metrics(y_test, out_y)
    pprint(report)

Combined train
...SVM train
Train time: 100.39382600784302
Test time: 5.885189771652222
{'AUPRC': 0.4516191970448556,
 'AUROC': 0.9610736531604718,
 'Confusion matrix': array([[ 25172,    428],
       [ 29735, 456656]], dtype=int64),
 'F1': 0.6253369272237197,
 'Precision': 0.45844792103010545,
 'Recall': 0.98328125}
