In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import random as rn

import os

import tensorflow as tf

#DNN
from tensorflow.keras.layers import Lambda, Concatenate, Dense, BatchNormalization, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.callbacks import CSVLogger
from tensorflow.keras import regularizers
from tensorflow.keras.initializers import glorot_normal
from sklearn.model_selection import train_test_split

#sklearn
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler


import matplotlib.ticker as ticker
from numpy import load
import pickle
from tensorflow.compat.v1.keras import backend as K
import math

import time

print("libraries loaded")

libraries loaded


In [2]:
#Parameters

training_percentage = 10
batch_size = 128
model_name = "model_N_training_perc_"+str(training_percentage)+"perc_training_batch_size_"+str(batch_size)
epochs_number = 16



In [3]:
def refresh_riproducibility(seed):
   
    #set seed 

    #Python SEED
    os.environ['PYTHONHASHSEED'] = '0'
    
    #numpy seed
    np.random.seed(seed)
    
    #tf seed
    tf.random.set_seed(seed)
    
    #rn seed
    rn.seed(seed)
    
    tf.config.threading.set_inter_op_parallelism_threads(1)
    tf.config.threading.set_intra_op_parallelism_threads(1)
    
    #
    from tensorflow.compat.v1.keras import backend as K
    
    #sess
    sess = tf.compat.v1.get_default_session()
    K.set_session(sess)
    
    #tf seed
    tf.compat.v1.set_random_seed(seed)
    
    #os.environ['KERAS_BACKEND'] = "tensorflow"
    os.environ['TF_DETERMINISTIC_OPS'] = '1'
    os.environ['TF_CUDNN_DETERMINISTIC'] = '1'

In [4]:
def init_detector(params):
    
    #read params
    
    input_dim = params["input_dimension"]
    output_dim = input_dim
    optimizer = params["optimizer"]
    loss = params["loss"]
    inner_seed = params["seed"]
    add_noise = params["add_noise"]
    #reg_value = params["reg_value"]
    
    
    # this is our input placeholder
    input_layer = tf.keras.layers.Input(shape=(input_dim,))
    
    # encoding phase
    if add_noise:
        #add noise
        noise_layer = tf.keras.layers.GaussianNoise(0.1, seed=inner_seed)(input_layer)
        encoded_l0 = tf.keras.layers.Dense(1024, activation='relu')(noise_layer)
    else:
        encoded_l0 = tf.keras.layers.Dense(1024, activation='relu')(input_layer)
       
    encoded_l1 = tf.keras.layers.Dense(256, activation='relu')(encoded_l0)
    encoded_l2 = tf.keras.layers.Dense(64, activation='relu')(encoded_l1)
    
    latent_space = tf.keras.layers.Dense(32, activation='relu')(encoded_l2)
    
    # decoding phase 
    decoded_l2 = tf.keras.layers.Dense(64, activation="relu")(latent_space)
    
    l1_in = tf.keras.layers.Concatenate()([decoded_l2, encoded_l2])
    decoded_l1 = tf.keras.layers.Dense(256, activation="relu")(l1_in)
    
    l0_in = tf.keras.layers.Concatenate()([decoded_l1, encoded_l1])
    decoded_l0 = tf.keras.layers.Dense(1024, activation="relu")(l0_in)
    
    out_in = tf.keras.layers.Concatenate()([decoded_l0, encoded_l0])
    output_layer = tf.keras.layers.Dense(output_dim, activation="sigmoid")(out_in)

    # this model maps an input to its reconstruction
    model = tf.keras.models.Model(input_layer, output_layer)

    model.compile(optimizer=optimizer, loss=loss)
    print(model)

    return model

def build_detector(data, columns_names, params):

    view = data[columns_names]
    batch_size = params["batch_size"]
    num_epoch = params["num_epoch"]
    verbose = params["verbose_output"]
    
    model = init_detector(params) 

    history = model.fit(view, view, batch_size=batch_size, epochs=num_epoch, verbose=verbose)
    
    return model, history

In [5]:
#add more features

def add_port_type(d):
    
    d["port_well_known"] = (d["Destination_Port"] < 1024).astype(np.float64) 
    d["port_registered"] = ((d["Destination_Port"] > 1024) & (d["Destination_Port"] < 49152)).astype(np.float64)
    d["port_dynamic"] = (d["Destination_Port"] > 49152).astype(np.float64)
    
    return d
    
def add_more_feat(d, columns, add_to_list=False):
    
    range_values = [2, 4, 8, 16]
    
    list_copy = columns.copy()
    
    for c in columns:
        
        d["one_minus_"+c] = 1 - d[c].clip(0,1) 
        
        #power 
        for v in range_values:
            d["power_"+str(v)+"_"+c] = d[c]**v
        
        #root
        for v in range_values:
            d["root_"+str(v)+"_"+c] = d[c].clip(0,1)**(1/v)
        
        #sin
        d["sin_"+c] = np.sin(math.pi * d[c].clip(0,1))
        
        #log
        d["log_"+c] = np.log((d[c].clip(0,1)+1)/math.log(2))
            
        #exp
        d["exp_"+c] = np.exp(d[c]-1)
        
        if add_to_list:
            
            list_copy.append("one_minus_"+c)
            
            for v in range_values:
                list_copy.append("power_"+str(v)+"_"+c)
            for v in range_values:
                list_copy.append("root_"+str(v)+"_"+c)   

            list_copy.append("sin_"+c)
            list_copy.append("log_"+c)
            list_copy.append("exp_"+c)
                   
    return d, list_copy

In [6]:
# Read the training set
training = pd.read_csv("./your/training/path/training.csv")
print(training.shape)
#print(training.columns)
training.describe()

(529918, 80)


Unnamed: 0,Destination_Port,Flow_Duration,Total_Fwd_Packets,Total_Backward_Packets,Total_Length_of_Fwd_Packets,Total_Length_of_Bwd_Packets,Fwd_Packet_Length_Max,Fwd_Packet_Length_Min,Fwd_Packet_Length_Mean,Fwd_Packet_Length_Std,...,min_seg_size_forward,Active_Mean,Active_Std,Active_Max,Active_Min,Idle_Mean,Idle_Std,Idle_Max,Idle_Min,class
count,529918.0,529918.0,529918.0,529918.0,529918.0,529918.0,529918.0,529918.0,529918.0,529918.0,...,529918.0,529918.0,529918.0,529918.0,529918.0,529918.0,529918.0,529918.0,529918.0,529918.0
mean,10644.367112,10389270.0,10.390315,11.517105,532.4195,17898.41,190.897188,20.277279,50.744078,57.452269,...,-3614.576,68434.82,43219.3,145390.7,43803.69,3463918.0,202440.8,3620657.0,3274066.0,0.0
std,21390.213475,28751950.0,892.412791,1173.318788,6228.642,2675470.0,448.833754,36.275793,91.964713,146.518081,...,552632.8,587232.2,397145.5,1028606.0,499367.7,12970570.0,2170149.0,13406490.0,12732160.0,0.0
min,0.0,-1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-83885310.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,53.0,176.0,2.0,1.0,18.0,0.0,6.0,0.0,6.0,0.0,...,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,80.0,31303.0,2.0,2.0,68.0,144.0,40.0,6.0,38.0,0.0,...,32.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,443.0,355744.8,4.0,3.0,187.0,392.0,83.0,40.0,53.0,26.162951,...,32.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,65535.0,120000000.0,219759.0,291922.0,1323378.0,655453000.0,23360.0,2293.0,4638.923469,7125.596846,...,126.0,101659700.0,64349500.0,101659700.0,101659700.0,119999700.0,75145020.0,119999700.0,119999700.0,0.0


In [7]:
## SET PARAMETERS

# set random seed for reproducibility

seed = 23071982
np.random.seed(seed)

to_remove = ["Destination_Port","Flow_Bytes_s","Flow_Packets_s","port_type","class"]

DEBUG = 1
VERBOSE_OUTPUT = 1

ADD_FEATURES = 1
DROP_REDUNDANT_FEATURES = 1
ADD_NOISE = 1




print("Percentage of Training "+str(training_percentage)+"%")
if training_percentage < 100:
    X_train, X_test, y_train, y_test = train_test_split(training, training, train_size = training_percentage/100)
    training = X_train
    
print(training.shape)

#define feat
columns_of_interest = [x for x in training.columns if x not in to_remove] 
#initial feat set
original_feat = columns_of_interest.copy()


Percentage of Training 10%
(52991, 80)


In [8]:
#drop correlate feats

if DROP_REDUNDANT_FEATURES:
    cor_matrix = training.corr().abs()
    upper_tri = cor_matrix.where(np.triu(np.ones(cor_matrix.shape),k=1).astype(np.bool))
    #print(upper_tri)
    to_drop = [column for column in upper_tri.columns if any(upper_tri[column] > 0.8)]
    print()
    print("number to drop:", len(to_drop))
    print("to drop: ", to_drop)

    #update feats
    columns_of_interest = [x for x in columns_of_interest if x not in to_drop]
    original_feat = columns_of_interest.copy()

    print("Final Columns:", columns_of_interest)
    print("Final Columns:", len(columns_of_interest))



number to drop: 39
to drop:  ['Total_Backward_Packets', 'Total_Length_of_Bwd_Packets', 'Fwd_Packet_Length_Std', 'Bwd_Packet_Length_Mean', 'Bwd_Packet_Length_Std', 'Flow_IAT_Max', 'Fwd_IAT_Total', 'Fwd_IAT_Mean', 'Fwd_IAT_Max', 'Fwd_IAT_Min', 'Bwd_IAT_Total', 'Bwd_IAT_Mean', 'Bwd_IAT_Std', 'Bwd_IAT_Max', 'Bwd_IAT_Min', 'Bwd_Header_Length', 'Fwd_Packets_s', 'Max_Packet_Length', 'Packet_Length_Mean', 'Packet_Length_Std', 'Packet_Length_Variance', 'SYN_Flag_Count', 'PSH_Flag_Count', 'ECE_Flag_Count', 'Average_Packet_Size', 'Avg_Fwd_Segment_Size', 'Avg_Bwd_Segment_Size', 'Fwd_Header_Length.1', 'Subflow_Fwd_Packets', 'Subflow_Fwd_Bytes', 'Subflow_Bwd_Packets', 'Subflow_Bwd_Bytes', 'act_data_pkt_fwd', 'min_seg_size_forward', 'Active_Max', 'Active_Min', 'Idle_Mean', 'Idle_Max', 'Idle_Min']
Final Columns: ['Flow_Duration', 'Total_Fwd_Packets', 'Total_Length_of_Fwd_Packets', 'Fwd_Packet_Length_Max', 'Fwd_Packet_Length_Min', 'Fwd_Packet_Length_Mean', 'Bwd_Packet_Length_Max', 'Bwd_Packet_Length_M

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  upper_tri = cor_matrix.where(np.triu(np.ones(cor_matrix.shape),k=1).astype(np.bool))


In [9]:
#fit and apply scaler on training set
scaler_x =  MinMaxScaler(feature_range=(0, 1)) #MinMaxScaler(feature_range=(0, 1)) # StandardScaler()
scaler_x.fit(training[columns_of_interest])
training[columns_of_interest] = scaler_x.transform(training[columns_of_interest])
print(training[columns_of_interest].shape)
print("done")

(52991, 36)
done


In [10]:
#add further feat
if ADD_FEATURES:
    training, columns_of_interest = add_more_feat(training, columns_of_interest, True)

    print(len(columns_of_interest))
    print(training.shape)

    print("done")

  d["power_"+str(v)+"_"+c] = d[c]**v
  d["root_"+str(v)+"_"+c] = d[c].clip(0,1)**(1/v)
  d["sin_"+c] = np.sin(math.pi * d[c].clip(0,1))
  d["log_"+c] = np.log((d[c].clip(0,1)+1)/math.log(2))
  d["exp_"+c] = np.exp(d[c]-1)
  d["one_minus_"+c] = 1 - d[c].clip(0,1)


468
(52991, 512)
done


In [11]:
#add port feat
training = add_port_type(training)
columns_of_interest.append("port_well_known")
columns_of_interest.append("port_registered")
columns_of_interest.append("port_dynamic")

print(training[columns_of_interest].shape)
print(training[columns_of_interest].describe())

  d["port_well_known"] = (d["Destination_Port"] < 1024).astype(np.float64)
  d["port_registered"] = ((d["Destination_Port"] > 1024) & (d["Destination_Port"] < 49152)).astype(np.float64)
  d["port_dynamic"] = (d["Destination_Port"] > 49152).astype(np.float64)


(52991, 471)
       Flow_Duration  Total_Fwd_Packets  Total_Length_of_Fwd_Packets  \
count   52991.000000       52991.000000                 52991.000000   
mean        0.086803           0.000857                     0.002111   
std         0.240130           0.006556                     0.010370   
min         0.000000           0.000000                     0.000000   
25%         0.000001           0.000163                     0.000078   
50%         0.000261           0.000163                     0.000295   
75%         0.003083           0.000489                     0.000812   
max         1.000000           1.000000                     1.000000   

       Fwd_Packet_Length_Max  Fwd_Packet_Length_Min  Fwd_Packet_Length_Mean  \
count           52991.000000           52991.000000            52991.000000   
mean                0.008096               0.008865                0.013170   
std                 0.018382               0.016782                0.023767   
min                 0.

In [12]:
print("input dim: ", len(columns_of_interest))

#init params
print("batch size: "+str(batch_size))
reconstructor_params = {"input_dimension": len(columns_of_interest),
                        "batch_size" : batch_size,
                        "num_epoch" : epochs_number,
                        "verbose_output" : VERBOSE_OUTPUT,
                        "optimizer":'adam',
                        "seed": seed,
                        "add_noise": ADD_NOISE,
                        #"reg_value": 10e-7,
                        "loss":"mse"}


#refresh ripr.
refresh_riproducibility(seed)

# starts training phase

start_time = int(round(time.time() * 1000))

detector, history = build_detector(training, columns_of_interest, reconstructor_params)

end_time = int(round(time.time() * 1000))
total_time = end_time - start_time
#print(start_time)
#print(end_time)
print('Total Learning Time:'+str(total_time))

input dim:  471
batch size: 128



2022-08-02 17:08:45.947955: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2022-08-02 17:08:45.947990: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: f95c84461acc
2022-08-02 17:08:45.947997: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: f95c84461acc
2022-08-02 17:08:45.948099: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:200] libcuda reported version is: 470.103.1
2022-08-02 17:08:45.949288: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:204] kernel reported version is: 470.103.1
2022-08-02 17:08:45.949299: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:310] kernel version seems to match DSO: 470.103.1
2022-08-02 17:08:45.949882: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instr

<keras.engine.functional.Functional object at 0x7f3d5b277d00>


2022-08-02 17:08:46.406986: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 199670088 exceeds 10% of free system memory.
2022-08-02 17:08:47.191625: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 199670088 exceeds 10% of free system memory.
2022-08-02 17:08:47.890019: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 199670088 exceeds 10% of free system memory.
2022-08-02 17:08:47.936815: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 199670088 exceeds 10% of free system memory.


Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16
Total Learning Time:192862


In [None]:
type(history.history)
his = history.history['loss']
df_his = pd.DataFrame(data=his, columns=["Loss"])
df_his['epoch'] = range(1, len(df_his) + 1)
print(df_his)

In [15]:
#Save History
file_history = "./output/history/history_"+model_name
print(file_history)
np.save(file_history,history.history)

./output/history/history_model_N_training_perc_10perc_training_batch_size_128


In [19]:
#compute reconstruction
training_predictions = detector.predict(training[columns_of_interest], batch_size=4096)

2022-08-02 17:11:59.716674: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 199670088 exceeds 10% of free system memory.




In [20]:
# compute outlierness on training
outlierness_training = np.sum(np.power(np.absolute(training_predictions - training[columns_of_interest]), 1), axis=1)

In [21]:
#compute threshold
threshold = outlierness_training.quantile(0.95)
print(threshold)

1.9936452690116007


In [23]:
# Read the test set

testset = pd.read_csv("./your/path/to/testset.csv")
testset

Unnamed: 0,Destination_Port,Flow_Duration,Total_Fwd_Packets,Total_Backward_Packets,Total_Length_of_Fwd_Packets,Total_Length_of_Bwd_Packets,Fwd_Packet_Length_Max,Fwd_Packet_Length_Min,Fwd_Packet_Length_Mean,Fwd_Packet_Length_Std,...,Active_Mean,Active_Std,Active_Max,Active_Min,Idle_Mean,Idle_Std,Idle_Max,Idle_Min,port_type,class
0,54865,3,2,0,12,0,6,6,6.0,0.0,...,0.0,0.0,0,0,0.0,0.0,0,0,dynamic,0.0
1,55054,109,1,1,6,6,6,6,6.0,0.0,...,0.0,0.0,0,0,0.0,0.0,0,0,dynamic,0.0
2,55055,52,1,1,6,6,6,6,6.0,0.0,...,0.0,0.0,0,0,0.0,0.0,0,0,dynamic,0.0
3,46236,34,1,1,6,6,6,6,6.0,0.0,...,0.0,0.0,0,0,0.0,0.0,0,0,registered,0.0
4,54863,3,2,0,12,0,6,6,6.0,0.0,...,0.0,0.0,0,0,0.0,0.0,0,0,dynamic,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
225740,61374,61,1,1,6,6,6,6,6.0,0.0,...,0.0,0.0,0,0,0.0,0.0,0,0,dynamic,0.0
225741,61378,72,1,1,6,6,6,6,6.0,0.0,...,0.0,0.0,0,0,0.0,0.0,0,0,dynamic,0.0
225742,61375,75,1,1,6,6,6,6,6.0,0.0,...,0.0,0.0,0,0,0.0,0.0,0,0,dynamic,0.0
225743,61323,48,2,0,12,0,6,6,6.0,0.0,...,0.0,0.0,0,0,0.0,0.0,0,0,dynamic,0.0


In [None]:
#extract class
y_test = testset["class"]
print(y_test)

In [25]:
#compute scaling
testset[original_feat] = scaler_x.transform(testset[original_feat])
testset[original_feat]

Unnamed: 0,Flow_Duration,Total_Fwd_Packets,Total_Length_of_Fwd_Packets,Fwd_Packet_Length_Max,Fwd_Packet_Length_Min,Fwd_Packet_Length_Mean,Bwd_Packet_Length_Max,Bwd_Packet_Length_Min,Flow_IAT_Mean,Flow_IAT_Std,...,Fwd_Avg_Packets_Bulk,Fwd_Avg_Bulk_Rate,Bwd_Avg_Bytes_Bulk,Bwd_Avg_Packets_Bulk,Bwd_Avg_Bulk_Rate,Init_Win_bytes_forward,Init_Win_bytes_backward,Active_Mean,Active_Std,Idle_Std
0,3.333337e-08,0.000163,0.000052,0.000257,0.002617,0.001559,0.000000,0.00000,3.647615e-08,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000519,0.000000,0.0,0.0,0.0
1,9.166676e-07,0.000000,0.000026,0.000257,0.002617,0.001559,0.000587,0.00411,1.003094e-06,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000458,0.003922,0.0,0.0,0.0
2,4.416671e-07,0.000000,0.000026,0.000257,0.002617,0.001559,0.000587,0.00411,4.833089e-07,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000458,0.003922,0.0,0.0,0.0
3,2.916670e-07,0.000000,0.000026,0.000257,0.002617,0.001559,0.000587,0.00411,3.191663e-07,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000488,0.005035,0.0,0.0,0.0
4,3.333337e-08,0.000163,0.000052,0.000257,0.002617,0.001559,0.000000,0.00000,3.647615e-08,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000504,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
225740,5.166672e-07,0.000000,0.000026,0.000257,0.002617,0.001559,0.000587,0.00411,5.653802e-07,0.0,...,0.0,0.0,0.0,0.0,0.0,0.004410,0.003876,0.0,0.0,0.0
225741,6.083340e-07,0.000000,0.000026,0.000257,0.002617,0.001559,0.000587,0.00411,6.656896e-07,0.0,...,0.0,0.0,0.0,0.0,0.0,0.004410,0.003876,0.0,0.0,0.0
225742,6.333340e-07,0.000000,0.000026,0.000257,0.002617,0.001559,0.000587,0.00411,6.930468e-07,0.0,...,0.0,0.0,0.0,0.0,0.0,0.004410,0.003876,0.0,0.0,0.0
225743,4.083338e-07,0.000163,0.000052,0.000257,0.002617,0.001559,0.000000,0.00000,4.468328e-07,0.0,...,0.0,0.0,0.0,0.0,0.0,0.072021,0.000000,0.0,0.0,0.0


In [26]:
#add derived feat
if ADD_FEATURES:
    testset, _ = add_more_feat(testset, original_feat, False)
    print("done")

  d["power_"+str(v)+"_"+c] = d[c]**v
  d["root_"+str(v)+"_"+c] = d[c].clip(0,1)**(1/v)
  d["sin_"+c] = np.sin(math.pi * d[c].clip(0,1))
  d["log_"+c] = np.log((d[c].clip(0,1)+1)/math.log(2))
  d["exp_"+c] = np.exp(d[c]-1)
  d["one_minus_"+c] = 1 - d[c].clip(0,1)


done


In [27]:
#add port type
testset = add_port_type(testset)
print("done")

done


  d["port_well_known"] = (d["Destination_Port"] < 1024).astype(np.float64)
  d["port_registered"] = ((d["Destination_Port"] > 1024) & (d["Destination_Port"] < 49152)).astype(np.float64)
  d["port_dynamic"] = (d["Destination_Port"] > 49152).astype(np.float64)


In [28]:
testset.shape

(225745, 515)

In [29]:
#compute reconstructions
start_time = int(round(time.time() * 1000))

predictions = detector.predict(testset[columns_of_interest], batch_size=4096)

end_time = int(round(time.time() * 1000))
total_time = end_time - start_time
print('Total Prediction Time:'+str(total_time))
print('Single Prediction Time:'+str(total_time/testset.shape[0]))

Total Prediction Time:19581
Single Prediction Time:0.08673946266805466


In [30]:
#compute outlierness
outlierness = np.sum(np.power(np.absolute(predictions - testset[columns_of_interest]), 1), axis=1)


In [32]:
#compute prediction
y_pred = outlierness > threshold

#debug: num of yes
np.sum(y_pred)

136252

In [33]:
from sklearn.metrics import *

report_map = classification_report(y_test, y_pred, output_dict=True)
print(report_map)
acc_score = accuracy_score(y_test, y_pred)

result = str(acc_score)+";"+str(report_map['1.0']['precision']) + ";" + str(report_map['1.0']['recall']) + ";" + str(report_map['1.0']['f1-score']) 
print("acc;prec;rec;f1")
print(result)

{'0.0': {'precision': 0.69652375046093, 'recall': 0.6378968050922041, 'f1-score': 0.665922408405489, 'support': 97718}, '1.0': {'precision': 0.740304729471861, 'recall': 0.7878650597139666, 'f1-score': 0.7633447984894752, 'support': 128027}, 'accuracy': 0.7229484595450619, 'macro avg': {'precision': 0.7184142399663955, 'recall': 0.7128809324030854, 'f1-score': 0.7146336034474821, 'support': 225745}, 'weighted avg': {'precision': 0.7213533032742036, 'recall': 0.7229484595450619, 'f1-score': 0.7211736712697052, 'support': 225745}}
acc;prec;rec;f1
0.7229484595450619;0.740304729471861;0.7878650597139666;0.7633447984894752


In [34]:
cm = confusion_matrix(y_test, y_pred)
print(cm)
far = cm[0][1]/(cm[0][1]+cm[1][1])
print("FAR: ", far)

[[ 62334  35384]
 [ 27159 100868]]
FAR:  0.259695270528139


In [35]:
auc_score = roc_auc_score(y_test, outlierness)
print("AUC: ", auc_score)

pr1, rec1, thr1 = precision_recall_curve(y_test, outlierness)
auc_score_pr = auc(rec1,pr1)
print("AUC-PR: ", auc_score_pr)

AUC:  0.7142094663296878
AUC-PR:  0.646366438527601


In [36]:
import imblearn as imb
from imblearn.metrics import geometric_mean_score
g_mean = str(round(geometric_mean_score(y_test, y_pred, average = 'binary'), 3))
print(("G-Mean: ", g_mean))

('G-Mean: ', '0.709')
