In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.neighbors import KernelDensity
from matplotlib import pyplot as plt
import matplotlib.patches as mpatches

# Use the interactive widget backend for matplotlib figures in this notebook.
%matplotlib widget

# Print numpy arrays with two decimals and allow rows up to 150 characters wide.
np.set_printoptions(precision=2, linewidth=150)
# Default font size for all matplotlib text (individual plots override it below).
plt.rc('font', size=8)

def _merge_classes(cm, start, stop):
    """Collapse classes ``start`` .. ``stop - 1`` of a square confusion matrix into one.

    The selected rows are summed first, then the selected columns; the merged
    class ends up at position ``start`` and all other classes keep their order.
    """
    rows = np.vstack((cm[:start], cm[start:stop].sum(axis=0, keepdims=True), cm[stop:]))
    return np.hstack((rows[:, :start], rows[:, start:stop].sum(axis=1, keepdims=True), rows[:, stop:]))


def calculate_metrics(conf_mat, plot_num, keys, n_folds=5):
    """Compute accuracy/precision/recall/F1 (in percent) from per-fold confusion matrices.

    Parameters
    ----------
    conf_mat : indexable
        ``conf_mat[i][key][fold]`` must be a 15x15 confusion matrix whose
        class 0 is treated as the negative class (every other class counts as
        positive for the binary PRE/REC/F1S figures).
    plot_num : int
        Number of leading entries of ``conf_mat`` to evaluate.
    keys : sequence
        Dictionary keys to evaluate, in output order.
    n_folds : int, optional
        Number of folds stored per key (default 5).

    Returns
    -------
    tuple
        ``(mACC, mPRE, mREC, mF1S, mPRE_PC, mREC_PC, mF1S_PC, CM15, CM13)``.
        The first four have shape ``(plot_num, 4, len(keys), n_folds)``; axis 1
        selects the 15-, 13-, 11- and 9-class version of the matrix.  The
        per-class arrays have shape ``(plot_num, 13, len(keys), n_folds)`` and
        refer to the 13-class matrix.  ``CM15`` / ``CM13`` hold the raw and the
        13-class matrices.

    Notes
    -----
    A class without samples or without predictions yields NaN in the affected
    entries (0/0).  The NaNs are kept on purpose so callers can aggregate with
    ``np.nanmean`` / ``np.nan_to_num``; only the numpy warnings are silenced.
    """
    n_keys = len(keys)
    mACC, mPRE, mREC, mF1S = (np.zeros((plot_num, 4, n_keys, n_folds)) for _ in range(4))
    mPRE_PC, mREC_PC, mF1S_PC = (np.zeros((plot_num, 13, n_keys, n_folds)) for _ in range(3))
    CM15 = np.zeros((plot_num, 15, 15, n_keys, n_folds))
    CM13 = np.zeros((plot_num, 13, 13, n_keys, n_folds))

    # Divisions by zero are expected for empty classes; keep the NaN, drop the warning.
    with np.errstate(divide='ignore', invalid='ignore'):
        for i in range(plot_num):
            for k, key in enumerate(keys):
                for fold in range(n_folds):
                    cm15 = np.asarray(conf_mat[i][key][fold])
                    # merge web attacks (classes 12 and above)
                    cm13 = _merge_classes(cm15, 12, cm15.shape[0])
                    # merge DoS attacks (classes 2..6)
                    cm11 = _merge_classes(cm15, 2, 7)
                    # merge both web and DoS attacks
                    cm09 = _merge_classes(cm11, 8, cm11.shape[0])
                    CM15[i, :, :, k, fold] = cm15
                    CM13[i, :, :, k, fold] = cm13

                    for j, cm in enumerate((cm15, cm13, cm11, cm09)):
                        # Binary view: class 0 is negative, everything else positive.
                        FN = cm[1:, 0].sum()
                        FP = cm[0, 1:].sum()
                        TP = cm[1:, 1:].sum()
                        precision = TP / (TP + FP) * 100
                        recall = TP / (TP + FN) * 100

                        mACC[i, j, k, fold] = np.trace(cm) / cm.sum() * 100
                        mPRE[i, j, k, fold] = precision
                        mREC[i, j, k, fold] = recall
                        mF1S[i, j, k, fold] = 2 * precision * recall / (precision + recall)

                    # Per-class figures on the 13-class matrix: columns are
                    # summed for precision, rows for recall.
                    hits = np.diag(cm13)
                    precision_pc = hits / cm13.sum(axis=0) * 100
                    recall_pc = hits / cm13.sum(axis=1) * 100
                    mPRE_PC[i, :, k, fold] = precision_pc
                    mREC_PC[i, :, k, fold] = recall_pc
                    mF1S_PC[i, :, k, fold] = 2 * precision_pc * recall_pc / (precision_pc + recall_pc)
    print('ACC, PRE, REC, F1S and PRE_PC, REC_PC, F1S_PC are computed.')
    return mACC, mPRE, mREC, mF1S, mPRE_PC, mREC_PC, mF1S_PC, CM15, CM13

def print_tabular(*argv):
    """Print each argument as one LaTeX table row.

    Every argument is an iterable of numbers; values are formatted as
    ``{:5.2f}``, joined with `` & `` and each row is terminated by `` \\\\``.
    """
    row_end = ' \\\\\n'
    rows = []
    for line in argv:
        cells = ["{:5.2f}".format(value) for value in line]
        rows.append(" & ".join(cells))
    print(row_end.join(rows) + row_end)
    
def plot_accross_folds(metric, cm_num, ax, ylabel, plot_type, plot_num, xlabel=None, xticks=[], xticklabels=[], label=[]):
    """Plot one metric per model, summarising the spread or mean over folds.

    Parameters
    ----------
    metric : ndarray
        Indexed as ``metric[i, cm_num]``, which must be 2-D with one row per
        model/key and one column per fold.
    cm_num : int
        Index along the second axis of ``metric`` to plot.
    ax : matplotlib axes
        Axes to draw on.
    ylabel : str
        Y-axis label.
    plot_type : {'violin', 'meanline'}
        ``'violin'`` draws one violin per key at x = 1..n; ``'meanline'``
        draws a bar chart of the fold means at x = 0..n-1.  Any other value
        draws nothing.
    plot_num : int
        Number of leading entries of ``metric`` to draw.
    xlabel : str, optional
        X-axis label; left untouched when ``None``.
    xticks, xticklabels : sequence, optional
        Tick positions and labels (the defaults clear all ticks).
    label : sequence of str, optional
        Legend label for each of the ``plot_num`` series.

    Notes
    -----
    The list defaults are never mutated, so sharing them between calls is safe.
    """
    ax.set_ylabel(ylabel, fontsize=12)
    if xlabel is not None:
        ax.set_xlabel(xlabel, fontsize=12)
    ax.set_xticks(xticks)
    ax.set_xticklabels(xticklabels)
    ax.tick_params(axis='both', which='major', labelsize=10)
    ax.grid(True, axis='y')
    colors = plt.rcParams['axes.prop_cycle'].by_key()['color']
    for i in range(plot_num):
        values = metric[i, cm_num]
        series_label = label[i] if label else None
        if plot_type == 'violin':
            # Take the number of violins from the data itself rather than
            # from a notebook-level variable.
            positions = np.arange(1, values.shape[0] + 1)
            ax.violinplot(values.T, positions=positions, showmeans=True)
            # Single-point line that only exists to give the series a legend
            # entry; wrap the index so long series lists do not raise.
            ax.plot([1], [values[0, 0]], color=colors[i % len(colors)], label=series_label)
        elif plot_type == 'meanline':
            ax.bar(np.arange(values.shape[0]), np.mean(values, axis=1), label=series_label, alpha=0.8)
            
def calc_plot_kde(ax, metric, acc):
    """Fit a Gaussian KDE to ``metric`` and plot it against a reference value.

    Parameters
    ----------
    ax : matplotlib axes
        Axes to draw on.
    metric : 1-D ndarray
        Samples to model (the evaluation grid below spans 65..103, so values
        are presumably accuracies in percent).
    acc : float
        Reference value; must not exceed 103, the end of the evaluation grid.

    The density is evaluated on 10000 points in [65, 103], set to zero above
    100 and renormalised to unit area.  The area under the density from
    ``acc`` upwards is printed.
    """
    kde = KernelDensity(bandwidth=1.6, kernel='gaussian')
    kde.fit(metric[:, None])

    # np.trapz is deprecated in NumPy 2 in favour of np.trapezoid.
    trapezoid = getattr(np, 'trapezoid', None) or np.trapz

    x = np.linspace(65, 103, 10000)
    pdf_x = np.exp(kde.score_samples(x[:, None]))
    pdf_x[x > 100] = 0
    # The sample points are passed explicitly, so no ``dx`` is needed.
    pdf_x = pdf_x / trapezoid(pdf_x, x)

    ind = np.argwhere(x >= acc)[0, 0]
    print(trapezoid(pdf_x[ind:], x[ind:]))

    ax.hist(metric, bins=15, density=True, rwidth=0.9, alpha=0.5, label='normalized histogram')
    ax.plot(x, pdf_x, color='k', label='estimated density')
    # Rug plot of the individual samples just below the x-axis.
    ax.plot(metric, np.full_like(metric, -0.001), '|k', markeredgewidth=1)
    ax.plot([acc, acc], [0, pdf_x[ind]], '--', label='accuracy of top 9 features')
    ax.tick_params(axis='both', which='major', labelsize=10)
    ax.set_ylim(-0.005)
    # Query the axes that was drawn on, not whichever axes is current.
    handles, labels = ax.get_legend_handles_labels()
    # Reorder the legend entries; this assumes the histogram handle is
    # returned last, after the two labelled lines - TODO confirm.
    order = [2, 0, 1]
    ax.legend([handles[idx] for idx in order], [labels[idx] for idx in order], fontsize=12)

In [None]:
def _model_order(name):
    """Sort key: first character in descending code-point order, then last digit ascending."""
    return int(name[-1]) - 100 * ord(name[0])


# Confusion matrices of the run stored under Accuracies_complete (one entry).
conf_mat_path = '../results/Accuracies/Accuracies_complete/conf_matrix.npy'
conf_mat = np.load(conf_mat_path, allow_pickle=True)
keys = sorted(conf_mat[0].keys(), key=_model_order)
metrics_complete = calculate_metrics(conf_mat, 1, keys)

# Confusion matrices of the run stored under Accuracies_uniform (two entries),
# evaluated with the same key order.
conf_mat_path = '../results/Accuracies/Accuracies_uniform/conf_matrix.npy'
conf_mat = np.load(conf_mat_path, allow_pickle=True)
metrics_uniform = calculate_metrics(conf_mat, 2, keys)

# ACC, PRE, REC, F1S and PRE_PC, REC_PC, F1S_PC (plus CM15, CM13), stacked
# along the first axis: index 0 is the complete run, 1 and 2 the uniform run.
metrics = [np.concatenate(pair, axis=0) for pair in zip(metrics_complete, metrics_uniform)]

# From here on ``keys`` holds display labels rather than dictionary keys.
keys = ['{}-{}'.format(key[:-1].upper(), key[-1]) for key in keys]
xticks = np.arange(len(keys)) + 1
xticklabels = [key.rsplit('_', 1)[-1].lstrip("0") for key in keys]

fig, (ax1, ax2, ax3, ax4) = plt.subplots(4, 1, figsize=(12, 10))
plt.subplots_adjust(left=0.06, bottom=0.1, right=0.99, top=0.90)
# The three upper panels carry no ticks or legend labels.
for _ax, _metric, _ylabel in zip((ax1, ax2, ax3), metrics[:3], ('ACC-15', 'PRE', 'REC')):
    plot_accross_folds(_metric, 0, _ax, _ylabel, 'violin', 3)
# Only the bottom panel gets x ticks and the legend labels.
plot_accross_folds(
    metrics[3], 0, ax4, 'F1S', 'violin', 3,
    xticks=xticks,
    xticklabels=xticklabels,
    label=['Experiment 1u', 'Experiment 2u', 'Experiment 2nu'],
)
fig.legend(loc=1, prop={'size': 10})
fig.savefig('analyze_models.pdf')

In [None]:
from IPython.display import display
pd.options.display.float_format = '{:,.1f}'.format

# Row/column names for the 15-class tables; the trailing 'Average' entry is
# only used by the per-class tables.
classes15 = [
    'BENIGN', 'Bot', 'DDoS', 'DoS GoldenEye', 'DoS Hulk',
    'DoS Slowhttptest', 'DoS slowloris', 'FTP-Patator', 'Heartbleed',
    'Infiltration', 'PortScan', 'SSH-Patator', 'Web Attack - Brute Force',
    'Web Attack - Sql Injection', 'Web Attack - Xss', 'Average',
]

# 13-class variant: the first twelve names plus one merged web-attack class.
classes13 = classes15[:12] + ['Web Attack', 'Average']

# Everything below looks at entry 1 of the stacked ``metrics`` arrays; the
# confusion matrices are shown for key index 11 only.
_exp, _key = 1, 11

# Confusion matrices summed over the folds.
CM15 = metrics[7][_exp, :, :, _key, :].sum(axis=-1).astype(int)
CM13 = metrics[8][_exp, :, :, _key, :].sum(axis=-1).astype(int)

# Fold means per key: accuracy for the 15- and 13-class matrices, and the
# binary precision/recall/F1 of the 15-class matrix.
acc15 = metrics[0][_exp, 0].mean(axis=-1)
acc13 = metrics[0][_exp, 1].mean(axis=-1)
pre = metrics[1][_exp, 0].mean(axis=-1)
rec = metrics[2][_exp, 0].mean(axis=-1)
f1s = metrics[3][_exp, 0].mean(axis=-1)

# For 13-class: fold means per class, with an extra 'Average' row appended.
# Precision skips NaN classes, recall does not, F1 counts NaN as zero.
_pre_by_class = metrics[4][_exp].mean(axis=-1)
pre_pc = np.vstack((_pre_by_class, np.nanmean(_pre_by_class, axis=0)))
_rec_by_class = metrics[5][_exp].mean(axis=-1)
rec_pc = np.vstack((_rec_by_class, np.mean(_rec_by_class, axis=0)))
_f1s_by_class = metrics[6][_exp].mean(axis=-1)
f1s_pc = np.vstack((_f1s_by_class, np.mean(np.nan_to_num(_f1s_by_class), axis=0)))

display(pd.DataFrame(data=CM15, index=classes15[:-1], columns=classes15[:-1]))
display(pd.DataFrame(data=CM13, index=classes13[:-1], columns=classes13[:-1]))
display(pd.DataFrame(data=[acc15, acc13, pre, rec, f1s], columns=keys, index=['ACC-15', 'ACC-13', 'PRE', 'REC', 'F1S']).T)
for _table in (pre_pc, rec_pc, f1s_pc):
    display(pd.DataFrame(data=_table, index=classes13))

In [None]:
# Feature analysis 1 and 2: load both sets of confusion matrices and compute
# the metrics; keys are ordered by the integer after the first underscore.
conf_mat_path = '../results/Accuracies/Accuracies_feature_analysis_1/conf_matrix.npy'
conf_mat = np.load(conf_mat_path, allow_pickle=True)
keys = sorted(conf_mat[0].keys(), key=lambda x: int(x.split('_')[1]))
metrics1 = calculate_metrics(conf_mat, 1, keys)

conf_mat_path = '../results/Accuracies/Accuracies_feature_analysis_2/conf_matrix.npy'
conf_mat = np.load(conf_mat_path, allow_pickle=True)
keys = sorted(conf_mat[0].keys(), key=lambda x: int(x.split('_')[1]))
metrics2 = calculate_metrics(conf_mat, 1, keys)

# Three stacked panels: analysis 1, analysis 2, inference time.
fig, (ax1, ax2, ax3) = plt.subplots(3, 1,figsize=(12,10))
plt.subplots_adjust(left=0.06, bottom=0.1, right=0.99, top=0.90)
ax1.set_xlim(-1,77)
ax1.set_ylim(96.5,98.25)
ax1.set_xlabel('Features', fontsize=12)
# NOTE: metrics_uniform comes from an earlier cell, which must have been run.
# Baseline = fold mean of the 15-class accuracy at key index 11 of its first entry.
baseline = np.mean(metrics_uniform[0][0,0,11])
ax1.plot([-1,77],[baseline,baseline],'--k', label='mean ACC-15 with full feature set')
plot_accross_folds(metrics1[0], 0, ax1, 'ACC-15', 'meanline', 1, xticks=np.arange(9,77,10), xticklabels=[str(i).zfill(2) for i in np.arange(10,78,10)], label=['mean ACC-15 with 76 features'])
ax1.legend(loc=4, fontsize=12)

# NOTE(review): the x-limit ends at 75 here while the baseline line is drawn
# up to 77 - confirm which range is intended.
ax2.set_xlim(-1,75)
ax2.set_ylim(88,99)
ax2.set_xlabel('Number of Features', fontsize=12)
# Same baseline value as for the first panel (recomputed).
baseline = np.mean(metrics_uniform[0][0,0,11])
ax2.plot([-1,77],[baseline,baseline],'--k', label='mean ACC-15 with full feature set')
plot_accross_folds(metrics2[0], 0, ax2, 'ACC-15', 'meanline', 1, xticks=np.arange(7,77,10), xticklabels=[str(i).zfill(2) for i in np.arange(10,77,10)], label=['mean ACC-15 with increasing number of features'])
ax2.legend(loc=4, fontsize=12)

# Timing results of feature analysis 2; only the first entry of the file is used.
time_perf_path = '../results/Accuracies/Accuracies_feature_analysis_2/time_performance.npy'
time_perf = np.load(time_perf_path, allow_pickle=True)[0]
keys = sorted(time_perf.keys(), key=lambda x: int(x.split('_')[1]))

# Test-set indices of the five folds; only their lengths are needed.
test_indices = [np.load('../data/test_index1.npy'),
                np.load('../data/test_index2.npy'),
                np.load('../data/test_index3.npy'),
                np.load('../data/test_index4.npy'),
                np.load('../data/test_index5.npy')]

test_len = np.array([len(test_set) for test_set in test_indices])

# Mean over folds of (time / number of test samples) * 1e6, one value per key.
# presumably time_perf holds seconds per fold, giving microseconds per sample - TODO confirm
# NOTE(review): the array size is fixed at 75, so this assumes exactly 75 keys - confirm.
time = np.zeros((75,1))
for i in range(len(keys)):
    time[i] = np.mean(time_perf[keys[i]] / test_len * 1e+06)
# x runs from 3 to 77, one position per row of ``time``.
ax3.plot(np.arange(3,78), time, '-k', label='average inference time per sample')
ax3.legend(loc=4, fontsize=12)
ax3.set_ylabel('Microseconds', fontsize=12)
ax3.set_xlabel('Number of Features', fontsize=12)
ax3.tick_params(axis='both', which='major', labelsize=10)
ax3.set_xlim(2,78)
ax3.grid(True, axis='y')

fig.savefig('analyze_features_1_2.pdf')

# Fold means of ACC, PRE, REC and F1S (15-class matrix) and the timing value
# at key indices 0, 6 and 74 of feature analysis 2.
print('3 features:', [np.mean(metrics2[i][0,0,0]) for i in range(4)], time[0])
print('9 features:', [np.mean(metrics2[i][0,0,6]) for i in range(4)], time[6])
print('77 features:', [np.mean(metrics2[i][0,0,74]) for i in range(4)], time[74])

In [None]:
def _numeric_suffix(name):
    """Sort key: the integer that follows the first underscore of ``name``."""
    return int(name.split('_')[1])


# Feature analysis 3: load the confusion matrices and compute the metrics.
conf_mat_path = '../results/Accuracies/Accuracies_feature_analysis_3/conf_matrix.npy'
conf_mat = np.load(conf_mat_path, allow_pickle=True)
keys = sorted(conf_mat[0].keys(), key=_numeric_suffix)
metrics3 = calculate_metrics(conf_mat, 1, keys)

# Fold mean of the 15-class accuracy, one value per key; the same values feed
# both the density plot and the LaTeX row.
_mean_acc15 = metrics3[0][0, 0].mean(axis=1)

fig, ax1 = plt.subplots(figsize=(6, 3))
plt.subplots_adjust(left=0.065, bottom=0.1, right=0.99, top=0.90)
# 97.58 is the reference accuracy marked in the plot.
calc_plot_kde(ax1, _mean_acc15, 97.58)
print_tabular(_mean_acc15)
fig.savefig('analyze_features_3.pdf')