In [1]:
import os
import numpy as np
import matplotlib.pyplot as plt

#make plots inline using jupyter magic
%matplotlib inline

import pandas as pd
from pandas.plotting import scatter_matrix
from sklearn import datasets, linear_model, metrics


import matplotlib as mpl
import seaborn as sns

import sklearn.linear_model as skl_lm
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.preprocessing import StandardScaler, RobustScaler, PowerTransformer
from sklearn.model_selection import cross_validate, GridSearchCV
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, RidgeCV, Lasso, LassoCV
from sklearn.neighbors import KNeighborsClassifier
#Balanced RF Classifier
from imblearn.ensemble import BalancedRandomForestClassifier as BRF

from IPython.display import Markdown as md  #enable markdown within code cell
from IPython.display import display, Math, Latex

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import confusion_matrix
import time
import random
import scipy

from sklearn.metrics import balanced_accuracy_score, precision_score, recall_score, precision_recall_curve, make_scorer,f1_score
from sklearn.metrics import precision_recall_curve as PRC
from sklearn.decomposition import PCA
from scipy.stats import gaussian_kde
from numpy.fft import fftn

## Homemade code imports
import metrics

# Root directory of the dataset; the original netflow path below is built from it.
# NOTE: load_real_sample / load_fake_sample define a local variable that is also
# called data_dir, which hides this module-level value inside those functions.
data_dir = '/run/media/mnewlin/_userdata/uhnds/'
original_netflow_data_dir = data_dir + 'host/unconverted/'
original_netflow_file = 'netflow_day-02'
# Directories and file stem read by load_fake_sample / load_real_sample.
fake_dir = '/run/media/mnewlin/_userdata/uhnds/network/converted/fake/'
real_dir = '/run/media/mnewlin/_userdata/uhnds/network/converted/real/'
real_file = 'netflow_day-02'

# Host-data locations. The host loaders defined in the next cell hard-code the
# same locations instead of reading these two names.
# NOTE(review): fake_host_dir has no trailing slash, unlike the other directories.
real_host_dir = '/run/media/mnewlin/_userdata/uhnds/host/unconverted/real/tfidf/'
fake_host_dir = '/run/media/mnewlin/_userdata/uhnds/host/unconverted/fake'

# Dataset dependent number of columns per sample row; the load_n_* helpers
# reshape their result to (num_samples, sample_length, N_COLS).
N_COLS = 20
# Silence all NumPy floating-point warnings (divide, overflow, underflow, invalid).
# np.seterr returns the previous settings, which the notebook shows as this cell's output.
np.seterr(all='ignore')

{'divide': 'warn', 'over': 'warn', 'under': 'ignore', 'invalid': 'warn'}

In [2]:
def load_real_sample(sample_num, sample_length=100):
    """
    Read a single real sample of the requested length from disk.

    The file is '<real_dir>samples_<sample_length>/<real_file>_sample_<sample_num>.txt',
    parsed as space-separated float64 values.

    Parameters
    ----------
    sample_num : int
        Index of the sample file to read.
    sample_length : int
        Sample length; selects the 'samples_<sample_length>/' sub-directory.

    Returns
    -------
    numpy.ndarray or int
        The file contents as an array with one column per name listed below,
        or -1 when sample_num is outside the accepted range.
    """
    # Lengths below 10000 accept sample numbers 0..9999, every longer length 0..1999.
    upper_bound = 10000 if sample_length < 10000 else 2000
    if sample_num >= upper_bound:
        return -1

    column_names = [
        'Duration', 'SrcDevice', 'DstDevice', 'Protocol', 'SrcPort',
        'DstPort', 'SrcPackets', 'DstPackets', 'SrcBytes', 'DstBytes',
    ]
    sample_subdir = 'samples_{}/'.format(sample_length)
    sample_name = real_file + '_sample_{}.txt'.format(sample_num)
    frame = pd.read_csv(real_dir + sample_subdir + sample_name,
                        names=column_names, sep=' ', dtype=np.float64)
    return np.array(frame)

def load_real_host_sample(sample_num, sample_length=1000):
    """
    Read a single real host sample from its CSV file.

    Parameters
    ----------
    sample_num : int
        Index used in the file name 'tfidf_sample_<sample_num>.csv'.
    sample_length : int
        Selects the 'samples_<sample_length>/' sub-directory.

    Returns
    -------
    numpy.ndarray
        The CSV contents (read as float64) as an array.
    """
    directory = '/run/media/mnewlin/_userdata/uhnds/host/unconverted/real/tfidf/'
    sample_subdir = 'samples_{}/'.format(sample_length)
    sample_name = 'tfidf_sample_{}.csv'.format(sample_num)
    return np.array(pd.read_csv(directory + sample_subdir + sample_name, dtype=np.float64))

def load_fake_host_sample(sample_num, sample_length=1000,dist='uniform'):
    """
    Read a single fake host sample from its CSV file.

    Parameters
    ----------
    sample_num : int
        Index used in the file name 'tfidf_sample_<sample_num>.csv'.
    sample_length : int
        Selects the 'samples_<sample_length>/' sub-directory.
    dist : str
        Name of the sub-directory of the fake host data to read from
        (the callers in this notebook pass 'uniform' or 'normal').

    Returns
    -------
    numpy.ndarray
        The CSV contents (read as float64) as an array.
    """
    dist_root = '/run/media/mnewlin/_userdata/uhnds/host/unconverted/fake/{}/'.format(dist)
    sample_subdir = 'samples_{}/'.format(sample_length)
    sample_name = 'tfidf_sample_{}.csv'.format(sample_num)
    frame = pd.read_csv(dist_root + sample_subdir + sample_name, dtype=np.float64)
    return np.array(frame)
def load_fake_sample(sample_num, sample_length=100):
    """
    Read a single fake sample of the requested length from disk.

    The file is '<fake_dir>samples_<sample_length>/<real_file>_random_sample_<sample_num>.txt',
    parsed as space-separated float64 values.

    Parameters
    ----------
    sample_num : int
        Index of the sample file to read.
    sample_length : int
        Sample length; selects the 'samples_<sample_length>/' sub-directory.

    Returns
    -------
    numpy.ndarray or int
        The file contents as an array with one column per name listed below,
        or -1 when sample_num is outside the accepted range.
    """
    # Lengths below 10000 accept sample numbers 0..9999, every longer length 0..1999.
    upper_bound = 10000 if sample_length < 10000 else 2000
    if sample_num >= upper_bound:
        return -1

    column_names = [
        'Duration', 'SrcDevice', 'DstDevice', 'Protocol', 'SrcPort',
        'DstPort', 'SrcPackets', 'DstPackets', 'SrcBytes', 'DstBytes',
    ]
    sample_subdir = 'samples_{}/'.format(sample_length)
    sample_name = real_file + '_random_sample_{}.txt'.format(sample_num)
    frame = pd.read_csv(fake_dir + sample_subdir + sample_name,
                        names=column_names, sep=' ', dtype=np.float64)
    return np.array(frame)

def load_n_host_samples(real=True, sample_length=100, num_samples=100, random_state=69, dist='uniform'):
    """
    Load num_samples distinct host samples and stack them into one array.

    Sample numbers are drawn without replacement from range(10000) after
    seeding Python's global `random` module with random_state, so the same
    seed always selects the same files.

    Parameters
    ----------
    real : bool
        True reads real host samples, False reads fake ones.
    sample_length : int
        Length of each sample; also selects the directory that is read.
    num_samples : int
        Number of samples to load (at most 10000).
    random_state : int
        Seed passed to random.seed.
    dist : str
        Only used when real is False: 'uniform' reads the uniform fake data,
        any other value reads the 'normal' fake data.

    Returns
    -------
    numpy.ndarray
        Array of shape (num_samples, sample_length, N_COLS).

    Raises
    ------
    ValueError
        From random.sample when num_samples exceeds 10000 (or is negative),
        or from the reshape when the loaded data does not match the shape above.
    """
    sample_range = 10000
    random.seed(a=random_state)
    sample_list = random.sample(range(sample_range), num_samples)

    # Anything other than 'uniform' falls back to 'normal'; decide once, not per sample.
    fake_dist = 'uniform' if dist == 'uniform' else 'normal'

    # Collect flattened samples and join them once. np.append inside the loop
    # would copy the whole buffer on every iteration. The leading empty float
    # array keeps the result dtype identical to repeated np.append calls.
    chunks = [np.array([])]
    for num in sample_list:
        if real:
            data = load_real_host_sample(sample_length=sample_length, sample_num=num)
        else:
            data = load_fake_host_sample(sample_length=sample_length, sample_num=num, dist=fake_dist)
        chunks.append(np.ravel(data))
    sample_set = np.concatenate(chunks)

    return sample_set.reshape((num_samples, sample_length, N_COLS))

def load_n_samples(real=True, sample_length=100, num_samples=100, random_state=69):
    """
    Load num_samples distinct netflow samples and stack them into one array.

    The number of sample files to draw from depends on sample_length:
    10000 for lengths up to 1000, 2000 for lengths up to 10000, 1160 for
    lengths up to 100000, and none for anything longer. Sample numbers are
    drawn without replacement after seeding Python's global `random` module
    with random_state, so the same seed always selects the same files.

    Parameters
    ----------
    real : bool
        True reads samples with load_real_sample, False with load_fake_sample.
    sample_length : int
        Length of each sample.
    num_samples : int
        Number of samples to load.
    random_state : int
        Seed passed to random.seed.

    Returns
    -------
    numpy.ndarray
        Array of shape (num_samples, sample_length, N_COLS). N_COLS must match
        the number of columns of the loaded samples, otherwise the reshape fails.

    Raises
    ------
    ValueError
        When more samples are requested than are available for sample_length
        (including every sample_length above 100000), or when the loaded data
        does not fit the shape above.
    """
    if sample_length <= 1000:
        sample_range = 10000
    elif sample_length <= 10000:
        sample_range = 2000
    elif sample_length <= 100000:
        sample_range = 1160
    else:
        sample_range = 0

    # Seed random samples for repeatability
    random.seed(a=random_state)
    if num_samples > sample_range:
        # random.sample would raise ValueError here as well; say why instead.
        raise ValueError(
            'cannot draw {} distinct samples of length {}: only {} are available'.format(
                num_samples, sample_length, sample_range))
    sample_list = random.sample(range(sample_range), num_samples)

    loader = load_real_sample if real else load_fake_sample

    # Collect flattened samples and join them once. np.append inside the loop
    # would copy the whole buffer on every iteration. The leading empty float
    # array keeps the result dtype identical to repeated np.append calls.
    chunks = [np.array([])]
    for num in sample_list:
        chunks.append(np.ravel(loader(sample_length=sample_length, sample_num=num)))
    sample_set = np.concatenate(chunks)

    return sample_set.reshape((num_samples, sample_length, N_COLS))

def create_sample_mix(ratio, sample_length=100, num_samples=100, random_state=69):
    """
    Build a set of num_samples samples of which a fraction `ratio` is fake.

    The real samples come first, followed by the fake ones. Both groups are
    loaded with load_n_samples using the same random_state.

    Parameters
    ----------
    ratio : float
        Fraction of fake samples in the mix, between 0 and 1 inclusive.
    sample_length : int
        Length of each sample.
    num_samples : int
        Total number of samples in the mix.
    random_state : int
        Seed forwarded to load_n_samples.

    Returns
    -------
    numpy.ndarray
        Array with num_samples entries along the first axis; the remaining
        axes are those returned by load_n_samples.

    Raises
    ------
    ValueError
        When ratio is not within [0, 1].
    """
    if not 0 <= ratio <= 1:
        raise ValueError('ratio must be between 0 and 1, got {!r}'.format(ratio))

    # Round the fake count once and derive the real count from it, so the two
    # always add up to num_samples. Truncating both independently can lose a sample.
    n_fake = int(round(ratio * num_samples))
    n_real = num_samples - n_fake

    real_data = load_n_samples(real=True, num_samples=num_samples, sample_length=sample_length, random_state=random_state)
    # The fake part must be loaded with real=False and cut with the fake count.
    fake_data = load_n_samples(real=False, num_samples=num_samples, sample_length=sample_length, random_state=random_state)

    mix_set = np.concatenate((real_data[:n_real], fake_data[:n_fake]), axis=0)
    return mix_set

## Test Discriminative ability

Approach: Calculate $n$ different distances between real sets, $\rho(R_i,R_j)$, and $n$ different distances between real and fake sets, $\rho(R_i, F_j)$, for $i,j \leq n$, $i \neq j$.

Then compare the two distributions $P_1 = \rho(R_i,R_j)$ and $P_2 = \rho(R_i,F_j)$ using the Kullback-Leibler divergence (KLD): $D_{KL}(P_1 \| P_2) = \sum_x p_1(x)\log_a\frac{p_1(x)}{p_2(x)}$.


## Repeatability Standpoint

### Untransformed

In [3]:
# Read the metric tables computed on the untransformed data. The CSV index
# column ('Unnamed: 0') and the 'fid' column are dropped.
indir = '/home/mnewlin/git/AFIT/Thesis/code/results/untrans/'

real_data_untrans = pd.read_csv(indir + 'real_data_exp_host_normal.csv')
real_data_untrans = real_data_untrans.drop(columns=['Unnamed: 0', 'fid'])
display(real_data_untrans.head())

fake_data_untrans = pd.read_csv(indir + 'fake_data_exp_host_normal.csv')
fake_data_untrans = fake_data_untrans.drop(columns=['Unnamed: 0', 'fid'])
display(fake_data_untrans.head())

Unnamed: 0,Manhattan,Euclidean,lp: p=r=0.5,lp: p=r=0.75,cosine,mahalanobis,wasserstein,entropy,perplexity,mmd
0,590.992823,32.033137,292085.4,4472.751122,0.729486,5346.056656,0.017451,0.071082,4.821975,0.35513
1,425.041051,20.273839,250928.7,3463.470656,0.729571,5631.874992,0.02865,0.257182,11.790535,0.282391
2,578.254941,30.75762,303006.4,4470.019475,0.746244,5520.306762,0.030961,0.185519,11.625489,0.413442
3,487.300317,34.779531,172492.2,3201.036523,0.768937,5433.687875,0.038753,1.102998,24.94551,0.44199
4,123729.906082,9111.937604,26741300.0,726779.60064,0.775043,5603.10622,0.031604,0.079901,5.516134,0.251136


Unnamed: 0,Manhattan,Euclidean,lp: p=r=0.5,lp: p=r=0.75,cosine,mahalanobis,wasserstein,entropy,perplexity,mmd
0,933.871953,43.301961,696639.773082,8129.066754,0.66358,6313.424061,0.594111,0.174121,12.565097,0.28642
1,743.141787,30.970115,608390.225717,6768.097355,0.672757,6330.699377,0.51891,0.347985,18.673544,0.120268
2,958.846492,43.714454,723638.649082,8401.196219,0.669651,6348.819559,0.645422,0.236122,15.952048,0.392609
3,583.71424,23.008342,498493.330493,5427.425735,0.706333,6256.107961,0.542735,1.374534,29.906985,0.322394
4,798.26938,34.136602,643685.898493,7213.660501,0.658289,6343.951919,0.538382,0.165421,12.214347,0.24938


In [4]:
# Plot titles, one per metric, in the column order of the metric tables.
titles = ['Manhattan', 'Euclidean', r'$l_p$: $p=r=0.5$', r'$l_p$: $p=r=0.75$', 'Cosine',
          'Mahalanobis', 'Wasserstein', 'Entropy', 'Perplexity', 'MMD']
# x-axis labels. The label text also selects the transform applied before
# plotting: 'Natural' -> ln(x+1), '10' -> log10(x+1), anything else -> raw values.
labels = [r'$\log_{10}$ Metric values', r'$\log_{10}$ Metric values', r'$\log_{10}$ Metric values', r'$\log_{10}$ Metric values',
          'Metric values', 'Natural log metric values', 'Natural log Metric values', 'Natural log Metric values', 'Natural log Metric values', 'Metric values']
subplots = []
y_label = 'Count'
binsize = 100  # number of bin edges, so every histogram has binsize-1 bins
alpha_blue = 0.8
alpha_red = 0.6
n_repeats = 10
real_data = real_data_untrans
fake_data = fake_data_untrans
num_metrics = 10
sample_length = 1000
# Histogram counts per repeat, stored as (repeat, bin, metric).
real_dists = np.zeros((n_repeats,binsize-1,num_metrics))
fake_dists = np.zeros((n_repeats,binsize-1,num_metrics))

for i in range(n_repeats):

    fig = plt.figure(figsize=(15,20))

    grid_x = 5
    grid_y = 2
    grid = mpl.gridspec.GridSpec(grid_x, grid_y)

    # Rows i*sample_length .. (i+1)*sample_length-1 of both tables form repeat i.
    rows = slice(i*sample_length, (i+1)*sample_length)
    r = np.zeros((num_metrics, binsize-1))
    f = np.zeros((num_metrics, binsize-1))
    for count in range(grid_x*grid_y):
        # Fill the grid row by row: metric `count` goes to cell (count // grid_y, count % grid_y).
        ax1 = plt.subplot(grid[count // grid_y, count % grid_y])
        rr = real_data.iloc[rows, count]
        rf = fake_data.iloc[rows, count]
        if 'Natural' in labels[count]:
            rr, rf = np.log(rr+1), np.log(rf+1)
        elif '10' in labels[count]:
            rr, rf = np.log10(rr+1), np.log10(rf+1)
        # Both histograms share the same edges so their counts line up bin by bin.
        best_bins_min = np.minimum(rr.min(), rf.min())
        best_bins_max = np.maximum(rr.max(), rf.max())
        bins_best = np.linspace(best_bins_min, best_bins_max, binsize)
        # ax.hist returns (counts, edges, patches); only the counts are kept.
        r[count] = ax1.hist(rr,color='blue', bins=bins_best, alpha=alpha_blue, label='Real-Real')[0]
        f[count] = ax1.hist(rf,color='red', bins=bins_best, alpha=alpha_red, label='Real-Fake')[0]
        ax1.set_xlabel(labels[count], fontsize=16)
        ax1.set_ylabel(y_label, fontsize=16)
        ax1.set_title(titles[count], fontsize=16)
        ax1.legend(fontsize=16)

    # r and f are (metric, bin); the stored layout is (bin, metric).
    real_dists[i] = r.T
    fake_dists[i] = f.T

    plt.tight_layout()
    for figure_path in ('figures/discriminative/png/uhnds_host/untrans/normal/hist_mat_vert_1000_1000_{}.png',
                        'figures/discriminative/eps/uhnds_host/untrans/normal/hist_mat_vert_1000_1000_{}.eps',
                        'figures/discriminative/eps/uhnds_host/untrans/normal/hist_mat_vert_1000_1000_{}.pdf'):
        plt.savefig(figure_path.format(i))
    plt.close()
print("Finished")


Finished


In [5]:
# Score every metric by comparing its Real-Real histogram with its Real-Fake
# histogram via metrics.KL, once per run, then summarise the runs.
# NOTE: `metrics` is the local module imported in the first cell; that import
# comes after, and therefore shadows, `from sklearn import ... metrics`.

# Column order of real_dists / fake_dists: the same as the metric columns of the
# result tables (Manhattan first, then Euclidean).
hist_cols = ['Manhattan', 'Euclidean', 'lp: p=r=0.5', 'lp: p=r=0.75', 'cosine', 'mahalanobis', 'wasserstein', 'entropy', 'perplexity', 'mmd']
# Row order of the results table (Euclidean first), kept so the saved CSVs keep their layout.
row_cols = ['Euclidean', 'Manhattan', 'lp: p=r=0.5', 'lp: p=r=0.75', 'cosine', 'mahalanobis', 'wasserstein', 'entropy', 'perplexity', 'mmd']
cols = ['Run {}'.format(run + 1) for run in range(n_repeats)]

KLD_data_all = np.zeros((num_metrics, n_repeats))
for x in range(n_repeats):
    real_hist = pd.DataFrame(data=real_dists[x], columns=hist_cols)
    fake_hist = pd.DataFrame(data=fake_dists[x], columns=hist_cols)
    for row, col in enumerate(row_cols):
        # Normalise the bin counts so each histogram sums to 1 before comparing.
        p_real = real_hist.loc[:, col] / np.sum(real_hist.loc[:, col])
        p_fake = fake_hist.loc[:, col] / np.sum(fake_hist.loc[:, col])
        KLD_data_all[row, x] = metrics.KL(p_real, p_fake)

# Display names for the rows, in row_cols order.
names = ['Euclidean', 'Manhattan', r'$l_p$: $p=r=0.5$', r'$l_p$: $p=r=0.75$', 'Cosine', 'Mahalanobis', 'Wasserstein', 'Entropy', 'Perplexity', 'MMD']
KLD_data_untrans = pd.DataFrame(KLD_data_all, index=names, columns=cols)

# Summary statistics over the run columns only (taken before any summary column is added).
run_scores = KLD_data_untrans.loc[:, cols]
KLD_data_untrans['Min'] = run_scores.min(axis=1).round(4)
KLD_data_untrans['Max'] = run_scores.max(axis=1).round(4)
KLD_data_untrans['Range'] = (KLD_data_untrans['Max'] - KLD_data_untrans['Min']).round(4)
KLD_data_untrans['Mean'] = run_scores.mean(axis=1).round(4)

KLD_stats_untrans = KLD_data_untrans.sort_values(by=['Mean'], ascending=False)[['Min', 'Max', 'Range', 'Mean']]
display(KLD_stats_untrans)
outfile='/home/mnewlin/git/AFIT/Thesis/code/results/KLD_results_host_untrans_normal.csv'
outfile2='/home/mnewlin/git/AFIT/Thesis/code/results/KLD_results_host_untrans_normal_stats.csv'
KLD_data_untrans.to_csv(outfile)
KLD_stats_untrans.to_csv(outfile2)

Unnamed: 0,Min,Max,Range,Mean
Wasserstein,8.9004,9.0458,0.1454,8.9642
$l_p$: $p=r=0.5$,7.9451,8.8968,0.9517,8.2638
Mahalanobis,7.4409,9.0799,1.639,7.861
Cosine,5.895,7.8319,1.9369,6.9445
$l_p$: $p=r=0.75$,6.197,7.1439,0.9469,6.5236
Manhattan,3.9447,4.773,0.8283,4.3918
Entropy,2.7516,3.8152,1.0636,3.2131
Perplexity,2.0891,2.5394,0.4503,2.3593
Euclidean,1.3024,1.5547,0.2523,1.4276
MMD,0.0651,0.2394,0.1743,0.1356


## SQRT Data

In [6]:
# Read the metric tables from the 'sqrt' results directory. The CSV index
# column ('Unnamed: 0') and the 'fid' column are dropped.
indir = '/home/mnewlin/git/AFIT/Thesis/code/results/sqrt/'

real_data_sqrt = pd.read_csv(indir + 'real_data_exp_host_normal.csv')
real_data_sqrt = real_data_sqrt.drop(columns=['Unnamed: 0', 'fid'])
display(real_data_sqrt.head())

fake_data_sqrt = pd.read_csv(indir + 'fake_data_exp_host_normal.csv')
fake_data_sqrt = fake_data_sqrt.drop(columns=['Unnamed: 0', 'fid'])
display(fake_data_sqrt.head())

Unnamed: 0,Manhattan,Euclidean,lp: p=r=0.5,lp: p=r=0.75,cosine,mahalanobis,wasserstein,entropy,perplexity,mmd
0,541.32944,28.417349,273099.2,4163.699,0.720177,5349.563349,0.025063,0.062093,4.495712,0.247806
1,427.62689,20.535329,256810.7,3501.31,0.722647,5586.07491,0.041115,0.213811,9.570536,0.274134
2,481.324463,25.221703,262918.7,3783.803,0.74121,5432.409911,0.039645,0.2462,12.043945,0.396602
3,272.504883,15.977797,129248.1,2026.336,0.772368,5434.547624,0.057924,0.971037,23.683946,0.40288
4,179945.395546,13846.714972,37389760.0,1039538.0,0.763747,5561.662152,0.048313,0.071175,5.139938,0.247322


Unnamed: 0,Manhattan,Euclidean,lp: p=r=0.5,lp: p=r=0.75,cosine,mahalanobis,wasserstein,entropy,perplexity,mmd
0,875.469048,39.979419,675450.437689,7736.205008,0.659986,6309.950365,0.571833,0.148349,10.988474,0.23285
1,737.446179,30.489176,607132.016627,6738.06462,0.66936,6314.77563,0.50684,0.313064,16.868015,0.083723
2,867.411467,37.940228,685963.621854,7770.749152,0.6673,6324.63281,0.579058,0.182391,13.369845,0.332176
3,595.755318,23.652957,505297.612404,5522.06128,0.711902,6252.401209,0.552106,1.23415,28.025663,0.3021
4,849.989533,37.882767,668508.782513,7582.794148,0.654835,6334.83608,0.547902,0.172001,12.367642,0.247183


In [7]:

# Histograms for the 'sqrt' results. `titles` and `labels` come from the
# untransformed histogram cell, which must have been run first.
subplots = []
y_label = 'Count'
binsize = 100  # number of bin edges, so every histogram has binsize-1 bins
alpha_blue = 0.8
alpha_red = 0.6
n_repeats = 10
real_data = real_data_sqrt
fake_data = fake_data_sqrt
num_metrics = 10
sample_length = 1000
# Histogram counts per repeat, stored as (repeat, bin, metric).
real_dists = np.zeros((n_repeats,binsize-1,num_metrics))
fake_dists = np.zeros((n_repeats,binsize-1,num_metrics))

for i in range(n_repeats):

    fig = plt.figure(figsize=(15,20))

    grid_x = 5
    grid_y = 2
    grid = mpl.gridspec.GridSpec(grid_x, grid_y)

    # Rows i*sample_length .. (i+1)*sample_length-1 of both tables form repeat i.
    rows = slice(i*sample_length, (i+1)*sample_length)
    r = np.zeros((num_metrics, binsize-1))
    f = np.zeros((num_metrics, binsize-1))
    for count in range(grid_x*grid_y):
        # Fill the grid row by row: metric `count` goes to cell (count // grid_y, count % grid_y).
        ax1 = plt.subplot(grid[count // grid_y, count % grid_y])
        rr = real_data.iloc[rows, count]
        rf = fake_data.iloc[rows, count]
        # The label text selects the transform: ln(x+1), log10(x+1) or none.
        if 'Natural' in labels[count]:
            rr, rf = np.log(rr+1), np.log(rf+1)
        elif '10' in labels[count]:
            rr, rf = np.log10(rr+1), np.log10(rf+1)
        # Both histograms share the same edges so their counts line up bin by bin.
        best_bins_min = np.minimum(rr.min(), rf.min())
        best_bins_max = np.maximum(rr.max(), rf.max())
        bins_best = np.linspace(best_bins_min, best_bins_max, binsize)
        # ax.hist returns (counts, edges, patches); only the counts are kept.
        r[count] = ax1.hist(rr,color='blue', bins=bins_best, alpha=alpha_blue, label='Real-Real')[0]
        f[count] = ax1.hist(rf,color='red', bins=bins_best, alpha=alpha_red, label='Real-Fake')[0]
        ax1.set_xlabel(labels[count], fontsize=16)
        ax1.set_ylabel(y_label, fontsize=16)
        ax1.set_title(titles[count], fontsize=16)
        ax1.legend(fontsize=16)

    # r and f are (metric, bin); the stored layout is (bin, metric).
    real_dists[i] = r.T
    fake_dists[i] = f.T

    plt.tight_layout()
    for figure_path in ('figures/discriminative/png/uhnds_host/sqrt/normal/hist_mat_vert_1000_1000_{}.png',
                        'figures/discriminative/eps/uhnds_host/sqrt/normal/hist_mat_vert_1000_1000_{}.eps',
                        'figures/discriminative/eps/uhnds_host/sqrt/normal/hist_mat_vert_1000_1000_{}.pdf'):
        plt.savefig(figure_path.format(i))
    plt.close()
print("Finished")
    

Finished


In [14]:
# Score every metric by comparing its Real-Real histogram with its Real-Fake
# histogram via metrics.KL, once per run, then summarise the runs ('sqrt' results).
# NOTE: `metrics` is the local module imported in the first cell; that import
# comes after, and therefore shadows, `from sklearn import ... metrics`.

# Column order of real_dists / fake_dists: the same as the metric columns of the
# result tables (Manhattan first, then Euclidean).
hist_cols = ['Manhattan', 'Euclidean', 'lp: p=r=0.5', 'lp: p=r=0.75', 'cosine', 'mahalanobis', 'wasserstein', 'entropy', 'perplexity', 'mmd']
# Row order of the results table (Euclidean first), kept so the saved CSVs keep their layout.
row_cols = ['Euclidean', 'Manhattan', 'lp: p=r=0.5', 'lp: p=r=0.75', 'cosine', 'mahalanobis', 'wasserstein', 'entropy', 'perplexity', 'mmd']
cols = ['Run {}'.format(run + 1) for run in range(n_repeats)]

KLD_data_all = np.zeros((num_metrics, n_repeats))
for x in range(n_repeats):
    real_hist = pd.DataFrame(data=real_dists[x], columns=hist_cols)
    fake_hist = pd.DataFrame(data=fake_dists[x], columns=hist_cols)
    for row, col in enumerate(row_cols):
        # Normalise the bin counts so each histogram sums to 1 before comparing.
        p_real = real_hist.loc[:, col] / np.sum(real_hist.loc[:, col])
        p_fake = fake_hist.loc[:, col] / np.sum(fake_hist.loc[:, col])
        KLD_data_all[row, x] = metrics.KL(p_real, p_fake)

# Display names for the rows, in row_cols order.
names = ['Euclidean', 'Manhattan', r'$l_p$: $p=r=0.5$', r'$l_p$: $p=r=0.75$', 'Cosine', 'Mahalanobis', 'Wasserstein', 'Entropy', 'Perplexity', 'MMD']
KLD_data_sqrt = pd.DataFrame(KLD_data_all, index=names, columns=cols)

# Summary statistics over the run columns only (taken before any summary column is added).
run_scores = KLD_data_sqrt.loc[:, cols]
KLD_data_sqrt['Min'] = run_scores.min(axis=1).round(4)
KLD_data_sqrt['Max'] = run_scores.max(axis=1).round(4)
KLD_data_sqrt['Range'] = (KLD_data_sqrt['Max'] - KLD_data_sqrt['Min']).round(4)
KLD_data_sqrt['Mean'] = run_scores.mean(axis=1).round(4)

KLD_stats_sqrt = KLD_data_sqrt.sort_values(by=['Mean'], ascending=False)[['Min', 'Max', 'Range', 'Mean']]
display(KLD_stats_sqrt)
outfile='/home/mnewlin/git/AFIT/Thesis/code/results/KLD_results_host_normal_sqrt.csv'
# NOTE(review): the double underscore in this file name is kept as-is so existing outputs keep their name.
outfile2='/home/mnewlin/git/AFIT/Thesis/code/results/KLD_results__host_normal_sqrt_stats.csv'
KLD_data_sqrt.to_csv(outfile)
KLD_stats_sqrt.to_csv(outfile2)

Unnamed: 0,Min,Max,Range,Mean
Wasserstein,8.6102,8.8451,0.2349,8.6879
$l_p$: $p=r=0.5$,8.0197,9.2153,1.1956,8.3817
Mahalanobis,7.4735,9.2128,1.7393,7.913
$l_p$: $p=r=0.75$,6.8089,7.3767,0.5678,7.0549
Cosine,5.3704,7.8285,2.4581,6.7401
Manhattan,5.2858,6.0173,0.7315,5.6909
Perplexity,2.8795,3.4537,0.5742,3.2053
Euclidean,2.3419,2.9337,0.5918,2.6697
Entropy,2.3947,2.7361,0.3414,2.5598
MMD,0.0576,0.1537,0.0961,0.1111


## Log Results

In [8]:
# Read the metric tables from the 'log' results directory. The CSV index
# column ('Unnamed: 0') and the 'fid' column are dropped.
indir = '/home/mnewlin/git/AFIT/Thesis/code/results/log/'

real_data_log = pd.read_csv(indir + 'real_data_exp_host_normal.csv')
real_data_log = real_data_log.drop(columns=['Unnamed: 0', 'fid'])
display(real_data_log.head())

fake_data_log = pd.read_csv(indir + 'fake_data_exp_host_normal.csv')
fake_data_log = fake_data_log.drop(columns=['Unnamed: 0', 'fid'])
display(fake_data_log.head())

Unnamed: 0,Manhattan,Euclidean,lp: p=r=0.5,lp: p=r=0.75,cosine,mahalanobis,wasserstein,entropy,perplexity,mmd
0,575.678823,30.931968,287426.8,4377.868774,0.729121,5342.093246,0.016703,0.068898,4.692285,0.347861
1,419.41948,19.880224,249538.5,3428.755844,0.729532,5624.670468,0.027413,0.216824,9.786899,0.282502
2,557.164398,29.403759,294944.3,4326.042425,0.746214,5507.926394,0.029383,0.231021,12.565048,0.414639
3,460.583717,32.647788,165299.0,3040.458223,0.769915,5427.876917,0.03599,1.099098,24.733004,0.442224
4,127023.323665,9371.854835,27394410.0,745505.833591,0.774067,5598.333521,0.030398,0.078838,5.454309,0.250696


Unnamed: 0,Manhattan,Euclidean,lp: p=r=0.5,lp: p=r=0.75,cosine,mahalanobis,wasserstein,entropy,perplexity,mmd
0,916.355556,42.194362,689192.907962,8006.879987,0.658356,6321.880328,0.589816,0.179628,12.953685,0.282566
1,733.109108,30.350998,602181.045868,6689.257171,0.668528,6333.318203,0.512526,0.315663,17.202767,0.101461
2,933.158869,42.131958,710918.111394,8213.404752,0.665509,6348.871515,0.629817,0.294995,17.757719,0.384868
3,571.595936,22.266015,490209.5788,5326.942693,0.705147,6259.824135,0.541058,1.381051,30.362768,0.332176
4,796.652557,34.052189,642620.163563,7200.269546,0.65389,6351.120334,0.537735,0.170749,12.538689,0.249157


In [9]:

# Histograms for the 'log' results. `titles` and `labels` come from the
# untransformed histogram cell, which must have been run first.
subplots = []
y_label = 'Count'
binsize = 100  # number of bin edges, so every histogram has binsize-1 bins
alpha_blue = 0.8
alpha_red = 0.6
n_repeats = 10
real_data = real_data_log
fake_data = fake_data_log
num_metrics = 10
sample_length = 1000
# Histogram counts per repeat, stored as (repeat, bin, metric).
real_dists = np.zeros((n_repeats,binsize-1,num_metrics))
fake_dists = np.zeros((n_repeats,binsize-1,num_metrics))

for i in range(n_repeats):

    fig = plt.figure(figsize=(15,20))

    grid_x = 5
    grid_y = 2
    grid = mpl.gridspec.GridSpec(grid_x, grid_y)

    # Rows i*sample_length .. (i+1)*sample_length-1 of both tables form repeat i.
    rows = slice(i*sample_length, (i+1)*sample_length)
    r = np.zeros((num_metrics, binsize-1))
    f = np.zeros((num_metrics, binsize-1))
    for count in range(grid_x*grid_y):
        # Fill the grid row by row: metric `count` goes to cell (count // grid_y, count % grid_y).
        ax1 = plt.subplot(grid[count // grid_y, count % grid_y])
        rr = real_data.iloc[rows, count]
        rf = fake_data.iloc[rows, count]
        # The label text selects the transform: ln(x+1), log10(x+1) or none.
        if 'Natural' in labels[count]:
            rr, rf = np.log(rr+1), np.log(rf+1)
        elif '10' in labels[count]:
            rr, rf = np.log10(rr+1), np.log10(rf+1)
        # Both histograms share the same edges so their counts line up bin by bin.
        best_bins_min = np.minimum(rr.min(), rf.min())
        best_bins_max = np.maximum(rr.max(), rf.max())
        bins_best = np.linspace(best_bins_min, best_bins_max, binsize)
        # ax.hist returns (counts, edges, patches); only the counts are kept.
        r[count] = ax1.hist(rr,color='blue', bins=bins_best, alpha=alpha_blue, label='Real-Real')[0]
        f[count] = ax1.hist(rf,color='red', bins=bins_best, alpha=alpha_red, label='Real-Fake')[0]
        ax1.set_xlabel(labels[count], fontsize=16)
        ax1.set_ylabel(y_label, fontsize=16)
        ax1.set_title(titles[count], fontsize=16)
        ax1.legend(fontsize=16)

    # r and f are (metric, bin); the stored layout is (bin, metric).
    real_dists[i] = r.T
    fake_dists[i] = f.T

    plt.tight_layout()
    for figure_path in ('figures/discriminative/png/uhnds_host/log/normal/hist_mat_vert_1000_1000_{}.png',
                        'figures/discriminative/eps/uhnds_host/log/normal/hist_mat_vert_1000_1000_{}.eps',
                        'figures/discriminative/eps/uhnds_host/log/normal/hist_mat_vert_1000_1000_{}.pdf'):
        plt.savefig(figure_path.format(i))
    plt.close()
print("Finished")
    

Finished


In [10]:
# Score every metric by comparing its Real-Real histogram with its Real-Fake
# histogram via metrics.KL, once per run, then summarise the runs ('log' results).
# NOTE: `metrics` is the local module imported in the first cell; that import
# comes after, and therefore shadows, `from sklearn import ... metrics`.

# Column order of real_dists / fake_dists: the same as the metric columns of the
# result tables (Manhattan first, then Euclidean).
hist_cols = ['Manhattan', 'Euclidean', 'lp: p=r=0.5', 'lp: p=r=0.75', 'cosine', 'mahalanobis', 'wasserstein', 'entropy', 'perplexity', 'mmd']
# Row order of the results table (Euclidean first), kept so the saved CSVs keep their layout.
row_cols = ['Euclidean', 'Manhattan', 'lp: p=r=0.5', 'lp: p=r=0.75', 'cosine', 'mahalanobis', 'wasserstein', 'entropy', 'perplexity', 'mmd']
cols = ['Run {}'.format(run + 1) for run in range(n_repeats)]

KLD_data_all = np.zeros((num_metrics, n_repeats))
for x in range(n_repeats):
    real_hist = pd.DataFrame(data=real_dists[x], columns=hist_cols)
    fake_hist = pd.DataFrame(data=fake_dists[x], columns=hist_cols)
    for row, col in enumerate(row_cols):
        # Normalise the bin counts so each histogram sums to 1 before comparing.
        p_real = real_hist.loc[:, col] / np.sum(real_hist.loc[:, col])
        p_fake = fake_hist.loc[:, col] / np.sum(fake_hist.loc[:, col])
        KLD_data_all[row, x] = metrics.KL(p_real, p_fake)

# Display names for the rows, in row_cols order.
names = ['Euclidean', 'Manhattan', r'$l_p$: $p=r=0.5$', r'$l_p$: $p=r=0.75$', 'Cosine', 'Mahalanobis', 'Wasserstein', 'Entropy', 'Perplexity', 'MMD']
KLD_data_log = pd.DataFrame(KLD_data_all, index=names, columns=cols)

# Summary statistics over the run columns only (taken before any summary column is added).
run_scores = KLD_data_log.loc[:, cols]
KLD_data_log['Min'] = run_scores.min(axis=1).round(4)
KLD_data_log['Max'] = run_scores.max(axis=1).round(4)
KLD_data_log['Range'] = (KLD_data_log['Max'] - KLD_data_log['Min']).round(4)
KLD_data_log['Mean'] = run_scores.mean(axis=1).round(4)

KLD_stats_log = KLD_data_log.sort_values(by=['Mean'], ascending=False)[['Min', 'Max', 'Range', 'Mean']]
display(KLD_stats_log)
outfile='/home/mnewlin/git/AFIT/Thesis/code/results/KLD_results_host_normal_log.csv'
# NOTE(review): the double underscore in this file name is kept as-is so existing outputs keep their name.
outfile2='/home/mnewlin/git/AFIT/Thesis/code/results/KLD_results__host_normal_log_stats.csv'
KLD_data_log.to_csv(outfile)
KLD_stats_log.to_csv(outfile2)

Unnamed: 0,Min,Max,Range,Mean
Wasserstein,8.9269,9.0872,0.1603,8.9951
$l_p$: $p=r=0.5$,8.0712,9.0103,0.9391,8.311
Mahalanobis,7.4426,9.1066,1.664,7.8711
Cosine,5.9481,7.9084,1.9603,6.9901
$l_p$: $p=r=0.75$,6.2948,6.9803,0.6855,6.6372
Manhattan,4.2782,4.8455,0.5673,4.586
Entropy,3.108,3.8019,0.6939,3.4024
Perplexity,2.3014,2.7109,0.4095,2.5402
Euclidean,1.4268,1.8516,0.4248,1.6083
MMD,0.0561,0.165,0.1089,0.1409


## PCA Results

In [11]:
# Load the real and fake metric tables stored under the pca results directory.
# The first CSV column ('Unnamed: 0') is the index written by an earlier
# to_csv call and is discarded.
indir = '/home/mnewlin/git/AFIT/Thesis/code/results/pca/'

real_pca_csv = indir + 'real_data_exp_host_normal.csv'
real_data_pca = pd.read_csv(real_pca_csv).drop(columns=['Unnamed: 0'])
display(real_data_pca.head())

fake_pca_csv = indir + 'fake_data_exp_host_normal.csv'
fake_data_pca = pd.read_csv(fake_pca_csv).drop(columns=['Unnamed: 0'])
display(fake_data_pca.head())

Unnamed: 0,Manhattan,Euclidean,lp: p=r=0.5,lp: p=r=0.75,cosine,mahalanobis,wasserstein,entropy,perplexity,mmd,fid
0,3.023384,1.381805,28.405489,5.960992,0.449766,119.9375,0.051761,0.140218,0.602425,0.218504,118.837547
1,2.947267,1.320038,28.158765,5.855208,0.345413,116.333244,0.0657,0.19628,0.769909,0.213937,132.972137
2,3.109288,1.328393,29.697517,6.234704,0.61744,115.526364,0.05682,0.238677,0.860582,0.185271,181.926793
3,3.551161,1.404432,37.300385,7.406185,0.665088,126.635624,0.078207,0.18708,0.786804,0.425452,157.961042
4,2.642627,1.14578,26.915911,5.368901,0.443586,100.933728,0.051219,0.179875,0.686367,0.343648,135.770067


Unnamed: 0,Manhattan,Euclidean,lp: p=r=0.5,lp: p=r=0.75,cosine,mahalanobis,wasserstein,entropy,perplexity,mmd,fid
0,4.740856,1.41903,77.430124,11.703355,0.675689,126.376566,0.122608,0.697994,2.402162,0.494275,230.73051
1,4.734695,1.443705,77.024229,11.656662,0.69022,129.197258,0.122993,0.666854,2.354208,0.530584,239.807976
2,4.469751,1.387902,71.77472,10.937228,0.598249,123.79557,0.114284,0.652026,2.266367,0.448981,248.052509
3,4.93798,1.458786,81.620104,12.256232,0.636869,130.693348,0.110416,0.625298,2.134136,0.295171,195.397938
4,4.491841,1.371096,72.647073,11.03629,0.665399,123.154836,0.116122,0.589171,2.098212,0.462207,199.153793


In [12]:
# Panel titles, listed in the column order of the metric frames.
titles = ['Manhattan', 'Euclidean', r'$l_p$: $p=r=0.5$', r'$l_p$: $p=r=0.75$', 'Cosine', 'Mahalanobis', 'Wasserstein', 'Entropy', 'Perplexity', 'MMD', 'FID']
# The x-axis labels double as the transform selector in the loop below:
# a label containing 'Natural' -> log(x+1), a label containing '10' -> log10(x+1),
# anything else -> values are plotted untransformed.
labels = [r'$\log_{10}$ Metric values', r'$\log_{10}$ Metric values', r'$\log_{10}$ Metric values', r'$\log_{10}$ Metric values', 
         'Metric values', 'Natural log metric values', 'Natural log Metric values', 'Natural log Metric values', 'Natural log Metric values', 
          'Metric values', 'Metric Values']
subplots = []
y_label = 'Count'
binsize = 100  # number of bin edges, i.e. binsize-1 bins per histogram
alpha_blue = 0.8
alpha_red = 0.6
n_repeats = 10
real_data = real_data_pca
fake_data = fake_data_pca
num_metrics = 11 # 11 for PCA space
sample_length = 1000  # rows of the metric frames consumed per repeat
# Histogram counts per repeat, laid out as (repeat, bin, metric).
real_dists = np.zeros((n_repeats,binsize-1,num_metrics))
fake_dists = np.zeros((n_repeats,binsize-1,num_metrics))

grid_x = 4
grid_y = 3
# Every metric needs its own panel; fail loudly instead of dropping one.
assert grid_x * grid_y >= num_metrics, "subplot grid is too small for num_metrics"

for i in range(n_repeats):

    fig = plt.figure(figsize=(16,20))
    grid = mpl.gridspec.GridSpec(grid_x, grid_y)
    # Rows of the metric frames that belong to repeat i.
    run_rows = slice(i*sample_length, (i+1)*sample_length)

    # Iterate over the metrics (not over the grid cells) so the loop ends
    # exactly at num_metrics and surplus grid cells are simply left empty.
    for count in range(num_metrics):
        j, k = divmod(count, grid_y)
        ax1 = plt.subplot(grid[j,k])

        rr = real_data.iloc[run_rows, count]
        rf = fake_data.iloc[run_rows, count]
        if 'Natural' in labels[count]:
            rr = np.log(rr+1)
            rf = np.log(rf+1)
        elif '10' in labels[count]:
            rr = np.log10(rr+1)
            rf = np.log10(rf+1)

        # Shared bin edges so the two histograms are directly comparable.
        best_bins_min = np.minimum(rr.min(), rf.min())
        best_bins_max = np.maximum(rr.max(), rf.max())
        bins_best = np.linspace(best_bins_min, best_bins_max, binsize)
        real_hist_data = ax1.hist(rr,color='blue', bins=bins_best, alpha=alpha_blue, label='Real-Real')
        fake_hist_data = ax1.hist(rf,color='red', bins=bins_best, alpha=alpha_red, label='Real-Fake')
        # hist() returns (counts, edges, patches); only the counts are kept.
        real_dists[i, :, count] = real_hist_data[0]
        fake_dists[i, :, count] = fake_hist_data[0]

        ax1.set_xlabel(labels[count], fontsize=16)
        ax1.set_ylabel(y_label, fontsize=16)
        ax1.set_title(titles[count], fontsize=16)
        ax1.legend(fontsize=16)

    plt.tight_layout()
    plt.savefig('figures/discriminative/png/uhnds_host/pca/normal/hist_mat_vert_1000_1000_{}.png'.format(i))
    plt.savefig('figures/discriminative/eps/uhnds_host/pca/normal/hist_mat_vert_1000_1000_{}.eps'.format(i))
    # NOTE(review): the PDF is written under the eps/ tree - confirm this is intended.
    plt.savefig('figures/discriminative/eps/uhnds_host/pca/normal/hist_mat_vert_1000_1000_{}.pdf'.format(i))
    # Close this figure explicitly so figures do not accumulate across repeats.
    plt.close(fig)
print("Finished")
    

Finished


In [13]:
# Column order of the histogram arrays. It follows the column order of the
# metric frames that were binned (Manhattan first, then Euclidean), as shown
# in the frame previews above.
hist_cols = ['Manhattan', 'Euclidean', 'lp: p=r=0.5', 'lp: p=r=0.75', 'cosine', 'mahalanobis', 'wasserstein', 'entropy', 'perplexity', 'mmd', 'fid']
# Row order of the result table: Euclidean first, then Manhattan, then the
# remaining metrics in histogram order.
row_order = ['Euclidean', 'Manhattan'] + hist_cols[2:]
# Display labels for the rows, aligned with row_order.
names = ['Euclidean', 'Manhattan', r'$l_p$: $p=r=0.5$', r'$l_p$: $p=r=0.75$', 'Cosine', 'Mahalanobis', 'Wasserstein', 'Entropy', 'Perplexity', 'MMD', 'FID']
cols = ['Run {}'.format(run + 1) for run in range(n_repeats)]

# KLD_data_all[row, run] holds the KL score between the real and fake
# histograms of one metric for one repeat.
KLD_data_all = np.zeros((num_metrics, n_repeats))
for x in range(n_repeats):
    real_hist = pd.DataFrame(data=real_dists[x], columns=hist_cols)
    fake_hist = pd.DataFrame(data=fake_dists[x], columns=hist_cols)
    for row, metric in enumerate(row_order):
        real_counts = real_hist[metric]
        fake_counts = fake_hist[metric]
        # Bin counts are normalised to sum to one before being handed to KL.
        KLD_data_all[row, x] = metrics.KL(real_counts/real_counts.sum(), fake_counts/fake_counts.sum())

KLD_data_pca = pd.DataFrame(KLD_data_all, index=names, columns=cols)

# Per-metric summary statistics across the runs.
# Run columns are selected through `cols` rather than a hard-coded 'Run 10'
# label, and the summary columns are created by plain column assignment:
# `.at` is a scalar accessor and is not meant to take a slice.
run_scores = KLD_data_pca.loc[:, cols]
KLD_data_pca['Min'] = np.round(run_scores.min(axis=1), 4)
KLD_data_pca['Max'] = np.round(run_scores.max(axis=1), 4)
KLD_data_pca['Range'] = np.round(KLD_data_pca['Max'] - KLD_data_pca['Min'], 4)
KLD_data_pca['Mean'] = np.round(run_scores.mean(axis=1), 4)

# Summary columns only, ordered by descending mean KLD.
KLD_stats_pca = KLD_data_pca.sort_values(by=['Mean'], ascending=False).loc[:, 'Min':'Mean']
display(KLD_stats_pca)

# Persist the full table (runs + statistics) and the statistics-only view.
outfile='/home/mnewlin/git/AFIT/Thesis/code/results/KLD_results_host_pca.csv'
outfile2='/home/mnewlin/git/AFIT/Thesis/code/results/KLD_results_host_pca_stats.csv'
KLD_data_pca.to_csv(outfile)
KLD_stats_pca.to_csv(outfile2)

Unnamed: 0,Min,Max,Range,Mean
Entropy,8.2269,8.3973,0.1704,8.2946
Wasserstein,7.9475,9.7827,1.8352,8.1977
Perplexity,7.9586,8.2346,0.276,8.0388
$l_p$: $p=r=0.5$,7.7831,8.6805,0.8974,7.9467
$l_p$: $p=r=0.75$,7.7518,8.4951,0.7433,7.8918
Manhattan,7.7053,8.4463,0.741,7.8538
FID,5.3255,5.9547,0.6292,5.6465
Cosine,4.3435,4.6269,0.2834,4.484
Mahalanobis,3.3921,4.1072,0.7151,3.7913
Euclidean,3.2485,3.8268,0.5783,3.5098


## FFT Data

In [18]:
# Load the real and fake metric tables stored under the fft results directory.
# Two columns are discarded from each: the saved index ('Unnamed: 0') and 'fid'.
indir = '/home/mnewlin/git/AFIT/Thesis/code/results/fft/'
fft_unused_cols = ['Unnamed: 0', 'fid']

real_fft_csv = indir + 'real_data_exp_host_uniform.csv'
real_data_fft = pd.read_csv(real_fft_csv).drop(columns=fft_unused_cols)
display(real_data_fft.head())

fake_fft_csv = indir + 'fake_data_exp_host_uniform.csv'
fake_data_fft = pd.read_csv(fake_fft_csv).drop(columns=fft_unused_cols)
display(fake_data_fft.head())

Unnamed: 0,Manhattan,Euclidean,lp: p=r=0.5,lp: p=r=0.75,cosine,mahalanobis,wasserstein,entropy,perplexity,mmd
0,901.463943,39.247526,737778.574246,8201.1873,0.127637,5514.56219,0.052719,0.006487,0.51606,0.130555
1,945.997192,43.689867,766257.232218,8554.243395,0.156093,5561.824278,0.069307,0.006538,0.521493,0.055559
2,943.833445,45.175318,760900.111498,8510.164033,0.132216,5554.123043,0.072782,0.00616,0.497263,0.132582
3,1060.144361,105.001153,795800.466362,9100.95367,0.27687,5455.37574,0.133521,0.027674,2.247832,0.21553
4,931.057585,41.264877,757812.172516,8446.564148,0.135626,5593.406665,0.053391,0.002695,0.213937,0.090316


Unnamed: 0,Manhattan,Euclidean,lp: p=r=0.5,lp: p=r=0.75,cosine,mahalanobis,wasserstein,entropy,perplexity,mmd
0,937.988802,64.832824,751131.264393,8384.159182,0.385258,5866.643717,0.120186,0.099491,7.864667,0.170873
1,957.267982,62.979552,760021.10089,8529.704747,0.396242,5903.165122,0.143458,0.096696,7.636781,0.1414
2,943.018946,64.398037,758027.010104,8448.646292,0.383657,5914.237338,0.111175,0.095998,7.59626,0.189054
3,998.664396,88.688525,774987.397697,8742.344323,0.387173,5940.340949,0.178703,0.09659,7.657024,0.121738
4,935.061867,63.937645,748793.578803,8361.59615,0.375067,5908.103118,0.12291,0.097414,7.709719,0.189762


In [19]:

# NOTE: `labels` and `titles` are not defined in this cell; they must already
# exist in the kernel (they are assigned in the PCA histogram cell). Only their
# first num_metrics entries are used here.
subplots = []
y_label = 'Count'
binsize = 100  # number of bin edges, i.e. binsize-1 bins per histogram
alpha_blue = 0.8
alpha_red = 0.6
n_repeats = 10
real_data = real_data_fft
fake_data = fake_data_fft
num_metrics = 10
sample_length = 1000  # rows of the metric frames consumed per repeat
# Histogram counts per repeat, laid out as (repeat, bin, metric).
real_dists = np.zeros((n_repeats,binsize-1,num_metrics))
fake_dists = np.zeros((n_repeats,binsize-1,num_metrics))

grid_x = 5
grid_y = 2
# Every metric needs its own panel; fail loudly instead of dropping one.
assert grid_x * grid_y >= num_metrics, "subplot grid is too small for num_metrics"

for i in range(n_repeats):

    fig = plt.figure(figsize=(15,20))
    grid = mpl.gridspec.GridSpec(grid_x, grid_y)
    # Rows of the metric frames that belong to repeat i.
    run_rows = slice(i*sample_length, (i+1)*sample_length)

    # Iterate over the metrics (not over the grid cells) so a grid/metric
    # mismatch can neither skip a metric nor index past the label lists.
    for count in range(num_metrics):
        j, k = divmod(count, grid_y)
        ax1 = plt.subplot(grid[j,k])

        rr = real_data.iloc[run_rows, count]
        rf = fake_data.iloc[run_rows, count]
        # The axis label selects the transform: 'Natural' -> log(x+1),
        # '10' -> log10(x+1), otherwise the raw values are used.
        if 'Natural' in labels[count]:
            rr = np.log(rr+1)
            rf = np.log(rf+1)
        elif '10' in labels[count]:
            rr = np.log10(rr+1)
            rf = np.log10(rf+1)

        # Shared bin edges so the two histograms are directly comparable.
        best_bins_min = np.minimum(rr.min(), rf.min())
        best_bins_max = np.maximum(rr.max(), rf.max())
        bins_best = np.linspace(best_bins_min, best_bins_max, binsize)
        real_hist_data = ax1.hist(rr,color='blue', bins=bins_best, alpha=alpha_blue, label='Real-Real')
        fake_hist_data = ax1.hist(rf,color='red', bins=bins_best, alpha=alpha_red, label='Real-Fake')
        # hist() returns (counts, edges, patches); only the counts are kept.
        real_dists[i, :, count] = real_hist_data[0]
        fake_dists[i, :, count] = fake_hist_data[0]

        ax1.set_xlabel(labels[count], fontsize=16)
        ax1.set_ylabel(y_label, fontsize=16)
        ax1.set_title(titles[count], fontsize=16)
        ax1.legend(fontsize=16)

    plt.tight_layout()
    plt.savefig('figures/discriminative/png/uhnds_host/fft/hist_mat_vert_1000_1000_{}.png'.format(i))
    plt.savefig('figures/discriminative/eps/uhnds_host/fft/hist_mat_vert_1000_1000_{}.eps'.format(i))
    # NOTE(review): the PDF is written under the eps/ tree - confirm this is intended.
    plt.savefig('figures/discriminative/eps/uhnds_host/fft/hist_mat_vert_1000_1000_{}.pdf'.format(i))
    # Close this figure explicitly so figures do not accumulate across repeats.
    plt.close(fig)
print("Finished")
    

Finished


In [20]:
# Column order of the histogram arrays. It follows the column order of the
# metric frames that were binned (Manhattan first, then Euclidean), as shown
# in the frame previews above.
hist_cols = ['Manhattan', 'Euclidean', 'lp: p=r=0.5', 'lp: p=r=0.75', 'cosine', 'mahalanobis', 'wasserstein', 'entropy', 'perplexity', 'mmd']
# Row order of the result table: Euclidean first, then Manhattan, then the
# remaining metrics in histogram order.
row_order = ['Euclidean', 'Manhattan'] + hist_cols[2:]
# Display labels for the rows, aligned with row_order.
names = ['Euclidean', 'Manhattan', r'$l_p$: $p=r=0.5$', r'$l_p$: $p=r=0.75$', 'Cosine', 'Mahalanobis', 'Wasserstein', 'Entropy', 'Perplexity', 'MMD']
cols = ['Run {}'.format(run + 1) for run in range(n_repeats)]

# KLD_data_all[row, run] holds the KL score between the real and fake
# histograms of one metric for one repeat.
KLD_data_all = np.zeros((num_metrics, n_repeats))
for x in range(n_repeats):
    real_hist = pd.DataFrame(data=real_dists[x], columns=hist_cols)
    fake_hist = pd.DataFrame(data=fake_dists[x], columns=hist_cols)
    for row, metric in enumerate(row_order):
        real_counts = real_hist[metric]
        fake_counts = fake_hist[metric]
        # Bin counts are normalised to sum to one before being handed to KL.
        KLD_data_all[row, x] = metrics.KL(real_counts/real_counts.sum(), fake_counts/fake_counts.sum())

KLD_data_fft = pd.DataFrame(KLD_data_all, index=names, columns=cols)

# Per-metric summary statistics across the runs.
# Run columns are selected through `cols` rather than a hard-coded 'Run 10'
# label, and the summary columns are created by plain column assignment:
# `.at` is a scalar accessor and is not meant to take a slice.
run_scores = KLD_data_fft.loc[:, cols]
KLD_data_fft['Min'] = np.round(run_scores.min(axis=1), 4)
KLD_data_fft['Max'] = np.round(run_scores.max(axis=1), 4)
KLD_data_fft['Range'] = np.round(KLD_data_fft['Max'] - KLD_data_fft['Min'], 4)
KLD_data_fft['Mean'] = np.round(run_scores.mean(axis=1), 4)

# Summary columns only, ordered by descending mean KLD.
KLD_stats_fft = KLD_data_fft.sort_values(by=['Mean'], ascending=False).loc[:, 'Min':'Mean']
display(KLD_stats_fft)

# Persist the full table (runs + statistics) and the statistics-only view.
outfile='/home/mnewlin/git/AFIT/Thesis/code/results/KLD_results_host_fft.csv'
outfile2='/home/mnewlin/git/AFIT/Thesis/code/results/KLD_results_host_fft_stats.csv'
KLD_data_fft.to_csv(outfile)
KLD_stats_fft.to_csv(outfile2)

Unnamed: 0,Min,Max,Range,Mean
Entropy,8.2557,8.3262,0.0705,8.2967
Cosine,7.9597,8.3278,0.3681,8.0474
Perplexity,7.572,7.6303,0.0583,7.6095
Mahalanobis,4.6258,7.7641,3.1383,7.3879
Wasserstein,6.1238,6.9641,0.8403,6.5194
Euclidean,5.7778,6.5679,0.7901,6.1697
MMD,1.3307,2.0197,0.689,1.6078
Manhattan,0.0395,0.4984,0.4589,0.3492
$l_p$: $p=r=0.75$,0.0344,0.4527,0.4183,0.3247
$l_p$: $p=r=0.5$,0.0036,0.344,0.3404,0.261
