# Imports

In [3]:
import os, platform

local = platform.system() == 'Windows'

if local:
	print('Running Locally')
	workspace_dir = './'
else:
	!pip install qqdm

	from google.colab import drive
	drive.mount('/content/drive/')

	workspace_dir = '/content/drive/MyDrive/School/2022_RIT/Masters/Workspace/'
	print('Running in Google Colab')

Running Locally


In [4]:
import random

from qqdm.notebook import qqdm as qqdm

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import pickle

import torch
from torch import nn
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, TensorDataset, Dataset)
from torch.optim import Adam

from scipy.stats import mstats

# Model

In [None]:
class IDS_VAE(nn.Module):
	"""Convolutional variational autoencoder.

	The encoder applies two stride-2 convolutions and flattens the result; two
	linear heads turn the flattened features into `mu` and `var`. `var` is used
	as a log-variance (the standard deviation is computed as ``exp(0.5 * var)``).
	The decoder projects a latent vector back to ``channel_dims[2] x 2 x 2``,
	upsamples twice with transposed convolutions and ends with a sigmoid.

	The linear heads expect ``channel_dims[2] * 2 * 2`` flattened features, which
	matches single-channel 8x8 inputs (the shape used in the original layer
	annotations).

	Args:
		latent_dim: size of the latent vector.
		KL_weight: multiplier applied to the KL term in `loss`.
	"""

	def __init__(self, latent_dim, KL_weight):
		super().__init__()

		self.latent_dim = latent_dim
		self.KL_weight = KL_weight
		self.criterion = nn.MSELoss()
		self.channel_dims = [1, 32, 64]

		c_in, c_mid, c_deep = self.channel_dims
		# Flattened size of the encoder output (c_deep feature maps of 2x2).
		flat_features = c_deep * 2 * 2

		# Encoder: N x c_in x 8 x 8 -> N x c_mid x 4 x 4 -> N x c_deep x 2 x 2
		self.encoder = nn.Sequential(
			nn.Conv2d(c_in, c_mid, kernel_size=3, stride=2, padding=1),
			nn.BatchNorm2d(c_mid),
			nn.LeakyReLU(),
			nn.Conv2d(c_mid, c_deep, kernel_size=3, stride=2, padding=1),
			nn.BatchNorm2d(c_deep),
			nn.LeakyReLU(),
		)

		# Latent heads: N x flat_features -> N x latent_dim
		self.mu_layer = nn.Linear(flat_features, self.latent_dim)
		self.var_layer = nn.Linear(flat_features, self.latent_dim)

		# Decoder input projection: N x latent_dim -> N x flat_features
		self.decode_linear_input = nn.Linear(self.latent_dim, flat_features)

		# Decoder: N x c_deep x 2 x 2 -> N x c_mid x 4 x 4 -> N x c_mid x 8 x 8
		# -> N x c_in x 8 x 8, squashed to (0, 1) by the final sigmoid.
		self.decoder = nn.Sequential(
			nn.ConvTranspose2d(c_deep, c_mid, kernel_size=3, stride=2, padding=1, output_padding=1),
			nn.BatchNorm2d(c_mid),
			nn.LeakyReLU(),
			nn.ConvTranspose2d(c_mid, c_mid, kernel_size=3, stride=2, padding=1, output_padding=1),
			nn.BatchNorm2d(c_mid),
			nn.LeakyReLU(),
			nn.Conv2d(c_mid, c_in, kernel_size=3, padding=1),
			nn.Sigmoid(),
		)

	def encode(self, x):
		"""Run the encoder and flatten everything except the batch dimension."""
		features = self.encoder(x)
		return torch.flatten(features, start_dim=1)

	def reparameterize(self, x):
		"""Sample a latent vector from flattened encoder features.

		Returns:
			(sample, mu, var) where ``sample = eps * exp(0.5 * var) + mu`` and
			`eps` is standard normal noise drawn on every call.
		"""
		mu = self.mu_layer(x)
		var = self.var_layer(x)

		std = torch.exp(0.5 * var)
		noise = torch.randn_like(std)
		sample = (noise * std) + mu
		return sample, mu, var

	def decode(self, x):
		"""Map latent vectors back to the decoder's output tensor."""
		projected = self.decode_linear_input(x)
		feature_maps = projected.view(-1, self.channel_dims[2], 2, 2)
		return self.decoder(feature_maps)

	def forward(self, x):
		"""Encode, sample and decode.

		Returns:
			(encoded_output, reparameterized_output, mu, var, decoded_output)
		"""
		encoded_output = self.encode(x)
		reparameterized_output, mu, var = self.reparameterize(encoded_output)
		decoded_output = self.decode(reparameterized_output)
		return encoded_output, reparameterized_output, mu, var, decoded_output

	def loss(self, output, input, mu, var):
		"""Combine reconstruction error with the weighted KL term.

		Returns:
			(loss, mse_loss, KL_loss) where ``loss = mse_loss + KL_weight * KL_loss``.
		"""
		mse_loss = self.criterion(output, input)

		# KL term summed over latent dimensions, then averaged over the batch.
		kl_per_sample = -0.5 * torch.sum(1 + var - mu**2 - var.exp(), dim=1)
		KL_loss = torch.mean(kl_per_sample, dim=0)

		total = mse_loss + self.KL_weight * KL_loss
		return total, mse_loss, KL_loss

In [21]:
# Maps USB-IDS CSV column names (keys) to the CIC-IDS-2018 column names (values)
# used by the rest of the notebook. Columns whose name is the same in both
# conventions map to themselves.
# NOTE(review): 'CWR Flag Count' maps to 'CWE Flag Count' (not 'CWR ... Cnt');
# presumably this mirrors the spelling in the CIC-IDS-2018 files — confirm.
cols_USBIDS_to_CICIDS2018 = {
	'Dst Port'                   : 'Dst Port',
	'Protocol'                   : 'Protocol',
	'Timestamp'                  : 'Timestamp',
	'Flow Duration'              : 'Flow Duration',
	'Total Fwd Packet'           : 'Tot Fwd Pkts',
	'Total Bwd packets'          : 'Tot Bwd Pkts',
	'Total Length of Fwd Packet' : 'TotLen Fwd Pkts',
	'Total Length of Bwd Packet' : 'TotLen Bwd Pkts',
	'Fwd Packet Length Max'      : 'Fwd Pkt Len Max',
	'Fwd Packet Length Min'      : 'Fwd Pkt Len Min',
	'Fwd Packet Length Mean'     : 'Fwd Pkt Len Mean',
	'Fwd Packet Length Std'      : 'Fwd Pkt Len Std',
	'Bwd Packet Length Max'      : 'Bwd Pkt Len Max',
	'Bwd Packet Length Min'      : 'Bwd Pkt Len Min',
	'Bwd Packet Length Mean'     : 'Bwd Pkt Len Mean',
	'Bwd Packet Length Std'      : 'Bwd Pkt Len Std',
	'Flow Bytes/s'               : 'Flow Byts/s',
	'Flow Packets/s'             : 'Flow Pkts/s',
	'Flow IAT Mean'              : 'Flow IAT Mean',
	'Flow IAT Std'               : 'Flow IAT Std',
	'Flow IAT Max'               : 'Flow IAT Max',
	'Flow IAT Min'               : 'Flow IAT Min',
	'Fwd IAT Total'              : 'Fwd IAT Tot',
	'Fwd IAT Mean'               : 'Fwd IAT Mean',
	'Fwd IAT Std'                : 'Fwd IAT Std',
	'Fwd IAT Max'                : 'Fwd IAT Max',
	'Fwd IAT Min'                : 'Fwd IAT Min',
	'Bwd IAT Total'              : 'Bwd IAT Tot',
	'Bwd IAT Mean'               : 'Bwd IAT Mean',
	'Bwd IAT Std'                : 'Bwd IAT Std',
	'Bwd IAT Max'                : 'Bwd IAT Max',
	'Bwd IAT Min'                : 'Bwd IAT Min',
	'Fwd PSH Flags'              : 'Fwd PSH Flags',
	'Bwd PSH Flags'              : 'Bwd PSH Flags',
	'Fwd URG Flags'              : 'Fwd URG Flags',
	'Bwd URG Flags'              : 'Bwd URG Flags',
	'Fwd Header Length'          : 'Fwd Header Len',
	'Bwd Header Length'          : 'Bwd Header Len',
	'Fwd Packets/s'              : 'Fwd Pkts/s',
	'Bwd Packets/s'              : 'Bwd Pkts/s',
	'Packet Length Min'          : 'Pkt Len Min',
	'Packet Length Max'          : 'Pkt Len Max',
	'Packet Length Mean'         : 'Pkt Len Mean',
	'Packet Length Std'          : 'Pkt Len Std',
	'Packet Length Variance'     : 'Pkt Len Var',
	'FIN Flag Count'             : 'FIN Flag Cnt',
	'SYN Flag Count'             : 'SYN Flag Cnt',
	'RST Flag Count'             : 'RST Flag Cnt',
	'PSH Flag Count'             : 'PSH Flag Cnt',
	'ACK Flag Count'             : 'ACK Flag Cnt',
	'URG Flag Count'             : 'URG Flag Cnt',
	'CWR Flag Count'             : 'CWE Flag Count',
	'ECE Flag Count'             : 'ECE Flag Cnt',
	'Down/Up Ratio'              : 'Down/Up Ratio',
	'Average Packet Size'        : 'Pkt Size Avg',
	'Fwd Segment Size Avg'       : 'Fwd Seg Size Avg',
	'Bwd Segment Size Avg'       : 'Bwd Seg Size Avg',
	'Fwd Bytes/Bulk Avg'         : 'Fwd Byts/b Avg',
	'Fwd Packet/Bulk Avg'        : 'Fwd Pkts/b Avg',
	'Fwd Bulk Rate Avg'          : 'Fwd Blk Rate Avg',
	'Bwd Bytes/Bulk Avg'         : 'Bwd Byts/b Avg',
	'Bwd Packet/Bulk Avg'        : 'Bwd Pkts/b Avg',
	'Bwd Bulk Rate Avg'          : 'Bwd Blk Rate Avg',
	'Subflow Fwd Packets'        : 'Subflow Fwd Pkts',
	'Subflow Fwd Bytes'          : 'Subflow Fwd Byts',
	'Subflow Bwd Packets'        : 'Subflow Bwd Pkts',
	'Subflow Bwd Bytes'          : 'Subflow Bwd Byts',
	'FWD Init Win Bytes'         : 'Init Fwd Win Byts',
	'Bwd Init Win Bytes'         : 'Init Bwd Win Byts',
	'Fwd Act Data Pkts'          : 'Fwd Act Data Pkts',
	'Fwd Seg Size Min'           : 'Fwd Seg Size Min',
	'Active Mean'                : 'Active Mean',
	'Active Std'                 : 'Active Std',
	'Active Max'                 : 'Active Max',
	'Active Min'                 : 'Active Min',
	'Idle Mean'                  : 'Idle Mean',
	'Idle Std'                   : 'Idle Std',
	'Idle Max'                   : 'Idle Max',
	'Idle Min'                   : 'Idle Min',
	'Label'                      : 'Label'
}

# Inverse lookup: CIC-IDS-2018 column name -> USB-IDS column name.
# Built with a comprehension so no loop variable is left behind in the
# notebook namespace.
cols_CICIDS2018_to_USBIDS = {
	cic_name: usb_name
	for usb_name, cic_name in cols_USBIDS_to_CICIDS2018.items()
}

# If two USB-IDS names mapped to the same CIC name, inversion would silently
# keep only the last one; fail loudly instead.
assert len(cols_CICIDS2018_to_USBIDS) == len(cols_USBIDS_to_CICIDS2018), \
	'cols_USBIDS_to_CICIDS2018 is not one-to-one'

# Column selection and order (CIC-IDS-2018 names) applied to a converted frame;
# passed as `col_order` to read_and_convert_USB_to_CIC2018 in the load cell.
# Compared with the values of cols_USBIDS_to_CICIDS2018 it omits 'Dst Port'
# and 'Timestamp'.
# NOTE(review): `final_order` is assigned again further down with a shorter
# list, so the active definition depends on cell execution order.
final_order = [
	'Protocol', 'Flow Duration', 'Tot Fwd Pkts', 'Tot Bwd Pkts', 
	'TotLen Fwd Pkts', 'TotLen Bwd Pkts', 'Fwd Pkt Len Max', 'Fwd Pkt Len Min',
	'Fwd Pkt Len Mean', 'Fwd Pkt Len Std', 'Bwd Pkt Len Max', 'Bwd Pkt Len Min',
	'Bwd Pkt Len Mean', 'Bwd Pkt Len Std', 'Flow Byts/s', 'Flow Pkts/s',
	'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min',
	'Fwd IAT Tot', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min',
	'Bwd IAT Tot', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min',
	'Fwd PSH Flags', 'Bwd PSH Flags', 'Fwd URG Flags', 'Bwd URG Flags',
	'Fwd Header Len', 'Bwd Header Len', 'Fwd Pkts/s', 'Bwd Pkts/s',
	'Pkt Len Min', 'Pkt Len Max', 'Pkt Len Mean', 'Pkt Len Std', 'Pkt Len Var',
	'FIN Flag Cnt', 'SYN Flag Cnt', 'RST Flag Cnt', 'PSH Flag Cnt',
	'ACK Flag Cnt', 'URG Flag Cnt', 'CWE Flag Count', 'ECE Flag Cnt',
	'Down/Up Ratio', 'Pkt Size Avg', 'Fwd Seg Size Avg', 'Bwd Seg Size Avg',
	'Fwd Byts/b Avg', 'Fwd Pkts/b Avg', 'Fwd Blk Rate Avg', 'Bwd Byts/b Avg',
	'Bwd Pkts/b Avg', 'Bwd Blk Rate Avg', 'Subflow Fwd Pkts',
	'Subflow Fwd Byts', 'Subflow Bwd Pkts', 'Subflow Bwd Byts',
	'Init Fwd Win Byts', 'Init Bwd Win Byts', 'Fwd Act Data Pkts',
	'Fwd Seg Size Min', 'Active Mean', 'Active Std', 'Active Max', 'Active Min',
	'Idle Mean', 'Idle Std', 'Idle Max', 'Idle Min', 'Label'
]

In [47]:
# cols_USBIDS_to_CICIDS2018
def read_and_convert_USB_to_CIC2018(filename, col_to_2018, col_order):
    """Load a CSV, rename its columns through `col_to_2018`, and select `col_order`.

    Args:
        filename: path handed to `pd.read_csv`.
        col_to_2018: mapping from source column name to target column name.
            Columns missing from the mapping keep their name and are reported
            on stdout.
        col_order: list of (renamed) columns to keep, in the desired order.

    Returns:
        A DataFrame containing exactly the `col_order` columns.
    """
    def _translate(name):
        # Unmapped columns are reported and passed through unchanged.
        if name not in col_to_2018:
            print(f'Column {name} not in 2018 map')
            return name
        return col_to_2018[name]

    print('=====[Reading CSV file]=====')
    frame = pd.read_csv(filename)

    print('=====[Converting columns from USB IDS to CIC IDS 2018]=====')
    frame.columns = [_translate(name) for name in frame.columns]

    print('=====[Reordering columns]=====')
    return frame[col_order]

def print_col_stats(df, col):
    """Print the minimum, mean and maximum of column `col` of `df`."""
    print(f'Column \'{col}\' stats:')
    column = df[col]
    # Each statistic is computed right before it is printed.
    for label, compute in (('min:  ', column.min), ('mean: ', column.mean), ('max:  ', column.max)):
        print(f'\t{label}{compute()}')

def new_normalization(df: pd.DataFrame, cols_to_normalize: list, debug=False):
    """Winsorize and min-max scale the given columns of `df` in place.

    Each listed column is winsorized with limits [0.05, 0.05] and then rescaled
    as ``(x - min) / (max - min)``. A column whose range is zero is reported on
    stdout and divided by 1 instead.

    Args:
        df: frame to modify; the listed columns are overwritten.
        cols_to_normalize: names of the columns to process.
        debug: when true, print column statistics before and after each step.

    Returns:
        The same `df` object that was passed in.
    """
    print('=====[Performing winsorization and normalization]=====')

    for name in cols_to_normalize:
        if debug:
            print_col_stats(df, name)
            print(f'Winsorizing column \'{name}\'')

        df[name] = mstats.winsorize(df[name], limits=[0.05, 0.05])

        if debug:
            print_col_stats(df, name)
            print('Min Max Scaling to [0,1]')

        lowest = df[name].min()
        span = df[name].max() - lowest
        if span == 0.0:
            # Constant column: avoid dividing by zero.
            print(f'Zero width column: \'{name}\'')
            span = 1
        df[name] = (df[name] - lowest) / span

        if debug:
            print_col_stats(df, name)
            print('')

    return df

def freq_encode_cols(df: pd.DataFrame, cols_to_encode: list, debug=False):
    """Replace each value in the given columns with its relative frequency.

    For every column in `cols_to_encode`, values are replaced by the fraction
    of rows holding that value (``value_counts(normalize=True)``).

    The `return` used to sit inside the loop, so only the first column was
    encoded and an empty `cols_to_encode` returned ``None``. All columns are
    now processed before returning.

    Args:
        df: input frame; it is not modified.
        cols_to_encode: names of the columns to frequency-encode.
        debug: when true, print the intermediate counts and mappings.

    Returns:
        A new DataFrame with the listed columns encoded.
    """
    print('=====[Performing frequency encoding]=====')
    # Work on a copy so the caller's frame is left untouched, as before.
    encoded = df.copy()

    for col in cols_to_encode:
        if debug: print(f'Frequency encoding column {col}\n')
        val_counts = encoded[col].value_counts(normalize=True)
        if debug: print('Value counts:\n', val_counts, '\n')

        if debug: print('Replace Dict:', val_counts.to_dict())
        # Index-based lookup: every non-null value is a key of val_counts;
        # missing values stay missing.
        encoded[col] = encoded[col].map(val_counts)

        if debug:
            unique_values = encoded[col].unique()
            print('Unique Values:', unique_values, '\n')

    return encoded



def normalize_CSV(df: pd.DataFrame):
    """Frequency-encode 'Protocol', normalize the numeric feature columns, and
    report any column that neither step (nor 'Label') accounts for.

    Args:
        df: frame using CIC-IDS-2018 column names.

    Returns:
        The processed DataFrame.
    """
    print(f'Columns: {df.columns}')

    cols_to_encode = ['Protocol']
    df = freq_encode_cols(df, cols_to_encode)

    print('Column types:')
    print(df.dtypes)

    cols_to_normalize = [
        'Flow Duration',
        'Tot Fwd Pkts', 'Tot Bwd Pkts',
        'TotLen Fwd Pkts', 'TotLen Bwd Pkts',
        'Fwd Pkt Len Max', 'Fwd Pkt Len Min', 'Fwd Pkt Len Mean', 'Fwd Pkt Len Std',
        'Bwd Pkt Len Max', 'Bwd Pkt Len Min', 'Bwd Pkt Len Mean', 'Bwd Pkt Len Std',
        'Flow Byts/s', 'Flow Pkts/s',
        'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min',
        'Fwd IAT Tot', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min',
        'Bwd IAT Tot', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min',
        'Fwd PSH Flags', 'Bwd PSH Flags', 'Fwd URG Flags', 'Bwd URG Flags',
        'Fwd Header Len', 'Bwd Header Len',
        'Fwd Pkts/s', 'Bwd Pkts/s',
        'Pkt Len Min', 'Pkt Len Max', 'Pkt Len Mean', 'Pkt Len Std', 'Pkt Len Var',
        'FIN Flag Cnt', 'SYN Flag Cnt', 'RST Flag Cnt', 'PSH Flag Cnt',
        'ACK Flag Cnt', 'URG Flag Cnt', 'CWE Flag Count', 'ECE Flag Cnt',
        'Down/Up Ratio', 'Pkt Size Avg',
        'Fwd Seg Size Avg', 'Bwd Seg Size Avg',
        'Fwd Byts/b Avg', 'Fwd Pkts/b Avg', 'Fwd Blk Rate Avg',
        'Bwd Byts/b Avg', 'Bwd Pkts/b Avg', 'Bwd Blk Rate Avg',
        'Subflow Fwd Pkts', 'Subflow Fwd Byts', 'Subflow Bwd Pkts', 'Subflow Bwd Byts',
        'Init Fwd Win Byts', 'Init Bwd Win Byts',
        'Fwd Act Data Pkts', 'Fwd Seg Size Min',
        'Active Mean', 'Active Std', 'Active Max', 'Active Min',
        'Idle Mean', 'Idle Std', 'Idle Max', 'Idle Min',
    ]
    df = new_normalization(df, cols_to_normalize, False)

    # Every column must be encoded, normalized, or be the label.
    handled = set(cols_to_encode) | set(cols_to_normalize) | {'Label'}
    leftovers = [name for name in df.columns if name not in handled]
    for name in leftovers:
        print(f'Column {name} is unaccounted for in conversion!')

    if leftovers:
        print('Some columns were not accounted for!')
    else:
        print('All columns are accounted for!')

    return df

In [81]:
# Load one USB-IDS capture, convert it to CIC-IDS-2018 column names/order,
# then encode and normalize it.
filen = 'TCPFlood-Security2.csv'
filename = f'{workspace_dir}USB-IDS/{filen}'
data = normalize_CSV(
    read_and_convert_USB_to_CIC2018(filename, cols_USBIDS_to_CICIDS2018, final_order)
)

=====[Reading CSV file]=====
=====[Converting columns from USB IDS to CIC IDS 2018]=====
Column Flow ID not in 2018 map
Column Src IP not in 2018 map
Column Src Port not in 2018 map
Column Dst IP not in 2018 map
=====[Reordering columns]=====
Columns: Index(['Protocol', 'Flow Duration', 'Tot Fwd Pkts', 'Tot Bwd Pkts',
       'TotLen Fwd Pkts', 'TotLen Bwd Pkts', 'Fwd Pkt Len Max',
       'Fwd Pkt Len Min', 'Fwd Pkt Len Mean', 'Fwd Pkt Len Std',
       'Bwd Pkt Len Max', 'Bwd Pkt Len Min', 'Bwd Pkt Len Mean',
       'Bwd Pkt Len Std', 'Flow Byts/s', 'Flow Pkts/s', 'Flow IAT Mean',
       'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Tot',
       'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min',
       'Bwd IAT Tot', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max',
       'Bwd IAT Min', 'Fwd PSH Flags', 'Bwd PSH Flags', 'Fwd URG Flags',
       'Bwd URG Flags', 'Fwd Header Len', 'Bwd Header Len', 'Fwd Pkts/s',
       'Bwd Pkts/s', 'Pkt Len Min', 'Pkt Len Max', 'Pkt Len 

In [82]:
print(data['Label'].value_counts())

# Write one CSV per label value under USB-IDS/class_split/.
split_dir = workspace_dir + 'USB-IDS/class_split/'
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(split_dir, exist_ok=True)
# splitext removes the extension whatever its length (the old `filen[:-4]`
# slice silently assumed exactly four characters such as '.csv').
file_stem = os.path.splitext(filen)[0]

for label in data['Label'].unique():
    split_filename = f'{split_dir}{file_stem}-{label}.csv'
    print(f'\nSaving {label} class to\n{split_filename}')
    subdata = data[data['Label'] == label]
    subdata.to_csv(split_filename, index=False)

TCPFlood-Security2    296944
BENIGN                    27
Name: Label, dtype: int64

Saving TCPFlood-Security2 class to
/content/drive/MyDrive/School/2022_RIT/Masters/Workspace/USB-IDS/class_split/TCPFlood-Security2-TCPFlood-Security2.csv

Saving BENIGN class to
/content/drive/MyDrive/School/2022_RIT/Masters/Workspace/USB-IDS/class_split/TCPFlood-Security2-BENIGN.csv


In [None]:
# Show the current column names of `data`.
# NOTE(review): the saved output under this cell lists names such as 'Flow ID'
# and 'Src IP', so it was presumably produced from an unconverted frame —
# confirm the intended execution order.
print('columns:',list(data.columns))

columns: ['Flow ID', 'Src IP', 'Src Port', 'Dst IP', 'Dst Port', 'Protocol', 'Timestamp', 'Flow Duration', 'Total Fwd Packet', 'Total Bwd packets', 'Total Length of Fwd Packet', 'Total Length of Bwd Packet', 'Fwd Packet Length Max', 'Fwd Packet Length Min', 'Fwd Packet Length Mean', 'Fwd Packet Length Std', 'Bwd Packet Length Max', 'Bwd Packet Length Min', 'Bwd Packet Length Mean', 'Bwd Packet Length Std', 'Flow Bytes/s', 'Flow Packets/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags', 'Bwd PSH Flags', 'Fwd URG Flags', 'Bwd URG Flags', 'Fwd Header Length', 'Bwd Header Length', 'Fwd Packets/s', 'Bwd Packets/s', 'Packet Length Min', 'Packet Length Max', 'Packet Length Mean', 'Packet Length Std', 'Packet Length Variance', 'FIN Flag Count', 'SYN Flag Count', 'RST Flag Count', 'PSH Flag Count', 'ACK Fla

In [None]:
# Second definition of `final_order`; it replaces the earlier list once this
# cell runs. This version has no 'Protocol', no PSH/URG flag columns, no
# 'ACK Flag Cnt' / 'CWE Flag Count', no bulk-average columns and no
# 'Active Max'.
# NOTE(review): two different lists share one name, so results depend on which
# cell ran last — consider giving this one its own name.
final_order = [
	'Flow Duration', 'Tot Fwd Pkts', 'Tot Bwd Pkts', 'TotLen Fwd Pkts',
	'TotLen Bwd Pkts', 'Fwd Pkt Len Max', 'Fwd Pkt Len Min',
	'Fwd Pkt Len Mean', 'Fwd Pkt Len Std', 'Bwd Pkt Len Max', 
	'Bwd Pkt Len Min', 'Bwd Pkt Len Mean', 'Bwd Pkt Len Std', 
	'Flow Byts/s', 'Flow Pkts/s', 'Flow IAT Mean', 'Flow IAT Std', 
	'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Tot', 'Fwd IAT Mean', 
	'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Tot', 
	'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 
	'Fwd Header Len', 'Bwd Header Len', 'Fwd Pkts/s', 'Bwd Pkts/s', 
	'Pkt Len Min', 'Pkt Len Max', 'Pkt Len Mean', 'Pkt Len Std', 
	'Pkt Len Var', 'FIN Flag Cnt', 'SYN Flag Cnt', 'RST Flag Cnt', 
	'PSH Flag Cnt', 'URG Flag Cnt', 'ECE Flag Cnt', 'Down/Up Ratio', 
	'Pkt Size Avg', 'Fwd Seg Size Avg', 'Bwd Seg Size Avg', 
	'Subflow Fwd Pkts', 'Subflow Fwd Byts', 'Subflow Bwd Pkts', 
	'Subflow Bwd Byts', 'Init Fwd Win Byts', 'Init Bwd Win Byts', 
	'Fwd Act Data Pkts', 'Fwd Seg Size Min', 'Active Mean', 'Active Std', 
	'Active Min', 'Idle Mean', 'Idle Std', 'Idle Max', 'Idle Min', 'Label']

In [None]:
#Replace column names with CIC-IDS-2018 names (unmapped names are kept as-is)
data.columns = [cols_USBIDS_to_CICIDS2018.get(name, name) for name in data.columns]

#Remove columns not used: collect them first and drop them in a single call
unused_cols = [name for name in data.columns if name not in final_order]
if unused_cols:
	print('\n'.join(f'Removing {name} column' for name in unused_cols))
	data = data.drop(columns=unused_cols)

#Check that order of columns is the same
# The report names the column actually found at each position (the previous
# version indexed the pre-drop name list and printed the wrong column), and a
# differing column count is reported as well.
remaining_cols = list(data.columns)
mismatches = [
	f'Column {position} is {actual} instead of {expected}'
	for position, (actual, expected) in enumerate(zip(remaining_cols, final_order), start=1)
	if actual != expected
]
if len(remaining_cols) != len(final_order):
	mismatches.append(f'Expected {len(final_order)} columns but found {len(remaining_cols)}')

if mismatches:
	print('\n'.join(mismatches))
else:
	print('Order of columns is good!')
	print('columns:', remaining_cols)

# Only names that are always bound are deleted, so this cannot raise NameError.
del unused_cols, remaining_cols, mismatches


Removing Flow ID column
Removing Src IP column
Removing Src Port column
Removing Dst IP column
Removing Dst Port column
Removing Protocol column
Removing Timestamp column
Removing Fwd PSH Flags column
Removing Bwd PSH Flags column
Removing Fwd URG Flags column
Removing Bwd URG Flags column
Removing ACK Flag Cnt column
Removing CWE Flag Count column
Removing Fwd Byts/b Avg column
Removing Fwd Pkts/b Avg column
Removing Fwd Blk Rate Avg column
Removing Bwd Byts/b Avg column
Removing Bwd Pkts/b Avg column
Removing Bwd Blk Rate Avg column
Removing Active Max column
Order of columns is good!
columns: ['Flow Duration', 'Tot Fwd Pkts', 'Tot Bwd Pkts', 'TotLen Fwd Pkts', 'TotLen Bwd Pkts', 'Fwd Pkt Len Max', 'Fwd Pkt Len Min', 'Fwd Pkt Len Mean', 'Fwd Pkt Len Std', 'Bwd Pkt Len Max', 'Bwd Pkt Len Min', 'Bwd Pkt Len Mean', 'Bwd Pkt Len Std', 'Flow Byts/s', 'Flow Pkts/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Tot', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max',

In [None]:
from scipy.stats import mstats

def winsorize_func(s):
	"""Return `s` winsorized with limits [0.05, 0.05] via `mstats.winsorize`."""
	lower_fraction, upper_fraction = 0.05, 0.05
	return mstats.winsorize(s, limits=[lower_fraction, upper_fraction])

In [None]:
# Compare the first row before and after column-wise winsorization.
# `DataFrame.apply` returns a new frame; the previous version discarded it and
# only the final `data.iloc[0]` (the unchanged row) was ever displayed.
first_row_before = data.iloc[0]
data_winsorized = data.apply(winsorize_func)
display(first_row_before, data_winsorized.iloc[0])

In [8]:
# Paths of the USB-IDS capture files, relative to the working directory.
# os.path.join picks the platform's separator. The previous literals such as
# 'USB-IDS\Hulk-Evasive.csv' relied on '\H', '\S', '\T' and '\R' being invalid
# escape sequences that Python passes through with a warning, and they only
# resolved as paths on Windows.
files = [
    os.path.join('USB-IDS', csv_name)
    for csv_name in (
        'Hulk-Evasive.csv',
        'Hulk-NoDefense.csv',
        'Hulk-Reqtimeout.csv',
        'Hulk-Security2.csv',
        'Slowhttptest-Evasive.csv',
        'Slowhttptest-NoDefense.csv',
        'Slowhttptest-Reqtimeout.csv',
        'Slowhttptest-Security2.csv',
        'Slowloris-Evasive.csv',
        'Slowloris-NoDefense.csv',
        'Slowloris-Reqtimeout.csv',
        'Slowloris-Security2.csv',
        'TCPFlood-Evasive.csv',
        'TCPFlood-NoDefense.csv',
        'TCPFlood-Reqtimeout.csv',
        'TCPFlood-Security2.csv',
        'REGULAR.csv',
    )
]

In [9]:
# For every capture file, count per label the rows that contain a missing or
# infinite value.
for f in files:
    df = pd.read_csv(f)
    print(f)
    for label in df['Label'].unique():
        print(f'\t{label}')
        dfl = df[df['Label'] == label]
        # Convert +/-inf to NaN locally instead of switching on the global
        # `pd.options.mode.use_inf_as_na` option, which is deprecated and
        # would change pandas behaviour for every later cell.
        bad_rows = dfl.replace([np.inf, -np.inf], np.nan).isnull().any(axis=1)
        print(f'\t\tBad vals: {int(bad_rows.sum())}')
    print('')

USB-IDS\Hulk-Evasive.csv
	Hulk-Evasive
		Bad vals: 0
	BENIGN
		Bad vals: 3

USB-IDS\Hulk-NoDefense.csv
	Hulk-NoDefense
		Bad vals: 0
	BENIGN
		Bad vals: 3

USB-IDS\Hulk-Reqtimeout.csv
	Hulk-Reqtimeout
		Bad vals: 0
	BENIGN
		Bad vals: 2

USB-IDS\Hulk-Security2.csv
	Hulk-Security2
		Bad vals: 0
	BENIGN
		Bad vals: 2

USB-IDS\Slowhttptest-Evasive.csv
	BENIGN
		Bad vals: 3
	Slowhttptest-Evasive
		Bad vals: 0

USB-IDS\Slowhttptest-NoDefense.csv
	BENIGN
		Bad vals: 2
	Slowhttptest-NoDefense
		Bad vals: 0

USB-IDS\Slowhttptest-Reqtimeout.csv
	BENIGN
		Bad vals: 0
	Slowhttptest-Reqtimeout
		Bad vals: 0

USB-IDS\Slowhttptest-Security2.csv
	BENIGN
		Bad vals: 1
	Slowhttptest-Security2
		Bad vals: 0

USB-IDS\Slowloris-Evasive.csv
	BENIGN
		Bad vals: 1
	Slowloris-Evasive
		Bad vals: 0

USB-IDS\Slowloris-NoDefense.csv
	BENIGN
		Bad vals: 0
	Slowloris-NoDefense
		Bad vals: 0

USB-IDS\Slowloris-Reqtimeout.csv
	BENIGN
		Bad vals: 0
	Slowloris-Reqtimeout
		Bad vals: 0

USB-IDS\Slowloris-Security2.csv
