In [1]:
import os, platform

local = platform.system() == 'Windows'

if local:
	print('Running Locally')
	workspace_dir = './'
else:
	!pip install qqdm

	from google.colab import drive
	drive.mount('/content/drive/')

	workspace_dir = '/content/drive/MyDrive/School/2022_RIT/Masters/Workspace/'
	print('Running in Google Colab')

Running Locally


In [2]:
import random

from qqdm.notebook import qqdm as qqdm

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import pickle

import torch
from torch import nn
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, TensorDataset, Dataset)
from torch.optim import Adam

from scipy.stats import mstats

In [10]:
# Column-name translation table: header text as written here on the left
# (several entries start with a space, which must be kept for the lookup to
# match) -> the shorter name used everywhere else in this notebook.
# It is passed as `col_to_2018` to the read_and_convert_* helpers.
cols_2017_to_2018 = {
    ' Destination Port'            : 'Dst Port',
    ' Flow Duration'               : 'Flow Duration',
    ' Total Fwd Packets'           : 'Tot Fwd Pkts',
    ' Total Backward Packets'      : 'Tot Bwd Pkts',
    'Total Length of Fwd Packets'  : 'TotLen Fwd Pkts',
    ' Total Length of Bwd Packets' : 'TotLen Bwd Pkts',
    ' Fwd Packet Length Max'       : 'Fwd Pkt Len Max',
    ' Fwd Packet Length Min'       : 'Fwd Pkt Len Min',
    ' Fwd Packet Length Mean'      : 'Fwd Pkt Len Mean',
    ' Fwd Packet Length Std'       : 'Fwd Pkt Len Std',
    'Bwd Packet Length Max'        : 'Bwd Pkt Len Max',
    ' Bwd Packet Length Min'       : 'Bwd Pkt Len Min',
    ' Bwd Packet Length Mean'      : 'Bwd Pkt Len Mean',
    ' Bwd Packet Length Std'       : 'Bwd Pkt Len Std',
    'Flow Bytes/s'                 : 'Flow Byts/s',
    ' Flow Packets/s'              : 'Flow Pkts/s',
    ' Flow IAT Mean'               : 'Flow IAT Mean',
    ' Flow IAT Std'                : 'Flow IAT Std',
    ' Flow IAT Max'                : 'Flow IAT Max',
    ' Flow IAT Min'                : 'Flow IAT Min',
    'Fwd IAT Total'                : 'Fwd IAT Tot',
    ' Fwd IAT Mean'                : 'Fwd IAT Mean',
    ' Fwd IAT Std'                 : 'Fwd IAT Std',
    ' Fwd IAT Max'                 : 'Fwd IAT Max',
    ' Fwd IAT Min'                 : 'Fwd IAT Min',
    'Bwd IAT Total'                : 'Bwd IAT Tot',
    ' Bwd IAT Mean'                : 'Bwd IAT Mean',
    ' Bwd IAT Std'                 : 'Bwd IAT Std',
    ' Bwd IAT Max'                 : 'Bwd IAT Max',
    ' Bwd IAT Min'                 : 'Bwd IAT Min',
    'Fwd PSH Flags'                : 'Fwd PSH Flags',
    ' Bwd PSH Flags'               : 'Bwd PSH Flags',
    ' Fwd URG Flags'               : 'Fwd URG Flags',
    ' Bwd URG Flags'               : 'Bwd URG Flags',
    ' Fwd Header Length'           : 'Fwd Header Len',
    ' Bwd Header Length'           : 'Bwd Header Len',
    'Fwd Packets/s'                : 'Fwd Pkts/s',
    ' Bwd Packets/s'               : 'Bwd Pkts/s',
    ' Min Packet Length'           : 'Pkt Len Min',
    ' Max Packet Length'           : 'Pkt Len Max',
    ' Packet Length Mean'          : 'Pkt Len Mean',
    ' Packet Length Std'           : 'Pkt Len Std',
    ' Packet Length Variance'      : 'Pkt Len Var',
    'FIN Flag Count'               : 'FIN Flag Cnt',
    ' SYN Flag Count'              : 'SYN Flag Cnt',
    ' RST Flag Count'              : 'RST Flag Cnt',
    ' PSH Flag Count'              : 'PSH Flag Cnt',
    ' ACK Flag Count'              : 'ACK Flag Cnt',
    ' URG Flag Count'              : 'URG Flag Cnt',
    ' CWE Flag Count'              : 'CWE Flag Count',
    ' ECE Flag Count'              : 'ECE Flag Cnt',
    ' Down/Up Ratio'               : 'Down/Up Ratio',
    ' Average Packet Size'         : 'Pkt Size Avg',
    ' Avg Fwd Segment Size'        : 'Fwd Seg Size Avg',
    ' Avg Bwd Segment Size'        : 'Bwd Seg Size Avg',
    'Fwd Avg Bytes/Bulk'           : 'Fwd Byts/b Avg',
    ' Fwd Avg Packets/Bulk'        : 'Fwd Pkts/b Avg',
    ' Fwd Avg Bulk Rate'           : 'Fwd Blk Rate Avg',
    ' Bwd Avg Bytes/Bulk'          : 'Bwd Byts/b Avg',
    ' Bwd Avg Packets/Bulk'        : 'Bwd Pkts/b Avg',
    'Bwd Avg Bulk Rate'            : 'Bwd Blk Rate Avg',
    'Subflow Fwd Packets'          : 'Subflow Fwd Pkts',
    ' Subflow Fwd Bytes'           : 'Subflow Fwd Byts',
    ' Subflow Bwd Packets'         : 'Subflow Bwd Pkts',
    ' Subflow Bwd Bytes'           : 'Subflow Bwd Byts',
    'Init_Win_bytes_forward'       : 'Init Fwd Win Byts',
    ' Init_Win_bytes_backward'     : 'Init Bwd Win Byts',
    ' act_data_pkt_fwd'            : 'Fwd Act Data Pkts',
    ' min_seg_size_forward'        : 'Fwd Seg Size Min',
    'Active Mean'                  : 'Active Mean',
    ' Active Std'                  : 'Active Std',
    ' Active Max'                  : 'Active Max',
    ' Active Min'                  : 'Active Min',
    'Idle Mean'                    : 'Idle Mean',
    ' Idle Std'                    : 'Idle Std',
    ' Idle Max'                    : 'Idle Max',
    ' Idle Min'                    : 'Idle Min',
    ' Label'                       : 'Label'
}

# Inverse lookup of cols_2017_to_2018: short column name -> original header text.
cols_2018_to_2017 = {
    short_name: raw_header
    for raw_header, short_name in cols_2017_to_2018.items()
}

# Columns kept after renaming, in the order they are selected by the
# read_and_convert_* helpers (passed as `col_order`). 'Label' comes last;
# 'Protocol' is currently commented out and therefore not selected.
final_order = [
	# 'Protocol',
    'Flow Duration', 'Tot Fwd Pkts', 'Tot Bwd Pkts', 
	'TotLen Fwd Pkts', 'TotLen Bwd Pkts', 'Fwd Pkt Len Max', 'Fwd Pkt Len Min',
	'Fwd Pkt Len Mean', 'Fwd Pkt Len Std', 'Bwd Pkt Len Max', 'Bwd Pkt Len Min',
	'Bwd Pkt Len Mean', 'Bwd Pkt Len Std', 'Flow Byts/s', 'Flow Pkts/s',
	'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min',
	'Fwd IAT Tot', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min',
	'Bwd IAT Tot', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min',
	'Fwd PSH Flags', 'Bwd PSH Flags', 'Fwd URG Flags', 'Bwd URG Flags',
	'Fwd Header Len', 'Bwd Header Len', 'Fwd Pkts/s', 'Bwd Pkts/s',
	'Pkt Len Min', 'Pkt Len Max', 'Pkt Len Mean', 'Pkt Len Std', 'Pkt Len Var',
	'FIN Flag Cnt', 'SYN Flag Cnt', 'RST Flag Cnt', 'PSH Flag Cnt',
	'ACK Flag Cnt', 'URG Flag Cnt', 'CWE Flag Count', 'ECE Flag Cnt',
	'Down/Up Ratio', 'Pkt Size Avg', 'Fwd Seg Size Avg', 'Bwd Seg Size Avg',
	'Fwd Byts/b Avg', 'Fwd Pkts/b Avg', 'Fwd Blk Rate Avg', 'Bwd Byts/b Avg',
	'Bwd Pkts/b Avg', 'Bwd Blk Rate Avg', 'Subflow Fwd Pkts',
	'Subflow Fwd Byts', 'Subflow Bwd Pkts', 'Subflow Bwd Byts',
	'Init Fwd Win Byts', 'Init Bwd Win Byts', 'Fwd Act Data Pkts',
	'Fwd Seg Size Min', 'Active Mean', 'Active Std', 'Active Max', 'Active Min',
	'Idle Mean', 'Idle Std', 'Idle Max', 'Idle Min', 'Label'
]

In [29]:
# cols_USBIDS_to_CICIDS2018
def read_and_convert_USB_to_CIC2018(filename, col_to_2018, col_order):
    """Load a CSV, rename its columns and keep only the requested ones.

    Args:
        filename: Path handed to ``pd.read_csv``.
        col_to_2018: Mapping from a header found in the file to its new name.
            Headers missing from the mapping are reported on stdout and kept
            unchanged.
        col_order: Column names (after renaming) to select, in output order.

    Returns:
        A DataFrame holding exactly the columns in ``col_order``.

    Raises:
        KeyError: If a name in ``col_order`` is absent after renaming.
    """
    print('=====[Reading CSV file]=====')
    frame = pd.read_csv(filename)

    print('=====[Converting columns from USB IDS to CIC IDS 2018]=====')
    # Report the unmapped headers first, then rename everything in one pass.
    for header in frame.columns:
        if header not in col_to_2018:
            print(f'Column {header} not in 2018 map')
    frame.columns = [
        col_to_2018[header] if header in col_to_2018 else header
        for header in frame.columns
    ]

    print('=====[Reordering columns]=====')
    return frame[col_order]

def read_and_convert_2017_to_2018(filename, col_to_2018, col_order):
    """Load a CIC-IDS-2017 CSV, rename its columns and keep only the requested ones.

    Args:
        filename: Path handed to ``pd.read_csv``.
        col_to_2018: Mapping from a header found in the file to its new name.
            Headers missing from the mapping are reported on stdout and kept
            unchanged.
        col_order: Column names (after renaming) to select, in output order.

    Returns:
        A DataFrame holding exactly the columns in ``col_order``.

    Raises:
        KeyError: If a name in ``col_order`` is absent after renaming.
    """
    print('=====[Reading CSV file]=====')
    df = pd.read_csv(filename)

    # The progress message used to say "USB IDS" (copied from the sibling
    # helper); this function handles the 2017 files.
    print('=====[Converting columns from CIC IDS 2017 to CIC IDS 2018]=====')
    cols = []
    for col in df.columns:
        if col in col_to_2018:
            cols.append(col_to_2018[col])
        else:
            # Keep unmapped headers as they are; they are dropped below unless
            # col_order asks for them.
            print(f'Column {col} not in 2018 map')
            cols.append(col)
    df.columns = cols

    print('=====[Reordering columns]=====')
    return df[col_order]


def print_col_stats(df, col):
    """Print the minimum, mean and maximum of ``df[col]`` to stdout."""
    print(f"Column '{col}' stats:")
    series = df[col]
    # Each statistic is computed right before it is printed.
    for caption, compute in (
        ('min:  ', series.min),
        ('mean: ', series.mean),
        ('max:  ', series.max),
    ):
        print(f'\t{caption}{compute()}')

def new_normalization(df: pd.DataFrame, cols_to_normalize: list, debug=False):
    """Winsorize and min-max scale the given columns of ``df`` in place.

    For each column the lowest and highest 5% of values are clipped with
    ``mstats.winsorize``; the result is then shifted and divided so that it
    spans [0, 1]. A column whose clipped values are all equal is reported and
    divided by 1 instead, which leaves it at 0.

    Args:
        df: Frame to modify; the listed columns are overwritten.
        cols_to_normalize: Names of the columns to process.
        debug: When true, print column statistics before and after each step.

    Returns:
        The same ``df`` object that was passed in.
    """
    print('=====[Performing winsorization and normalization]=====')
    for name in cols_to_normalize:
        if debug:
            print_col_stats(df, name)
            print(f"Winsorizing column '{name}'")

        df[name] = mstats.winsorize(df[name], limits=[0.05, 0.05])

        if debug:
            print_col_stats(df, name)
            print('Min Max Scaling to [0,1]')

        lowest = df[name].min()
        span = df[name].max() - lowest
        if span == 0.0:
            # Constant column: avoid dividing by zero.
            print(f"Zero width column: '{name}'")
            span = 1
        df[name] = (df[name] - lowest) / span

        if debug:
            print_col_stats(df, name)
            print('')

    return df

def freq_encode_cols(df: pd.DataFrame, cols_to_encode: list, debug=False):
    """Replace each value in the given columns by its relative frequency.

    Args:
        df: Input frame; it is not modified.
        cols_to_encode: Names of the columns to frequency-encode.
        debug: When true, print the value counts, the replacement table and the
            resulting unique values for every column.

    Returns:
        A new DataFrame in which every listed column holds, for each row, the
        fraction of rows sharing that row's original value. Missing values
        stay missing. With an empty ``cols_to_encode`` an unchanged copy is
        returned.
    """
    print('=====[Performing frequency encoding]=====')
    # Work on a copy so the caller's frame is left untouched.
    df = df.copy()
    for col in cols_to_encode:
        if debug: print(f'Frequency encoding column {col}\n')
        val_counts = df[col].value_counts(normalize=True)
        if debug: print('Value counts:\n', val_counts, '\n')

        if debug: print('Replace Dict:', val_counts.to_dict())
        # A direct value -> frequency lookup; unlike DataFrame.replace it cannot
        # be confused when a frequency happens to equal another original value.
        df[col] = df[col].map(val_counts)

        if debug:
            unique_values = df[col].unique()
            print('Unique Values:', unique_values, '\n')

    # Return only after every column is processed (the return used to sit
    # inside the loop, so only the first column was ever encoded).
    return df



def normalize_CSV(df: pd.DataFrame):
    """Print the column dtypes, normalize the feature columns and audit the rest.

    The feature columns listed below are passed to ``new_normalization`` (with
    debug output off). Afterwards every column of the result that is neither a
    listed feature nor 'Label' is reported on stdout.

    Dropping identifier columns, 'Dst Port' cleaning and frequency encoding of
    'Protocol' are not performed here.

    Args:
        df: Frame with the renamed feature columns and a 'Label' column.

    Returns:
        The frame returned by ``new_normalization``.
    """
    print('Column types:')
    print(df.dtypes)

    cols_to_normalize = [
        'Flow Duration', 'Tot Fwd Pkts', 'Tot Bwd Pkts', 'TotLen Fwd Pkts',
        'TotLen Bwd Pkts', 'Fwd Pkt Len Max', 'Fwd Pkt Len Min',
        'Fwd Pkt Len Mean', 'Fwd Pkt Len Std', 'Bwd Pkt Len Max',
        'Bwd Pkt Len Min', 'Bwd Pkt Len Mean', 'Bwd Pkt Len Std', 'Flow Byts/s',
        'Flow Pkts/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max',
        'Flow IAT Min', 'Fwd IAT Tot', 'Fwd IAT Mean', 'Fwd IAT Std',
        'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Tot', 'Bwd IAT Mean',
        'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags',
        'Bwd PSH Flags', 'Fwd URG Flags', 'Bwd URG Flags', 'Fwd Header Len',
        'Bwd Header Len', 'Fwd Pkts/s', 'Bwd Pkts/s', 'Pkt Len Min',
        'Pkt Len Max', 'Pkt Len Mean', 'Pkt Len Std', 'Pkt Len Var',
        'FIN Flag Cnt', 'SYN Flag Cnt', 'RST Flag Cnt', 'PSH Flag Cnt',
        'ACK Flag Cnt', 'URG Flag Cnt', 'CWE Flag Count', 'ECE Flag Cnt',
        'Down/Up Ratio', 'Pkt Size Avg', 'Fwd Seg Size Avg',
        'Bwd Seg Size Avg', 'Fwd Byts/b Avg', 'Fwd Pkts/b Avg',
        'Fwd Blk Rate Avg', 'Bwd Byts/b Avg', 'Bwd Pkts/b Avg',
        'Bwd Blk Rate Avg', 'Subflow Fwd Pkts', 'Subflow Fwd Byts',
        'Subflow Bwd Pkts', 'Subflow Bwd Byts', 'Init Fwd Win Byts',
        'Init Bwd Win Byts', 'Fwd Act Data Pkts', 'Fwd Seg Size Min',
        'Active Mean', 'Active Std', 'Active Max', 'Active Min', 'Idle Mean',
        'Idle Std', 'Idle Max', 'Idle Min'
    ]
    df = new_normalization(df, cols_to_normalize, False)

    # Audit: anything that is neither a normalized feature nor the label.
    accounted_for = set(cols_to_normalize) | {'Label'}
    leftovers = [name for name in df.columns if name not in accounted_for]
    for name in leftovers:
        print(f'Column {name} is unaccounted for in conversion!')
    if leftovers:
        print('Some columns were not accounted for!')
    else:
        print('All columns are accounted for!')

    return df

In [38]:
# File to process; `filen` is read again by the class-split cell further down
# to build the output file names.
filen = 'Wednesday-workingHours.csv'
#df = pd.read_csv(workspace_dir + 'CIC-IDS-2017/' + filen)
# Load the CSV, rename its columns via cols_2017_to_2018 and keep the columns
# listed in final_order.
data = read_and_convert_2017_to_2018(workspace_dir + 'CIC-IDS-2017/' + filen, cols_2017_to_2018, final_order)
# Winsorize and min-max scale the feature columns; `data` is the frame the
# class-split cell writes out.
data = normalize_CSV(data)

=====[Reading CSV file]=====
=====[Converting columns from USB IDS to CIC IDS 2018]=====
Column  Fwd Header Length.1 not in 2018 map
=====[Reordering columns]=====
Column types:
Flow Duration        int64
Tot Fwd Pkts         int64
Tot Bwd Pkts         int64
TotLen Fwd Pkts      int64
TotLen Bwd Pkts      int64
                    ...   
Idle Mean          float64
Idle Std           float64
Idle Max             int64
Idle Min             int64
Label               object
Length: 77, dtype: object
=====[Performing winsorization and normalization]=====
Zero width column: 'Fwd PSH Flags'
Zero width column: 'Bwd PSH Flags'
Zero width column: 'Fwd URG Flags'
Zero width column: 'Bwd URG Flags'
Zero width column: 'SYN Flag Cnt'
Zero width column: 'RST Flag Cnt'
Zero width column: 'CWE Flag Count'
Zero width column: 'ECE Flag Cnt'
Zero width column: 'Fwd Byts/b Avg'
Zero width column: 'Fwd Pkts/b Avg'
Zero width column: 'Fwd Blk Rate Avg'
Zero width column: 'Bwd Byts/b Avg'
Zero width column: '

In [39]:
print(data['Label'].value_counts())

# Write one CSV per label value into <workspace>/CIC-IDS-2017/class_split/.
# On Colab that is
# /content/drive/MyDrive/School/2022_RIT/Masters/Workspace/CIC-IDS-2017/class_split
split_dir = workspace_dir+'CIC-IDS-2017/class_split/'
# to_csv does not create missing folders, so make sure the target exists.
os.makedirs(split_dir, exist_ok=True)
# Drop the extension properly instead of assuming it is four characters long.
file_stem = os.path.splitext(filen)[0]

for label in data['Label'].unique():
    split_filename = f'{split_dir}{file_stem}-{label}.csv'
    print(f'\nSaving {label} class to\n{split_filename}')
    subdata = data[data['Label'] == label]
    subdata.to_csv(split_filename, index=False)

BENIGN              440031
DoS Hulk            231073
DoS GoldenEye        10293
DoS slowloris         5796
DoS Slowhttptest      5499
Heartbleed              11
Name: Label, dtype: int64

Saving BENIGN class to
/content/drive/MyDrive/School/2022_RIT/Masters/Workspace/CIC-IDS-2017/class_split/Wednesday-workingHours-BENIGN.csv

Saving DoS slowloris class to
/content/drive/MyDrive/School/2022_RIT/Masters/Workspace/CIC-IDS-2017/class_split/Wednesday-workingHours-DoS slowloris.csv

Saving DoS Slowhttptest class to
/content/drive/MyDrive/School/2022_RIT/Masters/Workspace/CIC-IDS-2017/class_split/Wednesday-workingHours-DoS Slowhttptest.csv

Saving DoS Hulk class to
/content/drive/MyDrive/School/2022_RIT/Masters/Workspace/CIC-IDS-2017/class_split/Wednesday-workingHours-DoS Hulk.csv

Saving DoS GoldenEye class to
/content/drive/MyDrive/School/2022_RIT/Masters/Workspace/CIC-IDS-2017/class_split/Wednesday-workingHours-DoS GoldenEye.csv

Saving Heartbleed class to
/content/drive/MyDrive/School/20

In [3]:
# CSV files scanned by the loop in the next cell, which reads each one with
# pd.read_csv and counts rows containing missing or infinite values per label.
# NOTE(review): these paths are relative to the current working directory and
# do not use `workspace_dir` the way the loading cell does — confirm this is
# intended when running outside the local setup.
files = [
    'CIC-IDS-2017/Friday-WorkingHours-Morning.csv',
    'CIC-IDS-2017/Monday-WorkingHours.csv',
    'CIC-IDS-2017/Thursday-WorkingHours-Afternoon-Infilteration.csv',
    'CIC-IDS-2017/Thursday-WorkingHours-Morning-WebAttacks.csv',
    'CIC-IDS-2017/Tuesday-WorkingHours.csv',
    'CIC-IDS-2017/Wednesday-workingHours.csv'
]

In [6]:
# For every file, report per label how many rows contain a missing or an
# infinite value.
for f in files:
    df = pd.read_csv(f)

    # Build the "bad row" mask once per file. Infinities are detected
    # explicitly instead of switching on the global (and deprecated)
    # pd.options.mode.use_inf_as_na, which would silently change how every
    # later cell treats inf values.
    numeric_part = df.select_dtypes(include='number')
    bad_row = df.isnull().any(axis=1) | np.isinf(numeric_part).any(axis=1)

    for label in df[' Label'].unique():
        print(f)
        print(f'\t{label}')
        bad_count = int(bad_row[df[' Label'] == label].sum())
        print(f'\t\tBad vals: {bad_count}')
    print('')

CIC-IDS-2017/Friday-WorkingHours-Morning.csv
	BENIGN
		Bad vals: 112
CIC-IDS-2017/Friday-WorkingHours-Morning.csv
	Bot
		Bad vals: 10

CIC-IDS-2017/Monday-WorkingHours.csv
	BENIGN
		Bad vals: 437

CIC-IDS-2017/Thursday-WorkingHours-Afternoon-Infilteration.csv
	BENIGN
		Bad vals: 207
CIC-IDS-2017/Thursday-WorkingHours-Afternoon-Infilteration.csv
	Infiltration
		Bad vals: 0

CIC-IDS-2017/Thursday-WorkingHours-Morning-WebAttacks.csv
	BENIGN
		Bad vals: 135
CIC-IDS-2017/Thursday-WorkingHours-Morning-WebAttacks.csv
	Web Attack � Brute Force
		Bad vals: 0
CIC-IDS-2017/Thursday-WorkingHours-Morning-WebAttacks.csv
	Web Attack � XSS
		Bad vals: 0
CIC-IDS-2017/Thursday-WorkingHours-Morning-WebAttacks.csv
	Web Attack � Sql Injection
		Bad vals: 0

CIC-IDS-2017/Tuesday-WorkingHours.csv
	BENIGN
		Bad vals: 261
CIC-IDS-2017/Tuesday-WorkingHours.csv
	FTP-Patator
		Bad vals: 3
CIC-IDS-2017/Tuesday-WorkingHours.csv
	SSH-Patator
		Bad vals: 0

CIC-IDS-2017/Wednesday-workingHours.csv
	BENIGN
		Bad vals: 