In [3]:
import os
import numpy as np
import matplotlib.pyplot as plt

#make plots inline using jupyter magic
%matplotlib inline

import pandas as pd
from pandas.plotting import scatter_matrix
from sklearn import datasets, linear_model, metrics

# Imports from Chapter 4 Lab
import matplotlib as mpl

import sklearn.linear_model as skl_lm
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

from IPython.display import Markdown as md  #enable markdown within code cell
from IPython.display import display, Math, Latex

### Utility Functions

In [21]:
def get_data_path(data_name="none",
                  data_file_name="project_dataset.csv"):
    """Build the path to a file inside the project's ``data`` directory.

    The project root is taken to be the parent of the first directory in the
    current working directory whose name is exactly ``code``. If the working
    directory has no such component, the working directory itself is used as
    the root.

    :param data_name: unused; kept so existing callers keep working
    :param data_file_name: file name appended to the ``data`` directory
    :return: ``<project root>/data/<data_file_name>``; the path is only
             assembled, its existence is not checked
    """
    cwd = os.getcwd()

    # Match whole path components only. A plain substring split on "code"
    # would also cut inside directory names such as "vscode" or "encoder".
    components = cwd.split(os.sep)
    if "code" in components:
        code_index = components.index("code")
        # Keep the trailing separator so that a match directly under the
        # filesystem root ("/code/...") still yields an absolute path.
        project_root = os.sep.join(components[:code_index]) + os.sep
    else:
        project_root = cwd

    return os.path.join(project_root, "data", data_file_name)
def get_dataFrame_sample(subset_size, df, seed):
    """Draw ``subset_size`` rows from ``df`` at random, with replacement.

    Because rows are drawn with replacement, the same row can appear more
    than once in the result.

    :param subset_size: number of rows to draw
    :param df: DataFrame to sample from
    :param seed: forwarded as ``random_state`` so the draw is repeatable
    :return: the rows returned by ``df.sample``
    """
    sampling_options = {
        "n": subset_size,
        "replace": True,
        "random_state": seed,
    }
    return df.sample(**sampling_options)

def get_dataFrame_subset(subset_size, df, seed):
    """Pick ``subset_size`` rows from ``df`` at random.

    Uses ``df.sample`` with its default settings apart from the row count
    and the seed, i.e. rows are drawn without replacement.

    :param subset_size: number of rows to pick
    :param df: DataFrame to sample from
    :param seed: forwarded as ``random_state`` so the pick is repeatable
    :return: the rows returned by ``df.sample``
    """
    subset = df.sample(random_state=seed, n=subset_size)
    return subset

### Pre-processing

In [19]:
# Columns to load, spelled exactly as they are passed to read_csv. Several
# names carry a leading space; the later class-split cell indexes the frame
# with ' Label', so the names must stay unmodified here.
selected_columns = [
    ' Flow Duration',
    ' Total Fwd Packets',
    ' Total Backward Packets',
    'Flow Bytes/s',
    ' Label',
]

file_name = get_data_path()
original_df = pd.read_csv(
    file_name,
    sep=',',
    encoding='utf-8',
    usecols=selected_columns,
)
# NOTE: no whitespace stripping is applied to the column names.
# NOTE(review): the saved describe() output lists only three columns, so
# 'Flow Bytes/s' is presumably not parsed as a numeric dtype - confirm.

display(original_df.head(), original_df.describe())

Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets,Flow Bytes/s,Label
0,113095465,48,24,174.0122825,BENIGN
1,113473706,68,40,212.2253767,BENIGN
2,119945515,150,0,0.0,BENIGN
3,60261928,9,7,108.7087688,BENIGN
4,269,2,2,1576208.178,BENIGN


Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets
count,170366.0,170366.0,170366.0
mean,12463540.0,15.12462,18.022276
std,31938520.0,1123.107756,1494.492871
min,-1.0,1.0,0.0
25%,192.0,1.0,1.0
50%,31412.0,2.0,2.0
75%,816981.8,4.0,2.0
max,120000000.0,200755.0,270686.0


### Data Distribution

In [24]:
benign = 'BENIGN'

# One boolean mask drives both class splits.
is_benign = original_df[' Label'] == benign
benign_df = original_df[is_benign]

# Every row whose label is anything other than BENIGN counts as malicious.
malicious_df = original_df[~is_benign]
display(malicious_df.head(), malicious_df.describe())

# Oversample the malicious class with replacement up to 10000 rows
# (the saved describe() output reports 2180 malicious rows before resampling).
malicious_df_resampled = get_dataFrame_sample(10000, malicious_df, 1)
display(malicious_df_resampled.head(), malicious_df_resampled.describe())

# Downsample the benign class to 40000 rows.
benign_df_reduced = get_dataFrame_subset(40000, benign_df, 1)
display(benign_df_reduced.head(), benign_df_reduced.describe())

Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets,Flow Bytes/s,Label
12637,5185118,7,7,644.7297824,Web Attack � Brute Force
12643,5057374,10,7,2099.113097,Web Attack � Brute Force
12700,81,1,1,0.0,Web Attack � Brute Force
12712,5271123,7,5,455.1212332,Web Attack � Brute Force
12791,5020638,7,4,478.6244298,Web Attack � Brute Force


Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets
count,2180.0,2180.0,2180.0
mean,6533719.0,10.945413,5.281193
std,7492580.0,39.124039,20.156303
min,4.0,1.0,0.0
25%,5177226.0,3.0,1.0
50%,5476074.0,3.0,1.0
75%,5783897.0,3.0,1.0
max,70203060.0,212.0,114.0


Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets,Flow Bytes/s,Label
49609,5230919,3,1,0.0,Web Attack � Brute Force
24504,5424045,3,1,0.0,Web Attack � Brute Force
50247,5240908,3,1,0.0,Web Attack � Brute Force
45711,5197465,3,1,0.0,Web Attack � Brute Force
46770,34281644,203,106,3387.818857,Web Attack � Brute Force


Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets
count,10000.0,10000.0,10000.0
mean,6534806.0,10.9215,5.2724
std,7528974.0,39.086557,20.133408
min,4.0,1.0,0.0
25%,5174878.0,3.0,1.0
50%,5470453.0,3.0,1.0
75%,5785394.0,3.0,1.0
max,70203060.0,212.0,114.0


Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets,Flow Bytes/s,Label
163317,2,2,0,1040000000.0,BENIGN
9988,51367,1,1,233.61302,BENIGN
58239,30819,1,1,3893.701937,BENIGN
4940,5957135,3,1,2.014391146,BENIGN
163272,169,2,2,1408284.024,BENIGN


Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets
count,40000.0,40000.0,40000.0
mean,12537960.0,7.393225,7.79685
std,32080420.0,117.652881,174.804593
min,-1.0,1.0,0.0
25%,191.0,1.0,1.0
50%,31178.0,2.0,2.0
75%,485379.0,4.0,2.0
max,119999500.0,20710.0,31041.0
