In [1]:
import pandas as pd
from pathlib import Path

**Reading data**

<strong style="color: red">Important!</strong>

To run this notebook you have to download the [CIC-IDS-2017 dataset](https://www.kaggle.com/asthana12/cicids2017) and point the path in the cell below at its `MachineLearningCSV` directory; every CSV file found there is then read with `pd.read_csv`.

In [3]:
# Collect every CSV below the dataset directory. The lazy rglob() generator is
# materialised into a sorted list so the files load in a reproducible order and
# cells that iterate over `files` can be re-run without getting an empty result.
files = sorted(Path("./MachineLearningCSV/").rglob("*.csv"))

In [4]:
# Parse each discovered CSV into its own DataFrame, keeping the order of `files`.
df = list(map(pd.read_csv, files))

In [5]:
# Stack the per-file frames into one DataFrame. ignore_index=True gives the
# result a fresh 0..n-1 index; without it every file's own row numbers are kept
# and the combined index contains duplicate labels.
df = pd.concat(df, ignore_index=True)

Renaming columns: stripping surrounding whitespace, replacing spaces with underscores, and switching to lowercase.

In [6]:
# Normalise each column name: trim surrounding whitespace, swap inner spaces
# for underscores, then lowercase the result.
df.columns = df.columns.map(lambda name: name.strip().replace(" ", "_").lower())

**Renaming labels**

In [7]:
import re

label_names = df['label'].unique()

# Build a readable identifier for every raw label, keeping the order returned
# by unique():
#   1. remove every character that is not an ASCII letter or a space,
#   2. turn each run of whitespace into a single underscore.
# Raw-string patterns avoid the invalid "\s" escape of a plain string literal,
# and "\s+" collapses runs of any length (replacing "__" once leaves a doubled
# underscore behind whenever three or more spaces were adjacent).
label_names = [
    re.sub(r"\s+", "_", re.sub(r"[^a-zA-Z ]+", "", raw_label))
    for raw_label in label_names
]

label_names, len(label_names)

(['BENIGN',
  'DDoS',
  'PortScan',
  'Bot',
  'Infiltration',
  'Web_Attack_Brute_Force',
  'Web_Attack_XSS',
  'Web_Attack_Sql_Injection',
  'FTPPatator',
  'SSHPatator',
  'DoS_slowloris',
  'DoS_Slowhttptest',
  'DoS_Hulk',
  'DoS_GoldenEye',
  'Heartbleed'],
 15)

In [8]:
# Replace the raw 'label' column values with the readable names.
#
# `label_names` is expected to hold one cleaned name per entry of
# `df['label'].unique()`, in the same order, so zipping the two pairs every raw
# value with its replacement.

labels = df['label'].unique()

# Only pairs that actually change are kept. Applying the whole mapping in a
# single replace() call scans the column once instead of once per label, and a
# cleaned name can never be rewritten again by a later pair.
label_mapping = {
    raw: clean for raw, clean in zip(labels, label_names) if raw != clean
}

# An empty mapping means the labels are already clean (e.g. the cell was re-run).
if label_mapping:
    df['label'] = df['label'].replace(label_mapping)

In [9]:
# Distinct values of the label column after renaming.
df['label'].unique()

array(['BENIGN', 'DDoS', 'PortScan', 'Bot', 'Infiltration',
       'Web_Attack_Brute_Force', 'Web_Attack_XSS',
       'Web_Attack_Sql_Injection', 'FTPPatator', 'SSHPatator',
       'DoS_slowloris', 'DoS_Slowhttptest', 'DoS_Hulk', 'DoS_GoldenEye',
       'Heartbleed'], dtype=object)

**Removing NULL values**

In [10]:
# Does the frame contain at least one missing value anywhere?
df.isna().to_numpy().any()

True

In [11]:
# Names of the columns that hold at least one missing value.
df.columns[df.isna().any()].tolist()

['flow_bytes/s']

In [12]:
# Number of missing entries in the 'flow_bytes/s' column.
df['flow_bytes/s'].isna().sum()

1358

In [13]:
# Share of rows (in percent) whose 'flow_bytes/s' value is missing. The count is
# taken from the data rather than typed in by hand, so the figure stays correct
# if the input files change.
missing_rows = int(df['flow_bytes/s'].isna().sum())
(missing_rows / df.shape[0]) * 100

0.04797327062188267

In [14]:
# Remove every row that contains a missing value and show how many rows that
# cost: the shape is captured on both sides of the drop.
before = df.shape
df = df.dropna(axis=0, how="any")
after = df.shape

before[0] - after[0]

1358

In [15]:
# Confirm that no missing value is left anywhere in the frame.
df.isna().to_numpy().any()

False

**Removing infinite values**

In [16]:
import numpy as np

In [21]:
# Are all values in every column except the last one finite?
np.isfinite(df.iloc[:, :-1]).all(axis=None)

False

In [22]:
# Collect the names of the columns (last column excluded) that contain at
# least one non-finite value.

nonfinite = [
    name
    for name, values in df.iloc[:, :-1].items()
    if not np.isfinite(values).all()
]

nonfinite

['flow_bytes/s', 'flow_packets/s']

In [24]:
# Turn +inf / -inf into NaN in every column except the last one, so the
# infinite entries can afterwards be handled like ordinary missing values.
df.iloc[:, :-1] = df.iloc[:, :-1].replace(to_replace=[np.inf, -np.inf], value=np.nan)

# Did the replacement leave any NaN behind in those columns?
np.isnan(df.iloc[:, :-1]).any(axis=None)

True

In [26]:
# Drop the rows that contain NaN and report the frame shape before and after,
# plus the number of rows removed.
before = df.shape
print("Before drop", before)

df = df.dropna(axis=0, how="any")

print("After drop", df.shape)
print("The difference", before[0] - len(df))

# The temporary shape is not needed beyond this cell.
del before

Before drop (2827876, 79)
After drop (2827876, 79)
The difference 0


In [27]:
# Re-check after the clean-up: every value outside the last column must be finite.
np.isfinite(df.iloc[:, :-1]).all(axis=None)

True

**Saving clean data**

In [None]:
# Persist the cleaned frame as CSV; the row index is not written to the file.
df.to_csv(path_or_buf="cic-ids-2017-clean.csv", index=False)