This notebook samples the cleaned CIC-IDS-2017 dataset to produce the subsets used in the experiments.

In [1]:
import os
import warnings

import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, Normalizer, LabelEncoder
from sklearn.utils import shuffle, resample
from sklearn.utils.random import sample_without_replacement

from predictions import labels

warnings.filterwarnings("ignore")

In [2]:
# Load the cleaned dataset; every sample below is drawn from this frame.
CLEAN_CSV_PATH = "data/clean1.csv"
clean = pd.read_csv(CLEAN_CSV_PATH)

The `preproc_data` function implements the data preprocessing steps from the `preprocessing.ipynb` notebook as a single function.

In [3]:
def preproc_data(dataset, train_sample: float, pca_dim=31):
    """Run the full preprocessing pipeline on a labelled dataset.

    Steps, in order: label-encode the ``label`` column, split into train and
    test sets, standard-scale, project with PCA, and normalise each sample.
    The scaler and PCA are fitted on the training split only and then applied
    to both splits.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Feature columns plus a ``label`` column. Features are taken as
        ``dataset.iloc[:, :-1]``, so ``label`` is assumed to be the last
        column. The frame is not modified.
    train_sample : float
        Fraction of rows used for training; ``1 - train_sample`` is passed to
        ``train_test_split`` as ``test_size``.
    pca_dim : int, default 31
        Number of principal components kept by the PCA step.

    Returns
    -------
    x_train, x_test : numpy.ndarray
        Transformed feature matrices.
    y_train, y_test : numpy.ndarray
        Integer-encoded labels reshaped to column vectors ``(n, 1)``.
    """

    # Label encode into a separate array instead of writing back into
    # ``dataset``: the caller's frame keeps its original string labels, so
    # label-based filtering on it still works after this call.
    le = LabelEncoder()
    encoded_labels = le.fit_transform(dataset['label'])

    # Train test split (fixed random_state keeps the split reproducible)
    x_train, x_test, y_train, y_test = train_test_split(dataset.iloc[:, :-1],
                                                        encoded_labels,
                                                        test_size=1 - train_sample,
                                                        random_state=0)
    # Standard scaling (statistics come from the training split only)
    ss = StandardScaler().fit(x_train)

    x_train = ss.transform(x_train)
    x_test = ss.transform(x_test)

    # PCA: honour the requested dimensionality rather than a fixed 31
    pca = PCA(n_components=pca_dim).fit(x_train)

    x_train = pca.transform(x_train)
    x_test = pca.transform(x_test)

    # Normalization
    norm = Normalizer().fit(x_train)

    x_train = norm.transform(x_train)
    x_test = norm.transform(x_test)

    # Reshaping labels into column vectors
    y_train = y_train.reshape(-1, 1)
    y_test = y_test.reshape(-1, 1)

    return x_train, x_test, y_train, y_test

In [6]:
# Classes with fewer rows than this threshold are treated as "low member".
low_member_thresh = 2000

# Count rows per label once, then keep the labels that fall under the threshold.
label_counts = clean['label'].value_counts()
low_member_labels = label_counts[label_counts < low_member_thresh].index.tolist()
low_member_labels

['Bot',
 'Web_Attack_Brute_Force',
 'Web_Attack_XSS',
 'Infiltration',
 'Web_Attack_Sql_Injection',
 'Heartbleed']

### 10%

This sample contains a random 10% of the rows belonging to the larger classes. Classes with fewer than 2000 instances are kept in their entirety.

In [5]:
# Rows of the rare classes, kept whole. Exact membership is used instead of a
# joined-regex substring match, which would match every row for an empty label list.
low_member_data = clean[clean['label'].isin(low_member_labels)]

In [6]:
# Reproducible 10% random sample of the rows whose label is not a rare class
# (exact label match rather than regex substring match).
data = clean[~clean['label'].isin(low_member_labels)].sample(frac=0.1, random_state=1)

In [7]:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported equivalent.
data = pd.concat([data, low_member_data], ignore_index=True)

In [8]:
# Preprocess the 10% sample: 75% of its rows go to the training split.
x_train, x_test, y_train, y_test = preproc_data(
    data,
    train_sample=0.75,
    pca_dim=31,
)

In [9]:
# np.save does not create missing parent directories, so make sure the
# output folder exists before writing the four arrays.
os.makedirs("data/preserve10", exist_ok=True)
np.save("data/preserve10/x_train.npy", x_train)
np.save("data/preserve10/y_train.npy", y_train)
np.save("data/preserve10/x_test.npy", x_test)
np.save("data/preserve10/y_test.npy", y_test)

### 25%

In [12]:
# Rows of the rare classes, kept whole. Exact membership is used instead of a
# joined-regex substring match, which would match every row for an empty label list.
low_member_data = clean[clean['label'].isin(low_member_labels)]

In [13]:
# Reproducible 25% random sample of the rows whose label is not a rare class
# (exact label match rather than regex substring match).
data = clean[~clean['label'].isin(low_member_labels)].sample(frac=0.25, random_state=1)

In [14]:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported equivalent.
data = pd.concat([data, low_member_data], ignore_index=True)

In [15]:
# Preprocess the 25% sample: 75% of its rows go to the training split.
x_train, x_test, y_train, y_test = preproc_data(
    data,
    train_sample=0.75,
    pca_dim=31,
)

In [16]:
# np.save does not create missing parent directories, so make sure the
# output folder exists before writing the four arrays.
os.makedirs("data/preserve25", exist_ok=True)
np.save("data/preserve25/x_train.npy", x_train)
np.save("data/preserve25/y_train.npy", y_train)
np.save("data/preserve25/x_test.npy", x_test)
np.save("data/preserve25/y_test.npy", y_test)

### 50%

In [7]:
# Rows of the rare classes, kept whole. Exact membership is used instead of a
# joined-regex substring match, which would match every row for an empty label list.
low_member_data = clean[clean['label'].isin(low_member_labels)]

In [8]:
# Reproducible 50% random sample of the rows whose label is not a rare class
# (exact label match rather than regex substring match).
data = clean[~clean['label'].isin(low_member_labels)].sample(frac=0.5, random_state=1)

In [9]:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported equivalent.
data = pd.concat([data, low_member_data], ignore_index=True)

In [10]:
# Preprocess the 50% sample: 75% of its rows go to the training split.
x_train, x_test, y_train, y_test = preproc_data(
    data,
    train_sample=0.75,
    pca_dim=31,
)

In [11]:
# np.save does not create missing parent directories, so make sure the
# output folder exists before writing the four arrays.
os.makedirs("data/preserve50", exist_ok=True)
np.save("data/preserve50/x_train.npy", x_train)
np.save("data/preserve50/y_train.npy", y_train)
np.save("data/preserve50/x_test.npy", x_test)
np.save("data/preserve50/y_test.npy", y_test)

### 100%

In [5]:
# Preprocess the full cleaned dataset (no subsampling): 75% of rows go to training.
x_train, x_test, y_train, y_test = preproc_data(
    clean,
    train_sample=0.75,
    pca_dim=31,
)

In [6]:
# np.save does not create missing parent directories, so make sure the
# output folder exists before writing the four arrays.
os.makedirs("data/preserve100", exist_ok=True)
np.save("data/preserve100/x_train.npy", x_train)
np.save("data/preserve100/y_train.npy", y_train)
np.save("data/preserve100/x_test.npy", x_test)
np.save("data/preserve100/y_test.npy", y_test)