In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, Normalizer, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA

In [2]:
# Load the CIC-IDS-2017 sample CSV (path relative to the notebook's working directory).
df = pd.read_csv(filepath_or_buffer="data/cic-ids-2017-sample.csv")

In [3]:
# Preview the first five rows to sanity-check the column layout.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,destination_port,flow_duration,total_fwd_packets,total_backward_packets,total_length_of_fwd_packets,total_length_of_bwd_packets,fwd_packet_length_max,fwd_packet_length_min,fwd_packet_length_mean,...,min_seg_size_forward,active_mean,active_std,active_max,active_min,idle_mean,idle_std,idle_max,idle_min,label
0,0,80,67429791,9,7,384,11595,384,0,42.666667,...,32,2047.0,0.0,2047,2047,67300000.0,0.0,67300000,67300000,DoS_Hulk
1,1,53510,2653,2,1,0,0,0,0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
2,2,443,145276,1,1,0,0,0,0,0.0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
3,3,54230,146,1,1,0,0,0,0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
4,4,1218,79,1,1,2,6,2,2,2.0,...,24,0.0,0.0,0,0,0.0,0.0,0,0,PortScan


**Label Encoding**

In [4]:
# Replace the string class names in the `label` column with integer codes.
# NOTE: this overwrites df['label'] in place, so re-running this cell alone
# would re-fit the encoder on integers; re-run the load cell first.
le = LabelEncoder()
df['label'] = le.fit_transform(df['label'])

# Code -> class-name lookup, derived from the fitted encoder so it always
# covers every class present in the data (no hard-coded class count).
encoded_labels = dict(enumerate(le.classes_))

encoded_labels

{0: 'BENIGN',
 1: 'Bot',
 2: 'DDoS',
 3: 'DoS_GoldenEye',
 4: 'DoS_Hulk',
 5: 'DoS_Slowhttptest',
 6: 'DoS_slowloris',
 7: 'FTPPatator',
 8: 'Heartbleed',
 9: 'Infiltration',
 10: 'PortScan',
 11: 'SSHPatator',
 12: 'Web_Attack_Brute_Force',
 13: 'Web_Attack_Sql_Injection',
 14: 'Web_Attack_XSS'}

**Train Test Split**

In [5]:
# Build the feature matrix by name rather than by position.
# The CSV carries leftover index columns ("Unnamed: 0.1", "Unnamed: 0") that are
# just row counters; they must not be fed to the scaler/PCA as features.
index_artifacts = [col for col in df.columns if str(col).startswith("Unnamed")]
features = df.drop(columns=index_artifacts + ["label"])

# Hold out 1/7 of the rows for testing; fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(features,
                                                    df['label'],
                                                    test_size=1/7.0,
                                                    random_state=0)

**Data standardization**

In [6]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to both splits so no test-set statistics are used.
ss = StandardScaler()
ss.fit(x_train)

x_train, x_test = ss.transform(x_train), ss.transform(x_test)

**PCA**

In [7]:
# Fit PCA on the training split with n_components=0.99 (a fractional value
# selects components by explained-variance ratio), then project both splits
# with the same fitted model.
pca = PCA(n_components=0.99)
pca.fit(x_train)

x_train, x_test = pca.transform(x_train), pca.transform(x_test)

**Normalizing**

In [8]:
# Apply the Normalizer (default settings) to both splits using the same
# instance that was fitted on the training data.
norm = Normalizer()
norm.fit(x_train)

x_train, x_test = norm.transform(x_train), norm.transform(x_test)

**Reshaping labels**

In [9]:
# Turn each label Series into an (n_samples, 1) column array.
y_train, y_test = (labels.values.reshape(-1, 1) for labels in (y_train, y_test))

**Saving**

In [10]:
# Persist the preprocessed splits as .npy files under data/.
# Training split
np.save(file="data/x_train.npy", arr=x_train)
np.save(file="data/y_train.npy", arr=y_train)
# Test split
np.save(file="data/x_test.npy", arr=x_test)
np.save(file="data/y_test.npy", arr=y_test)