In [24]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Load the raw CSV (forcing 'SimillarHTTP' to be parsed as text) and strip
# surrounding whitespace from every column header.
ddos_data = pd.read_csv(
    "ddos_dataset.csv",
    sep=",",
    dtype={'SimillarHTTP': str},
).rename(columns=str.strip)
columns = ddos_data.columns

# Index every record by a composite flow identifier of the form
# "<source ip> - <source port> - <destination ip> - <destination port> - <timestamp>".
ddos_data["Flow ID"] = [
    ' - '.join(fields)
    for fields in zip(*(
        ddos_data[name].astype(str)
        for name in ('Source IP', 'Source Port', 'Destination IP', 'Destination Port', 'Timestamp')
    ))
]
ddos_data = ddos_data.set_index("Flow ID")

# Encode 'SimillarHTTP' as integer category codes: cast to text, trim
# whitespace, then map each distinct value to its categorical code.
ddos_data['SimillarHTTP'] = (
    ddos_data['SimillarHTTP']
    .astype(str)
    .str.strip()
    .astype('category')
    .cat.codes
    .to_numpy()  # assign positionally, as a plain array
)

# Discard the "Unnamed: 0" column (presumably a row index saved into the CSV).
ddos_data = ddos_data.drop(columns="Unnamed: 0")

# Discard numeric features whose variance is exactly zero (constant columns).
variance = ddos_data.var(numeric_only=True)
zero_variance = variance.loc[variance.eq(0)].index
ddos_data = ddos_data.drop(columns=zero_variance)

# Parse 'Timestamp' into datetimes and derive time-of-day features from it.
# assign() evaluates its keyword arguments in order, so the later callables
# see the already-converted 'Timestamp' column.
ddos_data = ddos_data.assign(
    Timestamp=lambda frame: pd.to_datetime(frame['Timestamp']),
    hours=lambda frame: frame['Timestamp'].dt.hour,
    minutes=lambda frame: frame['Timestamp'].dt.minute,
    seconds=lambda frame: frame['Timestamp'].dt.second,
    milliseconds=lambda frame: frame['Timestamp'].dt.microsecond // 1000,
)

# One-hot encode 'Protocol' into three 0/1 indicator columns.
# The values are IANA protocol numbers: 0 = HOPOPT, 6 = TCP, 17 = UDP.
# Rows carrying any other protocol number get 0 in all three indicators.
mask_hopopt = ddos_data['Protocol'] == 0
mask_tcp = ddos_data['Protocol'] == 6
mask_udp = ddos_data['Protocol'] == 17
ddos_data['Protocol 0'] = mask_hopopt.astype(int)
ddos_data['Protocol 6'] = mask_tcp.astype(int)
ddos_data['Protocol 17'] = mask_udp.astype(int)

# Remove the raw address/port/time columns and the source columns of the
# features derived from them.
ddos_data = ddos_data.drop(columns=[
    "Destination IP", "Source IP", "Source Port", "Destination Port",
    "Timestamp", "Protocol", "SimillarHTTP",
])

# Columns to discard according to the feature selection carried out in the
# first section.
columns_to_remove = [
    'Active Max', 'Active Min', 'Average Packet Size',
    'Avg Bwd Segment Size', 'Avg Fwd Segment Size',
    'Bwd IAT Mean', 'Bwd IAT Std',
    'Bwd Packet Length Mean', 'Bwd Packet Length Std',
    'Flow Duration', 'Flow IAT Max', 'Flow IAT Min', 'Flow IAT Std',
    'Flow Packets/s', 'Fwd Header Length',
    'Fwd IAT Max', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Total',
    'Fwd Packet Length Mean', 'Fwd Packet Length Min',
    'Idle Max', 'Idle Mean', 'Min Packet Length',
    'Packet Length Mean', 'Packet Length Std', 'Packet Length Variance',
    'Protocol 17', 'Protocol 6', 'RST Flag Count',
    'Subflow Bwd Bytes', 'Subflow Bwd Packets',
    'Subflow Fwd Bytes', 'Subflow Fwd Packets',
    'Total Backward Packets',
    'Total Length of Bwd Packets', 'Total Length of Fwd Packets',
]
ddos_data = ddos_data.drop(columns=columns_to_remove)

# Work on a copy whose 'label' column is replaced by integer category codes.
ddos_data_temp = ddos_data.assign(label=pd.Categorical(ddos_data['label']).codes)
y = ddos_data_temp[['label']]
x = ddos_data_temp.drop(columns=['label'])

# Stratified 70/30 train/test split with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size=0.7,
    stratify=y,
    random_state=15,
)
# Flatten the single-column label frames into 1-D arrays.
y_train = np.ravel(y_train)
y_test = np.ravel(y_test)

# Standardize the features; the scaler is fitted on the training split only
# and then applied to both splits.
scaler = StandardScaler().fit(x_train)
x_train_s = scaler.transform(x_train)  # array
x_test_s = scaler.transform(x_test)  # array

# Wrap the scaled arrays back into DataFrames. Only the column names are
# carried over, so both frames get a fresh default 0..n-1 index.
ddos_data_x_train_s = pd.DataFrame(x_train_s, columns=x_train.columns)  # dataframe
ddos_data_x_test_s = pd.DataFrame(x_test_s, columns=x_test.columns)  # dataframe

In [25]:
# Show the standardized training features as the cell's output.
ddos_data_x_train_s

Unnamed: 0,Total Fwd Packets,Fwd Packet Length Max,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Min,Flow Bytes/s,Flow IAT Mean,Fwd IAT Min,Bwd IAT Total,Bwd IAT Max,...,Active Mean,Active Std,Idle Std,Idle Min,Inbound,hours,minutes,seconds,milliseconds,Protocol 0
0,-0.030065,-0.383617,0.502891,-0.070483,-0.145037,-0.767028,-0.211432,-0.014757,-0.112688,-0.105584,...,-0.042842,-0.034210,-0.164200,-0.193809,0.295491,1.202745,-1.537192,0.995512,-0.905773,-0.030576
1,0.362115,-0.296502,-0.170995,-0.070483,-0.145037,-0.764784,-0.234199,-0.014753,-0.112688,-0.105584,...,-0.042842,-0.034210,-0.164200,-0.193809,0.295491,-1.234635,1.485123,1.311507,1.113658,-0.030576
2,-0.034067,-1.048082,-0.170995,-0.070483,-0.145037,-0.767041,-0.234325,-0.014753,-0.112688,-0.105584,...,-0.042842,-0.034210,-0.164200,-0.193809,0.295491,1.202745,0.391945,-0.205269,1.504943,-0.030576
3,-0.034067,-0.656919,-0.170995,-0.070483,-0.145037,-0.306614,-0.234325,-0.014753,-0.112688,-0.105584,...,-0.042842,-0.034210,-0.164200,-0.193809,0.295491,0.390285,-1.794410,-1.026856,0.652737,-0.030576
4,-0.034067,-1.048082,-0.170995,-0.070483,-0.145037,-0.767041,-0.234325,-0.014753,-0.112688,-0.105584,...,-0.042842,-0.034210,-0.164200,-0.193809,0.295491,1.202745,0.134727,0.047527,-1.529176,-0.030576
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44962,-0.012057,-0.969507,0.265057,-0.006067,-0.145037,-0.767041,3.775689,-0.014753,5.595645,10.083914,...,0.393187,-0.034167,1.795424,11.403422,-3.384196,-2.047094,0.456250,1.690700,-0.099990,-0.030576
44963,-0.034067,-1.048082,-0.170995,-0.070483,-0.145037,-0.767041,-0.234325,-0.014753,-0.112688,-0.105584,...,-0.042842,-0.034210,-0.164200,-0.193809,0.295491,1.202745,0.391945,-0.205269,0.785376,-0.030576
44964,-0.034067,-1.048082,-0.170995,-0.070483,-0.145037,-0.767041,-0.234294,-0.014543,-0.112688,-0.105584,...,-0.042842,-0.034210,-0.164200,-0.193809,0.295491,1.202745,0.391945,-0.205269,0.533362,-0.030576
44965,-0.034067,-0.040282,-0.170995,-0.070483,-0.145037,0.419212,-0.234325,-0.014753,-0.112688,-0.105584,...,-0.042842,-0.034210,-0.164200,-0.193809,0.295491,-0.422175,1.227904,-1.406050,1.445256,-0.030576
