# Federated Learning Case Study

## 02. CICIDS 2017 Data Preparation 

In this notebook we prepare the CICIDS2017 dataset for further processing in our batch ML and federated learning scenarios. We use the improved CICIDS2017 dataset (described in the paper "Troubleshooting an Intrusion Detection Dataset: the CICIDS2017 Case Study"), available from https://downloads.distrinet-research.be/WTMC2021/index.html.

### 1. Preparing the CICIDS dataset

In [3]:
import os
import pandas as pd
import numpy as np

In [4]:
cicids_directory = '../datasets/CICIDS2017-IMPROVED-DOWNLOAD'

# os.listdir() returns entries in arbitrary, platform-dependent order, so the
# listing is sorted before being indexed by position. Hidden entries (for
# example .ipynb_checkpoints or .DS_Store) are skipped so that they cannot
# shift the indices.
# NOTE(review): the indices used below assume the directory holds exactly the
# five per-day CSV files and that their names sort alphabetically as
# friday, monday, thursday, tuesday, wednesday -- confirm against the download.
feature_files = sorted(
    name for name in os.listdir(cicids_directory) if not name.startswith('.')
)


def _read_day(file_name):
    """Read one CSV file from the dataset directory into a DataFrame."""
    return pd.read_csv(
        os.path.join(cicids_directory, file_name),
        encoding='unicode_escape',
        engine='python',
    )


monday_features_df = _read_day(feature_files[1])
tuesday_features_df = _read_day(feature_files[3])
wednesday_features_df = _read_day(feature_files[4])
thursday_features_df = _read_day(feature_files[2])
friday_features_df = _read_day(feature_files[0])

# Stack the five days in calendar order into a single frame
feature_df = pd.concat([
    monday_features_df,
    tuesday_features_df,
    wednesday_features_df,
    thursday_features_df,
    friday_features_df,
])

# Optional class filter: when THRESHOLD is positive, keep only the rows whose
# label occurs strictly more than THRESHOLD times (0 disables the filter).
THRESHOLD = 0
if THRESHOLD > 0:
    label_counts = feature_df['Label'].value_counts()
    frequent_labels = label_counts[label_counts > THRESHOLD].index
    feature_df = feature_df[feature_df['Label'].isin(frequent_labels)]


# Set to True to dump the combined frame to a pickle file in the working directory
WRITE_DF_TO_PICKLE = False
if WRITE_DF_TO_PICKLE:
    pickle_path = "./full_cicids_df.pkl"
    print("Write DF to Pickle...", pickle_path)
    feature_df.to_pickle(pickle_path)
    print("Writing DF to Pickle complete...", pickle_path)

# Pre-processing: turn +/-inf into NaN, then discard every row containing a NaN
feature_df2 = feature_df.replace([np.inf, -np.inf], np.nan).dropna()

# Set the class labels aside before the 'Label' column is removed
label = feature_df2['Label']

# Columns excluded from the feature matrix
dropped_columns = ['Flow ID', 'Src IP', 'Dst IP', 'Timestamp', 'Label']
feature_df2 = feature_df2.drop(columns=dropped_columns)

# Feature scaling: standardise every column with scikit-learn's StandardScaler.
# NOTE(review): the scaler is fitted on all rows of feature_df2, i.e. before the
# train/test split performed later in this cell, so test rows contribute to the
# scaling statistics -- confirm this is intended.
from sklearn.preprocessing import MinMaxScaler, StandardScaler

feature_scaler = StandardScaler()
scaled_features = feature_scaler.fit_transform(feature_df2.to_numpy())
# Alternative scaler kept for reference:
# scaled_features = MinMaxScaler().fit_transform(feature_df2.to_numpy())

# Label preprocessing: encode the class names as integer ids
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

le = preprocessing.LabelEncoder()
label_n = le.fit_transform(label.values)

# Stratified train/test split: 20% of the rows go to the test set, fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    scaled_features,
    label_n,
    test_size=0.2,
    stratify=label_n,
    random_state=42,
)

# Report the shapes of both splits
for split_name, X_part, y_part in (("Train:", X_train, y_train), ("Test:", X_test, y_test)):
    print(split_name, X_part.shape, y_part.shape)

KeyboardInterrupt: 

In [3]:
# Map the encoded training labels back to their class names and count the
# occurrences of each class. The top-level pd.value_counts() helper is
# deprecated in recent pandas releases, so the Series method is used instead.
orig = list(le.inverse_transform(y_train))
pd.Series(orig).value_counts()

BENIGN                                  1325655
PortScan                                 127218
DoS Hulk                                 126775
DDoS                                      76098
DoS GoldenEye                              6054
DoS slowloris                              3201
FTP-Patator                                3178
DoS Slowhttptest - Attempted               2694
SSH-Patator                                2384
DoS Slowhttptest                           1394
DoS slowloris - Attempted                  1365
Bot - Attempted                            1176
Web Attack - Brute Force - Attempted        971
Bot                                         590
Web Attack - XSS - Attempted                522
DoS Hulk - Attempted                        463
Web Attack - Brute Force                    121
DoS GoldenEye - Attempted                    64
dtype: int64

In [4]:
# Number of distinct class ids present in the test labels
np.unique(y_test).size

18

In [5]:
# Show the shape of each split, features first and then labels
for split_array in (X_train, X_test, y_train, y_test):
    print(split_array.shape)

(1679923, 79)
(419981, 79)
(1679923,)
(419981,)


In [9]:
# Persist the four split arrays as .npy files in the working directory
for file_name, split_array in (
    ("X_train.npy", X_train),
    ("X_test.npy", X_test),
    ("y_train.npy", y_train),
    ("y_test.npy", y_test),
):
    np.save(file_name, split_array)

In [6]:
# Show the shape of each split, features first and then labels
for split_array in (X_train, X_test, y_train, y_test):
    print(split_array.shape)

(1679923, 79)
(419981, 79)
(1679923,)
(419981,)


In [7]:
# Reload the four split arrays from the .npy files in the working directory
X_train, X_test, y_train, y_test = (
    np.load(file_name)
    for file_name in ("X_train.npy", "X_test.npy", "y_train.npy", "y_test.npy")
)

In [8]:
# Show the shape of each split, features first and then labels
for split_array in (X_train, X_test, y_train, y_test):
    print(split_array.shape)

(1679923, 79)
(419981, 79)
(1679923,)
(419981,)
