# BUILDING THE MACHINE LEARNING ALGORITHMS.

### Importing the necessary libraries.

In [None]:
# Intermediate form of data during preprocessing.
import json

# Used for data manipulation.
import numpy as np

# Used for data manipulation.
import pandas as pd

# Splits the data set into a training set and a testing set.
from sklearn.model_selection import train_test_split

# Used for data preprocessing.
from sklearn.preprocessing import LabelEncoder

# Used when training and testing the Random Forest classifier.
from sklearn.ensemble import RandomForestClassifier

# Used when training and testing the Extra Trees classifier.
from sklearn.ensemble import ExtraTreesClassifier

# Used when saving ML objects such as the trained algorithms and the preprocessing objects.
import joblib

### Loading the Dataset.

In [None]:
# Read the training CSV into a pandas DataFrame.
# skipinitialspace=True tells the parser to ignore spaces that follow a delimiter.
df = pd.read_csv(
    r'/home/patrick/School/Project/DDOS Detection using Machine Learning/DDOS_Detection/ML Datasets/MOSAIC/train_mosaic.csv',
    skipinitialspace=True,
)

# Separate the input matrix from the target column.
# 'Label' is the target: it marks each row of network traffic as either ATTACK or BENIGN.
# Every other column is used as an input feature.
x_cols = [name for name in df.columns if name != 'Label']
X = df.loc[:, x_cols]
y = df.loc[:, 'Label']

# Preview the first five rows of the loaded dataset (rendered as a table in a notebook).
df.head()

In [None]:
# Show the first row of the input matrix as a Python dictionary (column name -> value).
# .iloc[0] selects by position, so it is always the first row; .loc[1] selects the row
# whose index label is 1, which is the second row under pandas' default integer index.
dict(X.iloc[0])

### Dataset splitting.

In [None]:
# Hold out part of the dataset for testing.
# test_size=0.3 places 30% of the rows in the testing subset; the remaining 70% are used for training.
# random_state is fixed so that the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1234,
)

### Data Pre-processing

In [None]:
# The dataset may have missing values.
# Each missing value is filled with the most frequent value (the mode) of its column,
# computed on the training subset.
#
# DataFrame.mode() returns one row per mode and pads columns that have fewer modes
# with NaN. The first row (iloc[0]) therefore holds a real value for every column that
# has any non-missing data, whereas the last row (iloc[-1]) can be NaN for most columns,
# which would make fillna() silently leave the gaps in place.
train_mode = dict(X_train.mode().iloc[0])
X_train = X_train.fillna(train_mode)

# Print out the fill value (most common value) chosen for each column.
print(train_mode)

### Random Forest Algorithm

In [None]:
# Train the Random Forest classifier with 100 trees.
# random_state is fixed (same seed as the train/test split) so that re-running this
# cell produces the same forest; without a seed every run trains a different model,
# which makes the saved model impossible to reproduce.
rf = RandomForestClassifier(n_estimators=100, random_state=1234)
rf = rf.fit(X_train, y_train)

### Extra Trees Algorithm

In [None]:
# Train the Extra Trees classifier with 100 trees.
# random_state is fixed (same seed as the train/test split) so that re-running this
# cell produces the same ensemble; without a seed every run trains a different model,
# which makes the saved model impossible to reproduce.
et = ExtraTreesClassifier(n_estimators=100, random_state=1234)
et = et.fit(X_train, y_train)

### Saving the objects.

In [None]:
# Persist everything needed to reuse the models later:
#   - the two trained classifiers, and
#   - the per-column fill values, so the same missing-value handling can be applied
#     again (for example when training another algorithm later on).
# Each object is written to the current directory as a compressed joblib file.

# The Random Forest classifier.
joblib.dump(value=rf, filename="./random_forest.joblib", compress=True)

# The Extra Trees classifier.
joblib.dump(value=et, filename="./extra_trees.joblib", compress=True)

# The data-filling values (most frequent value per column).
joblib.dump(value=train_mode, filename="./train_mode.joblib", compress=True)