In [113]:
# Data processing
import pandas as pd
import numpy as np

In [201]:
# Machine Learning
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import FeatureUnion

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

from sklearn import linear_model
from sklearn.linear_model import LinearRegression

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import MinMaxScaler

In [202]:
# Visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import pickle

## Preparing data

In [203]:
# loading data
df = pd.read_csv(r'F:\Programacion\1.BOOTCAMP\data\trabajo_interdisciplinar\df_ready\whole_encoded.csv')
data = df.copy()
data.head(1)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,date,time,duration,source,src_port,destination,dest_port,protocol,length,info,attack,flag,lb_protocol
0,0,1,"Jul 13, 2020 15:28:31.152545818 CEST",60.924132,49.202835,10.0.2.15,37232.0,10.0.2.13,80.0,TCP,74,37232 > 80 [SYN] Seq=0 Win=64240 Len=0 MSS=1...,1,SYN,60


In [204]:
data = data.drop(['Unnamed: 0', 'Unnamed: 0.1'], axis=1)

In [205]:
data.head(1)

Unnamed: 0,date,time,duration,source,src_port,destination,dest_port,protocol,length,info,attack,flag,lb_protocol
0,"Jul 13, 2020 15:28:31.152545818 CEST",60.924132,49.202835,10.0.2.15,37232.0,10.0.2.13,80.0,TCP,74,37232 > 80 [SYN] Seq=0 Win=64240 Len=0 MSS=1...,1,SYN,60


In [206]:
balance_ratio = dict(data['attack'].value_counts())
print(balance_ratio)
print("The attacks represent ", round(balance_ratio[1]*100/(balance_ratio[0] + balance_ratio[1]) , 2), "% of the whole data")

{0: 377720, 1: 315579}
The attacks represent  45.52 % of the whole data


## Preparing the ML model

In [207]:
X = data[['time', 'duration', 'length', 'lb_protocol']]
y = data['attack']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [208]:
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

### Linear Regression

In [209]:
traffic_linear = LinearRegression(fit_intercept=False).fit(X, y)
traffic_linear

LinearRegression(copy_X=True, fit_intercept=False, n_jobs=None, normalize=False)

In [210]:
linear_model = traffic_linear.fit(X_train, y_train)
print(linear_model.coef_)

[-0.23603403  2.07144701 -1.5034411   0.59988959]


In [211]:
y_pred = np.round(linear_model.predict(X_test))
y_pred

array([0., 0., 0., ..., 0., 0., 1.])

In [212]:
m = confusion_matrix(y_test, y_pred)
precission = (m[0][0] + m[1][1]) / (m.sum())
print(m)
print(round(precission*100, 2), "%")

[[     0      0      0]
 [     1 105549  19141]
 [     8  89314  14776]]
46.13 %


In [213]:
test = pd.DataFrame(y_pred)
test[0].value_counts()

 0.0    194863
 1.0     33917
-1.0         9
Name: 0, dtype: int64

Classes all over the place. DISCARTED

### Logistic regression

In [214]:
traffic_logistic = LogisticRegression(random_state=0).fit(X, y)
traffic_logistic

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=0, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [215]:
logistic_model = traffic_logistic.fit(X_train, y_train)
logistic_model.coef_

array([[ -2.03379276,  -1.7031587 , -12.80761153,  -3.11435614]])

In [216]:
y_pred = np.round(logistic_model.predict(X_test))
y_pred

array([0, 0, 1, ..., 0, 0, 1], dtype=int64)

In [217]:
m = confusion_matrix(y_test, y_pred)
precission = (m[0][0] + m[1][1]) / (m.sum())
print(m)
print(round(precission*100, 2), "%")

[[98151 26540]
 [51079 53019]]
66.07 %


In [218]:
print("Classification Report")
print(classification_report(y_test, y_pred))

Classification Report
              precision    recall  f1-score   support

           0       0.66      0.79      0.72    124691
           1       0.67      0.51      0.58    104098

    accuracy                           0.66    228789
   macro avg       0.66      0.65      0.65    228789
weighted avg       0.66      0.66      0.65    228789



### Random Forest

In [219]:
traffic_forest = RandomForestClassifier(max_depth=5, random_state=17)
traffic_forest

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=5, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators='warn',
                       n_jobs=None, oob_score=False, random_state=17, verbose=0,
                       warm_start=False)

In [220]:
forest_model = traffic_forest.fit(X_train, y_train)
forest_model.predict_proba(X)

array([[0.60611033, 0.39388967],
       [0.61357491, 0.38642509],
       [0.60252159, 0.39747841],
       ...,
       [0.60252159, 0.39747841],
       [0.60252159, 0.39747841],
       [0.60252159, 0.39747841]])

In [221]:
y_pred = forest_model.predict(X_test)
y_pred

array([0, 0, 1, ..., 0, 1, 1], dtype=int64)

In [222]:
m = confusion_matrix(y_test, y_pred)
precission = (m[0][0] + m[1][1]) / (m.sum())
print(m)
print(round(precission*100, 2), "%")

[[ 87415  37276]
 [  3781 100317]]
82.05 %


In [223]:
print("Classification Report")
print(classification_report(y_test, y_pred))

Classification Report
              precision    recall  f1-score   support

           0       0.96      0.70      0.81    124691
           1       0.73      0.96      0.83    104098

    accuracy                           0.82    228789
   macro avg       0.84      0.83      0.82    228789
weighted avg       0.85      0.82      0.82    228789



### Decission Trees

In [224]:
dt_model = DecisionTreeClassifier(criterion='entropy', random_state=42)
dt_model

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=42, splitter='best')

In [225]:
dt_model.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=42, splitter='best')

In [226]:
y_pred = dt_model.predict(X_test)
y_pred

array([0, 0, 1, ..., 0, 0, 1], dtype=int64)

In [227]:
m = confusion_matrix(y_test, y_pred)
precission = (m[0][0] + m[1][1]) / (m.sum())
print(m)
print(round(precission*100, 2), "%")

[[110079  14612]
 [ 16944  87154]]
86.21 %


In [228]:
print("Classification Report")
print(classification_report(y_test, y_pred))

Classification Report
              precision    recall  f1-score   support

           0       0.87      0.88      0.87    124691
           1       0.86      0.84      0.85    104098

    accuracy                           0.86    228789
   macro avg       0.86      0.86      0.86    228789
weighted avg       0.86      0.86      0.86    228789



### Gradient Boost classifier

In [229]:
gbc_model = GradientBoostingClassifier(n_estimators=20, learning_rate=1, max_features=2, max_depth=2, random_state=0)

In [230]:
gbc_model.fit(X_train, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None, learning_rate=1,
                           loss='deviance', max_depth=2, max_features=2,
                           max_leaf_nodes=None, min_impurity_decrease=0.0,
                           min_impurity_split=None, min_samples_leaf=1,
                           min_samples_split=2, min_weight_fraction_leaf=0.0,
                           n_estimators=20, n_iter_no_change=None,
                           presort='auto', random_state=0, subsample=1.0,
                           tol=0.0001, validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [231]:
y_pred = gbc_model.predict(X_test)
y_pred

array([0, 0, 1, ..., 0, 0, 1], dtype=int64)

In [232]:
m = confusion_matrix(y_test, y_pred)
precission = (m[0][0] + m[1][1]) / (m.sum())
print(m)
print(round(precission*100, 2), "%")

[[96712 27979]
 [ 7922 96176]]
84.31 %


In [233]:
print("Classification Report")
print(classification_report(y_test, y_pred))

Classification Report
              precision    recall  f1-score   support

           0       0.92      0.78      0.84    124691
           1       0.77      0.92      0.84    104098

    accuracy                           0.84    228789
   macro avg       0.85      0.85      0.84    228789
weighted avg       0.86      0.84      0.84    228789



### XLG Boost

In [234]:
xgb_model = XGBClassifier(learning_rate=0.5)
xgb_model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.5, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [235]:
y_pred = xgb_model.predict(X_test)
y_pred

array([0, 0, 1, ..., 0, 0, 1], dtype=int64)

In [236]:
m = confusion_matrix(y_test, y_pred)
precission = (m[0][0] + m[1][1]) / (m.sum())
print(m)
print(round(precission*100, 2), "%")

[[109901  14790]
 [  9401  94697]]
89.43 %


In [237]:
print("Classification Report")
print(classification_report(y_test, y_pred))

Classification Report
              precision    recall  f1-score   support

           0       0.92      0.88      0.90    124691
           1       0.86      0.91      0.89    104098

    accuracy                           0.89    228789
   macro avg       0.89      0.90      0.89    228789
weighted avg       0.90      0.89      0.89    228789

