In [1]:
import numpy as np
import pandas as pnd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
import torch

# Intrusion Detection Evaluation Dataset (CIC-IDS2017)
### DDoS Attack Data

Intrusion Detection Systems (IDSs) and Intrusion Prevention Systems (IPSs) are the most important defense tools against the sophisticated and ever-growing network attacks. Due to the lack of reliable test and validation datasets, anomaly-based intrusion detection approaches are suffering from consistent and accurate performance evolutions.

[https://www.unb.ca/cic/datasets/ids-2017.html]


In [2]:
# Load Data
df = pnd.read_csv("data/CICIDS-2017/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv")
X = df

In [3]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 225745 entries, 0 to 225744
Data columns (total 79 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0    Destination Port             225745 non-null  int64  
 1    Flow Duration                225745 non-null  int64  
 2    Total Fwd Packets            225745 non-null  int64  
 3    Total Backward Packets       225745 non-null  int64  
 4   Total Length of Fwd Packets   225745 non-null  int64  
 5    Total Length of Bwd Packets  225745 non-null  int64  
 6    Fwd Packet Length Max        225745 non-null  int64  
 7    Fwd Packet Length Min        225745 non-null  int64  
 8    Fwd Packet Length Mean       225745 non-null  float64
 9    Fwd Packet Length Std        225745 non-null  float64
 10  Bwd Packet Length Max         225745 non-null  int64  
 11   Bwd Packet Length Min        225745 non-null  int64  
 12   Bwd Packet Length Mean       225745 non-nul

In [4]:
X.replace('BENIGN', 0, inplace=True)
X.replace('DDoS', 1, inplace=True)

In [5]:
# Clean Data
if(np.any(np.isnan(X)) or np.all(np.isfinite(X))):
    df.dropna(inplace=True)
    indices_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(1)
    X = X[indices_to_keep]

X.head()

Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,54865,3,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,0
1,55054,109,1,1,6,6,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,0
2,55055,52,1,1,6,6,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,0
3,46236,34,1,1,6,6,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,0
4,54863,3,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,0


In [6]:
# Scale Data
scaled_features = MinMaxScaler().fit_transform(X.values)
X = pnd.DataFrame(scaled_features, index=X.index, columns=X.columns)
X.head()

Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,0.837225,3.333335e-08,0.000518,0.0,6.6e-05,0.0,0.000514,0.004076,0.001552,0.0,...,0.384615,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.840109,9.166671e-07,0.0,0.00034,3.3e-05,1e-06,0.000514,0.004076,0.001552,0.0,...,0.384615,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.840124,4.416669e-07,0.0,0.00034,3.3e-05,1e-06,0.000514,0.004076,0.001552,0.0,...,0.384615,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.705548,2.916668e-07,0.0,0.00034,3.3e-05,1e-06,0.000514,0.004076,0.001552,0.0,...,0.384615,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.837194,3.333335e-08,0.000518,0.0,6.6e-05,0.0,0.000514,0.004076,0.001552,0.0,...,0.384615,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
y_label = X.pop(' Label')
y_label.head()

0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
Name:  Label, dtype: float64

In [37]:
import xgboost as xgb
from xgboost import XGBRegressor
from xgboost import plot_importance
import matplotlib.pyplot as plt

mod = xgb.XGBRegressor(objective ='reg:squarederror',max_depth = 5, n_estimators = 600)
mod.fit(X, y_label)
plot_importance(mod, max_num_features=30)
plt.show()

In [33]:
# Important Features
from sklearn.ensemble import RandomForestRegressor

# Create a random forest regressor
reg = RandomForestRegressor(n_estimators=1000, random_state=0, n_jobs=-1)

# Train the regressor
reg.fit(X, y_label)

x = {}
# Print the name and gini importance of each feature
for feature in zip(list(X.columns), reg.feature_importances_):
    print(feature)
    x[feature[0]] = feature[1]

(' Destination Port', 0.0012500850310623553)
(' Flow Duration', 9.311809013608722e-06)
(' Total Fwd Packets', 2.4926433213716797e-05)
(' Total Backward Packets', 3.1882093523821206e-05)
('Total Length of Fwd Packets', 0.2116241868089053)
(' Total Length of Bwd Packets', 3.7859202171811444e-06)
(' Fwd Packet Length Max', 0.5735564172211007)
(' Fwd Packet Length Min', 1.8685939087991304e-06)
(' Fwd Packet Length Mean', 1.0409741081298226e-05)
(' Fwd Packet Length Std', 3.745492776920816e-05)
('Bwd Packet Length Max', 8.524216681600004e-06)
(' Bwd Packet Length Min', 0.00027614618709888587)
(' Bwd Packet Length Mean', 0.00026157486584077705)
(' Bwd Packet Length Std', 5.149368581706075e-06)
('Flow Bytes/s', 9.867593145116227e-06)
(' Flow Packets/s', 1.0316707945163388e-05)
(' Flow IAT Mean', 1.911751046808292e-05)
(' Flow IAT Std', 1.1542762752259564e-05)
(' Flow IAT Max', 1.3896841456631209e-05)
(' Flow IAT Min', 0.00015916330599274173)
('Fwd IAT Total', 1.3437079525014172e-05)
(' Fwd IA

In [8]:
# Create training & testing datasets
from sklearn.metrics import PrecisionRecallDisplay
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold
kf = KFold(n_splits=5)

# Confusion Matrix
# Precision & Recall
# plot num of bins vs. accuracy, against no bin approach
# Use other datasets

# Using Machine Learning to predict DDos attack
created two different classification models, a Support Vector Machine (SVM) and Multilayer Perceptron (MLP) neural network. 
These models will be used as baselines for validating the discretization of the features.

**SVM accuracy score: 99.8%**

**MLP accuracy score: 99.9%**

In [9]:
# Support Vector Machine
from sklearn.svm import LinearSVC
linear_SVC = LinearSVC(random_state=0, tol=1e-5)
for train, test in kf.split(X):
    X_train, X_test, y_train, y_test = np.array(X), np.array(X), np.array(y_label), np.array(y_label) 
    linear_SVC.fit(X_train[train], y_train[train])
    accuracy = linear_SVC.score(X_test[test], y_test[test])
    print(f'train: {train},\n test: {test},\n Accuracy Score: {accuracy},\n Confusion Matrix:\n{confusion_matrix(y_test[test], linear_SVC.predict(X_test[test]))}\n')
    display = PrecisionRecallDisplay.from_estimator(linear_SVC, X_test[test], y_test[test], name="LinearSVC")   
    _ = display.ax_.set_title("2-class Precision-Recall curve")

train: [ 45143  45144  45145 ... 225708 225709 225710],
 test: [    0     1     2 ... 45140 45141 45142],
 Accuracy Score: 0.984183594355714,
 Confusion Matrix:
[[25076   708]
 [    6 19353]]

train: [     0      1      2 ... 225708 225709 225710],
 test: [45143 45144 45145 ... 90282 90283 90284],
 Accuracy Score: 0.9979619866199991,
 Confusion Matrix:
[[11190    86]
 [    6 33860]]

train: [     0      1      2 ... 225708 225709 225710],
 test: [ 90285  90286  90287 ... 135424 135425 135426],
 Accuracy Score: 0.9987594701165212,
 Confusion Matrix:
[[11594    48]
 [    8 33492]]

train: [     0      1      2 ... 225708 225709 225710],
 test: [135427 135428 135429 ... 180566 180567 180568],
 Accuracy Score: 0.9970758938460856,
 Confusion Matrix:
[[15083   124]
 [    8 29927]]

train: [     0      1      2 ... 180566 180567 180568],
 test: [180569 180570 180571 ... 225708 225709 225710],
 Accuracy Score: 0.9692082761065084,
 Confusion Matrix:
[[32430  1347]
 [   43 11322]]



In [10]:
# MLP Neural Net
from sklearn.neural_network import MLPClassifier
mlp_clf = MLPClassifier(random_state=1, max_iter=300)
for train, test in kf.split(X):
    X_train, X_test, y_train, y_test = np.array(X), np.array(X), np.array(y_label), np.array(y_label) 
    mlp_clf.fit(X_train[train], y_train[train])
    accuracy = mlp_clf.score(X_test[test], y_test[test])
    loss = mlp_clf.loss_
    print(f'train: {train},\n test: {test},\n Loss: {loss},\n Accuracy Score: {accuracy},\n Confusion Matrix:\n{confusion_matrix(y_test[test], linear_SVC.predict(X_test[test]))}\n')

train: [ 45143  45144  45145 ... 225708 225709 225710],
 test: [    0     1     2 ... 45140 45141 45142],
 Loss: 0.0028523110895046794,
 Accuracy Score: 0.9995569634273309,
 Confusion Matrix:
[[25063   721]
 [    1 19358]]

train: [     0      1      2 ... 225708 225709 225710],
 test: [45143 45144 45145 ... 90282 90283 90284],
 Loss: 0.003271555761231856,
 Accuracy Score: 0.9996234105710868,
 Confusion Matrix:
[[11187    89]
 [    2 33864]]

train: [     0      1      2 ... 225708 225709 225710],
 test: [ 90285  90286  90287 ... 135424 135425 135426],
 Loss: 0.0029955093919307985,
 Accuracy Score: 0.9998227814452173,
 Confusion Matrix:
[[11589    53]
 [    7 33493]]

train: [     0      1      2 ... 225708 225709 225710],
 test: [135427 135428 135429 ... 180566 180567 180568],
 Loss: 0.003075748512062097,
 Accuracy Score: 0.9997784768065217,
 Confusion Matrix:
[[15079   128]
 [    2 29933]]

train: [     0      1      2 ... 180566 180567 180568],
 test: [180569 180570 180571 ... 22570

# Discretized Dataset
Discretize the same dataset, and run the same models to validate that the predicts can still be acheived using discretized data.

### Bin=5

**SVM accuracy score: 99.8%**

![](https://ml.azure.com/fileexplorerAzNB?wsid=/subscriptions/5b3eabf8-5525-4f24-b0aa-fc389f2fb155/resourcegroups/patrickday/workspaces/ml_workspace_patrickday&tid=72f988bf-86f1-41af-91ab-2d7cd011db47&activeFilePath=Users/patrickday/data/images/SVM_binned.png)

**MLP accuracy score: 98.7%**

In [11]:
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import KBinsDiscretizer

discrete5 = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')
discrete10 = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='uniform')
discrete15 = KBinsDiscretizer(n_bins=15, encode='ordinal', strategy='uniform')
discrete20 = KBinsDiscretizer(n_bins=20, encode='ordinal', strategy='uniform')
discrete25 = KBinsDiscretizer(n_bins=25, encode='ordinal', strategy='uniform')

svm_bin_accuracy = {'5bins': 0, '10bins': 0, '15bins': 0, '20bins': 0, '25bins': 0}
mlp_bin_accuracy = {'5bins': 0, '10bins': 0, '15bins': 0, '20bins': 0, '25bins': 0}

In [12]:
# SVM 5 bins
linear_SVC2 = LinearSVC(random_state=0, tol=1e-5)
for train, test in kf.split(X):
    X_train, X_test, y_train, y_test = np.array(X), np.array(X), np.array(y_label), np.array(y_label) 
    Xbin_train = discrete5.fit_transform(X_train)
    Xbin_test = discrete5.fit_transform(X_test)
    linear_SVC2.fit(Xbin_train[train], y_train[train])
    accuracy = linear_SVC2.score(Xbin_test[test], y_test[test])
    if(svm_bin_accuracy['5bins'] > accuracy):
        continue
    else:
        svm_bin_accuracy['5bins'] = accuracy
    print(f'train: {train},\n test: {test},\n Accuracy Score: {accuracy},\n Confusion Matrix:\n{confusion_matrix(y_test[test], linear_SVC.predict(X_test[test]))}\n')

train: [ 45143  45144  45145 ... 225708 225709 225710],
 test: [    0     1     2 ... 45140 45141 45142],
 Accuracy Score: 0.9807943645747956,
 Confusion Matrix:
[[25063   721]
 [    1 19358]]

train: [     0      1      2 ... 225708 225709 225710],
 test: [45143 45144 45145 ... 90282 90283 90284],
 Accuracy Score: 0.9977404634265208,
 Confusion Matrix:
[[11187    89]
 [    2 33864]]

train: [     0      1      2 ... 225708 225709 225710],
 test: [ 90285  90286  90287 ... 135424 135425 135426],
 Accuracy Score: 0.9983607283682602,
 Confusion Matrix:
[[11589    53]
 [    7 33493]]



In [13]:
svm_bin_accuracy

{'5bins': 0.9983607283682602,
 '10bins': 0,
 '15bins': 0,
 '20bins': 0,
 '25bins': 0}

In [14]:
# MLP Neural Net 5 bins
mlp_clf2 = MLPClassifier(random_state=1, max_iter=300)
for train, test in kf.split(X):
    X_train, X_test, y_train, y_test = np.array(X), np.array(X), np.array(y_label), np.array(y_label)
    Xbin_train = discrete5.fit_transform(X_train)
    Xbin_test = discrete5.fit_transform(X_test) 
    mlp_clf2.fit(Xbin_train[train], y_train[train])
    accuracy = mlp_clf2.score(Xbin_test[test], y_test[test])
    loss = mlp_clf2.loss_
    if(mlp_bin_accuracy['5bins'] > accuracy):
        continue
    else:
        mlp_bin_accuracy['5bins'] = accuracy
    print(f'train: {train},\n test: {test},\n layer size: {mlp_clf2.hidden_layer_sizes},\n Loss: {loss},\n Accuracy Score: {accuracy}\n Confusion Matrix:\n{confusion_matrix(y_test[test], linear_SVC.predict(X_test[test]))}\n')

train: [ 45143  45144  45145 ... 225708 225709 225710],
 test: [    0     1     2 ... 45140 45141 45142],
 layer size: (100,),
 Loss: 0.04479286111909361,
 Accuracy Score: 0.9807500609175287
 Confusion Matrix:
[[25063   721]
 [    1 19358]]

train: [     0      1      2 ... 225708 225709 225710],
 test: [45143 45144 45145 ... 90282 90283 90284],
 layer size: (100,),
 Loss: 0.054733288153976094,
 Accuracy Score: 0.9977847680652164
 Confusion Matrix:
[[11187    89]
 [    2 33864]]

train: [     0      1      2 ... 225708 225709 225710],
 test: [ 90285  90286  90287 ... 135424 135425 135426],
 layer size: (100,),
 Loss: 0.054994719793026475,
 Accuracy Score: 0.9984050330069558
 Confusion Matrix:
[[11589    53]
 [    7 33493]]



In [15]:
mlp_bin_accuracy

{'5bins': 0.9984050330069558,
 '10bins': 0,
 '15bins': 0,
 '20bins': 0,
 '25bins': 0}

### Bins = 10

 **SVM Accuracy: 99.87**

**MLP Accuracy: 99.84**

In [16]:
# SVM 10bins
linear_SVC2 = LinearSVC(random_state=0, tol=1e-5)
for train, test in kf.split(X):
    X_train, X_test, y_train, y_test = np.array(X), np.array(X), np.array(y_label), np.array(y_label) 
    Xbin_train = discrete10.fit_transform(X_train)
    Xbin_test = discrete10.fit_transform(X_test)
    linear_SVC2.fit(Xbin_train[train], y_train[train])
    accuracy = linear_SVC2.score(Xbin_test[test], y_test[test])
    if(svm_bin_accuracy['10bins'] > accuracy):
        continue
    else:
        svm_bin_accuracy['10bins'] = accuracy
    print(f'train: {train},\n test: {test},\n Accuracy Score: {accuracy},\n Confusion Matrix:\n{confusion_matrix(y_test[test], linear_SVC.predict(X_test[test]))}\n')

train: [ 45143  45144  45145 ... 225708 225709 225710],
 test: [    0     1     2 ... 45140 45141 45142],
 Accuracy Score: 0.9843829608134151,
 Confusion Matrix:
[[25063   721]
 [    1 19358]]

train: [     0      1      2 ... 225708 225709 225710],
 test: [45143 45144 45145 ... 90282 90283 90284],
 Accuracy Score: 0.9980949005360862,
 Confusion Matrix:
[[11187    89]
 [    2 33864]]

train: [     0      1      2 ... 225708 225709 225710],
 test: [ 90285  90286  90287 ... 135424 135425 135426],
 Accuracy Score: 0.9987373177971733,
 Confusion Matrix:
[[11589    53]
 [    7 33493]]



In [17]:
svm_bin_accuracy

{'5bins': 0.9983607283682602,
 '10bins': 0.9987373177971733,
 '15bins': 0,
 '20bins': 0,
 '25bins': 0}

In [18]:
# MLP Neural Net 10 bins
mlp_clf2 = MLPClassifier(random_state=1, max_iter=300)
for train, test in kf.split(X):
    X_train, X_test, y_train, y_test = np.array(X), np.array(X), np.array(y_label), np.array(y_label)
    Xbin_train = discrete10.fit_transform(X_train)
    Xbin_test = discrete10.fit_transform(X_test) 
    mlp_clf2.fit(Xbin_train[train], y_train[train])
    accuracy = mlp_clf2.score(Xbin_test[test], y_test[test])
    loss = mlp_clf2.loss_
    if(mlp_bin_accuracy['10bins'] > accuracy):
        continue
    else:
        mlp_bin_accuracy['10bins'] = accuracy
    print(f'train: {train},\n test: {test},\n layer size: {mlp_clf2.hidden_layer_sizes},\n Loss: {loss},\n Accuracy Score: {accuracy}\n Confusion Matrix:\n{confusion_matrix(y_test[test], linear_SVC.predict(X_test[test]))}\n')

train: [ 45143  45144  45145 ... 225708 225709 225710],
 test: [    0     1     2 ... 45140 45141 45142],
 layer size: (100,),
 Loss: 0.03734193982732556,
 Accuracy Score: 0.9846487827570166
 Confusion Matrix:
[[25063   721]
 [    1 19358]]

train: [     0      1      2 ... 225708 225709 225710],
 test: [45143 45144 45145 ... 90282 90283 90284],
 layer size: (100,),
 Loss: 0.045926979495372985,
 Accuracy Score: 0.9979176819813035
 Confusion Matrix:
[[11187    89]
 [    2 33864]]

train: [     0      1      2 ... 225708 225709 225710],
 test: [ 90285  90286  90287 ... 135424 135425 135426],
 layer size: (100,),
 Loss: 0.046413336709528887,
 Accuracy Score: 0.9987594701165212
 Confusion Matrix:
[[11589    53]
 [    7 33493]]



In [19]:
mlp_bin_accuracy

{'5bins': 0.9984050330069558,
 '10bins': 0.9987594701165212,
 '15bins': 0,
 '20bins': 0,
 '25bins': 0}

### Bins = 15

 **SVM Accuracy: 99.85**

**MLP Accuracy: 99.88**

In [20]:
# SVM 15 bins
linear_SVC2 = LinearSVC(random_state=0, tol=1e-5)
for train, test in kf.split(X):
    X_train, X_test, y_train, y_test = np.array(X), np.array(X), np.array(y_label), np.array(y_label) 
    Xbin_train = discrete15.fit_transform(X_train)
    Xbin_test = discrete15.fit_transform(X_test)
    linear_SVC2.fit(Xbin_train[train], y_train[train])
    accuracy = linear_SVC2.score(Xbin_test[test], y_test[test])
    if(svm_bin_accuracy['15bins'] > accuracy):
        continue
    else:
        svm_bin_accuracy['15bins'] = accuracy
    print(f'train: {train},\n test: {test},\n Accuracy Score: {accuracy},\n Confusion Matrix:\n{confusion_matrix(y_test[test], linear_SVC.predict(X_test[test]))}\n')

train: [ 45143  45144  45145 ... 225708 225709 225710],
 test: [    0     1     2 ... 45140 45141 45142],
 Accuracy Score: 0.9843386571561482,
 Confusion Matrix:
[[25063   721]
 [    1 19358]]

train: [     0      1      2 ... 225708 225709 225710],
 test: [45143 45144 45145 ... 90282 90283 90284],
 Accuracy Score: 0.998117052855434,
 Confusion Matrix:
[[11187    89]
 [    2 33864]]

train: [     0      1      2 ... 225708 225709 225710],
 test: [ 90285  90286  90287 ... 135424 135425 135426],
 Accuracy Score: 0.998515794603695,
 Confusion Matrix:
[[11589    53]
 [    7 33493]]



In [21]:
svm_bin_accuracy

{'5bins': 0.9983607283682602,
 '10bins': 0.9987373177971733,
 '15bins': 0.998515794603695,
 '20bins': 0,
 '25bins': 0}

In [22]:
# MLP Neural Net 15 bins
mlp_clf2 = MLPClassifier(random_state=1, max_iter=300)
for train, test in kf.split(X):
    X_train, X_test, y_train, y_test = np.array(X), np.array(X), np.array(y_label), np.array(y_label)
    Xbin_train = discrete15.fit_transform(X_train)
    Xbin_test = discrete15.fit_transform(X_test) 
    mlp_clf2.fit(Xbin_train[train], y_train[train])
    accuracy = mlp_clf2.score(Xbin_test[test], y_test[test])
    loss = mlp_clf2.loss_
    if(mlp_bin_accuracy['15bins'] > accuracy):
        continue
    else:
        mlp_bin_accuracy['15bins'] = accuracy
    print(f'train: {train},\n test: {test},\n layer size: {mlp_clf2.hidden_layer_sizes},\n Loss: {loss},\n Accuracy Score: {accuracy}\n Confusion Matrix:\n{confusion_matrix(y_test[test], linear_SVC.predict(X_test[test]))}\n')

train: [ 45143  45144  45145 ... 225708 225709 225710],
 test: [    0     1     2 ... 45140 45141 45142],
 layer size: (100,),
 Loss: 0.034049961508477014,
 Accuracy Score: 0.9846487827570166
 Confusion Matrix:
[[25063   721]
 [    1 19358]]

train: [     0      1      2 ... 225708 225709 225710],
 test: [45143 45144 45145 ... 90282 90283 90284],
 layer size: (100,),
 Loss: 0.04172045246193719,
 Accuracy Score: 0.9979176819813035
 Confusion Matrix:
[[11187    89]
 [    2 33864]]

train: [     0      1      2 ... 225708 225709 225710],
 test: [ 90285  90286  90287 ... 135424 135425 135426],
 layer size: (100,),
 Loss: 0.042247965646218785,
 Accuracy Score: 0.9988037747552169
 Confusion Matrix:
[[11589    53]
 [    7 33493]]



In [23]:
mlp_bin_accuracy

{'5bins': 0.9984050330069558,
 '10bins': 0.9987594701165212,
 '15bins': 0.9988037747552169,
 '20bins': 0,
 '25bins': 0}

### Bins = 20

 **SVM Accuracy: 99.87**

**MLP Accuracy: 99.85**

In [24]:
# SVM 20 bins
linear_SVC2 = LinearSVC(random_state=0, tol=1e-5)
for train, test in kf.split(X):
    X_train, X_test, y_train, y_test = np.array(X), np.array(X), np.array(y_label), np.array(y_label) 
    Xbin_train = discrete20.fit_transform(X_train)
    Xbin_test = discrete20.fit_transform(X_test)
    linear_SVC2.fit(Xbin_train[train], y_train[train])
    accuracy = linear_SVC2.score(Xbin_test[test], y_test[test])
    if(svm_bin_accuracy['20bins'] > accuracy):
        continue
    else:
        svm_bin_accuracy['20bins'] = accuracy
    print(f'train: {train},\n test: {test},\n Accuracy Score: {accuracy},\n Confusion Matrix:\n{confusion_matrix(y_test[test], linear_SVC.predict(X_test[test]))}\n')

train: [ 45143  45144  45145 ... 225708 225709 225710],
 test: [    0     1     2 ... 45140 45141 45142],
 Accuracy Score: 0.9842943534988813,
 Confusion Matrix:
[[25063   721]
 [    1 19358]]

train: [     0      1      2 ... 225708 225709 225710],
 test: [45143 45144 45145 ... 90282 90283 90284],
 Accuracy Score: 0.9978069203845643,
 Confusion Matrix:
[[11187    89]
 [    2 33864]]

train: [     0      1      2 ... 225708 225709 225710],
 test: [ 90285  90286  90287 ... 135424 135425 135426],
 Accuracy Score: 0.998648708519782,
 Confusion Matrix:
[[11589    53]
 [    7 33493]]



In [25]:
svm_bin_accuracy

{'5bins': 0.9983607283682602,
 '10bins': 0.9987373177971733,
 '15bins': 0.998515794603695,
 '20bins': 0.998648708519782,
 '25bins': 0}

In [26]:
# MLP Neural Net 20 bins
mlp_clf2 = MLPClassifier(random_state=1, max_iter=300)
for train, test in kf.split(X):
    X_train, X_test, y_train, y_test = np.array(X), np.array(X), np.array(y_label), np.array(y_label)
    Xbin_train = discrete20.fit_transform(X_train)
    Xbin_test = discrete20.fit_transform(X_test) 
    mlp_clf2.fit(Xbin_train[train], y_train[train])
    accuracy = mlp_clf2.score(Xbin_test[test], y_test[test])
    loss = mlp_clf2.loss_
    if(mlp_bin_accuracy['20bins'] > accuracy):
        continue
    else:
        mlp_bin_accuracy['20bins'] = accuracy
    print(f'train: {train},\n test: {test},\n layer size: {mlp_clf2.hidden_layer_sizes},\n Loss: {loss},\n Accuracy Score: {accuracy}\n Confusion Matrix:\n{confusion_matrix(y_test[test], linear_SVC.predict(X_test[test]))}\n')

train: [ 45143  45144  45145 ... 225708 225709 225710],
 test: [    0     1     2 ... 45140 45141 45142],
 layer size: (100,),
 Loss: 0.031632569859036105,
 Accuracy Score: 0.9842278980129809
 Confusion Matrix:
[[25063   721]
 [    1 19358]]

train: [     0      1      2 ... 225708 225709 225710],
 test: [45143 45144 45145 ... 90282 90283 90284],
 layer size: (100,),
 Loss: 0.0381527202181604,
 Accuracy Score: 0.9976518541491294
 Confusion Matrix:
[[11187    89]
 [    2 33864]]

train: [     0      1      2 ... 225708 225709 225710],
 test: [ 90285  90286  90287 ... 135424 135425 135426],
 layer size: (100,),
 Loss: 0.038399561837030916,
 Accuracy Score: 0.9987373177971733
 Confusion Matrix:
[[11589    53]
 [    7 33493]]



In [27]:
mlp_bin_accuracy

{'5bins': 0.9984050330069558,
 '10bins': 0.9987594701165212,
 '15bins': 0.9988037747552169,
 '20bins': 0.9987373177971733,
 '25bins': 0}

### Bins = 25

 **SVM Accuracy: 99.82**

**MLP Accuracy: 99.87**

In [28]:
# SVM 25 bins
linear_SVC2 = LinearSVC(random_state=0, tol=1e-5)
for train, test in kf.split(X):
    X_train, X_test, y_train, y_test = np.array(X), np.array(X), np.array(y_label), np.array(y_label) 
    Xbin_train = discrete25.fit_transform(X_train)
    Xbin_test = discrete25.fit_transform(X_test)
    linear_SVC2.fit(Xbin_train[train], y_train[train])
    accuracy = linear_SVC2.score(Xbin_test[test], y_test[test])
    if(svm_bin_accuracy['25bins'] > accuracy):
        continue
    else:
        svm_bin_accuracy['25bins'] = accuracy
    print(f'train: {train},\n test: {test},\n Accuracy Score: {accuracy},\n Confusion Matrix:\n{confusion_matrix(y_test[test], linear_SVC.predict(X_test[test]))}\n')

train: [ 45143  45144  45145 ... 225708 225709 225710],
 test: [    0     1     2 ... 45140 45141 45142],
 Accuracy Score: 0.9837848614403119,
 Confusion Matrix:
[[25063   721]
 [    1 19358]]

train: [     0      1      2 ... 225708 225709 225710],
 test: [45143 45144 45145 ... 90282 90283 90284],
 Accuracy Score: 0.9979176819813035,
 Confusion Matrix:
[[11187    89]
 [    2 33864]]

train: [     0      1      2 ... 225708 225709 225710],
 test: [ 90285  90286  90287 ... 135424 135425 135426],
 Accuracy Score: 0.9982721190908688,
 Confusion Matrix:
[[11589    53]
 [    7 33493]]



In [29]:
svm_bin_accuracy

{'5bins': 0.9983607283682602,
 '10bins': 0.9987373177971733,
 '15bins': 0.998515794603695,
 '20bins': 0.998648708519782,
 '25bins': 0.9982721190908688}

In [30]:
# MLP Neural Net 25 bins
mlp_clf2 = MLPClassifier(random_state=1, max_iter=300)
for train, test in kf.split(X):
    X_train, X_test, y_train, y_test = np.array(X), np.array(X), np.array(y_label), np.array(y_label)
    Xbin_train = discrete25.fit_transform(X_train)
    Xbin_test = discrete25.fit_transform(X_test) 
    mlp_clf2.fit(Xbin_train[train], y_train[train])
    accuracy = mlp_clf2.score(Xbin_test[test], y_test[test])
    loss = mlp_clf2.loss_
    if(mlp_bin_accuracy['25bins'] > accuracy):
        continue
    else:
        mlp_bin_accuracy['25bins'] = accuracy
    print(f'train: {train},\n test: {test},\n layer size: {mlp_clf2.hidden_layer_sizes},\n Loss: {loss},\n Accuracy Score: {accuracy}\n Confusion Matrix:\n{confusion_matrix(y_test[test], linear_SVC.predict(X_test[test]))}\n')

train: [ 45143  45144  45145 ... 225708 225709 225710],
 test: [    0     1     2 ... 45140 45141 45142],
 layer size: (100,),
 Loss: 0.029785363422394656,
 Accuracy Score: 0.98467093458565
 Confusion Matrix:
[[25063   721]
 [    1 19358]]

train: [     0      1      2 ... 225708 225709 225710],
 test: [45143 45144 45145 ... 90282 90283 90284],
 layer size: (100,),
 Loss: 0.035573994533043614,
 Accuracy Score: 0.9978955296619556
 Confusion Matrix:
[[11187    89]
 [    2 33864]]

train: [     0      1      2 ... 225708 225709 225710],
 test: [ 90285  90286  90287 ... 135424 135425 135426],
 layer size: (100,),
 Loss: 0.03612674970985028,
 Accuracy Score: 0.9987594701165212
 Confusion Matrix:
[[11589    53]
 [    7 33493]]



In [31]:
mlp_bin_accuracy

{'5bins': 0.9984050330069558,
 '10bins': 0.9987594701165212,
 '15bins': 0.9988037747552169,
 '20bins': 0.9987373177971733,
 '25bins': 0.9987594701165212}