In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn import linear_model
from sklearn import tree
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
from sklearn.pipeline import Pipeline

from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.compose import ColumnTransformer
from category_encoders import *
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.model_selection import GridSearchCV

import os
import pandas as pd
import numpy as np

from sklearn.metrics import classification_report, accuracy_score, make_scorer

# carregando dataset

In [3]:
df = pd.read_csv('Train_Test_Network.csv')
df = df.replace('-', None)
df.shape

(461043, 45)

In [4]:
# determine categorical and numerical features
numerical_ix = df.select_dtypes(include=['int64', 'float64']).columns
categorical_ix = df.select_dtypes(include=['object', 'bool']).columns


In [5]:
df.head()

Unnamed: 0,ts,src_ip,src_port,dst_ip,dst_port,proto,service,duration,src_bytes,dst_bytes,...,http_response_body_len,http_status_code,http_user_agent,http_orig_mime_types,http_resp_mime_types,weird_name,weird_addl,weird_notice,label,type
0,1554198358,3.122.49.24,1883,192.168.1.152,52976,tcp,,80549.53026,1762852,41933215,...,0,0,,,,bad_TCP_checksum,,F,0,normal
1,1554198358,192.168.1.79,47260,192.168.1.255,15600,udp,,0.0,0,0,...,0,0,,,,,,,0,normal
2,1554198359,192.168.1.152,1880,192.168.1.152,51782,tcp,,0.0,0,0,...,0,0,,,,bad_TCP_checksum,,F,0,normal
3,1554198359,192.168.1.152,34296,192.168.1.152,10502,tcp,,0.0,0,0,...,0,0,,,,,,,0,normal
4,1554198362,192.168.1.152,46608,192.168.1.190,53,udp,dns,0.000549,0,298,...,0,0,,,,bad_UDP_checksum,,F,0,normal


In [6]:
for cat in categorical_ix:
    df[cat].fillna(df[cat].mode()[0], inplace=True)

In [7]:
X = df.drop(['label', 'type', 'src_ip', 'dst_ip', 'ts'], axis=1)#.values
y = df['type'].values

y = LabelEncoder().fit_transform(y)

In [8]:
from sklearn.model_selection import train_test_split
X, X_test, y, y_test = train_test_split(X, y, test_size=0.80, random_state=42, stratify=y)

In [9]:
# determine categorical and numerical features
numerical_ix = X.select_dtypes(include=['int64', 'float64']).columns
categorical_ix = X.select_dtypes(include=['object', 'bool']).columns

def classification_report_with_accuracy_score(y_true, y_pred):
    print (classification_report(y_true, y_pred) )# print classification report
    return accuracy_score(y_true, y_pred) # return accuracy score

In [11]:
search.best_params_

{'mlp__activation': 'tanh',
 'mlp__alpha': 0.01,
 'mlp__hidden_layer_sizes': (100, 100),
 'mlp__learning_rate': 'constant',
 'mlp__max_iter': 100,
 'mlp__solver': 'sgd',
 'mlp__verbose': True}

In [10]:
enc = OrdinalEncoder()
# enc = JamesSteinEncoder(cols=categorical_ix)
# enc = OneHotEncoder()

ct = ColumnTransformer(
    [("text_preprocess", enc, categorical_ix),
     ('num_preprocess', MinMaxScaler(), numerical_ix)])

clf = make_pipeline(ct,
                    MLPClassifier(activation='tanh',
                                  alpha=0.01,
                                  hidden_layer_sizes=(100, 100),
                                  learning_rate='constant',
                                  max_iter=500,
                                  solver='sgd',
                                  verbose=True))

scores = cross_val_score(
    clf, 
    X, 
    y, 
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42), 
    scoring=make_scorer(classification_report_with_accuracy_score), verbose=1)
scores

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Iteration 1, loss = 1.40901005
Iteration 2, loss = 1.38781660
Iteration 3, loss = 1.38647078
Iteration 4, loss = 1.38662633
Iteration 5, loss = 1.38637368
Iteration 6, loss = 1.38629290
Iteration 7, loss = 1.38620620
Iteration 8, loss = 1.38613224
Iteration 9, loss = 1.38637033
Iteration 10, loss = 1.38606287
Iteration 11, loss = 1.38600257
Iteration 12, loss = 1.38598981
Iteration 13, loss = 1.38570335
Iteration 14, loss = 1.38595497
Iteration 15, loss = 1.38592871
Iteration 16, loss = 1.38587868
Iteration 17, loss = 1.38553036
Iteration 18, loss = 1.38584824
Iteration 19, loss = 1.38594920
Iteration 20, loss = 1.38597611
Iteration 21, loss = 1.38585255
Iteration 22, loss = 1.38583713
Iteration 23, loss = 1.38581566
Iteration 24, loss = 1.38581960
Iteration 25, loss = 1.38593317
Iteration 26, loss = 1.38587073
Iteration 27, loss = 1.38602098
Iteration 28, loss = 1.38580645
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Iteration 1, loss = 1.4

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  6.7min finished


array([nan, nan, nan, nan, nan])

In [14]:
ct.fit_transform(X)[0]

array([2.00000000e+00, 2.00000000e+00, 6.00000000e+00, 3.37000000e+03,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       2.70000000e+01, 0.00000000e+00, 6.00000000e+00, 1.00000000e+00,
       3.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       7.44617216e-01, 9.70486800e-01, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 2.16628396e-05, 1.54304380e-05,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00])