# Importações

In [1]:
import os
import re
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.datasets import load_digits
from FirstStage import FirstStage
from SecondStage import SecondStage
from Extension import Extension

from tqdm import tqdm


# Dados

### Carregando os dados

In [3]:
cic_ids_2017 = pd.read_parquet("data/cic_ids_2017.parquet")
infiltration_2018 = pd.read_parquet("data/infiltration_2018.parquet")
testp = pd.read_parquet("data/test.parquet")

Exibindo o **.info()**

In [4]:
cic_ids_2017.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2830743 entries, 0 to 2830742
Data columns (total 79 columns):
 #   Column                       Dtype  
---  ------                       -----  
 0   Destination Port             int64  
 1   Flow Duration                int64  
 2   Total Fwd Packets            int64  
 3   Total Backward Packets       int64  
 4   Total Length of Fwd Packets  int64  
 5   Total Length of Bwd Packets  int64  
 6   Fwd Packet Length Max        int64  
 7   Fwd Packet Length Min        int64  
 8   Fwd Packet Length Mean       float64
 9   Fwd Packet Length Std        float64
 10  Bwd Packet Length Max        int64  
 11  Bwd Packet Length Min        int64  
 12  Bwd Packet Length Mean       float64
 13  Bwd Packet Length Std        float64
 14  Flow Bytes/s                 float64
 15  Flow Packets/s               float64
 16  Flow IAT Mean                float64
 17  Flow IAT Std                 float64
 18  Flow IAT Max                 int64  
 19  

In [5]:
infiltration_2018.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 127844 entries, 0 to 127843
Data columns (total 68 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   Protocol                  127844 non-null  float64
 1   Flow Duration             127844 non-null  float64
 2   Total Fwd Packets         127844 non-null  float64
 3   Total Backward Packets    127844 non-null  float64
 4   Fwd Packets Length Total  127844 non-null  float64
 5   Bwd Packets Length Total  127844 non-null  float64
 6   Fwd Packet Length Max     127844 non-null  float64
 7   Fwd Packet Length Min     127844 non-null  float64
 8   Fwd Packet Length Mean    127844 non-null  float64
 9   Fwd Packet Length Std     127844 non-null  float64
 10  Bwd Packet Length Max     127844 non-null  float64
 11  Bwd Packet Length Min     127844 non-null  float64
 12  Bwd Packet Length Mean    127844 non-null  float64
 13  Bwd Packet Length Std     127844 non-null  f

In [6]:
testp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59435 entries, 0 to 59434
Data columns (total 68 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Protocol                  59435 non-null  float32
 1   Flow Duration             59435 non-null  float64
 2   Total Fwd Packets         59435 non-null  float64
 3   Total Backward Packets    59435 non-null  float64
 4   Fwd Packets Length Total  59435 non-null  float64
 5   Bwd Packets Length Total  59435 non-null  float64
 6   Fwd Packet Length Max     59435 non-null  float32
 7   Fwd Packet Length Min     59435 non-null  float32
 8   Fwd Packet Length Mean    59435 non-null  float32
 9   Fwd Packet Length Std     59435 non-null  float32
 10  Bwd Packet Length Max     59435 non-null  float32
 11  Bwd Packet Length Min     59435 non-null  float32
 12  Bwd Packet Length Mean    59435 non-null  float32
 13  Bwd Packet Length Std     59435 non-null  float32
 14  Flow B

### Padronizando e limpando os dados

#### Concatenando os dados

Mudando o nome das colunas **y** e **Y** para **Label** em dois dos datagramas, pois são a mesma coisa.

In [7]:
infiltration_2018.rename(columns={'y': 'Label'}, inplace=True)
testp.rename(columns={'Y': 'Label'}, inplace=True)


In [8]:
df = pd.concat([cic_ids_2017, infiltration_2018, testp], axis=0)

In [9]:
# Limpeza
del cic_ids_2017
del infiltration_2018
del testp

In [10]:
df = df.reset_index(drop=True)

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3018022 entries, 0 to 3018021
Data columns (total 89 columns):
 #   Column                       Dtype  
---  ------                       -----  
 0   Destination Port             float64
 1   Flow Duration                float64
 2   Total Fwd Packets            float64
 3   Total Backward Packets       float64
 4   Total Length of Fwd Packets  float64
 5   Total Length of Bwd Packets  float64
 6   Fwd Packet Length Max        float64
 7   Fwd Packet Length Min        float64
 8   Fwd Packet Length Mean       float64
 9   Fwd Packet Length Std        float64
 10  Bwd Packet Length Max        float64
 11  Bwd Packet Length Min        float64
 12  Bwd Packet Length Mean       float64
 13  Bwd Packet Length Std        float64
 14  Flow Bytes/s                 float64
 15  Flow Packets/s               float64
 16  Flow IAT Mean                float64
 17  Flow IAT Std                 float64
 18  Flow IAT Max                 float64
 19  

#### Registros duplicados

In [12]:
df[df.duplicated()]

Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Protocol,Fwd Packets Length Total,Bwd Packets Length Total,Packet Length Min,Packet Length Max,Avg Packet Size,Init Fwd Win Bytes,Init Bwd Win Bytes,Fwd Act Data Packets,Fwd Seg Size Min
2109,80.0,77.0,2.0,0.0,12.0,0.0,6.0,6.0,6.0,0.0,...,,,,,,,,,,
2257,443.0,3.0,2.0,0.0,12.0,0.0,6.0,6.0,6.0,0.0,...,,,,,,,,,,
2749,443.0,49.0,2.0,0.0,12.0,0.0,6.0,6.0,6.0,0.0,...,,,,,,,,,,
2862,443.0,4.0,2.0,0.0,12.0,0.0,6.0,6.0,6.0,0.0,...,,,,,,,,,,
2877,443.0,1.0,2.0,0.0,12.0,0.0,6.0,6.0,6.0,0.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3018006,,36.0,1.0,1.0,,,6.0,6.0,6.0,0.0,...,6.0,6.0,6.0,6.0,6.0,9.0,237.0,256.0,0.0,20.0
3018008,,3.0,1.0,1.0,,,6.0,6.0,6.0,0.0,...,6.0,6.0,6.0,6.0,6.0,9.0,237.0,256.0,0.0,20.0
3018009,,61.0,1.0,1.0,,,0.0,0.0,0.0,0.0,...,6.0,0.0,6.0,0.0,6.0,3.0,29200.0,0.0,0.0,40.0
3018014,,53.0,1.0,1.0,,,0.0,0.0,0.0,0.0,...,6.0,0.0,6.0,0.0,6.0,3.0,29200.0,0.0,0.0,40.0


Exibindo os registros não duplicados

In [13]:
df[~df.duplicated()]

Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Protocol,Fwd Packets Length Total,Bwd Packets Length Total,Packet Length Min,Packet Length Max,Avg Packet Size,Init Fwd Win Bytes,Init Bwd Win Bytes,Fwd Act Data Packets,Fwd Seg Size Min
0,54865.0,3.0,2.0,0.0,12.0,0.0,6.0,6.0,6.000000,0.000000,...,,,,,,,,,,
1,55054.0,109.0,1.0,1.0,6.0,6.0,6.0,6.0,6.000000,0.000000,...,,,,,,,,,,
2,55055.0,52.0,1.0,1.0,6.0,6.0,6.0,6.0,6.000000,0.000000,...,,,,,,,,,,
3,46236.0,34.0,1.0,1.0,6.0,6.0,6.0,6.0,6.000000,0.000000,...,,,,,,,,,,
4,54863.0,3.0,2.0,0.0,12.0,0.0,6.0,6.0,6.000000,0.000000,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3018016,,56538478.0,2.0,1.0,,,60.0,6.0,33.000000,38.183765,...,6.0,66.0,6.0,6.0,60.0,26.000000,255.0,0.0,1.0,20.0
3018017,,5792089.0,3.0,1.0,,,0.0,0.0,0.000000,0.000000,...,6.0,0.0,0.0,0.0,0.0,0.000000,29200.0,28960.0,0.0,32.0
3018018,,224.0,2.0,1.0,,,14.0,0.0,7.000000,9.899495,...,6.0,14.0,0.0,0.0,14.0,9.333333,229.0,0.0,0.0,32.0
3018019,,210319.0,5.0,4.0,,,195.0,0.0,42.599998,85.233795,...,6.0,213.0,140.0,0.0,195.0,39.222221,8192.0,237.0,4.0,20.0


Descartando registros duplicados

In [14]:
initial_len = df.shape[0]
df = df.drop_duplicates()
print(f'Tamanho inicial: {initial_len}, tamanho final {df.shape[0]} | Descartadas {initial_len - df.shape[0]} duplicadas')

df = df.reset_index(drop=True) # Resetando index

Tamanho inicial: 3018022, tamanho final 2683411 | Descartadas 334611 duplicadas


#### Registros com valores não finitos

In [15]:
df_columns_isfinite = np.isfinite(df.drop(['Label'], axis='columns')).all(axis=0)
df_columns_isfinite[df_columns_isfinite == False]

Destination Port               False
Total Length of Fwd Packets    False
Total Length of Bwd Packets    False
Flow Bytes/s                   False
Flow Packets/s                 False
Bwd PSH Flags                  False
Fwd URG Flags                  False
Bwd URG Flags                  False
Min Packet Length              False
Max Packet Length              False
CWE Flag Count                 False
Average Packet Size            False
Fwd Header Length.1            False
Fwd Avg Bytes/Bulk             False
Fwd Avg Packets/Bulk           False
Fwd Avg Bulk Rate              False
Bwd Avg Bytes/Bulk             False
Bwd Avg Packets/Bulk           False
Bwd Avg Bulk Rate              False
Init_Win_bytes_forward         False
Init_Win_bytes_backward        False
act_data_pkt_fwd               False
min_seg_size_forward           False
Protocol                       False
Fwd Packets Length Total       False
Bwd Packets Length Total       False
Packet Length Min              False
P

In [16]:
df_rows_isfinite = np.isfinite(df.drop(['Label'], axis='columns')).all(axis=1)
inf_indexes = df_rows_isfinite[df_rows_isfinite == False].index
df.iloc[inf_indexes][['Flow Bytes/s', 'Flow Packets/s']]

Unnamed: 0,Flow Bytes/s,Flow Packets/s
0,4.000000e+06,666666.666700
1,1.100917e+05,18348.623850
2,2.307692e+05,38461.538460
3,3.529412e+05,58823.529410
4,4.000000e+06,666666.666700
...,...,...
2683406,1.273469e+00,0.053061
2683407,0.000000e+00,0.690597
2683408,6.250000e+04,13392.857422
2683409,1.678403e+03,42.792141


Transformando valores infinitos no maior valor finito encontrado na coluna, isso é feito para as duas colunas.

Isso é feito porque a quantidade de registros infinitos é insignificante, seria inviável fazer isso em um conjunto de dados com muitos registros infinitos, pois existiriam muitos valores máximos e isso poderia comprometer o treinamento.

In [17]:
max_finite_flow_packets_per_sec = df[np.isfinite(df['Flow Packets/s'])]['Flow Packets/s'].max()
max_finite_flow_bytes_per_sec = df[np.isfinite(df['Flow Bytes/s'])]['Flow Bytes/s'].max()

df.loc[df['Flow Packets/s'] == np.inf, 'Flow Packets/s'] = max_finite_flow_packets_per_sec
df.loc[df['Flow Bytes/s'] == np.inf, 'Flow Bytes/s'] = max_finite_flow_bytes_per_sec

df = df.reset_index(drop=True) # Resetando index

#### Registros com valores Null/NaN/NA

In [18]:
df.columns[df.isna().any(axis=0)]

Index(['Destination Port', 'Total Length of Fwd Packets',
       'Total Length of Bwd Packets', 'Flow Bytes/s', 'Bwd PSH Flags',
       'Fwd URG Flags', 'Bwd URG Flags', 'Min Packet Length',
       'Max Packet Length', 'CWE Flag Count', 'Average Packet Size',
       'Fwd Header Length.1', 'Fwd Avg Bytes/Bulk', 'Fwd Avg Packets/Bulk',
       'Fwd Avg Bulk Rate', 'Bwd Avg Bytes/Bulk', 'Bwd Avg Packets/Bulk',
       'Bwd Avg Bulk Rate', 'Init_Win_bytes_forward',
       'Init_Win_bytes_backward', 'act_data_pkt_fwd', 'min_seg_size_forward',
       'Protocol', 'Fwd Packets Length Total', 'Bwd Packets Length Total',
       'Packet Length Min', 'Packet Length Max', 'Avg Packet Size',
       'Init Fwd Win Bytes', 'Init Bwd Win Bytes', 'Fwd Act Data Packets',
       'Fwd Seg Size Min'],
      dtype='object')

In [19]:
df[df.isna().any(axis=1)][['Flow Bytes/s']]

Unnamed: 0,Flow Bytes/s
0,4.000000e+06
1,1.100917e+05
2,2.307692e+05
3,3.529412e+05
4,4.000000e+06
...,...
2683406,1.273469e+00
2683407,0.000000e+00
2683408,6.250000e+04
2683409,1.678403e+03


Preenchendo registros NaN/Null/NA com a média dos valores de cada coluna.

In [20]:
for column in tqdm(df.columns):
    if column != "Label":
        column_mean = df[column].mean()
        df[column].fillna(column_mean, inplace=True)

df = df.reset_index(drop=True) # Resetando index

100%|██████████| 89/89 [00:00<00:00, 102.17it/s]


#### Features correlacionadas

In [21]:
def get_highly_correlated_features(correlation_matrix, threshold):
  correlated_pairs = []
  for i in range(len(correlation_matrix.columns)):
    for j in range(i):
      if abs(correlation_matrix.iloc[i, j]) > threshold:
        pair = (correlation_matrix.columns[i], correlation_matrix.columns[j])
        coefficient = correlation_matrix.iloc[i, j]
        correlated_pairs.append((pair, coefficient))
  return sorted(correlated_pairs, key= lambda pair: pair[1], reverse=True)


Coletando as features correlacionadas, com o objetivo de evitar a redundância no treinamento do modelo.

In [22]:
df_without_Label = df.drop('Label', axis='columns')

In [23]:
corr_matrix = df_without_Label.corr().abs()
correlation_list = get_highly_correlated_features(corr_matrix, 0.95)

# Limpeza
del df_without_Label

In [24]:
correlation_list[:10]

[(('Avg Bwd Segment Size', 'Bwd Packet Length Mean'), 1.0000000000000002),
 (('SYN Flag Count', 'Fwd PSH Flags'), 1.0),
 (('CWE Flag Count', 'Fwd URG Flags'), 1.0),
 (('Avg Fwd Segment Size', 'Fwd Packet Length Mean'), 1.0),
 (('Subflow Fwd Packets', 'Total Fwd Packets'), 1.0),
 (('Subflow Bwd Packets', 'Total Backward Packets'), 1.0),
 (('ECE Flag Count', 'RST Flag Count'), 0.9998932970661557),
 (('Fwd Header Length.1', 'Fwd Header Length'), 0.9995963815040403),
 (('Total Backward Packets', 'Total Fwd Packets'), 0.99905808159376),
 (('Subflow Bwd Packets', 'Total Fwd Packets'), 0.99905808159376)]

Criando uma lista do que será dropado.

In [25]:
f2drop = []
for feature_pair, _ in correlation_list:
  if feature_pair[0] not in f2drop and feature_pair[1] not in f2drop:
    f2drop.append(feature_pair[1])

Exibindo as features correlacionadas.

In [26]:
f2drop

['Bwd Packet Length Mean',
 'Fwd PSH Flags',
 'Fwd URG Flags',
 'Fwd Packet Length Mean',
 'Total Fwd Packets',
 'Total Backward Packets',
 'RST Flag Count',
 'Fwd Header Length',
 'Subflow Fwd Packets',
 'Flow Duration',
 'Flow IAT Max',
 'Subflow Bwd Packets',
 'Idle Mean',
 'Fwd IAT Max',
 'Packet Length Mean',
 'Bwd Packet Length Max',
 'Max Packet Length',
 'Fwd Packet Length Max',
 'Total Length of Bwd Packets',
 'Idle Max']

Adicionando a feature **Destination Port** à lista de features a serem dropadas, porque não existe muita correlação entre o tamanho do número da porta com a função da porta, por exemplo, não existe algo do tipo "portas maiores tem uma função mais pra X, enquanto portas menores servem mais pra Y".

In [27]:
f2drop += ['Destination Port']

Removendo as features que atrapalham o modelo de aprendizagem

In [28]:
df = df.drop(f2drop, axis='columns')

#### Normalização dos dados

In [30]:
std_scaler = StandardScaler()
numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
df[numeric_cols] = std_scaler.fit_transform(df[numeric_cols])
#df = pd.DataFrame(std_scaler.fit_transform(df), columns=df.columns)

#### Limpeza de variáveis

In [31]:
del f2drop
del corr_matrix
del correlation_list
del df_columns_isfinite
del df_rows_isfinite
del inf_indexes
del max_finite_flow_packets_per_sec
del max_finite_flow_bytes_per_sec
del initial_len
del column_mean
del numeric_cols
del std_scaler

### Dados após o tratamento

Exibindo o **.describe()**.

In [35]:
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)
df.describe(include='all')

Unnamed: 0,Total Length of Fwd Packets,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Min,Bwd Packet Length Std,Flow Bytes/s,Flow Packets/s,Flow IAT Mean,Flow IAT Std,Flow IAT Min,Fwd IAT Total,Fwd IAT Mean,Fwd IAT Std,Fwd IAT Min,Bwd IAT Total,Bwd IAT Mean,Bwd IAT Std,Bwd IAT Max,Bwd IAT Min,Bwd PSH Flags,Bwd URG Flags,Bwd Header Length,Fwd Packets/s,Bwd Packets/s,Min Packet Length,Packet Length Std,Packet Length Variance,FIN Flag Count,SYN Flag Count,PSH Flag Count,ACK Flag Count,URG Flag Count,CWE Flag Count,ECE Flag Count,Down/Up Ratio,Average Packet Size,Avg Fwd Segment Size,Avg Bwd Segment Size,Fwd Header Length.1,Fwd Avg Bytes/Bulk,Fwd Avg Packets/Bulk,Fwd Avg Bulk Rate,Bwd Avg Bytes/Bulk,Bwd Avg Packets/Bulk,Bwd Avg Bulk Rate,Subflow Fwd Bytes,Subflow Bwd Bytes,Init_Win_bytes_forward,Init_Win_bytes_backward,act_data_pkt_fwd,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Std,Idle Min,Label,Protocol,Fwd Packets Length Total,Bwd Packets Length Total,Packet Length Min,Packet Length Max,Avg Packet Size,Init Fwd Win Bytes,Init Bwd Win Bytes,Fwd Act Data Packets,Fwd Seg Size Min
count,2683411.0,2683411.0,2683411.0,2683411.0,2683411.0,2683411.0,2683411.0,2683411.0,2683411.0,2683411.0,2683411.0,2683411.0,2683411.0,2683411.0,2683411.0,2683411.0,2683411.0,2683411.0,2683411.0,2683411.0,2683411.0,2683411.0,2683411.0,2683411.0,2683411.0,2683411.0,2683411.0,2683411.0,2683411.0,2683411.0,2683411.0,2683411.0,2683411.0,2683411.0,2683411.0,2683411.0,2683411.0,2683411.0,2683411.0,2683411.0,2683411.0,2683411.0,2683411.0,2683411.0,2683411.0,2683411.0,2683411.0,2683411.0,2683411.0,2683411.0,2683411.0,2683411.0,2683411.0,2683411.0,2683411.0,2683411.0,2683411.0,2683411,2683411.0,2683411.0,2683411.0,2683411.0,2683411.0,2683411.0,2683411.0,2683411.0,2683411.0,2683411.0
unique,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,22,,,,,,,,,,
top,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,BENIGN,,,,,,,,,,
freq,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2096484,,,,,,,,,,
mean,-5.99312e-17,-1.1566070000000001e-17,-2.818434e-16,-6.049942e-17,1.689578e-16,-1.6830110000000002e-17,2.106464e-16,-1.453755e-16,1.794647e-16,-1.5675620000000001e-18,6.378282000000001e-17,-5.420799e-17,2.889822e-16,-4.554402999999999e-19,2.0611320000000003e-17,-1.191347e-16,-6.883503000000001e-17,4.9399380000000006e-17,1.061282e-17,0.0,0.0,-9.214722e-19,8.752927000000001e-17,8.095517000000001e-17,4.262877e-16,1.7349100000000003e-17,-6.994716e-17,4.382818e-17,8.753986e-18,6.397347000000001e-17,2.7983100000000003e-17,-5.486467e-18,5.376289999999999e-19,1.226299e-16,-2.10138e-17,-9.128137000000001e-17,6.725687999999999e-19,-1.463552e-16,-1.6235800000000001e-18,0.0,0.0,0.0,0.0,0.0,0.0,8.473307999999999e-19,-2.1183269999999998e-20,-5.331711e-16,2.939968e-17,-3.165043e-18,1.180728e-18,-2.5411980000000004e-17,-7.276453e-18,1.0167970000000001e-17,-6.0292880000000004e-18,-4.548578e-17,1.8567140000000002e-17,,-2.414215e-14,-3.918058e-16,-8.553804000000001e-17,3.15546e-15,-3.382545e-15,3.311369e-16,1.201041e-14,-7.254677e-15,1.956487e-16,-3.9861620000000006e-17
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
min,-0.05959353,-0.3180452,-0.2631412,-0.610725,-0.423049,-5.159048,-9.266356,-0.301697,-0.3838163,-0.05953342,-0.4597054,-0.288905,-0.3578752,-0.125239,-0.3694809,-0.216276,-0.2537148,-0.2921925,-0.1225324,0.0,0.0,-719.8805,-0.2072836,-0.1683633,-0.6783865,-0.4952128,-0.3065575,-0.1777356,-0.2293459,-0.6594143,-0.6655088,-0.3322467,-0.005808822,-0.1119493,-1.003002,-0.6339378,-0.3283441,-0.5337924,-1489.714,0.0,0.0,0.0,0.0,0.0,0.0,-0.04539528,-0.00748267,-0.5134057,-0.2578254,-0.009188102,-481.7651,-0.1026217,-0.08461909,-0.128929,-0.08442384,-0.1159003,-0.3586239,,-7.493099,-0.09489807,-0.02564499,-2.386949,-2.314561,-3.012793,-2.046924,-1.563979,-0.03652719,-1638.107
25%,-0.05725491,-0.3180452,-0.2631412,-0.610725,-0.423049,-0.04475867,-0.21712,-0.301676,-0.3838163,-0.05952813,-0.4597053,-0.2889049,-0.3578752,-0.1252376,-0.3694809,-0.216276,-0.2537148,-0.2921925,-0.1225324,0.0,0.0,0.001612486,-0.2072772,-0.1683588,-0.6783865,-0.4916335,-0.3065544,-0.1777356,-0.2293459,-0.6594143,-0.6655088,-0.3322467,-0.005808822,-0.1119493,-1.003002,-0.6070648,-0.2972039,-0.5240814,0.001351061,0.0,0.0,0.0,0.0,0.0,0.0,-0.04452748,-0.007480174,-0.5134057,-0.2578254,-0.009188102,0.002782101,-0.1026217,-0.08461909,-0.128929,-0.08442384,-0.1159003,-0.3586239,,-2.415632e-14,-3.918821e-16,-8.573731e-17,3.128265e-15,-3.374424e-15,3.541526e-16,1.199809e-14,-7.249413e-15,1.955376e-16,-3.9961480000000005e-17
50%,-0.05277254,-0.2847153,-0.2631412,-0.610725,-0.423049,-0.04469012,-0.216821,-0.2973861,-0.3822992,-0.05952782,-0.459704,-0.2889,-0.3578752,-0.1252373,-0.3694808,-0.2162757,-0.2537148,-0.2921923,-0.1225322,0.0,0.0,0.001625895,-0.2070888,-0.1678741,-0.4364439,-0.4448198,-0.3059323,-0.1777356,-0.2293459,-0.6594143,-0.6655088,-0.3322467,-0.005808822,-0.1119493,0.4359801,-0.3764052,-0.1363132,-0.390477,0.001352541,0.0,0.0,0.0,0.0,0.0,0.0,-0.0406224,-0.007418621,-0.495248,-0.2577099,-0.007658257,0.002782101,-0.1026217,-0.08461909,-0.128929,-0.08442384,-0.1159003,-0.3586239,,-2.415632e-14,-3.918821e-16,-8.573731e-17,3.128265e-15,-3.374424e-15,3.541526e-16,1.199809e-14,-7.249413e-15,1.955376e-16,-3.9961480000000005e-17
75%,-0.02227298,0.2985574,-0.006237436,0.5524662,-0.1130776,-0.04276829,-0.1397857,-0.1881548,-0.2081058,-0.05950604,-0.3216976,-0.2157761,-0.3126934,-0.1252322,-0.3643625,-0.2129372,-0.2490001,-0.2849163,-0.122527,0.0,0.0,0.001687576,-0.1590033,-0.1247319,0.732945,-0.001380151,-0.2465171,-0.1777356,-0.2293459,1.516497,1.50261,-0.3322467,-0.005808822,-0.1119493,0.4359801,-3.394559e-16,-0.05698001,-0.1813645,0.001355686,0.0,0.0,0.0,0.0,0.0,0.0,-0.02095237,-0.007059702,0.06544835,-0.2284828,-0.004598567,0.002792869,-0.1026217,-0.08461909,-0.128929,-0.08442384,-0.1159003,-0.3586239,,-2.415632e-14,-3.918821e-16,-8.573731e-17,3.128265e-15,-3.374424e-15,3.541526e-16,1.199809e-14,-7.249413e-15,1.955376e-16,-3.9961480000000005e-17


In [36]:
pd.set_option('display.max_columns', 10)
pd.set_option('display.max_rows', 10)

Exibindo o **.info()**.

In [37]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2683411 entries, 0 to 2683410
Data columns (total 68 columns):
 #   Column                       Dtype  
---  ------                       -----  
 0   Total Length of Fwd Packets  float64
 1   Fwd Packet Length Min        float64
 2   Fwd Packet Length Std        float64
 3   Bwd Packet Length Min        float64
 4   Bwd Packet Length Std        float64
 5   Flow Bytes/s                 float64
 6   Flow Packets/s               float64
 7   Flow IAT Mean                float64
 8   Flow IAT Std                 float64
 9   Flow IAT Min                 float64
 10  Fwd IAT Total                float64
 11  Fwd IAT Mean                 float64
 12  Fwd IAT Std                  float64
 13  Fwd IAT Min                  float64
 14  Bwd IAT Total                float64
 15  Bwd IAT Mean                 float64
 16  Bwd IAT Std                  float64
 17  Bwd IAT Max                  float64
 18  Bwd IAT Min                  float64
 19  

# Dividindo os dados (treino, validação, teste)

In [None]:
df_train = df.query('Label == "BENIGN"').sample(frac=0.6)#, random_state=RANDOM_SEED)
df_val_test = df.drop(df_train.index)

df_train = df_train.reset_index(drop=True)
df_val_test = df_val_test.reset_index(drop=True)

X_train = df_train.drop('Label', axis='columns')

In [None]:
X_val, X_test, classes_val, classes_test = train_test_split(df_val_test.drop('Label', axis='columns'), df_val_test['Label'], test_size=0.65, stratify=df_val_test['Label']) #random_state=RANDOM_SEED)

X_val, X_test = X_val.reset_index(drop=True), X_test.reset_index(drop=True)
classes_val, classes_test =  classes_val.reset_index(drop=True), classes_test.reset_index(drop=True)

y_val, y_test = classes_val.apply(lambda c: 0 if c == 'BENIGN' else 1), classes_test.apply(lambda c: 0 if c == 'BENIGN' else 1)

In [None]:
del df_train, df_val_test