# Importações

In [1]:
import os
import re
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.datasets import load_digits
from FirstStage import FirstStage
from SecondStage import SecondStage
from Extension import Extension

from tqdm import tqdm


# Dados

### Carregando os dados

In [2]:
cic_ids_2017 = pd.read_parquet("data/cic_ids_2017.parquet")
infiltration_2018 = pd.read_parquet("data/infiltration_2018.parquet")
testp = pd.read_parquet("data/test.parquet")
cic_collection = pd.read_parquet("data/cic-collection.parquet")

Exibindo o **.info()**

In [3]:
cic_ids_2017.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2830743 entries, 0 to 2830742
Data columns (total 79 columns):
 #   Column                       Dtype  
---  ------                       -----  
 0   Destination Port             int64  
 1   Flow Duration                int64  
 2   Total Fwd Packets            int64  
 3   Total Backward Packets       int64  
 4   Total Length of Fwd Packets  int64  
 5   Total Length of Bwd Packets  int64  
 6   Fwd Packet Length Max        int64  
 7   Fwd Packet Length Min        int64  
 8   Fwd Packet Length Mean       float64
 9   Fwd Packet Length Std        float64
 10  Bwd Packet Length Max        int64  
 11  Bwd Packet Length Min        int64  
 12  Bwd Packet Length Mean       float64
 13  Bwd Packet Length Std        float64
 14  Flow Bytes/s                 float64
 15  Flow Packets/s               float64
 16  Flow IAT Mean                float64
 17  Flow IAT Std                 float64
 18  Flow IAT Max                 int64  
 19  

In [4]:
infiltration_2018.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 127844 entries, 0 to 127843
Data columns (total 68 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   Protocol                  127844 non-null  float64
 1   Flow Duration             127844 non-null  float64
 2   Total Fwd Packets         127844 non-null  float64
 3   Total Backward Packets    127844 non-null  float64
 4   Fwd Packets Length Total  127844 non-null  float64
 5   Bwd Packets Length Total  127844 non-null  float64
 6   Fwd Packet Length Max     127844 non-null  float64
 7   Fwd Packet Length Min     127844 non-null  float64
 8   Fwd Packet Length Mean    127844 non-null  float64
 9   Fwd Packet Length Std     127844 non-null  float64
 10  Bwd Packet Length Max     127844 non-null  float64
 11  Bwd Packet Length Min     127844 non-null  float64
 12  Bwd Packet Length Mean    127844 non-null  float64
 13  Bwd Packet Length Std     127844 non-null  f

In [5]:
testp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59435 entries, 0 to 59434
Data columns (total 68 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Protocol                  59435 non-null  float32
 1   Flow Duration             59435 non-null  float64
 2   Total Fwd Packets         59435 non-null  float64
 3   Total Backward Packets    59435 non-null  float64
 4   Fwd Packets Length Total  59435 non-null  float64
 5   Bwd Packets Length Total  59435 non-null  float64
 6   Fwd Packet Length Max     59435 non-null  float32
 7   Fwd Packet Length Min     59435 non-null  float32
 8   Fwd Packet Length Mean    59435 non-null  float32
 9   Fwd Packet Length Std     59435 non-null  float32
 10  Bwd Packet Length Max     59435 non-null  float32
 11  Bwd Packet Length Min     59435 non-null  float32
 12  Bwd Packet Length Mean    59435 non-null  float32
 13  Bwd Packet Length Std     59435 non-null  float32
 14  Flow B

### Padronizando e limpando os dados

#### Concatenando os dados

Mudando o nome das colunas **y** e **Y** para **Label** em dois dos datagramas, pois são a mesma coisa.

In [6]:
infiltration_2018.rename(columns={'y': 'Label'}, inplace=True)
testp.rename(columns={'Y': 'Label'}, inplace=True)


In [7]:
df = pd.concat([cic_ids_2017, infiltration_2018, testp, cic_collection], axis=0)

In [8]:
# Limpeza
del cic_ids_2017
del infiltration_2018
del testp
del cic_collection

In [9]:
df = df.reset_index(drop=True)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12185603 entries, 0 to 12185602
Data columns (total 90 columns):
 #   Column                       Dtype  
---  ------                       -----  
 0   Destination Port             float64
 1   Flow Duration                float64
 2   Total Fwd Packets            float64
 3   Total Backward Packets       float64
 4   Total Length of Fwd Packets  float64
 5   Total Length of Bwd Packets  float64
 6   Fwd Packet Length Max        float64
 7   Fwd Packet Length Min        float64
 8   Fwd Packet Length Mean       float64
 9   Fwd Packet Length Std        float64
 10  Bwd Packet Length Max        float64
 11  Bwd Packet Length Min        float64
 12  Bwd Packet Length Mean       float64
 13  Bwd Packet Length Std        float64
 14  Flow Bytes/s                 float64
 15  Flow Packets/s               float64
 16  Flow IAT Mean                float64
 17  Flow IAT Std                 float64
 18  Flow IAT Max                 float64
 19

#### Alterando Labels

In [11]:
pd.set_option('display.max_rows', 999)
values_count = df['Label'].value_counts()
print(values_count)

Label
Benign                        7242657
BENIGN                        2273097
DDoS-LOIC-HTTP                 575364
DoS-Hulk                       318740
DDoS                           256089
DoS Hulk                       231073
DDoS-HOIC                      198861
PortScan                       158930
Botnet                         146552
Infilteration                  127844
DDoS-NTP                       121328
DDoS-TFTP                       98833
Bruteforce-SSH                  97260
Infiltration                    94929
DoS-Goldeneye                   52324
DDoS-Syn                        47757
DDoS-UDP                        28863
DoS-Slowloris                   15243
DDoS-MSSQL                      11784
DoS GoldenEye                   10293
DDoS-UDPLag                      8452
FTP-Patator                      7938
Bruteforce-FTP                   5984
SSH-Patator                      5897
DoS slowloris                    5796
DoS Slowhttptest                 5499
DoS-Sl

Padronizando labels e corrigindo labels redundantes/repetidas ou com caracteres irreconhecíveis.

In [12]:
label_changes = {'infilteration':'Infiltration', 
                 'Infilteration':'Infiltration', 
                 'Bot':'Botnet', 
                 'PortScan':'Port Scan', 
                 'Web Attack � Brute Force':'Web Attack-Brute Force', 
                 'Web Attack � XSS':'Web Attack-XSS', 
                 'Web Attack � Sql Injection':'Web Attack-Sql Injection', 
                 'DoS slowloris':'DoS Slowloris', 
                 'Benign':'BENIGN', 
                 'DoS Hulk':'DoS-Hulk', 
                 'DoS-Goldeneye':'DoS-GoldenEye', 
                 'DoS GoldenEye':'DoS-GoldenEye', 
                 'DoS Slowhttptest':'DoS-Slowhttptest', 
                 'DoS Slowloris':'DoS-Slowloris', 
                 'portscan':'Port Scan', 
                 'Webattack-bruteforce':'Web Attack-Bruteforce', 
                 'Webattack-XSS':'Web Attack-XSS', 
                 'Webattack-SQLi':'Web Attack-SQLi'}

df['Label'] = df['Label'].replace(label_changes)

Decidi dropar as linhas com **Web Attack**, pois tem poucas amostras, além de existirem outras amostras até maiores com web ataques específicos, eu pensei em juntar tudo em **Web Attack**, mas percebi que os ataques são muito diferentes pra classificar tudo como **Web Attack**.

In [13]:
df.drop(df[df['Label'] == 'Web Attack'].index, inplace=True)

Removendo a coluna **ClassLabel**, esta coluna exibe o tipo do ataque, mas a coluna **Label** exibe o tipo e o subtipo do ataque, ou seja, é uma redundância.

In [14]:
df.drop(columns=['ClassLabel'], inplace=True)

Resultado:

In [15]:
pd.set_option('display.max_rows', None)
values_count = df['Label'].value_counts()
print(values_count.sort_values())

Label
DoS-Heartbleed                   11
Web Attack-Sql Injection         21
Heartbleed                       22
Web Attack-SQLi                  99
Brute Force                     584
(D)DOS                          584
DoS-Slowbody                    621
DDoS-NetBIOS                    675
DoS-Rudy                        699
Web Attack-Brute Force         1507
Web Attack-XSS                 1528
DoS-Slowheaders                1649
DDoS-Slowloris                 1858
DDoS-SNMP                      2017
Web Attack-Bruteforce          2020
DDoS-LDAP                      2092
Portscan                       2255
DoS-Slowread                   2786
DDoS-DNS                       3668
DDoS-Ddossim                   5115
DoS Slowloris                  5796
SSH-Patator                    5897
Bruteforce-FTP                 5984
FTP-Patator                    7938
DDoS-UDPLag                    8452
DoS-Slowhttptest              10770
DDoS-MSSQL                    11784
DoS-Slowloris         

#### Registros duplicados

In [16]:
pd.set_option('display.max_rows', 10) # Retornando a um valor menor na exibição do Pandas

In [17]:
df[df.duplicated()]

Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Protocol,Fwd Packets Length Total,Bwd Packets Length Total,Packet Length Min,Packet Length Max,Avg Packet Size,Init Fwd Win Bytes,Init Bwd Win Bytes,Fwd Act Data Packets,Fwd Seg Size Min
2109,80.0,77.0,2.0,0.0,12.0,0.0,6.0,6.0,6.0,0.0,...,,,,,,,,,,
2257,443.0,3.0,2.0,0.0,12.0,0.0,6.0,6.0,6.0,0.0,...,,,,,,,,,,
2749,443.0,49.0,2.0,0.0,12.0,0.0,6.0,6.0,6.0,0.0,...,,,,,,,,,,
2862,443.0,4.0,2.0,0.0,12.0,0.0,6.0,6.0,6.0,0.0,...,,,,,,,,,,
2877,443.0,1.0,2.0,0.0,12.0,0.0,6.0,6.0,6.0,0.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12048726,,1.0,3.0,0.0,,,6.0,,6.0,0.0,...,,18.0,0.0,,6.0,8.0,0.0,-1.0,2.0,20.0
12054681,,158.0,1.0,1.0,,,6.0,,6.0,0.0,...,,6.0,6.0,,6.0,9.0,0.0,0.0,0.0,20.0
12129925,,644.0,1.0,1.0,,,6.0,,6.0,0.0,...,,6.0,6.0,,6.0,9.0,0.0,0.0,0.0,20.0
12152448,,428.0,1.0,1.0,,,6.0,,6.0,0.0,...,,6.0,6.0,,6.0,9.0,0.0,0.0,0.0,20.0


Exibindo os registros não duplicados

In [18]:
df[~df.duplicated()]

Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Protocol,Fwd Packets Length Total,Bwd Packets Length Total,Packet Length Min,Packet Length Max,Avg Packet Size,Init Fwd Win Bytes,Init Bwd Win Bytes,Fwd Act Data Packets,Fwd Seg Size Min
0,54865.0,3.0,2.0,0.0,12.0,0.0,6.0,6.0,6.000000,0.000000,...,,,,,,,,,,
1,55054.0,109.0,1.0,1.0,6.0,6.0,6.0,6.0,6.000000,0.000000,...,,,,,,,,,,
2,55055.0,52.0,1.0,1.0,6.0,6.0,6.0,6.0,6.000000,0.000000,...,,,,,,,,,,
3,46236.0,34.0,1.0,1.0,6.0,6.0,6.0,6.0,6.000000,0.000000,...,,,,,,,,,,
4,54863.0,3.0,2.0,0.0,12.0,0.0,6.0,6.0,6.000000,0.000000,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12185598,,44797921.0,6.0,5.0,,,6.0,,6.000000,0.000000,...,,36.0,0.0,,6.0,3.818182,256.0,290.0,5.0,20.0
12185599,,49.0,3.0,0.0,,,45.0,,25.333334,23.028967,...,,76.0,0.0,,45.0,40.333332,122.0,-1.0,1.0,32.0
12185600,,1286687.0,41.0,42.0,,,456.0,,64.975609,109.864571,...,,2664.0,6954.0,,976.0,115.879517,29200.0,243.0,24.0,32.0
12185601,,217.0,2.0,1.0,,,31.0,,15.500000,21.920311,...,,31.0,6.0,,31.0,22.666666,137.0,0.0,0.0,32.0


Descartando registros duplicados

In [19]:
initial_len = df.shape[0]
df = df.drop_duplicates()
print(f'Tamanho inicial: {initial_len}, tamanho final {df.shape[0]} | Descartadas {initial_len - df.shape[0]} duplicadas')

df = df.reset_index(drop=True) # Resetando index

Tamanho inicial: 12185019, tamanho final 11850098 | Descartadas 334921 duplicadas


#### Registros com valores não finitos

In [20]:
df_columns_isfinite = np.isfinite(df.drop(['Label'], axis='columns')).all(axis=0)
df_columns_isfinite[df_columns_isfinite == False]

Destination Port               False
Total Length of Fwd Packets    False
Total Length of Bwd Packets    False
Fwd Packet Length Min          False
Bwd Packet Length Min          False
                               ...  
Avg Packet Size                False
Init Fwd Win Bytes             False
Init Bwd Win Bytes             False
Fwd Act Data Packets           False
Fwd Seg Size Min               False
Length: 41, dtype: bool

In [21]:
df_rows_isfinite = np.isfinite(df.drop(['Label'], axis='columns')).all(axis=1)
inf_indexes = df_rows_isfinite[df_rows_isfinite == False].index
df.iloc[inf_indexes][['Flow Bytes/s', 'Flow Packets/s']]

Unnamed: 0,Flow Bytes/s,Flow Packets/s
0,4.000000e+06,666666.666700
1,1.100917e+05,18348.623850
2,2.307692e+05,38461.538460
3,3.529412e+05,58823.529410
4,4.000000e+06,666666.666700
...,...,...
11850093,8.036087e-01,0.245547
11850094,1.551020e+06,61224.489800
11850095,7.475011e+03,64.506753
11850096,1.705069e+05,13824.884790


Transformando valores infinitos no maior valor finito encontrado na coluna, isso é feito para as duas colunas.

Isso é feito porque a quantidade de registros infinitos é insignificante, seria inviável fazer isso em um conjunto de dados com muitos registros infinitos, pois existiriam muitos valores máximos e isso poderia comprometer o treinamento.

In [22]:
max_finite_flow_packets_per_sec = df[np.isfinite(df['Flow Packets/s'])]['Flow Packets/s'].max()
max_finite_flow_bytes_per_sec = df[np.isfinite(df['Flow Bytes/s'])]['Flow Bytes/s'].max()

df.loc[df['Flow Packets/s'] == np.inf, 'Flow Packets/s'] = max_finite_flow_packets_per_sec
df.loc[df['Flow Bytes/s'] == np.inf, 'Flow Bytes/s'] = max_finite_flow_bytes_per_sec

df = df.reset_index(drop=True) # Resetando index

#### Registros com valores Null/NaN/NA

In [23]:
df.columns[df.isna().any(axis=0)]

Index(['Destination Port', 'Total Length of Fwd Packets',
       'Total Length of Bwd Packets', 'Fwd Packet Length Min',
       'Bwd Packet Length Min', 'Flow Bytes/s', 'Bwd PSH Flags',
       'Fwd URG Flags', 'Bwd URG Flags', 'Min Packet Length',
       'Max Packet Length', 'FIN Flag Count', 'RST Flag Count',
       'PSH Flag Count', 'ACK Flag Count', 'CWE Flag Count', 'ECE Flag Count',
       'Down/Up Ratio', 'Average Packet Size', 'Fwd Header Length.1',
       'Fwd Avg Bytes/Bulk', 'Fwd Avg Packets/Bulk', 'Fwd Avg Bulk Rate',
       'Bwd Avg Bytes/Bulk', 'Bwd Avg Packets/Bulk', 'Bwd Avg Bulk Rate',
       'Init_Win_bytes_forward', 'Init_Win_bytes_backward', 'act_data_pkt_fwd',
       'min_seg_size_forward', 'Protocol', 'Fwd Packets Length Total',
       'Bwd Packets Length Total', 'Packet Length Min', 'Packet Length Max',
       'Avg Packet Size', 'Init Fwd Win Bytes', 'Init Bwd Win Bytes',
       'Fwd Act Data Packets', 'Fwd Seg Size Min'],
      dtype='object')

In [24]:
df[df.isna().any(axis=1)][['Flow Bytes/s']]

Unnamed: 0,Flow Bytes/s
0,4.000000e+06
1,1.100917e+05
2,2.307692e+05
3,3.529412e+05
4,4.000000e+06
...,...
11850093,8.036087e-01
11850094,1.551020e+06
11850095,7.475011e+03
11850096,1.705069e+05


Preenchendo registros NaN/Null/NA com a média dos valores de cada coluna.

In [25]:
for column in tqdm(df.columns):
    if column != "Label":
        column_median = df[column].median()
        df[column].fillna(column_median, inplace=True)

df = df.reset_index(drop=True) # Resetando index

100%|██████████| 89/89 [00:09<00:00,  9.85it/s]


#### Features correlacionadas

In [26]:
def get_highly_correlated_features(correlation_matrix, threshold):
  correlated_pairs = []
  for i in range(len(correlation_matrix.columns)):
    for j in range(i):
      if abs(correlation_matrix.iloc[i, j]) > threshold:
        pair = (correlation_matrix.columns[i], correlation_matrix.columns[j])
        coefficient = correlation_matrix.iloc[i, j]
        correlated_pairs.append((pair, coefficient))
  return sorted(correlated_pairs, key= lambda pair: pair[1], reverse=True)


Coletando as features correlacionadas, com o objetivo de evitar a redundância no treinamento do modelo.

In [27]:
df_without_Label = df.drop('Label', axis='columns')

In [28]:
corr_matrix = df_without_Label.corr().abs()
correlation_list = get_highly_correlated_features(corr_matrix, 0.95)

# Limpeza
del df_without_Label

In [29]:
correlation_list[:10]

[(('Avg Bwd Segment Size', 'Bwd Packet Length Mean'), 1.0000000000000009),
 (('CWE Flag Count', 'Fwd URG Flags'), 1.0),
 (('Avg Fwd Segment Size', 'Fwd Packet Length Mean'), 1.0),
 (('Subflow Fwd Packets', 'Total Fwd Packets'), 1.0),
 (('Subflow Bwd Packets', 'Total Backward Packets'), 1.0),
 (('Fwd IAT Max', 'Flow IAT Max'), 0.9999870380168029),
 (('Fwd IAT Total', 'Flow Duration'), 0.999974341814589),
 (('Fwd IAT Min', 'Flow IAT Min'), 0.9999721070393633),
 (('Fwd IAT Std', 'Flow IAT Std'), 0.9999326855111909),
 (('ECE Flag Count', 'RST Flag Count'), 0.9998943214795447)]

Criando uma lista do que será dropado.

In [30]:
f2drop = []
for feature_pair, _ in correlation_list:
  if feature_pair[0] not in f2drop and feature_pair[1] not in f2drop:
    f2drop.append(feature_pair[1])

Exibindo as features correlacionadas.

In [31]:
f2drop

['Bwd Packet Length Mean',
 'Fwd URG Flags',
 'Fwd Packet Length Mean',
 'Total Fwd Packets',
 'Total Backward Packets',
 'Flow IAT Max',
 'Flow Duration',
 'Flow IAT Min',
 'Flow IAT Std',
 'RST Flag Count',
 'Flow IAT Mean',
 'Subflow Fwd Bytes',
 'Idle Std',
 'Subflow Bwd Packets',
 'Idle Mean',
 'Bwd Packet Length Max',
 'Subflow Fwd Packets',
 'Fwd IAT Std',
 'Fwd Packet Length Max']

Adicionando a feature **Destination Port** à lista de features a serem dropadas, porque não existe muita correlação entre o tamanho do número da porta com a função da porta, por exemplo, não existe algo do tipo "portas maiores tem uma função mais pra X, enquanto portas menores servem mais pra Y".

In [32]:
f2drop += ['Destination Port']

Removendo as features que atrapalham o modelo de aprendizagem

In [33]:
df = df.drop(f2drop, axis='columns')

#### Normalização dos dados

In [34]:
std_scaler = StandardScaler()
numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
df[numeric_cols] = std_scaler.fit_transform(df[numeric_cols])
#df = pd.DataFrame(std_scaler.fit_transform(df), columns=df.columns)

#### Limpeza de variáveis

In [36]:
del f2drop
del corr_matrix
del correlation_list
del df_columns_isfinite
del df_rows_isfinite
del inf_indexes
del max_finite_flow_packets_per_sec
del max_finite_flow_bytes_per_sec
del initial_len
del numeric_cols
del std_scaler
del label_changes
del values_count

NameError: name 'f2drop' is not defined

### Dados após o tratamento

Exibindo o **.describe()**.

In [37]:
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)
df.describe(include='all')

Unnamed: 0,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Min,Bwd Packet Length Std,Flow Bytes/s,Flow Packets/s,Fwd IAT Total,Fwd IAT Mean,Fwd IAT Max,Fwd IAT Min,Bwd IAT Total,Bwd IAT Mean,Bwd IAT Std,Bwd IAT Max,Bwd IAT Min,Fwd PSH Flags,Bwd PSH Flags,Bwd URG Flags,Fwd Header Length,Bwd Header Length,Fwd Packets/s,Bwd Packets/s,Min Packet Length,Max Packet Length,Packet Length Mean,Packet Length Std,Packet Length Variance,FIN Flag Count,SYN Flag Count,PSH Flag Count,ACK Flag Count,URG Flag Count,CWE Flag Count,ECE Flag Count,Down/Up Ratio,Average Packet Size,Avg Fwd Segment Size,Avg Bwd Segment Size,Fwd Header Length.1,Fwd Avg Bytes/Bulk,Fwd Avg Packets/Bulk,Fwd Avg Bulk Rate,Bwd Avg Bytes/Bulk,Bwd Avg Packets/Bulk,Bwd Avg Bulk Rate,Subflow Bwd Bytes,Init_Win_bytes_forward,Init_Win_bytes_backward,act_data_pkt_fwd,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Max,Idle Min,Label,Protocol,Fwd Packets Length Total,Bwd Packets Length Total,Packet Length Min,Packet Length Max,Avg Packet Size,Init Fwd Win Bytes,Init Bwd Win Bytes,Fwd Act Data Packets,Fwd Seg Size Min
count,11850100.0,11850100.0,11850100.0,11850100.0,11850100.0,11850100.0,11850100.0,11850100.0,11850100.0,11850100.0,11850100.0,11850100.0,11850100.0,11850100.0,11850100.0,11850100.0,11850100.0,11850100.0,11850098.0,11850098.0,11850100.0,11850100.0,11850100.0,11850100.0,11850100.0,11850100.0,11850100.0,11850100.0,11850100.0,11850100.0,11850100.0,11850100.0,11850100.0,11850100.0,11850100.0,11850100.0,11850100.0,11850100.0,11850100.0,11850100.0,11850100.0,11850098.0,11850098.0,11850098.0,11850098.0,11850098.0,11850098.0,11850100.0,11850100.0,11850100.0,11850100.0,11850100.0,11850100.0,11850100.0,11850100.0,11850100.0,11850100.0,11850100.0,11850098,11850100.0,11850100.0,11850100.0,11850100.0,11850100.0,11850100.0,11850100.0,11850100.0,11850100.0,11850100.0
unique,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,42,,,,,,,,,,
top,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,BENIGN,,,,,,,,,,
freq,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,9336752,,,,,,,,,,
mean,3.598878e-16,6.283604e-18,-1.93172e-15,7.328663000000001e-17,-3.679087e-15,6.224711e-16,-1.62662e-17,-5.450207000000001e-17,3.968663e-18,-1.386117e-17,2.245536e-18,5.092181e-19,-1.310506e-17,-6.230899e-17,-1.156142e-16,7.252872e-18,2.9668660000000004e-17,1.511015e-18,0.0,0.0,-1.014539e-17,-2.2752170000000002e-18,-3.9176060000000003e-17,-1.764338e-16,3.617149e-15,9.635192e-16,-9.668962e-16,-8.071315e-16,-8.227597e-17,1.451256e-15,1.575773e-18,1.294657e-15,3.799124e-17,-1.41479e-16,-6.432817000000001e-17,-7.652452e-16,1.068062e-15,3.411105e-15,4.058874e-16,-1.136782e-15,5.229866e-18,0.0,0.0,0.0,0.0,0.0,0.0,-4.5720199999999995e-20,2.543744e-15,-4.660162e-16,-1.8346690000000003e-17,-4.201761e-18,5.949562e-17,1.189625e-16,-2.8331530000000004e-17,-1.1920230000000001e-17,-3.185124e-18,-7.8237e-18,,6.002424e-16,-1.2871810000000001e-17,-5.600125e-18,-4.800039e-16,2.917201e-16,-6.462725e-16,1.95463e-16,-2.008931e-16,1.422273e-18,9.895710000000001e-17
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
min,-0.03725562,-0.003599777,-0.1993741,-0.4382963,-0.2560248,-0.4440539,-4.090027,-14.41003,-1589.784,-3174.802,-1072.876,-1082.751,-0.3428937,-0.1993493,-0.2542569,-0.2772788,-0.09564411,-0.1920655,0.0,0.0,-260.8076,-1502.618,-0.1273563,-0.1143483,-0.3885026,-0.2906605,-0.6403185,-0.5300755,-0.2291891,-0.08356312,-0.2109413,-0.2713442,-0.2733082,-0.2306884,-0.002598277,-0.05301626,-2.630298,-0.6464832,-0.4971481,-0.512603,-3130.547,0.0,0.0,0.0,0.0,0.0,0.0,-0.007358968,-0.2383451,-0.1127797,-0.006640306,-1012.403,-0.08232329,-0.06754923,-0.1024218,-0.06781397,-0.01221315,-0.102146,,-7.798134,-0.0221183,-0.006877409,-0.05894981,-0.551234,-0.7153695,-0.5090679,-0.3759625,-0.01571682,-47.43985
25%,-0.0237549,-0.003459646,-0.1314311,-0.4382963,-0.2560248,-0.4440539,-0.04364084,-0.1382484,-0.02681793,-0.01771492,-0.01297457,-0.002945882,-0.3428937,-0.1993493,-0.2542569,-0.2772788,-0.09564411,-0.1920655,0.0,0.0,0.008212524,0.0037544,-0.1273489,-0.1143422,-0.2377819,-0.1939554,-0.5475769,-0.5225124,-0.2291789,-0.08356312,-0.2109413,-0.2713442,-0.2733082,-0.2306884,-0.002598277,-0.05301626,0.1936165,-0.1664668,-0.4542182,-0.512603,0.0006049966,0.0,0.0,0.0,0.0,0.0,0.0,-0.007358968,-0.2039272,-0.1125427,-0.003425495,0.001244441,-0.08232329,-0.06754923,-0.1024218,-0.06781397,-0.01221315,-0.102146,,-0.06860029,-0.02156165,-0.006809563,-0.05894981,-0.4755358,-0.4146555,-0.5016269,-0.3759625,-0.01516356,0.02794239
50%,-0.0237549,-0.003459646,-0.1314311,-0.4146159,-0.2560248,-0.4440539,-0.04362289,-0.1380496,-0.02674894,-0.01764615,-0.01293202,-0.002945858,-0.342892,-0.1993423,-0.2542569,-0.2772754,-0.09564364,-0.1920655,0.0,0.0,0.008212564,0.003756521,-0.1272309,-0.1141826,-0.2377819,-0.1939554,-0.3335577,-0.4045132,-0.2263682,-0.08356312,-0.2109413,-0.2713442,-0.2733082,-0.2306884,-0.002598277,-0.05301626,0.1936165,-0.1664668,-0.2266896,-0.2826756,0.0006049966,0.0,0.0,0.0,0.0,0.0,0.0,-0.007230698,-0.2039272,-0.1125427,-0.003425495,0.001244441,-0.08232329,-0.06754923,-0.1024218,-0.06781397,-0.01221315,-0.102146,,-0.06860029,-0.02080136,-0.006679023,-0.05894981,-0.340556,-0.2300709,-0.3889576,-0.3688985,-0.01516356,0.02794239
75%,-0.0237549,-0.003459646,-0.1314311,0.2171433,-0.2560248,0.2260318,-0.0430501,-0.1324598,-0.01864446,-0.01366623,-0.007509514,-0.002945538,-0.3072512,-0.1678611,-0.2310316,-0.2287598,-0.09561888,-0.1920655,0.0,0.0,0.008212681,0.003765005,-0.1238419,-0.1108896,-0.2377819,-0.1939554,0.01466921,0.1673368,-0.1421626,-0.08356312,-0.2109413,-0.2713442,-0.2733082,-0.2306884,-0.002598277,-0.05301626,0.1936165,-0.1664668,0.08669891,-0.02958806,0.0006049966,0.0,0.0,0.0,0.0,0.0,0.0,-0.006758716,-0.2039272,-0.1125427,-0.003425495,0.001244441,-0.08232329,-0.06754923,-0.1024218,-0.06781397,-0.01221315,-0.102146,,-0.06860029,-0.01408092,-0.00637758,-0.05894981,0.3015101,0.0638119,-0.02903697,-0.3628599,-0.0146103,0.02794239


In [38]:
pd.set_option('display.max_columns', 10)
pd.set_option('display.max_rows', 10)

Exibindo o **.info()**.

In [39]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11850098 entries, 0 to 11850097
Data columns (total 69 columns):
 #   Column                       Dtype  
---  ------                       -----  
 0   Total Length of Fwd Packets  float64
 1   Total Length of Bwd Packets  float64
 2   Fwd Packet Length Min        float64
 3   Fwd Packet Length Std        float64
 4   Bwd Packet Length Min        float64
 5   Bwd Packet Length Std        float64
 6   Flow Bytes/s                 float64
 7   Flow Packets/s               float64
 8   Fwd IAT Total                float64
 9   Fwd IAT Mean                 float64
 10  Fwd IAT Max                  float64
 11  Fwd IAT Min                  float64
 12  Bwd IAT Total                float64
 13  Bwd IAT Mean                 float64
 14  Bwd IAT Std                  float64
 15  Bwd IAT Max                  float64
 16  Bwd IAT Min                  float64
 17  Fwd PSH Flags                float64
 18  Bwd PSH Flags                float64
 19

# Dividindo os dados (treino, validação, teste)

In [40]:
df_train = df.query('Label == "BENIGN"').sample(frac=0.6)#, random_state=RANDOM_SEED)
df_val_test = df.drop(df_train.index)

df_train = df_train.reset_index(drop=True)
df_val_test = df_val_test.reset_index(drop=True)

X_train = df_train.drop('Label', axis='columns')

In [None]:
del df

In [None]:
X_val, X_test, classes_val, classes_test = train_test_split(df_val_test.drop('Label', axis='columns'), df_val_test['Label'], test_size=0.65, stratify=df_val_test['Label']) #random_state=RANDOM_SEED)

X_val, X_test = X_val.reset_index(drop=True), X_test.reset_index(drop=True)
classes_val, classes_test =  classes_val.reset_index(drop=True), classes_test.reset_index(drop=True)

y_val, y_test = classes_val.apply(lambda c: 0 if c == 'BENIGN' else 1), classes_test.apply(lambda c: 0 if c == 'BENIGN' else 1)

In [None]:
del df_train, df_val_test

# Ignorar

In [None]:
# Valor específico
#valor_especifico = 'BENIGN'

# Coleta de valores diferentes do valor específico
#valores_diferentes = df.loc[df['Label'] != valor_especifico, 'Label'].tolist()

#print(valores_diferentes)


In [None]:
#pd.set_option('display.max_rows', None)
#contagem_valores = df['Label'].value_counts()
#print(contagem_valores)


In [None]:
#df['Z'] = df['X'] + df['Y']

# Apagar as colunas X e Y
#df.drop(columns=['X', 'Y'], inplace=True)