# Importações

In [1]:
#import os
#import re
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import random
#import matplotlib.pyplot as plt
#import seaborn as sns
#from sklearn.tree import DecisionTreeClassifier
#from sklearn.metrics import classification_report
#from sklearn.datasets import load_digits
#from models/FirstStage import FirstStage
#from models/SecondStage import SecondStage
#from models/Extension import Extension

from tqdm import tqdm


# Dados

### Carregando os dados

In [2]:
cic_ids_2017 = pd.read_parquet("data/cic_ids_2017.parquet")

Exibindo o **.info()**

In [3]:
cic_ids_2017.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2830743 entries, 0 to 2830742
Data columns (total 79 columns):
 #   Column                       Dtype  
---  ------                       -----  
 0   Destination Port             int64  
 1   Flow Duration                int64  
 2   Total Fwd Packets            int64  
 3   Total Backward Packets       int64  
 4   Total Length of Fwd Packets  int64  
 5   Total Length of Bwd Packets  int64  
 6   Fwd Packet Length Max        int64  
 7   Fwd Packet Length Min        int64  
 8   Fwd Packet Length Mean       float64
 9   Fwd Packet Length Std        float64
 10  Bwd Packet Length Max        int64  
 11  Bwd Packet Length Min        int64  
 12  Bwd Packet Length Mean       float64
 13  Bwd Packet Length Std        float64
 14  Flow Bytes/s                 float64
 15  Flow Packets/s               float64
 16  Flow IAT Mean                float64
 17  Flow IAT Std                 float64
 18  Flow IAT Max                 int64  
 19  

### Padronizando e limpando os dados

#### Renomeando e Exibindo

In [4]:
df = cic_ids_2017

# Limpeza, pra liberar memória
del cic_ids_2017

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2830743 entries, 0 to 2830742
Data columns (total 79 columns):
 #   Column                       Dtype  
---  ------                       -----  
 0   Destination Port             int64  
 1   Flow Duration                int64  
 2   Total Fwd Packets            int64  
 3   Total Backward Packets       int64  
 4   Total Length of Fwd Packets  int64  
 5   Total Length of Bwd Packets  int64  
 6   Fwd Packet Length Max        int64  
 7   Fwd Packet Length Min        int64  
 8   Fwd Packet Length Mean       float64
 9   Fwd Packet Length Std        float64
 10  Bwd Packet Length Max        int64  
 11  Bwd Packet Length Min        int64  
 12  Bwd Packet Length Mean       float64
 13  Bwd Packet Length Std        float64
 14  Flow Bytes/s                 float64
 15  Flow Packets/s               float64
 16  Flow IAT Mean                float64
 17  Flow IAT Std                 float64
 18  Flow IAT Max                 int64  
 19  

#### Alterando Labels

In [6]:
pd.set_option('display.max_rows', 999)
values_count = df['Label'].value_counts()
print(values_count)

Label
BENIGN                        2273097
DoS Hulk                       231073
PortScan                       158930
DDoS                           128027
DoS GoldenEye                   10293
FTP-Patator                      7938
SSH-Patator                      5897
DoS slowloris                    5796
DoS Slowhttptest                 5499
Bot                              1966
Web Attack � Brute Force         1507
Web Attack � XSS                  652
Infiltration                       36
Web Attack � Sql Injection         21
Heartbleed                         11
Name: count, dtype: int64


Padronizando labels e corrigindo labels redundantes/repetidas ou com caracteres irreconhecíveis.

In [7]:
label_changes = {'infilteration':'Infiltration', 
                 'Infilteration':'Infiltration',  
                 'PortScan':'Port Scan', 
                 'Web Attack � Brute Force':'Brute Force-Web Attack', 
                 'Web Attack � XSS':'Web Attack-XSS', 
                 'Web Attack � Sql Injection':'Sql Injection-Web Attack', 
                 'DoS slowloris':'DoS Slowloris', 
                 'Benign':'BENIGN', 
                 'DoS Hulk':'DoS-Hulk', 
                 'DoS-Goldeneye':'DoS-GoldenEye', 
                 'DoS GoldenEye':'DoS-GoldenEye', 
                 'DoS Slowhttptest':'DoS-Slowhttptest', 
                 'DoS Slowloris':'DoS-Slowloris', 
                 'portscan':'Port Scan', 
                 'Webattack-bruteforce':'Brute Force-Web Attack', 
                 'Webattack-XSS':'Web Attack-XSS', 
                 'Webattack-SQLi':'Web Attack-SQLi', 
                 'DoS attacks-Hulk':'DoS-Hulk', 
                 'Brute Force -Web':'Brute Force-Web Attack', 
                 'DDoS attacks-LOIC-HTTP':'DDoS-LOIC-HTTP', 
                 'DDoS attack-HOIC':'DDoS-HOIC', 
                 'DDOS attack-LOIC-UDP':'DDOS-LOIC-UDP', 
                 'DoS attacks-SlowHTTPTest':'DoS-SlowHTTPTest', 
                 'DoS attacks-GoldenEye':'DoS-GoldenEye', 
                 'DoS attacks-Slowloris':'DoS-Slowloris', 
                 'SSH-Bruteforce':'Brute Force-SSH', 
                 'FTP-BruteForce':'Brute Force-FTP', 
                 'Brute Force -XSS':'Brute Force-XSS', 
                 'SSH-Patator':'Patator-SSH', 
                 'FTP-Patator':'Patator-FTP', 
                 'DDOS-LOIC-UDP':'DDoS-LOIC-UDP' }

df['Label'] = df['Label'].replace(label_changes)

Resultado:

In [8]:
pd.set_option('display.max_rows', None)
values_count = df['Label'].value_counts()
print(values_count.sort_values())

Label
Heartbleed                       11
Sql Injection-Web Attack         21
Infiltration                     36
Web Attack-XSS                  652
Brute Force-Web Attack         1507
Bot                            1966
DoS-Slowhttptest               5499
DoS Slowloris                  5796
Patator-SSH                    5897
Patator-FTP                    7938
DoS-GoldenEye                 10293
DDoS                         128027
Port Scan                    158930
DoS-Hulk                     231073
BENIGN                      2273097
Name: count, dtype: int64


Existem algumas colunas que significam a mesma coisa, mas estão com nomes diferentes.

In [9]:
df.rename(columns={'coloração': 'cor'}, inplace=True)

#### Registros duplicados

In [10]:
pd.set_option('display.max_rows', 10) # Retornando a um valor menor na exibição do Pandas

In [11]:
df[df.duplicated()]

Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
2109,80,77,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
2257,443,3,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
2749,443,49,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
2862,443,4,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
2877,443,1,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2830701,53,179,2,2,46,46,23,23,23.0,0.0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
2830725,53,161,2,2,82,114,41,41,41.0,0.0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
2830726,53,212,2,2,84,162,42,42,42.0,0.0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
2830731,443,3,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN


Exibindo os registros não duplicados

In [12]:
df[~df.duplicated()]

Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,54865,3,2,0,12,0,6,6,6.0,0.00000,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
1,55054,109,1,1,6,6,6,6,6.0,0.00000,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
2,55055,52,1,1,6,6,6,6,6.0,0.00000,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
3,46236,34,1,1,6,6,6,6,6.0,0.00000,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
4,54863,3,2,0,12,0,6,6,6.0,0.00000,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2830738,53,32215,4,2,112,152,28,28,28.0,0.00000,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
2830739,53,324,2,2,84,362,42,42,42.0,0.00000,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
2830740,58030,82,2,1,31,6,31,0,15.5,21.92031,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
2830741,53,1048635,6,2,192,256,32,32,32.0,0.00000,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN


Descartando registros duplicados

In [13]:
initial_len = df.shape[0]
df = df.drop_duplicates()
print(f'Tamanho inicial: {initial_len}, tamanho final {df.shape[0]} | Descartadas {initial_len - df.shape[0]} duplicadas')

df = df.reset_index(drop=True) # Resetando index

Tamanho inicial: 2830743, tamanho final 2522362 | Descartadas 308381 duplicadas


#### Registros com valores não finitos

In [14]:
df_columns_isfinite = np.isfinite(df.drop(['Label'], axis='columns')).all(axis=0)
df_columns_isfinite[df_columns_isfinite == False]

Flow Bytes/s      False
Flow Packets/s    False
dtype: bool

In [15]:
df_rows_isfinite = np.isfinite(df.drop(['Label'], axis='columns')).all(axis=1)
inf_indexes = df_rows_isfinite[df_rows_isfinite == False].index
df.iloc[inf_indexes][['Flow Bytes/s', 'Flow Packets/s']]

Unnamed: 0,Flow Bytes/s,Flow Packets/s
65,inf,inf
1767,inf,inf
1890,inf,inf
3365,inf,inf
6752,,inf
...,...,...
2515917,inf,inf
2517136,inf,inf
2517256,,inf
2517261,inf,inf


Transformando valores infinitos no maior valor finito encontrado na coluna, isso é feito para as duas colunas.

Isso é feito porque a quantidade de registros infinitos é insignificante, seria inviável fazer isso em um conjunto de dados com muitos registros infinitos, pois existiriam muitos valores máximos e isso poderia comprometer o treinamento.

In [16]:
max_finite_flow_packets_per_sec = df[np.isfinite(df['Flow Packets/s'])]['Flow Packets/s'].max()
max_finite_flow_bytes_per_sec = df[np.isfinite(df['Flow Bytes/s'])]['Flow Bytes/s'].max()

df.loc[df['Flow Packets/s'] == np.inf, 'Flow Packets/s'] = max_finite_flow_packets_per_sec
df.loc[df['Flow Bytes/s'] == np.inf, 'Flow Bytes/s'] = max_finite_flow_bytes_per_sec

df = df.reset_index(drop=True) # Resetando index

#### Registros com valores Null/NaN/NA

In [17]:
df.columns[df.isna().any(axis=0)]

Index(['Flow Bytes/s'], dtype='object')

In [18]:
df[df.isna().any(axis=1)][['Flow Bytes/s']]

Unnamed: 0,Flow Bytes/s
6752,
14586,
14887,
207571,
235442,
...,...
2444305,
2468086,
2488798,
2503689,


Preenchendo registros NaN/Null/NA com a média dos valores de cada coluna.

In [19]:
for column in tqdm(df.columns):
    if column != "Label":
        column_median = df[column].median()
        df[column].fillna(column_median, inplace=True)

df = df.reset_index(drop=True) # Resetando index

100%|██████████| 79/79 [00:01<00:00, 43.40it/s]


#### Features correlacionadas

In [20]:
def get_highly_correlated_features(correlation_matrix, threshold):
  correlated_pairs = []
  for i in range(len(correlation_matrix.columns)):
    for j in range(i):
      if abs(correlation_matrix.iloc[i, j]) > threshold:
        pair = (correlation_matrix.columns[i], correlation_matrix.columns[j])
        coefficient = correlation_matrix.iloc[i, j]
        correlated_pairs.append((pair, coefficient))
  return sorted(correlated_pairs, key= lambda pair: pair[1], reverse=True)


Coletando as features correlacionadas, com o objetivo de evitar a redundância no treinamento do modelo.

In [21]:
df_without_Label = df.drop('Label', axis='columns')

In [22]:
corr_matrix = df_without_Label.corr().abs()
correlation_list = get_highly_correlated_features(corr_matrix, 0.95)

# Limpeza
del df_without_Label

In [23]:
correlation_list[:10]

[(('Avg Bwd Segment Size', 'Bwd Packet Length Mean'), 1.0000000000000002),
 (('SYN Flag Count', 'Fwd PSH Flags'), 1.0),
 (('CWE Flag Count', 'Fwd URG Flags'), 1.0),
 (('Avg Fwd Segment Size', 'Fwd Packet Length Mean'), 1.0),
 (('Fwd Header Length.1', 'Fwd Header Length'), 1.0),
 (('Subflow Fwd Packets', 'Total Fwd Packets'), 1.0),
 (('Subflow Bwd Packets', 'Total Backward Packets'), 1.0),
 (('Subflow Bwd Bytes', 'Total Length of Bwd Packets'), 0.9999998601337008),
 (('Subflow Fwd Bytes', 'Total Length of Fwd Packets'), 0.9999993581444724),
 (('Total Backward Packets', 'Total Fwd Packets'), 0.999070199856941)]

Criando uma lista do que será dropado.

In [24]:
f2drop = []
for feature_pair, _ in correlation_list:
  if feature_pair[0] not in f2drop and feature_pair[1] not in f2drop:
    f2drop.append(feature_pair[1])

Exibindo as features correlacionadas.

In [25]:
f2drop

['Bwd Packet Length Mean',
 'Fwd PSH Flags',
 'Fwd URG Flags',
 'Fwd Packet Length Mean',
 'Fwd Header Length',
 'Total Fwd Packets',
 'Total Backward Packets',
 'Total Length of Bwd Packets',
 'Total Length of Fwd Packets',
 'Subflow Fwd Packets',
 'Flow Duration',
 'Flow IAT Max',
 'Packet Length Mean',
 'RST Flag Count',
 'Subflow Bwd Packets',
 'Idle Mean',
 'Fwd IAT Max',
 'Max Packet Length',
 'Bwd Packet Length Max',
 'Fwd Packet Length Max',
 'Idle Max']

Removendo a coluna **Destination Port**, porque não existe muita correlação entre o tamanho do número da porta com a função da porta, por exemplo, não existe algo do tipo "portas maiores tem uma função mais pra X, enquanto portas menores servem mais pra Y".

In [26]:
f2drop = f2drop + ['Destination Port']

Removendo as features que atrapalham o modelo de aprendizagem

In [27]:
df = df.drop(f2drop, axis='columns')

#### Normalização dos dados

In [28]:
std_scaler = StandardScaler()
numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
df[numeric_cols] = std_scaler.fit_transform(df[numeric_cols])
#df = pd.DataFrame(std_scaler.fit_transform(df), columns=df.columns)

#### Limpeza de variáveis

In [29]:
del f2drop
del corr_matrix
del correlation_list
del df_columns_isfinite
del df_rows_isfinite
del inf_indexes
del max_finite_flow_packets_per_sec
del max_finite_flow_bytes_per_sec
del initial_len
del numeric_cols
del std_scaler
del label_changes
del values_count

### Dados após o tratamento

Exibindo o **.describe()**.

In [30]:
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)
df.describe(include='all')

Unnamed: 0,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Min,Bwd Packet Length Std,Flow Bytes/s,Flow Packets/s,Flow IAT Mean,Flow IAT Std,Flow IAT Min,Fwd IAT Total,Fwd IAT Mean,Fwd IAT Std,Fwd IAT Min,Bwd IAT Total,Bwd IAT Mean,Bwd IAT Std,Bwd IAT Max,Bwd IAT Min,Bwd PSH Flags,Bwd URG Flags,Bwd Header Length,Fwd Packets/s,Bwd Packets/s,Min Packet Length,Packet Length Std,Packet Length Variance,FIN Flag Count,SYN Flag Count,PSH Flag Count,ACK Flag Count,URG Flag Count,CWE Flag Count,ECE Flag Count,Down/Up Ratio,Average Packet Size,Avg Fwd Segment Size,Avg Bwd Segment Size,Fwd Header Length.1,Fwd Avg Bytes/Bulk,Fwd Avg Packets/Bulk,Fwd Avg Bulk Rate,Bwd Avg Bytes/Bulk,Bwd Avg Packets/Bulk,Bwd Avg Bulk Rate,Subflow Fwd Bytes,Subflow Bwd Bytes,Init_Win_bytes_forward,Init_Win_bytes_backward,act_data_pkt_fwd,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Std,Idle Min,Label
count,2522362.0,2522362.0,2522362.0,2522362.0,2522362.0,2522362.0,2522362.0,2522362.0,2522362.0,2522362.0,2522362.0,2522362.0,2522362.0,2522362.0,2522362.0,2522362.0,2522362.0,2522362.0,2522362.0,2522362.0,2522362.0,2522362.0,2522362.0,2522362.0,2522362.0,2522362.0,2522362.0,2522362.0,2522362.0,2522362.0,2522362.0,2522362.0,2522362.0,2522362.0,2522362.0,2522362.0,2522362.0,2522362.0,2522362.0,2522362.0,2522362.0,2522362.0,2522362.0,2522362.0,2522362.0,2522362.0,2522362.0,2522362.0,2522362.0,2522362.0,2522362.0,2522362.0,2522362.0,2522362.0,2522362.0,2522362.0,2522362
unique,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,15
top,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,BENIGN
freq,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2096484
mean,-3.4975550000000003e-17,-2.936864e-16,4.8497020000000006e-17,3.773393e-16,-3.752209e-18,1.02966e-16,-1.68748e-16,2.042644e-16,-1.600041e-18,-5.4807040000000006e-17,-1.786637e-16,7.932598000000001e-17,5.588876e-18,2.8665520000000005e-17,-7.851469000000001e-17,-1.289949e-16,-1.5594770000000002e-17,-1.2169330000000001e-17,0.0,0.0,5.535352999999999e-19,9.969833e-17,8.139927000000001e-17,-1.568491e-17,2.7764090000000003e-17,-1.154734e-16,7.720761000000001e-17,7.139338000000001e-17,-8.653743e-17,-8.296551e-17,1.195298e-16,9.394607e-18,-4.191657e-18,3.9572850000000006e-17,2.340116e-16,1.537842e-16,-1.68748e-16,5.887474999999999e-19,0.0,0.0,0.0,0.0,0.0,0.0,-3.808548e-18,-1.2394679999999999e-19,2.7403520000000003e-17,1.5634200000000003e-17,-1.845118e-18,8.098798999999999e-19,-5.4761970000000004e-17,2.0552640000000002e-17,-2.816974e-17,3.85362e-18,-2.9521880000000004e-17,1.561279e-16,
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,
min,-0.3157301,-0.2603734,-0.6086432,-0.4272235,-5.012928,-9.101634,-0.3086918,-0.3874991,-0.05652462,-0.4619675,-0.2913499,-0.3615981,-0.1253188,-0.3674039,-0.2157983,-0.2515916,-0.2905172,-0.123422,0.0,0.0,-697.9481,-0.2113489,-0.170704,-0.6577143,-0.4994425,-0.3141941,-0.1822487,-0.2263518,-0.650789,-0.6737097,-0.3359707,-0.005631812,-0.01652971,-1.006876,-0.6146201,-0.3246529,-0.5382677,-1444.319,0.0,0.0,0.0,0.0,0.0,0.0,-0.05785475,-0.007563513,-0.4977609,-0.2499688,-0.008908117,-467.0845,-0.1333278,-0.1108473,-0.1584073,-0.1070804,-0.1160437,-0.3616369,
25%,-0.3157301,-0.2603734,-0.6086432,-0.4272235,-0.04575152,-0.2208677,-0.3086713,-0.3874991,-0.05651898,-0.4619674,-0.2913498,-0.3615981,-0.1253175,-0.3674039,-0.2157983,-0.2515916,-0.2905172,-0.123422,0.0,0.0,0.001672911,-0.2113426,-0.1706996,-0.6577143,-0.4959473,-0.314191,-0.1822487,-0.2263518,-0.650789,-0.6737097,-0.3359707,-0.005631812,-0.01652971,-1.006876,-0.588566,-0.2939625,-0.5287804,0.001310249,0.0,0.0,0.0,0.0,0.0,0.0,-0.05671953,-0.007561011,-0.4977609,-0.2499688,-0.008908117,0.002697323,-0.1333278,-0.1108473,-0.1584073,-0.1070804,-0.1160437,-0.3616369,
50%,-0.2828324,-0.2603734,-0.6086432,-0.4272235,-0.04568296,-0.2205655,-0.3043264,-0.3860427,-0.05651865,-0.4619661,-0.291345,-0.3615981,-0.1253172,-0.3674038,-0.215798,-0.2515916,-0.2905171,-0.1234216,0.0,0.0,0.001685912,-0.2111527,-0.1702024,-0.5795243,-0.4505097,-0.3135919,-0.1822487,-0.2263518,-0.650789,-0.6737097,-0.3359707,-0.005631812,-0.01652971,0.4308006,-0.3808571,-0.1399427,-0.3991205,0.001311326,0.0,0.0,0.0,0.0,0.0,0.0,-0.05161106,-0.00749886,-0.4804991,-0.2498568,-0.007424891,0.002697323,-0.1333278,-0.1108473,-0.1584073,-0.1070804,-0.1160437,-0.3616369,
75%,0.292878,-0.01057447,0.5484015,-0.1292188,-0.04370489,-0.1412287,-0.1899086,-0.2113495,-0.05649708,-0.3187631,-0.2177564,-0.3057435,-0.1253121,-0.3624259,-0.2126366,-0.2472027,-0.2838249,-0.1234166,0.0,0.0,0.001745713,-0.162521,-0.1160708,0.7888003,-0.01198334,-0.254438,-0.1822487,-0.2263518,1.536596,1.484319,-0.3359707,-0.005631812,-0.01652971,0.4308006,-0.09465183,-0.05867018,-0.1935621,0.001314913,0.0,0.0,0.0,0.0,0.0,0.0,-0.02644708,-0.0071514,0.06345397,-0.2235364,-0.004458437,0.002707763,-0.1333278,-0.1108473,-0.1584073,-0.1070804,-0.1160437,-0.3616369,


In [31]:
pd.set_option('display.max_columns', 10)
pd.set_option('display.max_rows', 10)

Exibindo o **.info()**.

In [32]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2522362 entries, 0 to 2522361
Data columns (total 57 columns):
 #   Column                   Dtype  
---  ------                   -----  
 0   Fwd Packet Length Min    float64
 1   Fwd Packet Length Std    float64
 2   Bwd Packet Length Min    float64
 3   Bwd Packet Length Std    float64
 4   Flow Bytes/s             float64
 5   Flow Packets/s           float64
 6   Flow IAT Mean            float64
 7   Flow IAT Std             float64
 8   Flow IAT Min             float64
 9   Fwd IAT Total            float64
 10  Fwd IAT Mean             float64
 11  Fwd IAT Std              float64
 12  Fwd IAT Min              float64
 13  Bwd IAT Total            float64
 14  Bwd IAT Mean             float64
 15  Bwd IAT Std              float64
 16  Bwd IAT Max              float64
 17  Bwd IAT Min              float64
 18  Bwd PSH Flags            float64
 19  Bwd URG Flags            float64
 20  Bwd Header Length        float64
 21  Fwd Pack

# Dividindo os dados

## Limitar a quantidade de dados maliciosos

In [33]:
# Definir o limite máximo de linhas para os valores diferentes de "BENIGN" e "Unknown"
max_rows = 1948

# Filtrar os valores diferentes de "BENIGN" e "Unknown"
df_f = df[~df['Label'].isin(['BENIGN', 'Infiltration', 'Heartbleed'])]

#Agrupar os valores e limitar o número de linhas para cada valor
df_f = df_f.groupby('Label').head(max_rows)

# Concatenar os valores "BENIGN" e "Unknown" com os valores filtrados
df = df[df['Label'].isin(['BENIGN', 'Heartbleed', 'Infiltration'])]

df = pd.concat([df_f, df])

del df_f

# Verificar se os valores foram limitados corretamente
print(df['Label'].value_counts())

Label
BENIGN                      2096484
DDoS                           1948
Port Scan                      1948
Bot                            1948
Patator-FTP                    1948
                             ...   
Brute Force-Web Attack         1470
Web Attack-XSS                  652
Infiltration                     36
Sql Injection-Web Attack         21
Heartbleed                       11
Name: count, Length: 15, dtype: int64


## Dividindo em treino, validação e teste (benigno e malioso)

In [34]:
# Filtrar os dados onde a coluna Label seja igual a 'BENIGN'
df_benign = df[df['Label'] == 'BENIGN']

# Selecionar aleatoriamente linhas para cada dataframe
indexes1 = df_benign.sample(n=(random.randint(10000, 100000))).index

df_benign = df_benign.drop(indexes1)

indexes2 = df_benign.sample(n=129485).index

df_benign = df_benign.drop(indexes2)

indexes3 = df_benign.sample(n=56468).index

# Remover linhas selecionadas dos dataframes anteriores
train_benign = df.loc[indexes1]
validation_benign = df.loc[indexes2]
test_benign = df.loc[indexes3]

In [35]:
# Filtrar os dados onde a coluna Label seja diferente de "BENIGN", "Infiltration" e "Heartbleed"
df_malicious = df[~df['Label'].isin(["BENIGN", "Infiltration", "Heartbleed"])]
df_malicious_zero_day = df[df['Label'].isin(["Infiltration", "Heartbleed"])]

# Selecionar aleatoriamente linhas para cada dataframe
indexes1 = df_malicious.sample(n=5320).index

df_malicious = df_malicious.drop(indexes1)

indexes2 = df_malicious.sample(n=1500).index

df_malicious = df_malicious.drop(indexes2)
df_malicious = pd.concat([df_malicious, df_malicious_zero_day])

indexes3 = df_malicious.sample(n=2967).index

# Criar os dataframes
train_malicious = df.loc[indexes1]
validation_malicious = df.loc[indexes2]
test_malicious = df.loc[indexes3]

In [36]:
#train = pd.concat([train_benign, train_malicious])
#validation = pd.concat([validation_benign, validation_benign])
#test = pd.concat([test_benign, test_malicious])

## Dividindo entre os estágios

In [37]:
# Treinamento dos estágios 1 e 3
#indexes1 = train_benign.sample(n=int(len(train_benign) * 1/3), replace=False).index
#indexes01 = train_benign.drop(indexes1).index
#train_benign_x = train_benign.loc[indexes01]

#train_benign = train_benign.drop(indexes1)
#train_benign_1_3_stages = train_benign.loc[indexes1]

# Validação do estágio 2
#indexes2 = train_benign_x.sample(n=int(len(train_benign) * 2/3), replace=False).index
#train_benign_x = train_benign_x.loc[indexes2]
#validation_2_stage_50 = train_benign_x
#indexes3 = validation_malicious.sample(n=int(len(validation_malicious) * 5/100), replace=False).index
#indexes03 = validation_malicious.drop(indexes3).index
#validation_malicious_2_stage_50 = 

#validation_2_stage = pd.concat([validation_2_stage_50, validation_malicious]) 

# Validação dos estágios 1 e 3
#validation_1_3_stages_95 = validation_benign
#validation_1_3_stages_95 = pd.concat([validation_1_3_stages_95, ])

## Limpeza de variáveis

In [None]:
del df
del indexes1
del indexes2
del indexes3
del df_benign
del df_malicious
del df_malicious_zero_day

# Ignorar

In [None]:
# Valor específico
#valor_especifico = 'BENIGN'

# Coleta de valores diferentes do valor específico
#valores_diferentes = df.loc[df['Label'] != valor_especifico, 'Label'].tolist()

#print(valores_diferentes)


In [None]:
#pd.set_option('display.max_rows', None)
#contagem_valores = df['Label'].value_counts()
#print(contagem_valores)


In [None]:
#df['Z'] = df['X'] + df['Y']

# Apagar as colunas X e Y
#df.drop(columns=['X', 'Y'], inplace=True)