In [1]:
import math
import numpy as np
import pandas as pd
import os
import re
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.svm import OneClassSVM

from sklearn.preprocessing import StandardScaler, MinMaxScaler

from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm

from joblib import dump, load

from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.metrics import confusion_matrix

from keras.models import Model, Sequential
from keras.layers import Input, LSTM, RepeatVector, TimeDistributed, Dense

import tensorflow as tf
from tensorflow.keras import layers, models

import torch
import torch.nn as nn
import torch.optim as optim

from torchsummary import summary

In [2]:
sns.set_theme()
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Carregando os dados

Os valores são números muito pequenos com muitas casas decimais, por isso é bom que o dataframe consiga representar isso também.

In [3]:
pd.set_option('display.float_format', '{:.20f}'.format)

In [4]:
df_benign = pd.read_csv("data/dados benignos/mensagens_benignas.csv")
df_benign

Unnamed: 0,Timestamp,DLC,Arb ID,Byte 0,Byte 1,Byte 2,Byte 3
0,0.00000000000000000000,3,193,0,0,0,-1
1,0.00012946128845210000,3,193,0,0,0,-1
2,0.00015544891357419999,3,193,0,0,0,-1
3,0.00015616416931149999,3,193,0,0,0,-1
4,0.00013208389282220000,2,194,24,0,-1,-1
...,...,...,...,...,...,...,...
2673812,0.00172662734985350001,1,165,88,-1,-1,-1
2673813,0.00108718872070310006,1,165,88,-1,-1,-1
2673814,0.00091290473937979998,1,165,88,-1,-1,-1
2673815,0.00008821487426757812,1,165,88,-1,-1,-1


In [5]:
df_malicious_random_dos = pd.read_csv("data/ataques/mensagens_maliciosas_random_dos.csv")
df_malicious_spoofing_zero_payload = pd.read_csv("data/ataques/mensagens_maliciosas_spoofing_zero_payload.csv")
df_malicious_zero_dos = pd.read_csv("data/ataques/mensagens_maliciosas_zero_dos.csv")

In [6]:
df_malicious_random_dos

Unnamed: 0,Timestamp,DLC,Arb ID,Byte 0,Byte 1,Byte 2,Byte 3
0,0.00000000000000000000,4,192,17,110,1,93
1,0.00037145614624020001,1,1061,225,-1,-1,-1
2,0.00169730186462400002,2,1412,198,237,-1,-1
3,0.00179910659790029999,1,2035,254,-1,-1,-1
4,0.00183224678039550001,3,544,66,102,79,-1
...,...,...,...,...,...,...,...
455997,0.00016474723815910000,4,192,207,106,1,92
455998,0.00016379356384270001,4,192,207,106,1,92
455999,0.00016403198242180000,4,192,207,106,1,92
456000,0.00016498565673819999,4,192,207,106,1,92


In [None]:
df_malicious_spoofing_zero_payload

In [None]:
df_malicious_zero_dos

# Tratando dados

## Normalização dos dados

In [7]:
scaler_minmax = MinMaxScaler()

scaler_minmax.fit(df_benign)

scaler_minmax.transform(df_benign)
scaler_minmax.transform(df_malicious_random_dos)
scaler_minmax.transform(df_malicious_spoofing_zero_payload)
scaler_minmax.transform(df_malicious_zero_dos)

array([[ 0.00000000e+00,  4.28571429e-01, -2.10526316e-02, ...,
         3.90625000e-03,  3.90625000e-03,  9.90099010e-03],
       [ 1.45392236e-03,  4.28571429e-01, -2.10526316e-02, ...,
         3.90625000e-03,  3.90625000e-03,  9.90099010e-03],
       [ 1.85423866e-03,  4.28571429e-01, -2.10526316e-02, ...,
         3.90625000e-03,  3.90625000e-03,  9.90099010e-03],
       ...,
       [ 1.65648121e-04,  1.42857143e-01,  1.00000000e+00, ...,
         1.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 2.13662069e-04,  4.28571429e-01,  9.89473684e-01, ...,
         4.60937500e-01,  7.81250000e-03,  9.50495050e-01],
       [ 2.14562330e-04,  4.28571429e-01,  9.89473684e-01, ...,
         4.60937500e-01,  7.81250000e-03,  9.50495050e-01]])

In [8]:
df_benign

Unnamed: 0,Timestamp,DLC,Arb ID,Byte 0,Byte 1,Byte 2,Byte 3
0,0.00000000000000000000,3,193,0,0,0,-1
1,0.00012946128845210000,3,193,0,0,0,-1
2,0.00015544891357419999,3,193,0,0,0,-1
3,0.00015616416931149999,3,193,0,0,0,-1
4,0.00013208389282220000,2,194,24,0,-1,-1
...,...,...,...,...,...,...,...
2673812,0.00172662734985350001,1,165,88,-1,-1,-1
2673813,0.00108718872070310006,1,165,88,-1,-1,-1
2673814,0.00091290473937979998,1,165,88,-1,-1,-1
2673815,0.00008821487426757812,1,165,88,-1,-1,-1


## Criando Labels 

In [9]:
list_labels_benign = ["BENIGN"] * len(df_benign)
list_labels_random_dos = ["MALICIOUS"] * len(df_malicious_random_dos)
list_labels_spoofing_zero_payload = ["MALICIOUS"] * len(df_malicious_spoofing_zero_payload)
list_labels_zero_dos = ["MALICIOUS"] * len(df_malicious_zero_dos)

## Criação de Janelas Temporais

In [10]:
def create_dataset(data, labels, time_step=1):
    X, Y = [], []
    for i in range(len(data) - time_step):
        a = data[i:(i + time_step)]
        X.append(a)
        Y.append(labels[i + time_step])
    return np.array(X), np.array(Y)

In [14]:
benign_windows, benign_labels = create_dataset(df_benign, list_labels_benign, 100)
malicious_random_dos_windows, malicious_random_dos_labels = create_dataset(df_malicious_random_dos, list_labels_random_dos, 100)
malicious_spoofing_zero_payload_windows, malicious_spoofing_zero_payload_labels = create_dataset(df_malicious_spoofing_zero_payload, list_labels_spoofing_zero_payload, 100)
malicious_zero_dos_windows, malicious_zero_dos_labels = create_dataset(df_malicious_zero_dos, list_labels_zero_dos, 100)

In [13]:
len(benign_windows[0])

100

## Dividindo dados em Treino, Validação e Teste

In [None]:
n_linhas = len(df_benign)

# Dividindo o DataFrame em três partes iguais, mas mantendo a ordem dos elementos
df_train = df_benign.iloc[:int(n_linhas*0.6)]   
df_val = df_benign.iloc[int(n_linhas*0.6):int(n_linhas*0.7)]
df_test = df_benign.iloc[int(n_linhas*0.7):]

# IAs