## Task 5

### Fonte dos dados: [iotanalytics](https://iotanalytics.unsw.edu.au/) e [stratosphereips](https://www.stratosphereips.org/datasets-normal)

1. Criar script que receba um `.pcap` e e gere um `.csv` limpo (excluir as colunas IP O/D,mac O/D, APPlication* e excluir colunas vazias)
2. Escolher um numero de pcaps do IOT analitcs e tratar (~ 400k de fluxos)
3. Gerar fluxos já tratados
4. Rotular os fluxos: criar a coluna "target" (0 -> trafego normal, 1 -> IOT)
5. Unir os arquivos cvs

In [1]:
import pandas as pd
from pandas import DataFrame
from pandas import Index
import nfstream
import os 
import socket
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [2]:
pd.set_option("styler.format.thousands", ",")

In [3]:
def concat_csv_and_pcap(mypath: str, fileName:str, label:int) -> DataFrame:

    lista = []

    for arg in os.listdir(path=mypath):

        # camilho completo do arquivo que será aberto
        name = os.path.join(mypath, arg)

        if arg.endswith(".pcap"):
            
            try:
                df = nfstream.NFStreamer(source= os.path.join(mypath, arg),statistical_analysis=True).to_pandas()
                lista.append(df)

            except Exception as e:
                print(f"Error!, ao ler o arquivo {name}")
                exit()
    
    data = pd.concat(lista)
    data['label'] = label # rotulando o trafego
    
    arg = open(fileName,'w')
    arg.write(data.to_csv())
    arg.close()
    return data

In [4]:
try:
    dados_IOT = pd.read_csv("../csv/trafego_IOT.csv")
except:
    dados_IOT = concat_csv_and_pcap("../pcaps/IOT/", "../csv/trafego_IOT.csv", 1)

In [5]:
dados_IOT.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 415021 entries, 0 to 415020
Data columns (total 88 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   Unnamed: 0                    415021 non-null  int64  
 1   id                            415021 non-null  int64  
 2   expiration_id                 415021 non-null  int64  
 3   src_ip                        415021 non-null  object 
 4   src_mac                       415021 non-null  object 
 5   src_oui                       415021 non-null  object 
 6   src_port                      415021 non-null  int64  
 7   dst_ip                        415021 non-null  object 
 8   dst_mac                       415021 non-null  object 
 9   dst_oui                       415021 non-null  object 
 10  dst_port                      415021 non-null  int64  
 11  protocol                      415021 non-null  int64  
 12  ip_version                    415021 non-nul

In [6]:

def ratio_isna (data:DataFrame) -> None:

    # obtém o número de pontos de dados ausentes por coluna
    data_cols_erros = dados_IOT.isnull().sum()

    data_cols_erros = data_cols_erros[data_cols_erros > 0]

    size = len(data.index)

    # obtendo uma lista das colunas
    cols_name = data_cols_erros.index.tolist()

    # obtendo os uma lista com os valores de erro de cada coluna
    erros = [float(item / size) for item in data_cols_erros.values.tolist()]

    plt.bar(cols_name, erros)

    plt.xlabel("Colunas com valores invalidos.")
    plt.ylabel("Numero de erros em %.")

    plt.xticks(rotation = 90)
    plt.show()

In [7]:
# removendo 0 ou mais ocorrencias das colunas que possuem application no inicio -> application_name	application_category_name	application_is_guessed	application_confidence
def cleaning_by_labels (data:DataFrame) -> Index:
    
    # seleciona as colunas que começão com application para serem removidas
    cols_to_drop = data.filter(regex=r'^application_*')

    # remove todas as colunas contidas na lista
    df_filtrado = data.drop(columns=['src_ip','src_mac', 'dst_ip', 'dst_mac'] + list(cols_to_drop.columns))

    return df_filtrado, data.columns.difference(df_filtrado.columns)

dados_IOT_cleared, c1 = cleaning_by_labels(dados_IOT)


In [8]:

def cleaning_by_isna (data:DataFrame) -> Index:

    # remove colunas que possuem mais de 50% de valores inválidos
    limite = len(data) / 2 
    DF_cleared = data.dropna(axis=1, thresh=limite)

    # remove todas as linhas que possuem algum valor null or vazio
    DF_cleared = DF_cleared.dropna(axis=0, how='any')

    # para saber as colunas que foram reovidas nesse passo
    return DF_cleared, data.columns.difference(DF_cleared.columns)

dados_IOT_cleared, c2 = cleaning_by_isna(dados_IOT_cleared)

In [9]:
res = open('../csv/trafego_IOT_limpo.csv','w')
res.write(dados_IOT_cleared.to_csv())
res.close()

In [10]:
c1

Index(['application_category_name', 'application_confidence',
       'application_is_guessed', 'application_name', 'dst_ip', 'dst_mac',
       'src_ip', 'src_mac'],
      dtype='object')

# Task 6 - Inicio da faze de treinamento
* Relatório das colunas excluidas.
* Desafio do titanic.
* Criar um modelosup com os dados obtidos até o momento.

In [11]:
try:
    dados_Normal = pd.read_csv("../csv/trafego_normal.csv")
except:
    dados_Normal = concat_csv_and_pcap("../pcaps/Fixo/", "../csv/trafego_normal.csv", 0)

In [12]:
dados_normal_clear, n1 = cleaning_by_labels(dados_Normal)

dados_normal_clear, n2 = cleaning_by_isna(dados_normal_clear)

res = open('../csv/trafego_normal_limpo.csv','w')
res.write(dados_IOT_cleared.to_csv())
res.close()

In [13]:
n1.append(n2)

Index(['application_category_name', 'application_confidence',
       'application_is_guessed', 'application_name', 'dst_ip', 'dst_mac',
       'src_ip', 'src_mac', 'client_fingerprint', 'content_type',
       'server_fingerprint', 'user_agent'],
      dtype='object')

In [14]:
linhas_removidas_IOT = dados_IOT[~dados_IOT.index.isin(dados_IOT_cleared.index)]

linhas_removidas_Normal = dados_Normal[~dados_Normal.index.isin(dados_normal_clear.index)]

linhas_removidas_Normal

Unnamed: 0.1,Unnamed: 0,id,expiration_id,src_ip,src_mac,src_oui,src_port,dst_ip,dst_mac,dst_oui,...,application_name,application_category_name,application_is_guessed,application_confidence,requested_server_name,client_fingerprint,server_fingerprint,user_agent,content_type,label
0,0,0,0,fe80::69dd:e614:2b2:dfd0,08:00:27:a3:83:43,08:00:27,0,ff02::16,33:33:00:00:00:16,33:33:00,...,ICMPV6,Network,0,6,,,,,,0
1,1,1,0,::,08:00:27:a3:83:43,08:00:27,0,ff02::1:ffb2:dfd0,33:33:ff:b2:df:d0,33:33:ff,...,ICMPV6,Network,0,6,,,,,,0
2,2,2,0,fe80::69dd:e614:2b2:dfd0,08:00:27:a3:83:43,08:00:27,0,ff02::2,33:33:00:00:00:02,33:33:00,...,ICMPV6,Network,0,6,,,,,,0
4,4,4,0,10.0.2.2,52:54:00:12:35:02,52:54:00,67,10.0.2.15,08:00:27:a3:83:43,08:00:27,...,DHCP,Network,0,6,,,,,,0
5,5,5,0,fe80::69dd:e614:2b2:dfd0,08:00:27:a3:83:43,08:00:27,546,ff02::1:2,33:33:00:01:00:02,33:33:00,...,DHCPV6,Network,0,6,,,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
404422,18551,18551,0,10.0.2.15,08:00:27:a3:83:43,08:00:27,49373,23.51.123.27,52:54:00:12:35:02,52:54:00,...,HTTP,Web,1,1,,,,,,0
404423,18552,18552,0,10.0.2.15,08:00:27:a3:83:43,08:00:27,49374,23.51.123.27,52:54:00:12:35:02,52:54:00,...,HTTP,Web,1,1,,,,,,0
404445,18574,18574,0,10.0.2.15,08:00:27:a3:83:43,08:00:27,49381,172.217.23.142,52:54:00:12:35:02,52:54:00,...,HTTP,Web,1,1,,,,,,0
404450,18579,18579,0,10.0.2.15,08:00:27:a3:83:43,08:00:27,49382,31.13.93.7,52:54:00:12:35:02,52:54:00,...,HTTP,Web,1,1,,,,,,0


In [15]:
linhas_removidas_IOT

Unnamed: 0.1,Unnamed: 0,id,expiration_id,src_ip,src_mac,src_oui,src_port,dst_ip,dst_mac,dst_oui,...,application_name,application_category_name,application_is_guessed,application_confidence,requested_server_name,client_fingerprint,server_fingerprint,user_agent,content_type,label
9,9,9,0,fe80::fc66:5abf:d63a:94fb,d0:53:49:1b:0c:90,d0:53:49,546,ff02::1:2,33:33:00:01:00:02,33:33:00,...,DHCPV6,Network,0,6,,,,,,1
99,99,99,0,fe80::fc66:5abf:d63a:94fb,d0:53:49:1b:0c:90,d0:53:49,546,ff02::1:2,33:33:00:01:00:02,33:33:00,...,DHCPV6,Network,0,6,,,,,,1
146,146,146,0,192.168.1.191,60:6c:66:cb:78:61,60:6c:66,0,192.168.33.254,00:13:33:b0:18:50,00:13:33,...,ICMP,Network,0,6,,,,,,1
280,280,280,0,192.168.1.191,60:6c:66:cb:78:61,60:6c:66,55844,172.217.23.234,00:13:33:b0:18:50,00:13:33,...,HTTP,Web,1,1,,,,,,1
281,281,281,0,192.168.1.191,60:6c:66:cb:78:61,60:6c:66,55846,172.217.23.234,00:13:33:b0:18:50,00:13:33,...,HTTP,Web,1,1,,,,,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
414532,74041,74041,0,192.168.1.191,60:6c:66:cb:78:61,60:6c:66,54572,172.217.23.227,00:13:33:b0:18:50,00:13:33,...,HTTP,Web,1,1,,,,,,1
414539,74048,74048,0,192.168.1.191,60:6c:66:cb:78:61,60:6c:66,54580,172.217.23.227,00:13:33:b0:18:50,00:13:33,...,HTTP,Web,1,1,,,,,,1
414545,74054,74054,0,192.168.1.191,60:6c:66:cb:78:61,60:6c:66,60600,54.192.93.161,00:13:33:b0:18:50,00:13:33,...,HTTP,Web,1,1,,,,,,1
414766,74275,74275,0,192.168.1.191,60:6c:66:cb:78:61,60:6c:66,43652,52.210.15.176,00:13:33:b0:18:50,00:13:33,...,HTTP,Web,1,1,,,,,,1


In [16]:
res = open('../Relatorio/linhas_excluidas_IOT.csv','w')
res.write(linhas_removidas_IOT.to_csv())
res.close()

In [17]:
res = open('../Relatorio/linhas_excluidas_Normal.csv','w')
res.write(linhas_removidas_Normal.to_csv())
res.close()

In [18]:
import ydata_profiling 


profile_Normal = ydata_profiling.ProfileReport(dados_Normal, minimal=True)

profile_IOT = ydata_profiling.ProfileReport(dados_IOT, minimal=True)

profile_Normal_cleared = ydata_profiling.ProfileReport(dados_normal_clear, minimal=True)

profile_IOT_cleared = ydata_profiling.ProfileReport(dados_IOT_cleared, minimal=True)

profile_Normal_erros = ydata_profiling.ProfileReport(linhas_removidas_Normal, minimal=True)

profile_IOT_erros = ydata_profiling.ProfileReport(linhas_removidas_IOT, minimal=True)

# Exibindo o relatório do perfil
profile_Normal.to_file("../Relatorio/trafego_normal_bruto.html")

profile_IOT.to_file("../Relatorio/trafego_IOT_bruto.html")

profile_Normal_cleared.to_file("../Relatorio/trafego_normal_limpo.html")

profile_IOT_cleared.to_file("../Relatorio/trafego_IOT_limpo.html")

profile_Normal_erros.to_file("../Relatorio/trafego_normal_erros.html")

profile_IOT_erros.to_file("../Relatorio/trafego_IOT_erros.html")

  from .autonotebook import tqdm as notebook_tqdm
Summarize dataset: 100%|██████████| 94/94 [00:14<00:00,  6.31it/s, Completed]                                     
Generate report structure: 100%|██████████| 1/1 [01:14<00:00, 74.06s/it]
Render HTML: 100%|██████████| 1/1 [00:03<00:00,  3.31s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 16.37it/s]
Summarize dataset: 100%|██████████| 94/94 [00:07<00:00, 12.34it/s, Completed]                                     
Generate report structure: 100%|██████████| 1/1 [01:07<00:00, 67.93s/it]
Render HTML: 100%|██████████| 1/1 [00:02<00:00,  2.97s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 61.78it/s]
Summarize dataset: 100%|██████████| 82/82 [00:05<00:00, 14.68it/s, Completed]                                     
Generate report structure: 100%|██████████| 1/1 [00:49<00:00, 49.89s/it]
Render HTML: 100%|██████████| 1/1 [00:02<00:00,  2.32s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 21.67it/s]
Su

In [24]:
ratio_isna(dados_Normal)

  plt.show()


In [20]:
ratio_isna(dados_IOT)

  plt.show()


In [30]:
erros_IOT = dados_IOT.isnull().sum()

erros_IOT = erros_IOT[erros_IOT > 0 ]

erros_IOT.values = np.ar [item / len(dados_IOT.index) for item in erros_IOT.values]


[0.13407514318552555,
 0.8432320292226176,
 0.8445042540016048,
 0.8885357608410177,
 0.8979063710029132]

In [31]:
erros_Normal = dados_Normal.isnull().sum()

erros_Normal = erros_Normal[erros_Normal > 0 ]

[item / len(dados_Normal.index) for item in erros_Normal.values]

[0.07324795894910906,
 0.8279510889312383,
 0.8285813862090956,
 0.8922414112726814,
 0.9017057574566639]

In [21]:
print(f"Normal ({len(dados_Normal.index)} - {len(linhas_removidas_Normal.index)})")

Normal (404571 - 29634)


In [22]:
print(f"IOT ({len(dados_IOT.index)} - {len(linhas_removidas_IOT.index)})")

IOT (415021 - 55644)


In [23]:
print(f"Normal {n2} || IOT {c2}")

Normal Index(['client_fingerprint', 'content_type', 'server_fingerprint',
       'user_agent'],
      dtype='object') || IOT Index(['client_fingerprint', 'content_type', 'server_fingerprint',
       'user_agent'],
      dtype='object')
