# IDS - Sistema de detecção de Intrusão

Este projeto de pesquisa nasceu na Universidade Federal de Uberlândia e tem como objetivo mostrar a possibilidade de treinar um sistema de detecção de anomalias (IDS) usando machine learning e ciência de dados para detectar famílias de dispositivos em tempo real na rede.

Para essa solução usarei a metodologia CRISP-DM.
Por que essa metodologia?
* Organização.
* Planejamento.
* Independe do tipo de projeto.

**Etapas da metodologia**.

1. Etapa - **Entendimento do negócio**.
* Objetivos do projeto.
* Critérios de sucesso.
* Recursos e contingenciamentos.
* Objetivos do data mining.
* Planejamento estrutural.
2. Etapa - **Entendimento dos dados**.
* Coleta.
* Descrição.
* Exploração.
* Verificação da qualidade dos dados.
3. Etapa - **Preparação dos dados**.
* Seleção.
* Limpeza.
* Construção.
* Integração.
* Formatação.
4. Etapa - **Modelagem**.
* Escolha da técnica de modelagem.
* Design de teste.
* Construção do modelo.
* Avaliação.
5. Etapa - **Revisão dos resultados**.
* Revisão dos critérios de sucesso.
* Revisão do processo completo.
* Determinação dos próximos passos.

**Fonte dos dados**: [iotanalytics](https://iotanalytics.unsw.edu.au/) e [stratosphereips](https://www.stratosphereips.org/datasets-normal)

## 2- Entendimento

A coleta dos dados varia para cada tipo de tráfego, e as fontes para cada tipo foram:

*  IOT: Benigno-> [iotanalytics](https://iotanalytics.unsw.edu.au/) | Maligno-> [None]()
* Normal: Benigno-> [stratosphereips](https://www.stratosphereips.org/datasets-normal) | Maligno-> [None]()
* Móvel: Benigno-> [None]() | Maligno-> [None]()

In [1]:
import pandas as pd
from pandas import DataFrame
from pandas import Index
import nfstream
import os 
import matplotlib.pyplot as plt
import numpy as np

%matplotlib inline

In [2]:
pd.set_option("styler.format.thousands", ",")

In [3]:
# Walks a folder, converts every pcap in it to flow records, labels them and saves a CSV.

def concat_csv_and_pcap(mypath: str, label:int, fileName:str = 'trafego_IOT.csv') -> DataFrame:
    """Convert all ``.pcap`` files of a folder into one labelled flow table.

    Each file ending in ``.pcap`` directly inside ``mypath`` is read with
    ``nfstream.NFStreamer`` (statistical analysis enabled). The resulting
    frames are concatenated, every row receives ``label`` in a ``label``
    column, and the table is written to ``./csv/<fileName>``.

    Args:
        mypath: Folder that contains the pcap files.
        label: Value stored in the ``label`` column of every row.
        fileName: Name of the CSV written inside ``./csv/``.

    Returns:
        The concatenated, labelled DataFrame.

    Raises:
        RuntimeError: If a pcap file cannot be converted (the original
            error is chained as the cause).
        ValueError: If ``mypath`` contains no ``.pcap`` file.
    """
    flows = []

    # Sorted so the row order of the output does not depend on the file system.
    for arg in sorted(os.listdir(path=mypath)):

        if not arg.endswith(".pcap"):
            continue

        name = os.path.join(mypath, arg)

        try:
            df = nfstream.NFStreamer(source=name, statistical_analysis=True).to_pandas()
        except Exception as e:
            print(f"Error!, ao ler o arquivo {name}")
            # Abort on the first failure, but with a traceback instead of
            # exit(), which would shut down the notebook kernel.
            raise RuntimeError(f"Falha ao converter o arquivo {name}") from e

        flows.append(df)

    if not flows:
        raise ValueError(f"Nenhum arquivo .pcap encontrado em {mypath}")

    trafego = pd.concat(flows, ignore_index=True)
    trafego['label'] = label  # label the traffic

    os.makedirs('./csv', exist_ok=True)  # the output folder may not exist yet
    trafego.to_csv(os.path.join('./csv', fileName))
    return trafego

In [4]:
def open_datasets(datasets: dict, labels: dict = None, fileName: str = 'trafego_bruto.csv') -> DataFrame:
    """Convert the listed pcap files into one labelled flow table.

    ``datasets`` maps a key to an iterable of pcap paths. Every existing
    ``.pcap`` path is read with ``nfstream.NFStreamer`` (statistical
    analysis enabled) and its rows receive a ``label`` column: the key
    itself when ``labels`` is ``None``, otherwise ``labels[key]``. The
    concatenated table is written to ``./csv/<fileName>``.

    Files that fail to convert are reported and skipped (best effort).

    Args:
        datasets: Mapping ``key -> iterable of pcap file paths``.
        labels: Optional mapping ``key -> label value``.
        fileName: Name of the CSV written inside ``./csv/``.

    Returns:
        The concatenated, labelled DataFrame.

    Raises:
        ValueError: If no file could be converted.
    """
    flows = []

    for key, values in datasets.items():

        for file in values:

            if not (os.path.exists(file) and file.endswith('.pcap')):
                # Make skipped inputs visible instead of dropping them silently.
                print(f'Arquivo ignorado (inexistente ou não é .pcap): {file}')
                continue

            try:
                df = nfstream.NFStreamer(source=file, statistical_analysis=True).to_pandas()
                df['label'] = key if labels is None else labels[key]
            except Exception as e:
                # Best effort: report the cause and carry on with the other files.
                print(f'Erro in {file}: {e}')
                continue

            flows.append(df)

    if not flows:
        raise ValueError('Nenhum arquivo .pcap válido foi convertido')

    trafego = pd.concat(flows, ignore_index=True)

    os.makedirs('./csv', exist_ok=True)  # the output folder may not exist yet
    trafego.to_csv(os.path.join('./csv', fileName))
    return trafego

In [5]:
def concat_datasets(source: str, fileOutName: str = 'trafego_bruto.csv', chunksize: int = 100000,
                    output_dir: str = '../data/') -> str:
    """Append every CSV found in ``source`` into a single CSV file.

    The files are streamed in chunks of ``chunksize`` rows so that the
    inputs never have to fit in memory. Only the first chunk written
    carries the header line.

    NOTE(review): chunks are appended as-is, so all input files are assumed
    to share the same columns in the same order — confirm for new sources.

    Args:
        source: Folder that contains the input ``.csv`` files.
        fileOutName: Name of the output file.
        chunksize: Number of rows read and written per step.
        output_dir: Folder where the output file is written.

    Returns:
        The path of the written CSV file.

    Raises:
        ValueError: If no row could be written (no readable CSV in ``source``).
    """
    os.makedirs(output_dir, exist_ok=True)
    output_file = os.path.join(output_dir, fileOutName)  # output file path
    output_abs = os.path.abspath(output_file)

    first_chunk = True  # the first chunk creates the file and writes the header

    # Sorted so the row order of the output does not depend on the file system.
    for fileName in sorted(os.listdir(source)):
        if not fileName.endswith('.csv'):
            continue

        file_path = os.path.join(source, fileName)  # full path of the input file

        # Never read the file that is being written.
        if os.path.abspath(file_path) == output_abs:
            continue

        try:
            # Read the CSV in chunks and append them progressively.
            for chunk in pd.read_csv(file_path, chunksize=chunksize):
                chunk.to_csv(output_file, mode='w' if first_chunk else 'a', header=first_chunk, index=False)
                first_chunk = False
        except Exception as e:
            # Best effort: report the file and continue with the next one.
            print(f'Erro ao processar {fileName}: {e}')

    if first_chunk:
        raise ValueError(f'Nenhum CSV legível encontrado em {source}')

    print(f"Concatenação concluída! Arquivo salvo em: {output_file}")
    return output_file

In [6]:
import os
import pandas as pd
import dask.dataframe as dd

parquet_chunks_dir = "../data/parquet_chunks"
file_trafego_bruto = '../data/trafego_bruto.csv'
file_dic_labels = "../data/CICIoT2023_dictionary.csv"
source_csv = "../data/ME/csv/"
base_dir = "../data/"


def _normalize_mac(macs):
    """Return MAC addresses as lowercase strings without ':' and outer whitespace."""
    return macs.astype(str).str.lower().str.replace(":", "", regex=False).str.strip()


# Build the raw-traffic CSV only when it is missing. With its default
# arguments concat_datasets writes to '../data/trafego_bruto.csv', i.e. to
# file_trafego_bruto, so the path is used directly instead of the return value.
if not os.path.exists(file_trafego_bruto):
    concat_datasets(source_csv)
file_to_read = file_trafego_bruto

# Label the traffic only when the MAC dictionary exists and the parquet
# folder has not been created yet.
# NOTE(review): an interrupted run leaves the folder in place, so a partial
# export is not redone on the next run — delete the folder to regenerate.
if os.path.exists(file_dic_labels) and not os.path.exists(parquet_chunks_dir):

    # Read the MAC-address dictionary.
    df_dic_labels = pd.read_csv(file_dic_labels)

    # Normalise the MAC format *before* removing duplicates: two spellings of
    # the same address would otherwise both survive and break the lookup.
    df_dic_labels = df_dic_labels.dropna(subset=["MAC Address"])
    df_dic_labels["MAC Address"] = _normalize_mac(df_dic_labels["MAC Address"])
    df_dic_labels = df_dic_labels.drop_duplicates(subset=["MAC Address"])

    # Lookup table: normalised MAC -> dictionary row.
    mac_lookup = df_dic_labels.set_index("MAC Address")

    # Folder that receives the chunks in .parquet format.
    os.makedirs(parquet_chunks_dir, exist_ok=True)

    label_cols = list(df_dic_labels.columns.difference(["MAC Address"]))

    chunk_size = 100_000
    for chunk_idx, chunk in enumerate(pd.read_csv(file_to_read, chunksize=chunk_size)):

        # Each stored chunk gets its own 0..n-1 index.
        combine_chunk = chunk.reset_index(drop=True)

        src_mac = _normalize_mac(combine_chunk['src_mac'])
        dst_mac = _normalize_mac(combine_chunk['dst_mac'])

        # The source MAC has priority; the destination MAC is used only when
        # the source is not in the dictionary.
        matched_mac = src_mac.where(src_mac.isin(mac_lookup.index), dst_mac)

        # One label_<column> per dictionary column; rows without a known MAC stay null.
        for col in label_cols:
            combine_chunk[f'label_{col}'] = matched_mac.map(mac_lookup[col])

        combine_chunk['label_Category'] = combine_chunk['label_Category'].fillna('Desconhecido')
        combine_chunk['label_Category'] = combine_chunk['label_Category'].astype('category')

        # Save each chunk as Parquet.
        combine_chunk.to_parquet(
            os.path.join(parquet_chunks_dir, f"trafego_chunk_{chunk_idx}.parquet"),
            engine='pyarrow',
            compression='snappy'
        )

In a future release, Dask DataFrame will use a new implementation that
contains several improvements including a logical query planning.
The user-facing DataFrame API will remain unchanged.

The new implementation is already available and can be enabled by
installing the dask-expr library:

    $ pip install dask-expr

and turning the query planning option on:

    >>> import dask
    >>> dask.config.set({'dataframe.query-planning': True})
    >>> import dask.dataframe as dd

API documentation for the new implementation is available at
https://docs.dask.org/en/stable/dask-expr-api.html

Any feedback can be reported on the Dask issue tracker
https://github.com/dask/dask/issues 


    # via Python

    # via CLI


  import dask.dataframe as dd


Gerando relatório para uma análise dos dados.

In [7]:
import dask.dataframe as dd

# Open every parquet chunk of the folder as a single Dask DataFrame.
df_data = dd.read_parquet(
    parquet_chunks_dir,
    blocksize='25MB',
    engine='pyarrow',
)

# Show the available columns.
df_data.columns

Index(['Unnamed: 0', 'id', 'expiration_id', 'src_ip', 'src_mac', 'src_oui',
       'src_port', 'dst_ip', 'dst_mac', 'dst_oui', 'dst_port', 'protocol',
       'ip_version', 'vlan_id', 'tunnel_id', 'bidirectional_first_seen_ms',
       'bidirectional_last_seen_ms', 'bidirectional_duration_ms',
       'bidirectional_packets', 'bidirectional_bytes', 'src2dst_first_seen_ms',
       'src2dst_last_seen_ms', 'src2dst_duration_ms', 'src2dst_packets',
       'src2dst_bytes', 'dst2src_first_seen_ms', 'dst2src_last_seen_ms',
       'dst2src_duration_ms', 'dst2src_packets', 'dst2src_bytes',
       'bidirectional_min_ps', 'bidirectional_mean_ps',
       'bidirectional_stddev_ps', 'bidirectional_max_ps', 'src2dst_min_ps',
       'src2dst_mean_ps', 'src2dst_stddev_ps', 'src2dst_max_ps',
       'dst2src_min_ps', 'dst2src_mean_ps', 'dst2src_stddev_ps',
       'dst2src_max_ps', 'bidirectional_min_piat_ms',
       'bidirectional_mean_piat_ms', 'bidirectional_stddev_piat_ms',
       'bidirectional_max_piat

In [None]:
from ydata_profiling import ProfileReport
import gc as ColetorDeLixo


os.makedirs("../Relatorios", exist_ok=True)

for col in df_data.columns:

    # The "f" prefix used to sit inside the quotes, so the path was never
    # formatted and the existence check below could never succeed.
    report_path = f"../Relatorios/Relatório da coluna {col}.html"

    # Only 'label_Category' is profiled; it is skipped as well when its
    # report has already been generated.
    if col != 'label_Category' or os.path.exists(report_path):
        continue

    # Materialise just this column as a pandas DataFrame.
    pandas_col = df_data[[col]].compute()

    # Build the report.
    profile = ProfileReport(pandas_col, minimal=True, explorative=False)

    # Save it.
    profile.to_file(report_path)

    # Release memory explicitly before the next column.
    del pandas_col, profile
    ColetorDeLixo.collect()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Temos ao todo 87 colunas para analisar.

In [9]:
# Calcula o número de valores únicos por coluna
# nunique_por_coluna = df_data.nunique().compute()

# Filtra as colunas que têm apenas um valor único
# nunique_por_coluna[nunique_por_coluna == 1].index.tolist()

## 3-Preparação

Etapa em que prepararemos os dados para treinar nosso modelo.

Normalmente eu costumo fazer a filtragem das colunas numéricas (removendo as demais) no final da etapa 3, para garantir que todos os dados categóricos para os quais não encontrei uma solução viável de transformação sejam removidos. Entretanto, pela natureza dos dados, e como vou remover as linhas que possuem valores nulos, não quero que uma coluna que não vou usar tenha linhas com dados nulos e atrapalhe meu modelo.

In [None]:
# Preserva o índice original em uma coluna
df_data = df_data.reset_index().rename(columns={'index': 'row_id'})

numero_max = 1000_000

temp_df_pandas = df_data[['label_Category']].compute()

def coleta_idx(group: DataFrame, x):
    """
    Return the index labels of at most ``x`` rows of ``group``.

    When the group has ``x`` rows or fewer, every label is returned in the
    group's own order; otherwise ``x`` rows are drawn with ``sample`` using a
    fixed seed (42) and their labels are returned.
    """
    chosen = group if len(group) <= x else group.sample(n=x, random_state=42)
    return chosen.index.tolist()
    
indices_selecionados = []

# For every label_Category group, collect the index labels chosen by
# coleta_idx (at most numero_max per group).
for categoria, grupo in temp_df_pandas.groupby('label_Category', group_keys=False):
    indices_selecionados.extend(coleta_idx(grupo, numero_max))

indices_selecionados = sorted(indices_selecionados)

# The pandas helper frame is no longer needed; release it before filtering.
del temp_df_pandas
# NOTE(review): ColetorDeLixo is defined outside this cell — presumably an
# alias of the gc module; confirm.
ColetorDeLixo.collect()

# Keep only the selected rows. persist() materialises the result while keeping
# df_data a Dask DataFrame: the following cells still call .compute() and
# .cat.as_known() on it, which a pandas object (the result of .compute())
# does not provide.
df_data = df_data[df_data['row_id'].isin(indices_selecionados)].persist()


In [None]:

# Count how many rows each label_Category value has in df_data at this point.
df_data['label_Category'].value_counts().compute()


In [None]:
# Store label_Category, passed through the categorical accessor's as_known(),
# in a new 'label' column.
# NOTE(review): presumably this resolves Dask "unknown" categories — confirm.
df_data['label'] = df_data['label_Category'].cat.as_known()

In [None]:
# Columns removed by exact name.
colunas_desnecessarias = ['id','src_ip', 'dst_ip', 'row_id']

# Also remove every column whose name starts with "application_" or "Unnamed".
cols_to_drop = [
    col
    for col in df_data.columns
    if str(col).startswith(("application_", "Unnamed")) or col in colunas_desnecessarias
]

df_data = df_data.drop(columns=cols_to_drop)

# Show which columns were removed.
cols_to_drop

In [None]:
# Keep only the columns with a numeric dtype (int, float, ...).
# NOTE(review): every non-numeric column is discarded here, which would include
# a categorical 'label' column — confirm that this is intended.
df_data = df_data.select_dtypes(include=[np.number])

# Names of the numeric columns that were kept.
colunas_numericas = df_data.columns.tolist()

df_data.columns

In [None]:
# Number of distinct values in each column.
nunique_por_coluna = df_data.nunique().compute()

# Keep the columns with more than one distinct value. The boolean mask is
# labelled by column name, so it must select from nunique_por_coluna itself;
# applying it to df_data would treat it as a row filter.
colunas_validas = nunique_por_coluna[nunique_por_coluna > 1].index.tolist()

df_data = df_data[colunas_validas]

colunas_validas

* Removendo as colunas que têm mais de 50% de valores nulos e, depois, as linhas que contêm valores nulos.

In [None]:
# Total number of rows, used as the denominator of the null ratio.
size = len(df_data)

# Fraction of null values per column.
proporcao_nulos = (df_data.isnull().sum() / size).compute()

# Columns in which at most half of the values are null.
colunas_para_manter = proporcao_nulos.loc[proporcao_nulos <= 0.5].index.tolist()

# Keep those columns, then discard every row that still contains a null.
df_data = df_data[colunas_para_manter].dropna()

## 4- Modelagem

### Modelos

In [None]:
import random

# Pick two entries of family['Câmeras'] at independently drawn positions 0..5
# (randint includes both ends), so the two picks may be the same entry.
# NOTE(review): assumes family['Câmeras'] has at least six entries — confirm.
camera1 = family['Câmeras'][random.randint(0, 5)]
camera2 = family['Câmeras'][random.randint(0, 5)]

print(f"{camera1} e {camera2}")

In [None]:
# Split: hold out the traffic of two cameras as the test set and train on the rest.

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score
from yellowbrick.classifier.rocauc import roc_auc
import random

# Earlier random hold-out split, kept for reference only:
# teste_size = random.randrange(10, 40) / 100
# data_x = dados.drop(columns=['label','id'])
# data_y = dados['label']
# train_x, test_x, train_y, test_y = train_test_split(data_x, data_y, test_size=teste_size, random_state=42)

# NOTE(review): assumes family['Câmeras'] has at least six entries; both picks
# are independent, so they may coincide — confirm this is acceptable.
camera1 = family['Câmeras'][random.randint(0, 5)]
camera2 = family['Câmeras'][random.randint(0, 5)]
cameras_teste = [camera1, camera2]

# A flow belongs to the test set when either endpoint is one of the chosen
# cameras. (Requiring source AND destination to both equal camera1 selected
# almost nothing and left camera2 unused.)
mascara_teste = dados['src_mac'].isin(cameras_teste) | dados['dst_mac'].isin(cameras_teste)

test_x = dados[mascara_teste]
test_y = test_x['label'].values

# Training data is every row that is not in the test set, with its own labels.
train_x = dados[~mascara_teste]
# Kept as a Series (not .values) because the feature-selection cell calls
# train_y.values.
train_y = train_x['label']

# Features stay DataFrames: the following cells use select_dtypes and .iloc.
train_x = train_x.drop(columns=['label','id'])
test_x = test_x.drop(columns=['label','id'])

models = DataFrame()  # predictions of every model, one column per model

In [None]:
# Names of the numeric columns found in each feature set.
colunas_numericas_train = train_x.select_dtypes(include=['number']).columns
colunas_numericas_test = test_x.select_dtypes(include=['number']).columns

# Restrict both feature sets to their numeric columns.
train_x = train_x[colunas_numericas_train]
test_x = test_x[colunas_numericas_test]

* Selecionando as melhores variáveis preditoras com o algoritmo [Boruta](https://medium.com/data-hackers/como-selecionar-melhores-vari%C3%A1veis-para-o-seu-modelo-com-boruta-ef7cbfb3fc35).

In [None]:
# Feature selection: fit BorutaPy, wrapped around a random forest, on the
# training data.

from boruta import BorutaPy
from sklearn.ensemble import RandomForestClassifier 

# Base estimator handed to Boruta: depth limited to 5, balanced class weights,
# all CPU cores.
rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5)

feat_selector = BorutaPy(rf, n_estimators='auto', verbose=2, random_state=42)

# NOTE(review): .values requires train_x and train_y to still be pandas
# objects at this point — confirm against the split cell.
feat_selector.fit(train_x.values, train_y.values)


In [None]:
def plot_confusion_matrix (clf_name:str, y_true, y_pred, target_names = ['Não Sobrevivel', 'Sobrevivel']):
    """Draw the confusion matrix of ``y_pred`` against ``y_true`` as a heatmap.

    ``clf_name`` becomes the plot title and ``target_names`` supplies the tick
    labels of both axes. Nothing is returned; the figure is shown.
    """
    heatmap_axes = sns.heatmap(
        confusion_matrix(y_true, y_pred), annot=True, fmt='', cmap='Blues'
    )
    heatmap_axes.set_title(clf_name)

    # Same labels on the horizontal and on the vertical axis.
    for axis in (heatmap_axes.xaxis, heatmap_axes.yaxis):
        axis.set_ticklabels(target_names)

    heatmap_axes.get_figure().show()

In [None]:
# Keep, in both feature sets, only the columns flagged in feat_selector.support_.
train_x, test_x = (
    frame.iloc[:, feat_selector.support_.tolist()] for frame in (train_x, test_x)
)

In [None]:
# Random forest

from sklearn.ensemble import RandomForestClassifier

# fit() returns the estimator itself, so construction and training are chained.
RFClf = RandomForestClassifier().fit(train_x, train_y)

rfclf_y = RFClf.predict(test_x)
models['RFClf'] = rfclf_y

In [None]:
# Accuracy of the random forest predictions against the test labels.
accuracy = accuracy_score(test_y, rfclf_y)

print(accuracy)

In [None]:
# Confusion matrix of the random forest; the title now matches the 'RFClf'
# key under which its predictions are stored in models.
plot_confusion_matrix('RFClf', test_y, rfclf_y, ['Normal', 'IOT'])

In [None]:
# K-nearest neighbours with k = 2

from sklearn.neighbors import KNeighborsClassifier

# fit() returns the estimator itself, so construction and training are chained.
Knn = KNeighborsClassifier(n_neighbors=2).fit(train_x, train_y)

knn_y = Knn.predict(test_x)
models['KNN'] = knn_y

In [None]:
# Accuracy of the KNN predictions against the test labels.
accuracy = accuracy_score(test_y, knn_y)

print(accuracy)

In [None]:
# Confusion matrix of the KNN predictions, with 'Normal' / 'IOT' tick labels.
plot_confusion_matrix('KNN', test_y, knn_y, ['Normal', 'IOT'])

In [None]:
# XGBoost with its default hyper-parameters

from xgboost import XGBClassifier

# fit() returns the estimator itself, so construction and training are chained.
XGB = XGBClassifier().fit(train_x, train_y)

xgb_y = XGB.predict(test_x)
models['XGB'] = xgb_y

In [None]:
# Accuracy of the XGBoost predictions against the test labels.
accuracy = accuracy_score(test_y, xgb_y)

print(accuracy)

In [None]:
# Confusion matrix of the XGBoost predictions, with 'Normal' / 'IOT' tick labels.
plot_confusion_matrix('XGB', test_y, xgb_y, ['Normal', 'IOT'])

In [None]:
from sklearn.ensemble import GradientBoostingClassifier



* HistGradientBoostingClassifier para substituir o LightGBM.

In [None]:
from sklearn.ensemble import HistGradientBoostingClassifier



Referência do modelo [catboost](https://catboost.ai/docs/)

In [None]:
import catboost

# CatBoost classifier: 100 iterations, learning rate 0.1, depth 6, Logloss.
# fit() returns the estimator itself, so construction and training are chained.
CatBClf = catboost.CatBoostClassifier(
    iterations=100, learning_rate=0.1, depth=6, loss_function='Logloss'
).fit(train_x, train_y, verbose=False)

cb_y = CatBClf.predict(test_x)
models['CB'] = cb_y

In [None]:
# Accuracy of the CatBoost predictions against the test labels; shown as the
# cell's output.
accuracy = accuracy_score(test_y, cb_y)

accuracy

In [None]:
# Confusion matrix of the CatBoost predictions (cb_y). The title used to be the
# random forest's, copied from an earlier cell; it now matches the 'CB' key
# under which these predictions are stored in models.
plot_confusion_matrix('CB', test_y, cb_y, ['Normal', 'IOT'])

In [None]:
from sklearn.svm import SVC

# Support vector classifier with default settings.
# fit() returns the estimator itself, so construction and training are chained.
SVClf = SVC().fit(train_x, train_y)

svc_y = SVClf.predict(test_x)
models['SVC'] = svc_y

In [None]:
# ref -> https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html#sphx-glr-auto-examples-classification-plot-classifier-comparison-py
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with default settings.
# fit() returns the estimator itself, so construction and training are chained.
NBayes = GaussianNB().fit(train_x, train_y)

nb_y = NBayes.predict(test_x)
models['NBayes'] = nb_y

In [None]:
def calculate_leaderboard(models: DataFrame, y_true=None) -> DataFrame:
    """Build a metric-by-model table from stored predictions.

    Args:
        models: one column of predicted labels per model.
        y_true: ground-truth labels the predictions are scored against.
            Defaults to the notebook-level ``test_y``.

    Returns:
        A DataFrame with the rows 'accuracy', 'precision', 'recall' and
        'f1_score' (percentages rounded to 4 decimals), one column per model
        and a final 'mean all' column holding each row's mean.
    """
    if y_true is None:
        y_true = test_y

    metricas = {
        'accuracy': accuracy_score,
        'precision': precision_score,
        'recall': recall_score,
        'f1_score': f1_score,
    }

    leaderboard = DataFrame(columns=list(models.columns))

    for col in list(leaderboard.columns):
        y_pred = np.array(models[col])
        for nome, metrica in metricas.items():
            leaderboard.at[nome, col] = round(metrica(y_true, y_pred) * 100, 4)

    leaderboard['mean all'] = [round(i, 4) for i in leaderboard.mean(axis=1).values]

    # Return the table itself: a trailing comma here used to wrap it in a
    # 1-tuple, contradicting the declared return type.
    return leaderboard

# Score every stored prediction column and show the result as the cell output.
myleaderboard = calculate_leaderboard(models)

myleaderboard

In [None]:
def plot_confusion_matrix (clf_name:str, y_true, y_pred, target_names = ['Não Sobrevivel', 'Sobrevivel']):
    """Draw the confusion matrix of ``y_pred`` against ``y_true`` as a heatmap.

    ``clf_name`` becomes the plot title and ``target_names`` supplies the tick
    labels of both axes. Nothing is returned; the figure is shown.
    """
    heatmap_axes = sns.heatmap(
        confusion_matrix(y_true, y_pred), annot=True, fmt='', cmap='Blues'
    )
    heatmap_axes.set_title(clf_name)

    # Same labels on the horizontal and on the vertical axis.
    for axis in (heatmap_axes.xaxis, heatmap_axes.yaxis):
        axis.set_ticklabels(target_names)

    heatmap_axes.get_figure().show()

Referências:
* [XGBClassifier](https://towardsdatascience.com/beginners-guide-to-xgboost-for-classification-problems-50f75aac5390),
[Refinando Hiperparâmetros XGBClf](https://medium.com/@keler.yohan/como-refinar-os-hiper-par%C3%A2metros-de-um-modelo-xgboost-em-python-a096f6ca8f14)

* [Área sob a curva (AUC)](https://medium.com/bio-data-blog/entenda-o-que-%C3%A9-auc-e-roc-nos-modelos-de-machine-learning-8191fb4df772)

* [Coeficiente kappa](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.cohen_kappa_score.html#sklearn.metrics.cohen_kappa_score)



In [None]:
from sklearn.model_selection import RandomizedSearchCV
from yellowbrick.classifier.rocauc import roc_auc
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, cohen_kappa_score, ConfusionMatrixDisplay
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from catboost import CatBoostClassifier
from sklearn.ensemble import HistGradientBoostingClassifier

# Search space of every model: (display name, estimator, parameter
# distributions). Each entry is unpacked as (name, clf, paramss) by the
# hyper-parameter search below.
modelos = [
    (
        'RFClf',
        RandomForestClassifier(),
        {
            'n_estimators': list(range(50, 1000, 50)),
            'max_depth': [None, 10, 20, 30],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
            'random_state': [0, 42],
        },
    ),
    (
        'XGBoost',
        XGBClassifier(),
        {
            #'colsample_bytree': [i / 100 for i in range(3, 15, 1)],
            'eta': [i / 1000 for i in range(1, 100, 25)],
            'eval_metric': ['auc'],
            'max_depth': list(range(5, 85, 5)),
            'min_child_weight': list(range(15)),
            'objective': ['binary:hinge'],
            'subsample': [i / 100 for i in range(3, 15, 1)],
            'booster': ['gbtree', 'dart'],
        },
    ),
    (
        'SVC',
        SVC(),
        [
            {"kernel": ["rbf"], "gamma": [1e-3, 1e-4], "C": [1, 10, 100, 1000]},
            {"kernel": ["linear"], "C": [1, 10, 100, 1000]},
        ],
    ),
    (
        'KNN',
        KNeighborsClassifier(),
        {
            'n_neighbors': [2],
        },
    ),
    (
        'Naive Baiyes',
        GaussianNB(),
        {},  # nothing to tune: defaults only
    ),
    (
        'Cat boost',
        CatBoostClassifier(),
        {},  # nothing to tune: defaults only
    ),
    (
        'Ligth GBM',
        HistGradientBoostingClassifier(),
        {
            'learning_rate': [.0001, .00025, .0005, .00075, .001, .0025, .0075, .01],
            # 'log_loss' is the only name kept: 'auto', 'binary_crossentropy'
            # and 'categorical_crossentropy' were deprecated by scikit-learn
            # and later removed.
            'loss': ['log_loss'],
            'max_iter': list(range(100, 1000, 100)),
            # range(30, 3, 87) was empty, leaving no candidate to sample.
            # NOTE(review): the same three numbers are read here as
            # start=3, stop=87, step=30 (-> 3, 33, 63) — confirm the intended depths.
            'max_depth': list(range(3, 87, 30)),
            'max_leaf_nodes': list(range(30, 80, 5)),
        },
    ),
]

for name, clf, paramss in modelos:

    algritm = RandomizedSearchCV(clf, paramss, cv = 5)
 
    algritm.fit(train_x, train_y)

    y_pred = algritm.predict(test_x)

    classification_report(test_y, y_pred)