In [27]:
# 1) Import the required libraries

import pandas as pd                          # pandas: reading and manipulating tables (DataFrame)
import numpy as np                           # numpy: numerical operations and arrays
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
                                             # OneHotEncoder: encodes categories as binary vectors
                                             # MinMaxScaler: rescales continuous variables into a fixed range

In [24]:
# 2) Load the data
# Paths are relative to the notebook's working directory. Forward slashes are used because
# a backslash is a path separator only on Windows; '/' resolves on Windows, Linux and macOS.
DATA_DIR = '../data/csv_files'
df_train = pd.read_csv(f'{DATA_DIR}/KDDTrain+_20Percent.csv')
#         └─ loads the training CSV into a DataFrame
df_test  = pd.read_csv(f'{DATA_DIR}/KDDTest-21.csv')
#         └─ loads the test CSV into a separate DataFrame

In [20]:
# Preview the first five rows of the training frame
df_train.head(n=5)

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,class
0,0.0,tcp,ftp_data,SF,491.0,0.0,0,0.0,0.0,0.0,...,25.0,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal
1,0.0,udp,other,SF,146.0,0.0,0,0.0,0.0,0.0,...,1.0,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal
2,0.0,tcp,private,S0,0.0,0.0,0,0.0,0.0,0.0,...,26.0,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,anomaly
3,0.0,tcp,http,SF,232.0,8153.0,0,0.0,0.0,0.0,...,255.0,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal
4,0.0,tcp,http,SF,199.0,420.0,0,0.0,0.0,0.0,...,255.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal


In [21]:
# Preview the first five rows of the test frame
df_test.head(n=5)

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,class
0,13.0,tcp,telnet,SF,118.0,2425.0,0,0.0,0.0,0.0,...,10.0,0.38,0.12,0.04,0.0,0.0,0.0,0.12,0.3,anomaly
1,0.0,udp,private,SF,44.0,0.0,0,0.0,0.0,0.0,...,254.0,1.0,0.01,0.01,0.0,0.0,0.0,0.0,0.0,anomaly
2,0.0,tcp,telnet,S3,0.0,44.0,0,0.0,0.0,0.0,...,79.0,0.31,0.61,0.0,0.0,0.21,0.68,0.6,0.0,anomaly
3,0.0,udp,private,SF,53.0,55.0,0,0.0,0.0,0.0,...,255.0,1.0,0.0,0.87,0.0,0.0,0.0,0.0,0.0,normal
4,0.0,tcp,private,SH,0.0,0.0,0,0.0,0.0,0.0,...,1.0,0.06,1.0,1.0,0.0,1.0,1.0,0.0,0.0,anomaly


In [25]:
# 3) Split the columns into categorical and numeric groups
cat_cols = ['protocol_type', 'service', 'flag']
# The head() previews show an 'Unnamed: 0' column holding 0, 1, 2, ... -- presumably the row
# index saved with the CSV (TODO confirm). It is not a traffic feature, so it must not be
# scaled and fed to the models. Index.difference ignores labels that are not present, so
# listing it here changes nothing when the column is absent.
non_feature_cols = cat_cols + ['class', 'Unnamed: 0']
num_cols = df_train.columns.difference(non_feature_cols)
#         └─ every remaining column is treated as numeric; difference() returns them sorted by name,
#            so the numeric feature order is alphabetical, not the CSV order


In [31]:
# 4) One-hot encode the categorical variables
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
# Learn the category -> column mapping from the training set only
ohe.fit(df_train[cat_cols])
# Encode both splits with that single mapping so their columns line up
X_cat_train, X_cat_test = (ohe.transform(frame[cat_cols]) for frame in (df_train, df_test))

X_cat_test

array([[0., 1., 0., ..., 0., 1., 0.],
       [0., 0., 1., ..., 0., 1., 0.],
       [0., 1., 0., ..., 1., 0., 0.],
       ...,
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 1., 0.]])

In [32]:
# 5) Scale the numeric variables to the [-1, 1] range
scaler = MinMaxScaler(feature_range=(-1, 1))
# Per-column min/max are learned from the training data only
scaler.fit(df_train[num_cols])
X_num_train = scaler.transform(df_train[num_cols])
# The test split reuses the training min/max instead of being re-fitted
X_num_test = scaler.transform(df_test[num_cols])

X_num_test

array([[-1.        , -1.        , -0.99905851, ..., -1.        ,
        -1.        , -1.        ],
       [-0.98823529,  0.        , -1.        , ..., -1.        ,
        -1.        , -1.        ],
       [-1.        , -1.        , -0.99998292, ..., -1.        ,
        -1.        , -1.        ],
       ...,
       [-1.        , -1.        , -0.99999418, ..., -1.        ,
        -1.        , -1.        ],
       [-0.98431373,  0.6       , -1.        , ..., -1.        ,
        -1.        , -1.        ],
       [-0.98823529, -1.        , -1.        , ..., -1.        ,
        -1.        , -1.        ]])

In [33]:
# 6) Merge numeric and categorical features into a single matrix per split
# Each pair is stacked side by side (column-wise): numeric block first, one-hot block second
feature_blocks = (
    (X_num_train, X_cat_train),
    (X_num_test, X_cat_test),
)
X_train, X_test = (np.hstack(blocks) for blocks in feature_blocks)

X_test

array([[-1.        , -1.        , -0.99905851, ...,  0.        ,
         1.        ,  0.        ],
       [-0.98823529,  0.        , -1.        , ...,  0.        ,
         1.        ,  0.        ],
       [-1.        , -1.        , -0.99998292, ...,  1.        ,
         0.        ,  0.        ],
       ...,
       [-1.        , -1.        , -0.99999418, ...,  0.        ,
         0.        ,  0.        ],
       [-0.98431373,  0.6       , -1.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.98823529, -1.        , -1.        , ...,  0.        ,
         1.        ,  0.        ]])

In [34]:
# 7) Extract the target vector (class labels)
# The 'class' column of each split is pulled out as an array of labels
# (the previews show the values "normal" and "anomaly")
y_train, y_test = (frame['class'].values for frame in (df_train, df_test))

y_test

array(['anomaly', 'anomaly', 'anomaly', ..., 'anomaly', 'anomaly',
       'anomaly'], dtype=object)

In [None]:
# Pedrão: from this point on we can do whatever we want with the data — X_train, X_test, y_train and y_test are ready to be used in the models