In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, train_test_split, ParameterGrid
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from scipy.stats import wilcoxon
import gc

In [29]:
df_attack = pd.read_csv("CICIDSFULLOnlyAttacks.csv")
df_normal = pd.read_csv("CICIDSFULLOnlyNormal.csv")

In [30]:
dataframe = {
    'Benign': 0, 'DDOS attack-HOIC': 1, 'DDoS attacks-LOIC-HTTP': 1,
    'DoS attacks-Hulk': 1, 'Bot': 1, 'FTP-BruteForce': 1,
    'SSH-Bruteforce': 1, 'Infilteration': 1,
    'DoS attacks-SlowHTTPTest': 1, 'DoS attacks-GoldenEye': 1, 'DoS attacks-Slowloris': 1,
    'DDOS attack-LOIC-UDP': 1, 'Brute Force -Web': 1, 'Brute Force -XSS': 1, 'SQL Injection': 1
}

df_attack.replace(dataframe, inplace=True)
df_normal.replace(dataframe, inplace=True)


  df_attack.replace(dataframe, inplace=True)
  df_normal.replace(dataframe, inplace=True)


In [31]:
df_x = pd.concat([df_attack, df_normal])

In [136]:
df_x = pd.concat([df_attack, df_normal])

corr_matrix = df_x.corr().abs()

upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

limite = 0.9
to_drop = [column for column in upper.columns if any(upper[column] > limite)]

df_testes = df_x.drop(columns=to_drop)

df_testes.columns

Index(['dst_port', 'protocol', 'flow_duration', 'tot_fwd_pkts', 'tot_bwd_pkts',
       'totlen_fwd_pkts', 'fwd_pkt_len_max', 'fwd_pkt_len_min',
       'bwd_pkt_len_max', 'bwd_pkt_len_min', 'bwd_pkt_len_mean', 'flow_byts/s',
       'flow_pkts/s', 'flow_iat_mean', 'flow_iat_std', 'flow_iat_max',
       'bwd_iat_tot', 'bwd_iat_mean', 'bwd_iat_std', 'bwd_iat_max',
       'bwd_iat_min', 'fwd_psh_flags', 'bwd_psh_flags', 'fwd_urg_flags',
       'bwd_urg_flags', 'bwd_pkts/s', 'pkt_len_var', 'fin_flag_cnt',
       'rst_flag_cnt', 'psh_flag_cnt', 'ack_flag_cnt', 'urg_flag_cnt',
       'down/up_ratio', 'fwd_byts/b_avg', 'fwd_pkts/b_avg', 'fwd_blk_rate_avg',
       'bwd_byts/b_avg', 'bwd_pkts/b_avg', 'bwd_blk_rate_avg',
       'init_fwd_win_byts', 'init_bwd_win_byts', 'fwd_seg_size_min',
       'active_mean', 'active_std', 'idle_min', 'label'],
      dtype='object')

In [3]:
#df_testes.to_csv('df_testes.csv', index=False)
df_testes = pd.read_csv("df_testes.csv")

In [None]:
del df_x
del df_attack
del df_normal
gc.collect()

In [None]:
df_reduced_percent = df_testes.sample(frac=0.01, random_state=42) #Parcela do dataset que roda na minha maquina

#descomentar essa linha no treinamento na maquina que aguenta o tranco
#df_reduced_percent = df_testes 

x = df_reduced_percent.drop('label', axis = 1)
y = df_reduced_percent['label']

x1 = x.values
y1 = y.values

In [5]:
y.value_counts()

label
0    27540
1    27398
Name: count, dtype: int64

In [7]:
# Hiperparâmetros KNN
param_grid_KNN = {
    'n_neighbors': [3, 5, 7, 9, 11,],
    'metric': ['euclidean', 'manhattan', 'cosine', 'chebyshev', 'braycurtis', 'correlation']
}

# Hiperparâmetros Decision Tree (DT)
param_grid_DT = {
    'max_depth': [3, 6, 7, 9, 11]
}

# Hiperparâmetros Random Forest (RF)
param_grid_RF = {
    'n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
}

# Hiperparâmetros XGBoost (XGB)
param_grid_XGB = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 6, 9],
    'learning_rate': [0.01, 0.1, 0.2]
}

#Arrays de metricas
test_scores_KNN, precision_scores_KNN, recall_scores_KNN, f1_scores_KNN, accs_KNN, par_KNN = [], [], [], [], [], []
test_scores_DT, precision_scores_DT, recall_scores_DT, f1_scores_DT, accs_DT, par_DT = [], [], [], [], [], []
test_scores_RF, precision_scores_RF, recall_scores_RF, f1_scores_RF, accs_RF, par_RF = [], [], [], [], [], []
test_scores_XGB, precision_scores_XGB, recall_scores_XGB, f1_scores_XGB, accs_XGB, par_XGB = [], [], [], [], [], []

#Instanciação do K-folds
kf = KFold(n_splits=10, shuffle=True, random_state=42)

#Instanciação do NearMiss (ele quebra o modelo sempre, vamos ver com o conjunto de dados total)

#quando for usar na maquina que tanka tem que mudar o sampling_strategy para 0.2
#near_miss_gridSearch = NearMiss(sampling_strategy=1.0, n_neighbors_ver3=3) 
#near_miss = NearMiss(sampling_strategy=1.0, n_neighbors_ver3=3) 

"""
Não faz sentido pegar subasmostragem dentro do grid serach se ele já usa uma subamostragem de outra subamostragem, 
no caso ele usa 20% do conjundo de treinamento para validção

Faz sentido usar no conjunto de treino do proprio algoritmo (fora do grid search), dado que ele treina com todo o conjunto de treino
"""
scaler = StandardScaler()


In [8]:
for train_index, test_index in kf.split(x1):
    X_train, X_test, Y_train, Y_test = x1[train_index], x1[test_index], y1[train_index], y1[test_index]

    X_train_val, X_test_val, y_train_val, y_test_val = train_test_split(X_train, Y_train, random_state=42, test_size=0.2)

    ############################################################################################################################
    # K-Nearest Neighbors - KNN
    for params in ParameterGrid(param_grid_KNN):
                
        X_train_val_KNN = scaler.fit_transform(X_train_val)
        X_test_val_KNN = scaler.transform(X_test_val)
    
        knn = KNeighborsClassifier(n_neighbors=params['n_neighbors'], metric=params['metric'], n_jobs=-1)
        knn.fit(X_train_val, y_train_val)
        y_pred = knn.predict(X_test_val)
        
        acc = accuracy_score(y_test_val, y_pred)
        accs_KNN.append(acc)
        par_KNN.append(params)
        
    X_train_KNN = scaler.fit_transform(X_train)
    X_test_KNN = scaler.transform(X_test)

    best_params_KNN = par_KNN[accs_KNN.index(max(accs_KNN))]
    knn_best = KNeighborsClassifier(metric=best_params_KNN['metric'], n_neighbors=best_params_KNN['n_neighbors'], n_jobs=-1)
    knn_best.fit(X_train_KNN, Y_train)
    y_pred = knn_best.predict(X_test_KNN)
    
    test_scores_KNN.append(accuracy_score(Y_test, y_pred))
    precision_scores_KNN.append(precision_score(Y_test, y_pred, average='weighted'))
    recall_scores_KNN.append(recall_score(Y_test, y_pred, average='weighted'))
    f1_scores_KNN.append(f1_score(Y_test, y_pred, average='weighted'))
    
    ############################################################################################################################
    # Decision Tree - DT
    for max_depth in param_grid_DT['max_depth']:
                
        X_train_val_DT = scaler.fit_transform(X_train_val)
        X_test_val_DT = scaler.transform(X_test_val)
        
        dt = DecisionTreeClassifier(max_depth=max_depth, max_features='log2', random_state=42)
        dt.fit(X_train_val_DT, y_train_val)
        y_pred = dt.predict(X_test_val_DT)
        
        acc = accuracy_score(y_test_val, y_pred)
        accs_DT.append(acc)
        par_DT.append(max_depth)
            
    X_train_DT = scaler.fit_transform(X_train)
    X_test_DT = scaler.transform(X_test)

    dt_best = DecisionTreeClassifier(max_depth=par_DT[accs_DT.index(max(accs_DT))], max_features='log2', random_state=42)
    dt_best.fit(X_train, Y_train)
    y_pred = dt_best.predict(X_test)
    
    test_scores_DT.append(accuracy_score(Y_test, y_pred))
    precision_scores_DT.append(precision_score(Y_test, y_pred, average='weighted'))
    recall_scores_DT.append(recall_score(Y_test, y_pred, average='weighted'))
    f1_scores_DT.append(f1_score(Y_test, y_pred, average='weighted'))

    ############################################################################################################################
    # Random Forest - RF
    for params in ParameterGrid(param_grid_RF):
          
        X_train_val_RF = scaler.fit_transform(X_train_val)
        X_test_val_RF = scaler.transform(X_test_val)
        
        rf = RandomForestClassifier(n_estimators=params['n_estimators'], random_state=42, max_features='sqrt',  n_jobs=-1, verbose=1)
        rf.fit(X_train_val_RF, y_train_val)
        y_pred = rf.predict(X_test_val_RF)
        
        acc = accuracy_score(y_test_val, y_pred)
        accs_RF.append(acc)
        par_RF.append(params)
        
    X_train_RF = scaler.fit_transform(X_train)
    X_test_RF = scaler.transform(X_test)

    best_param_RF = par_RF[accs_RF.index(max(accs_RF))]
    rf_best = RandomForestClassifier(**best_param_RF, random_state=42, max_features='sqrt',  n_jobs=-1, verbose=1)
    rf_best.fit(X_train, Y_train)
    y_pred = rf_best.predict(X_test)

    test_scores_RF.append(accuracy_score(Y_test, y_pred))
    precision_scores_RF.append(precision_score(Y_test, y_pred, average='weighted'))
    recall_scores_RF.append(recall_score(Y_test, y_pred, average='weighted'))
    f1_scores_RF.append(f1_score(Y_test, y_pred, average='weighted'))

    ############################################################################################################################
    # XGBoost _ XGB
    for params in ParameterGrid(param_grid_XGB):
        
        X_train_val_XGB = scaler.fit_transform(X_train_val)
        X_test_val_XGB = scaler.transform(X_test_val)
        
        xgb = XGBClassifier(n_estimators=params['n_estimators'], max_depth=params['max_depth'], learning_rate=params['learning_rate'], n_jobs=-1,
                            random_state=42, use_label_encoder=False, eval_metric='logloss', tree_method='hist')
        xgb.fit(X_train_val_XGB, y_train_val)
        y_pred = xgb.predict(X_test_val_XGB)
        acc = accuracy_score(y_test_val, y_pred)
        accs_XGB.append(acc)
        par_XGB.append(params)
        
    X_train_XGB = scaler.fit_transform(X_train)
    X_test_XGB = scaler.transform(X_test)

    best_params = par_XGB[accs_XGB.index(max(accs_XGB))]
    xgb_best = XGBClassifier(**best_params, random_state=42, use_label_encoder=False, eval_metric='logloss')
    xgb_best.fit(X_train, Y_train)
    y_pred = xgb_best.predict(X_test)
    
    test_scores_XGB.append(accuracy_score(Y_test, y_pred))
    precision_scores_XGB.append(precision_score(Y_test, y_pred, average='weighted'))
    recall_scores_XGB.append(recall_score(Y_test, y_pred, average='weighted'))
    f1_scores_XGB.append(f1_score(Y_test, y_pred, average='weighted'))
    
    
    data = {
    "Modelo": ["KNN", "DT", "RF", "XGB"],
    "Acurácia": [np.mean(test_scores_KNN), np.mean(test_scores_DT), np.mean(test_scores_RF), np.mean(test_scores_XGB)],
    "Precisão": [np.mean(precision_scores_KNN), np.mean(precision_scores_DT), np.mean(precision_scores_RF), np.mean(precision_scores_XGB)],
    "Recall": [np.mean(recall_scores_KNN), np.mean(recall_scores_DT), np.mean(recall_scores_RF), np.mean(recall_scores_XGB)],
    "F1 Score": [np.mean(f1_scores_KNN), np.mean(f1_scores_DT), np.mean(f1_scores_RF), np.mean(f1_scores_XGB)]
}

metrics = pd.DataFrame(data)
metrics



[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   2 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=10)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 out of  20 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    0.1s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  18 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=12)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    0.2s f

Unnamed: 0,Modelo,Acurácia,Precisão,Recall,F1 Score
0,KNN,0.967964,0.968662,0.967964,0.96795
1,DT,0.957862,0.958236,0.957862,0.957854
2,RF,0.966726,0.967311,0.966726,0.966714
3,XGB,0.96891,0.969817,0.96891,0.968894


In [9]:
metrics.to_csv('metrics.csv', index=False)

In [None]:
model_metrics = {
    "KNN": {
        "accuracy": test_scores_KNN,
        "precision": precision_scores_KNN,
        "recall": recall_scores_KNN,
        "f1": f1_scores_KNN
    },
    "DT": {
        "accuracy": test_scores_DT,
        "precision": precision_scores_DT,
        "recall": recall_scores_DT,
        "f1": f1_scores_DT
    },
    "RF": {
        "accuracy": test_scores_RF,
        "precision": precision_scores_RF,
        "recall": recall_scores_RF,
        "f1": f1_scores_RF
    },
    "XGB": {
        "accuracy": test_scores_XGB,
        "precision": precision_scores_XGB,
        "recall": recall_scores_XGB,
        "f1": f1_scores_XGB
    }
}

# Comparação de todas as métricas entre todos os pares de modelos
model_names = list(model_metrics.keys())
metrics = ["accuracy", "precision", "recall", "f1"]
wilcoxon_results = {}

for i in range(len(model_names)):
    for j in range(i + 1, len(model_names)):  # Para evitar repetições
        model1, model2 = model_names[i], model_names[j]
        for metric in metrics:
            # Pegando os valores das métricas
            data1 = model_metrics[model1][metric]
            data2 = model_metrics[model2][metric]

            min_length = min(len(data1), len(data2))
            if min_length == 0:
                continue  # Pula essa comparação se uma das listas estiver vazia

            # Truncar para o menor tamanho comum
            data1, data2 = data1[:min_length], data2[:min_length]

            # Aplicar Wilcoxon
            stat, p_value = wilcoxon(data1, data2)
            wilcoxon_results[f"{model1} vs {model2} - {metric}"] = (stat, p_value)

table_data = []

for comparison, (stat, p) in wilcoxon_results.items():
    reject_h0 = "Sim" if p < 0.05 else "Não"
    table_data.append([comparison, stat, round(p, 3), reject_h0])

df_results = pd.DataFrame(table_data, columns=["Comparação", "Estatística", "p-valor", "Rejeita H₀"])

df_results

Unnamed: 0,Comparação,Estatística,p-valor,Rejeita H₀
0,KNN vs DT - accuracy,0.0,0.002,Sim
1,KNN vs DT - precision,0.0,0.002,Sim
2,KNN vs DT - recall,0.0,0.002,Sim
3,KNN vs DT - f1,0.0,0.002,Sim
4,KNN vs RF - accuracy,4.0,0.027,Sim
5,KNN vs RF - precision,5.0,0.02,Sim
6,KNN vs RF - recall,4.0,0.027,Sim
7,KNN vs RF - f1,6.0,0.027,Sim
8,KNN vs XGB - accuracy,6.0,0.025,Sim
9,KNN vs XGB - precision,3.0,0.01,Sim


In [28]:
besT_params_RF

[{'n_estimators': 10},
 {'n_estimators': 20},
 {'n_estimators': 10},
 {'n_estimators': 10},
 {'n_estimators': 10},
 {'n_estimators': 20},
 {'n_estimators': 10},
 {'n_estimators': 10},
 {'n_estimators': 10},
 {'n_estimators': 20}]