In [1]:
import time
import numpy as np
import pandas as pd
import pickle
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from scipy.sparse import csr_matrix
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import joblib
import xgboost as xgb
import time
import numpy as np


In [3]:
path = r'E:\Projetos\llm_security\dataset'
path_model = r'E:\Projetos\llm_security\models'

train_df = pd.read_parquet(f'{path}/train_net.parquet')
test_df = pd.read_parquet(f'{path}/test_net.parquet')


In [None]:

# Selecting only the necessary features
features_to_use = ['protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes']
categorical_features = ['protocol_type', 'service', 'flag']
numerical_features = ['src_bytes', 'dst_bytes']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', MinMaxScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ],
    remainder='drop'  # This drops the columns that are not explicitly transformed
)

# Apply preprocessing to both training and testing data
X_train = preprocessor.fit_transform(train_df[features_to_use])
X_test = preprocessor.transform(test_df[features_to_use])

# Convert to CSR format for models that require it
X_train_csr = csr_matrix(X_train)
X_test_csr = csr_matrix(X_test)

# Convert to dense format for deep learning models
X_train_dense = X_train_csr.toarray()
X_test_dense = X_test_csr.toarray()

y_train = train_df['binary_label']
y_test = test_df['binary_label']

# Salvar o pipeline de pré-processamento
joblib.dump(preprocessor, f'{path_model}/preprocessor_model.joblib')
print("✅ Pré-processador salvo em 'preprocessor_model.joblib'")


✅ Pré-processador salvo em 'preprocessor_model.joblib'


In [None]:
import numpy as np
import pandas as pd
import time

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier, Perceptron, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, f1_score
from sklearn.exceptions import NotFittedError

import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier

path_results = r'E:\Projetos\llm_security\results'

models = {
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "CatBoost": CatBoostClassifier(verbose=0, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "XGBoost": xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    "Ridge": RidgeClassifier(),
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Perceptron": Perceptron(),
    "SGD-Lasso": SGDClassifier(penalty='l1', random_state=42, max_iter=1000),
    "SGD-ElasticNet": SGDClassifier(penalty='elasticnet', random_state=42, max_iter=1000),
    "LightGBM": lgb.LGBMClassifier(random_state=42)
}

# Lista para resultados
results = []

for model_name, model in models.items():
    print(f"\n🔍 Treinando: {model_name}")
    start_time = time.time()

    try:
        model.fit(X_train, y_train)
    except Exception as e:
        print(f"❌ Erro ao treinar {model_name}: {e}")
        continue

    training_time = time.time() - start_time

    try:
        y_pred = model.predict(X_test)
        if hasattr(model, "predict_proba"):
            if len(np.unique(y_test)) == 2:
                y_score = model.predict_proba(X_test)[:, 1]
            else:
                y_score = None
        else:
            y_score = None
    except NotFittedError:
        print(f"❌ Modelo {model_name} não foi treinado corretamente.")
        continue

    acc = accuracy_score(y_test, y_pred)
    try:
        roc_auc = roc_auc_score(y_test, y_score) if y_score is not None else np.nan
    except Exception:
        roc_auc = np.nan

    # F1-score (binário ou macro para multi-classe)
    if len(np.unique(y_test)) == 2:
        f1 = f1_score(y_test, y_pred)
    else:
        f1 = f1_score(y_test, y_pred, average="macro")

    print(f"⏱️ Tempo de treinamento: {training_time:.2f} segundos")
    print(f"✅ Acurácia: {acc:.4f}")
    print(f"🎯 ROC AUC: {roc_auc:.4f}")
    print(f"⭐ F1-score: {f1:.4f}")
    print(classification_report(y_test, y_pred))

    num_batches = X_test.shape[0] // 30
    inference_times = []
    for i in range(num_batches):
        batch_X = X_test[i*30:(i+1)*30]
        start_inference = time.time()
        model.predict(batch_X)
        end_inference = time.time()
        inference_times.append(end_inference - start_inference)

    avg_inference_time = np.mean(inference_times) if inference_times else np.nan
    print(f"⚙️ Tempo médio de inferência por 30 amostras: {avg_inference_time:.6f} s")

    results.append({
        "Modelo": model_name,
        "Acurácia": acc,
        "ROC AUC": roc_auc,
        "F1-score": f1,
        "Tempo Treinamento (s)": training_time,
        "Tempo Inferência Médio (s)": avg_inference_time
    })

df_results = pd.DataFrame(results)
df_results.to_excel(f'{path_results}/benchmark_resultados.xlsx', index=False)
print("\n📈 Benchmark salvo em 'benchmark_resultados.xlsx'")



🔍 Treinando: Random Forest
⏱️ Tempo de treinamento: 5.93 segundos
✅ Acurácia: 0.8225
🎯 ROC AUC: 0.9249
⭐ F1-score: 0.8312
              precision    recall  f1-score   support

           0       0.74      0.89      0.81      9711
           1       0.91      0.77      0.83     12833

    accuracy                           0.82     22544
   macro avg       0.83      0.83      0.82     22544
weighted avg       0.84      0.82      0.82     22544

⚙️ Tempo médio de inferência por 30 amostras: 0.002470 s

🔍 Treinando: KNN
⏱️ Tempo de treinamento: 0.01 segundos
✅ Acurácia: 0.8244
🎯 ROC AUC: 0.8902
⭐ F1-score: 0.8315
              precision    recall  f1-score   support

           0       0.74      0.91      0.82      9711
           1       0.92      0.76      0.83     12833

    accuracy                           0.82     22544
   macro avg       0.83      0.83      0.82     22544
weighted avg       0.84      0.82      0.83     22544

⚙️ Tempo médio de inferência por 30 amostras: 0.12629

Parameters: { "use_label_encoder" } are not used.



⏱️ Tempo de treinamento: 0.29 segundos
✅ Acurácia: 0.8138
🎯 ROC AUC: 0.9613
⭐ F1-score: 0.8108
              precision    recall  f1-score   support

           0       0.71      0.96      0.82      9711
           1       0.96      0.70      0.81     12833

    accuracy                           0.81     22544
   macro avg       0.84      0.83      0.81     22544
weighted avg       0.85      0.81      0.81     22544

⚙️ Tempo médio de inferência por 30 amostras: 0.000491 s

🔍 Treinando: Ridge
⏱️ Tempo de treinamento: 0.14 segundos
✅ Acurácia: 0.7816
🎯 ROC AUC: nan
⭐ F1-score: 0.7850
              precision    recall  f1-score   support

           0       0.69      0.89      0.78      9711
           1       0.89      0.70      0.78     12833

    accuracy                           0.78     22544
   macro avg       0.79      0.79      0.78     22544
weighted avg       0.81      0.78      0.78     22544

⚙️ Tempo médio de inferência por 30 amostras: 0.000062 s

🔍 Treinando: Logistic Re


### **1. O que são os dados?**

**KDD Cup 99**, que representa:

* **Conexões de rede** entre computadores (ex: ao acessar um site, enviar um e-mail, etc.)
* Cada conexão é descrita por **várias características** (ou "atributos").

Exemplos de atributos:

| Nome            | O que representa                        |
| --------------- | --------------------------------------- |
| `protocol_type` | Tipo de protocolo (TCP, UDP, ICMP)      |
| `service`       | Serviço acessado (HTTP, FTP, SSH, etc.) |
| `flag`          | Estado da conexão                       |
| `src_bytes`     | Quantidade de dados enviados            |
| `dst_bytes`     | Quantidade de dados recebidos           |

---

###  **2. O que é o rótulo (label)?**

Cada linha dos dados possui uma **etiqueta (chamada `label`)**, indicando:

* **"normal"** → se a conexão é legítima
* Ou o **tipo de ataque** (ex: DoS, Probe, etc.)

Neste código, o label é convertido para **apenas dois valores**:

* `0` = conexão normal
* `1` = algum tipo de ataque


---


Além disso, você mede o tempo de:

*  **Treinamento** do modelo
*  **Inferência (previsão)** sobre pacotes (em grupos de 30)




# Treinar o melhor modelo o Gradient Boosting

In [None]:
import numpy as np
import pandas as pd
import time
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, f1_score
import joblib

model_name = "Gradient Boosting"
model = GradientBoostingClassifier(random_state=42)

print(f"\n🔍 Treinando: {model_name}")
start_time = time.time()
model.fit(X_train, y_train)
training_time = time.time() - start_time

# Previsão
y_pred = model.predict(X_test)
if hasattr(model, "predict_proba"):
    y_score = model.predict_proba(X_test)[:, 1] if len(np.unique(y_test)) == 2 else None
else:
    y_score = None

# Métricas
acc = accuracy_score(y_test, y_pred)
try:
    roc_auc = roc_auc_score(y_test, y_score) if y_score is not None else np.nan
except Exception:
    roc_auc = np.nan

f1 = f1_score(y_test, y_pred) if len(np.unique(y_test)) == 2 else f1_score(y_test, y_pred, average="macro")

print(f"⏱️ Tempo de treinamento: {training_time:.2f} segundos")
print(f"✅ Acurácia: {acc:.4f}")
print(f"🎯 ROC AUC: {roc_auc:.4f}")
print(f"⭐ F1-score: {f1:.4f}")
print(classification_report(y_test, y_pred))

# Tempo médio de inferência para lotes de 30 amostras
num_batches = X_test.shape[0] // 30
inference_times = []
for i in range(num_batches):
    batch_X = X_test[i*30:(i+1)*30]
    start_inference = time.time()
    model.predict(batch_X)
    end_inference = time.time()
    inference_times.append(end_inference - start_inference)

avg_inference_time = np.mean(inference_times) if inference_times else np.nan
print(f"⚙️ Tempo médio de inferência por 30 amostras: {avg_inference_time:.6f} s")

# Salvar resultados em Excel
df_results = pd.DataFrame([{
    "Modelo": model_name,
    "Acurácia": acc,
    "ROC AUC": roc_auc,
    "F1-score": f1,
    "Tempo Treinamento (s)": training_time,
    "Tempo Inferência Médio (s)": avg_inference_time
}])
df_results.to_excel(f'{path_results}/resultados_gradient_boosting.xlsx', index=False)
print("\n📈 Resultado salvo em 'resultados_gradient_boosting.xlsx'")

# Salvar modelo treinado
joblib.dump(model, f'{path_model}/gradient_boosting_model.joblib')
print("✅ Modelo Gradient Boosting salvo como 'gradient_boosting_model.joblib'")



🔍 Treinando: Gradient Boosting
⏱️ Tempo de treinamento: 6.09 segundos
✅ Acurácia: 0.8622
🎯 ROC AUC: 0.9326
⭐ F1-score: 0.8752
              precision    recall  f1-score   support

           0       0.81      0.88      0.85      9711
           1       0.90      0.85      0.88     12833

    accuracy                           0.86     22544
   macro avg       0.86      0.86      0.86     22544
weighted avg       0.87      0.86      0.86     22544

⚙️ Tempo médio de inferência por 30 amostras: 0.000155 s

📈 Resultado salvo em 'resultados_gradient_boosting.xlsx'
✅ Modelo Gradient Boosting salvo como 'gradient_boosting_model.joblib'
