## Treinamento de modelo v1

Fonte da arquitetura do modelo: https://repositorio.ufsc.br/handle/123456789/254052

### Pré-processamento dos dados

In [72]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt

In [73]:
# NSL-KDD preprocessing adapted from:
# https://www.kaggle.com/code/ajeffreyrufus/network-intrusion-detection-using-ml-99-accuracy
# header=None makes pandas treat the first line of each file as data rather
# than as column names; the names are attached in a later cell.
train_data = pd.read_csv("../../data/raw/NSL-KDD/KDDTrain+.txt", header=None)
test_data = pd.read_csv("../../data/raw/NSL-KDD/KDDTest+.txt", header=None)

In [74]:
# Column names to attach to the header-less NSL-KDD frames, in file order:
# 41 feature columns followed by the "attack" label and the "level" column.
columns = (
    ["duration", "protocol_type", "service", "flag", "src_bytes", "dst_bytes"]
    + ["land", "wrong_fragment", "urgent", "hot"]
    + ["num_failed_logins", "logged_in", "num_compromised", "root_shell"]
    + ["su_attempted", "num_root", "num_file_creations", "num_shells"]
    + ["num_access_files", "num_outbound_cmds", "is_host_login", "is_guest_login"]
    + ["count", "srv_count"]
    + ["serror_rate", "srv_serror_rate", "rerror_rate", "srv_rerror_rate"]
    + ["same_srv_rate", "diff_srv_rate", "srv_diff_host_rate"]
    # dst_host_* columns
    + ["dst_host_count", "dst_host_srv_count"]
    + ["dst_host_same_srv_rate", "dst_host_diff_srv_rate"]
    + ["dst_host_same_src_port_rate", "dst_host_srv_diff_host_rate"]
    + ["dst_host_serror_rate", "dst_host_srv_serror_rate"]
    + ["dst_host_rerror_rate", "dst_host_srv_rerror_rate"]
    # non-feature columns (dropped before training)
    + ["attack", "level"]
)

In [75]:
# Attach the shared column names to both header-less frames.
for frame in (train_data, test_data):
    frame.columns = columns

In [76]:
# Stack both files into a single frame with a fresh 0..n-1 index.
full_data = pd.concat([train_data, test_data], ignore_index=True)

# Integer-encode the categorical columns. Each fitted encoder is kept, keyed
# by column name, so the integer codes can be mapped back to labels later.
label_encoders = {
    name: LabelEncoder().fit(full_data[name])
    for name in ["protocol_type", "service", "flag"]
}
for name, encoder in label_encoders.items():
    full_data[name] = encoder.transform(full_data[name])

In [77]:
# Features are every column except the "attack" target and the "level" column.
X = full_data.drop(columns=["attack", "level"])
y = full_data["attack"]

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [78]:
# Fit the scaler on the training rows only, then apply the same fitted
# transform to both splits.
scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [79]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical

In [80]:
# X_train_scaled / X_test_scaled are already produced by the StandardScaler
# cell that follows the train/test split, so they are not recomputed here.

# Collect every attack label present in either split so the encoder knows the
# full label set; a label seen only in the test split would otherwise make
# transform() raise on unseen values.
all_labels = pd.concat([y_train, y_test]).unique()

# Map attack names to integer class ids.
attack_encoder = LabelEncoder()
attack_encoder.fit(all_labels)

y_train_encoded = attack_encoder.transform(y_train)
y_test_encoded = attack_encoder.transform(y_test)

# Number of distinct attack labels; used as the width of the softmax layer.
num_classes = len(attack_encoder.classes_)

# The targets intentionally stay as integer class ids (NOT one-hot): the model
# is compiled with sparse_categorical_crossentropy, which expects that format.
# The *_cat names are kept as aliases because the training cell reads them.
y_train_cat = y_train_encoded
y_test_cat = y_test_encoded

In [81]:
# Seed Python, NumPy and TensorFlow before any layer is created so that
# weight initialisation starts from the same state on every fresh run.
tf.keras.utils.set_random_seed(42)

# Model architecture: two tanh hidden layers, each as wide as the input,
# followed by a softmax over the attack classes.
n_features = X_train.shape[1]
model = Sequential(
    [
        # An explicit Input object replaces the `input_shape=` argument that
        # was passed to the first Dense layer; Keras warns against passing
        # `input_shape` to a layer inside a Sequential model.
        tf.keras.Input(shape=(n_features,)),
        Dense(n_features, activation="tanh"),
        Dense(n_features, activation="tanh"),
        Dense(num_classes, activation="softmax"),
    ]
)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [82]:
# Compile: the targets are integer class ids, hence the sparse variant of
# categorical cross-entropy.
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

# Train on the scaled training split, holding out 20% of it for validation.
history = model.fit(
    x=X_train_scaled,
    y=y_train_cat,
    batch_size=32,
    epochs=100,
    validation_split=0.2,
    verbose=1,
)

# Score the trained model on the held-out test split.
loss, accuracy = model.evaluate(x=X_test_scaled, y=y_test_cat)
print(f"Test Accuracy: {accuracy}")

Epoch 1/100
[1m2971/2971[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 911us/step - accuracy: 0.8844 - loss: 0.5796 - val_accuracy: 0.9663 - val_loss: 0.1167
Epoch 2/100
[1m2971/2971[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 895us/step - accuracy: 0.9729 - loss: 0.1005 - val_accuracy: 0.9759 - val_loss: 0.0849
Epoch 3/100
[1m2971/2971[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 924us/step - accuracy: 0.9774 - loss: 0.0768 - val_accuracy: 0.9795 - val_loss: 0.0762
Epoch 4/100
[1m2971/2971[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 896us/step - accuracy: 0.9816 - loss: 0.0640 - val_accuracy: 0.9783 - val_loss: 0.0738
Epoch 5/100
[1m2971/2971[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 891us/step - accuracy: 0.9820 - loss: 0.0599 - val_accuracy: 0.9812 - val_loss: 0.0650
Epoch 6/100
[1m2971/2971[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 890us/step - accuracy: 0.9837 - loss: 0.0549 - val_accuracy: 0.9835 - val_loss: 0.060

In [None]:
from sklearn.inspection import permutation_importance


def keras_accuracy(estimator, X, y):
    """Accuracy scorer for a softmax Keras classifier.

    The built-in ``scoring="accuracy"`` scorer compares ``estimator.predict(X)``
    directly with ``y``. A Keras model's ``predict`` returns one probability
    per class, not a class id, so that comparison fails. This scorer takes the
    argmax over the class axis first.

    Parameters
    ----------
    estimator : fitted model whose ``predict(X)`` returns an array of shape
        (n_samples, n_classes).
    X : feature matrix passed straight to ``estimator.predict``.
    y : integer class ids, one per row of ``X``.

    Returns
    -------
    float
        Fraction of rows whose highest-probability class equals ``y``.
    """
    # A large batch size only speeds up inference; predictions are unchanged.
    probabilities = estimator.predict(X, batch_size=1024, verbose=0)
    return accuracy_score(y, np.argmax(probabilities, axis=1))


# Compute feature importance: shuffle one feature column at a time and measure
# how much the test accuracy drops. random_state fixes the shuffles so the
# ranking is reproducible.
result = permutation_importance(
    model,
    X_test_scaled,
    y_test_encoded,
    scoring=keras_accuracy,
    n_repeats=5,
    random_state=42,
)

# One row per feature, most important first.
feature_importance = pd.DataFrame(
    {"Feature": X.columns, "Importance": result.importances_mean}
).sort_values(by="Importance", ascending=False)

feature_importance