In [2]:
import pandas as pd
import numpy
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib


In [None]:
# Read both NSL-KDD files in one pass. header=None makes pandas treat the
# first line as data and label the columns with integers.
train_data, test_data = (
    pd.read_csv(csv_path, header=None)
    for csv_path in (
        "../../data/raw/NSL-KDD/KDDTrain+.txt",
        "../../data/raw/NSL-KDD/KDDTest+.txt",
    )
)

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib

# === 1. Carregar dados ===
def load_nslkdd(path, binary_labels=True):
    """Load a header-less NSL-KDD CSV file and separate features from labels.

    Args:
        path: Location of the CSV file; passed straight to ``pd.read_csv``.
        binary_labels: When True (default), every label other than
            ``'normal'`` is rewritten to ``'attack'``.

    Returns:
        Tuple ``(X, y)`` where ``X`` is the DataFrame without the ``label``
        column and ``y`` is the ``label`` Series.

    Raises:
        KeyError: If the file has too few columns for one of them to be
            named ``label``.
    """
    df = pd.read_csv(path, header=None)

    names = get_nslkdd_columns(len(df.columns))
    # The helper knows a fixed set of names, so a file with extra trailing
    # columns used to fail with a length mismatch on the assignment below.
    # NOTE(review): the NSL-KDD .txt distribution files appear to carry a
    # trailing per-record difficulty score -- confirm against the data.
    # Only the columns that actually receive a name are kept.
    df = df.iloc[:, :len(names)]
    df.columns = names

    y = df['label']
    if binary_labels:
        # Vectorised equivalent of mapping each value:
        # keep 'normal', replace everything else with 'attack'.
        y = y.where(y == 'normal', 'attack')

    X = df.drop(columns=['label'])
    return X, y

def get_nslkdd_columns(n):
    """Return the first ``n`` known NSL-KDD column names as a list.

    The known names are 41 feature names followed by ``label``. The result
    is a plain slice, so asking for more than 42 names still yields 42.
    """
    ordered_names = (
        "duration",
        "protocol_type",
        "service",
        "flag",
        "src_bytes",
        "dst_bytes",
        "land",
        "wrong_fragment",
        "urgent",
        "hot",
        "num_failed_logins",
        "logged_in",
        "num_compromised",
        "root_shell",
        "su_attempted",
        "num_root",
        "num_file_creations",
        "num_shells",
        "num_access_files",
        "num_outbound_cmds",
        "is_host_login",
        "is_guest_login",
        "count",
        "srv_count",
        "serror_rate",
        "srv_serror_rate",
        "rerror_rate",
        "srv_rerror_rate",
        "same_srv_rate",
        "diff_srv_rate",
        "srv_diff_host_rate",
        "dst_host_count",
        "dst_host_srv_count",
        "dst_host_same_srv_rate",
        "dst_host_diff_srv_rate",
        "dst_host_same_src_port_rate",
        "dst_host_srv_diff_host_rate",
        "dst_host_serror_rate",
        "dst_host_srv_serror_rate",
        "dst_host_rerror_rate",
        "dst_host_srv_rerror_rate",
        "label",
    )
    # Slice first, then hand back a fresh list as the original did.
    return list(ordered_names[:n])

# === 2. Pré-processamento unificado ===
def preprocess(X, y, scaler=None, feature_names=None):
    """One-hot encode, scale and binarise an NSL-KDD feature/label pair.

    Called as ``preprocess(X, y)`` it fits a new ``StandardScaler`` on the
    encoded features. Passing the ``scaler`` and ``feature_names`` returned
    by an earlier call applies that same fitted transformation to another
    dataset (for example a held-out test file) instead of refitting.

    Args:
        X: DataFrame containing the ``protocol_type``, ``service`` and
            ``flag`` columns alongside the remaining features.
        y: Label Series; the value ``'normal'`` marks the negative class.
        scaler: Optional already-fitted scaler. When given it is only used
            to ``transform``; it is not refitted.
        feature_names: Optional column layout to align the encoded frame
            to. Columns missing from ``X`` are added as zeros and columns
            not in the layout are dropped.

    Returns:
        Tuple ``(X_scaled, y_bin, feature_names, scaler)``: the scaled
        feature matrix, labels as integers (1 = attack, 0 = normal), the
        encoded column names, and the scaler that was used.
    """
    # One-hot encode the categorical attributes.
    cat_cols = ['protocol_type', 'service', 'flag']
    X_encoded = pd.get_dummies(X, columns=cat_cols)

    if feature_names is not None:
        # Force the reference layout so that categories seen in only one of
        # the datasets cannot shift or resize the feature matrix.
        X_encoded = X_encoded.reindex(columns=list(feature_names), fill_value=0)
    feature_names = X_encoded.columns.tolist()

    # Scale the data (tree models do not need it, but it is shared here).
    if scaler is None:
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X_encoded)
    else:
        X_scaled = scaler.transform(X_encoded)

    # Binary target: 1 = attack, 0 = normal.
    y_bin = (y != 'normal').astype(int)

    return X_scaled, y_bin, feature_names, scaler

# === 3. Dividir treino/teste ===
def split(X, y, test_size=0.3, random_state=42):
    """Split features and labels into stratified train and test parts.

    Args:
        X: Feature matrix.
        y: Labels; also used as the stratification key.
        test_size: Share of the data put in the test part (default 0.3).
        random_state: Seed forwarded to ``train_test_split``. It defaults to
            42, the value that was previously hard-coded.

    Returns:
        The sequence returned by ``train_test_split``, which callers unpack
        as ``X_train, X_test, y_train, y_test``.
    """
    return train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
        stratify=y,
    )

# === 4. Treinar modelos ===
def train_models(X_train, X_test, y_train, y_test):
    """Fit the three classifiers and evaluate each one on the test part.

    Args:
        X_train: Training feature matrix.
        X_test: Test feature matrix.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        Dict keyed by ``'RF'``, ``'MLP'`` and ``'LGBM'`` (in that order).
        Each value is a dict with the fitted ``'model'``, its ``'accuracy'``
        on the test part, and the ``'report'`` produced by
        ``classification_report(..., output_dict=True)``.
    """
    estimators = {
        # Random Forest
        'RF': RandomForestClassifier(n_estimators=100, random_state=42),
        # Multi-layer perceptron
        'MLP': MLPClassifier(hidden_layer_sizes=(50,), max_iter=500, random_state=42),
        # LightGBM
        'LGBM': LGBMClassifier(n_estimators=100, random_state=42),
    }

    results = {}
    for name, model in estimators.items():
        model.fit(X_train, y_train)
        # Predict once and reuse the result for both metrics; the earlier
        # version ran inference on X_test twice per model.
        y_pred = model.predict(X_test)
        results[name] = {
            'model': model,
            'accuracy': accuracy_score(y_test, y_pred),
            'report': classification_report(y_test, y_pred, output_dict=True),
        }

    return results

# === 5. Pipeline principal ===
def main(path="nsl_kdd.csv", model_dir="models"):
    """Run the full pipeline: load, preprocess, split, train, report, save.

    Args:
        path: CSV file handed to ``load_nslkdd``. Defaults to the previously
            hard-coded ``"nsl_kdd.csv"``.
        model_dir: Directory that receives one ``<name>.pkl`` per trained
            model plus ``test_data.pkl``. It is created when missing.
    """
    import os  # local import keeps this block self-contained

    # Create the output directory up front so a missing or unwritable
    # location fails before any training time is spent.
    os.makedirs(model_dir, exist_ok=True)

    X, y = load_nslkdd(path)
    # NOTE(review): preprocessing runs on the whole dataset before the
    # split, so anything fitted inside preprocess also sees the rows that
    # end up in the test part. Confirm this is acceptable for the reported
    # metrics.
    X_proc, y_proc, features, scaler = preprocess(X, y)
    X_train, X_test, y_train, y_test = split(X_proc, y_proc)

    results = train_models(X_train, X_test, y_train, y_test)

    for model_name, data in results.items():
        print(f"\n--- {model_name} ---")
        print(f"Acurácia: {data['accuracy']:.4f}")
        print(pd.DataFrame(data['report']).transpose())

        # Persist the fitted model.
        joblib.dump(data['model'], os.path.join(model_dir, f"{model_name.lower()}.pkl"))

    # Persist the test data and scaler for later explainability (XAI) work.
    joblib.dump(
        (X_test, y_test, features, scaler),
        os.path.join(model_dir, "test_data.pkl"),
    )

# Run the pipeline only when this code executes as the top-level module,
# not when it is imported from elsewhere.
if __name__ == "__main__":
    main()
