# CICIDS2017

### Logistic Regression (no SMOTE)

In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, confusion_matrix
)

# Baseline experiment: median imputation followed by logistic regression on the
# original (non-resampled) class distribution, scored on a stratified hold-out set.

# === 0. Configuration and reproducibility ===
SEED = 42                                       # shared by the split and the classifier
TEST_SIZE = 0.20                                # fraction of rows held out for evaluation
DATA_PATH = "CICIDS2017_improved_unified.csv"

# Seeds NumPy's legacy global RNG; SEED is also passed explicitly to each component below.
np.random.seed(SEED)

# === 1. Load the unified dataset ===
# NOTE(review): the notebook describes this file as already normalized — not verified here.
df = pd.read_csv(DATA_PATH)

# === 2. Split features and label ===
# Every column except the last is a feature; the last column is cast to int and used as label.
# +/-inf is turned into NaN so the imputer inside the pipeline can fill it.
X = df.iloc[:, :-1].replace([np.inf, -np.inf], np.nan)
y = df.iloc[:, -1].astype(int)

# === 3. Train/Test split (stratified) ===
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=SEED, stratify=y
)

# === 4. Pipeline: Imputer (median) + Logistic Regression ===
# The imputer is fitted on the training split only because it lives inside the pipeline.
pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("clf", LogisticRegression(random_state=SEED, max_iter=1000))
])

pipe.fit(X_train, y_train)

# === 5. Predict ===
y_pred = pipe.predict(X_test)
y_proba = pipe.predict_proba(X_test)[:, 1]   # probability of class 1, used for AUC

# === 6. Confusion matrix and metrics ===
# labels=[0, 1] pins the matrix to 2x2, so the four-way unpack below cannot fail
# when one of the classes is absent from y_test / y_pred.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
tn, fp, fn, tp = cm.ravel()

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, zero_division=0)
recall = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)
auc = roc_auc_score(y_test, y_proba)
# False Alarm Rate = FP / (FP + TN); defined as 0 when there are no negatives.
far = fp / (fp + tn) if (fp + tn) > 0 else 0.0

# === 7. Print results ===
print("=== Classification Metrics ===")
print(f"Accuracy      : {accuracy:.4f}")
print(f"Precision     : {precision:.4f}")
print(f"Recall        : {recall:.4f}")
print(f"F1-Score      : {f1:.4f}")
print(f"FAR           : {far:.4f}")
print(f"AUC           : {auc:.4f}")
print("\n=== Confusion Matrix ===")
print(f"TN: {tn}, FP: {fp}")
print(f"FN: {fn}, TP: {tp}")


=== Classification Metrics ===
Accuracy      : 0.9878
Precision     : 0.9722
Recall        : 0.9783
F1-Score      : 0.9752
FAR           : 0.0091
AUC           : 0.9961

=== Confusion Matrix ===
TN: 313619, FP: 2895
FN: 2245, TP: 101237


### Logistic Regression (with SMOTE)

In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, confusion_matrix
)
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline  # ¡Ojo! el Pipeline de imblearn

# Experiment: median imputation -> SMOTE oversampling -> logistic regression,
# scored on a stratified hold-out set that is never resampled.

# === 0. Configuration and reproducibility ===
SEED = 42                                       # shared by the split, SMOTE and the classifier
TEST_SIZE = 0.20                                # fraction of rows held out for evaluation
DATA_PATH = "CICIDS2017_improved_unified.csv"

# Seeds NumPy's legacy global RNG; SEED is also passed explicitly to each component below.
np.random.seed(SEED)

# === 1. Load the unified dataset ===
# NOTE(review): the notebook describes this file as already normalized — not verified here.
df = pd.read_csv(DATA_PATH)

# === 2. Split features and label ===
# Every column except the last is a feature; the last column is cast to int and used as label.
# +/-inf is turned into NaN so the imputer inside the pipeline can fill it.
X = df.iloc[:, :-1].replace([np.inf, -np.inf], np.nan)
y = df.iloc[:, -1].astype(int)

# === 3. Train/Test split (stratified) ===
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=SEED, stratify=y
)

# === 4. Pipeline: Imputer -> SMOTE -> Logistic Regression ===
# Pipeline here is the imbalanced-learn one (see the cell imports); it is needed because
# SMOTE is a sampler step. SMOTE runs after imputation, since the data may contain NaN.
pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("smote", SMOTE(random_state=SEED)),
    ("clf", LogisticRegression(random_state=SEED, max_iter=1000))
])

# Train
pipe.fit(X_train, y_train)

# === 5. Predict ===
y_pred = pipe.predict(X_test)
y_proba = pipe.predict_proba(X_test)[:, 1]   # probability of class 1, used for AUC

# === 6. Confusion matrix and metrics ===
# labels=[0, 1] pins the matrix to 2x2, so the four-way unpack below cannot fail
# when one of the classes is absent from y_test / y_pred.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
tn, fp, fn, tp = cm.ravel()

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, zero_division=0)
recall = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)
auc = roc_auc_score(y_test, y_proba)
# False Alarm Rate = FP / (FP + TN); defined as 0 when there are no negatives.
far = fp / (fp + tn) if (fp + tn) > 0 else 0.0

# === 7. Print results ===
print("=== Classification Metrics (LogReg + SMOTE) ===")
print(f"Accuracy      : {accuracy:.4f}")
print(f"Precision     : {precision:.4f}")
print(f"Recall        : {recall:.4f}")
print(f"F1-Score      : {f1:.4f}")
print(f"FAR           : {far:.4f}")
print(f"AUC           : {auc:.4f}")
print("\n=== Confusion Matrix ===")
print(f"TN: {tn}, FP: {fp}")
print(f"FN: {fn}, TP: {tp}")


=== Classification Metrics (LogReg + SMOTE) ===
Accuracy      : 0.9862
Precision     : 0.9623
Recall        : 0.9824
F1-Score      : 0.9723
FAR           : 0.0126
AUC           : 0.9962

=== Confusion Matrix ===
TN: 312536, FP: 3978
FN: 1817, TP: 101665


### Naive Bayes (no SMOTE)

In [8]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, confusion_matrix
)

# Experiment: median imputation followed by Gaussian Naive Bayes on the original
# (non-resampled) class distribution, scored on a stratified hold-out set.

# === 0. Configuration and reproducibility ===
SEED = 42                                       # used by the train/test split
TEST_SIZE = 0.20                                # fraction of rows held out for evaluation
DATA_PATH = "CICIDS2017_improved_unified.csv"

# Seeds NumPy's legacy global RNG.
np.random.seed(SEED)

# === 1. Load the unified dataset ===
# Expected layout: all-numeric feature columns with the 0/1 'Label' column last.
# NOTE(review): the notebook describes this file as already normalized — not verified here.
df = pd.read_csv(DATA_PATH)

# === 2. Split features and label ===
# +/-inf is turned into NaN so the imputer inside the pipeline can fill it.
X = df.iloc[:, :-1].replace([np.inf, -np.inf], np.nan)
y = df.iloc[:, -1].astype(int)

# === 3. Stratified, reproducible split ===
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=SEED, stratify=y
)

# === 4. Pipeline: Imputer (median) -> GaussianNB ===
# The imputer is fitted on the training split only because it lives inside the pipeline.
pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("nb", GaussianNB())
])

# Training
pipe.fit(X_train, y_train)

# === 5. Prediction and metrics ===
y_pred = pipe.predict(X_test)
y_proba = pipe.predict_proba(X_test)[:, 1]  # probability of class 1, used for AUC

# labels=[0, 1] pins the matrix to 2x2, so the four-way unpack below cannot fail
# when one of the classes is absent from y_test / y_pred.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
tn, fp, fn, tp = cm.ravel()

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, zero_division=0)
recall = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)
auc = roc_auc_score(y_test, y_proba)
# False Alarm Rate = FP / (FP + TN); defined as 0 when there are no negatives.
far = fp / (fp + tn) if (fp + tn) > 0 else 0.0

# === 6. Results ===
print("=== Classification Metrics (GaussianNB, no SMOTE) ===")
print(f"Accuracy      : {accuracy:.4f}")
print(f"Precision     : {precision:.4f}")
print(f"Recall        : {recall:.4f}")
print(f"F1-Score      : {f1:.4f}")
print(f"FAR           : {far:.4f}")
print(f"AUC           : {auc:.4f}")
print("\n=== Confusion Matrix ===")
print(f"TN: {tn}, FP: {fp}")
print(f"FN: {fn}, TP: {tp}")



=== Classification Metrics (GaussianNB, no SMOTE) ===
Accuracy      : 0.8565
Precision     : 0.6364
Recall        : 0.9742
F1-Score      : 0.7699
FAR           : 0.1820
AUC           : 0.9711

=== Confusion Matrix ===
TN: 258924, FP: 57590
FN: 2668, TP: 100814


### Naive Bayes (with SMOTE)

In [7]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, confusion_matrix
)
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline  # importante: Pipeline de imblearn

# Experiment: median imputation -> SMOTE oversampling -> Gaussian Naive Bayes,
# scored on a stratified hold-out set that is never resampled.

# === 0. Configuration and reproducibility ===
SEED = 42                                       # shared by the split and SMOTE
TEST_SIZE = 0.20                                # fraction of rows held out for evaluation
DATA_PATH = "CICIDS2017_improved_unified.csv"

# Seeds NumPy's legacy global RNG; SEED is also passed explicitly to the split and SMOTE.
np.random.seed(SEED)

# === 1. Load the unified dataset ===
# Expected layout: all-numeric feature columns with the 0/1 'Label' column last.
# NOTE(review): the notebook describes this file as already normalized — not verified here.
df = pd.read_csv(DATA_PATH)

# === 2. Split features and label ===
# +/-inf is turned into NaN so the imputer inside the pipeline can fill it.
X = df.iloc[:, :-1].replace([np.inf, -np.inf], np.nan)
y = df.iloc[:, -1].astype(int)

# === 3. Stratified, reproducible split ===
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=SEED, stratify=y
)

# === 4. Pipeline: Imputer (median) -> SMOTE -> GaussianNB ===
# Pipeline here is the imbalanced-learn one (see the cell imports); it is needed because
# SMOTE is a sampler step. SMOTE runs after imputation, since the data may contain NaN.
pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("smote", SMOTE(random_state=SEED)),
    ("nb", GaussianNB())
])

# Training
pipe.fit(X_train, y_train)

# === 5. Prediction and metrics ===
y_pred = pipe.predict(X_test)
y_proba = pipe.predict_proba(X_test)[:, 1]   # probability of class 1, used for AUC

# labels=[0, 1] pins the matrix to 2x2, so the four-way unpack below cannot fail
# when one of the classes is absent from y_test / y_pred.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
tn, fp, fn, tp = cm.ravel()

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, zero_division=0)
recall = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)
auc = roc_auc_score(y_test, y_proba)
# False Alarm Rate = FP / (FP + TN); defined as 0 when there are no negatives.
far = fp / (fp + tn) if (fp + tn) > 0 else 0.0

# === 6. Results ===
print("=== Classification Metrics (GaussianNB + SMOTE) ===")
print(f"Accuracy      : {accuracy:.4f}")
print(f"Precision     : {precision:.4f}")
print(f"Recall        : {recall:.4f}")
print(f"F1-Score      : {f1:.4f}")
print(f"FAR           : {far:.4f}")
print(f"AUC           : {auc:.4f}")
print("\n=== Confusion Matrix ===")
print(f"TN: {tn}, FP: {fp}")
print(f"FN: {fn}, TP: {tp}")


=== Classification Metrics (GaussianNB + SMOTE) ===
Accuracy      : 0.8564
Precision     : 0.6362
Recall        : 0.9742
F1-Score      : 0.7697
FAR           : 0.1821
AUC           : 0.9686

=== Confusion Matrix ===
TN: 258862, FP: 57652
FN: 2671, TP: 100811


### LDA (no SMOTE)

In [10]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, confusion_matrix
)

# Experiment: median imputation followed by shrinkage LDA on the original
# (non-resampled) class distribution, scored on a stratified hold-out set.

# === 0. Configuration and reproducibility ===
SEED = 42                                       # used by the train/test split
TEST_SIZE = 0.20                                # fraction of rows held out for evaluation
DATA_PATH = "CICIDS2017_improved_unified.csv"

# Seeds NumPy's legacy global RNG.
np.random.seed(SEED)

# === 1. Load the unified dataset ===
# NOTE(review): the notebook describes this file as already normalized — not verified here.
df = pd.read_csv(DATA_PATH)

# === 2. Split features and label ===
# Every column except the last is a feature; the last column is cast to int and used as label.
# +/-inf is turned into NaN so the imputer inside the pipeline can fill it.
X = df.iloc[:, :-1].replace([np.inf, -np.inf], np.nan)
y = df.iloc[:, -1].astype(int)

# === 3. Stratified, reproducible split ===
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=SEED, stratify=y
)

# === 4. Pipeline: Imputer (median) -> LDA ===
# Note: shrinkage='auto' requires solver='lsqr' or 'eigen'.
pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("lda", LDA(solver="lsqr", shrinkage="auto"))
])

# Training
pipe.fit(X_train, y_train)

# === 5. Prediction and metrics ===
y_pred = pipe.predict(X_test)
# LDA supports predict_proba; column 1 is the probability of class 1, used for AUC.
y_proba = pipe.predict_proba(X_test)[:, 1]

# labels=[0, 1] pins the matrix to 2x2, so the four-way unpack below cannot fail
# when one of the classes is absent from y_test / y_pred.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
tn, fp, fn, tp = cm.ravel()

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, zero_division=0)
recall = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)
auc = roc_auc_score(y_test, y_proba)
# False Alarm Rate = FP / (FP + TN); defined as 0 when there are no negatives.
far = fp / (fp + tn) if (fp + tn) > 0 else 0.0

# === 6. Results ===
print("=== Classification Metrics (LDA) ===")
print(f"Accuracy      : {accuracy:.4f}")
print(f"Precision     : {precision:.4f}")
print(f"Recall        : {recall:.4f}")
print(f"F1-Score      : {f1:.4f}")
print(f"FAR           : {far:.4f}")
print(f"AUC           : {auc:.4f}")
print("\n=== Confusion Matrix ===")
print(f"TN: {tn}, FP: {fp}")
print(f"FN: {fn}, TP: {tp}")


=== Classification Metrics (LDA) ===
Accuracy      : 0.9784
Precision     : 0.9426
Recall        : 0.9715
F1-Score      : 0.9569
FAR           : 0.0193
AUC           : 0.9923

=== Confusion Matrix ===
TN: 310397, FP: 6117
FN: 2950, TP: 100532


### LDA (with SMOTE)

In [12]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, confusion_matrix
)
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline  # importante: pipeline de imblearn

# Experiment: median imputation -> SMOTE oversampling -> shrinkage LDA,
# scored on a stratified hold-out set that is never resampled.

# === 0. Configuration and reproducibility ===
SEED = 42                                       # shared by the split and SMOTE
TEST_SIZE = 0.20                                # fraction of rows held out for evaluation
DATA_PATH = "CICIDS2017_improved_unified.csv"

# Seeds NumPy's legacy global RNG; SEED is also passed explicitly to the split and SMOTE.
np.random.seed(SEED)

# === 1. Load the unified dataset ===
# Expected layout: all-numeric feature columns with the 0/1 'Label' column last.
# NOTE(review): the notebook describes this file as already normalized — not verified here.
df = pd.read_csv(DATA_PATH)

# === 2. Split features and label ===
# +/-inf is turned into NaN so the imputer inside the pipeline can fill it.
X = df.iloc[:, :-1].replace([np.inf, -np.inf], np.nan)
y = df.iloc[:, -1].astype(int)

# === 3. Stratified, reproducible split ===
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=SEED, stratify=y
)

# === 4. Pipeline: Imputer (median) -> SMOTE -> LDA ===
# Pipeline here is the imbalanced-learn one (see the cell imports); it is needed because
# SMOTE is a sampler step. shrinkage='auto' with solver='lsqr' is chosen to improve
# stability in high dimensions.
pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("smote", SMOTE(random_state=SEED)),
    ("lda", LDA(solver="lsqr", shrinkage="auto"))
])

# Training
pipe.fit(X_train, y_train)

# === 5. Prediction and metrics ===
y_pred = pipe.predict(X_test)
y_proba = pipe.predict_proba(X_test)[:, 1]   # probability of class 1, used for AUC

# labels=[0, 1] pins the matrix to 2x2, so the four-way unpack below cannot fail
# when one of the classes is absent from y_test / y_pred.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
tn, fp, fn, tp = cm.ravel()

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, zero_division=0)
recall = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)
auc = roc_auc_score(y_test, y_proba)
# False Alarm Rate = FP / (FP + TN); defined as 0 when there are no negatives.
far = fp / (fp + tn) if (fp + tn) > 0 else 0.0

# === 6. Results ===
print("=== Classification Metrics (LDA + SMOTE) ===")
print(f"Accuracy      : {accuracy:.4f}")
print(f"Precision     : {precision:.4f}")
print(f"Recall        : {recall:.4f}")
print(f"F1-Score      : {f1:.4f}")
print(f"FAR           : {far:.4f}")
print(f"AUC           : {auc:.4f}")
print("\n=== Confusion Matrix ===")
print(f"TN: {tn}, FP: {fp}")
print(f"FN: {fn}, TP: {tp}")


=== Classification Metrics (LDA + SMOTE) ===
Accuracy      : 0.9735
Precision     : 0.9156
Recall        : 0.9830
F1-Score      : 0.9481
FAR           : 0.0296
AUC           : 0.9924

=== Confusion Matrix ===
TN: 307136, FP: 9378
FN: 1755, TP: 101727


### Autoencoder + Logistic Regression (no SMOTE)

In [14]:
# === Autoencoder + Logistic Regression (sin SMOTE), reproducible ===
import os, random
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, confusion_matrix
)

# Experiment: train a dense autoencoder on the imputed training features, use its
# bottleneck activations as embeddings, and fit logistic regression on those embeddings
# (no resampling). Scored on a stratified hold-out set.

# ---- Configuration and reproducibility (must run before TensorFlow is imported) ----
SEED = 42                                       # shared by every RNG and estimator below
TEST_SIZE = 0.20                                # fraction of rows held out for evaluation
DATA_PATH = "CICIDS2017_improved_unified.csv"

# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so setting it
# here presumably does not change hashing in the running kernel — confirm if it matters.
os.environ["PYTHONHASHSEED"] = str(SEED)
os.environ["TF_DETERMINISTIC_OPS"] = "1"
# os.environ["CUDA_VISIBLE_DEVICES"] = "-1"  # optional: force CPU

random.seed(SEED)
np.random.seed(SEED)

import tensorflow as tf
tf.random.set_seed(SEED)
# enable_op_determinism() takes no arguments. The previous call passed `True`, which
# raised TypeError that a blanket `except Exception: pass` swallowed, so determinism
# was never switched on. Only the "API missing" case is tolerated now.
try:
    tf.config.experimental.enable_op_determinism()
except AttributeError:
    # This TensorFlow build does not expose the API; TF_DETERMINISTIC_OPS set above
    # is the only determinism switch left.
    pass

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam

# === 1) Load the unified dataset ===
# Expected layout: all-numeric feature columns with the 0/1 'Label' column last.
# NOTE(review): the notebook describes this file as already normalized — not verified here.
df = pd.read_csv(DATA_PATH)

# === 2) Split features and label ===
# +/-inf is turned into NaN so the imputer below can fill it.
X = df.iloc[:, :-1].replace([np.inf, -np.inf], np.nan)
y = df.iloc[:, -1].astype(int)

# === 3) Stratified, reproducible split ===
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=SEED, stratify=y
)

# === 4) Imputation (no SMOTE) ===
# Medians are learned from the training split only and then applied to the test split.
imputer = SimpleImputer(strategy="median")
X_train_imp = imputer.fit_transform(X_train)
X_test_imp  = imputer.transform(X_test)

# === 5) Autoencoder ===
input_dim = X_train_imp.shape[1]
encoding_dim = 32  # bottleneck width; tunable

# Architecture: input -> 64 -> encoding_dim -> 64 -> input.
# NOTE(review): the sigmoid output can only reconstruct values in (0, 1); this assumes
# the features are scaled to that range — TODO confirm against the preprocessing step.
inp = Input(shape=(input_dim,))
h1 = Dense(64, activation='relu')(inp)
z  = Dense(encoding_dim, activation='relu')(h1)
h2 = Dense(64, activation='relu')(z)
out = Dense(input_dim, activation='sigmoid')(h2)

autoencoder = Model(inputs=inp, outputs=out)
encoder = Model(inputs=inp, outputs=z)   # shares weights with the autoencoder up to z

autoencoder.compile(optimizer=Adam(learning_rate=1e-3), loss='mse')

# shuffle=False keeps the batch order fixed for reproducibility
# (validation_split takes the last 10% of the rows).
autoencoder.fit(
    X_train_imp, X_train_imp,
    epochs=20,
    batch_size=256,
    shuffle=False,
    validation_split=0.1,
    verbose=0
)

# === 6) Embeddings ===
X_train_emb = encoder.predict(X_train_imp, verbose=0)
X_test_emb  = encoder.predict(X_test_imp,  verbose=0)

# === 7) Classifier: Logistic Regression (no SMOTE) ===
clf = LogisticRegression(random_state=SEED, max_iter=1000)
clf.fit(X_train_emb, y_train)

# === 8) Predictions and metrics ===
y_pred = clf.predict(X_test_emb)
y_proba = clf.predict_proba(X_test_emb)[:, 1]   # probability of class 1, used for AUC

# labels=[0, 1] pins the matrix to 2x2, so the four-way unpack below cannot fail
# when one of the classes is absent from y_test / y_pred.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
tn, fp, fn, tp = cm.ravel()

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, zero_division=0)
recall = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)
auc = roc_auc_score(y_test, y_proba)
# False Alarm Rate = FP / (FP + TN); defined as 0 when there are no negatives.
far = fp / (fp + tn) if (fp + tn) > 0 else 0.0

print("=== Classification Metrics (AE Embeddings + Logistic Regression, no SMOTE) ===")
print(f"Accuracy      : {accuracy:.4f}")
print(f"Precision     : {precision:.4f}")
print(f"Recall        : {recall:.4f}")
print(f"F1-Score      : {f1:.4f}")
print(f"FAR           : {far:.4f}")
print(f"AUC           : {auc:.4f}")
print("\n=== Confusion Matrix ===")
print(f"TN: {tn}, FP: {fp}")
print(f"FN: {fn}, TP: {tp}")


=== Classification Metrics (AE Embeddings + Logistic Regression, no SMOTE) ===
Accuracy      : 0.9859
Precision     : 0.9691
Recall        : 0.9738
F1-Score      : 0.9715
FAR           : 0.0101
AUC           : 0.9960

=== Confusion Matrix ===
TN: 313306, FP: 3208
FN: 2709, TP: 100773


### Autoencoder + Logistic Regression (with SMOTE)

In [15]:
# === Autoencoder + SMOTE + Logistic Regression (reproducible) ===
import os, random
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, confusion_matrix
)
from imblearn.over_sampling import SMOTE

# Experiment: train a dense autoencoder on the imputed training features, oversample
# the training embeddings with SMOTE, and fit logistic regression on the result.
# Scored on a stratified hold-out set that is never resampled.

# ---- Configuration and reproducibility (must run before TensorFlow is imported) ----
SEED = 42                                       # shared by every RNG and estimator below
TEST_SIZE = 0.20                                # fraction of rows held out for evaluation
DATA_PATH = "CICIDS2017_improved_unified.csv"

# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so setting it
# here presumably does not change hashing in the running kernel — confirm if it matters.
os.environ["PYTHONHASHSEED"] = str(SEED)
os.environ["TF_DETERMINISTIC_OPS"] = "1"
# os.environ["CUDA_VISIBLE_DEVICES"] = "-1"  # optional: force CPU

random.seed(SEED)
np.random.seed(SEED)

import tensorflow as tf
tf.random.set_seed(SEED)
# enable_op_determinism() takes no arguments. The previous call passed `True`, which
# raised TypeError that a blanket `except Exception: pass` swallowed, so determinism
# was never switched on. Only the "API missing" case is tolerated now.
try:
    tf.config.experimental.enable_op_determinism()
except AttributeError:
    # This TensorFlow build does not expose the API; TF_DETERMINISTIC_OPS set above
    # is the only determinism switch left.
    pass

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam

# === 1) Load the unified dataset ===
# Expected layout: all-numeric feature columns with the 0/1 'Label' column last.
# NOTE(review): the notebook describes this file as already normalized — not verified here.
df = pd.read_csv(DATA_PATH)

# === 2) Split features and label ===
# +/-inf is turned into NaN so the imputer below can fill it.
X = df.iloc[:, :-1].replace([np.inf, -np.inf], np.nan)
y = df.iloc[:, -1].astype(int)

# === 3) Stratified, reproducible split ===
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=SEED, stratify=y
)

# === 4) Imputation (SMOTE is applied later, on the embeddings) ===
# Medians are learned from the training split only and then applied to the test split.
imputer = SimpleImputer(strategy="median")
X_train_imp = imputer.fit_transform(X_train)
X_test_imp  = imputer.transform(X_test)

# === 5) Autoencoder ===
input_dim = X_train_imp.shape[1]
encoding_dim = 32  # bottleneck width; tunable

# Architecture: input -> 64 -> encoding_dim -> 64 -> input.
# NOTE(review): the sigmoid output can only reconstruct values in (0, 1); this assumes
# the features are scaled to that range — TODO confirm against the preprocessing step.
inp = Input(shape=(input_dim,))
h1 = Dense(64, activation='relu')(inp)
z  = Dense(encoding_dim, activation='relu')(h1)
h2 = Dense(64, activation='relu')(z)
out = Dense(input_dim, activation='sigmoid')(h2)

autoencoder = Model(inputs=inp, outputs=out)
encoder = Model(inputs=inp, outputs=z)   # shares weights with the autoencoder up to z

autoencoder.compile(optimizer=Adam(learning_rate=1e-3), loss='mse')

# shuffle=False keeps the batch order fixed for reproducibility
# (validation_split takes the last 10% of the rows).
autoencoder.fit(
    X_train_imp, X_train_imp,
    epochs=20,
    batch_size=256,
    shuffle=False,
    validation_split=0.1,
    verbose=0
)

# === 6) Embeddings ===
X_train_emb = encoder.predict(X_train_imp, verbose=0)
X_test_emb  = encoder.predict(X_test_imp,  verbose=0)

# === 7) SMOTE on the TRAIN embeddings only (the test embeddings stay untouched) ===
sm = SMOTE(random_state=SEED)
X_train_emb_sm, y_train_sm = sm.fit_resample(X_train_emb, y_train)

# === 8) Classifier: Logistic Regression ===
clf = LogisticRegression(random_state=SEED, max_iter=1000)
clf.fit(X_train_emb_sm, y_train_sm)

# === 9) Predictions and metrics ===
y_pred = clf.predict(X_test_emb)
y_proba = clf.predict_proba(X_test_emb)[:, 1]   # probability of class 1, used for AUC

# labels=[0, 1] pins the matrix to 2x2, so the four-way unpack below cannot fail
# when one of the classes is absent from y_test / y_pred.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
tn, fp, fn, tp = cm.ravel()

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, zero_division=0)
recall = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)
auc = roc_auc_score(y_test, y_proba)
# False Alarm Rate = FP / (FP + TN); defined as 0 when there are no negatives.
far = fp / (fp + tn) if (fp + tn) > 0 else 0.0

print("=== Classification Metrics (AE Embeddings + SMOTE + Logistic Regression) ===")
print(f"Accuracy      : {accuracy:.4f}")
print(f"Precision     : {precision:.4f}")
print(f"Recall        : {recall:.4f}")
print(f"F1-Score      : {f1:.4f}")
print(f"FAR           : {far:.4f}")
print(f"AUC           : {auc:.4f}")
print("\n=== Confusion Matrix ===")
print(f"TN: {tn}, FP: {fp}")
print(f"FN: {fn}, TP: {tp}")


=== Classification Metrics (AE Embeddings + SMOTE + Logistic Regression) ===
Accuracy      : 0.9806
Precision     : 0.9428
Recall        : 0.9805
F1-Score      : 0.9613
FAR           : 0.0194
AUC           : 0.9954

=== Confusion Matrix ===
TN: 310363, FP: 6151
FN: 2016, TP: 101466
