# Laboratorio 8 Data Science
## Detección de Anomalías con Autoencoder, Isolation Forest y LOF

Autores:

- Nelson García Bravatti
- Christian Echeverría

# 1. Carga del Conjunto de Datos:

In [11]:
import pandas as pd
from sklearn.datasets import fetch_covtype
from sklearn.preprocessing import StandardScaler

# Cargar dataset
data = fetch_covtype(as_frame=True)
df = data.frame

# Separar características y target
X = df.drop('Cover_Type', axis=1)
y = df['Cover_Type']

# Identificar columnas numéricas y binarias
numeric_cols = ['Elevation', 'Aspect', 'Slope', 'Horizontal_Distance_To_Hydrology',
                'Vertical_Distance_To_Hydrology', 'Horizontal_Distance_To_Roadways',
                'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm',
                'Horizontal_Distance_To_Fire_Points']

binary_cols = [col for col in X.columns if col not in numeric_cols]

# Escalar solo numéricas
scaler = StandardScaler()
X_scaled = X.copy()
X_scaled[numeric_cols] = scaler.fit_transform(X[numeric_cols])

# 2. Etiquetado de Normal/Anómalo

In [12]:
# Crear etiqueta binaria: 0 = normal (Lodgepole Pine), 1 = anomalía
y_bin = (y != 2).astype(int)  # 0 si es 2, 1 si es otro

# 3. División de Datos

In [13]:
from sklearn.model_selection import train_test_split

# Filtrar solo normales para train/val
X_normal = X_scaled[y_bin == 0]
y_normal = y_bin[y_bin == 0]

# División interna: train + val (solo normales)
X_train, X_val, _, _ = train_test_split(
    X_normal, y_normal, test_size=0.2, random_state=42, stratify=None
)

# Test: normales + anómalos
X_test = X_scaled
y_test = y_bin

# 4. Entrenado de modelos

### Autoencoder

In [14]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense

# Arquitectura simétrica
input_dim = X_train.shape[1]
encoding_dim = 8

input_layer = Input(shape=(input_dim,))
encoded = Dense(32, activation='relu')(input_layer)
encoded = Dense(16, activation='relu')(encoded)
encoded = Dense(encoding_dim, activation='relu')(encoded)

decoded = Dense(16, activation='relu')(encoded)
decoded = Dense(32, activation='relu')(decoded)
decoded = Dense(input_dim, activation='linear')(decoded)

autoencoder = Model(inputs=input_layer, outputs=decoded)
autoencoder.compile(optimizer='adam', loss='mse')

# Entrenar solo con normales
autoencoder.fit(X_train, X_train,
                epochs=20,
                batch_size=256,
                shuffle=True,
                validation_data=(X_val, X_val),
                verbose=0)

<keras.src.callbacks.history.History at 0x7eb091503590>

### Isolation Forest

In [15]:
from sklearn.ensemble import IsolationForest

# Entrenar solo con normales
iso_forest = IsolationForest(contamination=0.05, random_state=42)
iso_forest.fit(X_train)

### LOF


In [10]:
from sklearn.neighbors import LocalOutlierFactor

# Entrenar solo con normales (novelty=True)
lof = LocalOutlierFactor(n_neighbors=20, contamination=0.05, novelty=True)
lof.fit(X_train)

In [17]:
from sklearn.metrics import roc_auc_score, average_precision_score, f1_score, precision_score, recall_score

# ---------- 1. Autoencoder ----------
# Reconstrucción y error
X_test_pred = autoencoder.predict(X_test, verbose=0)
mse = ((X_test - X_test_pred) ** 2).mean(axis=1)
# Umbral rápido: percentil 95 del error en validación
threshold_ae = mse[y_bin == 0].quantile(0.95)
y_pred_ae = (mse > threshold_ae).astype(int)

# ---------- 2. Isolation Forest ----------
y_pred_iso = (iso_forest.predict(X_test) == -1).astype(int)  # -1 = anomalía

# ---------- 3. LOF ----------
y_pred_lof = (lof.predict(X_test) == -1).astype(int)  # -1 = anomalía

# ---------- Métricas ----------
models = {
    'Autoencoder': (y_test, y_pred_ae),
    'IsolationForest': (y_test, y_pred_iso),
    'LOF': (y_test, y_pred_lof)
}

for name, (y_true, y_pred) in models.items():
    print(f"\n{name}")
    print(f"  ROC-AUC : {roc_auc_score(y_true, y_pred):.3f}")
    print(f"  PR-AUC  : {average_precision_score(y_true, y_pred):.3f}")
    print(f"  F1      : {f1_score(y_true, y_pred):.3f}")
    print(f"  Prec    : {precision_score(y_true, y_pred):.3f}")
    print(f"  Recall  : {recall_score(y_true, y_pred):.3f}")




Autoencoder
  ROC-AUC : 0.606
  PR-AUC  : 0.600
  F1      : 0.400
  Prec    : 0.846
  Recall  : 0.262

IsolationForest
  ROC-AUC : 0.547
  PR-AUC  : 0.547
  F1      : 0.243
  Prec    : 0.753
  Recall  : 0.145

LOF
  ROC-AUC : 0.793
  PR-AUC  : 0.780
  F1      : 0.754
  Prec    : 0.937
  Recall  : 0.630
