# Autoencoders

In [8]:
import os

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import set_random_seed

In [9]:
# Funciones auxiliares

def filtrar_outliers(df, columna, p_inf=0.01, p_sup=0.99):
    """Keep only the rows whose value in ``columna`` lies inside a quantile band.

    Args:
        df: DataFrame to filter; it is not modified.
        columna: Name of the column whose values decide which rows stay.
        p_inf: Lower quantile (fraction in [0, 1]) used as the minimum allowed value.
        p_sup: Upper quantile (fraction in [0, 1]) used as the maximum allowed value.

    Returns:
        The rows of ``df`` whose ``columna`` value is between the two quantile
        limits, both limits included. The original index labels are kept.
    """
    valores = df[columna]
    limite_bajo = valores.quantile(p_inf)
    limite_alto = valores.quantile(p_sup)
    # Both comparisons are inclusive; NaN values fail both and are dropped.
    dentro_de_rango = (valores >= limite_bajo) & (valores <= limite_alto)
    return df[dentro_de_rango]

def cargar_datos(ruta):
    """Read the examination CSV and derive the binary / age columns.

    Args:
        ruta: Path or file-like object accepted by ``pd.read_csv``.

    Returns:
        A DataFrame with the CSV contents where
        - ``cholesterol`` is replaced by 1 when the raw value is > 1, else 0;
        - ``gluc`` is replaced by 1 when the raw value is > 1, else 0;
        - ``age_years`` is added as ``age / 365`` truncated to an integer
          (presumably ``age`` is stored in days -- confirm against the dataset).
    """
    crudo = pd.read_csv(ruta)
    return crudo.assign(
        cholesterol=lambda tabla: tabla["cholesterol"].gt(1).astype(int),
        gluc=lambda tabla: tabla["gluc"].gt(1).astype(int),
        age_years=lambda tabla: tabla["age"].div(365).astype(int),
    )

def agregar_bmi(df):
    """Add ``BMI`` and ``overweight`` columns to ``df`` in place.

    ``BMI`` is ``weight / (height / 100) ** 2`` (presumably weight in kg and
    height in cm -- confirm against the dataset). ``overweight`` is 1 where
    BMI is strictly greater than 25 and 0 otherwise.

    Args:
        df: DataFrame with ``weight`` and ``height`` columns. It is mutated.

    Returns:
        The same DataFrame object that was passed in, with the two new columns.
    """
    altura_m = df["height"].div(100)
    indice = df["weight"].div(altura_m.pow(2))
    df["BMI"] = indice
    df["overweight"] = indice.gt(25).astype(int)
    return df

In [10]:
# Data preparation
# The CSV location can be overridden with the MEDICAL_EXAMINATION_CSV
# environment variable so the notebook is not tied to one machine; when the
# variable is unset the original local path is used unchanged.
ruta = os.environ.get(
    "MEDICAL_EXAMINATION_CSV",
    "C:/Users/trezz/Desktop/proyectosDeDesarrolloWeb/MedicalExamination/data/medical_examination.csv",
)
df = cargar_datos(ruta)
df = agregar_bmi(df)

# Filter outliers one column at a time. Each pass recomputes the quantile
# limits on the rows that survived the previous passes, so the order of the
# columns affects which rows are kept.
for col in ["height", "weight", "ap_hi", "ap_lo", "BMI"]:
    df = filtrar_outliers(df, col)

# Boolean filtering returns a frame that pandas may treat as a slice of the
# unfiltered data; take an explicit copy so columns can be added to `df`
# later without a SettingWithCopyWarning.
df = df.copy()

# Select the relevant numeric variables
features = ["age_years", "height", "weight", "ap_hi", "ap_lo", "cholesterol", "gluc", "BMI"]
X = df[features].values

# Standardise every feature (zero mean, unit variance) before training
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [11]:
# Autoencoder definition

# Seed Python, NumPy and the Keras backend before any layer is created so
# that weight initialisation (and the batch shuffling done later by `fit`)
# starts from the same random state on every Restart & Run All.
SEED = 42
set_random_seed(SEED)

input_dim = X_scaled.shape[1]
hidden_dim = 8    # width of the intermediate encoder / decoder layers
encoding_dim = 4  # size of the latent space

# Encoder: input_dim -> hidden_dim -> encoding_dim
input_layer = Input(shape=(input_dim,))
encoder = Dense(hidden_dim, activation="relu")(input_layer)
encoder = Dense(encoding_dim, activation="relu")(encoder)

# Decoder mirrors the encoder: encoding_dim -> hidden_dim -> input_dim.
# The output layer is linear, so reconstructed values are not bounded.
decoder = Dense(hidden_dim, activation="relu")(encoder)
decoder = Dense(input_dim, activation="linear")(decoder)

# Full model maps the input to its reconstruction and is trained with MSE.
autoencoder = Model(inputs=input_layer, outputs=decoder)
autoencoder.compile(optimizer=Adam(learning_rate=0.001), loss="mse")

In [12]:
# Training
# Input and target are the same matrix: the network is fitted to reproduce
# its own scaled input. 20% of the rows are held out for validation.
opciones_entrenamiento = {
    "epochs": 10,
    "batch_size": 32,
    "shuffle": True,
    "validation_split": 0.2,
}
history = autoencoder.fit(x=X_scaled, y=X_scaled, **opciones_entrenamiento)

Epoch 1/10
[1m1630/1630[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - loss: 0.6393 - val_loss: 0.4221
Epoch 2/10
[1m1630/1630[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - loss: 0.3424 - val_loss: 0.2388
Epoch 3/10
[1m1630/1630[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - loss: 0.1889 - val_loss: 0.1664
Epoch 4/10
[1m1630/1630[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - loss: 0.1614 - val_loss: 0.1562
Epoch 5/10
[1m1630/1630[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - loss: 0.1541 - val_loss: 0.1517
Epoch 6/10
[1m1630/1630[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - loss: 0.1513 - val_loss: 0.1501
Epoch 7/10
[1m1630/1630[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - loss: 0.1496 - val_loss: 0.1508
Epoch 8/10
[1m1630/1630[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - loss: 0.1486 - val_loss: 0.1485
Epoch 9/10
[1m1630/1630

In [13]:
# Per-row reconstruction error: mean over the feature axis of the squared
# difference between the scaled input and the autoencoder's output.
reconstructions = autoencoder.predict(X_scaled)
residuals = X_scaled - reconstructions
mse = np.power(residuals, 2).mean(axis=1)
# One error value per row of `df`, assigned positionally.
df["reconstruction_error"] = mse

[1m2038/2038[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 434us/step
