In [15]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import os
import re
import ast
import matplotlib.pyplot as plt


In [16]:
# ==== CONFIGURACIÓN ====
model_dir = "/home/77462217B/lois/ADMeth/model/gridsearchmodelwomasklight22K"
datasets = [
    "/home/77462217B/lois/ADMeth/data/ValidateDataset5K_float16.npy",
    "/home/77462217B/lois/ADMeth/data/datasets/Michaud_float16.npy",
    "/home/77462217B/lois/ADMeth/data/datasets/FraCon_float16.npy",
    "/home/77462217B/lois/ADMeth/data/datasets/FraCas_float16.npy"
]

output_base_dir = "/home/77462217B/lois/ADMeth/outcomes/griddatasetv2outcomes/"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [17]:
# ==== LEER CONFIGURACIÓN DEL MODELO ====
info_path = os.path.join(model_dir, "model_info.txt")
if not os.path.exists(info_path):
    raise FileNotFoundError(f"No se encontró model_info.txt en {model_dir}")

with open(info_path, "r") as f:
    info_lines = f.readlines()

# Parsear info
info_dict = {}
for line in info_lines:
    key, value = line.strip().split(":", 1)
    info_dict[key.strip()] = value.strip()

dense_layers = ast.literal_eval(info_dict["Dense layers"])
latent_dim = int(info_dict["Latent dimensions"])
use_dropout = info_dict["Dropout"].lower() == "true"
use_batchnorm = info_dict["BatchNorm"].lower() == "true"

segment_size = 10000
print(f"Configuración leída del modelo: Dropout={use_dropout}, BatchNorm={use_batchnorm}, Segment size={segment_size}")

Configuración leída del modelo: Dropout=False, BatchNorm=True, Segment size=10000


In [18]:
# ==== DEFINICIÓN DEL MODELO ====
class SegmentAutoencoder(nn.Module):
    def __init__(self, input_size, latent_size, use_dropout, use_batchnorm):
        super().__init__()
        layers_enc = [nn.Linear(input_size, dense_layers[0]), nn.ReLU()]
        if use_batchnorm: layers_enc.append(nn.BatchNorm1d(dense_layers[0]))
        if use_dropout: layers_enc.append(nn.Dropout(0.2))
        layers_enc.append(nn.Linear(dense_layers[0], latent_size))
        layers_enc.append(nn.ReLU())
        if use_batchnorm: layers_enc.append(nn.BatchNorm1d(latent_size))
        if use_dropout: layers_enc.append(nn.Dropout(0.2))

        layers_dec = [nn.Linear(latent_size, dense_layers[2]), nn.ReLU()]
        if use_batchnorm: layers_dec.append(nn.BatchNorm1d(dense_layers[2]))
        if use_dropout: layers_dec.append(nn.Dropout(0.2))
        layers_dec.append(nn.Linear(dense_layers[2], input_size))
        layers_dec.append(nn.Sigmoid())

        self.encoder = nn.Sequential(*layers_enc)
        self.decoder = nn.Sequential(*layers_dec)

    def forward(self, x):
        return self.decoder(self.encoder(x))

In [19]:
# ==== LISTAR MODELOS ORDENADOS ====
model_files = sorted(
    [f for f in os.listdir(model_dir) if f.startswith("autoencoder_segment_") and f.endswith(".pth")],
    key=lambda x: int(re.search(r"_(\d+)\.pth", x).group(1))
)

num_segments = len(model_files)
print(f"Se encontraron {num_segments} modelos de segmentos.")

Se encontraron 32 modelos de segmentos.


In [20]:
# ==== EVALUAR ====
for dataset_path in datasets:
    dataset_name = os.path.splitext(os.path.basename(dataset_path))[0]

    # Crear carpeta específica para este dataset
    dataset_output_dir = os.path.join(output_base_dir, dataset_name)
    os.makedirs(dataset_output_dir, exist_ok=True)

    print(f"\n→ Evaluando dataset: {dataset_name}")
    X_data = np.load(dataset_path).astype(np.float16).T
    n_samples, total_features = X_data.shape
    expected_features = 320000
    assert total_features == expected_features, f"{dataset_path} tiene {total_features} columnas, se esperaban {expected_features}"

    mse_matrix = np.zeros((n_samples, total_features), dtype=np.float16)
    segment_mse_avgs = []

    for idx, model_file in enumerate(model_files):
        # ==== Seleccionar el segmento correspondiente ====
        start = idx * segment_size
        end = start + segment_size
        X_segment = X_data[:, start:end]  # <-- División en segmentos
    
        # ==== Cargar el modelo del segmento ====
        model = SegmentAutoencoder(segment_size, latent_dim, use_dropout, use_batchnorm).to(device)
        model.load_state_dict(torch.load(os.path.join(model_dir, model_file), map_location=device))
        model.eval()
    
        # ==== Predicciones y cálculo de MSE por posición ====
#        with torch.no_grad():
#            X_tensor = torch.tensor(X_segment, device=device, dtype=torch.float32)
#            preds = model(X_tensor).cpu().numpy()
#            mse_per_position = (preds - X_segment) ** 2
#            mse_matrix[:, start:end] = mse_per_position
#            segment_mse_avgs.append(mse_per_position.mean())
            # ==== Predicciones y cálculo de MSE por posición (ignorando ceros como missing) ====
        with torch.no_grad():
            X_tensor = torch.tensor(X_segment, device=device, dtype=torch.float32)
            preds = model(X_tensor).cpu().numpy().astype(np.float16)
    
        X_seg32 = X_segment.astype(np.float32)
        mse_per_position = (preds - X_seg32) ** 2
    
        # Máscara de missing (valor original exactamente 0)
        missing_mask = (X_segment == 0)
    
        # Ponemos -1 en las posiciones missing en la matriz de salida
        mse_per_position[missing_mask] = -1.0
    
        # Guardamos en la matriz global
        mse_matrix[:, start:end] = mse_per_position.astype(np.float16)
    
        # Media del segmento sin contar missing
        valid_mask = ~missing_mask
        if np.any(valid_mask):
            seg_mean = mse_per_position[valid_mask].mean()
        else:
            seg_mean = np.nan
        segment_mse_avgs.append(seg_mean)



    # ==== MÉTRICAS GLOBALES (ignorando missings) ====
    missing_mask_all = (mse_matrix == -1)
    total_entries = mse_matrix.size
    num_missing = int(missing_mask_all.sum())
    missing_pct = 100.0 * num_missing / total_entries
    
    valid_mask_all = ~missing_mask_all
    if np.any(valid_mask_all):
        # AE = |pred - true| = sqrt(MSE por posición)
        ae_valid = np.sqrt(mse_matrix[valid_mask_all].astype(np.float32))
    
        mae_global = float(ae_valid.mean())
        rmse_global = float(np.sqrt(mse_matrix[valid_mask_all].astype(np.float32).mean()))
        median_ae = float(np.median(ae_valid))
        p95_ae = float(np.percentile(ae_valid, 95))
    
        # MSE global (por si lo seguías guardando)
        dataset_mse_global = float(mse_matrix[valid_mask_all].astype(np.float32).mean())
    else:
        mae_global = rmse_global = median_ae = p95_ae = np.nan
        dataset_mse_global = np.nan
    
    print(f"✅ MSE global: {dataset_mse_global:.6f}")
    print(f"✅ MAE: {mae_global:.6f} | RMSE: {rmse_global:.6f} | Mediana AE: {median_ae:.6f} | P95 AE: {p95_ae:.6f}")
    print(f"✅ Missing: {missing_pct:.2f}%")
    
    # ==== PLOT DISTRIBUCIÓN DE ERRORES ABSOLUTOS ====
    max_points_for_plot = 2_000_000
    ae_for_plot = ae_valid
    if ae_for_plot.size > max_points_for_plot:
        idx = np.random.choice(ae_for_plot.size, size=max_points_for_plot, replace=False)
        ae_for_plot = ae_for_plot[idx]
    
    plt.figure()
    plt.hist(ae_for_plot, bins=100)
    plt.xlabel("Absolute Error")
    plt.ylabel("Count")
    plt.title(f"Abs Error Distribution – {dataset_name}")
    plot_path = os.path.join(dataset_output_dir, f"{dataset_name}_abs_error_hist.png")
    plt.tight_layout()
    plt.savefig(plot_path, dpi=150)
    plt.close()
    print(f"Guardado plot: {plot_path}")


    # ==== GUARDAR MATRIZ COMPLETA ====
    mse_npy_path = os.path.join(dataset_output_dir, f"{dataset_name}_mse_per_sample_per_position.npy")
    np.save(mse_npy_path, mse_matrix)
    print(f"Guardado: {mse_npy_path}")

    # ==== GUARDAR RESUMEN ====
    summary_row = {
        "Dataset": dataset_name,
        "Missing_Pct": missing_pct,
        "MSE_Global": dataset_mse_global,
        "MAE_Global": mae_global,
        "RMSE_Global": rmse_global,
        "Median_AE": median_ae,
        "P95_AE": p95_ae,
        **{f"MSE_Segment_{i+1}": segment_mse_avgs[i] for i in range(num_segments)}
    }

    summary_df = pd.DataFrame([summary_row])
    summary_csv_path = os.path.join(dataset_output_dir, f"{dataset_name}_mse_summary.csv")
    summary_df.to_csv(summary_csv_path, index=False)
    print(f"Guardado: {summary_csv_path}")

    # ==== GUARDAR CONFIGURACIÓN DEL MODELO ====
    info_copy_path = os.path.join(dataset_output_dir, "model_info.txt")
    with open(info_copy_path, "w") as f:
        f.writelines(info_lines)
    print(f"Guardado: {info_copy_path}")



→ Evaluando dataset: ValidateDataset5K_float16


RuntimeError: Error(s) in loading state_dict for SegmentAutoencoder:
	Missing key(s) in state_dict: "encoder.3.weight", "encoder.3.bias", "encoder.5.weight", "encoder.5.bias", "encoder.5.running_mean", "encoder.5.running_var", "decoder.3.weight", "decoder.3.bias". 
	Unexpected key(s) in state_dict: "encoder.6.weight", "encoder.6.bias", "encoder.6.running_mean", "encoder.6.running_var", "encoder.6.num_batches_tracked", "encoder.4.weight", "encoder.4.bias", "decoder.4.weight", "decoder.4.bias". 