# Splunk App for Data Science and Deep Learning - Notebook for Autoencoder with TensorFlow and Keras (version 2.15)

## Autoencoder Example
This notebook contains an example workflow for developing custom containerized code that interfaces seamlessly with the Splunk App for Data Science and Deep Learning (DSDL). As an example, we use a custom autoencoder built with Keras and TensorFlow.

Note: By default every time you save this notebook the cells are exported into a python module which is then invoked by Splunk MLTK commands like <code> | fit ... | apply ... | summary </code>. Please read the Model Development Guide in the Deep Learning Toolkit app for more information.

## Stage 0 - import libraries
At stage 0 we define all the library imports that the subsequent code depends on.

In [None]:
# mltkc_import
# this definition exposes all python module imports that should be available in all subsequent commands

import json
import datetime
import numpy as np
import pandas as pd
import tensorflow as tf
import keras
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

# Importar helpers empresariales
import sys
sys.path.append('/dltk/notebooks_custom/helpers')

from telemetry_helper import log_metrics, log_training_step, log_error
from metrics_calculator import calculate_all_metrics
from preprocessor import standard_preprocessing, apply_preprocessing

# Directory where trained model artifacts are written
MODEL_DIRECTORY = "/srv/app/model/data/"

# Model identity; MODEL_NAME is the four parts below joined with underscores
APP_NAME = "app1"
MODEL_TYPE = "autoencoder"
USE_CASE = "demo_anomalias"
VERSION = "v1"
MODEL_NAME = "_".join((APP_NAME, MODEL_TYPE, USE_CASE, VERSION))

print("üì¶ Modelo configurado: " + MODEL_NAME)
print("‚úÖ Helpers empresariales importados correctamente")

In [None]:
# THIS CELL IS NOT EXPORTED - free notebook cell for testing purposes
# Report the version of each core library available in this kernel
for label, module in (("numpy", np), ("pandas", pd), ("TensorFlow", tf), ("Keras", keras)):
    print(f"{label} version: {module.__version__}")

In [None]:
### QUERY DATA FROM SPLUNK

# THIS CELL IS NOT EXPORTED - EDA: data exploration
from dsdlsupport import SplunkSearch

# Fetch a bounded sample (first 1000 events, feature_* fields only) to explore
print("üîç Obteniendo muestra de datos de Splunk...")
search = SplunkSearch.SplunkSearch(search='index=demo_anomalias_data | head 1000 | table feature_*')

In [None]:
### LOAD DATA SPLUNK TO DATA FRAME

# THIS CELL IS NOT EXPORTED - EDA: load the search results
df_eda = search.as_df()
n_rows, n_cols = df_eda.shape
print(f"‚úÖ Datos obtenidos: {n_rows} filas, {n_cols} columnas")
df_eda.head()

In [None]:
### 2.2 Basic dataset information

# THIS CELL IS NOT EXPORTED - EDA: basic information
banner = "=" * 60
print(banner)
print("INFORMACI√ìN B√ÅSICA DEL DATASET")
print(banner)
print(f"\nüìä Dimensiones: {df_eda.shape}")
print(f"\nüìã Columnas: {df_eda.columns.tolist()}")
print(f"\nüìà Tipos de datos:\n{df_eda.dtypes}")
print("\nüìâ Informaci√≥n completa:")
df_eda.info()

In [None]:
### 2.3 Descriptive statistics

# THIS CELL IS NOT EXPORTED - EDA: descriptive statistics
banner = "=" * 60
print(banner, "ESTAD√çSTICAS DESCRIPTIVAS", banner, sep="\n")
print(df_eda.describe())


In [None]:
### 2.4 Missing-value detection

# THIS CELL IS NOT EXPORTED - EDA: missing values
banner = "=" * 60
print(banner, "VALORES FALTANTES", banner, sep="\n")

# Per-column count of null entries
missing = df_eda.isnull().sum()
if not missing.sum() > 0:
    print("‚úÖ No hay valores faltantes")
else:
    print("‚ö†Ô∏è Se encontraron valores faltantes:")
    # Only list the columns that actually contain nulls
    print(missing[missing > 0])

In [None]:
### 2.5 Basic visualizations

# THIS CELL IS NOT EXPORTED - EDA: visualizations
import matplotlib.pyplot as plt
import seaborn as sns

# Plot style and a 2x2 grid of panels
plt.style.use('seaborn-v0_8-darkgrid')
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
fig.suptitle('An√°lisis Exploratorio de Datos - demo_anomalias_data', fontsize=16)

# One histogram per panel for the first four columns, filled row by row
for ax, col in zip(axes.flat, df_eda.columns[:4]):
    df_eda[col].hist(bins=50, ax=ax, alpha=0.7)
    ax.set_title(f'Distribuci√≥n de {col}')
    ax.set_xlabel('Valor')
    ax.set_ylabel('Frecuencia')

plt.tight_layout()
plt.show()

In [None]:
### 2.6 Correlation matrix

# THIS CELL IS NOT EXPORTED - EDA: correlations
import numpy as np

# Correlate numeric columns only: DataFrame.corr() on a frame that still holds
# non-numeric columns raises in recent pandas versions, and the conclusions
# cell of this notebook applies the same numeric-only selection.
corr_matrix = df_eda.select_dtypes(include=[np.number]).corr()

# Heatmap drawn on an explicit Axes so the title and layout target this figure
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', center=0,
            square=True, linewidths=1, cbar_kws={"shrink": 0.8}, ax=ax)
ax.set_title('Matriz de Correlaci√≥n - Features')
fig.tight_layout()
plt.show()

In [None]:
### 2.7 EDA conclusions

# THIS CELL IS NOT EXPORTED - EDA: conclusions
banner = "=" * 60
print(banner, "CONCLUSIONES DEL EDA", banner, sep="\n")

# Numeric-only view used for the range and scale checks below
df_numeric = df_eda.select_dtypes(include=[np.number])
numeric_cols = df_numeric.columns

n_rows, n_features = df_eda.shape
print("‚úÖ Dimensiones del dataset:", df_eda.shape)
print("   - Filas (muestras):", n_rows)
print("   - Columnas (features):", n_features)
print("   - Features num√©ricas:", len(numeric_cols))

total_missing = df_eda.isnull().sum().sum()
print("\n‚úÖ Valores faltantes:", total_missing)
if total_missing > 0:
    print("   ‚ö†Ô∏è  Hay valores faltantes que necesitamos manejar")

# Value range across all numeric columns
if len(numeric_cols) == 0:
    print("\n‚ö†Ô∏è  No se encontraron columnas num√©ricas")
else:
    min_val = df_numeric.min().min()
    max_val = df_numeric.max().max()
    print("\n‚úÖ Rango de valores (num√©ricos):")
    print(f"   - M√≠nimo: {min_val:.2f}")
    print(f"   - M√°ximo: {max_val:.2f}")
    print(f"   - Rango total: {max_val - min_val:.2f}")

    # Compare the largest and smallest per-column standard deviation to
    # decide how strongly normalization is needed
    std_values = df_numeric.std()
    scales_differ = std_values.max() / std_values.min() > 10
    if scales_differ:
        print("   ‚ö†Ô∏è  Hay features con escalas muy diferentes ‚Üí Normalizaci√≥n REQUERIDA")
    else:
        print("   ‚úÖ Escalas similares ‚Üí Normalizaci√≥n recomendada")

for line in (
    "\nüìù Decisiones para el modelo basadas en EDA:",
    "   - Features a usar: Todas las num√©ricas disponibles",
    "   - Preprocesamiento: Normalizaci√≥n (StandardScaler)",
    "   - Arquitectura: Autoencoder simple (input ‚Üí encoding ‚Üí output)",
    "   - Encoding dimension: ~10% del input dimension (ajustable)",
):
    print(line)

In [None]:
# THIS CELL IS NOT EXPORTED - verify that the helper modules can be imported
import importlib

print("üîç Verificando helpers empresariales...")

# (module, attribute) pairs that the rest of the notebook imports
required_helpers = [
    ("telemetry_helper", "log_metrics"),
    ("metrics_calculator", "calculate_all_metrics"),
    ("preprocessor", "standard_preprocessing"),
]

# Names of the helper modules that could not be imported
missing_helpers = []
for module_name, attr_name in required_helpers:
    try:
        # Check both that the module imports and that it exposes the function
        getattr(importlib.import_module(module_name), attr_name)
        print(f"‚úÖ {module_name} importado")
    except (ImportError, AttributeError) as e:
        missing_helpers.append(module_name)
        print(f"‚ùå Error importando {module_name}: {e}")

# Only report overall success when every helper really imported
if missing_helpers:
    print(f"\n‚ùå Helpers no disponibles: {', '.join(missing_helpers)}")
else:
    print("\n‚úÖ Todos los helpers est√°n disponibles")

## Stage 2 - create and initialize a model

In [None]:
# mltkc_init
# initialize the model
# params: data and parameters
# returns the model object which will be used as a reference to call fit, apply and summary subsequently

def init(df, param):
    """
    Build and compile a single-hidden-layer autoencoder for anomaly detection.

    Feature columns are resolved with the same rule fit() and apply() use:
    the explicit param['feature_variables'] list when present, otherwise every
    column whose name starts with 'feature_', otherwise every numeric column.
    Sharing one rule keeps the network's input width equal to the width of the
    matrix it is later trained and applied on.

    Args:
        df: DataFrame with the data received from Splunk.
        param: Parameter dictionary. Optional keys under
            param['options']['params']:
              - encoding_dim / components: bottleneck width (default 10;
                'components' takes precedence when both are given).
              - activation: encoder activation (default 'relu').
              - decoder_activation: output-layer activation (default 'linear').

    Returns:
        Compiled Keras Sequential model (optimizer 'adam', loss 'mse',
        metric 'mae').

    Raises:
        ValueError: if no feature column can be resolved from df.
    """
    print(f"üîß Inicializando modelo: {MODEL_NAME}")

    # Resolve the feature columns (prefix first, numeric dtype as fallback)
    if 'feature_variables' in param:
        feature_cols = param['feature_variables']
    else:
        feature_cols = [col for col in df.columns if str(col).startswith('feature_')]
        if not feature_cols:
            feature_cols = df.select_dtypes(include=[np.number]).columns.tolist()

    X = df[feature_cols] if feature_cols else df.select_dtypes(include=[np.number])

    print(f"üìä Shape de los datos: {X.shape}")
    print(f"üìã Features seleccionadas: {len(X.columns)}")

    input_dim = X.shape[1]
    if input_dim == 0:
        raise ValueError(
            "init() no encuentra features: se requiere 'feature_variables', "
            "columnas 'feature_*' o columnas de tipo num\u00e9rico"
        )

    # Model hyper-parameters, overridable through param['options']['params']
    model_opts = param.get('options', {}).get('params', {})
    encoding_dim = 10  # width of the bottleneck layer
    if 'encoding_dim' in model_opts:
        encoding_dim = int(model_opts['encoding_dim'])
    if 'components' in model_opts:
        encoding_dim = int(model_opts['components'])

    activation = model_opts.get('activation', 'relu')
    # fit() trains on StandardScaler output, which is centred on zero and
    # therefore contains negative targets. A non-negative output activation
    # such as ReLU can never reproduce those, so the decoder defaults to a
    # linear output.
    decoder_activation = model_opts.get('decoder_activation', 'linear')

    print(f"‚öôÔ∏è  Par√°metros del modelo:")
    print(f"   - Input dimension: {input_dim}")
    print(f"   - Encoding dimension: {encoding_dim}")
    print(f"   - Activation: {activation}")
    print(f"   - Decoder activation: {decoder_activation}")

    # Encoder: input_dim -> encoding_dim
    encoder = keras.layers.Dense(
        encoding_dim,
        activation=activation,
        input_shape=(input_dim,),
        name='encoder'
    )

    # Decoder: encoding_dim -> input_dim
    decoder = keras.layers.Dense(
        input_dim,
        activation=decoder_activation,
        name='decoder'
    )

    model = keras.Sequential([encoder, decoder], name='Autoencoder')

    # MSE is the reconstruction loss; MAE is tracked as an additional metric
    model.compile(
        optimizer='adam',
        loss='mse',
        metrics=['mae']
    )

    print(f"‚úÖ Modelo compilado exitosamente")
    print(f"üìê Arquitectura: {input_dim} ‚Üí {encoding_dim} ‚Üí {input_dim}")

    return model

In [None]:
# THIS CELL IS NOT EXPORTED - local smoke test for init()
# Dummy data: five standard-normal feature columns with 100 rows each
init_feature_names = [f'feature_{i}' for i in range(5)]
test_df = pd.DataFrame({name: np.random.randn(100) for name in init_feature_names})

test_param = {
    'feature_variables': list(init_feature_names),
    'options': {'params': {'encoding_dim': 10}},
}

test_model = init(test_df, test_param)
print("\nüìä Resumen del modelo:")
test_model.summary()

## Stage 3 - fit the model

In [None]:
# mltkc_stage_create_model_fit
# returns a fit info json object

def fit(model, df, param):
    """
    Train the autoencoder on standardized features and report metrics to Splunk.

    Steps: resolve the feature columns, fit a StandardScaler on them, train the
    model to reconstruct its own scaled input, evaluate on the full training
    set and send the final metrics through the telemetry helpers. Telemetry is
    best-effort: failures are printed and never interrupt training.

    The fitted scaler is returned under 'scaler' and also attached to the model
    as ``model.scaler``, which is where apply() looks for it when the caller
    does not pass one in ``param``.

    Args:
        model: Compiled Keras model returned by init().
        df: DataFrame with the training data.
        param: Parameter dictionary. Optional keys under
            param['options']['params']: epochs (default 50), batch_size
            (default 32), validation_split (default 0.2).

    Returns:
        dict with keys: scaler, fit_history (Keras History object),
        model_epochs, model_batch_size, model_loss, model_mae, mse, rmse.

    Raises:
        ValueError: if the selected feature matrix has no rows or no columns.
    """
    import traceback

    print(f"üöÄ Iniciando entrenamiento del modelo: {MODEL_NAME}")

    returns = {}

    # Resolve the feature columns (prefix first, numeric dtype as fallback)
    if 'feature_variables' in param:
        feature_cols = param['feature_variables']
    else:
        feature_cols = [col for col in df.columns if str(col).startswith('feature_')]
        if not feature_cols:
            feature_cols = df.select_dtypes(include=[np.number]).columns.tolist()

    X = df[feature_cols] if feature_cols else df.select_dtypes(include=[np.number])

    print(f"üìä Datos de entrenamiento: {X.shape[0]} muestras, {X.shape[1]} features")

    if X.shape[0] == 0 or X.shape[1] == 0:
        raise ValueError(
            f"fit() requiere al menos una fila y una feature; shape recibido: {X.shape}"
        )

    # Preprocessing: standardize every feature
    print("üîß Aplicando preprocesamiento (normalizaci√≥n)...")
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)

    # Keep the training scaler both in the return value and on the model
    # object; apply() falls back to `model.scaler` when param has no scaler,
    # which avoids re-fitting a scaler on inference data.
    returns['scaler'] = scaler
    model.scaler = scaler

    # Training hyper-parameters, overridable through param['options']['params']
    train_opts = param.get('options', {}).get('params', {})
    epochs = int(train_opts['epochs']) if 'epochs' in train_opts else 50
    batch_size = int(train_opts['batch_size']) if 'batch_size' in train_opts else 32
    validation_split = (
        float(train_opts['validation_split']) if 'validation_split' in train_opts else 0.2
    )

    print(f"‚öôÔ∏è  Par√°metros de entrenamiento:")
    print(f"   - Epochs: {epochs}")
    print(f"   - Batch size: {batch_size}")
    print(f"   - Validation split: {validation_split}")

    # TensorBoard callback writing to a per-run, timestamped directory
    timestamp = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
    log_dir = f"/srv/notebooks/logs/fit/{MODEL_NAME}_{timestamp}"
    tensorboard_callback = tf.keras.callbacks.TensorBoard(
        log_dir=log_dir,
        histogram_freq=1
    )

    class TelemetryCallback(tf.keras.callbacks.Callback):
        """Forwards the per-epoch metrics to log_training_step()."""

        def on_epoch_end(self, epoch, logs=None):
            logs = logs or {}

            def metric(key):
                # Cast to a native float so the value is JSON serializable;
                # a metric missing from `logs` is reported as 0.0.
                value = logs.get(key)
                return float(value) if value is not None else 0.0

            try:
                log_training_step(
                    model_name=MODEL_NAME,
                    epoch=int(epoch + 1),  # Keras epochs are 0-based
                    loss=metric('loss'),
                    val_loss=metric('val_loss'),
                    mae=metric('mae'),
                    val_mae=metric('val_mae')
                )
            except Exception as e:
                # Telemetry must never abort training
                print(f"‚ö†Ô∏è  Error enviando telemetr√≠a en √©poca {epoch + 1}: {e}")
                print(f"   Traceback completo: {traceback.format_exc()}")

    telemetry_callback = TelemetryCallback()

    # Train: the target equals the input because this is an autoencoder
    print("\nüèãÔ∏è  Iniciando entrenamiento...")
    history = model.fit(
        x=X_scaled_df,
        y=X_scaled_df,
        epochs=epochs,
        batch_size=batch_size,
        validation_split=validation_split,
        verbose=1,
        callbacks=[tensorboard_callback, telemetry_callback]
    )

    returns['fit_history'] = history
    returns['model_epochs'] = epochs
    returns['model_batch_size'] = batch_size

    # Evaluate on the full (scaled) training set
    print("\nüìä Evaluando modelo en datos completos...")
    eval_out = model.evaluate(X_scaled_df, X_scaled_df, verbose=0)
    # evaluate() yields [loss, *metrics] when metrics are compiled in and a
    # bare scalar otherwise; normalize to a list so both cases are handled.
    eval_values = list(eval_out) if isinstance(eval_out, (list, tuple)) else [eval_out]
    final_loss = float(eval_values[0])
    final_mae = float(eval_values[1]) if len(eval_values) > 1 else None
    returns['model_loss'] = final_loss
    returns['model_mae'] = final_mae

    print(f"‚úÖ Entrenamiento completado")
    print(f"   - Loss final: {final_loss:.6f}")
    if final_mae is not None:
        print(f"   - MAE final: {final_mae:.6f}")

    # Reconstruction metrics over the full training set
    print("\nüìà Calculando m√©tricas de reconstrucci√≥n...")
    X_pred = model.predict(X_scaled_df, verbose=0)

    mse = float(mean_squared_error(X_scaled_df.values, X_pred))
    rmse = float(np.sqrt(mse))

    returns['mse'] = mse
    returns['rmse'] = rmse

    print(f"   - MSE: {mse:.6f}")
    print(f"   - RMSE: {rmse:.6f}")

    # Send the final metrics to Splunk (best-effort). Every value passed here
    # is already a native Python float or None.
    try:
        log_metrics(
            model_name=MODEL_NAME,
            r2_score=None,  # not reported for the autoencoder
            mae=final_mae,
            rmse=rmse,
            mse=mse,
            loss=final_loss,
            app_name=APP_NAME,
            model_version=VERSION,
            project=USE_CASE
        )
        print("‚úÖ M√©tricas enviadas a Splunk")
    except Exception as e:
        print(f"‚ö†Ô∏è  Error enviando m√©tricas a Splunk: {e}")
        print(f"   Traceback completo: {traceback.format_exc()}")

    return returns

In [None]:
# THIS CELL IS NOT EXPORTED - local smoke test for fit()
# Larger dummy data set: five standard-normal feature columns, 500 rows each
fit_feature_names = [f'feature_{i}' for i in range(5)]
test_df_fit = pd.DataFrame({name: np.random.randn(500) for name in fit_feature_names})

# Option values are strings here; fit() casts them with int()/float()
test_param_fit = {
    'feature_variables': list(fit_feature_names),
    'options': {
        'params': {
            'epochs': '10',  # few epochs to keep the test quick
            'batch_size': '32',
            'validation_split': '0.2',
        }
    },
}

# Build a fresh model for the test
test_model_fit = init(test_df_fit, test_param_fit)

# Train it (may take a few minutes)
print("‚è≥ Entrenando modelo de prueba (esto tomar√° unos minutos)...")
fit_results = fit(test_model_fit, test_df_fit, test_param_fit)

print("\n‚úÖ Test de fit completado exitosamente")
for label, key in (("Loss", 'model_loss'), ("MSE", 'mse')):
    print(f"   - {label}: {fit_results.get(key, 'N/A')}")

## Stage 4 - apply the model

In [None]:
# mltkc_stage_create_model_apply

def apply(model, df, param):
    """
    Score rows with the trained autoencoder and flag anomalies.

    Each row is standardized, reconstructed by the model and scored with the
    mean squared reconstruction error over its features. A row is flagged when
    its error exceeds the anomaly threshold.

    Scaler lookup order: param['scaler'], then ``model.scaler``. If neither is
    available a new StandardScaler is fitted on the incoming data (a warning
    is printed), which makes the scores depend on the batch being scored.

    Threshold: param['options']['params']['anomaly_threshold'] when given
    (a fixed value); otherwise the 95th percentile of the errors in this
    batch. With the percentile rule a single-row batch can never be flagged,
    because the threshold then equals that row's own error.

    Args:
        model: Trained Keras model.
        df: DataFrame with the rows to score.
        param: Parameter dictionary (see above for the optional keys).

    Returns:
        DataFrame indexed like the input with columns reconstruction_error,
        anomaly_score (error divided by the threshold), is_anomaly (0/1) and,
        per feature, reconstruction_<feature> and original_<feature>. The
        reconstruction_* values are in scaled space, the original_* values
        are the unscaled inputs.
    """
    print(f"üîÆ Aplicando modelo: {MODEL_NAME}")

    # Resolve the feature columns (prefix first, numeric dtype as fallback)
    if 'feature_variables' in param:
        feature_cols = param['feature_variables']
    else:
        feature_cols = [col for col in df.columns if str(col).startswith('feature_')]
        if not feature_cols:
            feature_cols = df.select_dtypes(include=[np.number]).columns.tolist()

    X = df[feature_cols] if feature_cols else df.select_dtypes(include=[np.number])

    print(f"üìä Datos de inferencia: {X.shape[0]} muestras, {X.shape[1]} features")

    # Nothing to score: return an empty frame with the usual columns instead
    # of failing inside the scaler or the percentile computation.
    if X.shape[0] == 0:
        empty_columns = ['reconstruction_error', 'anomaly_score', 'is_anomaly']
        for col in X.columns:
            empty_columns += [f'reconstruction_{col}', f'original_{col}']
        print("No hay filas para inferencia; se devuelve un resultado sin filas")
        return pd.DataFrame(columns=empty_columns, index=X.index)

    # Training scaler: explicit param first, then the attribute on the model.
    # A None value in param no longer blocks the model-attribute lookup.
    scaler = param.get('scaler')
    if scaler is None:
        scaler = getattr(model, 'scaler', None)

    if scaler is not None:
        X_scaled = scaler.transform(X)
        print("‚úÖ Usando scaler del entrenamiento")
    else:
        # Fallback: statistics come from the data being scored
        print("‚ö†Ô∏è  Scaler no encontrado en param. Aplicando normalizaci√≥n nueva...")
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)

    X_scaled = np.asarray(X_scaled)

    # Reconstruct and compute the per-row mean squared error
    print("üîÑ Calculando reconstrucciones...")
    X_reconstructed = np.asarray(model.predict(X_scaled, verbose=0))
    reconstruction_error = np.mean((X_scaled - X_reconstructed) ** 2, axis=1)

    # Anomaly threshold: fixed value from the options, else batch percentile
    apply_opts = param.get('options', {}).get('params', {})
    if 'anomaly_threshold' in apply_opts:
        anomaly_threshold = float(apply_opts['anomaly_threshold'])
        threshold_label = "fijo"
    else:
        anomaly_threshold = np.percentile(reconstruction_error, 95)
        threshold_label = "percentil 95"

    is_anomaly = reconstruction_error > anomaly_threshold
    # Score relative to the threshold; the epsilon avoids division by zero
    anomaly_score = reconstruction_error / (anomaly_threshold + 1e-10)

    print(f"üìä Estad√≠sticas de reconstrucci√≥n:")
    print(f"   - Error medio: {np.mean(reconstruction_error):.6f}")
    print(f"   - Error mediano: {np.median(reconstruction_error):.6f}")
    print(f"   - Threshold ({threshold_label}): {anomaly_threshold:.6f}")
    print(f"   - Anomal√≠as detectadas: {np.sum(is_anomaly)} / {len(is_anomaly)} ({100*np.mean(is_anomaly):.2f}%)")

    # Assemble all result columns first and build the frame in one step
    result_data = {
        'reconstruction_error': reconstruction_error,
        'anomaly_score': anomaly_score,
        'is_anomaly': is_anomaly.astype(int),
    }
    for position, col in enumerate(X.columns):
        result_data[f'reconstruction_{col}'] = X_reconstructed[:, position]
        result_data[f'original_{col}'] = X.iloc[:, position].values
    results = pd.DataFrame(result_data, index=X.index)

    print(f"‚úÖ Inferencia completada")
    print(f"   - Shape de resultados: {results.shape}")

    # Best-effort telemetry; values are cast to native Python types so they
    # are JSON serializable.
    _log_inference_telemetry(
        num_predictions=int(len(df)),
        num_anomalies=int(is_anomaly.sum()),
        avg_reconstruction_error=float(reconstruction_error.mean()),
        anomaly_threshold=float(anomaly_threshold),
    )

    return results


def _log_inference_telemetry(num_predictions, num_anomalies,
                             avg_reconstruction_error, anomaly_threshold):
    """Send inference statistics to Splunk; print any failure, never raise."""
    import traceback

    try:
        telemetry_data = {
            "model_name": MODEL_NAME,
            "num_predictions": num_predictions,
            "num_anomalies": num_anomalies,
            "avg_reconstruction_error": avg_reconstruction_error,
            "anomaly_threshold": anomaly_threshold,
            "app_name": APP_NAME,
            "model_version": VERSION,
            "project": USE_CASE,
        }

        # Prefer log_prediction when the telemetry helper provides it;
        # otherwise fall back to the module-level log_metrics.
        try:
            from telemetry_helper import log_prediction
        except ImportError:
            log_prediction = None

        if log_prediction is not None:
            # OWNER is optional and is passed only if defined at module level
            log_prediction(owner=globals().get('OWNER'), **telemetry_data)
            print("‚úÖ Telemetr√≠a de inferencia enviada a Splunk (usando log_prediction)")
        else:
            log_metrics(**telemetry_data)
            print("‚úÖ Telemetr√≠a de inferencia enviada a Splunk (usando log_metrics)")
    except Exception as e:
        # Telemetry problems must not break inference
        print(f"‚ö†Ô∏è  Error enviando telemetr√≠a de inferencia a Splunk: {e}")
        print(f"   Traceback completo: {traceback.format_exc()}")

In [None]:
# THIS CELL IS NOT EXPORTED - local smoke test for apply()
# Fresh dummy rows to score: five standard-normal feature columns, 100 rows
apply_feature_names = [f'feature_{i}' for i in range(5)]
test_df_apply = pd.DataFrame({name: np.random.randn(100) for name in apply_feature_names})

# Hand the scaler returned by the earlier fit() test to apply() through param
test_param_apply = {
    'feature_variables': list(apply_feature_names),
    'scaler': fit_results.get('scaler'),
}

# Score the rows
results = apply(test_model_fit, test_df_apply, test_param_apply)

print("\nüìä Primeras 10 filas de resultados:")
print(results.head(10))

anomaly_flags = results['is_anomaly']
print("\nüìà Estad√≠sticas de anomal√≠as:")
print(f"   - Total muestras: {len(results)}")
print(f"   - Anomal√≠as detectadas: {anomaly_flags.sum()}")
print(f"   - Porcentaje: {100 * anomaly_flags.mean():.2f}%")

## Stage 7 - provide a summary of the model

In [None]:
# return model summary

def summary(model=None):
    """
    Return JSON-serializable metadata about the model.

    Without a model only the naming constants and library versions are
    returned. With a model the result also contains the textual Keras summary,
    overall architecture figures and one entry per layer. All counts are cast
    to native Python ints so the dictionary can be serialized to JSON.

    Args:
        model: Keras model (optional).

    Returns:
        dict with keys model_name, app_name, model_type, use_case, version,
        version_info and, when a model is given, model_summary,
        model_architecture and layers.
    """
    returns = {
        "model_name": MODEL_NAME,
        "app_name": APP_NAME,
        "model_type": MODEL_TYPE,
        "use_case": USE_CASE,
        "version": VERSION,
        "version_info": {
            "tensorflow": tf.__version__,
            "keras": keras.__version__,
            "numpy": np.__version__,
            "pandas": pd.__version__
        }
    }

    if model is None:
        return returns

    # Capture the printed Keras summary as a single string
    summary_lines = []
    model.summary(print_fn=lambda line: summary_lines.append(line + '\n'))
    returns["model_summary"] = ''.join(summary_lines)

    # int() converts NumPy scalars as well as plain ints
    total_params = int(model.count_params())
    trainable_params = sum(int(tf.size(w).numpy()) for w in model.trainable_weights)

    returns["model_architecture"] = {
        "input_shape": str(model.input_shape) if hasattr(model, 'input_shape') else "N/A",
        "output_shape": str(model.output_shape) if hasattr(model, 'output_shape') else "N/A",
        "total_params": total_params,
        "trainable_params": trainable_params
    }

    returns["layers"] = []
    for i, layer in enumerate(model.layers):
        try:
            params = int(layer.count_params())
        except Exception:
            # count_params() can fail (e.g. layer not built yet); report 0
            params = 0

        returns["layers"].append({
            "index": i,
            "name": layer.name,
            "type": type(layer).__name__,
            "output_shape": _layer_output_shape(model, layer, i),
            "params": params
        })

    return returns


def _layer_output_shape(model, layer, index):
    """Return the layer's output shape as a string, or "N/A" if it cannot be determined."""
    # Strategy 1: shape of the layer's output tensor
    try:
        output = getattr(layer, 'output', None)
        if output is not None:
            return str(output.shape)
    except Exception:
        # Reading `.output` or its shape failed; try the other strategies
        pass

    try:
        # Strategy 2: an 'output_shape' entry in the layer config, if present
        config = layer.get_config() if hasattr(layer, 'get_config') else {}
        if 'output_shape' in config:
            return str(config['output_shape'])

        # Strategy 3: derive it from an input shape. The first layer uses the
        # model's input shape, later layers use their own.
        if callable(getattr(layer, 'compute_output_shape', None)):
            if index == 0 and getattr(model, 'input_shape', None):
                return str(layer.compute_output_shape(model.input_shape))
            if getattr(layer, 'input_shape', None):
                return str(layer.compute_output_shape(layer.input_shape))
    except Exception:
        # Best-effort only: any failure is reported as "N/A" below
        pass

    return "N/A"

In [None]:
# THIS CELL IS NOT EXPORTED - local smoke test for summary()
model_summary = summary(test_model_fit)
# default=str renders any value json cannot encode natively as its string form
print("üìä Resumen del modelo:", json.dumps(model_summary, indent=2, default=str), sep="\n")

### Stage 8 - save model

In [None]:
# mltkc_save
# Funci√≥n REQUERIDA: DSDL llama a save(model, name) despu√©s de fit()

def save(model, name):
    """
    Persist a trained Keras model to disk inside MODEL_DIRECTORY.

    Per this notebook's notes, DSDL calls this hook automatically after fit().

    Args:
        model: Trained Keras model (the object returned by fit()).
        name: Model name handed over by DSDL (from "into app:model_name");
            used as the file stem, with ".keras" appended.

    Returns:
        model: The same model object that was passed in (DSDL expects it back).
    """
    # Function-scope import so the exported module works even when `os`
    # is not imported at module level.
    import os

    # Create the destination directory on first use.
    os.makedirs(MODEL_DIRECTORY, exist_ok=True)

    # Build the target path and write the model in the native Keras format.
    filename = name + ".keras"
    filepath = MODEL_DIRECTORY + filename
    model.save(filepath)

    print(f"‚úÖ Modelo guardado en: {filepath}")
    size_mb = os.path.getsize(filepath) / (1024*1024)
    print(f"üìä Tama√±o del archivo: {size_mb:.2f} MB")

    # NOTE: a scaler or other companion objects are NOT persisted here.
    # If they are needed at apply time, save them as well, for example:
    #   import joblib
    #   if hasattr(model, 'scaler'):
    #       joblib.dump(model.scaler, MODEL_DIRECTORY + name + "_scaler.pkl")

    # Hand the model back to the caller.
    return model

In [None]:
### 8.2 Test the `save()` function locally

# THIS CELL IS NOT EXPORTED - Test save locally
# `os` is imported at cell level on purpose: save() only imports it inside its
# own function scope, so the os.path checks below would otherwise raise
# NameError on a fresh kernel (and be hidden by the broad except branch).
import os
import traceback

print("üíæ Probando funci√≥n save()...")

# The test needs the model produced by the fit() test cell.
if 'test_model_fit' not in globals():
    print("‚ö†Ô∏è  test_model_fit no est√° definido.")
    print("   Necesitas ejecutar primero el test de fit() (Paso 6.2)")
    print("   Para crear un modelo de prueba r√°pido, ejecuta:")
    print("""
    # Crear datos dummy
    test_df_fit = pd.DataFrame({
        'feature_0': np.random.randn(100),
        'feature_1': np.random.randn(100),
        'feature_2': np.random.randn(100),
        'feature_3': np.random.randn(100),
        'feature_4': np.random.randn(100)
    })
    test_param_fit = {
        'feature_variables': ['feature_0', 'feature_1', 'feature_2', 'feature_3', 'feature_4'],
        'options': {'params': {'epochs': '5', 'batch_size': '32'}}
    }
    test_model_fit = init(test_df_fit, test_param_fit)
    fit_results = fit(test_model_fit, test_df_fit, test_param_fit)
    """)
else:
    try:
        # Resolve the directory used for the verification below, falling back
        # to the default path when MODEL_DIRECTORY is not defined in this kernel.
        try:
            model_dir = MODEL_DIRECTORY
        except NameError:
            model_dir = "/srv/app/model/data/"

        # Save the test model with the signature used by DSDL: save(model, name).
        saved_model = save(test_model_fit, name="test_autoencoder")
        print(f"‚úÖ Modelo guardado exitosamente")

        # Confirm that the file was actually written and report its size.
        filepath = model_dir + "test_autoencoder.keras"
        if os.path.exists(filepath):
            file_size = os.path.getsize(filepath) / (1024 * 1024)
            print(f"üìä Tama√±o del archivo: {file_size:.2f} MB")
            print(f"‚úÖ Archivo creado correctamente: {filepath}")
        else:
            print(f"‚ö†Ô∏è  Archivo no encontrado: {filepath}")

    except Exception as e:
        # Test cell: report the failure with its traceback instead of
        # aborting the rest of the notebook run.
        print(f"‚ùå Error: {e}")
        traceback.print_exc()

### Stage 9 - load model

In [None]:
# mltkc_load
# Función opcional para cargar modelo guardado durante desarrollo
# DSDL NO llama a esta función automáticamente

def load(name):
    """
    Load a Keras model previously written to the model directory.

    Intended for local development and testing; per this notebook's notes,
    DSDL does not call this function automatically.

    Args:
        name: File name of the saved model, without the ".keras" extension.

    Returns:
        Model: The Keras model read from disk.

    Raises:
        FileNotFoundError: If no "<name>.keras" file exists in the model directory.
    """
    # Function-scope import so the exported module works even when `os`
    # is not imported at module level.
    import os

    # Use the module-level MODEL_DIRECTORY when it is defined; otherwise fall
    # back to the default location.
    model_dir = globals().get("MODEL_DIRECTORY", "/srv/app/model/data/")
    filepath = model_dir + name + ".keras"

    if os.path.exists(filepath):
        print(f"üì• Cargando modelo desde: {filepath}")
        model = keras.models.load_model(filepath)

        print("‚úÖ Modelo cargado exitosamente")
        print(f"üìä Arquitectura: {model.input_shape} ‚Üí {model.output_shape}")
        return model

    # Nothing on disk under that name: fail before touching Keras.
    raise FileNotFoundError(f"‚ùå Archivo no encontrado: {filepath}")

In [None]:
### 9.3 Test the `load()` function locally (optional)

# THIS CELL IS NOT EXPORTED - Test load locally (optional)
import os
import traceback

print("üì• Probando funci√≥n load()...")

# Resolve the model directory, falling back to the default path when
# MODEL_DIRECTORY is not defined in this kernel.
try:
    model_dir = MODEL_DIRECTORY
except NameError:
    model_dir = "/srv/app/model/data/"

# Check that the saved test model exists before trying to load it.
test_filepath = model_dir + "test_autoencoder.keras"
if not os.path.exists(test_filepath):
    print(f"‚ö†Ô∏è  Archivo no encontrado: {test_filepath}")
    print("   Necesitas ejecutar primero el test de save() (Paso 8.2)")
    print("   O aseg√∫rate de que test_model_fit existe y ejecuta:")
    print("   saved_model = save(test_model_fit, name='test_autoencoder')")
else:
    try:
        loaded_model = load("test_autoencoder")
        print("‚úÖ Modelo cargado exitosamente")

        # Compare against the in-memory model only when it is available.
        if 'test_model_fit' in globals():
            print("\nüîç Verificando que el modelo cargado funciona...")
            # Size the probe from the model itself instead of hard-coding 5
            # features, so the check also works for other feature counts.
            # Assumes a single-input model whose last input dim is the feature count.
            n_features = test_model_fit.input_shape[-1]
            test_input = np.random.randn(1, n_features)
            output_original = test_model_fit.predict(test_input, verbose=0)
            output_loaded = loaded_model.predict(test_input, verbose=0)

            if np.allclose(output_original, output_loaded):
                print("‚úÖ Los modelos producen resultados id√©nticos")
            else:
                print("‚ö†Ô∏è  Los modelos producen resultados diferentes")
        else:
            print("‚ö†Ô∏è  test_model_fit no est√° definido, no se puede verificar equivalencia")
            print("   Pero el modelo se carg√≥ correctamente ‚úÖ")

    except Exception as e:
        # Test cell: report the failure with its traceback instead of
        # aborting the rest of the notebook run.
        print(f"‚ùå Error: {e}")
        traceback.print_exc()