<a href="https://colab.research.google.com/github/luisangel2003ss/modelo/blob/main/cpyWaterSpillOil.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Notebook: Predicción multitarea con Keras

# Librerías

In [None]:
import pandas as pd
import numpy as np
import json
import os
import optuna
import matplotlib.pyplot as plt
import tensorflow as tf
import warnings
import base64
import requests
import re
import joblib
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import r2_score, mean_squared_error, accuracy_score
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam, RMSprop, SGD
from tensorflow.keras.regularizers import l2
warnings.filterwarnings('ignore')

In [None]:
import os
# Sanity check: confirm the dataset file is present and not empty before loading it.
print(os.path.getsize("spill_data_cleaned.csv"))  # Should print a size > 0


In [None]:
import pandas as pd

# Load the semicolon-separated dataset; malformed rows are skipped by the
# Python parsing engine instead of aborting the read.
df = pd.read_csv(
    "spill_data_cleaned.csv",
    sep=';',
    encoding='latin-1',
    on_bad_lines='skip',
    engine='python',
)

# Lower-case every text (object dtype) column that is actually present.
# NOTE(review): astype(str) also turns missing values into the literal string
# 'nan' -- confirm that downstream cells expect this.
text_columns = df.select_dtypes(include='object').columns
for col in text_columns:
    lowered = df[col].astype(str).str.lower()
    df[col] = lowered

print(df.head())


# 1. FUNCIÓN PARA ENTRENAR CON OPTUNA

In [None]:
def run_optuna_optimization(X_train, y_reg_train, y_clf_train, n_trials=150):
    """
    Tune the multi-task network's hyperparameters with Optuna and save the winner.

    Meant to be run once: the best parameters are written to a timestamped
    ``best_params_<timestamp>.json`` file in the current directory so later
    runs can rebuild the model without searching again.

    Each trial builds a dense network with a regression head and a softmax
    classification head, trains it for at most 35 epochs with early stopping,
    and reports the total validation loss (0.5 * MAE + 0.5 * categorical
    cross-entropy) on an internal 20% hold-out of the given training data.

    Args:
        X_train: 2-D feature matrix; its column count sets the input width.
        y_reg_train: Regression targets aligned with the rows of ``X_train``.
        y_clf_train: One-hot class matrix; its column count sets the number
            of classes.
        n_trials: Number of Optuna trials to run.

    Returns:
        tuple: ``(filename, best_params)`` -- the JSON file that was written
        and the dictionary of best hyperparameters.
    """

    print("EJECUTANDO OPTIMIZACI√ìN CON OPTUNA...")
    print("Esto puede tomar tiempo pero solo se hace UNA VEZ")

    input_dim = X_train.shape[1]
    num_classes = y_clf_train.shape[1]

    # Internal validation split, fixed seed so every trial sees the same data.
    X_train_opt, X_val_opt, y_reg_train_opt, y_reg_val_opt, y_clf_train_opt, y_clf_val_opt = train_test_split(
        X_train, y_reg_train, y_clf_train, test_size=0.2, random_state=42
    )

    train_targets = {'regression': y_reg_train_opt, 'classification': y_clf_train_opt}
    val_targets = {'regression': y_reg_val_opt, 'classification': y_clf_val_opt}

    def objective(trial):
        # Release the graph/layer state left behind by the previous trial;
        # otherwise memory grows with every model built during the search.
        tf.keras.backend.clear_session()

        # Hyperparameters to optimize
        n_layers = trial.suggest_int('n_layers', 2, 4)
        dropout_rate = trial.suggest_float('dropout_rate', 0.1, 0.5)
        l2_reg = trial.suggest_float('l2_reg', 1e-6, 1e-2, log=True)
        learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-2, log=True)
        optimizer_name = trial.suggest_categorical('optimizer', ['adam', 'rmsprop'])
        batch_size = trial.suggest_categorical('batch_size', [32, 64, 128])
        use_batch_norm = trial.suggest_categorical('use_batch_norm', [True, False])

        # Build the model: a funnel of dense layers, each at most ~70% as wide
        # as the previous one (never narrower than 32 units).
        input_layer = Input(shape=(input_dim,))
        x = input_layer

        for i in range(n_layers):
            if i == 0:
                neurons = trial.suggest_int(f'neurons_layer_{i}', 64, 256)
            else:
                max_neurons = max(32, int(neurons * 0.7))
                neurons = trial.suggest_int(f'neurons_layer_{i}', 32, max_neurons)

            x = Dense(neurons, activation='relu', kernel_regularizer=l2(l2_reg))(x)

            if use_batch_norm:
                x = BatchNormalization()(x)

            x = Dropout(dropout_rate)(x)

        # Output heads
        regression_output = Dense(1, name='regression')(x)
        classification_output = Dense(num_classes, activation='softmax', name='classification')(x)

        model = Model(inputs=input_layer, outputs=[regression_output, classification_output])

        # Configure the optimizer
        if optimizer_name == 'adam':
            optimizer = Adam(learning_rate=learning_rate)
        else:
            optimizer = RMSprop(learning_rate=learning_rate)

        model.compile(
            optimizer=optimizer,
            loss={'regression': 'mae', 'classification': 'categorical_crossentropy'},
            loss_weights={'regression': 0.5, 'classification': 0.5}
        )

        # Train and evaluate. A failing configuration must not abort the whole
        # study, so it is scored as the worst possible value -- but the reason
        # is reported instead of being silently discarded.
        try:
            model.fit(
                X_train_opt,
                train_targets,
                validation_data=(X_val_opt, val_targets),
                epochs=35,
                batch_size=batch_size,
                callbacks=[EarlyStopping(patience=5, restore_best_weights=True)],
                verbose=0
            )

            val_results = model.evaluate(X_val_opt, val_targets, verbose=0)

            return val_results[0]  # Total loss

        except Exception as exc:
            print(f"Trial {trial.number} descartado por error: {exc}")
            return float('inf')

    # Run the search
    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=n_trials)

    # Persist the best parameters
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    best_params_data = {
        'timestamp': timestamp,
        'best_value': study.best_value,
        'best_params': study.best_params,
        'n_trials': len(study.trials),
        'optimization_complete': True
    }

    filename = f'best_params_{timestamp}.json'
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(best_params_data, f, indent=2)

    print("Optimizaci√≥n completada!")
    print(f"Mejores par√°metros guardados en: {filename}")
    print(f"Mejor p√©rdida: {study.best_value:.4f}")

    return filename, study.best_params

# 2. FUNCIÓN PARA CARGAR PARÁMETROS DESDE JSON

In [None]:
def load_best_params_from_json(json_filename):
    """
    Load previously saved hyperparameters from a JSON file.

    This is the production path: it avoids re-running the optimization.
    Both file layouts produced in this notebook are accepted: Optuna results
    (``best_params`` plus ``timestamp`` / ``best_value``) and metric summaries
    (parameters stored under ``model_params``). The ``timestamp`` and
    ``best_value`` entries are only echoed when present.

    Args:
        json_filename: Path of the JSON file to read.

    Returns:
        The stored parameter dictionary.

    Raises:
        FileNotFoundError: If ``json_filename`` does not exist.
        TypeError: If the JSON document is not an object.
        KeyError: If the file holds neither ``best_params`` nor ``model_params``.
    """

    if not os.path.exists(json_filename):
        raise FileNotFoundError(f"Archivo no encontrado: {json_filename}")

    # The files are written as UTF-8; do not depend on the platform's locale.
    with open(json_filename, 'r', encoding='utf-8') as f:
        data = json.load(f)

    if not isinstance(data, dict):
        raise TypeError(f"Contenido JSON inesperado en: {json_filename}")

    print(f"Cargando par√°metros desde: {json_filename}")

    # Metadata is informational only; missing entries must not block loading.
    timestamp = data.get('timestamp')
    if timestamp is not None:
        print(f"Optimizaci√≥n realizada: {timestamp}")

    best_value = data.get('best_value')
    if isinstance(best_value, (int, float)):
        print(f"Mejor p√©rdida obtenida: {best_value:.4f}")

    for key in ('best_params', 'model_params'):
        if key in data:
            return data[key]

    raise KeyError(f"'best_params' no encontrado en: {json_filename}")


# 3. FUNCIÓN PARA CONSTRUIR MODELO CON PARÁMETROS DADOS

In [None]:
def build_model_from_params(best_params, input_dim, num_classes):
    """
    Build and compile the two-headed (regression + classification) network.

    The architecture follows the hyperparameters produced by the Optuna
    search in this notebook: per-layer widths (``neurons_layer_<i>``), the
    ``use_batch_norm`` switch and the ``optimizer`` choice are honored when
    present. Any missing entry falls back to a default, so a partial or
    hand-written dictionary still yields a valid model.

    Args:
        best_params: Mapping of hyperparameters. Recognized keys:
            ``n_layers`` (default 2), ``neurons_layer_<i>`` (per-layer width),
            ``units`` (width used when no per-layer value exists, default 64),
            ``dropout_rate`` (0.3), ``l2_reg`` (1e-4), ``learning_rate``
            (1e-3), ``use_batch_norm`` (True), ``optimizer`` ('adam' or
            'rmsprop', default 'adam').
        input_dim: Number of input features.
        num_classes: Number of softmax outputs of the classification head.

    Returns:
        A compiled Keras ``Model`` with outputs named ``regression`` and
        ``classification``.
    """
    # Extract parameters, using defaults for anything that is missing
    n_layers = best_params.get('n_layers', 2)
    default_units = best_params.get('units', 64)
    dropout_rate = best_params.get('dropout_rate', 0.3)
    l2_reg = best_params.get('l2_reg', 1e-4)
    learning_rate = best_params.get('learning_rate', 1e-3)
    use_batch_norm = best_params.get('use_batch_norm', True)
    optimizer_name = str(best_params.get('optimizer', 'adam')).lower()

    # Model construction
    inputs = Input(shape=(input_dim,), name="input_layer_1")
    x = inputs

    for layer_idx in range(n_layers):
        # Prefer the width tuned for this specific layer; otherwise use the
        # shared default so dictionaries without per-layer widths still work.
        units = best_params.get(f'neurons_layer_{layer_idx}', default_units)
        x = Dense(units, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(l2_reg))(x)
        if use_batch_norm:
            x = BatchNormalization()(x)
        x = Dropout(dropout_rate)(x)

    # Output heads
    regression_output = Dense(1, name='regression')(x)
    classification_output = Dense(num_classes, activation='softmax', name='classification')(x)

    model = Model(inputs=inputs, outputs=[regression_output, classification_output])

    if optimizer_name == 'rmsprop':
        optimizer = tf.keras.optimizers.RMSprop(learning_rate=learning_rate)
    else:
        optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

    # NOTE(review): the Optuna objective scores trials with an MAE regression
    # loss and 0.5/0.5 loss weights, while the final model trains on MSE with
    # default weights -- kept as-is; confirm this difference is intended.
    model.compile(
        optimizer=optimizer,
        loss={
            'regression': 'mse',
            'classification': 'categorical_crossentropy'
        },
        metrics={
            'regression': ['mae'],
            'classification': ['accuracy']
        }
    )

    return model


# 4. WORKFLOW PRINCIPAL: ¿OPTIMIZAR O USAR PARÁMETROS EXISTENTES?

In [None]:
# ===== CONFIGURATION =====
# NOTE(review): neither constant is referenced in the visible part of this
# notebook; presumably they are used by an upload step further down -- confirm.
GITHUB_REPO = "luisangel2003ss/modelo"  # Your repository
GITHUB_BRANCH = "main"  # Branch to upload to

# ===== FUNCIONES CORREGIDAS =====
def select_best_params_file(json_files):
    """
    Pick one parameter file out of ``json_files`` without opening them.

    Selection strategy:
    1. If any file name carries a metric tag (``r2_<number>`` or
       ``acc_<number>``), the file with the highest tagged value wins.
    2. Otherwise the most recently modified file wins.

    Files that cannot be inspected or that are empty (size 0) are never
    selected.

    Args:
        json_files: Iterable of file paths.

    Returns:
        The selected path exactly as it was passed in, or ``None`` when the
        list is empty or contains no usable file.
    """
    if not json_files:
        print("‚ö†Ô∏è No se encontraron archivos de par√°metros")
        return None

    print(f"üîç Se encontraron {len(json_files)} archivos de par√°metros:")

    # Capture the number that directly follows the metric tag. Taking the
    # first digit run of the name would pick up the timestamp instead.
    metric_pattern = re.compile(r'(?:r2|acc)_(-?\d+(?:\.\d+)?)', re.IGNORECASE)

    files_with_info = []
    for f in json_files:
        try:
            modification_time = os.path.getmtime(f)
            file_size = os.path.getsize(f)
        except (OSError, TypeError, ValueError) as e:
            print(f"  ‚ö†Ô∏è Error leyendo {f}: {e}")
            # Size 0 keeps the unreadable entry out of the selection below.
            files_with_info.append({
                'filename': f,
                'mod_time': 0,
                'metrics': None,
                'size': 0
            })
            continue

        # Only the file name is inspected so that digits or tags in parent
        # directory names cannot be mistaken for a metric.
        match = metric_pattern.search(os.path.basename(f))
        metrics_in_name = float(match.group(1)) if match else None

        files_with_info.append({
            'filename': f,
            'mod_time': modification_time,
            'metrics': metrics_in_name,
            'size': file_size
        })

        mod_time_str = pd.to_datetime(modification_time, unit='s').strftime('%Y-%m-%d %H:%M')
        # "is not None" so that a legitimate metric of 0.0 is still shown.
        metric_str = f", m√©trica: {metrics_in_name}" if metrics_in_name is not None else ""
        print(f"  üìÑ {f} (modificado: {mod_time_str}, tama√±o: {file_size} bytes{metric_str})")

    valid_files = [info for info in files_with_info if info['size'] > 0]

    if not valid_files:
        print("‚ùå No se encontraron archivos v√°lidos")
        return None

    files_with_metrics = [info for info in valid_files if info['metrics'] is not None]
    if files_with_metrics:
        best_file = max(files_with_metrics, key=lambda x: x['metrics'])
        print(f"‚úÖ Seleccionado por mejores m√©tricas: {best_file['filename']} (m√©trica: {best_file['metrics']})")
    else:
        best_file = max(valid_files, key=lambda x: x['mod_time'])
        mod_time_str = pd.to_datetime(best_file['mod_time'], unit='s').strftime('%Y-%m-%d %H:%M')
        print(f"‚úÖ Seleccionado por fecha m√°s reciente: {best_file['filename']} ({mod_time_str})")

    return best_file['filename']

def load_and_compare_all_params(json_files):
    """
    Load every parameter file and return the best one with its parameters.

    Files are ranked by the quality of the evidence they carry, and only
    compared by score within the same kind of evidence:

    1. Evaluation metrics (``metrics.r2`` / ``metrics.accuracy``), scored as
       ``0.6 * r2 + 0.4 * accuracy``.
    2. An Optuna ``best_value`` (a loss, so lower is better).
    3. The file's modification time, as a last resort.

    The three scores live on unrelated scales (a timestamp is ~1e9, a metric
    score is ~1), so mixing them in a single comparison would let any file
    without metrics beat every file with metrics.

    Parameters are read from ``best_params``, then ``model_params``, then the
    whole document. Files whose parameters are empty are skipped.

    Args:
        json_files: Iterable of JSON file paths.

    Returns:
        tuple: ``(best_file, best_params)``, or ``(None, None)`` when no file
        is usable.
    """
    if not json_files:
        print("‚ö†Ô∏è No se encontraron archivos de par√°metros")
        return None, None

    print(f"\nüîÑ Analizando {len(json_files)} archivos de par√°metros...")

    def is_number(value):
        # bool is a subclass of int, but True/False is not a measurement.
        return isinstance(value, (int, float)) and not isinstance(value, bool)

    best_params = None
    best_file = None
    best_score = -float('inf')
    best_rank = None  # (evidence tier, score) of the current winner

    for filename in json_files:
        try:
            if not os.path.exists(filename) or os.path.getsize(filename) == 0:
                print(f"  ‚ö†Ô∏è Archivo vac√≠o o inexistente: {filename}")
                continue

            with open(filename, 'r', encoding='utf-8') as f:
                data = json.load(f)

            if not data:
                print(f"  ‚ö†Ô∏è Archivo JSON vac√≠o: {filename}")
                continue

            if not isinstance(data, dict):
                print(f"  [omitido] {filename}: estructura JSON no soportada")
                continue

            metrics = data.get('metrics')
            if isinstance(metrics, dict):
                r2 = metrics.get('r2', 0)
                accuracy = metrics.get('accuracy', 0)

                if is_number(r2) and is_number(accuracy):
                    tier = 2
                    score = r2 * 0.6 + accuracy * 0.4
                    score_source = f"m√©tricas (R2:{r2:.3f}, Acc:{accuracy:.3f})"
                else:
                    tier = 0
                    score = os.path.getmtime(filename)
                    score_source = "timestamp (m√©tricas inv√°lidas)"

            elif is_number(data.get('best_value')):
                tier = 1
                score = -data['best_value']  # Optuna minimizes, so negate
                score_source = f"Optuna best_value ({data['best_value']})"
            else:
                tier = 0
                score = os.path.getmtime(filename)
                score_source = "timestamp (fallback)"

            print(f"  üìä {filename}: score={score:.4f} ({score_source})")

            if 'best_params' in data:
                candidate_params = data['best_params']
            elif 'model_params' in data:
                candidate_params = data['model_params']
            else:
                candidate_params = data

            # A file without usable parameters must not displace a valid one.
            if not isinstance(candidate_params, dict) or not candidate_params:
                print(f"  [omitido] {filename}: sin parametros utilizables")
                continue

            rank = (tier, score)
            if best_rank is None or rank > best_rank:
                best_rank = rank
                best_score = score
                best_params = candidate_params
                best_file = filename

        except json.JSONDecodeError as e:
            print(f"  ‚ùå Error JSON en {filename}: {e}")
        except (OSError, ValueError) as e:
            print(f"  ‚ùå Error cargando {filename}: {e}")

    if best_file and best_params:
        print(f"‚úÖ Mejor archivo seleccionado: {best_file} (score: {best_score:.4f})")
        print(f"üìã Par√°metros cargados: {len(best_params)} elementos")
    else:
        print("‚ùå No se pudo seleccionar ning√∫n archivo v√°lido")

    return best_file, best_params

def save_metrics_with_timestamp(metrics_data, prefix="metrics", include_score=True):
    """
    Write ``metrics_data`` to a timestamped JSON file in the current directory.

    The file is named ``<prefix>_<YYYYmmdd_HHMMSS>.json``. When
    ``include_score`` is true and ``metrics_data['metrics']`` contains both
    ``r2`` and ``accuracy``, the combined score ``0.6 * r2 + 0.4 * accuracy``
    is embedded in the name as ``_score_<value>``.

    Args:
        metrics_data: JSON-serializable mapping to store.
        prefix: Leading part of the file name.
        include_score: Whether to embed the combined score in the file name.

    Returns:
        The name of the file written, or ``None`` if anything failed (the
        error is printed, not raised).
    """
    try:
        stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        name_parts = [f"{prefix}_{stamp}"]

        wants_score = include_score and 'metrics' in metrics_data
        if wants_score:
            recorded = metrics_data['metrics']
            if all(key in recorded for key in ('r2', 'accuracy')):
                combined = recorded['r2'] * 0.6 + recorded['accuracy'] * 0.4
                name_parts.append(f"_score_{combined:.4f}")

        name_parts.append(".json")
        filename = "".join(name_parts)

        with open(filename, 'w', encoding='utf-8') as out_file:
            json.dump(metrics_data, out_file, indent=4, ensure_ascii=False)

    except Exception as e:
        # Best effort by design: report the problem and signal it with None.
        print(f"‚ùå Error guardando m√©tricas: {e}")
        return None

    else:
        print(f"üíæ M√©tricas guardadas en: {filename}")
        return filename

def main_workflow():
    """
    Dry-run of the automated workflow with simulated metrics.

    Steps:
    1. Load ``spill_data_cleaned.csv`` and prepare features and targets: the
       regression target is ``log1p(release_prod_water_edit)`` and the
       classification target is ``probable_cause_edit``.
    2. Look for ``best_params_*.json`` files in the current directory and
       load the best parameters found.
    3. Record hard-coded placeholder metrics (no model is built or trained
       here) and save them to a timestamped ``best_params_*.json`` file.

    Returns:
        tuple: ``(None, metrics_summary)``; ``(None, None)`` when the CSV
        file is missing. The first element is always ``None`` because this
        version does not produce a model.
    """
    print("="*60)
    print("üöÄ INICIANDO WORKFLOW AUTOMATIZADO DE OPTIMIZACI√ìN")
    print("="*60)

    print("\nüìä Preparando datos...")

    if not os.path.exists("spill_data_cleaned.csv"):
        print("‚ùå No se encontr√≥ el archivo 'spill_data_cleaned.csv'")
        print("   Aseg√∫rate de que el archivo est√© en el directorio actual")
        return None, None

    df = pd.read_csv("spill_data_cleaned.csv", sep=';', encoding='latin-1')
    # Rows without a regression target cannot be used.
    df = df.dropna(subset=['release_prod_water_edit'])
    df['log_release_prod_water_edit'] = np.log1p(df['release_prod_water_edit'])

    cat_cols = ['operator_edit', 'county_edit', 'type_operation', 'source', 'probable_cause_edit']
    for col in cat_cols:
        df[col] = df[col].fillna('unknown')

    # Only month and year of the spill date are kept as features.
    df['date'] = pd.to_datetime(df['date_of_spill_edit'])
    df['month'] = df['date'].dt.month
    df['year'] = df['date'].dt.year
    df.drop(columns=['date'], inplace=True)

    y_reg = df['log_release_prod_water_edit']
    y_clf = df['probable_cause_edit']
    X = df[['year','month','operator_edit','county_edit','type_operation','source']]

    num_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
    cat_features = X.select_dtypes(include=['object']).columns.tolist()

    preprocessor = ColumnTransformer(transformers=[
        ('num', StandardScaler(), num_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features)
    ])

    X_processed = preprocessor.fit_transform(X)

    le_causa = LabelEncoder()
    y_clf_enc = le_causa.fit_transform(y_clf)
    y_clf_oh = to_categorical(y_clf_enc)

    X_train, X_test, y_reg_train, y_reg_test, y_clf_train, y_clf_test = train_test_split(
        X_processed, y_reg, y_clf_oh, test_size=0.2, random_state=42
    )

    num_classes = y_clf_oh.shape[1]

    print(f"‚úÖ Datos preparados: {X_train.shape[0]} muestras de entrenamiento")

    # Defined up front so the summary below never has to inspect locals() to
    # find out whether a parameter file was loaded.
    best_params = None

    json_files = [f for f in os.listdir('.') if f.startswith('best_params_') and f.endswith('.json')]

    if json_files:
        print(f"\nüîç Archivos de par√°metros encontrados: {len(json_files)}")
        best_file, best_params = load_and_compare_all_params(json_files)

        if best_params:
            print(f"‚úÖ Usando par√°metros del archivo: {best_file}")
            print("‚ö†Ô∏è Necesitas implementar build_model_from_params()")
        else:
            print("‚ùå Error cargando par√°metros, necesitas ejecutar optimizaci√≥n...")
            print("‚ö†Ô∏è Necesitas implementar run_optuna_optimization()")
    else:
        print("\nüÜï No se encontraron archivos de par√°metros - ejecutando optimizaci√≥n inicial...")
        print("‚ö†Ô∏è Necesitas implementar run_optuna_optimization()")

    print("\nüéØ Simulando entrenamiento y evaluaci√≥n...")

    # Placeholder values: nothing is trained or evaluated in this version.
    r2 = 0.85
    mse = 0.25
    mae = 0.15
    accuracy = 0.78

    print("üìä M√©tricas simuladas:")
    print(f"   R¬≤ Score: {r2:.4f}")
    print(f"   MSE: {mse:.4f}")
    print(f"   MAE: {mae:.4f}")
    print(f"   Accuracy: {accuracy:.4f}")

    metrics_summary = {
        "timestamp": pd.Timestamp.now().isoformat(),
        "metrics": {
            "r2": r2,
            "mse": mse,
            "mae": mae,
            "accuracy": accuracy
        },
        # Always a dict, also when no file was found or loading failed.
        "model_params": best_params or {},
        "training_epochs": 93,
        "dataset_info": {
            "train_samples": X_train.shape[0],
            "test_samples": X_test.shape[0],
            "features": X_train.shape[1],
            "classes": num_classes
        }
    }

    local_filename = save_metrics_with_timestamp(metrics_summary, "best_params", True)

    if local_filename:
        print(f"üíæ M√©tricas guardadas localmente en: {local_filename}")

    print("\nüéâ Workflow completado!")
    print("\nüìã Pr√≥ximos pasos:")
    print("   1. Integra tus funciones build_model_from_params() y run_optuna_optimization()")
    print("   2. Reemplaza las m√©tricas simuladas con las reales de tu modelo")

    return None, metrics_summary

if __name__ == "__main__":
    print("üîß Ejecutando workflow de optimizaci√≥n...")

    # Inputs that must exist in the working directory before the workflow runs.
    required_files = ["spill_data_cleaned.csv"]
    missing_files = [path for path in required_files if not os.path.exists(path)]

    if not missing_files:
        model, metrics = main_workflow()
    else:
        print(f"‚ùå Archivos faltantes: {missing_files}")
        print("   Aseg√∫rate de tener todos los archivos necesarios en el directorio")

In [None]:
def select_best_params_file(json_files):
    """
    Pick one parameter file out of ``json_files`` without opening them.

    Selection strategy:
    1. If any file name carries a metric tag (``r2_<number>`` or
       ``acc_<number>``), the file with the highest tagged value wins
       (higher is assumed to be better).
    2. Otherwise the most recently modified file wins.

    Args:
        json_files: Iterable of file paths.

    Returns:
        The selected path exactly as it was passed in, or ``None`` when the
        list is empty.
    """
    if not json_files:
        return None

    print(f"üîç Se encontraron {len(json_files)} archivos de par√°metros:")

    # Capture the number that directly follows the metric tag. Taking the
    # first digit run of the name would pick up the timestamp instead.
    metric_pattern = re.compile(r'(?:r2|acc)_(-?\d+(?:\.\d+)?)', re.IGNORECASE)

    # Collect modification time and name-embedded metric for every file
    files_with_info = []
    for f in json_files:
        try:
            modification_time = os.path.getmtime(f)
        except (OSError, TypeError, ValueError) as e:
            print(f"  ‚ö†Ô∏è  Error leyendo {f}: {e}")
            # mod_time 0 makes an unreadable file the least preferred one.
            files_with_info.append({
                'filename': f,
                'mod_time': 0,
                'metrics': None
            })
            continue

        # Only the file name is inspected so that digits or tags in parent
        # directory names cannot be mistaken for a metric.
        match = metric_pattern.search(os.path.basename(f))
        metrics_in_name = float(match.group(1)) if match else None

        files_with_info.append({
            'filename': f,
            'mod_time': modification_time,
            'metrics': metrics_in_name
        })

        print(f"  üìÑ {f} (modificado: {pd.to_datetime(modification_time, unit='s').strftime('%Y-%m-%d %H:%M')})")

    files_with_metrics = [info for info in files_with_info if info['metrics'] is not None]

    if files_with_metrics:
        # Prefer the file with the best metric embedded in its name
        best_file = max(files_with_metrics, key=lambda x: x['metrics'])
        print(f"‚úÖ Seleccionado por mejores m√©tricas: {best_file['filename']} (m√©trica: {best_file['metrics']})")
    else:
        # Fall back to the most recent file
        best_file = max(files_with_info, key=lambda x: x['mod_time'])
        print(f"‚úÖ Seleccionado por fecha m√°s reciente: {best_file['filename']}")

    return best_file['filename']

def load_and_compare_all_params(json_files):
    """
    Load every parameter file and return the best one with its parameters.

    Files are ranked by the quality of the evidence they carry, and only
    compared by score within the same kind of evidence:

    1. Evaluation metrics (``metrics.r2`` / ``metrics.accuracy``), scored as
       ``0.6 * r2 + 0.4 * accuracy``.
    2. An Optuna ``best_value`` (a loss, so lower is better).
    3. The file's modification time, as a last resort.

    The three scores live on unrelated scales (a timestamp is ~1e9, a metric
    score is ~1), so mixing them in a single comparison would let any file
    without metrics beat every file with metrics.

    Parameters are read from ``best_params``, then ``model_params``, then the
    whole document. Files whose parameters are empty are skipped.

    Args:
        json_files: Iterable of JSON file paths.

    Returns:
        tuple: ``(best_file, best_params)``, or ``(None, None)`` when no file
        is usable.
    """
    if not json_files:
        return None, None

    print(f"\nüîÑ Analizando {len(json_files)} archivos de par√°metros...")

    def is_number(value):
        # bool is a subclass of int, but True/False is not a measurement.
        return isinstance(value, (int, float)) and not isinstance(value, bool)

    best_params = None
    best_file = None
    best_score = -float('inf')
    best_rank = None  # (evidence tier, score) of the current winner

    for filename in json_files:
        try:
            with open(filename, 'r', encoding='utf-8') as f:
                data = json.load(f)

            if not isinstance(data, dict):
                print(f"  [omitido] {filename}: estructura JSON no soportada")
                continue

            metrics = data.get('metrics')
            r2 = metrics.get('r2', 0) if isinstance(metrics, dict) else None
            accuracy = metrics.get('accuracy', 0) if isinstance(metrics, dict) else None

            if is_number(r2) and is_number(accuracy):
                # Combined score: 60% R2, 40% accuracy
                tier = 2
                score = r2 * 0.6 + accuracy * 0.4
            elif is_number(data.get('best_value')):
                # Optuna minimizes, so negate to keep "higher is better"
                tier = 1
                score = -data['best_value']
            else:
                # No usable quality information: fall back to recency
                tier = 0
                score = os.path.getmtime(filename)

            print(f"  üìä {filename}: score={score:.4f}")

            if 'best_params' in data:
                candidate_params = data['best_params']
            elif 'model_params' in data:
                candidate_params = data['model_params']
            else:
                candidate_params = data

            # A file without usable parameters must not displace a valid one.
            if not isinstance(candidate_params, dict) or not candidate_params:
                print(f"  [omitido] {filename}: sin parametros utilizables")
                continue

            rank = (tier, score)
            if best_rank is None or rank > best_rank:
                best_rank = rank
                best_score = score
                best_params = candidate_params
                best_file = filename

        except (OSError, ValueError) as e:
            # ValueError covers malformed JSON and undecodable bytes.
            print(f"  ‚ùå Error cargando {filename}: {e}")

    if best_file:
        print(f"‚úÖ Mejor archivo seleccionado: {best_file} (score: {best_score:.4f})")

    return best_file, best_params

def main_workflow():
    """
    Workflow principal automatizado - sin preguntas interactivas
    """

    print("="*60)
    print("üöÄ INICIANDO WORKFLOW AUTOMATIZADO DE OPTIMIZACI√ìN")
    print("="*60)

    # Cargar y preparar datos (tu c√≥digo existente)
    print("\nüìä Preparando datos...")
    df = pd.read_csv("spill_data_cleaned.csv", sep=';', encoding='latin-1')
    df = df.dropna(subset=['release_prod_water_edit'])
    df['log_release_prod_water_edit'] = np.log1p(df['release_prod_water_edit'])

    # Preparar caracter√≠sticas
    cat_cols = ['operator_edit', 'county_edit', 'type_operation', 'source', 'probable_cause_edit']
    for col in cat_cols:
        df[col] = df[col].fillna('unknown')

    df['date'] = pd.to_datetime(df['date_of_spill_edit'])
    df['month'] = df['date'].dt.month
    df['year'] = df['date'].dt.year
    df.drop(columns=['date'], inplace=True)

    y_reg = df['log_release_prod_water_edit']
    y_clf = df['probable_cause_edit']
    X = df[['year','month','operator_edit','county_edit','type_operation','source']]

    # Preprocesamiento
    num_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
    cat_features = X.select_dtypes(include=['object']).columns.tolist()

    preprocessor = ColumnTransformer(transformers=[
        ('num', StandardScaler(), num_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features)
    ])

    X_processed = preprocessor.fit_transform(X)

    le_causa = LabelEncoder()
    y_clf_enc = le_causa.fit_transform(y_clf)
    y_clf_oh = to_categorical(y_clf_enc)

    X_train, X_test, y_reg_train, y_reg_test, y_clf_train, y_clf_test = train_test_split(
        X_processed, y_reg, y_clf_oh, test_size=0.2, random_state=42
    )

    input_dim = X_train.shape[1]
    num_classes = y_clf_oh.shape[1]
    joblib.dump(le_causa, 'labelencoder_causa.pkl')
    print("LabelEncoder guardado")
    print(f"‚úÖ Datos preparados: {X_train.shape[0]} muestras de entrenamiento")

    # AUTOMATIZACI√ìN: Buscar y seleccionar autom√°ticamente el mejor archivo
    json_files = [f for f in os.listdir('.') if f.startswith('best_params_') and f.endswith('.json')]

    if json_files:
        print(f"\nüîç Archivos de par√°metros encontrados: {len(json_files)}")

        # ESTRATEGIA 1: Comparar todos los archivos y usar el mejor
        best_file, best_params = load_and_compare_all_params(json_files)

        if best_params:
            print(f"‚úÖ Usando par√°metros del archivo: {best_file}")
            model = build_model_from_params(best_params, input_dim, num_classes)
        else:
            print("‚ùå Error cargando par√°metros, ejecutando nueva optimizaci√≥n...")
            json_filename, best_params = run_optuna_optimization(X_train, y_reg_train, y_clf_train, n_trials=150)
            model = build_model_from_params(best_params, input_dim, num_classes)
    else:
        # PRIMERA VEZ: EJECUTAR OPTIMIZACI√ìN
        print("\nüÜï No se encontraron archivos de par√°metros - ejecutando optimizaci√≥n inicial...")
        json_filename, best_params = run_optuna_optimization(X_train, y_reg_train, y_clf_train, n_trials=150)
        model = build_model_from_params(best_params, input_dim, num_classes)

    print("\nüèãÔ∏è Entrenando modelo final...")

    batch_size = best_params.get('batch_size', 64)

    history = model.fit(
        X_train,
        {'regression': y_reg_train, 'classification': y_clf_train},
        validation_split=0.2,
        epochs=93,
        batch_size=batch_size,
        callbacks=[
            EarlyStopping(patience=10, restore_best_weights=True),
            ReduceLROnPlateau(patience=5, factor=0.5)
        ],
        verbose=1
    )
    # Guardar el modelo entrenado a disco justo aqu√≠:
    model.save("modelo_trained.h5")
    print("Modelo guardado en modelo_trained.h5")
    joblib.dump(preprocessor, 'preprocessor.pkl')
    print("Preprocessor guardado en preprocessor.pkl")
    print("\nüî¨ Evaluando modelo...")

    test_results = model.evaluate(
        X_test,
        {'regression': y_reg_test, 'classification': y_clf_test},
        verbose=0
    )

    predictions = model.predict(X_test, verbose=0)
    y_reg_pred = predictions[0].flatten()
    y_clf_pred = predictions[1]

    r2 = r2_score(y_reg_test, y_reg_pred)
    mse = mean_squared_error(y_reg_test, y_reg_pred)
    mae = np.mean(np.abs(y_reg_test - y_reg_pred))

    y_clf_pred_classes = np.argmax(y_clf_pred, axis=1)
    y_clf_test_classes = np.argmax(y_clf_test, axis=1)
    accuracy = accuracy_score(y_clf_test_classes, y_clf_pred_classes)
    
    # === NUEVAS SECCIONES: TABLAS PARA TODAS LAS M√âTRICAS ===
    print("\n" + "="*80)
    print("üìä TABLAS DETALLADAS DE M√âTRICAS POR √âPOCA")
    print("="*80)

    # Obtener todas las m√©tricas del historial
    epochs_range = range(1, len(history.history['loss']) + 1)

    # Funci√≥n auxiliar para mostrar tabla
    def show_metric_table(title, train_metric, val_metric, metric_name, show_diff=True):
        print(f"\n{title}")
        print("-" * 80)
        if show_diff:
            print(f"{'Epoch':<8} {'Train '+metric_name:<15} {'Val '+metric_name:<15} {'Diferencia':<15}")
        else:
            print(f"{'Epoch':<8} {'Train '+metric_name:<15} {'Val '+metric_name:<15}")
        print("-" * 80)

        for i in range(0, len(train_metric), 5):
            epoch = i + 1
            train_val = train_metric[i]
            val_val = val_metric[i]
            if show_diff:
                diff = abs(train_val - val_val)
                print(f"{epoch:<8} {train_val:<15.4f} {val_val:<15.4f} {diff:<15.4f}")
            else:
                print(f"{epoch:<8} {train_val:<15.4f} {val_val:<15.4f}")

        # Mostrar √∫ltima √©poca si no se mostr√≥
        if (len(train_metric) - 1) % 5 != 0:
            i = len(train_metric) - 1
            epoch = i + 1
            train_val = train_metric[i]
            val_val = val_metric[i]
            if show_diff:
                diff = abs(train_val - val_val)
                print(f"{epoch:<8} {train_val:<15.4f} {val_val:<15.4f} {diff:<15.4f}")
            else:
                print(f"{epoch:<8} {train_val:<15.4f} {val_val:<15.4f}")

        print("-" * 80)

        # Estad√≠sticas de resumen
        best_train = max(train_metric) if 'acc' in metric_name.lower() else min(train_metric)
        best_val = max(val_metric) if 'acc' in metric_name.lower() else min(val_metric)
        best_train_epoch = (train_metric.index(best_train) + 1) if 'acc' in metric_name.lower() else (train_metric.index(best_train) + 1)
        best_val_epoch = (val_metric.index(best_val) + 1) if 'acc' in metric_name.lower() else (val_metric.index(best_val) + 1)

        comparison = "Mejor" if 'acc' in metric_name.lower() else "Menor"
        print(f"\nRESUMEN {metric_name.upper()}:")
        print(f"{comparison} Train: {best_train:.4f} (√âpoca {best_train_epoch})")
        print(f"{comparison} Val:   {best_val:.4f} (√âpoca {best_val_epoch})")
        print(f"Final Train: {train_metric[-1]:.4f}")
        print(f"Final Val:   {val_metric[-1]:.4f}")

    # 1. TABLA DE P√âRDIDA TOTAL
    train_loss = history.history['loss']
    val_loss = history.history['val_loss']
    show_metric_table("1. P√âRDIDA TOTAL vs √âPOCAS", train_loss, val_loss, "Loss")

    # 2. TABLA DE ACCURACY
    train_acc = history.history['classification_accuracy']
    val_acc = history.history['val_classification_accuracy']
    show_metric_table("2. ACCURACY vs √âPOCAS", train_acc, val_acc, "Accuracy")
    print(f"Accuracy en Test: {accuracy:.4f}")

    # 3. TABLA DE MAE REGRESI√ìN
    train_mae = history.history['regression_mae']
    val_mae = history.history['val_regression_mae']
    show_metric_table("3. MAE REGRESI√ìN vs √âPOCAS", train_mae, val_mae, "MAE")
    print(f"MAE en Test: {mae:.4f}")

    # 4. TABLA DE P√âRDIDA CLASIFICACI√ìN
    train_cls_loss = history.history['classification_loss']
    val_cls_loss = history.history['val_classification_loss']
    show_metric_table("4. P√âRDIDA CLASIFICACI√ìN vs √âPOCAS", train_cls_loss, val_cls_loss, "Cls_Loss")

    # 5. TABLA DE P√âRDIDA REGRESI√ìN
    train_reg_loss = history.history['regression_loss']
    val_reg_loss = history.history['val_regression_loss']
    show_metric_table("5. P√âRDIDA REGRESI√ìN vs √âPOCAS", train_reg_loss, val_reg_loss, "Reg_Loss")

    # 6. TABLA CONSOLIDADA (RESUMEN)
    print("\n" + "="*100)
    print("6. TABLA CONSOLIDADA - TODAS LAS M√âTRICAS (cada 10 √©pocas)")
    print("="*100)
    print(f"{'Epoch':<8} {'Total_Loss':<12} {'Accuracy':<12} {'Reg_MAE':<12} {'Cls_Loss':<12} {'Reg_Loss':<12}")
    print(f"{'':<8} {'T/V':<12} {'T/V':<12} {'T/V':<12} {'T/V':<12} {'T/V':<12}")
    print("-" * 100)

    for i in range(0, len(train_loss), 10):
        epoch = i + 1
        print(f"{epoch:<8} {train_loss[i]:<5.3f}/{val_loss[i]:<5.3f} {train_acc[i]:<5.3f}/{val_acc[i]:<5.3f} {train_mae[i]:<5.3f}/{val_mae[i]:<5.3f} {train_cls_loss[i]:<5.3f}/{val_cls_loss[i]:<5.3f} {train_reg_loss[i]:<5.3f}/{val_reg_loss[i]:<5.3f}")

    # Mostrar √∫ltima √©poca en tabla consolidada
    if (len(train_loss) - 1) % 10 != 0:
        i = len(train_loss) - 1
        epoch = i + 1
        print(f"{epoch:<8} {train_loss[i]:<5.3f}/{val_loss[i]:<5.3f} {train_acc[i]:<5.3f}/{val_acc[i]:<5.3f} {train_mae[i]:<5.3f}/{val_mae[i]:<5.3f} {train_cls_loss[i]:<5.3f}/{val_cls_loss[i]:<5.3f} {train_reg_loss[i]:<5.3f}/{val_reg_loss[i]:<5.3f}")

    print("-" * 100)
    print("T = Training, V = Validation")
    

    # === FIN DE NUEVAS SECCIONES ===

    # Visualizaci√≥n (mantener las existentes)
    plt.figure(figsize=(8, 6))
    plt.boxplot(df['log_release_prod_water_edit'].dropna())
    plt.title('Boxplot of Logarithm of release_prod_water_edit')
    plt.ylabel('Log(release_prod_water_edit)')
    plt.grid(True)
    print("Boxplot mostrado en pantalla")
    plt.show()

    # Gr√°ficos existentes + nuevo gr√°fico de accuracy
    plt.figure(figsize=(16, 8))  # Aumentar tama√±o para acomodar 4 subplots

    plt.subplot(2, 3, 1)
    plt.plot(history.history['loss'], label='Training')
    plt.plot(history.history['val_loss'], label='Validation')
    plt.title('P√©rdida Total')
    plt.legend()

    plt.subplot(2, 3, 2)
    plt.plot(history.history['regression_mae'], label='Training')
    plt.plot(history.history['val_regression_mae'], label='Validation')
    plt.title('MAE Regresi√≥n')
    plt.legend()

    plt.subplot(2, 3, 3)
    plt.scatter(y_reg_test, y_reg_pred, alpha=0.6)
    plt.plot([y_reg_test.min(), y_reg_test.max()], [y_reg_test.min(), y_reg_test.max()], 'r--')
    plt.xlabel('Real')
    plt.ylabel('Predicho')
    plt.title(f'Regresi√≥n (R¬≤={r2:.3f})')

    # NUEVO: Gr√°fico de Accuracy vs Epochs
    plt.subplot(2, 3, 4)
    epochs_range = range(1, len(history.history['classification_accuracy']) + 1)
    train_acc = history.history['classification_accuracy']
    val_acc = history.history['val_classification_accuracy']
    plt.plot(epochs_range, train_acc, label='Training', marker='o', markersize=2)
    plt.plot(epochs_range, val_acc, label='Validation', marker='s', markersize=2)
    plt.xlabel('√âpoca')
    plt.ylabel('Accuracy')
    plt.title('Accuracy vs √âpocas')
    plt.legend()
    plt.grid(True, alpha=0.3)

    plt.subplot(2, 3, 5)
    plt.plot(history.history['classification_loss'], label='Training')
    plt.plot(history.history['val_classification_loss'], label='Validation')
    plt.title('P√©rdida Clasificaci√≥n')
    plt.legend()

    plt.subplot(2, 3, 6)
    plt.plot(history.history['regression_loss'], label='Training')
    plt.plot(history.history['val_regression_loss'], label='Validation')
    plt.title('P√©rdida Regresi√≥n')
    plt.legend()

    plt.tight_layout()
    print("Gr√°fico de m√©tricas")
    plt.show()

    print("\n" + "="*50)
    print("RESULTADOS FINALES")
    print("="*50)
    print(f"Regresi√≥n:")
    print(f"  R¬≤ Score: {r2:.4f}")
    print(f"  MSE: {mse:.4f}")
    print(f"  MAE: {mae:.4f}")
    print(f"\nClasificaci√≥n:")
    print(f"  Accuracy: {accuracy:.4f}")

    # === GUARDAR M√âTRICAS EN JSON CON TIMESTAMP ===
    metrics_summary = {
        "timestamp": pd.Timestamp.now().isoformat(),
        "metrics": {
            "r2": r2,
            "mse": mse,
            "mae": mae,
            "accuracy": accuracy
        },
        "model_params": best_params,
        "training_epochs": len(history.history['loss']),
        "final_training_loss": float(history.history['loss'][-1]),
        "final_validation_loss": float(history.history['val_loss'][-1])
    }

    print("\nResumen final de m√©tricas:")
    print(json.dumps(metrics_summary, indent=4))

    # === MOSTRAR PREDICCIONES EN PANTALLA (no guardar) ===
    reg_pred_original = np.expm1(y_reg_pred)

    df_preds = pd.DataFrame({
        'y_reg_real': np.expm1(y_reg_test.values),
        'y_reg_pred': reg_pred_original,
        'y_clf_real': le_causa.inverse_transform(y_clf_test_classes),
        'y_clf_pred': le_causa.inverse_transform(y_clf_pred_classes)
    })

    print("\nEjemplo de predicciones:")
    print(df_preds.head())

    # === GUARDAR PAR√ÅMETROS ACTUALIZADOS SI SE MEJOR√ì ===
    if 'best_file' in locals() and best_file:
        # Comparar m√©tricas actuales con las del archivo usado
        try:
            with open(best_file, 'r') as f:
                old_data = json.load(f)

            old_r2 = old_data.get('metrics', {}).get('r2', 0)
            old_acc = old_data.get('metrics', {}).get('accuracy', 0)
            old_score = old_r2 * 0.6 + old_acc * 0.4

            current_score = r2 * 0.6 + accuracy * 0.4

            if current_score > old_score:
                new_filename = f"best_params_improved_{pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')}.json"
                with open(new_filename, 'w') as f:
                    json.dump(metrics_summary, f, indent=4)
                print(f"‚úÖ M√©tricas mejoradas! Guardado nuevo archivo: {new_filename}")
                print(f"   Score anterior: {old_score:.4f} ‚Üí Score actual: {current_score:.4f}")
            else:
                print(f"üìä M√©tricas actuales ({current_score:.4f}) no superaron las del archivo usado ({old_score:.4f})")

        except Exception as e:
            print(f"‚ö†Ô∏è  Error comparando m√©tricas: {e}")

    print("\nüéâ Workflow completado autom√°ticamente!")

    return model, best_params

# Run the workflow defined above. main_workflow() returns (model, best_params);
# the return value is not assigned to any name here.
main_workflow()

In [None]:
import pandas as pd

# Reload the raw CSV using the same parser options as the initial load at the
# top of the notebook (';' separator, Latin-1 encoding, malformed rows skipped
# by the Python engine). Without on_bad_lines='skip' this cell could fail on a
# malformed row that the main load tolerates, or inspect different rows.
# NOTE: this rebinds the notebook-level `df`; the text columns are NOT
# lowercased here, so the values printed below are the raw ones from the file.
df = pd.read_csv(
    'spill_data_cleaned.csv',
    sep=';',
    encoding='latin1',
    on_bad_lines='skip',
    engine='python',
)

# Columns whose distinct values are inspected
columnas = ['operator_edit', 'county_edit', 'type_operation', 'source']

# Print the distinct values of each column of interest (missing values are
# not dropped in this cell)
for col in columnas:
    print(f"\nValores √∫nicos de '{col}':")
    print(df[col].unique())

In [None]:
# List the distinct non-null values present in the 'county_edit' column.
print("Valores √∫nicos de 'county_edit':")
unique_counties = df['county_edit'].dropna().unique()
print(unique_counties)

In [None]:
# List the distinct non-null values present in the 'type_operation' column.
print("\nValores √∫nicos de 'type_operation':")
unique_operation_types = df['type_operation'].dropna().unique()
print(unique_operation_types)

In [None]:
# List the distinct non-null values present in the 'source' column.
print("\nValores √∫nicos de 'source':")
unique_sources = df['source'].dropna().unique()
print(unique_sources)