<div style="width: 100%; clear: both;">
<div style="float: left; width: 50%;">
<img src="https://www.uoc.edu/content/dam/news/images/noticies/2016/202-nova-marca-uoc.jpg" align="left" width="45%">
</div>
<div style="float: right; width: 50%;">
<p style="margin: 0; padding-top: 22px; text-align:right;">Trabajo de Fin de Máster</p>
<p style="margin: 0; text-align:right;">Máster universitario en Ciencia de datos (Data science)</p>
<p style="margin: 0; text-align:right; padding-button: 100px;">Estudios de Informática, Multimedia y Telecomunicación</p>
</div>
</div>
<div style="width:100%;">&nbsp;</div>

# Machine Learning para predecir cancelaciones y mejorar la retención en seguros

## Optimización

In [2]:
import pandas as pd
X_test = pd.read_csv('X_test.csv')
y_test = pd.read_csv('y_test.csv').squeeze()
X_train_ros = pd.read_csv('X_train_ros.csv')
y_train_ros = pd.read_csv('y_train_ros.csv').squeeze()
X_train_rus = pd.read_csv('X_train_rus.csv')
y_train_rus = pd.read_csv('y_train_rus.csv').squeeze()
X_train_smote = pd.read_csv('X_train_smote.csv')
y_train_smote = pd.read_csv('y_train_smote.csv').squeeze()

In [3]:
from sklearn.metrics import make_scorer, f1_score, roc_auc_score, classification_report
from sklearn.model_selection import GridSearchCV

In [27]:
from sklearn.naive_bayes import GaussianNB

# Definimos el espacio de búsqueda para var_smoothing
param_grid = {
    'var_smoothing': [4e-1, 5e-1, 6e-1,7e-1, 8e-1, 9e-1,1 ,2, 10, 50, 100, 110, 115]
}

# Métrica principal a optimizar
scorer = make_scorer(f1_score)

# Configuramos la búsqueda en cuadrícula
grid_search = GridSearchCV(
    estimator=GaussianNB(),
    param_grid=param_grid,
    scoring=scorer,
    cv=2,
    n_jobs=-1,
    verbose=1
)

# Ajustamos sobre los datos submuestreados
grid_search.fit(X_train_rus, y_train_rus)

# Extraemos el mejor modelo
best_nb = grid_search.best_estimator_

# Predicciones con el conjunto de test original
y_pred_rus = best_nb.predict(X_test)
y_proba_rus = best_nb.predict_proba(X_test)[:, 1]

# Evaluación
print("\nResultados para RandomUnderSampler (GaussianNB) con Hypertuning:")
print(f"Mejor hiperparámetro (var_smoothing): {grid_search.best_params_}")
print(f"AUC: {roc_auc_score(y_test, y_proba_rus):.4f}")
print(f"F1-score: {f1_score(y_test, y_pred_rus):.4f}")
print(classification_report(y_test, y_pred_rus))


Fitting 2 folds for each of 13 candidates, totalling 26 fits

Resultados para RandomUnderSampler (GaussianNB) con Hypertuning:
Mejor hiperparámetro (var_smoothing): {'var_smoothing': 110}
AUC: 0.6802
F1-score: 0.2725
              precision    recall  f1-score   support

           0       0.93      0.60      0.73    422179
           1       0.17      0.65      0.27     54408

    accuracy                           0.60    476587
   macro avg       0.55      0.62      0.50    476587
weighted avg       0.84      0.60      0.67    476587



In [14]:
from sklearn.neighbors import KNeighborsClassifier

# Definimos el espacio de búsqueda para los hiperparámetros
param_grid = {
    'n_neighbors': [10, 15, 20],
    'weights': ['uniform', 'distance'],
}

# Definimos la métrica principal para optimizar (F1 en este caso)
scorer = make_scorer(f1_score)

# Configuramos la búsqueda en cuadrícula
grid_search = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=param_grid,
    scoring=scorer,
    cv=2,
    n_jobs=-1,
    verbose=1
)

# Ejecutamos el ajuste sobre los datos submuestreados
grid_search.fit(X_train_rus, y_train_rus)

# Extraemos el mejor modelo
best_knn = grid_search.best_estimator_

# Predicciones con el conjunto de test original
y_pred_rus = best_knn.predict(X_test)
y_proba_rus = best_knn.predict_proba(X_test)[:, 1]

# Evaluación
print("\nResultados para RandomUnderSampler (KNN) con Hypertuning:")
print(f"Mejores hiperparámetros: {grid_search.best_params_}")
print(f"AUC: {roc_auc_score(y_test, y_proba_rus):.4f}")
print(f"F1-score: {f1_score(y_test, y_pred_rus):.4f}")
print(classification_report(y_test, y_pred_rus))


Fitting 2 folds for each of 6 candidates, totalling 12 fits

Resultados para RandomUnderSampler (KNN) con Hypertuning:
Mejores hiperparámetros: {'n_neighbors': 20, 'weights': 'distance'}
AUC: 0.6798
F1-score: 0.2994
              precision    recall  f1-score   support

           0       0.93      0.70      0.80    422179
           1       0.20      0.58      0.30     54408

    accuracy                           0.69    476587
   macro avg       0.57      0.64      0.55    476587
weighted avg       0.85      0.69      0.74    476587



In [15]:
from sklearn.tree import DecisionTreeClassifier

# Definimos el espacio de búsqueda
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [10, 15, 20],
    'min_samples_split': [2, 5, 10],
}

# Métrica objetivo: F1
scorer = make_scorer(f1_score)

# Búsqueda con validación cruzada
grid_search = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=param_grid,
    scoring=scorer,
    cv=2,
    n_jobs=-1,
    verbose=1
)

# Ajuste con datos SMOTE
grid_search.fit(X_train_smote, y_train_smote)

# Mejor modelo
best_dt = grid_search.best_estimator_

# Predicción en test original
y_pred_smote = best_dt.predict(X_test)
y_proba_smote = best_dt.predict_proba(X_test)[:, 1]

# Evaluación
print("\nResultados para SMOTE (Decision Tree) con Hypertuning:")
print(f"Mejores hiperparámetros: {grid_search.best_params_}")
print(f"AUC: {roc_auc_score(y_test, y_proba_smote):.4f}")
print(f"F1-score: {f1_score(y_test, y_pred_smote):.4f}")
print(classification_report(y_test, y_pred_smote))


Fitting 2 folds for each of 18 candidates, totalling 36 fits

Resultados para SMOTE (Decision Tree) con Hypertuning:
Mejores hiperparámetros: {'criterion': 'gini', 'max_depth': 20, 'min_samples_split': 2}
AUC: 0.6524
F1-score: 0.3963
              precision    recall  f1-score   support

           0       0.92      0.95      0.93    422179
           1       0.46      0.35      0.40     54408

    accuracy                           0.88    476587
   macro avg       0.69      0.65      0.66    476587
weighted avg       0.87      0.88      0.87    476587



In [16]:
from sklearn.ensemble import RandomForestClassifier

# Definimos el espacio de búsqueda para los hiperparámetros
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 15, 20],
    'min_samples_split': [5, 10],
    'min_samples_leaf': [2, 4],
}

# Definimos el scorer para optimizar el F1-score
scorer = make_scorer(f1_score)

# Configuramos la búsqueda en cuadrícula
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42, n_jobs=-1),
    param_grid=param_grid,
    scoring=scorer,
    cv=2,
    n_jobs=-1,
    verbose=1
)

# Ajustamos el modelo con los datos de SMOTE
grid_search.fit(X_train_smote, y_train_smote)

# Extraemos el mejor modelo
best_rf = grid_search.best_estimator_

# Predicciones sobre el conjunto de prueba
y_pred_smote = best_rf.predict(X_test)
y_proba_smote = best_rf.predict_proba(X_test)[:, 1]

# Evaluación
print("\nResultados para SMOTE (Random Forest) con Hypertuning:")
print(f"Mejores hiperparámetros: {grid_search.best_params_}")
print(f"AUC: {roc_auc_score(y_test, y_proba_smote):.4f}")
print(f"F1-score: {f1_score(y_test, y_pred_smote):.4f}")
print(classification_report(y_test, y_pred_smote))


Fitting 2 folds for each of 36 candidates, totalling 72 fits

Resultados para SMOTE (Random Forest) con Hypertuning:
Mejores hiperparámetros: {'max_depth': 20, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 300}
AUC: 0.6937
F1-score: 0.4533
              precision    recall  f1-score   support

           0       0.93      0.93      0.93    422179
           1       0.46      0.45      0.45     54408

    accuracy                           0.88    476587
   macro avg       0.69      0.69      0.69    476587
weighted avg       0.88      0.88      0.88    476587



In [17]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, roc_auc_score, classification_report

# Definimos el espacio de búsqueda para los hiperparámetros
param_grid = {
    'n_estimators': [150, 200, 250],
    'learning_rate': [0.01, 0.1, 0.5, 1],
}

# Configuramos la búsqueda en cuadrícula
grid_search = GridSearchCV(
    estimator=AdaBoostClassifier(random_state=42),
    param_grid=param_grid,
    scoring='f1',  # Utilizamos F1-score como métrica
    cv=2,
    n_jobs=-1,
    verbose=1
)

# Ajustamos el modelo con los datos sobre muestreo RandomOverSampler
grid_search.fit(X_train_ros, y_train_ros)

# Extraemos el mejor modelo
best_ada = grid_search.best_estimator_

# Predicciones sobre el conjunto de prueba
y_pred_ros = best_ada.predict(X_test)
y_proba_ros = best_ada.predict_proba(X_test)[:, 1]

# Evaluación
print("\nResultados para RandomOverSampler (AdaBoost) con Hypertuning:")
print(f"Mejores hiperparámetros: {grid_search.best_params_}")
print(f"AUC: {roc_auc_score(y_test, y_proba_ros):.4f}")
print(f"F1-score: {f1_score(y_test, y_pred_ros):.4f}")
print(classification_report(y_test, y_pred_ros))

Fitting 2 folds for each of 12 candidates, totalling 24 fits

Resultados para RandomOverSampler (AdaBoost) con Hypertuning:
Mejores hiperparámetros: {'learning_rate': 1, 'n_estimators': 250}
AUC: 0.6946
F1-score: 0.4114
              precision    recall  f1-score   support

           0       0.93      0.89      0.91    422179
           1       0.36      0.48      0.41     54408

    accuracy                           0.84    476587
   macro avg       0.65      0.68      0.66    476587
weighted avg       0.86      0.84      0.85    476587



In [28]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, roc_auc_score, classification_report

# Definimos el espacio de búsqueda para los hiperparámetros
param_grid = {
    'n_estimators': [300, 400, 500],
    'learning_rate': [0.001, 0.1, 0.5, 0.82],
    'max_depth': [30, 40, 50, 60],
    'gamma': [0, 0.1, 0.2]
}

# Configuramos la búsqueda en cuadrícula
grid_search = GridSearchCV(
    estimator=XGBClassifier(eval_metric='logloss', random_state=42, n_jobs=-1),
    param_grid=param_grid,
    scoring='f1',  # Utilizamos F1-score como métrica
    cv=2,
    n_jobs=-1,
    verbose=1
)

# Ajustamos el modelo con los datos sobre muestreo RandomOverSampler
grid_search.fit(X_train_ros, y_train_ros)

# Extraemos el mejor modelo
best_xgb = grid_search.best_estimator_

# Predicciones sobre el conjunto de prueba
y_pred_ros = best_xgb.predict(X_test)
y_proba_ros = best_xgb.predict_proba(X_test)[:, 1]

# Evaluación
print("\nResultados para RandomOverSampler (XGBoost) con Hypertuning:")
print(f"Mejores hiperparámetros: {grid_search.best_params_}")
print(f"AUC: {roc_auc_score(y_test, y_proba_ros):.4f}")
print(f"F1-score: {f1_score(y_test, y_pred_ros):.4f}")
print(classification_report(y_test, y_pred_ros))

Fitting 2 folds for each of 144 candidates, totalling 288 fits


python(2722) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(2957) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(3031) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(4135) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(4228) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(5210) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(7435) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(7682) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(8345) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(8668) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(48964) MallocStackLoggi

KeyboardInterrupt: 

In [5]:
import numpy as np
import tensorflow as tf
import optuna
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split

# División estratificada para validación
X_train_final, X_val, y_train_final, y_val = train_test_split(
    X_train_smote, y_train_smote, test_size=0.2, random_state=42, stratify=y_train_smote
)

def objective(trial):
    # Arquitectura de red
    model = Sequential([
        Input(shape=(X_train_final.shape[1],)),
        Dense(trial.suggest_int('units1', 64, 256), activation='relu'),
        Dropout(trial.suggest_float('dropout1', 0.2, 0.6)),
        Dense(trial.suggest_int('units2', 32, 128), activation='relu'),
        Dropout(trial.suggest_float('dropout2', 0.1, 0.4)),
        Dense(1, activation='sigmoid')
    ])
    
    # Optimización de hiperparámetros
    model.compile(
        optimizer=tf.keras.optimizers.Adam(
            learning_rate=trial.suggest_float('lr', 1e-4, 1e-2)
        ),
        loss='binary_crossentropy',
        metrics=['accuracy']
    )
    
    # Entrenamiento con early stopping
    history = model.fit(
        X_train_final, y_train_final,
        validation_data=(X_val, y_val),
        epochs=100,
        batch_size=trial.suggest_categorical('batch_size', [64, 128, 256]),
        verbose=0,
        callbacks=[EarlyStopping(patience=10, restore_best_weights=True)]
    )
    
    # Cálculo de F1 score
    y_pred = (model.predict(X_val) > 0.5).astype(int)
    return f1_score(y_val, y_pred)

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=30)

[I 2025-05-06 04:43:28,334] A new study created in memory with name: no-name-19a82484-c0f0-41e0-873f-1015a930d84d


[1m12314/12314[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 220us/step


[I 2025-05-06 05:01:22,823] Trial 0 finished with value: 0.6425449061790138 and parameters: {'units1': 250, 'dropout1': 0.2638138735924608, 'units2': 105, 'dropout2': 0.20603272797717542, 'lr': 0.00010616166714444989, 'batch_size': 128}. Best is trial 0 with value: 0.6425449061790138.


[1m12314/12314[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 216us/step


[I 2025-05-06 05:05:01,726] Trial 1 finished with value: 0.6221388343701675 and parameters: {'units1': 226, 'dropout1': 0.4451994247393244, 'units2': 38, 'dropout2': 0.18472860447750317, 'lr': 0.006541503635203153, 'batch_size': 256}. Best is trial 0 with value: 0.6425449061790138.


[1m12314/12314[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 213us/step


[I 2025-05-06 05:15:43,576] Trial 2 finished with value: 0.6243790307132369 and parameters: {'units1': 252, 'dropout1': 0.35089789588140224, 'units2': 111, 'dropout2': 0.3735930543796355, 'lr': 0.0022073460299882933, 'batch_size': 64}. Best is trial 0 with value: 0.6425449061790138.


[1m12314/12314[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 196us/step


[I 2025-05-06 05:27:29,146] Trial 3 finished with value: 0.6221830340047176 and parameters: {'units1': 72, 'dropout1': 0.46587506196038464, 'units2': 104, 'dropout2': 0.19882690144761161, 'lr': 0.0013328628673386434, 'batch_size': 64}. Best is trial 0 with value: 0.6425449061790138.


[1m12314/12314[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 210us/step


[I 2025-05-06 05:38:08,218] Trial 4 finished with value: 0.6255495668728571 and parameters: {'units1': 143, 'dropout1': 0.3120886114407305, 'units2': 99, 'dropout2': 0.15692590295551972, 'lr': 0.00255393468574977, 'batch_size': 64}. Best is trial 0 with value: 0.6425449061790138.


[1m12314/12314[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 206us/step


[I 2025-05-06 05:41:02,674] Trial 5 finished with value: 0.619456021972169 and parameters: {'units1': 159, 'dropout1': 0.5684978027714169, 'units2': 69, 'dropout2': 0.2623350209726344, 'lr': 0.006367037747712347, 'batch_size': 256}. Best is trial 0 with value: 0.6425449061790138.


[1m12314/12314[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 208us/step


[I 2025-05-06 05:45:57,084] Trial 6 finished with value: 0.6221076890038494 and parameters: {'units1': 237, 'dropout1': 0.383201864972257, 'units2': 41, 'dropout2': 0.36164176101218937, 'lr': 0.005127822008945567, 'batch_size': 128}. Best is trial 0 with value: 0.6425449061790138.


[1m12314/12314[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 198us/step


[I 2025-05-06 05:49:25,034] Trial 7 finished with value: 0.6127210897522762 and parameters: {'units1': 106, 'dropout1': 0.5784196078457041, 'units2': 61, 'dropout2': 0.33412283297446016, 'lr': 0.0051812499875424686, 'batch_size': 64}. Best is trial 0 with value: 0.6425449061790138.


[1m12314/12314[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 203us/step


[I 2025-05-06 05:52:33,910] Trial 8 finished with value: 0.6204241998338976 and parameters: {'units1': 167, 'dropout1': 0.46531940274539607, 'units2': 43, 'dropout2': 0.2021331214032452, 'lr': 0.0064875457269252205, 'batch_size': 128}. Best is trial 0 with value: 0.6425449061790138.


[1m12314/12314[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 197us/step


[I 2025-05-06 05:57:29,176] Trial 9 finished with value: 0.6231941370446673 and parameters: {'units1': 114, 'dropout1': 0.35749362993973616, 'units2': 39, 'dropout2': 0.19349896273199185, 'lr': 0.004837287429183998, 'batch_size': 64}. Best is trial 0 with value: 0.6425449061790138.


[1m12314/12314[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 213us/step


[I 2025-05-06 06:03:36,278] Trial 10 finished with value: 0.622012481898684 and parameters: {'units1': 203, 'dropout1': 0.2147240820305069, 'units2': 127, 'dropout2': 0.27968390073719085, 'lr': 0.00944643513348476, 'batch_size': 128}. Best is trial 0 with value: 0.6425449061790138.


[1m12314/12314[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 209us/step


[I 2025-05-06 06:15:51,227] Trial 11 finished with value: 0.6504044051670671 and parameters: {'units1': 168, 'dropout1': 0.24973073484818603, 'units2': 96, 'dropout2': 0.10410885353946545, 'lr': 0.00024227513354465252, 'batch_size': 128}. Best is trial 11 with value: 0.6504044051670671.


[1m12314/12314[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 208us/step


[I 2025-05-06 06:31:39,536] Trial 12 finished with value: 0.6887561824949536 and parameters: {'units1': 194, 'dropout1': 0.20363739980422105, 'units2': 87, 'dropout2': 0.11754454723046401, 'lr': 0.0002526691869671853, 'batch_size': 128}. Best is trial 12 with value: 0.6887561824949536.


[1m12314/12314[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 223us/step


[I 2025-05-06 06:44:10,323] Trial 13 finished with value: 0.6636097872327614 and parameters: {'units1': 195, 'dropout1': 0.2011484686104915, 'units2': 85, 'dropout2': 0.10073289334763282, 'lr': 0.00035536623862614974, 'batch_size': 128}. Best is trial 12 with value: 0.6887561824949536.


[1m12314/12314[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 207us/step


[I 2025-05-06 06:53:52,790] Trial 14 finished with value: 0.6274368231046932 and parameters: {'units1': 182, 'dropout1': 0.20000785164040932, 'units2': 85, 'dropout2': 0.11630853959575356, 'lr': 0.0032804340663020255, 'batch_size': 128}. Best is trial 12 with value: 0.6887561824949536.


[1m12314/12314[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 204us/step


[I 2025-05-06 07:08:27,364] Trial 15 finished with value: 0.6403763285039541 and parameters: {'units1': 203, 'dropout1': 0.283724180623677, 'units2': 80, 'dropout2': 0.13880552107242977, 'lr': 0.001109082829638066, 'batch_size': 128}. Best is trial 12 with value: 0.6887561824949536.


[1m12314/12314[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 214us/step


[I 2025-05-06 07:15:01,567] Trial 16 finished with value: 0.6252757602362519 and parameters: {'units1': 206, 'dropout1': 0.3110745416684445, 'units2': 59, 'dropout2': 0.14585173484897188, 'lr': 0.0035226259417244026, 'batch_size': 128}. Best is trial 12 with value: 0.6887561824949536.


[1m12314/12314[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 209us/step


[I 2025-05-06 07:19:22,554] Trial 17 finished with value: 0.6233245983675533 and parameters: {'units1': 135, 'dropout1': 0.23464874041897668, 'units2': 84, 'dropout2': 0.10212576177942878, 'lr': 0.008885703528533816, 'batch_size': 256}. Best is trial 12 with value: 0.6887561824949536.


[1m12314/12314[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 214us/step


[I 2025-05-06 07:31:43,172] Trial 18 finished with value: 0.6345173967705058 and parameters: {'units1': 187, 'dropout1': 0.5282165155649579, 'units2': 73, 'dropout2': 0.2370332367027376, 'lr': 0.0012025262458929013, 'batch_size': 128}. Best is trial 12 with value: 0.6887561824949536.


[1m12314/12314[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 221us/step


[I 2025-05-06 07:43:20,120] Trial 19 finished with value: 0.6281565048867669 and parameters: {'units1': 223, 'dropout1': 0.29803809459598823, 'units2': 120, 'dropout2': 0.2909790980734309, 'lr': 0.0020887543077334034, 'batch_size': 128}. Best is trial 12 with value: 0.6887561824949536.


[1m12314/12314[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 216us/step


[I 2025-05-06 07:48:58,797] Trial 20 finished with value: 0.6248832948299211 and parameters: {'units1': 216, 'dropout1': 0.20479091189211407, 'units2': 93, 'dropout2': 0.16208279644878454, 'lr': 0.007941610254918014, 'batch_size': 256}. Best is trial 12 with value: 0.6887561824949536.


[1m12314/12314[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 209us/step


[I 2025-05-06 07:58:44,800] Trial 21 finished with value: 0.6433904106320916 and parameters: {'units1': 177, 'dropout1': 0.2541901954738812, 'units2': 93, 'dropout2': 0.10097985054654608, 'lr': 0.0003509765189697649, 'batch_size': 128}. Best is trial 12 with value: 0.6887561824949536.


[1m12314/12314[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 214us/step


[I 2025-05-06 08:14:50,137] Trial 22 finished with value: 0.6386153520883119 and parameters: {'units1': 152, 'dropout1': 0.24162218339982933, 'units2': 92, 'dropout2': 0.12831864427007655, 'lr': 0.00017346384769260768, 'batch_size': 128}. Best is trial 12 with value: 0.6887561824949536.


[1m12314/12314[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 213us/step


[I 2025-05-06 08:28:12,523] Trial 23 finished with value: 0.6354069977001815 and parameters: {'units1': 196, 'dropout1': 0.2662378588340079, 'units2': 71, 'dropout2': 0.1669809803810584, 'lr': 0.0011124565624082347, 'batch_size': 128}. Best is trial 12 with value: 0.6887561824949536.


[1m12314/12314[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 213us/step


[I 2025-05-06 08:42:22,400] Trial 24 finished with value: 0.629511473267479 and parameters: {'units1': 166, 'dropout1': 0.23090130396833522, 'units2': 114, 'dropout2': 0.12246956123879757, 'lr': 0.0033314744013472055, 'batch_size': 128}. Best is trial 12 with value: 0.6887561824949536.


[1m12314/12314[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 208us/step


[I 2025-05-06 08:53:31,229] Trial 25 finished with value: 0.6266008026379692 and parameters: {'units1': 117, 'dropout1': 0.34608812546844925, 'units2': 88, 'dropout2': 0.22955462228969054, 'lr': 0.0018212080528347279, 'batch_size': 128}. Best is trial 12 with value: 0.6887561824949536.
[W 2025-05-06 09:05:11,687] Trial 26 failed with parameters: {'units1': 181, 'dropout1': 0.20180740177426942, 'units2': 77, 'dropout2': 0.1307805246966578, 'lr': 0.0009135089099540973, 'batch_size': 128} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/Users/luisocanahober/anaconda3/lib/python3.11/site-packages/optuna/study/_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/var/folders/c9/t4s6bts95ybb23rnpkn7kc100000gn/T/ipykernel_2241/1178642045.py", line 35, in objective
    history = model.fit(
              ^^^^^^^^^^
  File "/Users/luisocanahober/anaconda3/lib/python3.11/site-packages/ker

KeyboardInterrupt: 

In [6]:
# Recuperar los mejores hiperparámetros
best_params = {
    'units1': 117,
    'dropout1': 0.34608812546844925,
    'units2': 88,
    'dropout2': 0.22955462228969054,
    'lr': 0.0018212080528347279,
    'batch_size': 128
}

# Reconstruir el modelo con los hiperparámetros óptimos
final_model = Sequential([
    Input(shape=(X_train_smote.shape[1],)),
    Dense(best_params['units1'], activation='relu'),
    Dropout(best_params['dropout1']),
    Dense(best_params['units2'], activation='relu'),
    Dropout(best_params['dropout2']),
    Dense(1, activation='sigmoid')
])

optimizer = tf.keras.optimizers.Adam(learning_rate=best_params['lr'])
final_model.compile(optimizer=optimizer,
                    loss='binary_crossentropy',
                    metrics=['accuracy'])

# Ajuste del modelo
early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
final_model.fit(
    X_train_final, y_train_final,
    validation_data=(X_val, y_val),
    epochs=100,
    batch_size=best_params['batch_size'],
    callbacks=[early_stop],
    verbose=0
)

from sklearn.metrics import roc_auc_score, f1_score, classification_report

# Predicciones finales sobre el conjunto de prueba
y_pred_test = (final_model.predict(X_test) > 0.5).astype(int).flatten()
y_proba_test = final_model.predict(X_test).flatten()

# Métricas de evaluación
print("\nResultados para el modelo neuronal (Optuna + SMOTE):")
print(f"Mejores hiperparámetros: {best_params}")
print(f"AUC: {roc_auc_score(y_test, y_proba_test):.4f}")
print(f"F1-score: {f1_score(y_test, y_pred_test):.4f}")
print(classification_report(y_test, y_pred_test))

[1m14894/14894[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 205us/step
[1m14894/14894[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 205us/step

Resultados para el modelo neuronal (Optuna + SMOTE):
Mejores hiperparámetros: {'units1': 117, 'dropout1': 0.34608812546844925, 'units2': 88, 'dropout2': 0.22955462228969054, 'lr': 0.0018212080528347279, 'batch_size': 128}
AUC: 0.6923
F1-score: 0.4302
              precision    recall  f1-score   support

           0       0.93      0.91      0.92    422179
           1       0.40      0.46      0.43     54408

    accuracy                           0.86    476587
   macro avg       0.67      0.69      0.68    476587
weighted avg       0.87      0.86      0.86    476587



In [8]:
import optuna
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Sequential
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import f1_score

# Definir la función de objetivo para Optuna
def objective(trial):
    # Definir el espacio de búsqueda de hiperparámetros
    n_neurons_1 = trial.suggest_int('n_neurons_1', 32, 128)  # Neuronas en la primera capa
    n_neurons_2 = trial.suggest_int('n_neurons_2', 16, 64)   # Neuronas en la segunda capa
    dropout_1 = trial.suggest_float('dropout_1', 0.1, 0.5)    # Tasa de Dropout en la primera capa
    dropout_2 = trial.suggest_float('dropout_2', 0.1, 0.5)    # Tasa de Dropout en la segunda capa
    learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-2)  # Tasa de aprendizaje

    # Crear el modelo
    model = Sequential([
        Input(shape=(X_train_ros.shape[1],)),
        Dense(n_neurons_1, activation='relu'),
        BatchNormalization(),
        Dropout(dropout_1),
        Dense(n_neurons_2, activation='relu'),
        BatchNormalization(),
        Dropout(dropout_2),
        Dense(1, activation='sigmoid')
    ])

    # Compilar el modelo con la tasa de aprendizaje elegida
    optimizer = Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer,
                  loss='binary_crossentropy',
                  metrics=['accuracy', tf.keras.metrics.AUC(name='auc')])

    # Definir los callbacks
    early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=1e-6, verbose=1)

    # Dividir los datos para entrenamiento y validación
    X_ros_train, X_ros_val, y_ros_train, y_ros_val = train_test_split(
        X_train_ros, y_train_ros, test_size=0.2, stratify=y_train_ros, random_state=42
    )

    # Entrenamiento del modelo
    history = model.fit(X_ros_train, y_ros_train,
                        validation_data=(X_ros_val, y_ros_val),
                        epochs=100,
                        batch_size=256,
                        callbacks=[early_stop, reduce_lr],
                        verbose=0)

    # Evaluar el modelo con F1-score
    y_pred_ros_nn = model.predict(X_ros_val).ravel()
    y_pred_class = (y_pred_ros_nn >= 0.5).astype(int)  # Convertir probabilidades en predicciones binarias

    f1 = f1_score(y_ros_val, y_pred_class)
    
    # Se retorna el F1-score como métrica de optimización
    return f1

In [9]:
# Crear el estudio de Optuna
study = optuna.create_study(direction='maximize')  # Queremos maximizar el F1-score
study.optimize(objective, n_trials=10)  # Realiza 10 intentos

# Mostrar los mejores hiperparámetros
print(f"Mejores hiperparámetros: {study.best_params}")

[I 2025-05-14 21:57:20,622] A new study created in memory with name: no-name-36b8ed2e-f65c-4b04-a1eb-94a7aa6d6013
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-2)  # Tasa de aprendizaje



Epoch 26: ReduceLROnPlateau reducing learning rate to 0.0004060380626469851.

Epoch 37: ReduceLROnPlateau reducing learning rate to 8.120761485770346e-05.

Epoch 44: ReduceLROnPlateau reducing learning rate to 1.62415235536173e-05.

Epoch 49: ReduceLROnPlateau reducing learning rate to 3.24830471072346e-06.
[1m12314/12314[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 156us/step


[I 2025-05-14 22:00:46,508] Trial 0 finished with value: 0.6001143503019046 and parameters: {'n_neurons_1': 66, 'n_neurons_2': 50, 'dropout_1': 0.30444220193205873, 'dropout_2': 0.14072033584312837, 'learning_rate': 0.0020301903765699352}. Best is trial 0 with value: 0.6001143503019046.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-2)  # Tasa de aprendizaje



Epoch 37: ReduceLROnPlateau reducing learning rate to 2.3777037858963014e-05.

Epoch 43: ReduceLROnPlateau reducing learning rate to 4.755407644552179e-06.

Epoch 48: ReduceLROnPlateau reducing learning rate to 1e-06.
[1m12314/12314[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 168us/step


[I 2025-05-14 22:04:00,544] Trial 1 finished with value: 0.6003167031335213 and parameters: {'n_neurons_1': 46, 'n_neurons_2': 57, 'dropout_1': 0.433285768290593, 'dropout_2': 0.447831136640483, 'learning_rate': 0.0001188851883982773}. Best is trial 1 with value: 0.6003167031335213.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-2)  # Tasa de aprendizaje



Epoch 26: ReduceLROnPlateau reducing learning rate to 0.00041863122023642066.

Epoch 34: ReduceLROnPlateau reducing learning rate to 8.372624288313092e-05.

Epoch 40: ReduceLROnPlateau reducing learning rate to 1.6745248285587876e-05.

Epoch 45: ReduceLROnPlateau reducing learning rate to 3.349049802636728e-06.

Epoch 50: ReduceLROnPlateau reducing learning rate to 1e-06.
[1m12314/12314[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 155us/step


[I 2025-05-14 22:07:11,936] Trial 2 finished with value: 0.5995459337274743 and parameters: {'n_neurons_1': 50, 'n_neurons_2': 51, 'dropout_1': 0.42880329092907, 'dropout_2': 0.26134519550236485, 'learning_rate': 0.002093156210604909}. Best is trial 1 with value: 0.6003167031335213.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-2)  # Tasa de aprendizaje



Epoch 21: ReduceLROnPlateau reducing learning rate to 0.00028527444228529933.

Epoch 37: ReduceLROnPlateau reducing learning rate to 5.705488729290664e-05.

Epoch 46: ReduceLROnPlateau reducing learning rate to 1.1410977458581329e-05.

Epoch 51: ReduceLROnPlateau reducing learning rate to 2.2821954189566895e-06.

Epoch 56: ReduceLROnPlateau reducing learning rate to 1e-06.
[1m12314/12314[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 155us/step


[I 2025-05-14 22:12:30,099] Trial 3 finished with value: 0.5990592462699678 and parameters: {'n_neurons_1': 86, 'n_neurons_2': 49, 'dropout_1': 0.4008048480719718, 'dropout_2': 0.424792532347314, 'learning_rate': 0.0014263721712105047}. Best is trial 1 with value: 0.6003167031335213.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-2)  # Tasa de aprendizaje



Epoch 63: ReduceLROnPlateau reducing learning rate to 1.2674160825554282e-05.

Epoch 69: ReduceLROnPlateau reducing learning rate to 2.5348321287310686e-06.

Epoch 78: ReduceLROnPlateau reducing learning rate to 1e-06.
[1m12314/12314[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 153us/step


[I 2025-05-14 22:19:17,476] Trial 4 finished with value: 0.598895577334461 and parameters: {'n_neurons_1': 69, 'n_neurons_2': 56, 'dropout_1': 0.21670007384075382, 'dropout_2': 0.20684365112919087, 'learning_rate': 6.337080554218237e-05}. Best is trial 1 with value: 0.6003167031335213.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-2)  # Tasa de aprendizaje



Epoch 89: ReduceLROnPlateau reducing learning rate to 2.575485450506676e-06.

Epoch 96: ReduceLROnPlateau reducing learning rate to 1e-06.
[1m12314/12314[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 152us/step


[I 2025-05-14 22:27:54,930] Trial 5 finished with value: 0.6002767158678846 and parameters: {'n_neurons_1': 105, 'n_neurons_2': 54, 'dropout_1': 0.20332625668450183, 'dropout_2': 0.34732467847616955, 'learning_rate': 1.2877426958349146e-05}. Best is trial 1 with value: 0.6003167031335213.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-2)  # Tasa de aprendizaje



Epoch 82: ReduceLROnPlateau reducing learning rate to 1.1977765825577081e-05.

Epoch 93: ReduceLROnPlateau reducing learning rate to 2.395553201495204e-06.

Epoch 98: ReduceLROnPlateau reducing learning rate to 1e-06.
[1m12314/12314[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 150us/step


[I 2025-05-14 22:33:52,251] Trial 6 finished with value: 0.5986155724866601 and parameters: {'n_neurons_1': 52, 'n_neurons_2': 40, 'dropout_1': 0.27610470657752906, 'dropout_2': 0.34652364211016917, 'learning_rate': 5.988883028304895e-05}. Best is trial 1 with value: 0.6003167031335213.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-2)  # Tasa de aprendizaje



Epoch 30: ReduceLROnPlateau reducing learning rate to 7.338767172768713e-05.

Epoch 49: ReduceLROnPlateau reducing learning rate to 1.4677534636575729e-05.

Epoch 56: ReduceLROnPlateau reducing learning rate to 2.9355069273151457e-06.

Epoch 61: ReduceLROnPlateau reducing learning rate to 1e-06.
[1m12314/12314[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 154us/step


[I 2025-05-14 22:39:13,148] Trial 7 finished with value: 0.5991542414063226 and parameters: {'n_neurons_1': 76, 'n_neurons_2': 46, 'dropout_1': 0.2370409300380798, 'dropout_2': 0.17732141376058236, 'learning_rate': 0.00036693836271725703}. Best is trial 1 with value: 0.6003167031335213.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-2)  # Tasa de aprendizaje



Epoch 96: ReduceLROnPlateau reducing learning rate to 5.621584205073305e-06.
[1m12314/12314[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 152us/step


[I 2025-05-14 22:45:09,507] Trial 8 finished with value: 0.5999334622365253 and parameters: {'n_neurons_1': 72, 'n_neurons_2': 21, 'dropout_1': 0.1600627802915284, 'dropout_2': 0.265430451847538, 'learning_rate': 2.810792065892616e-05}. Best is trial 1 with value: 0.6003167031335213.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-2)  # Tasa de aprendizaje



Epoch 26: ReduceLROnPlateau reducing learning rate to 0.00010319027351215483.

Epoch 38: ReduceLROnPlateau reducing learning rate to 2.063805441139266e-05.

Epoch 44: ReduceLROnPlateau reducing learning rate to 4.1276107367593795e-06.

Epoch 49: ReduceLROnPlateau reducing learning rate to 1e-06.
[1m12314/12314[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 156us/step


[I 2025-05-14 22:49:35,994] Trial 9 finished with value: 0.5990216399966836 and parameters: {'n_neurons_1': 63, 'n_neurons_2': 53, 'dropout_1': 0.16779733045276513, 'dropout_2': 0.2808276788160956, 'learning_rate': 0.0005159513634286087}. Best is trial 1 with value: 0.6003167031335213.


Mejores hiperparámetros: {'n_neurons_1': 46, 'n_neurons_2': 57, 'dropout_1': 0.433285768290593, 'dropout_2': 0.447831136640483, 'learning_rate': 0.0001188851883982773}


In [8]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, Input
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import roc_auc_score, f1_score, classification_report
from sklearn.model_selection import train_test_split

X_ros_train, X_ros_val, y_ros_train, y_ros_val = train_test_split(
        X_train_ros, y_train_ros, test_size=0.2, stratify=y_train_ros, random_state=42
    )

# Mejores hiperparámetros encontrados por Optuna
best_params = {
    'n_neurons_1': 46,
    'n_neurons_2': 57,
    'dropout_1': 0.433285768290593,
    'dropout_2': 0.447831136640483,
    'learning_rate': 0.0001188851883982773
}

# Reconstrucción del modelo
final_model = Sequential([
    Input(shape=(X_train_ros.shape[1],)),
    Dense(best_params['n_neurons_1'], activation='relu'),
    BatchNormalization(),
    Dropout(best_params['dropout_1']),
    Dense(best_params['n_neurons_2'], activation='relu'),
    BatchNormalization(),
    Dropout(best_params['dropout_2']),
    Dense(1, activation='sigmoid')
])

# Compilación
optimizer = Adam(learning_rate=best_params['learning_rate'])
final_model.compile(optimizer=optimizer,
                    loss='binary_crossentropy',
                    metrics=['accuracy'])

# Callbacks
early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=1e-6, verbose=1)

# Entrenamiento
final_model.fit(
    X_train_ros, y_train_ros,
    validation_data=(X_ros_val, y_ros_val),
    epochs=100,
    batch_size=256,
    callbacks=[early_stop, reduce_lr],
    verbose=1
)

# Evaluación
y_pred_test = (final_model.predict(X_test) > 0.5).astype(int).flatten()
y_proba_test = final_model.predict(X_test).flatten()

print("\nResultados para el modelo neuronal (Optuna + RandomOverSampler):")
print(f"Mejores hiperparámetros: {best_params}")
print(f"AUC: {roc_auc_score(y_test, y_proba_test):.4f}")
print(f"F1-score: {f1_score(y_test, y_pred_test):.4f}")
print(classification_report(y_test, y_pred_test))


Epoch 1/100
[1m7696/7696[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 590us/step - accuracy: 0.5962 - loss: 0.7353 - val_accuracy: 0.6759 - val_loss: 0.6041 - learning_rate: 1.1889e-04
Epoch 2/100
[1m7696/7696[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 558us/step - accuracy: 0.6756 - loss: 0.6124 - val_accuracy: 0.6858 - val_loss: 0.5951 - learning_rate: 1.1889e-04
Epoch 3/100
[1m7696/7696[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 603us/step - accuracy: 0.6841 - loss: 0.6027 - val_accuracy: 0.6870 - val_loss: 0.5933 - learning_rate: 1.1889e-04
Epoch 4/100
[1m7696/7696[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 605us/step - accuracy: 0.6855 - loss: 0.5994 - val_accuracy: 0.6876 - val_loss: 0.5924 - learning_rate: 1.1889e-04
Epoch 5/100
[1m7696/7696[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 599us/step - accuracy: 0.6857 - loss: 0.5985 - val_accuracy: 0.6880 - val_loss: 0.5921 - learning_rate: 1.1889e-04
Epoch 6/100
[1m7696/7696