In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, f1_score
import optuna
import plotly.express as px
import numpy as np

# Cargar datos
df = pd.read_csv('../datasets/heart-disease-data/heart_disease_uci.csv')

# Revisar valores únicos en la columna target
print(df['num'].value_counts())

# Si 'num' tiene más de 2 clases, lo binarizamos (ej: 0 = no enfermedad, 1 = enfermedad)
df['target'] = (df['num'] > 0).astype(int)

# Eliminar columnas innecesarias
df = df.drop(columns=['id', 'dataset', 'num'])

# Convertir valores booleanos de texto a enteros si es necesario
df['fbs'] = df['fbs'].astype(float)
df['exang'] = df['exang'].astype(float)

df

num
0    411
1    265
2    109
3    107
4     28
Name: count, dtype: int64


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,target
0,63,Male,typical angina,145.0,233.0,1.0,lv hypertrophy,150.0,0.0,2.3,downsloping,0.0,fixed defect,0
1,67,Male,asymptomatic,160.0,286.0,0.0,lv hypertrophy,108.0,1.0,1.5,flat,3.0,normal,1
2,67,Male,asymptomatic,120.0,229.0,0.0,lv hypertrophy,129.0,1.0,2.6,flat,2.0,reversable defect,1
3,37,Male,non-anginal,130.0,250.0,0.0,normal,187.0,0.0,3.5,downsloping,0.0,normal,0
4,41,Female,atypical angina,130.0,204.0,0.0,lv hypertrophy,172.0,0.0,1.4,upsloping,0.0,normal,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
915,54,Female,asymptomatic,127.0,333.0,1.0,st-t abnormality,154.0,0.0,0.0,,,,1
916,62,Male,typical angina,,139.0,0.0,st-t abnormality,,,,,,,0
917,55,Male,asymptomatic,122.0,223.0,1.0,st-t abnormality,100.0,0.0,0.0,,,fixed defect,1
918,58,Male,asymptomatic,,385.0,1.0,lv hypertrophy,,,,,,,0


In [11]:
# Convertir las variables categóricas a variables dummy
df = pd.get_dummies(df, drop_first=True)

# Separar features y target
X = df.drop(columns=['target'])
y = df['target']

# Dividir dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

display(X)

Unnamed: 0,age,trestbps,chol,fbs,thalch,exang,oldpeak,ca,sex_Male,cp_atypical angina,cp_non-anginal,cp_typical angina,restecg_normal,restecg_st-t abnormality,slope_flat,slope_upsloping,thal_normal,thal_reversable defect
0,63,145.0,233.0,1.0,150.0,0.0,2.3,0.0,True,False,False,True,False,False,False,False,False,False
1,67,160.0,286.0,0.0,108.0,1.0,1.5,3.0,True,False,False,False,False,False,True,False,True,False
2,67,120.0,229.0,0.0,129.0,1.0,2.6,2.0,True,False,False,False,False,False,True,False,False,True
3,37,130.0,250.0,0.0,187.0,0.0,3.5,0.0,True,False,True,False,True,False,False,False,True,False
4,41,130.0,204.0,0.0,172.0,0.0,1.4,0.0,False,True,False,False,False,False,False,True,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
915,54,127.0,333.0,1.0,154.0,0.0,0.0,,False,False,False,False,False,True,False,False,False,False
916,62,,139.0,0.0,,,,,True,False,False,True,False,True,False,False,False,False
917,55,122.0,223.0,1.0,100.0,0.0,0.0,,True,False,False,False,False,True,False,False,False,False
918,58,,385.0,1.0,,,,,True,False,False,False,False,False,False,False,False,False


### **Optimizing known metric**

In [12]:
optuna.logging.set_verbosity(optuna.logging.ERROR)

# Definir función de objetivo para Optuna
def objective(trial):
    model = RandomForestClassifier(
        n_estimators=trial.suggest_int('n_estimators', 20, 50),
        max_depth=trial.suggest_int('max_depth', 3, 8),
        min_samples_split=trial.suggest_float('min_samples_split', 0.05, 1.0),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 2, 20),
        random_state=42,
        n_jobs=-1
    )
    model.fit(X_train, y_train)
    y_test_pred_prob = model.predict_proba(X_test)[:, 1]

    score = roc_auc_score(y_test, y_test_pred_prob)

    print(f'Trial {trial.number} - Score: {score:.4f}')

    return score

# Optimizar con Optuna
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

# Obtener los mejores parámetros de Optuna
best_params = study.best_params

# Mostrar resultados
print('Mejor modelo en  trial:', study.best_trial.number)
print(f'Parámetros:\n{best_params}')
print('Mejor ROC_AUC:', study.best_value)

# Entrenar el modelo con los mejores hiperparámetros encontrados
model = RandomForestClassifier(
    n_estimators=best_params['n_estimators'],
    max_depth=best_params['max_depth'],
    min_samples_split=best_params['min_samples_split'],
    min_samples_leaf=best_params['min_samples_leaf'],
    random_state=42,
    n_jobs=-1
)

# Ajustar el modelo con los datos de entrenamiento
model.fit(X_train, y_train)

y_test_pred_prob = model.predict_proba(X_test)[:, 1]
test_results = pd.DataFrame({'y_true': y_test, 'y_pred_prob': y_test_pred_prob})

test_results.head()

Trial 0 - Score: 0.5000
Trial 1 - Score: 0.8909
Trial 2 - Score: 0.8841
Trial 3 - Score: 0.5000
Trial 4 - Score: 0.5000
Trial 5 - Score: 0.8879
Trial 6 - Score: 0.9010
Trial 7 - Score: 0.5000
Trial 8 - Score: 0.8950
Trial 9 - Score: 0.8907
Trial 10 - Score: 0.9201
Trial 11 - Score: 0.9191
Trial 12 - Score: 0.9224
Trial 13 - Score: 0.9138
Trial 14 - Score: 0.9033
Trial 15 - Score: 0.5000
Trial 16 - Score: 0.8990
Trial 17 - Score: 0.9106
Trial 18 - Score: 0.8926
Trial 19 - Score: 0.9065
Trial 20 - Score: 0.8809
Trial 21 - Score: 0.9193
Trial 22 - Score: 0.9182
Trial 23 - Score: 0.8882
Trial 24 - Score: 0.9121
Trial 25 - Score: 0.8977
Trial 26 - Score: 0.9079
Trial 27 - Score: 0.8902
Trial 28 - Score: 0.5000
Trial 29 - Score: 0.9131
Trial 30 - Score: 0.9115
Trial 31 - Score: 0.9133
Trial 32 - Score: 0.9003
Trial 33 - Score: 0.9142
Trial 34 - Score: 0.9078
Trial 35 - Score: 0.8891
Trial 36 - Score: 0.9006
Trial 37 - Score: 0.9155
Trial 38 - Score: 0.9144
Trial 39 - Score: 0.8919
Trial 40 -

Unnamed: 0,y_true,y_pred_prob
514,1,0.34165
825,1,0.939946
854,1,0.605229
804,1,0.938397
887,0,0.723924


In [13]:
fig = px.histogram(test_results, x='y_pred_prob', color='y_true', nbins=10, 
                   title='Distribución de Predicciones', 
                   labels={'y_pred_prob': 'Probabilidad Predicha'},
                   barmode='relative', opacity=0.7, width=600)

fig.show()

In [14]:
y_test_pred = (y_test_pred_prob > 0.5).astype(int)
print('F1 Score:', round(f1_score(y_test, y_test_pred), 4))

F1 Score: 0.8558


### **Optimizing custom metric**

In [None]:
# Definir función de objetivo para Optuna
def objective(trial):
    model = RandomForestClassifier(
        n_estimators=trial.suggest_int('n_estimators', 20, 50),
        max_depth=trial.suggest_int('max_depth', 3, 8),
        min_samples_split=trial.suggest_float('min_samples_split', 0.05, 1.0),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 2, 20),
        random_state=42,
        n_jobs=-1
    )
    model.fit(X_train, y_train)
    y_test_pred_prob = model.predict_proba(X_test)[:, 1]
    
    test_results = pd.DataFrame({'y_true': y_test, 'y_pred_prob': y_test_pred_prob})

    dist_test_1 = np.percentile(test_results[test_results['y_true'] == 1]['y_pred_prob'], 0.2) 
    - np.percentile(test_results[test_results['y_true'] == 0]['y_pred_prob'], 0.2)
    
    dist_test_2 = np.mean(test_results[test_results['y_true'] == 1]['y_pred_prob']) 
    - np.mean(test_results[test_results['y_true'] == 0]['y_pred_prob'])
    
    dist_test_3 = np.percentile(test_results[test_results['y_true'] == 1]['y_pred_prob'], 0.8)
    - np.percentile(test_results[test_results['y_true'] == 0]['y_pred_prob'], 0.8)
    
    std = test_results[test_results['y_true'] == 1]['y_pred_prob'].std() *
    test_results[test_results['y_true'] == 0]['y_pred_prob'].std()

    score = ((1 + dist_test_1) + (1 + dist_test_2) + (1 + dist_test_3)) / (1 + std)

    print(f'Trial {trial.number} - Score: {score:.4f}')

    return score

# Optimizar con Optuna
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

# Obtener los mejores parámetros de Optuna
best_params = study.best_params

# Mostrar resultados
print('Mejor modelo en  trial:', study.best_trial.number)
print(f'Parámetros:\n{best_params}')
print('Mejor Distancia:', study.best_value)

# Entrenar el modelo con los mejores hiperparámetros encontrados
model = RandomForestClassifier(
    n_estimators=best_params['n_estimators'],
    max_depth=best_params['max_depth'],
    min_samples_split=best_params['min_samples_split'],
    min_samples_leaf=best_params['min_samples_leaf'],
    random_state=42,
    n_jobs=-1
)

# Ajustar el modelo con los datos de entrenamiento
model.fit(X_train, y_train)

y_test_pred_prob = model.predict_proba(X_test)[:, 1]

test_results = pd.DataFrame({'y_true': y_test, 'y_pred_prob': y_test_pred_prob})

test_results.head()

Trial 0 - Score: 3.3818
Trial 1 - Score: 3.3387
Trial 2 - Score: 3.0000
Trial 3 - Score: 3.2905
Trial 4 - Score: 3.2841
Trial 5 - Score: 3.0000
Trial 6 - Score: 3.2905
Trial 7 - Score: 3.5523
Trial 8 - Score: 3.0000
Trial 9 - Score: 3.2353
Trial 10 - Score: 3.7075
Trial 11 - Score: 3.7285
Trial 12 - Score: 3.6410
Trial 13 - Score: 3.4230
Trial 14 - Score: 3.0000
Trial 15 - Score: 3.7983
Trial 16 - Score: 3.3411
Trial 17 - Score: 3.4595
Trial 18 - Score: 3.4912
Trial 19 - Score: 3.3833
Trial 20 - Score: 3.3983
Trial 21 - Score: 3.7456
Trial 22 - Score: 3.5788
Trial 23 - Score: 3.4667
Trial 24 - Score: 3.6565
Trial 25 - Score: 3.4330
Trial 26 - Score: 3.3022
Trial 27 - Score: 3.4312
Trial 28 - Score: 3.6513
Trial 29 - Score: 3.3856
Trial 30 - Score: 3.5892
Trial 31 - Score: 3.5875
Trial 32 - Score: 3.6709
Trial 33 - Score: 3.4379
Trial 34 - Score: 3.6115
Trial 35 - Score: 3.3434
Trial 36 - Score: 3.3088
Trial 37 - Score: 3.4408
Trial 38 - Score: 3.6624
Trial 39 - Score: 3.0000
Trial 40 -

Unnamed: 0,y_true,y_pred_prob
514,1,0.296744
825,1,0.878262
854,1,0.596712
804,1,0.87836
887,0,0.71863


In [16]:
fig = px.histogram(test_results, x='y_pred_prob', color='y_true', nbins=10, 
                   title='Distribución de Predicciones', 
                   labels={'y_pred_prob': 'Probabilidad Predicha'},
                   barmode='relative', opacity=0.7, width=600)

fig.show()

In [17]:
y_test_pred = (y_test_pred_prob > 0.5).astype(int)
print('F1 Score:', round(f1_score(y_test, y_test_pred), 4))

F1 Score: 0.8612


### **History of Optuna's trials**

In [19]:
import optuna.visualization as vis
vis.plot_optimization_history(study).show()
# vis.plot_param_importances(study).show()