In [276]:
from pathlib import Path
import numpy as np
import pandas as pd
from flaml import AutoML
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import pprint
from sklearn.model_selection import GridSearchCV, cross_val_score
import joblib


In [242]:
DATA_DIR = Path.cwd() / "datos"
datos = pd.read_csv(DATA_DIR / "02_datos_exploratorios_medical_noshow.csv")

## Selección de columnas

In [243]:
columnas_seleccionadas = [
"Gender",
"Age",
"ScheduledDay",
"AppointmentDay",
"Neighbourhood",
"Scholarship",
"Hipertension",
"Diabetes",
"Alcoholism",
"Handcap",
"SMS_received",
"No-show",
]

In [244]:
df_noshow = datos[columnas_seleccionadas]
df_noshow.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71959 entries, 0 to 71958
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Gender          71959 non-null  object
 1   Age             71959 non-null  int64 
 2   ScheduledDay    71959 non-null  object
 3   AppointmentDay  71959 non-null  object
 4   Neighbourhood   71959 non-null  object
 5   Scholarship     71959 non-null  int64 
 6   Hipertension    71959 non-null  int64 
 7   Diabetes        71959 non-null  int64 
 8   Alcoholism      71959 non-null  int64 
 9   Handcap         71959 non-null  int64 
 10  SMS_received    71959 non-null  int64 
 11  No-show         71959 non-null  object
dtypes: int64(7), object(5)
memory usage: 6.6+ MB


## Ajustes a los datos

In [245]:
df_noshow['Age'] = df_noshow['Age'].mask(df_noshow['Age'] < 0, np.nan)
df_noshow['No-show'] = df_noshow['No-show'].map({'No': 0, 'Yes': 1})

fechas = ['AppointmentDay', 'ScheduledDay']
for col in fechas:
    df_noshow[col] = pd.to_datetime(df_noshow[col], errors='coerce')  # convierte a datetime
    df_noshow[f"{col}_year"] = df_noshow[col].dt.year
    df_noshow[f"{col}_month"] = df_noshow[col].dt.month
    df_noshow[f"{col}_day"] = df_noshow[col].dt.day

df_noshow.drop(columns=fechas, inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_noshow['Age'] = df_noshow['Age'].mask(df_noshow['Age'] < 0, np.nan)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_noshow['No-show'] = df_noshow['No-show'].map({'No': 0, 'Yes': 1})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_noshow[col] = pd.to_datetime(df_noshow[col], errors='coerce') 

In [246]:
df_noshow.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71959 entries, 0 to 71958
Data columns (total 16 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Gender                71959 non-null  object
 1   Age                   71959 non-null  int64 
 2   Neighbourhood         71959 non-null  object
 3   Scholarship           71959 non-null  int64 
 4   Hipertension          71959 non-null  int64 
 5   Diabetes              71959 non-null  int64 
 6   Alcoholism            71959 non-null  int64 
 7   Handcap               71959 non-null  int64 
 8   SMS_received          71959 non-null  int64 
 9   No-show               71959 non-null  int64 
 10  AppointmentDay_year   71959 non-null  int32 
 11  AppointmentDay_month  71959 non-null  int32 
 12  AppointmentDay_day    71959 non-null  int32 
 13  ScheduledDay_year     71959 non-null  int32 
 14  ScheduledDay_month    71959 non-null  int32 
 15  ScheduledDay_day      71959 non-null

## Preparación de datos

In [247]:
# Separar características y objetivo
X = df_noshow.drop(columns=["No-show"])
y = df_noshow["No-show"].astype(int)

# División entrenamiento-prueba (80-20)
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Conjunto de entrenamiento", x_train.shape, y_train.shape)
print("Conjunto de testeo", x_test.shape, y_test.shape)

Conjunto de entrenamiento (57567, 15) (57567,)
Conjunto de testeo (14392, 15) (14392,)


## Pipelines

In [248]:
# Columnas por tipo
numericas_f = ["Age","Scholarship", "Hipertension", "Diabetes", "Alcoholism", "SMS_received"]
ordinales_f = ["Handcap"]
categoricas_f_solo_genero = ["Gender"]
categoricas_f_genero_neigubourhood = ["Gender","Neighbourhood"]

# Pipeline para variables numéricas: imputación con promedio  ---- Si bien no hay datos nulos en las variables numéricas del actual DF, en otros conjuntos de prueba (o datos sinteticos) se pueden presentar valores faltantes
t_numerico = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('Scaler',StandardScaler())
])

# Pipeline para variable ordinal: imputación y encoding ordinal ---- Si bien no hay datos nulos en las variables ordinales del actual DF, en otros conjuntos de prueba (o datos sinteticos) se pueden presentar valores faltantes
t_ordinal = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal', OrdinalEncoder(categories=[[0, 1, 2, 3, 4]]))
])

# Pipeline para variables categóricas: imputación + one-hot encoding ---- Si bien no hay datos nulos en las variables categoricas del actual DF, en otros conjuntos de prueba (o datos sinteticos) se pueden presentar valores faltantes
t_categoricas = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'))
])


## Preprocesadores

In [249]:
preprocessor_1 = ColumnTransformer(
    transformers=[
        ('num', t_numerico, numericas_f),
        ('ord', t_ordinal, ordinales_f),
        ('cat', t_categoricas, categoricas_f_solo_genero)
    ])

preprocessor_1

0,1,2
,transformers,"[('num', ...), ('ord', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,"[[0, 1, ...]]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


In [250]:
# Entrenar el Pipeline 
preprocessor_1.fit(x_test)
feature_names = preprocessor_1.get_feature_names_out()

x_test_transformed = preprocessor_1.transform(x_test)
x_test_transformed = pd.DataFrame(x_test_transformed, columns=feature_names)
x_test_transformed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14392 entries, 0 to 14391
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   num__Age           14392 non-null  float64
 1   num__Scholarship   14392 non-null  float64
 2   num__Hipertension  14392 non-null  float64
 3   num__Diabetes      14392 non-null  float64
 4   num__Alcoholism    14392 non-null  float64
 5   num__SMS_received  14392 non-null  float64
 6   ord__Handcap       14392 non-null  float64
 7   cat__Gender_M      14392 non-null  float64
dtypes: float64(8)
memory usage: 899.6 KB


In [251]:
preprocessor_2 = ColumnTransformer(
    transformers=[
        ('num', t_numerico, numericas_f),
        ('ord', t_ordinal, ordinales_f),
        ('cat', t_categoricas, categoricas_f_genero_neigubourhood)
    ])

preprocessor_2

0,1,2
,transformers,"[('num', ...), ('ord', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,"[[0, 1, ...]]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


In [252]:
# Entrenar el Pipeline 
preprocessor_2.fit(x_test)
feature_names = preprocessor_2.get_feature_names_out()

x_test_transformed = preprocessor_2.transform(x_test)
x_test_transformed = pd.DataFrame(x_test_transformed, columns=feature_names)
x_test_transformed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14392 entries, 0 to 14391
Data columns (total 85 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   num__Age                                14392 non-null  float64
 1   num__Scholarship                        14392 non-null  float64
 2   num__Hipertension                       14392 non-null  float64
 3   num__Diabetes                           14392 non-null  float64
 4   num__Alcoholism                         14392 non-null  float64
 5   num__SMS_received                       14392 non-null  float64
 6   ord__Handcap                            14392 non-null  float64
 7   cat__Gender_M                           14392 non-null  float64
 8   cat__Neighbourhood_ANDORINHAS           14392 non-null  float64
 9   cat__Neighbourhood_ANTÔNIO HONÓRIO      14392 non-null  float64
 10  cat__Neighbourhood_ARIOVALDO FAVALESSA  14392 non-null  fl

In [253]:
def resumen_clasificación(y_test, y_pred):
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc = roc_auc_score(y_test, y_pred)

    return {"accurancy": acc,
            "precision": prec,
            "recall": recall,
            "f1": f1,
            "roc": roc}

In [254]:
# Modelos definidos a probar -con los hiperparametros que tienen por defecto
modelos = {
    "RandomForest": RandomForestClassifier(random_state=42),
    "lightgbm": LGBMClassifier(random_state=42, objective='binary'),
    "xgboost": XGBClassifier(random_state=42,),
    "Extra_tree": ExtraTreesClassifier(random_state=42)
}

In [255]:
# Crear pipelines completos para cada combinación de modelo y preprocesador
pipelines = {}

for modelo_nombre, modelo in modelos.items():
    # Pipeline 1 con OneHotEncoder solo para el género
    pipelines[f"{modelo_nombre}_SG"] = Pipeline([
        ("preprocessing", preprocessor_1),
        ("classifier", modelo)
    ])
    # Pipeline con OneHotEncoder para género y Neighbourhood
    pipelines[f"{modelo_nombre}_GN"] = Pipeline([
        ("preprocessing", preprocessor_2),
        ("classifier", modelo)
    ])

In [256]:
pipelines

{'RandomForest_SG': Pipeline(steps=[('preprocessing',
                  ColumnTransformer(transformers=[('num',
                                                   Pipeline(steps=[('imputer',
                                                                    SimpleImputer()),
                                                                   ('Scaler',
                                                                    StandardScaler())]),
                                                   ['Age', 'Scholarship',
                                                    'Hipertension', 'Diabetes',
                                                    'Alcoholism',
                                                    'SMS_received']),
                                                  ('ord',
                                                   Pipeline(steps=[('imputer',
                                                                    SimpleImputer(strategy='most_frequent')),
                   

In [257]:
# Entrenar y evaluar cada pipeline
resultados = {}

for nombre_pipeline, pipeline in pipelines.items():
    pipeline.fit(x_train, y_train)
    y_pred = pipeline.predict(x_test)
    resultados[nombre_pipeline] = resumen_clasificación(y_test, y_pred)

In [258]:
print("Resumen de Métricas para Cada Pipeline:")
pprint.pprint(resultados)

Resumen de Métricas para Cada Pipeline:
{'Extra_tree_GN': {'accurancy': 0.6596720400222346,
                   'f1': 0.2553967771359076,
                   'precision': 0.3395311236863379,
                   'recall': 0.2046783625730994,
                   'roc': 0.522926273043937},
 'Extra_tree_SG': {'accurancy': 0.7099777654252363,
                   'f1': 0.05693628558517849,
                   'precision': 0.391304347826087,
                   'recall': 0.03070175438596491,
                   'roc': 0.5058252162287522},
 'RandomForest_GN': {'accurancy': 0.6545997776542524,
                     'f1': 0.27568118898440913,
                     'precision': 0.3428778542950344,
                     'recall': 0.23050682261208577,
                     'roc': 0.5271410473869138},
 'RandomForest_SG': {'accurancy': 0.7091439688715954,
                     'f1': 0.06604194556001784,
                     'precision': 0.3915343915343915,
                     'recall': 0.036062378167641324,
    

In [259]:
df_resultados = pd.DataFrame(resultados).T
df_resultados_sorted = df_resultados.sort_values(by="precision", ascending=False)
df_resultados_sorted

Unnamed: 0,accurancy,precision,recall,f1,roc
lightgbm_GN,0.715467,0.555556,0.010965,0.021505,0.503733
lightgbm_SG,0.715189,0.540984,0.008041,0.015846,0.50266
xgboost_SG,0.715258,0.532609,0.01194,0.023356,0.50388
xgboost_GN,0.71366,0.468635,0.030945,0.058057,0.508474
RandomForest_SG,0.709144,0.391534,0.036062,0.066042,0.506853
Extra_tree_SG,0.709978,0.391304,0.030702,0.056936,0.505825
RandomForest_GN,0.6546,0.342878,0.230507,0.275681,0.527141
Extra_tree_GN,0.659672,0.339531,0.204678,0.255397,0.522926


En el contexto de este problema las clases de la variable objetivo están desbalanceadas --> Hay más personas que sí asistieron (79.80%) de las que no asistieron (20.19%). Por lo anterior, el accurancy no es una métrica ideal para este problema.
En cambio, se ha decidio optimizar la precision dado que puede ser de mayor interés conocer cuantos realmente NO asisten para mejorar la gestión de los costos y mejorar la planeación de las citas. En este sentido, el mejor modelo es una regresión logística con el preprocesador 2

## Validación cruzada

In [272]:
# DataFrame para guardar los resultados de cada fold
df_cv_results = pd.DataFrame(columns=["pipeline", "fold", "precision"])

# Número de folds
cv_folds = 5

for pipeline_name, pipeline_obj in pipelines.items():
    # cross_val_score entrena y evalúa en 5 folds
    scores = cross_val_score(pipeline_obj, x_train, y_train, cv=cv_folds, scoring="precision")

    # Crear un DataFrame temporal con la información de cada fold
    temp_df = pd.DataFrame({
        "pipeline": [pipeline_name]*cv_folds,
        "fold": list(range(1, cv_folds+1)),
        "precision": scores
    })

    # Concatenar al DataFrame global
    df_cv_results = pd.concat([df_cv_results, temp_df], ignore_index=True)


In [None]:
df_cv_results.head(40)

Unnamed: 0,pipeline,fold,precision
0,RandomForest_SG,1,0.394495
1,RandomForest_SG,2,0.419753
2,RandomForest_SG,3,0.335731
3,RandomForest_SG,4,0.398671
4,RandomForest_SG,5,0.411215
5,RandomForest_GN,1,0.33968
6,RandomForest_GN,2,0.360792
7,RandomForest_GN,3,0.336043
8,RandomForest_GN,4,0.348647
9,RandomForest_GN,5,0.35029


In [None]:
# Creamos el pipeline que mejor desempeño tuvo y el clasificador (LGBM)
pipeline_GNM = Pipeline([
    ("preprocessing", preprocessor_2),
    ("classifier", LGBMClassifier(random_state=42))
])

# Grilla de hiperparámetros adaptada a LightGBM
param_grid = {
    "classifier__n_estimators": [100, 300, 500],         # número de árboles
    "classifier__learning_rate": [0.01, 0.05, 0.1],      # tasa de aprendizaje
    "classifier__num_leaves": [10, 15, 40],              # número de hojas por árbol
    "classifier__max_depth": [5, 10, 20],            # profundidad máxima
}

In [274]:
# Configuramos GridSearchCV para evaluar con 4 folds y usando la métrica "precision"
grid_search = GridSearchCV(
    pipeline_GNM,
    param_grid,
    cv=4,
    scoring="precision",
    n_jobs=-1 
)

In [275]:
# Ejecutamos el grid search usando los datos de entrenamiento
grid_search.fit(x_train, y_train)

# Mostramos los mejores parámetros y el mejor accuracy obtenido en validación cruzada
print("Mejores parámetros:", grid_search.best_params_)
print("Mejor precisión:", grid_search.best_score_)

Mejores parámetros: {'classifier__learning_rate': 0.01, 'classifier__max_depth': 10, 'classifier__n_estimators': 300, 'classifier__num_leaves': 10}
Mejor precisión: 0.7416719375922411


## Guardar el modelo

In [None]:
DATA_DIR = Path.cwd() / "Modelos"

mejor_modelo = grid_search.best_estimator_
joblib.dump(mejor_modelo, DATA_DIR / "Classification_medical_no_show-LGBM.joblib")