In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sys
sys.path.append("..")

from utils.funciones_toolbox_ml_final import *
from utils.modulos import *

from sklearn.preprocessing import MinMaxScaler, StandardScaler, PowerTransformer,OneHotEncoder, FunctionTransformer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import ConfusionMatrixDisplay, classification_report, accuracy_score, recall_score, balanced_accuracy_score, make_scorer
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier


## Problema de negocio

- Identificar a los pacientes que tienen Síndrome Metabólico

## Objetivo técnico

- Crear un modelo supervisado de clasificación

## Métricas

- La métrica a utilizar será el recall de la clase 1, es decir de los pacientes que tienen Síndrome Metábolico

## Obtención de los datos

In [2]:
df = pd.read_csv("../data/MetabolicSyndrome.csv")
df.head()

Unnamed: 0,seqn,Age,Sex,Marital,Income,Race,WaistCirc,BMI,Albuminuria,UrAlbCr,UricAcid,BloodGlucose,HDL,Triglycerides,MetabolicSyndrome
0,62161,22,Male,Single,8200.0,White,81.0,23.3,0,3.88,4.9,92,41,84,0
1,62164,44,Female,Married,4500.0,White,80.1,23.2,0,8.55,4.5,82,28,56,0
2,62169,21,Male,Single,800.0,Asian,69.6,20.1,0,5.07,5.4,107,43,78,0
3,62172,43,Female,Single,2000.0,Black,120.4,33.3,0,5.22,5.0,104,73,141,0
4,62177,51,Male,Married,,Asian,81.1,20.1,0,8.13,5.0,95,43,126,0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2401 entries, 0 to 2400
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   seqn               2401 non-null   int64  
 1   Age                2401 non-null   int64  
 2   Sex                2401 non-null   object 
 3   Marital            2193 non-null   object 
 4   Income             2284 non-null   float64
 5   Race               2401 non-null   object 
 6   WaistCirc          2316 non-null   float64
 7   BMI                2375 non-null   float64
 8   Albuminuria        2401 non-null   int64  
 9   UrAlbCr            2401 non-null   float64
 10  UricAcid           2401 non-null   float64
 11  BloodGlucose       2401 non-null   int64  
 12  HDL                2401 non-null   int64  
 13  Triglycerides      2401 non-null   int64  
 14  MetabolicSyndrome  2401 non-null   int64  
dtypes: float64(5), int64(7), object(3)
memory usage: 281.5+ KB


In [4]:
describe_df(df)

Unnamed: 0,seqn,Age,Sex,Marital,Income,Race,WaistCirc,BMI,Albuminuria,UrAlbCr,UricAcid,BloodGlucose,HDL,Triglycerides,MetabolicSyndrome
Tipo,int64,int64,object,object,float64,object,float64,float64,int64,float64,float64,int64,int64,int64,int64
Porcentaje_Nulos,0.0,0.0,0.0,8.663057,4.87297,0.0,3.540192,1.082882,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Valores_Unicos,2401,61,2,5,14,6,635,322,3,1383,88,187,93,345,2
Porcentaje_Cardinalidad,100.0,2.540608,0.083299,0.208247,0.58309,0.249896,26.447314,13.411079,0.124948,57.601,3.66514,7.788421,3.873386,14.369013,0.083299


In [5]:
tabla_descriptiva = tipifica_variables(df,10,30)
tabla_descriptiva

Unnamed: 0,nombre_variable,tipo_sugerido
0,seqn,Numerica Continua
1,Age,Numerica Discreta
2,Sex,Binaria
3,Marital,Categorica
4,Income,Numerica Discreta
5,Race,Categorica
6,WaistCirc,Numerica Discreta
7,BMI,Numerica Discreta
8,Albuminuria,Categorica
9,UrAlbCr,Numerica Continua


In [6]:
descripcion = {"Descripcion":["Índice único", "Edad", "Género", 
                              "Estado civil","Ingresos", "Raza",
                              "Circunferencia Abdominal", "Índice de Masa Corporal",
                              "Medida de Albumina en orina", "Relación Albumina-Creatitina en orina",
                              "Ácido Úrico", "Azúcar en sangre", 
                              "Niveles de colesterol de lipoproteínas de alta densidad('colesterol bueno')",
                              "Triglicéridos", "Indica la presencia (1) o ausencia (0) de síndrome metabólico"
                              ]}
df_descripcion = pd.DataFrame(descripcion)

tabla_descriptiva = pd.concat([tabla_descriptiva, df_descripcion], axis=1)

In [7]:
tabla_descriptiva
tabla_descriptiva

Unnamed: 0,nombre_variable,tipo_sugerido,Descripcion
0,seqn,Numerica Continua,Índice único
1,Age,Numerica Discreta,Edad
2,Sex,Binaria,Género
3,Marital,Categorica,Estado civil
4,Income,Numerica Discreta,Ingresos
5,Race,Categorica,Raza
6,WaistCirc,Numerica Discreta,Circunferencia Abdominal
7,BMI,Numerica Discreta,Índice de Masa Corporal
8,Albuminuria,Categorica,Medida de Albumina en orina
9,UrAlbCr,Numerica Continua,Relación Albumina-Creatitina en orina


In [8]:
# Verificamos que no hayan duplicados
print(df.duplicated().sum())

0


- Voy a eliminar las columnas que no me aportan nada a la hora de realizar el modelo, "seqn" porque es un índice de los pacientes y tiene una cardinalidad del 100%, y elimino "UrAlbCr" porque aporta la misma información que Albuminuria

## Mini EDA

#### Definimos el target

- El target está desbalanceado, hay mayor concentración de pacientes hacía la clase 0 es decir los pacientes sanos, tendré que tener éste factor en cuenta a la hora de realizar el modelo porque lo que nos interesa conseguir es la mayor precisión en el grupo 1.

## Pipeline

In [9]:
target = 'MetabolicSyndrome' 

X = df.drop(target, axis=1)
y = df[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=73, stratify=y)

In [10]:
def categorize_BMI(bmi_list):
    df = pd.DataFrame(bmi_list, columns=['BMI'])
    bins = [-np.inf, 18.5, 24.9, 29.9, np.inf]
    labels = ['Bajo peso', 'Normal', 'Sobrepeso', 'Obesidad']
    df['BMI'] = pd.cut(df['BMI'], bins=bins, labels=labels)  # Reemplaza la columna 'BMI'
    return df[['BMI']]  # Retorna solo la columna transformada

def categorize_BloodGlucose(blood_glucose_list):
    df = pd.DataFrame(blood_glucose_list, columns=['BloodGlucose'])
    bins = [-np.inf, 99, 126, np.inf]
    labels = ['Normal', 'Prediabetes', 'Diabetes']
    df['BloodGlucose'] = pd.cut(df['BloodGlucose'], bins=bins, labels=labels)  # Reemplaza 'BloodGlucose'
    return df[['BloodGlucose']]  # Retorna solo la columna transformada

def categorize_Triglycerides(data):
    df = pd.DataFrame(data, columns=['Age', 'Triglycerides'])
    def categorize_row(row):
        age, triglycerides = row
        if 10 <= age <= 19:
            if triglycerides < 90:
                return 'Nivel normal'
            elif triglycerides < 150:
                return 'Niveles ligeramente altos'
            else:
                return 'Niveles altos'
        else:
            if triglycerides < 150:
                return 'Nivel normal'
            elif triglycerides < 200:
                return 'Niveles ligeramente altos'
            elif triglycerides < 500:
                return 'Niveles altos'
            else:
                return 'Niveles muy altos'

    df['Triglycerides'] = df[['Age', 'Triglycerides']].apply(categorize_row, axis=1)
    return df[['Triglycerides']]

# Función para categorizar el HDL
def categorize_HDL(data):
    df = pd.DataFrame(data, columns=['Sex', 'HDL'])
    def categorize_row(row):
        sex, hdl = row
        return 'Valor_Bajo' if (sex == 'Male' and hdl < 40) or (sex == 'Female' and hdl < 50) else 'Normal'

    df['HDL'] = df[['Sex', 'HDL']].apply(categorize_row, axis=1)
    return df[['HDL']]

# Función para categorizar la circunferencia de cintura (WaistCirc)
def categorize_WaistCirc(data):
    df = pd.DataFrame(data, columns=['Sex', 'WaistCirc'])
    def categorize_row(row):
        sex, waistcirc = row
        return 'Riesgo Elevado' if (sex == 'Female' and waistcirc > 88) or (sex == 'Male' and waistcirc > 102) else 'Normal'
    
    df['WaistCirc'] = df[['Sex', 'WaistCirc']].apply(categorize_row, axis=1)
    return df[['WaistCirc']]


--------

In [11]:
# Definición de los transformadores para cada columna
bmi_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('func_transform', FunctionTransformer(categorize_BMI, validate=False)),
    ('one_hot', OneHotEncoder())
])

blood_glucose_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('func_transform', FunctionTransformer(categorize_BloodGlucose)),
    ('one_hot', OneHotEncoder())
])

triglycerides_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('func_transform', FunctionTransformer(categorize_Triglycerides)),
    ('one_hot', OneHotEncoder())
])

hdl_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('func_transform', FunctionTransformer(categorize_HDL)),
    ('one_hot', OneHotEncoder())
])

waist_circ_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('func_transform', FunctionTransformer(categorize_WaistCirc)),
    ('one_hot', OneHotEncoder())
])

income_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('power_transform', PowerTransformer())
])

age_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('power_transform', PowerTransformer())
])

sex_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('one_hot', OneHotEncoder())
])

marital_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('one_hot', OneHotEncoder())
])

race_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('one_hot', OneHotEncoder())
])

albuminuria_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('power_transform', PowerTransformer())
])

uricacid_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('power_transform', PowerTransformer())
])

# ColumnTransformer para aplicar las transformaciones a las columnas específicas
preprocessor = ColumnTransformer(
    transformers=[
        ('bmi', bmi_transformer, ['BMI']),
        ('blood_glucose', blood_glucose_transformer, ['BloodGlucose']),
        ('triglycerides', triglycerides_transformer, ['Age','Triglycerides']),
        ('hdl', hdl_transformer, ['Sex','HDL']),
        ('waist_circ', waist_circ_transformer, ['Sex','WaistCirc']),
        ('income', income_transformer, ['Income']),
        ('age', age_transformer, ['Age']),
        ('sex', sex_transformer, ['Sex']),
        ('marital', marital_transformer, ['Marital']),
        ('race', race_transformer, ['Race']),
        ('albuminuria', albuminuria_transformer, ['Albuminuria']),
        ('uricacid', uricacid_transformer, ['UricAcid']),
    ], remainder='drop'
)

In [12]:
modelos = {
    'RandomForest': RandomForestClassifier(random_state=42, class_weight="balanced"),
    'XGBoost': XGBClassifier(verbosity=0, random_state=42, scale_pos_weight=70/30),
    'LightGBM': LGBMClassifier(random_state=42, verbose=-100, class_weight='balanced'),
    'LogisticRegression': LogisticRegression(max_iter=10000, class_weight='balanced'),
    'CatBoost': CatBoostClassifier(random_state=42, verbose=False, auto_class_weights='Balanced'),
    'DecisionTree': DecisionTreeClassifier(random_state=42, class_weight="balanced"),
    'SVC': SVC(random_state=42, class_weight='balanced'),
    'KNeighbors': KNeighborsClassifier(n_neighbors=4)
}

In [13]:
resultados = {}

for nombre, modelo in modelos.items():
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', modelo)
    ])

    scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='recall')
    media_scores = scores.mean()
    resultados[nombre] = [media_scores]

df_resultados = pd.DataFrame(resultados, index=['Media Recall']).T.sort_values(by='Media Recall', ascending=False)


In [14]:
df_resultados

Unnamed: 0,Media Recall
SVC,0.942112
LogisticRegression,0.925364
CatBoost,0.891892
LightGBM,0.850821
XGBoost,0.84926
RandomForest,0.82496
DecisionTree,0.789949
KNeighbors,0.601238


In [15]:
param_grid_rf = {
    "classifier__n_estimators": [50,100,200],
    "classifier__max_depth": [None,20,30],
    "classifier__min_samples_split": [2, 10, 20],
    "classifier__min_samples_leaf": [1,5,10],
    "classifier__max_features": ["sqrt","log2",None],
    "classifier__class_weight": ["balanced", None],
}
param_grid_xgb= {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__learning_rate': [0.01, 0.05, 0.1],
    'classifier__max_depth': [3, 5, 7],
    'classifier__subsample': [0.6, 0.8, 1.0],
    'classifier__colsample_bytree': [0.6, 0.8, 1.0]
}
param_grid_lgb= {
    "classifier__max_depth": [-1,5,10],
    "classifier__num_leaves": [31, 50],
    "classifier__learning_rate": [0.1, 0.01],
    "classifier__n_estimators": [100, 200],
    "classifier__class_weight": ["balanced", None], 
    "classifier__min_child_samples": [20, 30],
    "classifier__subsample": [0.8, 1.0],
    "classifier__colsample_bytree": [0.8, 1.0]
}
param_grid_tree = {
    "classifier__criterion": ["gini","entropy"],
    "classifier__splitter": ["best", "random"],
    "classifier__max_depth": [None,20,30,40],
    "classifier__min_samples_split":[2,10,20],
    "classifier__min_samples_leaf":[1,5,10],
    "classifier__max_features": ["sqrt","log2",None],
    "classifier__class_weight": ["balanced", None]
}
param_grid_lg= {
    "classifier__C":[0.01, 0.1, 1, 10],
    "classifier__max_iter":[1000,2000,5000],
    "classifier__class_weight":["balanced",None]
}                
param_grid_knn= {
    "classifier__n_neighbors":[3,4,5],
    "classifier__weights":['uniform', 'distance'],
    "classifier__metric":["manhattan","euclidean","chebyshev"]
}
param_grid_cat= {
    "classifier__iterations": [100, 300], 
    "classifier__learning_rate": [0.01, 0.05, 0.1],  
    "classifier__depth": [4, 6, 8],  
    "classifier__l2_leaf_reg": [1, 3, 5],  
    "classifier__bagging_temperature": [0, 1, 10],
    "classifier__auto_class_weights": ["Balanced"]
}
param_grid_svc= {
    "classifier__C":[0.01, 0.1, 1, 10, 100],
    "classifier__kernel":['linear', 'poly', 'rbf', 'sigmoid'],
    "classifier__gamma":['scale', 'auto'],
    "classifier__class_weight":["balanced",None]
}

In [16]:
modelos = {
    'Random_Forest': (RandomForestClassifier(random_state=42), param_grid_rf),
    'XGBoost': (XGBClassifier(verbosity=0, random_state=42, scale_pos_weight=70/30), param_grid_xgb),
    'LightGBM':(LGBMClassifier(random_state= 42, verbose = -100), param_grid_lgb),
    'DecisionTree':(DecisionTreeClassifier(random_state= 42), param_grid_tree),
    'LogisticRegression':(LogisticRegression(random_state=42), param_grid_lg),
    'KNeighbors':(KNeighborsClassifier(), param_grid_knn),
    'CatBoost':(CatBoostClassifier(random_state= 42, verbose= False), param_grid_cat),
    'SVC':(SVC(random_state= 42), param_grid_svc),
}

resultados = {}

for nombre, (modelo, param_grid) in modelos.items():
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', modelo)
    ])
    
    grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=5, scoring="recall")
    grid_search.fit(X_train, y_train)
    resultados[nombre] = grid_search.best_score_

# Crear DataFrame para los resultados
df_resultadosGS = pd.DataFrame.from_dict(resultados, orient='index', columns=['Media Recall'])
df_resultadosGS.sort_values(by='Media Recall', ascending=False, inplace=True)
df_resultadosGS


Unnamed: 0,Media Recall
XGBoost,0.971039
SVC,0.961959
CatBoost,0.9452
LightGBM,0.942135
LogisticRegression,0.937578
Random_Forest,0.931483
DecisionTree,0.919269
KNeighbors,0.774705
