# Importación de librerías  

In [72]:
import pandas as pd
import numpy as np
from IPython.display import Image
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn import metrics
from sklearn.model_selection import cross_validate 

import warnings
warnings.filterwarnings('ignore')

# Cargar datos

In [45]:
#data = pd.read_csv("../data/clean/personas_limpio.csv")
data = pd.read_csv("https://raw.githubusercontent.com/paulguz261/MIAD_20242_proyecto_despliegue_aplicaciones/main/data/clean/personas_limpio.csv", sep=",")
data.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,28.893237,never smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [21]:
# Dado que las variables binarias quedaron como númericas, se procede a convertirlas de nuevo
def convert_to_categorical(df, columns):
    """
    Convierte las columnas especificadas de un DataFrame a tipo categórico.

    Parámetros:
    df (pd.DataFrame): El DataFrame en el que se realizarán las conversiones.
    columns (list): Lista de nombres de columnas a convertir a tipo categórico.

    Retorna:
    pd.DataFrame: El DataFrame con las columnas especificadas convertidas a tipo categórico.
    """
    for column in columns:
        if column in df.columns:
            df[column] = pd.Categorical(df[column])
        else:
            print(f"Columna '{column}' no encontrada en el DataFrame.")
    return df

categorical_columns = ['hypertension', 'heart_disease','stroke']
data = convert_to_categorical(data, categorical_columns)

In [22]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   gender             5110 non-null   object  
 1   age                5110 non-null   float64 
 2   hypertension       5110 non-null   category
 3   heart_disease      5110 non-null   category
 4   ever_married       5110 non-null   object  
 5   work_type          5110 non-null   object  
 6   Residence_type     5110 non-null   object  
 7   avg_glucose_level  5110 non-null   float64 
 8   bmi                5110 non-null   float64 
 9   smoking_status     5110 non-null   object  
 10  stroke             5110 non-null   category
dtypes: category(3), float64(3), object(5)
memory usage: 334.8+ KB


# Selección de variables

In [49]:
# Variables seleccionadas para la aplicación del modelo
FEATURES = [
    'gender', 
    'age', 
    'hypertension', 
    'heart_disease', 
    'ever_married',
    'work_type', 
    'Residence_type', 
    'avg_glucose_level', 
    'bmi',
    'smoking_status'
    ]

# Procesamiento de los datos

In [23]:
# Conteo de registros por clase
data['stroke'].value_counts()

stroke
0    4861
1     249
Name: count, dtype: int64

In [47]:
# Definición de las variables predictoras y a predecir
X = data.drop(columns=['stroke'])  # Eliminar variable a predecir
y = data['stroke']

In [50]:
# Seleccionar solo variables elegidas
X_new = X[FEATURES]

In [51]:
# Conversión de variables categoricas a numericas con One-Hot Encoding
data_dummies = pd.get_dummies(X_new,columns=X_new.select_dtypes(include=['object','category']).columns.to_list())


In [61]:
# Aplicar técnica de balanceo de clases
# Oversampling con SMOTE
undersample = SMOTE(sampling_strategy=0.5)  # Reduce la clase mayoritaria a una proporción de 0.5
X_resampled, y_resampled = undersample.fit_resample(data_dummies, y)

In [62]:
# Conteo de clases después de aplicar balanceo de clases
y_resampled.value_counts()

stroke
0    4861
1    2430
Name: count, dtype: int64

In [63]:
# Estandarización de las variables
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_resampled)

In [64]:
# Reducción de dimensionalidad aplicando PCA
pca = PCA(n_components=0.95)  # Conservar el 95% de la varianza
X_pca = pca.fit_transform(X_scaled)

print(f"Dimensiones antes de PCA: {X_scaled.shape}")
print(f"Dimensiones después de PCA: {X_pca.shape}")

Dimensiones antes de PCA: (7291, 21)
Dimensiones después de PCA: (7291, 16)


# Modelo final

In [65]:
# Separación de variables predictoras (X) y variable de interés (y) en set de entrenamiento y test usandola función train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_pca, y_resampled, test_size=0.33, random_state=40)

In [69]:
# Modelo con los mejores hiperparametros encontrados
clf = XGBClassifier(objective='binary:logistic',
                    n_estimators=312,
                    max_depth=9,
                    learning_rate=0.078,
                    subsample=0.909,
                    colsample_bytree=0.895,
                    min_child_weight=4,
                    reg_lambda=9,
                    reg_alpha=6)

In [70]:
# Entrenamiento (fit) y desempeño del modelo XGBClassifier
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
f1 = metrics.f1_score(y_pred, y_test.values)
acc = metrics.accuracy_score(y_pred, y_test.values)
conf_matrix = metrics.confusion_matrix(y_pred, y_test.values)
prec = metrics.precision_score(y_pred, y_test.values)
recall = metrics.recall_score(y_pred, y_test.values)
roc = metrics.roc_auc_score(y_pred, y_test.values)

In [71]:
print('Accuracy Score', acc)
print('Precision', prec)
print('Recall', recall)
print('F1-Score', f1)
print('ROC Score', roc)
print(conf_matrix)

Accuracy Score 0.9597008724553386
Precision 0.9078455790784558
Recall 0.9694148936170213
F1-Score 0.9376205787781351
ROC Score 0.9623509513402327
[[1581   74]
 [  23  729]]


In [73]:
# Validación cruzada para validar que el modelo no este sobreajustado
cv_results = cross_validate(clf, X_pca, y_resampled, scoring = ('f1', 'accuracy', 'roc_auc'), cv = 8)
sorted(cv_results.keys())

['fit_time', 'score_time', 'test_accuracy', 'test_f1', 'test_roc_auc']

In [74]:
cv_results['test_f1'] 

array([0.3423913 , 0.97560976, 0.9822294 , 0.97553018, 0.99507389,
       0.98531811, 0.99176277, 0.96710526])

In [75]:
cv_results['test_accuracy'] 

array([0.73464912, 0.98355263, 0.9879386 , 0.98353458, 0.99670692,
       0.99012075, 0.99451153, 0.9780461 ])