In [47]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, roc_auc_score

In [48]:
# Cargar el conjunto de datos de Breast Cancer
df = pd.read_csv('../data/processed/df_encoded.csv', sep=';')


In [49]:
# Identificar columnas categóricas
categorical_cols = df.select_dtypes(include=['object']).columns

# Codificación de variables categóricas usando One-Hot Encoding
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Verificar las primeras filas del dataset codificado
df_encoded.head()

Unnamed: 0,DIAS_HASTA_INICIO_TRATAMIENTO,SOBREVIVE,DIAS_DESDE_NACIMIENTO_A_DIAGNO,DIAS_DESDE_INGRESO_A_DIAGNO,DIAS_DESDE_DIAGNO_A_COMITE,DIAS_DESDE_TOM_MUESTRA_A_DIAGNO,DIAS_DESDE_DIAGNO_TRATAMIENTO_2,DIAS_DESDE_TRATAMIENTO_1_A_TRATAMIENTO_2,DIAS_TRATAMIENTO_1,DIAS_TRATAMIENTO_2,...,N_N3,N_NX,M_M,M_M0,M_M0|P,M_M0|Y,M_M1,M_M1b,M_MX,M_MX|Y
0,468,0.0,29509,0,468.0,0.0,,,3260.0,,...,False,False,False,True,False,False,False,False,False,False
1,97,0.0,27405,0,34.0,0.0,,,79.0,,...,False,True,False,True,False,False,False,False,False,False
2,65,1.0,22430,41,2870.0,0.0,-2984.0,2919.0,0.0,55.0,...,False,False,False,True,False,False,False,False,False,False
3,55,0.0,25558,36,55.0,15.0,,,189.0,,...,False,False,False,True,False,False,False,False,False,False
4,421,0.0,25984,6,441.0,0.0,-505.0,-1036.0,1120.0,61.0,...,False,False,False,True,False,False,False,False,False,False


In [50]:
from sklearn.preprocessing import StandardScaler

# Identificar columnas numéricas
numeric_features = df_encoded.select_dtypes(include=['float64', 'int64']).columns

# Normalizar los datos numéricos
scaler = StandardScaler()
df_encoded[numeric_features] = scaler.fit_transform(df_encoded[numeric_features])

# Verificar las primeras filas del dataset normalizado
df_encoded.head()

Unnamed: 0,DIAS_HASTA_INICIO_TRATAMIENTO,SOBREVIVE,DIAS_DESDE_NACIMIENTO_A_DIAGNO,DIAS_DESDE_INGRESO_A_DIAGNO,DIAS_DESDE_DIAGNO_A_COMITE,DIAS_DESDE_TOM_MUESTRA_A_DIAGNO,DIAS_DESDE_DIAGNO_TRATAMIENTO_2,DIAS_DESDE_TRATAMIENTO_1_A_TRATAMIENTO_2,DIAS_TRATAMIENTO_1,DIAS_TRATAMIENTO_2,...,N_N3,N_NX,M_M,M_M0,M_M0|P,M_M0|Y,M_M1,M_M1b,M_MX,M_MX|Y
0,0.622586,-2.170388,1.525265,0.45804,0.645291,-0.12467,,,9.428965,,...,False,False,False,True,False,False,False,False,False,False
1,-0.502385,-2.170388,0.773702,0.45804,-0.422637,-0.12467,,,-0.146071,,...,False,True,False,True,False,False,False,False,False,False
2,-0.599418,0.460747,-1.003403,0.579836,6.555808,-0.12467,-6.59762,6.59319,-0.383866,-0.337514,...,False,False,False,True,False,False,False,False,False,False
3,-0.62974,-2.170388,0.11394,0.564983,-0.370963,0.180652,,,0.185037,,...,False,False,False,True,False,False,False,False,False,False
4,0.48007,-2.170388,0.266111,0.475864,0.578853,-0.12467,-0.369753,-2.105464,2.987413,-0.319513,...,False,False,False,True,False,False,False,False,False,False


In [52]:
#Separación de variables independientes y dependiente
y = df["SOBREVIVE"]
X = df_encoded.drop("SOBREVIVE", axis=1)

In [53]:
#División de datos en train y test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [54]:
# Entrenamiento del modelo XGBoost
modelo = XGBClassifier()
modelo.fit(X_train, y_train)

In [57]:
# Evaluación del Modelo XGBoost

from sklearn.metrics import f1_score, recall_score

y_pred = modelo.predict(X_test)

precision = accuracy_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print("Precisión:", precision)
print("AUC-ROC:", auc)
print("F1:", f1)
print("Recall:", recall)

Precisión: 0.8960302457466919
AUC-ROC: 0.7844256518675123
F1: 0.9377123442808608
Recall: 0.9627906976744186
