In [1]:
import pandas as pd
import numpy as np
from google.cloud import bigquery
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from time import time

from sklearn.model_selection import train_test_split

# Metricas
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sksurv.metrics import concordance_index_censored

# Modelos
from sksurv.ensemble import RandomSurvivalForest
from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings("ignore")

## Parametros

In [2]:
client_bq = bigquery.Client()

In [3]:
table = 'Diabetes_avicena_survival.diabetes_final_4_annos'

variables_with_outliers = ['edad','IMC','HDL','LDL','trigliceridos','perimetro_abdominal']

numeric_columns = ['edad','IMC','HDL','LDL','trigliceridos','perimetro_abdominal']
categoric_columns = ['genero_paciente','raza_paciente','nivel_academico_paciente','ant_cardio','med_hipertension','ant_familiar_dm','hace_ejercicio']
columns_not_in_count = ['ant_familiar_dm', 'raza_paciente','hace_ejercicio']
target = 'diabetes'

dict_var_categoricas = {
    # Nivel Academico
    "Ninguno" : 'ninguno',

    "Básica secundaria" : 'educacion_basica', 
    "Básica primaria" : 'educacion_basica',

    "Normalista" : 'educacion_media',
    "Bachillerato técnico" : 'educacion_media',
    "Técnica profesional" : 'educacion_media',
    "Tecnológica" : 'educacion_media',
    "Media académica o clásica" : 'educacion_media',

    "Profesional" : 'educacion_superior',
    "Especialización" : 'educacion_superior',
    "Preescolar" : 'educacion_superior',
    "Doctorado" : 'educacion_superior',
    "Maestría" : 'educacion_superior',
        
    # Ejercicio
    'Nunca' : 'No',
    '20 minutos' : '20 min',
    '40 minutos' : 'Mas de 20 min',
    '60 minutos' : 'Mas de 20 min',

    # Dicotomicas
    "1" : 'Si',
    '0' : 'No'
    

}

## Funciones

In [4]:
def escalar(data):
    
    # Crear el escalador
    scaler = MinMaxScaler()
    
    # Entrenar el escalador
    scaler.fit(data)
    
    # Re-escalar los datos
    df_escalado = pd.DataFrame(scaler.transform(data))
    
    return df_escalado

def outlier_label(value, limit):
    if value < limit[0]:
        return 'Abajo'
    elif value > limit[1]:
        return 'Arriba'
    else:
        return 'No'

def tag_outliers(data, variable):
    Q1 = data[variable].quantile(0.25)
    Q3 = data[variable].quantile(0.75)
    RIC = Q3 - Q1
    limit_inf = Q1 - (1.5 * RIC)
    limit_sup = Q3 + (1.5 * RIC)
    data[f'{variable}_outlier'] = data[variable].apply(lambda x: outlier_label(x, [limit_inf,limit_sup]))
    return data

def take_out_outliers(data, variables_with_outliers, verbose = True):
    
    data[variables_with_outliers] = data[variables_with_outliers].astype(float)
    for variable in variables_with_outliers:
        data_label = tag_outliers(data, variable)

    if verbose:
        for variable in variables_with_outliers:
            print(variable)
            conteos = data_label[f'{variable}_outlier'].value_counts().reset_index()
            total = conteos['count'].sum()
            conteos['Porcentaje'] = (conteos['count'] / total)*100
            display(conteos)

    columns_to_drop = [column + '_outlier' for column in variables_with_outliers]

    data_clean_outliers = data_label[(data_label.edad_outlier == 'No') &
                                     (data_label.IMC_outlier == 'No') &
                                     (data_label.HDL_outlier == 'No') &
                                     (data_label.LDL_outlier == 'No') &
                                     (data_label.trigliceridos_outlier == 'No') &
                                     (data_label.perimetro_abdominal_outlier == 'No')
                                    ]

    data_clean_outliers = data_clean_outliers.drop(columns = columns_to_drop)

    return data_clean_outliers

## Carga de datos

In [5]:
data = client_bq.query(f'SELECT * FROM {table}').result().to_dataframe()
print(f'Se trajo {data.shape} datos de pacientes')
data.head()

Se trajo (850302, 21) datos de pacientes


Unnamed: 0,numero_identificacion_paciente,year,month,fecha,edad,peso,talla,IMC,HDL,LDL,...,perimetro_abdominal,genero_paciente,raza_paciente,nivel_academico_paciente,ant_cardio,med_hipertension,ant_familiar_dm,hace_ejercicio,diabetes,time_to_event
0,72045699,2021,2,2021-02-01,51,109.0,1.75,35.591836735,50.0,116.0,...,118.0,Masculino,Mestizo,Ninguno,0,0,0,,0,40
1,72131717,2021,2,2021-02-01,56,93.0,1.67,33.346480691,39.5,108.0,...,,Masculino,Otros,Normalista,1,1,0,,1,7
2,72138809,2021,2,2021-02-01,55,106.0,1.7,36.678200692,30.0,65.0,...,110.0,Masculino,Otros,Normalista,1,1,0,,0,39
3,72148676,2021,2,2021-02-01,54,87.7,1.74,28.96683842,39.0,188.0,...,104.0,Masculino,Otros,Profesional,1,1,0,,0,13
4,74359767,2021,2,2021-02-01,44,91.0,1.75,29.714285714,33.0,140.0,...,103.0,Masculino,Otros,Profesional,0,0,0,,0,47


## Proceso

In [6]:
# Dataframe con los datos numericos
df_numerico = data[numeric_columns]
df_numerico['IMC'] = np.round(df_numerico['IMC'].astype(float),2)

df_numerico.head()

Unnamed: 0,edad,IMC,HDL,LDL,trigliceridos,perimetro_abdominal
0,51,35.59,50.0,116.0,301.0,118.0
1,56,33.35,39.5,108.0,225.0,
2,55,36.68,30.0,65.0,168.0,110.0
3,54,28.97,39.0,188.0,143.0,104.0
4,44,29.71,33.0,140.0,220.0,103.0


In [7]:
# Dataframe con los datos categoricos
df_categorico = data[categoric_columns].astype(str)

df_categorico.hace_ejercicio = df_categorico.hace_ejercicio.replace(dict_var_categoricas)
df_categorico.nivel_academico_paciente = df_categorico.nivel_academico_paciente.replace(dict_var_categoricas)
df_categorico.ant_cardio = df_categorico.ant_cardio.replace(dict_var_categoricas)
df_categorico.med_hipertension = df_categorico.med_hipertension.replace(dict_var_categoricas)

# df_categorico = df_categorico.drop(columns = columns_not_in_count)

df_categorico

Unnamed: 0,genero_paciente,raza_paciente,nivel_academico_paciente,ant_cardio,med_hipertension,ant_familiar_dm,hace_ejercicio
0,Masculino,Mestizo,ninguno,No,No,0,
1,Masculino,Otros,educacion_media,Si,Si,0,
2,Masculino,Otros,educacion_media,Si,Si,0,
3,Masculino,Otros,educacion_superior,Si,Si,0,
4,Masculino,Otros,educacion_superior,No,No,0,
...,...,...,...,...,...,...,...
850297,Masculino,Otros,educacion_superior,Si,Si,0,
850298,Masculino,Otros,educacion_superior,No,No,0,
850299,Masculino,Afrocolombiano,educacion_superior,No,No,0,
850300,Masculino,Otros,ninguno,No,No,0,


In [8]:
# Conteo de las categorias de cada variable, con el re agrupamiento
for variable in categoric_columns + ['diabetes']:
    print(variable)
    if variable == 'diabetes':
        conteos = data[f'{variable}'].value_counts().reset_index()
    else:
        conteos = df_categorico[f'{variable}'].value_counts().reset_index()
    total = conteos['count'].sum()
    conteos['Porcentaje'] = (conteos['count'] / total)*100
    display(conteos)

genero_paciente


Unnamed: 0,genero_paciente,count,Porcentaje
0,Femenino,526495,61.918589
1,Masculino,323807,38.081411


raza_paciente


Unnamed: 0,raza_paciente,count,Porcentaje
0,Otros,788385,92.718234
1,Mestizo,39433,4.637529
2,Afrocolombiano,10587,1.245087
3,Raizales,4311,0.506996
4,Indígena,4029,0.473832
5,Palenquero,2093,0.246148
6,Rom/Gitano,1322,0.155474
7,,142,0.0167


nivel_academico_paciente


Unnamed: 0,nivel_academico_paciente,count,Porcentaje
0,educacion_basica,348298,40.961682
1,ninguno,222674,26.187637
2,educacion_media,169710,19.958791
3,educacion_superior,109472,12.874485
4,,148,0.017406


ant_cardio


Unnamed: 0,ant_cardio,count,Porcentaje
0,No,509896,59.966459
1,Si,340406,40.033541


med_hipertension


Unnamed: 0,med_hipertension,count,Porcentaje
0,No,547419,64.379362
1,Si,302883,35.620638


ant_familiar_dm


Unnamed: 0,ant_familiar_dm,count,Porcentaje
0,0,848538,99.792544
1,1,1764,0.207456


hace_ejercicio


Unnamed: 0,hace_ejercicio,count,Porcentaje
0,,814473,95.786321
1,No,20340,2.392091
2,20 min,7911,0.930375
3,Mas de 20 min,7578,0.891213


diabetes


Unnamed: 0,diabetes,count,Porcentaje
0,0,656044,77.154235
1,1,194258,22.845765


## Entrenamiento

In [9]:
# Diccionario con los cambios a las variables categoricas por numericas
dict_catergoricas = {
    'Femenino' : 0,
    'Masculino' : 1,
    "Mestizo" : 0,
    "Otros" : 1,
    "Afrocolombiano" : 2,
    "Raizales" : 3,
    "Indígena" : 4,
    "Palenquero" : 5,
    "Rom/Gitano" : 6,
    "Bachillerato técnico" : 0,
    "Básica secundaria" : 1,
    "Tecnológica" : 2,
    "Técnica profesional" : 3,
    "Profesional" : 4,
    "Ninguno" : 5,
    "Básica primaria" : 6,
    "Media académica o clásica" : 7,
    "Normalista" : 8,
    "Especialización" : 9,
    "Preescolar" : 10,
    "Doctorado" : 11,
    "Maestría" : 12,
    "20 minutos" : 0,
    "Nunca" : 1,
    "40 minutos" : 2,
    "60 minutos" : 3,
    'No' : 0,
    'Si' : 1,
    'educacion_media' : 2, 
    'educacion_superior' : 3, 
    'ninguno' : 0,
    'educacion_basica' : 1, 
    'None': 0 
}

In [10]:
# Cambiar datos nulos por el promedio de la columna
df_numerico_with_no_nan = df_numerico[::]
for column in df_numerico_with_no_nan.columns:
    df_numerico_with_no_nan[column] = df_numerico_with_no_nan[column].fillna(df_numerico_with_no_nan[column].mean())

# Escalar los valores
df_escalado = escalar(df_numerico_with_no_nan)
df_escalado.head()

Unnamed: 0,0,1,2,3,4,5
0,0.354839,0.035732,0.006601,0.006707,0.00078,0.443439
1,0.408602,0.033477,0.005215,0.006473,0.000583,0.332171
2,0.397849,0.036829,0.00396,0.005219,0.000435,0.40724
3,0.387097,0.029067,0.005149,0.008808,0.00037,0.38009
4,0.27957,0.029812,0.004356,0.007407,0.00057,0.375566


In [11]:
# Definir las variables categoricas con las que se va atrabajar
df_categorico_no_columns = df_categorico.drop(columns = columns_not_in_count)
df_categorico_no_columns

Unnamed: 0,genero_paciente,nivel_academico_paciente,ant_cardio,med_hipertension
0,Masculino,ninguno,No,No
1,Masculino,educacion_media,Si,Si
2,Masculino,educacion_media,Si,Si
3,Masculino,educacion_superior,Si,Si
4,Masculino,educacion_superior,No,No
...,...,...,...,...
850297,Masculino,educacion_superior,Si,Si
850298,Masculino,educacion_superior,No,No
850299,Masculino,educacion_superior,No,No
850300,Masculino,ninguno,No,No


In [12]:
# unir el df escalado y las variables categoricas
data_to_train = df_escalado.reset_index().merge(df_categorico_no_columns.reset_index(), on = 'index', how = 'left')
data_to_train.drop(columns = ['index'], inplace=True)

# Convertir el nombre de las columnas a string
data_to_train.columns = data_to_train.columns.astype(str)

# Cambiar variables categoricas a numericas
data_to_train.genero_paciente = data_to_train.genero_paciente.replace(dict_catergoricas)
data_to_train.nivel_academico_paciente = data_to_train.nivel_academico_paciente.replace(dict_catergoricas)
data_to_train.ant_cardio = data_to_train.ant_cardio.replace(dict_catergoricas)
data_to_train.med_hipertension = data_to_train.med_hipertension.replace(dict_catergoricas)
# data_to_train.hace_ejercicio = data_to_train.hace_ejercicio.replace(dict_catergoricas)
# data_to_train.raza_paciente = data_to_train.raza_paciente.replace(dict_catergoricas)

# Mostrar datos a usar en el entrenamiento
data_to_train

Unnamed: 0,0,1,2,3,4,5,genero_paciente,nivel_academico_paciente,ant_cardio,med_hipertension
0,0.354839,0.035732,0.006601,0.006707,0.000780,0.443439,1,0,0,0
1,0.408602,0.033477,0.005215,0.006473,0.000583,0.332171,1,2,1,1
2,0.397849,0.036829,0.003960,0.005219,0.000435,0.407240,1,2,1,1
3,0.387097,0.029067,0.005149,0.008808,0.000370,0.380090,1,3,1,1
4,0.279570,0.029812,0.004356,0.007407,0.000570,0.375566,1,3,0,0
...,...,...,...,...,...,...,...,...,...,...
850297,0.311828,0.028604,0.005149,0.007028,0.000503,0.316742,1,3,1,1
850298,0.268817,0.033567,0.005360,0.007923,0.000382,0.408597,1,3,0,0
850299,0.408602,0.044844,0.004647,0.004781,0.000254,0.332171,1,3,0,0
850300,0.322581,0.031181,0.007295,0.007095,0.000718,0.332171,1,0,0,0


In [13]:
t_1 = time()
y = data[['diabetes','time_to_event']]
y['target'] = y.apply(lambda x: (bool(x.diabetes), x.time_to_event), axis = 1)
y = y['target']
y = np.array(y, dtype=[('event', np.bool_), ('time', np.int32)])

display(y[:5])

X = data_to_train

display(X.head(5))

random_state = 20

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=random_state)

print(f'Las dimensiones del entrenamiento son {X_train.shape} para X_train, {y_train.shape} para y_train')
print(f'Las dimensiones del entrenamiento son {X_test.shape} para X_test, {y_test.shape} para y_test')

# rsf = RandomSurvivalForest(
#     n_estimators=1000, min_samples_split=10, min_samples_leaf=15, n_jobs=-1, random_state=random_state
# )

rsf = RandomSurvivalForest(
    max_depth=100, min_samples_leaf=50, min_samples_split=30,
                     n_estimators=50, n_jobs=-1
)

rsf.fit(X_train, y_train)

print('El modelo tiene un valor score de :',rsf.score(X_test, y_test))

surv = rsf.predict_survival_function(X_test)
tiempos_supervivencia = [int(y_test[i][1]) for i in range(len(y_test))]
eventos = [bool(y_test[i][0]) for i in range(len(y_test))]
tiempo_mediano = np.median(tiempos_supervivencia)
puntuaciones_riesgo = -np.log([f(tiempo_mediano) for f in surv])

c_index = concordance_index_censored(eventos, tiempos_supervivencia, puntuaciones_riesgo)

print(c_index[0])

print(f'Se demoro un total de {((time() - t_1)/60)} minutos')

array([(False, 40), ( True,  7), (False, 39), (False, 13), (False, 47)],
      dtype=[('event', '?'), ('time', '<i4')])

Unnamed: 0,0,1,2,3,4,5,genero_paciente,nivel_academico_paciente,ant_cardio,med_hipertension
0,0.354839,0.035732,0.006601,0.006707,0.00078,0.443439,1,0,0,0
1,0.408602,0.033477,0.005215,0.006473,0.000583,0.332171,1,2,1,1
2,0.397849,0.036829,0.00396,0.005219,0.000435,0.40724,1,2,1,1
3,0.387097,0.029067,0.005149,0.008808,0.00037,0.38009,1,3,1,1
4,0.27957,0.029812,0.004356,0.007407,0.00057,0.375566,1,3,0,0


Las dimensiones del entrenamiento son (637726, 10) para X_train, (637726,) para y_train
Las dimensiones del entrenamiento son (212576, 10) para X_test, (212576,) para y_test
El modelo tiene un valor score de : 0.5814794902452343
0.5831207473768317
Se demoro un total de 23.807964007059734 minutos


## Entrenamiento sin outliers

In [14]:
df_no_outliers = take_out_outliers(data, variables_with_outliers, verbose = False)

In [15]:
df_no_outliers.shape

(759374, 21)

In [16]:
# Dataframe con los datos numericos
df_numerico = df_no_outliers[numeric_columns]
df_numerico['IMC'] = np.round(df_numerico['IMC'].astype(float),2)

# print(df_numerico.shape)
# display(df_numerico.head())

# Dataframe con los datos categoricos
df_categorico = df_no_outliers[categoric_columns].astype(str)

df_categorico.hace_ejercicio = df_categorico.hace_ejercicio.replace(dict_var_categoricas)
df_categorico.nivel_academico_paciente = df_categorico.nivel_academico_paciente.replace(dict_var_categoricas)
df_categorico.ant_cardio = df_categorico.ant_cardio.replace(dict_var_categoricas)
df_categorico.med_hipertension = df_categorico.med_hipertension.replace(dict_var_categoricas)

# df_categorico = df_categorico.drop(columns = columns_not_in_count)

# print(df_categorico.shape)
# display(df_categorico.head())

# Cambiar datos nulos por el promedio de la columna
df_numerico_with_no_nan = df_numerico[::]
for column in df_numerico_with_no_nan.columns:
    df_numerico_with_no_nan[column] = df_numerico_with_no_nan[column].fillna(df_numerico_with_no_nan[column].mean())

# Escalar los valores
df_escalado = escalar(df_numerico_with_no_nan)
df_escalado.head()

# Definir las variables categoricas con las que se va atrabajar
df_categorico_no_columns = df_categorico.drop(columns = columns_not_in_count).reset_index()
df_categorico_no_columns.drop(columns = 'index', inplace = True)
df_categorico_no_columns

# unir el df escalado y las variables categoricas
data_to_train = df_escalado.reset_index().merge(df_categorico_no_columns.reset_index(), on = 'index', how = 'left')
data_to_train.drop(columns = ['index'], inplace=True)

# Convertir el nombre de las columnas a string
data_to_train.columns = data_to_train.columns.astype(str)

# Cambiar variables categoricas a numericas
data_to_train.genero_paciente = data_to_train.genero_paciente.replace(dict_catergoricas)
data_to_train.nivel_academico_paciente = data_to_train.nivel_academico_paciente.replace(dict_catergoricas)
data_to_train.ant_cardio = data_to_train.ant_cardio.replace(dict_catergoricas)
data_to_train.med_hipertension = data_to_train.med_hipertension.replace(dict_catergoricas)
# data_to_train.hace_ejercicio = data_to_train.hace_ejercicio.replace(dict_catergoricas)
# data_to_train.raza_paciente = data_to_train.raza_paciente.replace(dict_catergoricas)

# Mostrar datos a usar en el entrenamiento
data_to_train

Unnamed: 0,0,1,2,3,4,5,genero_paciente,nivel_academico_paciente,ant_cardio,med_hipertension
0,0.402439,0.837928,0.551282,0.497703,0.917683,0.890625,1,0,0,0
1,0.463415,0.744361,0.383013,0.459008,0.685976,0.491905,1,2,1,1
2,0.451220,0.883459,0.230769,0.251028,0.512195,0.765625,1,2,1,1
3,0.439024,0.561404,0.375000,0.845949,0.435976,0.671875,1,3,1,1
4,0.317073,0.592314,0.278846,0.613785,0.670732,0.656250,1,3,0,0
...,...,...,...,...,...,...,...,...,...,...
759369,0.512195,0.374687,0.647436,0.550907,0.564024,0.515625,1,2,0,1
759370,0.353659,0.542189,0.375000,0.550907,0.591463,0.453125,1,3,1,1
759371,0.304878,0.748120,0.400641,0.699250,0.450091,0.770312,1,3,0,0
759372,0.365854,0.649123,0.635577,0.562128,0.844817,0.491905,1,0,0,0


In [17]:
# Conteo de las categorias de cada variable, con el re agrupamiento
for variable in categoric_columns + ['diabetes']:
    print(variable)
    if variable == 'diabetes':
        conteos = df_no_outliers[f'{variable}'].value_counts().reset_index()
    else:
        conteos = df_categorico[f'{variable}'].value_counts().reset_index()
    total = conteos['count'].sum()
    conteos['Porcentaje'] = (conteos['count'] / total)*100
    display(conteos)

genero_paciente


Unnamed: 0,genero_paciente,count,Porcentaje
0,Femenino,470391,61.944575
1,Masculino,288983,38.055425


raza_paciente


Unnamed: 0,raza_paciente,count,Porcentaje
0,Otros,703767,92.677258
1,Mestizo,35502,4.675167
2,Afrocolombiano,9421,1.240627
3,Raizales,3867,0.509235
4,Indígena,3612,0.475655
5,Palenquero,1879,0.247441
6,Rom/Gitano,1196,0.157498
7,,130,0.017119


nivel_academico_paciente


Unnamed: 0,nivel_academico_paciente,count,Porcentaje
0,educacion_basica,311779,41.057371
1,ninguno,198584,26.151014
2,educacion_media,151038,19.889804
3,educacion_superior,97839,12.884165
4,,134,0.017646


ant_cardio


Unnamed: 0,ant_cardio,count,Porcentaje
0,No,452904,59.641758
1,Si,306470,40.358242


med_hipertension


Unnamed: 0,med_hipertension,count,Porcentaje
0,No,487516,64.199722
1,Si,271858,35.800278


ant_familiar_dm


Unnamed: 0,ant_familiar_dm,count,Porcentaje
0,0,757822,99.795621
1,1,1552,0.204379


hace_ejercicio


Unnamed: 0,hace_ejercicio,count,Porcentaje
0,,727846,95.848159
1,No,17734,2.335345
2,20 min,7093,0.934059
3,Mas de 20 min,6701,0.882437


diabetes


Unnamed: 0,diabetes,count,Porcentaje
0,0,588328,77.475394
1,1,171046,22.524606


In [None]:
t_1 = time()

y = df_no_outliers[['diabetes','time_to_event']]
y['target'] = y.apply(lambda x: (bool(x.diabetes), x.time_to_event), axis = 1)
y = y['target']
y = np.array(y, dtype=[('event', np.bool_), ('time', np.int32)])

display(y[:5])

X = data_to_train

display(X.head(5))

random_state = 20

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=random_state)

print(f'Las dimensiones del entrenamiento son {X_train.shape} para X_train, {y_train.shape} para y_train')
print(f'Las dimensiones del entrenamiento son {X_test.shape} para X_test, {y_test.shape} para y_test')

# rsf = RandomSurvivalForest(
#     n_estimators=1000, min_samples_split=10, min_samples_leaf=15, n_jobs=-1, random_state=random_state
# )

rsf = RandomSurvivalForest(
    max_depth=100, min_samples_leaf=50, min_samples_split=30,
                     n_estimators=50, n_jobs=-1
)

rsf.fit(X_train, y_train)

print('El modelo tiene un valor score de :',rsf.score(X_test, y_test))

surv = rsf.predict_survival_function(X_test)
tiempos_supervivencia = [int(y_test[i][1]) for i in range(len(y_test))]
eventos = [bool(y_test[i][0]) for i in range(len(y_test))]
tiempo_mediano = np.median(tiempos_supervivencia)
puntuaciones_riesgo = -np.log([f(tiempo_mediano) for f in surv])

c_index = concordance_index_censored(eventos, tiempos_supervivencia, puntuaciones_riesgo)

print(c_index[0])

print(f'Se demoro un total de {((time() - t_1)/60)} minutos')

array([(False, 40), ( True,  7), (False, 39), (False, 13), (False, 47)],
      dtype=[('event', '?'), ('time', '<i4')])

Unnamed: 0,0,1,2,3,4,5,genero_paciente,nivel_academico_paciente,ant_cardio,med_hipertension
0,0.402439,0.837928,0.551282,0.497703,0.917683,0.890625,1,0,0,0
1,0.463415,0.744361,0.383013,0.459008,0.685976,0.491905,1,2,1,1
2,0.45122,0.883459,0.230769,0.251028,0.512195,0.765625,1,2,1,1
3,0.439024,0.561404,0.375,0.845949,0.435976,0.671875,1,3,1,1
4,0.317073,0.592314,0.278846,0.613785,0.670732,0.65625,1,3,0,0


Las dimensiones del entrenamiento son (569530, 10) para X_train, (569530,) para y_train
Las dimensiones del entrenamiento son (189844, 10) para X_test, (189844,) para y_test


In [20]:
print('El modelo tiene un valor score de :',rsf.score(X_test, y_test))

surv = rsf.predict_survival_function(X_test)
tiempos_supervivencia = [int(y_test[i][1]) for i in range(len(y_test))]
eventos = [bool(y_test[i][0]) for i in range(len(y_test))]
tiempo_mediano = np.median(tiempos_supervivencia)
puntuaciones_riesgo = -np.log([f(tiempo_mediano) for f in surv])

c_index = concordance_index_censored(eventos, tiempos_supervivencia, puntuaciones_riesgo)

print(c_index[0])

print(f'Se demoro un total de {((time() - t_1)/60)} minutos')

El modelo tiene un valor score de : 0.577203963245016
0.5780446815356218
Se demoro un total de 131.130395591259 minutos
