In [8]:
import pickle
import pandas as pd
from google.cloud import bigquery
from google.cloud import storage
import random

from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from params import *
from utilities import *

import warnings
warnings.filterwarnings('ignore')

## Parametros

In [77]:
# Google Cloud Storage client (this is `storage.Client`, not a BigQuery client).
# NOTE(review): the data-loading cell passes `client_bq`, which is not defined in the
# visible cells -- presumably it comes from the star imports of `params`/`utilities`; confirm.
client_storage = storage.Client()

# Whether to downsample the training data so both diabetes classes have the same size
sample = True

# Whether to bucket ("cluster") the training features with the peso_* functions
cluster = True

## Carga de datos entrenamiento

In [48]:
# Load the first configured table and run it through the project's preparation helper.
# NOTE(review): `bring_data_from_bq`, `prepare_data`, `tables` and `client_bq` are not
# defined in the visible cells -- presumably provided by `from params import *` /
# `from utilities import *`; their exact semantics should be confirmed there.
data_original = bring_data_from_bq(table = tables[0], client_bq = client_bq, save = True, read_local = False)
data = prepare_data(data_original)

# Preview the first rows of the prepared frame
data.head(5)

Leyendo datos de la tabla: diabetes
Informacion guardada en el archivo: diabetes.parquet


Unnamed: 0,tipo_identificacion_paciente,edad,genero,codigo_ciudad_sucursal,nivel_academico_paciente,raza_paciente,peso,talla,imc,HDL,LDL,trigliceridos,med_hipertension,familiar_dm,ant_cardiovascular,dm_gestacional,PERIMETRO_ABDOMINAL,hace_ejercicio,diabetes
0,0.0,64.0,0.0,11001.0,0.0,0.0,57.6,1.5,25.6,58.0,87.43,60.64,0.0,0.0,0.0,0.0,78.0,2.0,0.0
1,0.0,54.0,0.0,54001.0,1.0,0.0,57.6,1.5,25.6,57.0,30.0,374.0,0.0,0.0,0.0,0.0,75.0,2.0,0.0
2,0.0,69.0,0.0,15001.0,2.0,0.0,57.6,1.5,25.6,66.0,171.0,91.0,0.0,0.0,0.0,0.0,89.0,1.0,0.0
3,0.0,57.0,0.0,8001.0,3.0,0.0,57.6,1.5,25.6,39.0,143.0,280.0,0.0,0.0,0.0,0.0,95.0,2.0,1.0
4,0.0,50.0,0.0,19001.0,4.0,0.0,57.6,1.5,25.6,59.0,140.0,186.0,0.0,0.0,0.0,0.0,87.0,2.0,0.0


## Clusters

In [78]:
## Funciones para crear clusters

def peso_edad(value):
    """Map an age to an integer level between 0 and 3.

    Levels: below 45 -> 0; from 45 up to (not including) 54 -> 1;
    from 54 through 64 inclusive -> 2; anything else -> 3.
    A NaN fails every comparison and therefore also yields 3.
    """
    for level, upper_bound in enumerate((45, 54)):
        if value < upper_bound:
            return level
    # The third band is closed on the right (<= 64), unlike the first two.
    return 2 if value <= 64 else 3

def peso_imc(value):
    """Map a body-mass-index value to an integer level between 0 and 3.

    Levels: below 30 -> 0; [30, 35) -> 1; [35, 40) -> 2; anything else -> 3.
    A NaN fails every comparison and therefore also yields 3.
    """
    for level, upper_bound in enumerate((30, 35, 40)):
        if value < upper_bound:
            return level
    return 3

def peso_pa(value, gender):
    """Map an abdominal-perimeter value to an integer level between 0 and 2.

    The cut-offs depend on the gender code: code 0 uses 90 and 95, any other
    code uses 94 and 100. Below the first cut-off -> 0, below the second -> 1,
    anything else -> 2. A NaN fails every comparison and therefore yields 2.
    """
    first_cut, second_cut = (90, 95) if gender == 0 else (94, 100)
    if value < first_cut:
        return 0
    return 1 if value < second_cut else 2

def peso_ejercicio(value):
    """Return 2 when the exercise code equals 2, and 0 for every other value (NaN included)."""
    return 2 if value == 2 else 0

def peso_var_bivalente(value):
    """Return 2 when a two-valued flag equals 1, and 0 for every other value (NaN included)."""
    return 2 if value == 1 else 0

def peso_HDL(value):
    """Map an HDL value to an integer level between 0 and 2 (lower HDL -> higher level).

    Levels: below 40 -> 2; [40, 60) -> 1; anything else -> 0.
    A NaN fails every comparison and therefore yields 0.
    """
    return 2 if value < 40 else (1 if value < 60 else 0)

def peso_LDL(value):
    """Map an LDL value to an integer level between 0 and 3.

    Levels: below 130 -> 0; [130, 160) -> 1; [160, 190) -> 2; anything else -> 3.
    A NaN fails every comparison and therefore also yields 3.
    """
    for level, upper_bound in enumerate((130, 160, 190)):
        if value < upper_bound:
            return level
    return 3

def peso_trigliceridos(value):
    """Map a triglycerides value to an integer level between 0 and 3.

    Levels: below 150 -> 0; [150, 200) -> 1; [200, 500) -> 2; anything else -> 3.
    A NaN fails every comparison and therefore also yields 3.
    """
    cut_offs = (150, 200, 500)
    # First cut-off the value is strictly below decides the level; none -> top level.
    return next((level for level, upper_bound in enumerate(cut_offs) if value < upper_bound), 3)

    
## Mapping from column name to the bucketing function applied to that column.
## Every function takes the cell value; `peso_pa` additionally takes the gender code.
dict_variables = {
    'edad' : peso_edad,
    'imc' : peso_imc,
    'PERIMETRO_ABDOMINAL' : peso_pa,
    'hace_ejercicio' : peso_ejercicio,
    'med_hipertension' : peso_var_bivalente,
    'familiar_dm' : peso_var_bivalente,
    'HDL' : peso_HDL,
    'LDL' : peso_LDL,
    'trigliceridos' : peso_trigliceridos,
    'ant_cardiovascular' : peso_var_bivalente,
    'dm_gestacional' : peso_var_bivalente,
}

In [79]:
# Bucket the columns listed in `dict_variables`, one gender subset at a time, because
# the abdominal-perimeter cut-offs in `peso_pa` depend on the gender code.
# Rows whose `genero` is neither 0 nor 1 (including missing) are not carried over.
# NOTE(review): a NaN in a bucketed column fails every comparison inside the peso_*
# functions and lands in their final `else` level instead of staying missing.
cluster_frames = []

if cluster:
    print('Se crearan clusters para las variables')
    print(list(dict_variables.keys()))
    for genero in [0,1]:
        print(f'Creando clusters para el genero: {genero}')
        data_temp = data[data.genero == genero].copy()
        for key, weight_fn in dict_variables.items():
            if key == 'PERIMETRO_ABDOMINAL':
                # Only the perimeter function needs the gender code as a second argument
                data_temp[key] = data_temp[key].map(lambda x: weight_fn(x, genero))
            else:
                data_temp[key] = data_temp[key].map(weight_fn)
        cluster_frames.append(data_temp)

# Concatenate once instead of growing a frame from an empty DataFrame on every
# iteration; keep the empty-frame result when clustering is disabled.
df_cluster = pd.concat(cluster_frames) if cluster_frames else pd.DataFrame()

df_cluster.head(5)

Se crearan clusters para las variables
['edad', 'imc', 'PERIMETRO_ABDOMINAL', 'hace_ejercicio', 'med_hipertension', 'familiar_dm', 'HDL', 'LDL', 'trigliceridos', 'ant_cardiovascular', 'dm_gestacional']
Creando clusters para el genero: 0
Creando clusters para el genero: 1


Unnamed: 0,tipo_identificacion_paciente,edad,genero,codigo_ciudad_sucursal,nivel_academico_paciente,raza_paciente,peso,talla,imc,HDL,LDL,trigliceridos,med_hipertension,familiar_dm,ant_cardiovascular,dm_gestacional,PERIMETRO_ABDOMINAL,hace_ejercicio,diabetes
0,0.0,2,0.0,11001.0,0.0,0.0,57.6,1.5,0,1,0,0,0,0,0,0,0,2,0.0
1,0.0,2,0.0,54001.0,1.0,0.0,57.6,1.5,0,1,0,2,0,0,0,0,0,2,0.0
2,0.0,3,0.0,15001.0,2.0,0.0,57.6,1.5,0,0,2,0,0,0,0,0,0,0,0.0
3,0.0,2,0.0,8001.0,3.0,0.0,57.6,1.5,0,2,1,2,0,0,0,0,2,2,1.0
4,0.0,1,0.0,19001.0,4.0,0.0,57.6,1.5,0,1,1,1,0,0,0,0,0,2,0.0


## Ajustes modelo

In [80]:
# Columns used for training; the LAST entry ('diabetes') is the target, the rest are features

variables_to_train = ['edad',
                      'genero',
                      'codigo_ciudad_sucursal', # To review
                      'imc',
                      'HDL',
                      'LDL',
                      'trigliceridos',
                      'med_hipertension',
                      'familiar_dm',
                      'ant_cardiovascular',
                      'dm_gestacional',
                      'PERIMETRO_ABDOMINAL',
                      'hace_ejercicio',
                      'nivel_academico_paciente', # To review
                      'raza_paciente', # To review
                      'diabetes']

In [81]:
## Build the training frame

# Work on the bucketed frame when `cluster` is on, otherwise on a copy of the prepared data
df_to_begin = df_cluster if cluster else data.copy()

# Drop rows with a missing value in any training column BEFORE balancing, so that
# (a) the balanced classes stay the same size afterwards, and
# (b) the non-sampled path never hands NaNs to the model (GaussianNB rejects them).
# `dropna` returns a new frame, so `df_cluster` / `data` are left untouched.
df_to_begin = df_to_begin.dropna(subset=variables_to_train)

# Downsample the negative class to the size of the positive class
if sample:
    df_positive = df_to_begin[df_to_begin.diabetes == 1]
    df_negative = df_to_begin[df_to_begin.diabetes == 0]
    # Cap at the negatives available (`sample` raises when asked for more rows than
    # exist) and seed the draw so the reported metrics are reproducible.
    df_sample = df_negative.sample(n=min(len(df_positive), len(df_negative)), random_state=42)
    df = pd.concat([df_positive, df_sample])
else:
    df = df_to_begin

# Keep only the training columns
df_to_train = df[variables_to_train].copy()
df_to_train.head(5)

Unnamed: 0,edad,genero,codigo_ciudad_sucursal,imc,HDL,LDL,trigliceridos,med_hipertension,familiar_dm,ant_cardiovascular,dm_gestacional,PERIMETRO_ABDOMINAL,hace_ejercicio,nivel_academico_paciente,raza_paciente,diabetes
3,2,0.0,8001.0,0,2,1,2,0,0,0,0,2,2,3.0,0.0,1.0
11,0,0.0,50001.0,2,1,0,2,0,0,0,2,2,0,0.0,0.0,1.0
21,0,0.0,11001.0,0,0,0,0,0,0,0,2,0,2,3.0,0.0,1.0
26,0,0.0,20001.0,0,1,0,0,0,0,0,0,0,0,4.0,0.0,1.0
31,0,0.0,20001.0,0,1,0,0,0,0,0,0,0,0,4.0,0.0,1.0


In [82]:
## Model training

# Features are every column of `variables_to_train` except the last; the last one is the target
X = df_to_train[variables_to_train[:-1]]
y = df_to_train[variables_to_train[-1]]

print('Datos usados en el entrenamiento')
display(X.head(5))

# Split into train and validation sets (33% held out, fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

# Create the Gaussian Naive Bayes model
model_nb = GaussianNB()

# Fit the model on the training split
model_nb.fit(X_train, y_train)

Datos usados en el entrenamiento


Unnamed: 0,edad,genero,codigo_ciudad_sucursal,imc,HDL,LDL,trigliceridos,med_hipertension,familiar_dm,ant_cardiovascular,dm_gestacional,PERIMETRO_ABDOMINAL,hace_ejercicio,nivel_academico_paciente,raza_paciente
3,2,0.0,8001.0,0,2,1,2,0,0,0,0,2,2,3.0,0.0
11,0,0.0,50001.0,2,1,0,2,0,0,0,2,2,0,0.0,0.0
21,0,0.0,11001.0,0,0,0,0,0,0,0,2,0,2,3.0,0.0
26,0,0.0,20001.0,0,1,0,0,0,0,0,0,0,0,4.0,0.0
31,0,0.0,20001.0,0,1,0,0,0,0,0,0,0,0,4.0,0.0


In [83]:
## Model validation: accuracy, F1-score and confusion matrix

# Predicted classes for the held-out split
y_pred = model_nb.predict(X_test)

# Reuse `y_pred` for every metric; `model_nb.score` would run the prediction a second time.
# For a classifier, `score` is the accuracy, so the value is unchanged.
metrica = round(accuracy_score(y_test, y_pred)*100,2)
f1 = round(f1_score(y_test, y_pred)*100,2)

# Pin both classes so the matrix is always 2x2, even if one class is absent from the split.
# Rows are the true class, columns are the predicted class.
matriz_confusion = pd.DataFrame(
    confusion_matrix(y_test, y_pred, labels=[0, 1]),
    index=['Negativo','Positivo'],
    columns=['Negativo','Positivo'],
)

# The first figure is accuracy (share of correct predictions), not precision.
print(f'Modelo entrenado con una exactitud (accuracy) de: {metrica}')
print(f'Modelo entrenado con f1 de: {f1}')
print('La matriz de confusion es:')
display(matriz_confusion)

Modelo entrenado con una precision de: 61.48
Modelo entrenado con f1 de: 65.53
La matriz de confusion es:


Unnamed: 0,Negativo,Positivo
Negativo,14474,14734
Positivo,7692,21316


In [76]:
# Sum of the 0/1 target over the current test split, i.e. the number of positive cases
y_test.sum()

29027.0

In [22]:
## Spot-check individual cases

print('Validacion con casos particulares \n')

for i in range(5):
    print('--------------------------------------------------------------------------------------------------------------------')
    # Pick a random test row by its index LABEL
    row = random.choice(list(X_test.index))
    values = X_test.loc[[row]]

    print(f'La prediccion (en porcentaje) de tener diabetes es de: {round(model_nb.predict_proba(values)[0][1]*100,2)}%')
    print(f'La prediccion (en categoria) de tener diabetes es de: {model_nb.predict(values)[0]}')
    print(f'Los valores predichos son:')
    # `row` is an index label inherited from `data`, not a position, so look the
    # record up with `.loc`; `.iloc` only matches when `data` has a default 0..n-1 index.
    display(data.loc[[row]])

Validacion con casos particulares 

--------------------------------------------------------------------------------------------------------------------
La prediccion (en porcentaje) de tener diabetes es de: 1.99%
La prediccion (en categoria) de tener diabetes es de: 0.0
Los valores predichos son:


Unnamed: 0,tipo_identificacion_paciente,edad,genero,codigo_ciudad_sucursal,nivel_academico_paciente,raza_paciente,peso,talla,imc,HDL,LDL,trigliceridos,med_hipertension,familiar_dm,ant_cardiovascular,dm_gestacional,PERIMETRO_ABDOMINAL,hace_ejercicio,diabetes
63995,0.0,38.0,0.0,54001.0,0.0,0.0,52.5,1.49,23.647583,50.7,107.5,109.0,0.0,0.0,0.0,0.0,85.0,3.0,0.0


--------------------------------------------------------------------------------------------------------------------
La prediccion (en porcentaje) de tener diabetes es de: 32.5%
La prediccion (en categoria) de tener diabetes es de: 0.0
Los valores predichos son:


Unnamed: 0,tipo_identificacion_paciente,edad,genero,codigo_ciudad_sucursal,nivel_academico_paciente,raza_paciente,peso,talla,imc,HDL,LDL,trigliceridos,med_hipertension,familiar_dm,ant_cardiovascular,dm_gestacional,PERIMETRO_ABDOMINAL,hace_ejercicio,diabetes
231429,0.0,66.0,1.0,8001.0,3.0,0.0,82.1,1.74,27.117189,35.4,37.8,63.0,1.0,0.0,1.0,0.0,95.0,3.0,1.0


--------------------------------------------------------------------------------------------------------------------
La prediccion (en porcentaje) de tener diabetes es de: 2.12%
La prediccion (en categoria) de tener diabetes es de: 0.0
Los valores predichos son:


Unnamed: 0,tipo_identificacion_paciente,edad,genero,codigo_ciudad_sucursal,nivel_academico_paciente,raza_paciente,peso,talla,imc,HDL,LDL,trigliceridos,med_hipertension,familiar_dm,ant_cardiovascular,dm_gestacional,PERIMETRO_ABDOMINAL,hace_ejercicio,diabetes
92605,0.0,32.0,0.0,5001.0,9.0,3.0,97.0,1.7,33.564014,36.0,85.0,79.9,0.0,0.0,1.0,0.0,95.0,3.0,0.0


--------------------------------------------------------------------------------------------------------------------
La prediccion (en porcentaje) de tener diabetes es de: 15.36%
La prediccion (en categoria) de tener diabetes es de: 0.0
Los valores predichos son:


Unnamed: 0,tipo_identificacion_paciente,edad,genero,codigo_ciudad_sucursal,nivel_academico_paciente,raza_paciente,peso,talla,imc,HDL,LDL,trigliceridos,med_hipertension,familiar_dm,ant_cardiovascular,dm_gestacional,PERIMETRO_ABDOMINAL,hace_ejercicio,diabetes
273335,0.0,89.0,1.0,50001.0,0.0,0.0,56.0,1.547,23.399544,49.77,108.81,122.6,1.0,0.0,1.0,0.0,93.0,1.0,0.0


--------------------------------------------------------------------------------------------------------------------
La prediccion (en porcentaje) de tener diabetes es de: 22.26%
La prediccion (en categoria) de tener diabetes es de: 0.0
Los valores predichos son:


Unnamed: 0,tipo_identificacion_paciente,edad,genero,codigo_ciudad_sucursal,nivel_academico_paciente,raza_paciente,peso,talla,imc,HDL,LDL,trigliceridos,med_hipertension,familiar_dm,ant_cardiovascular,dm_gestacional,PERIMETRO_ABDOMINAL,hace_ejercicio,diabetes
116829,0.0,82.0,0.0,11001.0,0.0,0.0,60.7,1.52,26.272507,29.7,169.0,207.0,1.0,0.0,1.0,0.0,84.0,2.0,1.0


## Sin las variables 
código ciudad sucursal, nivel academico y raza paciente

In [84]:
# Columns used for training; the LAST entry ('diabetes') is the target, the rest are features.
# This variant leaves out branch city code, academic level and patient race.

variables_to_train = ['edad',
                      'genero',
                      'imc',
                      'HDL',
                      'LDL',
                      'trigliceridos',
                      'med_hipertension',
                      'familiar_dm',
                      'ant_cardiovascular',
                      'dm_gestacional',
                      'PERIMETRO_ABDOMINAL',
                      'hace_ejercicio',
                      'diabetes']

In [85]:
## Build the training frame

# Work on the bucketed frame when `cluster` is on, otherwise on a copy of the prepared data
df_to_begin = df_cluster if cluster else data.copy()

# Drop rows with a missing value in any training column BEFORE balancing, so that
# (a) the balanced classes stay the same size afterwards, and
# (b) the non-sampled path never hands NaNs to the model (GaussianNB rejects them).
# `dropna` returns a new frame, so `df_cluster` / `data` are left untouched.
df_to_begin = df_to_begin.dropna(subset=variables_to_train)

# Downsample the negative class to the size of the positive class
if sample:
    df_positive = df_to_begin[df_to_begin.diabetes == 1]
    df_negative = df_to_begin[df_to_begin.diabetes == 0]
    # Cap at the negatives available (`sample` raises when asked for more rows than
    # exist) and seed the draw so the reported metrics are reproducible.
    df_sample = df_negative.sample(n=min(len(df_positive), len(df_negative)), random_state=42)
    df = pd.concat([df_positive, df_sample])
else:
    df = df_to_begin

# Keep only the training columns
df_to_train = df[variables_to_train].copy()
df_to_train.head(5)

Unnamed: 0,edad,genero,imc,HDL,LDL,trigliceridos,med_hipertension,familiar_dm,ant_cardiovascular,dm_gestacional,PERIMETRO_ABDOMINAL,hace_ejercicio,diabetes
3,2,0.0,0,2,1,2,0,0,0,0,2,2,1.0
11,0,0.0,2,1,0,2,0,0,0,2,2,0,1.0
21,0,0.0,0,0,0,0,0,0,0,2,0,2,1.0
26,0,0.0,0,1,0,0,0,0,0,0,0,0,1.0
31,0,0.0,0,1,0,0,0,0,0,0,0,0,1.0


In [86]:
## Model training

# Features are every column of `variables_to_train` except the last; the last one is the target
X = df_to_train[variables_to_train[:-1]]
y = df_to_train[variables_to_train[-1]]

print('Datos usados en el entrenamiento')
display(X.head(5))

# Split into train and validation sets (33% held out, fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

# Create the Gaussian Naive Bayes model
model_nb = GaussianNB()

# Fit the model on the training split
model_nb.fit(X_train, y_train)

Datos usados en el entrenamiento


Unnamed: 0,edad,genero,imc,HDL,LDL,trigliceridos,med_hipertension,familiar_dm,ant_cardiovascular,dm_gestacional,PERIMETRO_ABDOMINAL,hace_ejercicio
3,2,0.0,0,2,1,2,0,0,0,0,2,2
11,0,0.0,2,1,0,2,0,0,0,2,2,0
21,0,0.0,0,0,0,0,0,0,0,2,0,2
26,0,0.0,0,1,0,0,0,0,0,0,0,0
31,0,0.0,0,1,0,0,0,0,0,0,0,0


In [87]:
## Model validation: accuracy, F1-score and confusion matrix

# Predicted classes for the held-out split
y_pred = model_nb.predict(X_test)

# Reuse `y_pred` for every metric; `model_nb.score` would run the prediction a second time.
# For a classifier, `score` is the accuracy, so the value is unchanged.
metrica = round(accuracy_score(y_test, y_pred)*100,2)
f1 = round(f1_score(y_test, y_pred)*100,2)

# Pin both classes so the matrix is always 2x2, even if one class is absent from the split.
# Rows are the true class, columns are the predicted class.
matriz_confusion = pd.DataFrame(
    confusion_matrix(y_test, y_pred, labels=[0, 1]),
    index=['Negativo','Positivo'],
    columns=['Negativo','Positivo'],
)

# The first figure is accuracy (share of correct predictions), not precision.
print(f'Modelo entrenado con una exactitud (accuracy) de: {metrica}')
print(f'Modelo entrenado con f1 de: {f1}')
print('La matriz de confusion es:')
display(matriz_confusion)

Modelo entrenado con una precision de: 50.89
Modelo entrenado con f1 de: 2.85
La matriz de confusion es:


Unnamed: 0,Negativo,Positivo
Negativo,29208,0
Positivo,28588,420


In [27]:
## Spot-check individual cases

print('Validacion con casos particulares \n')

for i in range(5):
    print('--------------------------------------------------------------------------------------------------------------------')
    # Pick a random test row by its index LABEL
    row = random.choice(list(X_test.index))
    values = X_test.loc[[row]]

    print(f'La prediccion (en porcentaje) de tener diabetes es de: {round(model_nb.predict_proba(values)[0][1]*100,2)}%')
    print(f'La prediccion (en categoria) de tener diabetes es de: {model_nb.predict(values)[0]}')
    print(f'Los valores predichos son:')
    # `row` is an index label inherited from `data`, not a position, so look the
    # record up with `.loc`; `.iloc` only matches when `data` has a default 0..n-1 index.
    display(data.loc[[row]])

Validacion con casos particulares 

--------------------------------------------------------------------------------------------------------------------
La prediccion (en porcentaje) de tener diabetes es de: 13.11%
La prediccion (en categoria) de tener diabetes es de: 0.0
Los valores predichos son:


Unnamed: 0,tipo_identificacion_paciente,edad,genero,codigo_ciudad_sucursal,nivel_academico_paciente,raza_paciente,peso,talla,imc,HDL,LDL,trigliceridos,med_hipertension,familiar_dm,ant_cardiovascular,dm_gestacional,PERIMETRO_ABDOMINAL,hace_ejercicio,diabetes
124417,0.0,66.0,1.0,13001.0,3.0,0.0,74.9,1.72,25.317739,57.0,85.0,107.0,1.0,0.0,1.0,0.0,105.0,3.0,0.0


--------------------------------------------------------------------------------------------------------------------
La prediccion (en porcentaje) de tener diabetes es de: 10.66%
La prediccion (en categoria) de tener diabetes es de: 0.0
Los valores predichos son:


Unnamed: 0,tipo_identificacion_paciente,edad,genero,codigo_ciudad_sucursal,nivel_academico_paciente,raza_paciente,peso,talla,imc,HDL,LDL,trigliceridos,med_hipertension,familiar_dm,ant_cardiovascular,dm_gestacional,PERIMETRO_ABDOMINAL,hace_ejercicio,diabetes
97343,0.0,78.0,0.0,8001.0,3.0,0.0,82.0,1.52,35.49169,56.0,122.0,108.0,1.0,0.0,1.0,0.0,90.0,3.0,1.0


--------------------------------------------------------------------------------------------------------------------
La prediccion (en porcentaje) de tener diabetes es de: 19.77%
La prediccion (en categoria) de tener diabetes es de: 0.0
Los valores predichos son:


Unnamed: 0,tipo_identificacion_paciente,edad,genero,codigo_ciudad_sucursal,nivel_academico_paciente,raza_paciente,peso,talla,imc,HDL,LDL,trigliceridos,med_hipertension,familiar_dm,ant_cardiovascular,dm_gestacional,PERIMETRO_ABDOMINAL,hace_ejercicio,diabetes
271276,0.0,70.0,1.0,23001.0,5.0,0.0,80.0,1.7,27.681661,37.5,73.5,96.3,1.0,0.0,1.0,0.0,109.0,2.0,1.0


--------------------------------------------------------------------------------------------------------------------
La prediccion (en porcentaje) de tener diabetes es de: 20.65%
La prediccion (en categoria) de tener diabetes es de: 0.0
Los valores predichos son:


Unnamed: 0,tipo_identificacion_paciente,edad,genero,codigo_ciudad_sucursal,nivel_academico_paciente,raza_paciente,peso,talla,imc,HDL,LDL,trigliceridos,med_hipertension,familiar_dm,ant_cardiovascular,dm_gestacional,PERIMETRO_ABDOMINAL,hace_ejercicio,diabetes
82356,0.0,75.0,0.0,63001.0,0.0,0.0,78.0,1.63,29.357522,44.5,61.3,247.3,1.0,0.0,1.0,0.0,101.0,2.0,0.0


--------------------------------------------------------------------------------------------------------------------
La prediccion (en porcentaje) de tener diabetes es de: 14.49%
La prediccion (en categoria) de tener diabetes es de: 0.0
Los valores predichos son:


Unnamed: 0,tipo_identificacion_paciente,edad,genero,codigo_ciudad_sucursal,nivel_academico_paciente,raza_paciente,peso,talla,imc,HDL,LDL,trigliceridos,med_hipertension,familiar_dm,ant_cardiovascular,dm_gestacional,PERIMETRO_ABDOMINAL,hace_ejercicio,diabetes
134057,0.0,77.0,0.0,52001.0,1.0,1.0,69.0,1.53,29.475843,47.0,118.0,185.0,1.0,0.0,1.0,0.0,93.0,0.0,1.0


## Sample = False // Cluster = True

## Todas las variables


In [37]:
# Columns used for training; the LAST entry ('diabetes') is the target, the rest are features

variables_to_train = ['edad',
                      'genero',
                      'codigo_ciudad_sucursal', # To review
                      'imc',
                      'HDL',
                      'LDL',
                      'trigliceridos',
                      'med_hipertension',
                      'familiar_dm',
                      'ant_cardiovascular',
                      'dm_gestacional',
                      'PERIMETRO_ABDOMINAL',
                      'hace_ejercicio',
                      'nivel_academico_paciente', # To review
                      'raza_paciente', # To review
                      'diabetes']

In [38]:
## Build the training frame

# Work on the bucketed frame when `cluster` is on, otherwise on a copy of the prepared data
df_to_begin = df_cluster if cluster else data.copy()

# Drop rows with a missing value in any training column BEFORE balancing, so that
# (a) the balanced classes stay the same size afterwards, and
# (b) the non-sampled path never hands NaNs to the model (GaussianNB rejects them).
# `dropna` returns a new frame, so `df_cluster` / `data` are left untouched.
df_to_begin = df_to_begin.dropna(subset=variables_to_train)

# Downsample the negative class to the size of the positive class
if sample:
    df_positive = df_to_begin[df_to_begin.diabetes == 1]
    df_negative = df_to_begin[df_to_begin.diabetes == 0]
    # Cap at the negatives available (`sample` raises when asked for more rows than
    # exist) and seed the draw so the reported metrics are reproducible.
    df_sample = df_negative.sample(n=min(len(df_positive), len(df_negative)), random_state=42)
    df = pd.concat([df_positive, df_sample])
else:
    df = df_to_begin

# Keep only the training columns
df_to_train = df[variables_to_train].copy()
df_to_train.head(5)

Unnamed: 0,edad,genero,codigo_ciudad_sucursal,imc,HDL,LDL,trigliceridos,med_hipertension,familiar_dm,ant_cardiovascular,dm_gestacional,PERIMETRO_ABDOMINAL,hace_ejercicio,nivel_academico_paciente,raza_paciente,diabetes
0,2,0.0,11001.0,0,1,0,0,0,0,0,0,0,2,0.0,0.0,0.0
1,2,0.0,54001.0,0,1,0,2,0,0,0,0,0,2,1.0,0.0,0.0
2,3,0.0,15001.0,0,0,2,0,0,0,0,0,0,0,2.0,0.0,0.0
3,2,0.0,8001.0,0,2,1,2,0,0,0,0,2,2,3.0,0.0,1.0
4,1,0.0,19001.0,0,1,1,1,0,0,0,0,0,2,4.0,0.0,0.0


In [45]:
## Entrenamiento del modelo

#Eliminar filas con NaN en X
X_clean = X.dropna()
y_clean = y[X_clean.index]

# Definir X e y para entrenar el modelo
X = df_to_train[variables_to_train[:-1]]
y = df_to_train[variables_to_train[-1]]

print('Datos usados en el entrenamiento')
display(X.head(5))

# Segmentar la data en conjuntos de entrenamiento y de validacion
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

# Crear el modelo Naive Bayes
model_nb = GaussianNB()

# Entrenar el modelo
model_nb.fit(X_train, y_train)

Datos usados en el entrenamiento


Unnamed: 0,edad,genero,imc,HDL,LDL,trigliceridos,med_hipertension,familiar_dm,ant_cardiovascular,dm_gestacional,PERIMETRO_ABDOMINAL,hace_ejercicio
0,2,0.0,0,1,0,0,0,0,0,0,0,2
1,2,0.0,0,1,0,2,0,0,0,0,0,2
2,3,0.0,0,0,2,0,0,0,0,0,0,0
3,2,0.0,0,2,1,2,0,0,0,0,2,2
4,1,0.0,0,1,1,1,0,0,0,0,0,2


In [46]:
## Model validation: accuracy, F1-score and confusion matrix

# Predicted classes for the held-out split
y_pred = model_nb.predict(X_test)

# Reuse `y_pred` for every metric; `model_nb.score` would run the prediction a second time.
# For a classifier, `score` is the accuracy, so the value is unchanged.
metrica = round(accuracy_score(y_test, y_pred)*100,2)
f1 = round(f1_score(y_test, y_pred)*100,2)

# Pin both classes so the matrix is always 2x2, even if one class is absent from the split.
# Rows are the true class, columns are the predicted class.
matriz_confusion = pd.DataFrame(
    confusion_matrix(y_test, y_pred, labels=[0, 1]),
    index=['Negativo','Positivo'],
    columns=['Negativo','Positivo'],
)

# The first figure is accuracy (share of correct predictions), not precision.
print(f'Modelo entrenado con una exactitud (accuracy) de: {metrica}')
print(f'Modelo entrenado con f1 de: {f1}')
print('La matriz de confusion es:')
display(matriz_confusion)

Modelo entrenado con una precision de: 70.3
Modelo entrenado con f1 de: 2.92
La matriz de confusion es:


Unnamed: 0,Negativo,Positivo
Negativo,67433,0
Positivo,28673,431


In [None]:
## Spot-check individual cases

print('Validacion con casos particulares \n')

for i in range(5):
    print('--------------------------------------------------------------------------------------------------------------------')
    # Pick a random test row by its index LABEL
    row = random.choice(list(X_test.index))
    values = X_test.loc[[row]]

    print(f'La prediccion (en porcentaje) de tener diabetes es de: {round(model_nb.predict_proba(values)[0][1]*100,2)}%')
    print(f'La prediccion (en categoria) de tener diabetes es de: {model_nb.predict(values)[0]}')
    print(f'Los valores predichos son:')
    # `row` is an index label inherited from `data`, not a position, so look the
    # record up with `.loc`; `.iloc` only matches when `data` has a default 0..n-1 index.
    display(data.loc[[row]])

## Sin las variables 
código ciudad sucursal, nivel academico y raza paciente

In [40]:
# Columns used for training; the LAST entry ('diabetes') is the target, the rest are features.
# This variant leaves out branch city code, academic level and patient race.

variables_to_train = ['edad',
                      'genero',
                      'imc',
                      'HDL',
                      'LDL',
                      'trigliceridos',
                      'med_hipertension',
                      'familiar_dm',
                      'ant_cardiovascular',
                      'dm_gestacional',
                      'PERIMETRO_ABDOMINAL',
                      'hace_ejercicio',
                      'diabetes']

In [41]:
## Build the training frame

# Work on the bucketed frame when `cluster` is on, otherwise on a copy of the prepared data
df_to_begin = df_cluster if cluster else data.copy()

# Drop rows with a missing value in any training column BEFORE balancing, so that
# (a) the balanced classes stay the same size afterwards, and
# (b) the non-sampled path never hands NaNs to the model (GaussianNB rejects them).
# `dropna` returns a new frame, so `df_cluster` / `data` are left untouched.
df_to_begin = df_to_begin.dropna(subset=variables_to_train)

# Downsample the negative class to the size of the positive class
if sample:
    df_positive = df_to_begin[df_to_begin.diabetes == 1]
    df_negative = df_to_begin[df_to_begin.diabetes == 0]
    # Cap at the negatives available (`sample` raises when asked for more rows than
    # exist) and seed the draw so the reported metrics are reproducible.
    df_sample = df_negative.sample(n=min(len(df_positive), len(df_negative)), random_state=42)
    df = pd.concat([df_positive, df_sample])
else:
    df = df_to_begin

# Keep only the training columns
df_to_train = df[variables_to_train].copy()
df_to_train.head(5)

Unnamed: 0,edad,genero,imc,HDL,LDL,trigliceridos,med_hipertension,familiar_dm,ant_cardiovascular,dm_gestacional,PERIMETRO_ABDOMINAL,hace_ejercicio,diabetes
0,2,0.0,0,1,0,0,0,0,0,0,0,2,0.0
1,2,0.0,0,1,0,2,0,0,0,0,0,2,0.0
2,3,0.0,0,0,2,0,0,0,0,0,0,0,0.0
3,2,0.0,0,2,1,2,0,0,0,0,2,2,1.0
4,1,0.0,0,1,1,1,0,0,0,0,0,2,0.0


In [42]:
## Model training

# Features are every column of `variables_to_train` except the last; the last one is the target
X = df_to_train[variables_to_train[:-1]]
y = df_to_train[variables_to_train[-1]]

print('Datos usados en el entrenamiento')
display(X.head(5))

# Split into train and validation sets (33% held out, fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

# Create the Gaussian Naive Bayes model
model_nb = GaussianNB()

# Fit the model on the training split
model_nb.fit(X_train, y_train)

Datos usados en el entrenamiento


Unnamed: 0,edad,genero,imc,HDL,LDL,trigliceridos,med_hipertension,familiar_dm,ant_cardiovascular,dm_gestacional,PERIMETRO_ABDOMINAL,hace_ejercicio
0,2,0.0,0,1,0,0,0,0,0,0,0,2
1,2,0.0,0,1,0,2,0,0,0,0,0,2
2,3,0.0,0,0,2,0,0,0,0,0,0,0
3,2,0.0,0,2,1,2,0,0,0,0,2,2
4,1,0.0,0,1,1,1,0,0,0,0,0,2


In [43]:
## Model validation: accuracy, F1-score and confusion matrix

# Predicted classes for the held-out split
y_pred = model_nb.predict(X_test)

# Reuse `y_pred` for every metric; `model_nb.score` would run the prediction a second time.
# For a classifier, `score` is the accuracy, so the value is unchanged.
metrica = round(accuracy_score(y_test, y_pred)*100,2)
f1 = round(f1_score(y_test, y_pred)*100,2)

# Pin both classes so the matrix is always 2x2, even if one class is absent from the split.
# Rows are the true class, columns are the predicted class.
matriz_confusion = pd.DataFrame(
    confusion_matrix(y_test, y_pred, labels=[0, 1]),
    index=['Negativo','Positivo'],
    columns=['Negativo','Positivo'],
)

# The first figure is accuracy (share of correct predictions), not precision.
print(f'Modelo entrenado con una exactitud (accuracy) de: {metrica}')
print(f'Modelo entrenado con f1 de: {f1}')
print('La matriz de confusion es:')
display(matriz_confusion)

Modelo entrenado con una precision de: 70.3
Modelo entrenado con f1 de: 2.92
La matriz de confusion es:


Unnamed: 0,Negativo,Positivo
Negativo,67433,0
Positivo,28673,431


In [44]:
## Spot-check individual cases

print('Validacion con casos particulares \n')

for i in range(5):
    print('--------------------------------------------------------------------------------------------------------------------')
    # Pick a random test row by its index LABEL
    row = random.choice(list(X_test.index))
    values = X_test.loc[[row]]

    print(f'La prediccion (en porcentaje) de tener diabetes es de: {round(model_nb.predict_proba(values)[0][1]*100,2)}%')
    print(f'La prediccion (en categoria) de tener diabetes es de: {model_nb.predict(values)[0]}')
    print(f'Los valores predichos son:')
    # `row` is an index label inherited from `data`, not a position, so look the
    # record up with `.loc`; `.iloc` only matches when `data` has a default 0..n-1 index.
    display(data.loc[[row]])

Validacion con casos particulares 

--------------------------------------------------------------------------------------------------------------------
La prediccion (en porcentaje) de tener diabetes es de: 0.01%
La prediccion (en categoria) de tener diabetes es de: 0.0
Los valores predichos son:


Unnamed: 0,tipo_identificacion_paciente,edad,genero,codigo_ciudad_sucursal,nivel_academico_paciente,raza_paciente,peso,talla,imc,HDL,LDL,trigliceridos,med_hipertension,familiar_dm,ant_cardiovascular,dm_gestacional,PERIMETRO_ABDOMINAL,hace_ejercicio,diabetes
179719,0.0,42.0,0.0,76147.0,1.0,0.0,88.0,1.53,37.592379,40.0,102.0,144.0,1.0,0.0,1.0,0.0,100.0,2.0,0.0


--------------------------------------------------------------------------------------------------------------------
La prediccion (en porcentaje) de tener diabetes es de: 0.04%
La prediccion (en categoria) de tener diabetes es de: 0.0
Los valores predichos son:


Unnamed: 0,tipo_identificacion_paciente,edad,genero,codigo_ciudad_sucursal,nivel_academico_paciente,raza_paciente,peso,talla,imc,HDL,LDL,trigliceridos,med_hipertension,familiar_dm,ant_cardiovascular,dm_gestacional,PERIMETRO_ABDOMINAL,hace_ejercicio,diabetes
132118,0.0,71.0,0.0,11001.0,7.0,0.0,71.8,1.48,32.779401,53.0,106.5,154.26,1.0,0.0,1.0,0.0,101.0,2.0,0.0


--------------------------------------------------------------------------------------------------------------------
La prediccion (en porcentaje) de tener diabetes es de: 0.03%
La prediccion (en categoria) de tener diabetes es de: 0.0
Los valores predichos son:


Unnamed: 0,tipo_identificacion_paciente,edad,genero,codigo_ciudad_sucursal,nivel_academico_paciente,raza_paciente,peso,talla,imc,HDL,LDL,trigliceridos,med_hipertension,familiar_dm,ant_cardiovascular,dm_gestacional,PERIMETRO_ABDOMINAL,hace_ejercicio,diabetes
146705,0.0,64.0,1.0,5001.0,5.0,0.0,69.9,1.72,23.627637,32.5,106.0,306.0,1.0,0.0,1.0,0.0,90.0,0.0,1.0


--------------------------------------------------------------------------------------------------------------------
La prediccion (en porcentaje) de tener diabetes es de: 0.0%
La prediccion (en categoria) de tener diabetes es de: 0.0
Los valores predichos son:


Unnamed: 0,tipo_identificacion_paciente,edad,genero,codigo_ciudad_sucursal,nivel_academico_paciente,raza_paciente,peso,talla,imc,HDL,LDL,trigliceridos,med_hipertension,familiar_dm,ant_cardiovascular,dm_gestacional,PERIMETRO_ABDOMINAL,hace_ejercicio,diabetes
48853,0.0,50.0,1.0,5001.0,0.0,0.0,99.0,1.85,28.926224,46.0,178.0,141.0,0.0,0.0,0.0,0.0,109.0,1.0,0.0


--------------------------------------------------------------------------------------------------------------------
La prediccion (en porcentaje) de tener diabetes es de: 0.02%
La prediccion (en categoria) de tener diabetes es de: 0.0
Los valores predichos son:


Unnamed: 0,tipo_identificacion_paciente,edad,genero,codigo_ciudad_sucursal,nivel_academico_paciente,raza_paciente,peso,talla,imc,HDL,LDL,trigliceridos,med_hipertension,familiar_dm,ant_cardiovascular,dm_gestacional,PERIMETRO_ABDOMINAL,hace_ejercicio,diabetes
137553,0.0,78.0,1.0,5001.0,0.0,0.0,76.0,1.72,25.689562,44.9,38.8,61.4,1.0,0.0,1.0,0.0,94.0,1.0,0.0


## Sample = False // Cluster = False

## Todas las variables 

In [59]:
# Columns used for training; the LAST entry ('diabetes') is the target, the rest are features

variables_to_train_all = ['edad',
                      'genero',
                      'codigo_ciudad_sucursal', # To review
                      'imc',
                      'HDL',
                      'LDL',
                      'trigliceridos',
                      'med_hipertension',
                      'familiar_dm',
                      'ant_cardiovascular',
                      'dm_gestacional',
                      'PERIMETRO_ABDOMINAL',
                      'hace_ejercicio',
                      'nivel_academico_paciente', # To review
                      'raza_paciente', # To review
                      'diabetes']

In [60]:
## Build the training frame

# Work on the bucketed frame when `cluster` is on, otherwise on a copy of the prepared data
df_to_begin = df_cluster if cluster else data.copy()

# Drop rows with a missing value in any training column BEFORE balancing, so that
# (a) the balanced classes stay the same size afterwards, and
# (b) the non-sampled path never hands NaNs to the model (GaussianNB rejects them).
# `dropna` returns a new frame, so `df_cluster` / `data` are left untouched.
df_to_begin = df_to_begin.dropna(subset=variables_to_train_all)

# Downsample the negative class to the size of the positive class
if sample:
    df_positive = df_to_begin[df_to_begin.diabetes == 1]
    df_negative = df_to_begin[df_to_begin.diabetes == 0]
    # Cap at the negatives available (`sample` raises when asked for more rows than
    # exist) and seed the draw so the reported metrics are reproducible.
    df_sample = df_negative.sample(n=min(len(df_positive), len(df_negative)), random_state=42)
    df = pd.concat([df_positive, df_sample])
else:
    df = df_to_begin

# Keep only the training columns
df_to_train = df[variables_to_train_all].copy()
df_to_train.head(5)

Unnamed: 0,edad,genero,codigo_ciudad_sucursal,imc,HDL,LDL,trigliceridos,med_hipertension,familiar_dm,ant_cardiovascular,dm_gestacional,PERIMETRO_ABDOMINAL,hace_ejercicio,nivel_academico_paciente,raza_paciente,diabetes
0,64.0,0.0,11001.0,25.6,58.0,87.43,60.64,0.0,0.0,0.0,0.0,78.0,2.0,0.0,0.0,0.0
1,54.0,0.0,54001.0,25.6,57.0,30.0,374.0,0.0,0.0,0.0,0.0,75.0,2.0,1.0,0.0,0.0
2,69.0,0.0,15001.0,25.6,66.0,171.0,91.0,0.0,0.0,0.0,0.0,89.0,1.0,2.0,0.0,0.0
3,57.0,0.0,8001.0,25.6,39.0,143.0,280.0,0.0,0.0,0.0,0.0,95.0,2.0,3.0,0.0,1.0
4,50.0,0.0,19001.0,25.6,59.0,140.0,186.0,0.0,0.0,0.0,0.0,87.0,2.0,4.0,0.0,0.0


In [67]:
# Features are every listed column except the last one; the last one is the target
feature_columns = variables_to_train_all[:-1]
target_column = variables_to_train_all[-1]

# Drop rows with a NaN in any feature and keep the target aligned by index
X_clean = df_to_train[feature_columns].dropna()
y_clean = df_to_train[target_column][X_clean.index]

# From here on X / y refer to the cleaned data
X, y = X_clean, y_clean

print('Datos usados en el entrenamiento')
display(X.head(5))

# Hold out 33% of the rows for validation, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

# Gaussian Naive Bayes classifier
model_nb = GaussianNB()

# Fit on the training split
model_nb.fit(X_train, y_train)

print("Modelo entrenado con éxito")


Datos usados en el entrenamiento


Unnamed: 0,edad,genero,codigo_ciudad_sucursal,imc,HDL,LDL,trigliceridos,med_hipertension,familiar_dm,ant_cardiovascular,dm_gestacional,PERIMETRO_ABDOMINAL,hace_ejercicio,nivel_academico_paciente,raza_paciente
0,64.0,0.0,11001.0,25.6,58.0,87.43,60.64,0.0,0.0,0.0,0.0,78.0,2.0,0.0,0.0
1,54.0,0.0,54001.0,25.6,57.0,30.0,374.0,0.0,0.0,0.0,0.0,75.0,2.0,1.0,0.0
2,69.0,0.0,15001.0,25.6,66.0,171.0,91.0,0.0,0.0,0.0,0.0,89.0,1.0,2.0,0.0
3,57.0,0.0,8001.0,25.6,39.0,143.0,280.0,0.0,0.0,0.0,0.0,95.0,2.0,3.0,0.0
4,50.0,0.0,19001.0,25.6,59.0,140.0,186.0,0.0,0.0,0.0,0.0,87.0,2.0,4.0,0.0


Modelo entrenado con éxito


In [68]:
## Validate the model with accuracy, F1-score and the confusion matrix

# Predicted labels for the validation split
y_pred = model_nb.predict(X_test)

# Metrics, expressed as percentages rounded to two decimals
metrica = round(model_nb.score(X_test, y_test)*100,2)  # classifier score() is mean accuracy
f1 = round(f1_score(y_test, y_pred)*100,2)
# confusion_matrix puts the true class on the rows and the predicted class on the columns
matriz_confusion = pd.DataFrame(confusion_matrix(y_test, y_pred))
matriz_confusion.columns = ['Negativo','Positivo']
matriz_confusion.index = ['Negativo','Positivo']

# The value reported here is accuracy, not precision, so the message names it as such
print(f'Modelo entrenado con una exactitud (accuracy) de: {metrica}')
print(f'Modelo entrenado con f1 de: {f1}')
print('La matriz de confusion es:')
display(matriz_confusion)

Modelo entrenado con una precision de: 69.07
Modelo entrenado con f1 de: 30.05
La matriz de confusion es:


Unnamed: 0,Negativo,Positivo
Negativo,60258,7365
Positivo,22497,6414


In [69]:
## Spot-check individual cases

print('Validacion con casos particulares \n')

for i in range(5):
    print('--------------------------------------------------------------------------------------------------------------------')
    # Pick a random index label from the validation split
    row = random.choice(list(X_test.index))
    values = X_test.loc[[row]]
    meta = y_test.loc[[row]]  # true label of the chosen row (not printed below)
    indice = values.index[0]

    print(f'La prediccion (en porcentaje) de tener diabetes es de: {round(model_nb.predict_proba(values)[0][1]*100,2)}%')
    print(f'La prediccion (en categoria) de tener diabetes es de: {model_nb.predict(values)[0]}')
    print(f'Los valores predichos son:')
    # `indice` is an index label, so the full record is looked up by label (.loc);
    # positional .iloc only matches when the index of `data` is exactly 0..n-1.
    display(data.loc[[indice]])

Validacion con casos particulares 

--------------------------------------------------------------------------------------------------------------------
La prediccion (en porcentaje) de tener diabetes es de: 13.09%
La prediccion (en categoria) de tener diabetes es de: 0.0
Los valores predichos son:


Unnamed: 0,tipo_identificacion_paciente,edad,genero,codigo_ciudad_sucursal,nivel_academico_paciente,raza_paciente,peso,talla,imc,HDL,LDL,trigliceridos,med_hipertension,familiar_dm,ant_cardiovascular,dm_gestacional,PERIMETRO_ABDOMINAL,hace_ejercicio,diabetes
95557,0.0,90.0,0.0,76001.0,4.0,0.0,59.0,1.55,24.557752,71.1,135.02,94.4,1.0,0.0,1.0,0.0,85.0,3.0,1.0


--------------------------------------------------------------------------------------------------------------------
La prediccion (en porcentaje) de tener diabetes es de: 30.24%
La prediccion (en categoria) de tener diabetes es de: 0.0
Los valores predichos son:


Unnamed: 0,tipo_identificacion_paciente,edad,genero,codigo_ciudad_sucursal,nivel_academico_paciente,raza_paciente,peso,talla,imc,HDL,LDL,trigliceridos,med_hipertension,familiar_dm,ant_cardiovascular,dm_gestacional,PERIMETRO_ABDOMINAL,hace_ejercicio,diabetes
285683,0.0,59.0,0.0,52001.0,0.0,1.0,64.8,1.56,26.627219,46.0,159.0,131.0,1.0,0.0,1.0,0.0,84.0,3.0,0.0


--------------------------------------------------------------------------------------------------------------------
La prediccion (en porcentaje) de tener diabetes es de: 44.06%
La prediccion (en categoria) de tener diabetes es de: 0.0
Los valores predichos son:


Unnamed: 0,tipo_identificacion_paciente,edad,genero,codigo_ciudad_sucursal,nivel_academico_paciente,raza_paciente,peso,talla,imc,HDL,LDL,trigliceridos,med_hipertension,familiar_dm,ant_cardiovascular,dm_gestacional,PERIMETRO_ABDOMINAL,hace_ejercicio,diabetes
171533,0.0,64.0,0.0,54001.0,0.0,0.0,90.4,1.65,33.204775,30.0,145.0,195.0,1.0,0.0,1.0,0.0,108.0,3.0,0.0


--------------------------------------------------------------------------------------------------------------------
La prediccion (en porcentaje) de tener diabetes es de: 11.12%
La prediccion (en categoria) de tener diabetes es de: 0.0
Los valores predichos son:


Unnamed: 0,tipo_identificacion_paciente,edad,genero,codigo_ciudad_sucursal,nivel_academico_paciente,raza_paciente,peso,talla,imc,HDL,LDL,trigliceridos,med_hipertension,familiar_dm,ant_cardiovascular,dm_gestacional,PERIMETRO_ABDOMINAL,hace_ejercicio,diabetes
143835,0.0,27.0,1.0,50001.0,0.0,0.0,81.0,1.64,30.116002,42.11,137.83,147.4,0.0,0.0,1.0,0.0,102.0,1.0,0.0


--------------------------------------------------------------------------------------------------------------------
La prediccion (en porcentaje) de tener diabetes es de: 14.45%
La prediccion (en categoria) de tener diabetes es de: 0.0
Los valores predichos son:


Unnamed: 0,tipo_identificacion_paciente,edad,genero,codigo_ciudad_sucursal,nivel_academico_paciente,raza_paciente,peso,talla,imc,HDL,LDL,trigliceridos,med_hipertension,familiar_dm,ant_cardiovascular,dm_gestacional,PERIMETRO_ABDOMINAL,hace_ejercicio,diabetes
56937,0.0,28.0,0.0,8001.0,0.0,0.0,113.5,1.67,40.697049,36.3,87.5,104.0,0.0,0.0,0.0,0.0,115.0,3.0,0.0


## Sin las variables 
código ciudad sucursal, nivel académico y raza paciente

In [70]:
# Columns used for training with the reduced variable set: the branch city
# code, academic level and patient race columns are left out.
# The target column ('diabetes') is kept as the last entry.
variables_to_train_sin = [
    'edad',
    'genero',
    'imc',
    'HDL',
    'LDL',
    'trigliceridos',
    'med_hipertension',
    'familiar_dm',
    'ant_cardiovascular',
    'dm_gestacional',
    'PERIMETRO_ABDOMINAL',
    'hace_ejercicio',
    'diabetes',
]

In [71]:
## Build the training dataset

# NOTE(review): this cell reads the notebook-level `cluster` and `sample`
# flags; confirm they hold the values this section is meant to run with.

# Start from the clustered frame or from a copy of the prepared data
if cluster:
    df_to_begin = df_cluster
else:
    df_to_begin = data.copy()

# Undersample the patients without diabetes to match the number of patients with diabetes
if sample:
    df = df_to_begin[df_to_begin.diabetes == 1]
    # Fixed seed so the undersampled set, and every metric computed from it,
    # is the same on each run (same seed as the train/test split).
    df_sample = df_to_begin[df_to_begin.diabetes == 0].sample(len(df), random_state=42)
    df = pd.concat([df,df_sample]).dropna()
else:
    df = df_to_begin

# Keep only the columns used for training
df_to_train = df[variables_to_train_sin].copy()
df_to_train.head(5)

Unnamed: 0,edad,genero,imc,HDL,LDL,trigliceridos,med_hipertension,familiar_dm,ant_cardiovascular,dm_gestacional,PERIMETRO_ABDOMINAL,hace_ejercicio,diabetes
0,64.0,0.0,25.6,58.0,87.43,60.64,0.0,0.0,0.0,0.0,78.0,2.0,0.0
1,54.0,0.0,25.6,57.0,30.0,374.0,0.0,0.0,0.0,0.0,75.0,2.0,0.0
2,69.0,0.0,25.6,66.0,171.0,91.0,0.0,0.0,0.0,0.0,89.0,1.0,0.0
3,57.0,0.0,25.6,39.0,143.0,280.0,0.0,0.0,0.0,0.0,95.0,2.0,1.0
4,50.0,0.0,25.6,59.0,140.0,186.0,0.0,0.0,0.0,0.0,87.0,2.0,0.0


In [72]:
## Model training

# Features are every listed column except the last one; the last one is the target
X = df_to_train[variables_to_train_sin[:-1]]
y = df_to_train[variables_to_train_sin[-1]]

# Drop rows with a NaN in any feature and keep the target aligned by index label.
# When `sample` is False the consolidation cell does not call dropna(), and
# GaussianNB.fit raises on NaN input.
X = X.dropna()
y = y.loc[X.index]

print('Datos usados en el entrenamiento')
display(X.head(5))

# Hold out 33% of the rows for validation, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

# Gaussian Naive Bayes classifier
model_nb = GaussianNB()

# Fit on the training split
model_nb.fit(X_train, y_train)

Datos usados en el entrenamiento


Unnamed: 0,edad,genero,imc,HDL,LDL,trigliceridos,med_hipertension,familiar_dm,ant_cardiovascular,dm_gestacional,PERIMETRO_ABDOMINAL,hace_ejercicio
0,64.0,0.0,25.6,58.0,87.43,60.64,0.0,0.0,0.0,0.0,78.0,2.0
1,54.0,0.0,25.6,57.0,30.0,374.0,0.0,0.0,0.0,0.0,75.0,2.0
2,69.0,0.0,25.6,66.0,171.0,91.0,0.0,0.0,0.0,0.0,89.0,1.0
3,57.0,0.0,25.6,39.0,143.0,280.0,0.0,0.0,0.0,0.0,95.0,2.0
4,50.0,0.0,25.6,59.0,140.0,186.0,0.0,0.0,0.0,0.0,87.0,2.0


In [73]:
## Validate the model with accuracy, F1-score and the confusion matrix

# Predicted labels for the validation split
y_pred = model_nb.predict(X_test)

# Metrics, expressed as percentages rounded to two decimals
metrica = round(model_nb.score(X_test, y_test)*100,2)  # classifier score() is mean accuracy
f1 = round(f1_score(y_test, y_pred)*100,2)
# confusion_matrix puts the true class on the rows and the predicted class on the columns
matriz_confusion = pd.DataFrame(confusion_matrix(y_test, y_pred))
matriz_confusion.columns = ['Negativo','Positivo']
matriz_confusion.index = ['Negativo','Positivo']

# The value reported here is accuracy, not precision, so the message names it as such
print(f'Modelo entrenado con una exactitud (accuracy) de: {metrica}')
print(f'Modelo entrenado con f1 de: {f1}')
print('La matriz de confusion es:')
display(matriz_confusion)

Modelo entrenado con una precision de: 70.35
Modelo entrenado con f1 de: 5.01
La matriz de confusion es:


Unnamed: 0,Negativo,Positivo
Negativo,67164,346
Positivo,28273,754


In [74]:
## Spot-check individual cases

print('Validacion con casos particulares \n')

for i in range(5):
    print('--------------------------------------------------------------------------------------------------------------------')
    # Pick a random index label from the validation split
    row = random.choice(list(X_test.index))
    values = X_test.loc[[row]]
    meta = y_test.loc[[row]]  # true label of the chosen row (not printed below)
    indice = values.index[0]

    print(f'La prediccion (en porcentaje) de tener diabetes es de: {round(model_nb.predict_proba(values)[0][1]*100,2)}%')
    print(f'La prediccion (en categoria) de tener diabetes es de: {model_nb.predict(values)[0]}')
    print(f'Los valores predichos son:')
    # `indice` is an index label, so the full record is looked up by label (.loc);
    # positional .iloc only matches when the index of `data` is exactly 0..n-1.
    display(data.loc[[indice]])

Validacion con casos particulares 

--------------------------------------------------------------------------------------------------------------------
La prediccion (en porcentaje) de tener diabetes es de: 3.46%
La prediccion (en categoria) de tener diabetes es de: 0.0
Los valores predichos son:


Unnamed: 0,tipo_identificacion_paciente,edad,genero,codigo_ciudad_sucursal,nivel_academico_paciente,raza_paciente,peso,talla,imc,HDL,LDL,trigliceridos,med_hipertension,familiar_dm,ant_cardiovascular,dm_gestacional,PERIMETRO_ABDOMINAL,hace_ejercicio,diabetes
227625,0.0,49.0,0.0,76001.0,0.0,0.0,74.0,1.52,32.029086,48.0,133.0,312.0,1.0,0.0,1.0,0.0,90.0,2.0,0.0


--------------------------------------------------------------------------------------------------------------------
La prediccion (en porcentaje) de tener diabetes es de: 0.02%
La prediccion (en categoria) de tener diabetes es de: 0.0
Los valores predichos son:


Unnamed: 0,tipo_identificacion_paciente,edad,genero,codigo_ciudad_sucursal,nivel_academico_paciente,raza_paciente,peso,talla,imc,HDL,LDL,trigliceridos,med_hipertension,familiar_dm,ant_cardiovascular,dm_gestacional,PERIMETRO_ABDOMINAL,hace_ejercicio,diabetes
62365,0.0,28.0,0.0,11001.0,7.0,0.0,56.0,1.52,24.238227,36.1,58.18,54.39,0.0,0.0,0.0,0.0,76.0,3.0,0.0


--------------------------------------------------------------------------------------------------------------------
La prediccion (en porcentaje) de tener diabetes es de: 0.83%
La prediccion (en categoria) de tener diabetes es de: 0.0
Los valores predichos son:


Unnamed: 0,tipo_identificacion_paciente,edad,genero,codigo_ciudad_sucursal,nivel_academico_paciente,raza_paciente,peso,talla,imc,HDL,LDL,trigliceridos,med_hipertension,familiar_dm,ant_cardiovascular,dm_gestacional,PERIMETRO_ABDOMINAL,hace_ejercicio,diabetes
194557,0.0,80.0,1.0,54001.0,1.0,0.0,53.0,1.59,20.964361,69.8,69.4,89.0,1.0,0.0,1.0,0.0,86.0,3.0,0.0


--------------------------------------------------------------------------------------------------------------------
La prediccion (en porcentaje) de tener diabetes es de: 3.02%
La prediccion (en categoria) de tener diabetes es de: 0.0
Los valores predichos son:


Unnamed: 0,tipo_identificacion_paciente,edad,genero,codigo_ciudad_sucursal,nivel_academico_paciente,raza_paciente,peso,talla,imc,HDL,LDL,trigliceridos,med_hipertension,familiar_dm,ant_cardiovascular,dm_gestacional,PERIMETRO_ABDOMINAL,hace_ejercicio,diabetes
190946,0.0,77.0,1.0,11001.0,0.0,0.0,63.7,1.59,25.196788,35.0,75.0,87.0,1.0,0.0,1.0,0.0,91.0,3.0,0.0


--------------------------------------------------------------------------------------------------------------------
La prediccion (en porcentaje) de tener diabetes es de: 2.13%
La prediccion (en categoria) de tener diabetes es de: 0.0
Los valores predichos son:


Unnamed: 0,tipo_identificacion_paciente,edad,genero,codigo_ciudad_sucursal,nivel_academico_paciente,raza_paciente,peso,talla,imc,HDL,LDL,trigliceridos,med_hipertension,familiar_dm,ant_cardiovascular,dm_gestacional,PERIMETRO_ABDOMINAL,hace_ejercicio,diabetes
136058,0.0,64.0,0.0,19001.0,3.0,0.0,59.4,1.52,25.709834,45.2,113.0,199.6,1.0,0.0,1.0,0.0,86.0,3.0,0.0
