In [15]:
import pickle
import pandas as pd
from google.cloud import bigquery
from google.cloud import storage
import random

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

from params import *
from utilities import *

import warnings
warnings.filterwarnings('ignore')

In [17]:
# Google Cloud Storage client shared by the notebook; load_to_gcs() reads this
# module-level name when it uploads the model.
client_storage = storage.Client()
# Bare expression so the notebook renders the client's repr as the cell output.
client_storage

<google.cloud.storage.client.Client at 0x7f171ef24d90>

In [2]:
# Pull the first configured table and run it through the project's preparation
# step. `tables`, `client_bq`, `bring_data_from_bq` and `prepare_data` are not
# defined in this notebook -- presumably they come from the star imports of
# `params` / `utilities` (TODO confirm).
data_original = bring_data_from_bq(
    table=tables[0],
    client_bq=client_bq,
    save=True,
    read_local=True,
)
data = prepare_data(data_original)

# Quick visual check of the prepared frame.
data.head(5)

Leyendo datos de la tabla: diabetes


Unnamed: 0,tipo_identificacion_paciente,edad,genero,codigo_ciudad_sucursal,nivel_academico_paciente,raza_paciente,peso,talla,imc,HDL,LDL,trigliceridos,med_hipertension,familiar_dm,ant_cardiovascular,dm_gestacional,PERIMETRO_ABDOMINAL,hace_ejercicio,diabetes
0,0.0,64.0,0.0,11001.0,0.0,0.0,57.6,1.5,25.6,58.0,87.43,60.64,0.0,0.0,0.0,0.0,78.0,2.0,0.0
1,0.0,54.0,0.0,54001.0,1.0,0.0,57.6,1.5,25.6,57.0,30.0,374.0,0.0,0.0,0.0,0.0,75.0,2.0,0.0
2,0.0,69.0,0.0,15001.0,2.0,0.0,57.6,1.5,25.6,66.0,171.0,91.0,0.0,0.0,0.0,0.0,89.0,1.0,0.0
3,0.0,57.0,0.0,8001.0,3.0,0.0,57.6,1.5,25.6,39.0,143.0,280.0,0.0,0.0,0.0,0.0,95.0,2.0,1.0
4,0.0,50.0,0.0,19001.0,4.0,0.0,57.6,1.5,25.6,59.0,140.0,186.0,0.0,0.0,0.0,0.0,87.0,2.0,0.0


In [27]:
# Columns kept for modelling. Order matters: the training cell slices
# `variables_to_train[:-1]` as the feature matrix and `variables_to_train[-1]`
# as the label, so the target column must stay last.
variables_to_train = [
    'edad', 'genero', 'codigo_ciudad_sucursal',
    'imc', 'HDL', 'LDL', 'trigliceridos',
    'med_hipertension', 'familiar_dm', 'ant_cardiovascular', 'dm_gestacional',
    'PERIMETRO_ABDOMINAL', 'hace_ejercicio',
    'nivel_academico_paciente', 'raza_paciente',
    'diabetes',  # target
]

In [31]:
# Train and evaluate a random-forest diabetes classifier on a class-balanced
# subset of `data`, persist it to disk, and spot-check a few test rows.

# Single seed reused for every stochastic step (negative-class sampling, the
# forest itself and the spot-check picks) so a re-run reproduces the metrics.
random_state = 42

# Balance the classes: keep every positive row and draw the same number of
# negative rows at random.
df = data[data.diabetes == 1]

sample = data[data.diabetes == 0].sample(len(df), random_state=random_state)

# NOTE(review): dropna() runs on ALL columns (not only the training ones) and
# after the balancing step, so the two classes may end up slightly uneven.
df = pd.concat([df,sample]).dropna()

df_to_train = df[variables_to_train].copy()

# By convention the last entry of `variables_to_train` is the target column.
X = df_to_train[variables_to_train[:-1]]
y = df_to_train[variables_to_train[-1]]

print('Datos usados en el entrenamiento')
display(X.head(5))

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

# max_features='sqrt' is what the former 'auto' alias meant for classifiers;
# 'auto' itself is rejected by current scikit-learn releases.
model = RandomForestClassifier(n_estimators=200, min_samples_split = 5, min_samples_leaf = 4, max_features = 'sqrt', max_depth = 20, bootstrap = True, random_state = random_state)
display(X_train)
display(y_train)
model.fit(X_train, y_train)

# Persist the fitted model; the context manager guarantees the file handle is
# flushed and closed before the file is uploaded elsewhere.
filename = 'diabetes_avicena.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

y_pred = model.predict(X_test)

# model.score() on a classifier returns mean accuracy (reported as a percentage).
metrica = round(model.score(X_test, y_test)*100,2)
f1 = f1_score(y_test, y_pred)

print(f'Modelo entrenado con una precision de: {metrica}')
print(f'Modelo entrenado con f1 de: {f1}')
print('La matriz de confusion es:')
display(pd.DataFrame(confusion_matrix(y_test, y_pred)))

print('Validacion con casos particulares \n')

# Local RNG so the spot-check is reproducible without touching the global
# `random` state; the index list is built once instead of on every iteration.
example_rng = random.Random(random_state)
test_index = list(X_test.index)

for i in range(5):
    print('--------------------------------------------------------------------------------------------------------------------')
    row = example_rng.choice(test_index)
    values = X_test.loc[[row]]

    print(f'La prediccion (en porcentaje) de tener diabetes es de: {round(model.predict_proba(values)[0][1]*100,2)}%')
    print(f'La prediccion (en categoria) de tener diabetes es de: {model.predict(values)[0]}')
    print(f'Los valores predichos son:')
    # `row` is an index LABEL taken from X_test, so look it up with .loc;
    # .iloc would treat it as a position and could show a different patient
    # whenever `data` does not carry a default 0..n-1 index.
    display(data.loc[[row]])

Datos usados en el entrenamiento


Unnamed: 0,edad,genero,codigo_ciudad_sucursal,imc,HDL,LDL,trigliceridos,med_hipertension,familiar_dm,ant_cardiovascular,dm_gestacional,PERIMETRO_ABDOMINAL,hace_ejercicio,nivel_academico_paciente,raza_paciente
3,57.0,0.0,8001.0,25.6,39.0,143.0,280.0,0.0,0.0,0.0,0.0,95.0,2.0,3.0,0.0
11,43.0,0.0,50001.0,35.785147,44.92,121.0,321.0,0.0,0.0,0.0,1.0,126.0,3.0,0.0,0.0
21,32.0,0.0,11001.0,18.961927,81.0,83.0,34.0,0.0,0.0,0.0,1.0,74.0,2.0,3.0,0.0
26,40.0,0.0,20001.0,27.2,51.0,125.0,87.0,0.0,0.0,0.0,0.0,70.0,3.0,4.0,0.0
31,40.0,0.0,20001.0,27.2,51.0,125.0,87.0,0.0,0.0,0.0,0.0,70.0,1.0,4.0,0.0


Unnamed: 0,edad,genero,codigo_ciudad_sucursal,imc,HDL,LDL,trigliceridos,med_hipertension,familiar_dm,ant_cardiovascular,dm_gestacional,PERIMETRO_ABDOMINAL,hace_ejercicio,nivel_academico_paciente,raza_paciente
183350,91.0,0.0,5001.0,30.381944,30.0,116.0,140.0,1.0,0.0,1.0,0.0,88.0,3.0,0.0,0.0
282359,83.0,0.0,52356.0,22.832879,78.0,52.0,97.0,1.0,0.0,1.0,0.0,90.0,2.0,3.0,0.0
10562,47.0,0.0,8001.0,23.875115,65.0,72.0,35.0,1.0,0.0,0.0,0.0,95.0,3.0,3.0,0.0
115344,72.0,0.0,5001.0,22.959088,71.4,58.9,112.0,1.0,0.0,1.0,0.0,88.0,1.0,0.0,0.0
209887,60.0,1.0,15001.0,24.979592,44.7,139.1,186.4,1.0,0.0,1.0,0.0,96.0,2.0,11.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
228012,66.0,1.0,52001.0,24.158818,47.0,2.6,147.0,1.0,0.0,1.0,0.0,89.0,0.0,11.0,1.0
65962,33.0,0.0,19001.0,22.633745,44.0,89.0,64.0,0.0,0.0,0.0,0.0,75.0,3.0,9.0,0.0
56367,36.0,0.0,25817.0,32.672758,55.0,83.4,88.0,1.0,0.0,0.0,0.0,102.0,0.0,3.0,0.0
249614,28.0,1.0,50001.0,42.323089,42.0,115.0,104.0,1.0,0.0,1.0,0.0,121.0,0.0,0.0,0.0


183350    0.0
282359    1.0
10562     0.0
115344    1.0
209887    1.0
         ... 
228012    0.0
65962     0.0
56367     0.0
249614    0.0
173915    0.0
Name: diabetes, Length: 118194, dtype: float64

Modelo entrenado con una precision de: 69.94
Modelo entrenado con f1 de: 0.7109878133359754
La matriz de confusion es:


Unnamed: 0,0,1
0,19185,9972
1,7530,21528


Validacion con casos particulares 

--------------------------------------------------------------------------------------------------------------------
La prediccion (en porcentaje) de tener diabetes es de: 14.88%
La prediccion (en categoria) de tener diabetes es de: 0.0
Los valores predichos son:


Unnamed: 0,tipo_identificacion_paciente,edad,genero,codigo_ciudad_sucursal,nivel_academico_paciente,raza_paciente,peso,talla,imc,HDL,LDL,trigliceridos,med_hipertension,familiar_dm,ant_cardiovascular,dm_gestacional,PERIMETRO_ABDOMINAL,hace_ejercicio,diabetes
65955,0.0,45.0,1.0,19001.0,1.0,1.0,61.4,1.6,23.984375,31.8,112.0,235.0,0.0,0.0,0.0,0.0,86.0,2.0,0.0


--------------------------------------------------------------------------------------------------------------------
La prediccion (en porcentaje) de tener diabetes es de: 29.64%
La prediccion (en categoria) de tener diabetes es de: 0.0
Los valores predichos son:


Unnamed: 0,tipo_identificacion_paciente,edad,genero,codigo_ciudad_sucursal,nivel_academico_paciente,raza_paciente,peso,talla,imc,HDL,LDL,trigliceridos,med_hipertension,familiar_dm,ant_cardiovascular,dm_gestacional,PERIMETRO_ABDOMINAL,hace_ejercicio,diabetes
26542,0.0,24.0,0.0,54001.0,5.0,0.0,47.2,1.62,17.985063,43.7,22.1,785.0,0.0,0.0,0.0,0.0,76.0,2.0,1.0


--------------------------------------------------------------------------------------------------------------------
La prediccion (en porcentaje) de tener diabetes es de: 33.15%
La prediccion (en categoria) de tener diabetes es de: 0.0
Los valores predichos son:


Unnamed: 0,tipo_identificacion_paciente,edad,genero,codigo_ciudad_sucursal,nivel_academico_paciente,raza_paciente,peso,talla,imc,HDL,LDL,trigliceridos,med_hipertension,familiar_dm,ant_cardiovascular,dm_gestacional,PERIMETRO_ABDOMINAL,hace_ejercicio,diabetes
216146,0.0,63.0,0.0,52001.0,4.0,1.0,62.5,1.6,24.414062,38.0,246.0,99.0,1.0,0.0,1.0,0.0,76.0,3.0,0.0


--------------------------------------------------------------------------------------------------------------------
La prediccion (en porcentaje) de tener diabetes es de: 72.16%
La prediccion (en categoria) de tener diabetes es de: 1.0
Los valores predichos son:


Unnamed: 0,tipo_identificacion_paciente,edad,genero,codigo_ciudad_sucursal,nivel_academico_paciente,raza_paciente,peso,talla,imc,HDL,LDL,trigliceridos,med_hipertension,familiar_dm,ant_cardiovascular,dm_gestacional,PERIMETRO_ABDOMINAL,hace_ejercicio,diabetes
244617,0.0,59.0,1.0,76001.0,0.0,0.0,98.8,1.64,36.734087,37.0,122.0,202.0,1.0,0.0,1.0,0.0,120.0,3.0,1.0


--------------------------------------------------------------------------------------------------------------------
La prediccion (en porcentaje) de tener diabetes es de: 43.72%
La prediccion (en categoria) de tener diabetes es de: 0.0
Los valores predichos son:


Unnamed: 0,tipo_identificacion_paciente,edad,genero,codigo_ciudad_sucursal,nivel_academico_paciente,raza_paciente,peso,talla,imc,HDL,LDL,trigliceridos,med_hipertension,familiar_dm,ant_cardiovascular,dm_gestacional,PERIMETRO_ABDOMINAL,hace_ejercicio,diabetes
181299,0.0,74.0,1.0,23001.0,4.0,1.0,66.0,1.72,22.309356,45.9,174.82,129.9,1.0,0.0,1.0,0.0,91.0,2.0,0.0


In [19]:
def load_to_gcs(file, bucket_name, destination_blob_name):
    """Upload a local file to a Google Cloud Storage bucket.

    Args:
        file: Path of the local file to upload. For backward compatibility
            with callers that pass a placeholder value, anything that is not
            the path of an existing file falls back to the legacy hard-coded
            'diabetes_avicena.pkl' in the working directory.
        bucket_name: Name of the destination bucket.
        destination_blob_name: Object name (path) of the blob inside the bucket.

    Uses the module-level `client_storage` client and prints the destination
    before uploading.
    """
    import os  # local import: only needed to resolve the source path

    legacy_path = 'diabetes_avicena.pkl'
    # Previously `file` was ignored entirely; honour it when it is a real file.
    if isinstance(file, (str, os.PathLike)) and os.path.isfile(file):
        source_path = os.fspath(file)
    else:
        source_path = legacy_path

    storage_client = client_storage
    bucket = storage_client.bucket(bucket_name)
    print(f'Saving at {destination_blob_name}')
    blob = bucket.blob(destination_blob_name)
    blob.upload_from_filename(source_path)

In [20]:
# Upload the pickled model written by the training cell. Pass the real local
# path (`filename`, set in the training cell) instead of the placeholder 'file'.
load_to_gcs(filename, 'co-keralty-models', 'portafolio/cds/pred_diagnostico/diabetes/diabetes_avicena/diabetes_avicena.pkl')

Saving at portafolio/cds/pred_diagnostico/diabetes/diabetes_avicena/diabetes_avicena.pkl


In [21]:
# Display the held-out feature frame produced by train_test_split in the training cell.
X_test

Unnamed: 0,edad,genero,codigo_ciudad_sucursal,imc,HDL,LDL,trigliceridos,med_hipertension,familiar_dm,ant_cardiovascular,dm_gestacional,PERIMETRO_ABDOMINAL,hace_ejercicio,nivel_academico_paciente,raza_paciente
124960,54.0,1.0,11001.0,28.316405,38.6,45.0,116.0,1.0,0.0,1.0,0.0,92.0,2.0,3.0,0.0
257253,70.0,0.0,47001.0,35.261708,48.0,181.0,147.0,1.0,0.0,1.0,0.0,104.0,3.0,5.0,0.0
230298,77.0,1.0,11001.0,26.813590,48.4,65.5,79.4,0.0,0.0,1.0,0.0,101.0,2.0,2.0,0.0
190467,65.0,0.0,41551.0,38.687584,44.0,164.0,243.0,1.0,0.0,1.0,0.0,103.0,2.0,0.0,0.0
1636,76.0,0.0,50001.0,25.109569,43.8,81.2,151.3,0.0,0.0,0.0,0.0,84.0,3.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
218624,57.0,0.0,19001.0,28.946099,51.9,77.5,83.0,1.0,0.0,1.0,0.0,85.0,0.0,0.0,0.0
111195,65.0,1.0,8001.0,31.522112,42.0,107.0,155.0,1.0,0.0,1.0,0.0,106.0,3.0,2.0,0.0
118292,78.0,0.0,8001.0,29.515939,55.0,158.0,200.0,1.0,0.0,1.0,0.0,84.0,3.0,4.0,0.0
191061,80.0,0.0,68276.0,27.095447,47.0,45.0,108.0,1.0,0.0,1.0,0.0,100.0,1.0,0.0,0.0
