# **Diccionario de Datos**
---

In [1]:
# Librerías
import logging
import os
import urllib.error

import matplotlib.pyplot as plt
import pandas as pd
import requests

In [2]:
# Factory for the logger used throughout the notebook
def create_logger():
  """Configure root logging at INFO level and return the 'Logger' logger.

  Returns:
    logging.Logger: the logger named 'Logger'. An informational message is
    emitted through it before it is returned.
  """
  # The LogRecord attribute is spelled `levelname`. A misspelled key makes
  # every record fail to format, so the handler reports a logging error
  # instead of printing the message.
  logging.basicConfig(level = logging.INFO, format = '%(asctime)s - %(levelname)s - %(message)s')
  logger = logging.getLogger('Logger')
  logger.info('Logger creado')
  return logger

In [3]:
# Load the dataset CSV hosted on Firebase Storage into a DataFrame
def download_firebase(url, logger):
  """Read the CSV at `url` into a DataFrame, logging the outcome.

  Args:
    url: location handed straight to `pd.read_csv` (URL or local path).
    logger: object exposing `info`, `error` and `exception` logging methods
      (e.g. a `logging.Logger`).

  Returns:
    The loaded `pandas.DataFrame`, or `None` when loading failed. Failures
    are logged and never propagate to the caller.
  """
  logger.info("Extrayendo el archivo desde Firebase")
  df = None
  try:
    df = pd.read_csv(url)
    logger.info("Archivo cargado")
  except urllib.error.URLError as e:
    # pandas fetches URLs through urllib, so network/HTTP failures surface as
    # URLError (HTTPError is a subclass), not as `requests` exceptions.
    logger.error(f"Error al descargar el archivo CSV: {e}")
  except pd.errors.EmptyDataError:
    logger.error("El archivo CSV está vacío.")
  except Exception as e:
    # Best-effort contract: report with traceback, then return None below.
    logger.exception(f"Ocurrió un error inesperado: {e}")
  return df

In [4]:
# Load the dataset
# NOTE(review): the URL embeds an access token in its query string; confirm it is meant to be public.
url = 'https://firebasestorage.googleapis.com/v0/b/personalwp-8822c.appspot.com/o/diabetes_prediction_dataset.csv?alt=media&token=4d70d154-c3d0-4fa0-a3aa-9b9972dd3b95'
logger = create_logger()
df = download_firebase(url, logger)
# download_firebase returns None when loading fails. Stop here with a clear
# message instead of letting later cells fail with an AttributeError on None.
if df is None:
  raise RuntimeError("No se pudo cargar el DataSet; revise los mensajes del log.")

## **Conjunto de Datos**

In [5]:
# Dataset overview: prints the index range, each column's non-null count and dtype, and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   gender               100000 non-null  object 
 1   age                  100000 non-null  float64
 2   hypertension         100000 non-null  int64  
 3   heart_disease        100000 non-null  int64  
 4   smoking_history      100000 non-null  object 
 5   bmi                  100000 non-null  float64
 6   HbA1c_level          100000 non-null  float64
 7   blood_glucose_level  100000 non-null  int64  
 8   diabetes             100000 non-null  int64  
dtypes: float64(3), int64(4), object(2)
memory usage: 6.9+ MB


In [13]:
# Distinct values of 'gender' (categorical variable), in order of first appearance
gender = pd.unique(df['gender'])
gender

array(['Female', 'Male', 'Other'], dtype=object)

In [19]:
# Summary of 'age' (numeric variable): number of distinct values, minimum and maximum.
# Note that `age` holds the distinct-value count, not the column itself.
age = df['age'].nunique()
age_min, age_max = df['age'].min(), df['age'].max()
print(age, age_min, age_max, sep='\n')

102
0.08
80.0


In [20]:
# Distinct values of 'hypertension' (categorical variable stored as integers)
hypertension = pd.unique(df['hypertension'])
hypertension

array([0, 1])

In [21]:
# Distinct values of 'heart_disease' (categorical variable stored as integers)
heart_disease = pd.unique(df['heart_disease'])
heart_disease

array([1, 0])

In [22]:
# Distinct values of 'smoking_history' (categorical variable), in order of first appearance
smoking_history = pd.unique(df['smoking_history'])
smoking_history

array(['never', 'No Info', 'current', 'former', 'ever', 'not current'],
      dtype=object)

In [24]:
# Summary of 'bmi' (numeric variable): number of distinct values, minimum and maximum.
# Note that `bmi` holds the distinct-value count, not the column itself.
bmi = df['bmi'].nunique()
bmi_min, bmi_max = df['bmi'].min(), df['bmi'].max()
print(bmi, bmi_min, bmi_max, sep='\n')

4247
10.01
95.69


In [25]:
# Summary of 'HbA1c_level' (numeric variable): number of distinct values, minimum and maximum.
# Note that `HbA1c_level` holds the distinct-value count, not the column itself.
HbA1c_level = df['HbA1c_level'].nunique()
HbA1c_level_min, HbA1c_level_max = df['HbA1c_level'].min(), df['HbA1c_level'].max()
print(HbA1c_level, HbA1c_level_min, HbA1c_level_max, sep='\n')

18
3.5
9.0


In [26]:
# Summary of 'blood_glucose_level' (numeric variable): number of distinct values, minimum and maximum.
# Note that `blood_glucose_level` holds the distinct-value count, not the column itself.
blood_glucose_level = df['blood_glucose_level'].nunique()
blood_glucose_level_min, blood_glucose_level_max = (
  df['blood_glucose_level'].min(),
  df['blood_glucose_level'].max(),
)
print(blood_glucose_level, blood_glucose_level_min, blood_glucose_level_max, sep='\n')

18
80
300


In [27]:
# Distinct values of 'diabetes' (categorical variable stored as integers)
diabetes = pd.unique(df['diabetes'])
diabetes

array([0, 1])

**Universidad Nacional de Colombia** - *Facultad de Ingeniería*