# **Reporte del Modelo Baseline**
---

In [1]:
# Librerías
import os
import pandas as pd
import requests
import logging
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Build the logger object used throughout the notebook
def create_logger():
  """Configure root logging at INFO level and return the logger named 'Logger'.

  Records are formatted as '<timestamp> - <level name> - <message>'.
  Note that logging.basicConfig does nothing when the root logger already
  has handlers, so in that case the existing configuration stays in effect.

  Side effects:
    Emits one INFO record ('Logger creado') through the returned logger.

  Returns:
    logging.Logger: the logger registered under the name 'Logger'.
  """
  # The field name must be 'levelname'; the previous spelling 'levenname'
  # is not a LogRecord attribute and made every record fail to format.
  logging.basicConfig(level = logging.INFO, format = '%(asctime)s - %(levelname)s - %(message)s')
  logger = logging.getLogger('Logger')
  logger.info('Logger creado')
  return logger

In [3]:
# Load the dataset CSV hosted on Firebase Storage into a DataFrame
def download_firebase(url, logger):
  """Read the CSV at `url` with pandas and return it, or None on failure.

  Failures are logged at ERROR level and swallowed, so callers must check
  the result for None before using it.

  Args:
    url: location passed straight to pd.read_csv (URL, path or file-like).
    logger: object exposing `info` and `error` logging methods.

  Returns:
    pandas.DataFrame with the parsed CSV, or None if it could not be read.
  """
  logger.info("Extrayendo el archivo desde Firebase")
  try:
    df = pd.read_csv(url)
  except pd.errors.EmptyDataError:
    logger.error("El archivo CSV está vacío.")
  except OSError as e:
    # pd.read_csv does not go through `requests`, so download and I/O
    # failures surface as OSError subclasses (URLError, HTTPError,
    # timeouts, missing files) rather than RequestException.
    logger.error(f"Error al descargar el archivo CSV: {e}")
  except Exception as e:
    # Deliberate best-effort: anything else (e.g. parse errors) is reported
    # and turned into a None result instead of propagating.
    logger.error(f"Ocurrió un error inesperado: {e}")
  else:
    logger.info("Archivo cargado")
    return df
  return None

In [4]:
# Load the dataset
# NOTE(review): the URL embeds a Firebase download token; if the file is not
# meant to be public, read the URL from configuration instead of hardcoding it.
url = 'https://firebasestorage.googleapis.com/v0/b/personalwp-8822c.appspot.com/o/diabetes_prediction_dataset.csv?alt=media&token=4d70d154-c3d0-4fa0-a3aa-9b9972dd3b95'
logger = create_logger()
df = download_firebase(url, logger)
# download_firebase returns None when the CSV cannot be read; fail here with a
# clear message instead of letting the next cell crash on `df['bmi']`.
if df is None:
  raise RuntimeError("No se pudo cargar el DataSet desde Firebase; revise los mensajes de log anteriores.")

In [5]:
# Remove extreme 'bmi' outliers, then drop duplicated rows.
# Only an upper fence is applied: Q3 + 10 * IQR of 'bmi'.
# NOTE(review): the multiplier 10 is much wider than the conventional 1.5 — confirm it is intentional.
seventy_fifth = df['bmi'].quantile(0.75)
twenty_fifth = df['bmi'].quantile(0.25)
iqr = seventy_fifth - twenty_fifth
upper = seventy_fifth + (10 * iqr)
# Rows whose 'bmi' is strictly above the upper fence.
outliers_bmi_upper = df[(df['bmi'] > upper)]
# Anti-join: outer-merge df with its own outlier rows (no `on` is given, so the
# merge uses all shared columns) and keep only the rows flagged 'left_only',
# i.e. the rows that are not outliers; the helper '_merge' column is then dropped.
# NOTE(review): the merge produces a new index and, per the pandas docs for
# how='outer', may sort rows by the join keys. The pd.factorize encodings applied
# later assign codes by order of appearance, so verify row order before replacing
# this with a plain boolean mask.
df = pd.merge(df, outliers_bmi_upper, indicator = True, how = 'outer').query('_merge == "left_only"').drop('_merge', axis = 1)
# Keep the first occurrence of every fully duplicated row.
df = df.drop_duplicates(keep = "first")

## **Extracción de Características**

### **Variables Categóricas a Numéricas**

In [18]:
# Dataset before encoding the categorical columns (first five rows)
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,0.08,0,0,No Info,11.88,5.7,80,0
1,Female,0.08,0,0,No Info,12.22,3.5,130,0
2,Female,0.08,0,0,No Info,12.5,4.5,155,0
3,Female,0.08,0,0,No Info,12.74,3.5,140,0
4,Female,0.08,0,0,No Info,12.77,4.5,160,0


In [19]:
# Encode 'gender' as integer codes; pd.factorize numbers the categories in
# order of first appearance and also returns the matching labels.
gender_codes, _gender_labels = pd.factorize(df['gender'])
df['gender'] = gender_codes

In [20]:
# Encode 'smoking_history' as integer codes; pd.factorize numbers the
# categories in order of first appearance and also returns the matching labels.
smoking_codes, _smoking_labels = pd.factorize(df['smoking_history'])
df['smoking_history'] = smoking_codes

In [21]:
# Dataset after encoding 'gender' and 'smoking_history' (first five rows)
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,0,0.08,0,0,0,11.88,5.7,80,0
1,0,0.08,0,0,0,12.22,3.5,130,0
2,0,0.08,0,0,0,12.5,4.5,155,0
3,0,0.08,0,0,0,12.74,3.5,140,0
4,0,0.08,0,0,0,12.77,4.5,160,0


**Universidad Nacional de Colombia** - *Facultad de Ingeniería*