# **Reporte del Modelo Baseline**
---

In [1]:
# Librerias
!pip install optuna
import os
import requests
import logging
import optuna
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

Collecting optuna
  Downloading optuna-4.1.0-py3-none-any.whl.metadata (16 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.14.0-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.8-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.1.0-py3-none-any.whl (364 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m364.4/364.4 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.14.0-py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.5/233.5 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Downloading Mako-1.3.8-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Mak

In [2]:
# Funcion para crear el objeto de creación de logs
def create_logger():
  logging.basicConfig(level = logging.INFO, format = '%(asctime)s - %(levenname)s - %(message)s')
  logger = logging.getLogger('Logger')
  logger.info('Logger creado')
  return logger

In [3]:
# Función de consumo a la base de datos de Firebase para cargar el DataSet
def download_firebase(url, logger):
  logger.info("Extrayendo el archivo desde Firebase")
  df = None
  try:
    df = pd.read_csv(url)
    logger.info("Archivo cargado")
  except requests.exceptions.RequestException as e:
    logger.info(f"Error al descargar el archivo CSV: {e}")
  except pd.errors.EmptyDataError:
    logger.info("El archivo CSV está vacío.")
  except Exception as e:
    logger.info(f"Ocurrió un error inesperado: {e}")
  return df

In [4]:
# Cargar DataSet
url = 'https://firebasestorage.googleapis.com/v0/b/personalwp-8822c.appspot.com/o/diabetes_prediction_dataset.csv?alt=media&token=4d70d154-c3d0-4fa0-a3aa-9b9972dd3b95'
logger = create_logger()
df = download_firebase(url, logger)

In [5]:
# Eliminación de valores atípicos y duplicados
seventy_fifth = df['bmi'].quantile(0.75)
twenty_fifth = df['bmi'].quantile(0.25)
iqr = seventy_fifth - twenty_fifth
upper = seventy_fifth + (10 * iqr)
outliers_bmi_upper = df[(df['bmi'] > upper)]
df = pd.merge(df, outliers_bmi_upper, indicator = True, how = 'outer').query('_merge == "left_only"').drop('_merge', axis = 1)
df = df.drop_duplicates(keep = "first")

In [6]:
# Variables Categóricas a Numéricas
df['gender'] = pd.factorize(df['gender'])[0]
df['smoking_history'] = pd.factorize(df['smoking_history'])[0]

## **Implementación del Modelo**

### **Partición de Datos**

In [7]:
# Separación de la 'Data' (Características)
X = df.drop(columns = 'diabetes')
X.shape

(96143, 8)

In [8]:
# Separación del 'Target' (Variable objetivo)
y = df['diabetes']
y.shape

(96143,)

In [9]:
# Partición de los datos: 70% para entrenamiento, 30% para prueba y estratificación en las etiquetas (y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42, stratify = y)

In [10]:
# Validación de la partición de los datos
print(f'Número de muestras en entrenamiento: {X_train.shape[0]}')
print(f'Número de muestras en prueba: {X_test.shape[0]}')
print(f'Número de características: {X_train.shape[1]}')

Número de muestras en entrenamiento: 67300
Número de muestras en prueba: 28843
Número de características: 8


In [11]:
# Distribución de la variable objetivo en los conjuntos de entrenamiento y prueba
print(f'Distribución de clases en entrenamiento: {np.bincount(y_train)}')
print(f'Distribución de clases en prueba: {np.bincount(y_test)}')

Distribución de clases en entrenamiento: [61363  5937]
Distribución de clases en prueba: [26298  2545]


### **Entrenamiento del Modelo**

In [12]:
# Función para la definición y entrenamiento de hiperparámetros a explorar
def objective(trial):
  max_depth = trial.suggest_int("max_depth", 1, 100)
  model = DecisionTreeClassifier(max_depth = max_depth)
  model.fit(X_train, y_train)
  y_pred = model.predict(X_test)
  score = accuracy_score(y_test, y_pred)
  return score

In [13]:
# Exploración de hiperparámetros
study = optuna.create_study(direction = "maximize", storage = "sqlite:///hp.db", study_name = "DecisionTreeClassifier")
study.optimize(func = objective, n_trials = 100, n_jobs = -1)

[I 2024-12-11 08:53:33,844] A new study created in RDB with name: KNeighborsClassifier
[I 2024-12-11 08:53:34,290] Trial 0 finished with value: 0.9709114863225046 and parameters: {'max_depth': 6}. Best is trial 0 with value: 0.9709114863225046.
[I 2024-12-11 08:53:34,430] Trial 1 finished with value: 0.9516693825191554 and parameters: {'max_depth': 95}. Best is trial 0 with value: 0.9709114863225046.
[I 2024-12-11 08:53:34,857] Trial 3 finished with value: 0.9709114863225046 and parameters: {'max_depth': 6}. Best is trial 0 with value: 0.9709114863225046.
[I 2024-12-11 08:53:34,968] Trial 2 finished with value: 0.9516000416045488 and parameters: {'max_depth': 36}. Best is trial 0 with value: 0.9709114863225046.
[I 2024-12-11 08:53:35,421] Trial 4 finished with value: 0.9517387234337621 and parameters: {'max_depth': 97}. Best is trial 0 with value: 0.9709114863225046.
[I 2024-12-11 08:53:35,551] Trial 5 finished with value: 0.9514266893180321 and parameters: {'max_depth': 96}. Best is t

In [14]:
# Mejor hiperparámetro
params = study.best_params
print(params)

{'max_depth': 6}


In [15]:
# Mejor metrica
score = study.best_value
print(score)

0.9709114863225046


**Universidad Nacional de Colombia** - *Facultad de Ingeniería*