In [15]:
#==============================================
# Importación de Librerias
#==============================================
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier

In [14]:
#==============================================
# Dataset loading
#==============================================
print("Cargando datos...")
# Location of the diabetes CSV, named once so it is easy to spot and repoint
ruta_csv = '/content/drive/MyDrive/Promo/diabetes.csv'
diabetes = pd.read_csv(ruta_csv)

# Preview the first 10 rows as plain text (index column omitted) to get a
# feel for the structure of the dataset
print("\nTop 10 registros del dataset:")
primeras_filas = diabetes.head(10)
print(primeras_filas.to_string(index=False))

Cargando datos...

Top 10 registros del dataset:
 PatientID  Pregnancies  PlasmaGlucose  DiastolicBloodPressure  TricepsThickness  SerumInsulin       BMI  DiabetesPedigree  Age  Diabetic
   1354778            0            171                      80                34            23 43.509726          1.213191   21         0
   1147438            8             92                      93                47            36 21.240576          0.158365   23         0
   1640031            7            115                      47                52            35 41.511523          0.079019   23         0
   1883350            9            103                      78                25           304 29.582192          1.282870   43         1
   1424119            1             85                      59                27            35 42.604536          0.549542   22         0
   1619297            0             82                      92                 9           253 19.724160          0.103424 

In [16]:
#==============================================
# Variable preparation
#==============================================

# Columns used as predictors (features); PatientID is not among them
columnas_predictoras = [
    'Pregnancies', 'PlasmaGlucose', 'DiastolicBloodPressure',
    'TricepsThickness', 'SerumInsulin', 'BMI',
    'DiabetesPedigree', 'Age',
]

# X holds the predictor values and y the target label (the 'Diabetic'
# column: whether the patient is diabetic or not), both as NumPy arrays
X = diabetes[columnas_predictoras].values
y = diabetes['Diabetic'].values

# Split into training (70%) and test (30%) sets; the split is stratified on
# y and uses a fixed random_state so it is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

In [17]:
#==============================================
# Model definitions
#==============================================

# Single seed shared by every estimator below so runs are repeatable
semilla = 42

# Baseline model: logistic regression
log_reg = LogisticRegression(
    solver="liblinear",
    C=1.0,
    random_state=semilla,
)

# Bagging: an ensemble of 50 classifiers trained on bootstrap subsets. The
# library's default base learner is a decision tree, but here the logistic
# regression defined above is passed as the estimator instead.
bagging = BaggingClassifier(
    estimator=log_reg,
    n_estimators=50,
    random_state=semilla,
)

# Boosting (AdaBoost): trains classifiers sequentially, giving more weight
# to the samples the previous ones got wrong. No estimator is passed, so
# the library default is used.
adaboost = AdaBoostClassifier(
    n_estimators=50,
    learning_rate=1.0,
    random_state=semilla,
)

# A more advanced form of boosting: Gradient Boosting
gboost = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=semilla,
)

In [19]:
#==============================================
# Model training and evaluation
#==============================================
print("Entrenando modelos...")

# Display name -> estimator. The name is used as the heading of each printed
# report and as the key under which the metrics are stored in `resultados`.
modelos = {
    "Logistic Regression": log_reg,
    "Bagging (LogReg)": bagging,
    "AdaBoost": adaboost,
    "GradientBoosting": gboost
}

# Display name -> {"Accuracy": float, "AUC": float}; filled in by the loop
resultados = {}

for nombre, modelo in modelos.items():
    # Fit on the training split, then predict on the held-out test split
    modelo.fit(X_train, y_train)
    y_pred = modelo.predict(X_test)
    # Column 1 of predict_proba is the score used for the AUC (the second
    # class; with 0/1 labels that is class 1). Estimators without
    # predict_proba get no AUC.
    y_proba = modelo.predict_proba(X_test)[:, 1] if hasattr(modelo, "predict_proba") else None

    acc = accuracy_score(y_test, y_pred)
    # A missing AUC is recorded as NaN rather than a string so that the
    # "AUC" column of the summary table stays numeric
    auc = roc_auc_score(y_test, y_proba) if y_proba is not None else np.nan

    resultados[nombre] = {"Accuracy": acc, "AUC": auc}

    print(f">>> {nombre}")
    print("Accuracy:", round(acc, 4))
    print("AUC:", round(auc, 4) if y_proba is not None else "No disponible")
    # The matrix and the report are multi-line; printing them on their own
    # lines keeps the first row / header aligned with the rest
    print("Matriz de confusión:")
    print(confusion_matrix(y_test, y_pred))
    print("Reporte de clasificación:")
    print(classification_report(y_test, y_pred))


Entrenando modelos...
>>> Logistic Regression
Accuracy: 0.7933
AUC: 0.8616
Matriz de confusión: [[1765  232]
 [ 388  615]]
Reporte de clasificación:               precision    recall  f1-score   support

           0       0.82      0.88      0.85      1997
           1       0.73      0.61      0.66      1003

    accuracy                           0.79      3000
   macro avg       0.77      0.75      0.76      3000
weighted avg       0.79      0.79      0.79      3000

>>> Bagging (LogReg)
Accuracy: 0.7937
AUC: 0.8616
Matriz de confusión: [[1769  228]
 [ 391  612]]
Reporte de clasificación:               precision    recall  f1-score   support

           0       0.82      0.89      0.85      1997
           1       0.73      0.61      0.66      1003

    accuracy                           0.79      3000
   macro avg       0.77      0.75      0.76      3000
weighted avg       0.79      0.79      0.79      3000

>>> AdaBoost
Accuracy: 0.9417
AUC: 0.9849
Matriz de confusión: [[1913   8

In [22]:
#==============================================
# Final comparison of results
#==============================================
print("Resumen comparativo de metricas")
# Building the frame from the nested dict puts models in the columns;
# transposing gives one row per model and one column per metric
metricas_por_modelo = pd.DataFrame(resultados)
df_resultados = metricas_por_modelo.transpose()
print(df_resultados)

Resumen comparativo de metricas
                     Accuracy       AUC
Logistic Regression  0.793333  0.861569
Bagging (LogReg)     0.793667  0.861608
AdaBoost             0.941667  0.984875
GradientBoosting     0.954667  0.990001
