In [None]:
#==============================================
# Importación de Librerias
#==============================================
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

In [None]:
#==============================================
# Load dataset
#==============================================
print("Loading Data...")
# Read the diabetes dataset from a CSV file at a hard-coded path
csv_path = '/content/drive/MyDrive/Promo/diabetes.csv'
diabetes = pd.read_csv(csv_path)

print("\nTop 10 registros del dataset:")
# Render the first 10 rows as plain text, omitting the index column
first_rows = diabetes.head(10)
print(first_rows.to_string(index=False))

Loading Data...

Top 10 registros del dataset:
 PatientID  Pregnancies  PlasmaGlucose  DiastolicBloodPressure  TricepsThickness  SerumInsulin       BMI  DiabetesPedigree  Age  Diabetic
   1354778            0            171                      80                34            23 43.509726          1.213191   21         0
   1147438            8             92                      93                47            36 21.240576          0.158365   23         0
   1640031            7            115                      47                52            35 41.511523          0.079019   23         0
   1883350            9            103                      78                25           304 29.582192          1.282870   43         1
   1424119            1             85                      59                27            35 42.604536          0.549542   22         0
   1619297            0             82                      92                 9           253 19.724160          0.103424   

In [None]:
#==============================================
# Variable preparation
#==============================================

# X holds the predictor variables (features); y holds the target label
# (whether the patient is diabetic or not)
feature_cols = [
    'Pregnancies',
    'PlasmaGlucose',
    'DiastolicBloodPressure',
    'TricepsThickness',
    'SerumInsulin',
    'BMI',
    'DiabetesPedigree',
    'Age',
]
X = diabetes[feature_cols].values
y = diabetes['Diabetic'].values


In [None]:
#==============================================
# Train / test split
#==============================================

# Hold out 30% of the rows for testing and train on the remaining 70%,
# so we can evaluate how well the model generalizes.
# random_state is fixed so the split is the same on every run.
splits = train_test_split(X, y, test_size=0.30, random_state=0)
X_train, X_test, y_train, y_test = splits

In [None]:
#==============================================
# Model complexity evaluation
#==============================================

# Three regularization levels are compared, listed from strongest to weakest:
# - very high reg -> very simple model -> risk of underfitting
# - medium reg    -> balanced
# - very low reg  -> flexible model -> risk of overfitting
regs = [1000, 1, 0.01]

# Accumulators for the per-model accuracies (filled by the training loop)
train_accs = []
test_accs = []

In [None]:
#==============================================
# Model training and evaluation
#==============================================
# Start from empty lists so that re-running this cell does not pile up
# duplicate entries (which would no longer line up with `regs` when plotting).
train_accs, test_accs = [], []

for reg in regs:
  print(f"\nEntrenando modelo con reg={reg} (equivale a C={1/reg})")

  # scikit-learn's C is the INVERSE of the regularization strength, so the
  # regularization rate under test must be passed as C=1/reg. Without it every
  # iteration would train the same default (C=1.0) model.
  model = LogisticRegression(C=1/reg).fit(X_train, y_train)

  # Class predictions on the training and test sets
  y_train_pred = model.predict(X_train)
  y_test_pred = model.predict(X_test)

  # Probability of the positive class (column 1), required for ROC-AUC
  y_test_prob = model.predict_proba(X_test)[:,1]

  # Evaluation metrics
  train_acc = accuracy_score(y_train, y_train_pred)
  test_acc  = accuracy_score(y_test, y_test_pred)
  precision = precision_score(y_test, y_test_pred)
  recall    = recall_score(y_test, y_test_pred)
  f1        = f1_score(y_test, y_test_pred)
  auc       = roc_auc_score(y_test, y_test_prob)

  # Store the scalar accuracies for plotting (one value per entry in `regs`)
  train_accs.append(train_acc)
  test_accs.append(test_acc)

  # Report results
  print(f"Train Accuracy: {train_acc:.3f}")
  print(f"Test Accuracy : {test_acc:.3f}")
  print(f"Precision     : {precision:.3f}")
  print(f"Recall        : {recall:.3f}")
  print(f"F1-Score      : {f1:.3f}")
  print(f"ROC-AUC       : {auc:.3f}")






Entrenando modelo con reg=1000 (equivale a C=0.001)
Train Accuracy: 0.720
Test Accuracy : 0.711
Precision     : 0.588
Recall        : 0.471
F1-Score      : 0.523
ROC-AUC       : 0.745

Entrenando modelo con reg=1 (equivale a C=1.0)
Train Accuracy: 0.791
Test Accuracy : 0.773
Precision     : 0.697
Recall        : 0.574
F1-Score      : 0.630
ROC-AUC       : 0.848

Entrenando modelo con reg=0.01 (equivale a C=100.0)
Train Accuracy: 0.791
Test Accuracy : 0.774
Precision     : 0.699
Recall        : 0.577
F1-Score      : 0.632
ROC-AUC       : 0.848


In [None]:
#==============================================
# Results visualization
#==============================================

# Plot train vs. test accuracy for each regularization rate to make the effect visible
plt.figure(figsize=(8,5))
plt.plot(regs, train_accs, marker='o', label="Entrenamiento")
plt.plot(regs, test_accs, marker='o', label="Prueba")

# Use a logarithmic x axis: the regularization rates span several orders of
# magnitude, so a linear axis would squash the smaller values together.
plt.xscale('log')
plt.xlabel("Tasa de Regularización (reg)")
plt.ylabel("Accuracy")
plt.title("Underfitting vs overfitting en Logistic Regression")
plt.legend()
plt.show()