In [1]:
# Cargar librerías
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score
import mlflow
import mlflow.sklearn


In [2]:
# Leer datos
df = pd.read_csv("data/diabetes.csv")



In [3]:
# identifica las columnas que tiene valores 0
(df==0).sum()

Pregnancies                 111
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                     500
dtype: int64

In [4]:
# Reemplazar ceros en columnas específicas por NaN
cols_to_clean = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin']
# No se consideran algunas, por ejemplo Pregnancies (por qué si es posible que se tengan cero embarazos)

# Reemplazar ceros por NaN para poder tratarlos como datos faltantes
for col in cols_to_clean:
    df[col] = df[col].replace(0, np.nan)

# Reemplazar NaN con la moda (valor más frecuente) de cada columna
for col in cols_to_clean:
    moda = df[col].mode() # buscar el método más común para determinar la moda de una columna
    df[col] = df[col].fillna(moda)


In [24]:
# Separar datos
X = df.drop("Outcome", axis=1)
y = df["Outcome"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [6]:
# agregar el traking uri de mlflow
mlflow.set_tracking_uri("http://127.0.0.1:9090")
# agregar set_experiment con nombre: ClasificadorDemoDiabetes
mlflow.set_experiment(experiment_name="ClasificadorDemoDiabetes")

2025/05/17 01:47:27 INFO mlflow.tracking.fluent: Experiment with name 'ClasificadorDemoDiabetes' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/4', creation_time=1747464447400, experiment_id='4', last_update_time=1747464447400, lifecycle_stage='active', name='ClasificadorDemoDiabetes', tags={}>

In [11]:
log_var = pd.read_csv("data/logreg_variaciones_educativas.csv", header=0)

In [12]:
log_var

Unnamed: 0,run_id,logreg_C,logreg_max_iter,solver,penalty
0,practica_01,0.01,400,liblinear,l2
1,practica_02,1.12,300,lbfgs,l2
2,practica_03,2.23,500,liblinear,l2
3,practica_04,3.34,200,liblinear,l2
4,practica_05,4.45,300,liblinear,l2
5,practica_06,5.56,500,lbfgs,l2
6,practica_07,6.67,300,lbfgs,l2
7,practica_08,7.78,500,lbfgs,l2
8,practica_09,8.89,400,lbfgs,l2
9,practica_10,10.0,200,lbfgs,l2


In [27]:
#quitar valores nulos, debido a que dio un error
df.dropna(inplace=True)

In [22]:
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,35.0,105.0,33.6,0.627,50,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1
6,3,78.0,50.0,32.0,88.0,31.0,0.248,26,1
8,2,197.0,70.0,45.0,543.0,30.5,0.158,53,1
...,...,...,...,...,...,...,...,...,...
753,0,181.0,88.0,44.0,510.0,43.3,0.222,26,1
755,1,128.0,88.0,39.0,110.0,36.5,1.057,37,1
760,2,88.0,58.0,26.0,16.0,28.4,0.766,22,0
763,10,101.0,76.0,48.0,180.0,32.9,0.171,63,0


In [26]:
# Entrenamiento y registro con MLflow
#C = 1.0
#max_iter = 1000
# usar los parámetros del archivo logreg_variaciones_educativas.csv
# para los parámtros C, max_iter, solver y penalty de LogisticRegresion
# se debe genear un run por cada fila del archivo, usando sus parámetros
# Analizar que usar
c1 = log_var['logreg_C'].tolist()
m = log_var['logreg_max_iter'].tolist()
s = log_var['solver'].tolist()
p = log_var['penalty'].tolist()
for i in range(len(c1)):
# agregar la línea del start_run
    with mlflow.start_run():
        # El pipeline usar StandarScaler para que todos los valore numéricos estén en la misma escala
        # no cambiar dicha línea
        # LogisticRegresion deben variar sus parámetros en cada iteración.
        pipeline = Pipeline([
                ("scaler", StandardScaler()),
                ("clf", LogisticRegression(
                    C=c1[i],
                    max_iter=m[i],
                    solver=s[i],
                    penalty=p[i]
                ))
            ])
        
        # Entrenar y evaluar
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)
    
        acc = accuracy_score(y_test, y_pred)
        prec = precision_score(y_test, y_pred)
    
        # recuerde que log_param y log_metric debe ir guardando los valores
        # dependiendo del run correspondiente de cada fila
        mlflow.log_param("logreg_C", c1)
        mlflow.log_param("logreg_max_iter", m)
        mlflow.log_param("solver", s)
        mlflow.log_param("penalty", p)
        
        mlflow.log_metric("accuracy", acc)
        mlflow.log_metric("precision", prec)
        
        # Guardar el pipeline completo
        mlflow.sklearn.log_model(pipeline, "modelo_pipeline")
        
        print(" Modelo registrado en MLflow")
        print(f"Accuracy: {acc:.4f} | Precision: {prec:.4f}")




 Modelo registrado en MLflow
Accuracy: 0.7975 | Precision: 0.6786
🏃 View run bouncy-worm-470 at: http://127.0.0.1:9090/#/experiments/4/runs/4d8b782fa4404d67a178891fc9bb1141
🧪 View experiment at: http://127.0.0.1:9090/#/experiments/4




 Modelo registrado en MLflow
Accuracy: 0.7848 | Precision: 0.6957
🏃 View run able-midge-829 at: http://127.0.0.1:9090/#/experiments/4/runs/09423e4ba4444f5db06d2bfa38c25367
🧪 View experiment at: http://127.0.0.1:9090/#/experiments/4




 Modelo registrado en MLflow
Accuracy: 0.7848 | Precision: 0.6957
🏃 View run clean-shrimp-110 at: http://127.0.0.1:9090/#/experiments/4/runs/593db0ca42f14ff78e9f84e0467dc77a
🧪 View experiment at: http://127.0.0.1:9090/#/experiments/4




 Modelo registrado en MLflow
Accuracy: 0.7848 | Precision: 0.6957
🏃 View run grandiose-hog-593 at: http://127.0.0.1:9090/#/experiments/4/runs/cf152fc1ad9d4efc914ad6b8d4132c2c
🧪 View experiment at: http://127.0.0.1:9090/#/experiments/4




 Modelo registrado en MLflow
Accuracy: 0.7848 | Precision: 0.6957
🏃 View run omniscient-eel-69 at: http://127.0.0.1:9090/#/experiments/4/runs/9545e986cfb34630801760746280edce
🧪 View experiment at: http://127.0.0.1:9090/#/experiments/4




 Modelo registrado en MLflow
Accuracy: 0.7848 | Precision: 0.6957
🏃 View run able-fox-752 at: http://127.0.0.1:9090/#/experiments/4/runs/9a969b7ca8da4a279fc010f5c9cd786b
🧪 View experiment at: http://127.0.0.1:9090/#/experiments/4




 Modelo registrado en MLflow
Accuracy: 0.7848 | Precision: 0.6957
🏃 View run persistent-shrew-406 at: http://127.0.0.1:9090/#/experiments/4/runs/6495bbeb04c6425a9828c3364c05fe35
🧪 View experiment at: http://127.0.0.1:9090/#/experiments/4




 Modelo registrado en MLflow
Accuracy: 0.7848 | Precision: 0.6957
🏃 View run brawny-croc-459 at: http://127.0.0.1:9090/#/experiments/4/runs/9d277d850f7e4cae964f8df3a58c5e8d
🧪 View experiment at: http://127.0.0.1:9090/#/experiments/4




 Modelo registrado en MLflow
Accuracy: 0.7848 | Precision: 0.6957
🏃 View run orderly-sheep-581 at: http://127.0.0.1:9090/#/experiments/4/runs/170bccbda9d34931aed8237a3759f8fc
🧪 View experiment at: http://127.0.0.1:9090/#/experiments/4




 Modelo registrado en MLflow
Accuracy: 0.7848 | Precision: 0.6957
🏃 View run resilient-seal-73 at: http://127.0.0.1:9090/#/experiments/4/runs/064d2b484e0c493098a859c1ee9f9fad
🧪 View experiment at: http://127.0.0.1:9090/#/experiments/4
