In [5]:
import os, glob
from pathlib import Path

# Locate the project root by searching for pyproject.toml from the current
# directory upwards. Unlike a bare os.chdir(".."), this is idempotent: running
# the cell a second time stays in the same directory instead of climbing again.
_start = Path.cwd()
root = next((p for p in (_start, *_start.parents) if (p / "pyproject.toml").exists()), None)
if root is None:
    raise FileNotFoundError(f"pyproject.toml not found in {_start} or any parent directory")
os.chdir(root)

# Quick sanity checks that we really are at the project root.
print("CWD:", os.getcwd())
print("pyproject.toml aquí?:", os.path.exists("pyproject.toml"))
print("conf/base:", glob.glob("conf/base/*.*")[:5])

from kedro.framework.startup import bootstrap_project
from kedro.framework.session import KedroSession

bootstrap_project(".")
# Pass the path by keyword: on Kedro 0.18 the first positional parameter of
# KedroSession.create is package_name, not project_path.
session = KedroSession.create(project_path=".")
context = session.load_context()
catalog = context.catalog



CWD: c:\Users\Roxana\proyecto-ml
pyproject.toml aquí?: True
conf/base: ['conf/base\\catalog.yml', 'conf/base\\parameters.yml', 'conf/base\\parameters_data_processing.yml', 'conf/base\\parameters_data_science.yml', 'conf/base\\parameters_reporting.yml']


In [6]:
from kedro.framework.startup import bootstrap_project
from kedro.framework.session import KedroSession
import pandas as pd, numpy as np, os
import matplotlib.pyplot as plt
import seaborn as sns

# Re-create the Kedro session for the current working directory and expose
# its data catalog to the cells below.
bootstrap_project(".")
# Pass the path by keyword: on Kedro 0.18 the first positional parameter of
# KedroSession.create is package_name, not project_path.
session = KedroSession.create(project_path=".")
context = session.load_context()
catalog = context.catalog

# Output folder for the exported report files; created on first run.
REPORT_DIR = "data/08_reporting"
os.makedirs(REPORT_DIR, exist_ok=True)

def load_best(prefix):
    """Return ``(dataset_name, data)`` for the most processed dataset available.

    Candidates are tried in the order ``<prefix>_features``, ``<prefix>_clean``,
    ``<prefix>_raw``; the first one present in the module-level ``catalog`` is
    loaded and returned together with its name.

    Raises:
        ValueError: if none of the three candidate names is in the catalog.
    """
    stages = ("features", "clean", "raw")
    candidates = (f"{prefix}_{stage}" for stage in stages)
    # Lazy search: membership is only checked until the first hit.
    chosen = next((entry for entry in candidates if entry in catalog), None)
    if chosen is None:
        raise ValueError(f"No encuentro datasets para prefix={prefix}")
    return chosen, catalog.load(chosen)


In [7]:
from pathlib import Path

def data_dictionary(df: pd.DataFrame) -> pd.DataFrame:
    """Build a one-row-per-column description of ``df``.

    Args:
        df: frame to describe.

    Returns:
        A DataFrame with the columns ``columna`` (column label), ``dtype``
        (dtype as a string), ``n_nulos`` (count of missing values), ``n_uniq``
        (count of distinct non-missing values) and ``ejemplo`` (first
        non-missing value, or ``None`` when the column is entirely missing).
    """
    dtypes, null_counts, unique_counts, examples = [], [], [], []
    # Iterate by position with items() rather than df[label]: with duplicated
    # column labels df[label] returns a DataFrame and the per-column calls fail.
    for _, column in df.items():
        present = column.dropna()  # computed once, reused for the example
        dtypes.append(str(column.dtype))
        null_counts.append(int(column.isna().sum()))
        unique_counts.append(int(column.nunique(dropna=True)))
        examples.append(present.iloc[0] if len(present) else None)
    return pd.DataFrame({
        "columna": df.columns,
        "dtype": dtypes,
        "n_nulos": null_counts,
        "n_uniq": unique_counts,
        "ejemplo": examples,
    })

def eda_summary(df: pd.DataFrame) -> pd.DataFrame:
    """Summarise ``df`` as a single-row DataFrame of basic size/quality counts.

    The row holds the number of rows (``n_filas``), the number of columns
    (``n_cols``), the number of fully duplicated rows (``n_duplicados``) and
    the total count of missing cells (``nulos_totales``).
    """
    row_count = len(df)
    col_count = df.shape[1]
    duplicate_rows = int(df.duplicated().sum())
    missing_cells = int(df.isna().sum().sum())
    return pd.DataFrame({
        "n_filas": [row_count],
        "n_cols": [col_count],
        "n_duplicados": [duplicate_rows],
        "nulos_totales": [missing_cells],
    })

# Export the data dictionary and the EDA summary of each dataset family as CSV
# files inside REPORT_DIR.
for prefix in ("diabetes", "cardio", "cardiabetes"):
    name, df = load_best(prefix)
    dd, eda = data_dictionary(df), eda_summary(df)
    for table, suffix in ((dd, "data_dictionary"), (eda, "eda_summary")):
        table.to_csv(Path(REPORT_DIR) / f"{prefix}_{suffix}.csv", index=False, encoding="utf-8")
    print(f"✓ CSV exportados para {prefix}: data_dictionary / eda_summary (desde {name})")


✓ CSV exportados para diabetes: data_dictionary / eda_summary (desde diabetes_features)


✓ CSV exportados para cardio: data_dictionary / eda_summary (desde cardio_features)


✓ CSV exportados para cardiabetes: data_dictionary / eda_summary (desde cardiabetes_features)


In [8]:
def save_histograms(df: pd.DataFrame, prefix: str, max_cols=12):
    """Save a grid of histograms for the first numeric columns of ``df``.

    Args:
        df: source frame.
        prefix: label used in the figure title and in the output file name.
        max_cols: upper bound on how many numeric columns are plotted.

    The figure is written to ``<REPORT_DIR>/<prefix>_hists.png`` with three
    panels per row. When the numeric selection is empty, a message is printed
    and nothing is saved.
    """
    numeric = df.select_dtypes(include=np.number).iloc[:, :max_cols]
    if numeric.empty:
        print(f"[{prefix}] no hay numéricas para histogramas.")
        return

    n_plots = numeric.shape[1]
    n_rows = int(np.ceil(n_plots / 3))
    fig, grid = plt.subplots(n_rows, 3, figsize=(12, 3 * n_rows))
    panels = np.atleast_1d(grid).ravel()

    for panel, col in zip(panels, numeric.columns):
        panel.hist(numeric[col].dropna(), bins=20)
        panel.set_title(col)
    # Hide the panels left over when the column count is not a multiple of 3.
    for spare in panels[n_plots:]:
        spare.axis("off")

    fig.suptitle(f"{prefix} — histogramas")
    fig.tight_layout()
    target = os.path.join(REPORT_DIR, f"{prefix}_hists.png")
    fig.savefig(target, dpi=160)
    plt.close(fig)

def save_corr(df: pd.DataFrame, prefix: str, max_cols=30):
    """Save a correlation heatmap for the first numeric columns of ``df``.

    Args:
        df: source frame.
        prefix: label used in the figure title and in the output file name.
        max_cols: upper bound on how many numeric columns enter the matrix.

    The figure is written to ``<REPORT_DIR>/<prefix>_corr.png``. With fewer
    than two numeric columns, a message is printed and nothing is saved.
    """
    numeric = df.select_dtypes(include=np.number).iloc[:, :max_cols]
    n_numeric = numeric.shape[1]
    if n_numeric < 2:
        print(f"[{prefix}] insuficiente para correlación.")
        return

    corr = numeric.corr(numeric_only=True)
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(corr, cmap="coolwarm", center=0, ax=ax)
    ax.set_title(f"{prefix} — correlación (primeras {n_numeric} numéricas)")
    fig.tight_layout()
    fig.savefig(os.path.join(REPORT_DIR, f"{prefix}_corr.png"), dpi=160)
    plt.close(fig)

# Export the histogram grid and the correlation heatmap of each dataset family.
for prefix in ("diabetes", "cardio", "cardiabetes"):
    _, df = load_best(prefix)
    for exporter in (save_histograms, save_corr):
        exporter(df, prefix)
    print(f"✓ Gráficos exportados para {prefix}: hists & corr")


✓ Gráficos exportados para diabetes: hists & corr


✓ Gráficos exportados para cardio: hists & corr


✓ Gráficos exportados para cardiabetes: hists & corr


In [9]:
def show_target_info(prefix: str, candidates=("Diabetes","Outcome","Diabetes_proxy","Riesgo_Alto","cardio_proxy")):
    """Print the class balance of the first candidate target column found.

    Args:
        prefix: dataset family passed to ``load_best``.
        candidates: column names to look for, in priority order.

    Prints one line with the dataset name, the chosen column, its value counts
    (missing values included) and its number of missing values. If none of the
    candidates is a column of the frame, a warning line is printed instead.

    NOTE(review): the first match in ``candidates`` wins. A frame holding both
    ``Diabetes`` and ``Riesgo_Alto`` is therefore reported with ``Diabetes``;
    confirm that this is the intended target for every dataset.
    """
    name, df = load_best(prefix)
    found = next((col for col in candidates if col in df.columns), None)
    if found is None:
        print(f"{prefix}: no encontré ninguna columna target esperada. Revisa preparación.")
        return
    target = df[found]
    vc = target.value_counts(dropna=False)
    print(f"{prefix}: dataset={name}  target='{found}'  clases={vc.to_dict()}  nulos_target={int(target.isna().sum())}")

# Report the detected target column and its class balance for each dataset family.
for p in ("diabetes", "cardio", "cardiabetes"):
    show_target_info(prefix=p)


diabetes: dataset=diabetes_features  target='Diabetes_proxy'  clases={1: 4497, 0: 3503}  nulos_target=0


cardio: dataset=cardio_features  target='Diabetes'  clases={0: 5639, 1: 2361}  nulos_target=0


cardiabetes: dataset=cardiabetes_features  target='Diabetes'  clases={0: 8127, 1: 7822}  nulos_target=0


In [10]:
from IPython.display import display
# Show the selected dataset name, its shape and its first three rows per family.
for p in ("diabetes", "cardio", "cardiabetes"):
    name, df = load_best(p)
    header = f"\n== {p.upper()} == ({name}) shape={df.shape}"
    print(header)
    display(df.head(3))



== DIABETES == (diabetes_features) shape=(8000, 16)


Unnamed: 0,Edad,Cuartil_Edad,Sexo,Tipo_Diabetes,IMC,HbA1c,Insulina,Colesterol_Total,LDL,HDL,Trigliceridos,PAS,PAD,Registro,age_bin,Diabetes_proxy
0,43,Q2,Mujer,Tipo 2,36.7,8.52,194.12,222.24,129.79,47.16,167.35,149.8,94.3,2017,31-45,1
1,67,Q4,Mujer,Tipo 1,30.0,6.94,0.0,217.42,121.33,60.05,122.24,152.7,85.0,2013,60+,1
2,55,Q3,Mujer,Tipo 2,34.7,6.2,100.56,219.9,132.84,54.86,136.3,155.3,93.7,2015,46-60,0



== CARDIO == (cardio_features) shape=(8000, 18)


Unnamed: 0,Ao_Registro,Edad,Sexo,Comunidad_Autonoma,Riesgo_Cardiovascular,Actividad_Fisica,Tramo_Edad,Ingresos_Anuales,Nivel_Estres,Consumo_Alcohol,IMC,Diabetes,Colesterol_Total,Trigliceridos,PAS,PAD,age_bin,Riesgo_Alto
0,2016,53,Hombre,Cataluña,Bajo,Moderado,45-54,24665.24,Alto,No consume,30.464413,0,203.053597,228.38168,146.4,89.7,46-60,0
1,2013,66,Mujer,Cataluña,Bajo,Sedentario,65+,21693.89,Moderado,No consume,21.307639,0,214.19825,246.749168,151.8,87.2,60+,0
2,2017,68,Mujer,País Vasco,Bajo,Moderado,65+,15841.83,Moderado,No consume,32.672495,0,178.338545,214.823232,157.4,88.1,60+,0



== CARDIABETES == (cardiabetes_features) shape=(15949, 19)


Unnamed: 0,Ao_Registro,Edad,IMC,Diabetes,Colesterol_Total,Trigliceridos,PAS,PAD,Cohorte,HbA1c,Insulina,LDL,HDL,Nivel_Estres,Actividad_Fisica,Riesgo_Cardiovascular,Sexo,Tipo_Diabetes,age_bin
0,2016,53.0,30.464413,0,203.053597,228.38168,146.4,89.7,cardio,5.483777,10.602855,111.284292,71.456754,Alto,Moderado,Bajo,Hombre,No diabetes,46-60
1,2013,66.0,21.307639,0,214.19825,246.749168,151.8,87.2,cardio,5.212008,56.952216,124.064267,48.292897,Moderado,Sedentario,Bajo,Mujer,No diabetes,60+
2,2017,68.0,32.672495,0,178.338545,214.823232,157.4,88.1,cardio,6.750564,92.875636,123.757918,55.65998,Moderado,Moderado,Bajo,Mujer,Diabetes latente,60+


In [1]:
# --- Bootstrap Kedro from the notebook ---
from pathlib import Path
import os, sys

# Walk up from the current directory to the first folder holding pyproject.toml.
cwd = Path.cwd()
root = next((p for p in (cwd, *cwd.parents) if (p / "pyproject.toml").exists()), None)
if root is None:
    # Fail here with a clear message instead of silently changing into the
    # filesystem root and letting Kedro fail later.
    raise FileNotFoundError(f"pyproject.toml not found in {cwd} or any parent directory")
os.chdir(root)

# Make the project's src/ folder importable without stacking duplicate
# sys.path entries every time the cell is re-run.
src_dir = str(root / "src")
if src_dir not in sys.path:
    sys.path.append(src_dir)

from kedro.framework.startup import bootstrap_project
from kedro.framework.session import KedroSession

bootstrap_project(str(root))
# Pass the path by keyword: on Kedro 0.18 the first positional parameter of
# KedroSession.create is package_name, not project_path.
session = KedroSession.create(project_path=str(root))
context = session.load_context()

catalog = context.catalog
params = context.params

# Single source of truth for the dataset families used below.
DATASET_NAMES = ("diabetes", "cardio", "cardiabetes")

# Feature tables loaded from the catalog, keyed by dataset family.
datasets = {name: catalog.load(f"{name}_features") for name in DATASET_NAMES}

# Target column configured for each family under params["targets"].
targets = {name: params["targets"][name] for name in DATASET_NAMES}

# Cell output: shape and configured target per dataset.
{k: (v.shape, targets[k]) for k, v in datasets.items()}


Kedro OK — pipelines: ['__default__', 'dataprep', 'data_science']



[1m{[0m
    [32m'diabetes'[0m: [1m([0m[1m([0m[1;36m8000[0m, [1;36m15[0m[1m)[0m, [32m'Tipo_Diabetes'[0m[1m)[0m,
    [32m'cardio'[0m: [1m([0m[1m([0m[1;36m8000[0m, [1;36m17[0m[1m)[0m, [32m'Riesgo_Cardiovascular'[0m[1m)[0m,
    [32m'cardiabetes'[0m: [1m([0m[1m([0m[1;36m15949[0m, [1;36m19[0m[1m)[0m, [32m'Diabetes'[0m[1m)[0m
[1m}[0m