# EDA inicial - Riesgo de ACV

Este notebook resume un análisis exploratorio rápido del dataset.

Objetivos:
- revisar forma y columnas,
- verificar balance de la variable objetivo,
- identificar valores faltantes,
- guardar un resumen útil para documentación.


In [None]:
from pathlib import Path
import json

import pandas as pd

# Input CSV and output JSON report. Both paths are relative, so they resolve
# against the working directory the notebook is run from.
OUT_PATH = Path("../reports") / "eda_summary.json"
DATA_PATH = Path("../data/raw") / "healthcare-dataset-stroke-data.csv"

# Load the whole dataset into a DataFrame.
df = pd.read_csv(DATA_PATH)

# Quick structural overview: (rows, columns) followed by the column names.
shape = df.shape
print("Filas, columnas:", shape)
print("\nColumnas:", df.columns.tolist(), sep="\n")

# Last expression of the cell: the notebook renders the first rows.
df.head()


In [None]:
def _to_plain_dict(series, cast):
    """Return ``series`` as a dict with string keys and ``cast``-converted values."""
    return {str(label): cast(value) for label, value in series.items()}


# Missing values per column, most affected columns first.
missing = df.isna().sum().sort_values(ascending=False)

# Class counts of the target (NaN kept as its own class) and the share of all
# rows each class represents, in percent rounded to two decimals.
target_dist = df["stroke"].value_counts(dropna=False)
target_pct = (target_dist / len(df) * 100).round(2)

# Print each table under its heading.
for heading, table in (
    ("Distribucion de stroke:", target_dist),
    ("\nPorcentaje de stroke:", target_pct),
    ("\nValores faltantes por columna:", missing),
):
    print(heading)
    print(table)

# Cast everything to built-in Python types so json can serialize the values.
n_rows, n_cols = df.shape
summary = {
    "filas": int(n_rows),
    "columnas": int(n_cols),
    "distribucion_stroke": _to_plain_dict(target_dist, int),
    "porcentaje_stroke": _to_plain_dict(target_pct, float),
    "valores_faltantes": _to_plain_dict(missing, int),
}

# Make sure the report directory exists, then write the summary as JSON.
OUT_PATH.parent.mkdir(parents=True, exist_ok=True)
report_json = json.dumps(summary, indent=2)
OUT_PATH.write_text(report_json, encoding="utf-8")
print(f"\nResumen EDA guardado en: {OUT_PATH}")
