# Proyecto Resuelto (Nivel Básico) – IFCD104
**Calidad y visualización de datos clínicos**

Este cuaderno genera un dataset clínico simulado, realiza perfilado y limpieza ligera,
crea visualizaciones y guarda entregables (CSV y gráficos).

In [None]:

import numpy as np
import pandas as pd
import os
from datetime import datetime, timedelta
import matplotlib.pyplot as plt

# Mostrar gráficos inline si se ejecuta en Jupyter
%matplotlib inline

# Output folder for the deliverables (CSV files and figures). The path is
# built with os.path.join so it is portable: a hard-coded backslash is a path
# separator only on Windows and becomes part of the folder name elsewhere.
out_dir = os.path.join("+P_BASICO", "entregables_basico")
os.makedirs(out_dir, exist_ok=True)

# Seeded generator so every run reproduces the same simulated cohort.
rng = np.random.default_rng(42)
N = 200

# NOTE: the random draws below must stay in this exact order; reordering them
# would change the values taken from the seeded stream.
patient_ids = 1000 + np.arange(N)
sex = rng.choice(["F", "M"], size=N, replace=True, p=[0.55, 0.45])
age = rng.normal(55, 18, size=N).round().astype(int).clip(-5, 105)
sbp = rng.normal(loc=125, scale=18, size=N)
dbp = rng.normal(loc=78, scale=12, size=N)
hr = rng.normal(loc=78, scale=15, size=N)
temp = rng.normal(loc=36.7, scale=0.6, size=N)
spo2 = rng.normal(loc=96, scale=2.5, size=N)
glucose = rng.normal(loc=105, scale=25, size=N)

# Admission dates: a random offset of 0-119 days after the start date.
start_date = datetime(2025, 1, 1)
day_offsets = rng.integers(0, 120, size=N)
dates = [start_date + timedelta(days=int(offset)) for offset in day_offsets]

# Assemble the table, rounding each measurement to its reporting precision.
columns = {
    "patient_id": patient_ids,
    "sex": sex,
    "age": age,
    "sbp_mmHg": np.round(sbp, 1),
    "dbp_mmHg": np.round(dbp, 1),
    "heart_rate_bpm": np.round(hr, 0).astype(int),
    "temperature_c": np.round(temp, 1),
    "spo2_pct": np.round(spo2, 1),
    "glucose_mgdl": np.round(glucose, 0).astype(int),
    "admission_date": dates,
}
df = pd.DataFrame(columns)

# Degrade the simulated data on purpose: missing values, outliers, duplicates.
# NOTE: every rng call keeps its original position so the seeded stream
# selects the same rows as before.
n_rows = len(df)
mask_missing = rng.choice([True, False], size=n_rows, p=[0.08, 0.92])
df.loc[mask_missing, "spo2_pct"] = np.nan
mask_missing2 = rng.choice([True, False], size=n_rows, p=[0.06, 0.94])
df.loc[mask_missing2, "temperature_c"] = np.nan

# Two randomly chosen rows per column receive an implausible value.
for outlier_col, outlier_value in (
    ("temperature_c", 42.0),
    ("sbp_mmHg", 300.0),
    ("spo2_pct", 60.0),
    ("age", -3),
):
    picked_rows = rng.choice(df.index, 2, replace=False)
    df.loc[picked_rows, outlier_col] = outlier_value

# Append five sampled rows again so the raw table contains exact duplicates.
dups = df.sample(n=5, random_state=1)
df_raw = pd.concat([df, dups], ignore_index=True)

# Save the raw dataset directly inside out_dir. The file name must not repeat
# the "+P_BASICO" folder prefix: out_dir already contains it, and on Windows
# the repeated prefix points to a sub-folder that is never created, so the
# CSV cannot be written.
raw_path = os.path.join(out_dir, "clinical_data_raw.csv")
df_raw.to_csv(raw_path, index=False)
print("Guardado:", raw_path)


In [None]:

def profile(df_):
    """Summarise the size and data-quality issues of a DataFrame.

    Args:
        df_: DataFrame to inspect.

    Returns:
        A ``(prof, stats)`` tuple. ``prof`` is a dict with the keys ``rows``,
        ``cols``, ``missing_total``, ``missing_by_col`` and ``duplicates``.
        ``stats`` is the transposed ``describe()`` table of the numeric
        columns (one row per column); it is empty when there are none.
    """
    missing_by_col = df_.isna().sum()
    prof = {
        "rows": len(df_),
        "cols": df_.shape[1],
        "missing_total": int(missing_by_col.sum()),
        "missing_by_col": missing_by_col.to_dict(),
        "duplicates": int(df_.duplicated().sum()),
    }
    num = df_.select_dtypes(include=[np.number])
    if num.shape[1] == 0:
        # describe() raises ValueError on a frame without columns, so return
        # an empty table that still carries the usual statistic names.
        stats = pd.DataFrame(
            columns=["count", "mean", "std", "min", "25%", "50%", "75%", "max"]
        )
    else:
        stats = num.describe().T
    return prof, stats

# Profile the raw dataset before any cleaning is applied.
profile_raw, stats_raw = profile(df_=df_raw)
# Trailing expression: in a notebook it displays the profile dict together
# with the first five rows of the statistics table.
(profile_raw, stats_raw.head(5))


In [None]:

# Accepted (lower, upper) bounds per numeric column; values outside them are
# clipped to the nearest bound further below.
ranges = {
    "age": (0, 120),
    "sbp_mmHg": (70, 220),
    "dbp_mmHg": (40, 130),
    "heart_rate_bpm": (30, 200),
    "temperature_c": (34.0, 41.0),
    "spo2_pct": (80.0, 100.0),
    "glucose_mgdl": (50, 500)
}

# Work on a copy so the raw table stays available for comparison.
df_clean = df_raw.copy()

# Median imputation of the missing measurements
for col in ["temperature_c", "spo2_pct"]:
    med = df_clean[col].median()
    df_clean[col] = df_clean[col].fillna(med)

# Age correction: negative (or missing) ages are replaced by the median of
# the valid ages. Series.where builds a new column instead of writing NaN
# into the integer column in place (an upcasting assignment that recent
# pandas versions deprecate), and the final cast keeps age an integer column
# as in the raw data; a fractional median is rounded.
valid_age = df_clean["age"] >= 0
if valid_age.any():
    age_median = df_clean.loc[valid_age, "age"].median()
    df_clean["age"] = (
        df_clean["age"].where(valid_age, age_median).round().astype(int)
    )

# Clip every known column to its plausible range
for col, (lo, hi) in ranges.items():
    if col in df_clean.columns:
        df_clean[col] = df_clean[col].clip(lo, hi)

# Drop exact duplicate rows and renumber the index
df_clean = df_clean.drop_duplicates().reset_index(drop=True)

# Save the cleaned dataset directly inside out_dir. The file name must not
# repeat the "+P_BASICO" folder prefix: out_dir already contains it, and on
# Windows the repeated prefix points to a sub-folder that is never created,
# so the CSV cannot be written.
clean_path = os.path.join(out_dir, "clinical_data_clean.csv")
df_clean.to_csv(clean_path, index=False)
print("Guardado:", clean_path)

# Profile the cleaned dataset with the same helper used for the raw one.
profile_clean, stats_clean = profile(df_=df_clean)
# Trailing expression: in a notebook it displays the profile dict together
# with the first five rows of the statistics table.
(profile_clean, stats_clean.head(5))


In [None]:

# Folder that receives the saved figures.
fig_dir = os.path.join(out_dir, "figures")
os.makedirs(fig_dir, exist_ok=True)

def hist_plot(series, title, fname):
    """Draw, save and show a 20-bin histogram of a Series.

    Args:
        series: pandas Series to plot; missing values are dropped first and
            its ``name`` labels the x axis.
        title: Title of the figure.
        fname: File name of the image written inside ``fig_dir``.
    """
    fig, ax = plt.subplots()
    ax.hist(series.dropna(), bins=20)
    ax.set_title(title)
    ax.set_xlabel(series.name)
    ax.set_ylabel("Frecuencia")
    fig.tight_layout()
    fig.savefig(os.path.join(fig_dir, fname))
    plt.show()
    # Release the figure explicitly: on backends where plt.show() does not
    # close it, repeated calls would otherwise keep every figure in memory.
    plt.close(fig)

# For each selected column, save the raw histogram followed by the cleaned
# one so both distributions can be compared.
hist_columns = ("sbp_mmHg", "dbp_mmHg", "temperature_c", "spo2_pct", "heart_rate_bpm")
for col in hist_columns:
    raw_values, clean_values = df_raw[col], df_clean[col]
    hist_plot(raw_values, f"Distribución {col} (crudo)", f"{col}_raw.png")
    hist_plot(clean_values, f"Distribución {col} (limpio)", f"{col}_clean.png")

print("Figuras en:", fig_dir)


In [None]:

# Side-by-side comparison of the raw and cleaned profiles; the profile keys
# are listed in the same order as the metric labels.
profile_keys = ["rows", "cols", "missing_total", "duplicates"]
summary = pd.DataFrame({
    "métrica": ["filas", "columnas", "valores_perdidos", "duplicados"],
    "crudo": [profile_raw[key] for key in profile_keys],
    "limpio": [profile_clean[key] for key in profile_keys],
})
summary


## Conclusiones
- La limpieza ligera acota los valores extremos no plausibles a rangos clínicos razonables y elimina los registros duplicados.
- La imputación simple por la mediana permite un análisis descriptivo inicial; conviene evaluar su impacto antes de usar los datos en modelos.
- Mantener diccionario de datos y registrar transformaciones mejora la trazabilidad.