In [25]:
# Read train.csv (path relative to the current working directory) into `df`.
# NOTE(review): the last cell of this notebook writes `df` back to this same
# path, so the file read here may already be a modified copy of the raw data.
import pandas as pd
df = pd.read_csv('train.csv')

In [8]:
# -*- coding: utf-8 -*-
from pathlib import Path
from typing import List, Dict, Any, Optional, Tuple
import numpy as np
import pandas as pd

def _lazy_import_plotting():
    import seaborn as sns
    import matplotlib.pyplot as plt
    return sns, plt

def _num_summary(x: pd.Series, digits: int = 2) -> Dict[str, Any]:
    s = pd.to_numeric(x, errors="coerce")
    return {
        "count": int(s.count()),
        "missing": int(s.isna().sum()),
        "mean±std": f"{s.mean():.{digits}f} ± {s.std(ddof=1):.{digits}f}",
        "median[IQR]": f"{s.median():.{digits}f} [{s.quantile(0.25):.{digits}f}, {s.quantile(0.75):.{digits}f}]",
        "min–max": f"{s.min():.{digits}f} – {s.max():.{digits}f}",
    }

def _iqr_bounds(x: pd.Series, k: float = 1.5) -> Tuple[float, float]:
    s = pd.to_numeric(x, errors="coerce")
    q1, q3 = s.quantile(0.25), s.quantile(0.75)
    iqr = q3 - q1
    return q1 - k * iqr, q3 + k * iqr

def _best_grid(n: int, max_cols: int = 3) -> Tuple[int, int]:
    if n <= 0:
        return 1, 1
    cols = min(max_cols, max(1, int(np.ceil(np.sqrt(n)))))
    rows = int(np.ceil(n / cols))
    # 讓列數不小於欄數時視覺較穩定
    if rows < cols and n > 4:
        rows, cols = cols, rows
    return rows, cols

def _row_label(label: Any) -> Any:
    """Return an index label as ``int`` when it converts cleanly, else unchanged."""
    try:
        return int(label)
    except (TypeError, ValueError):
        return label


def _extreme_records(values: pd.Series, ids: Any, col: str, id_key: str, k: int) -> List[Dict[str, Any]]:
    """Build long-format records for the ``k`` smallest and ``k`` largest values of one column."""
    # Work by row position so duplicate or non-integer index labels cannot
    # turn a scalar lookup into a Series (or break the row identifier).
    by_position = values.reset_index(drop=True)
    records: List[Dict[str, Any]] = []
    for side, picked in (("low", by_position.nsmallest(k)), ("high", by_position.nlargest(k))):
        for rank, (pos, value) in enumerate(zip(picked.index, picked.to_numpy()), 1):
            records.append({
                "variable": col, "side": side, "rank": rank,
                id_key: ids[pos],
                "value": value,
            })
    return records


def _plot_distributions(df: pd.DataFrame, plot_items: List[Tuple[str, str]], out: Path,
                        digits: int, iqr_k: float, max_cols_in_grid: int, dpi: int) -> None:
    """Draw every numeric histogram and categorical count plot on one grid figure and save it."""
    sns, plt = _lazy_import_plotting()
    sns.set_theme(style="whitegrid")

    rows, cols = _best_grid(len(plot_items), max_cols=max_cols_in_grid)
    # squeeze=False always returns a 2-D array of axes, even for a 1x1 grid.
    fig, axes = plt.subplots(rows, cols, figsize=(cols * 5.5, rows * 4.5), squeeze=False)
    axes = axes.flatten()

    for ax, (kind, col) in zip(axes, plot_items):
        if kind == "num":
            s = pd.to_numeric(df[col], errors="coerce").dropna()
            lo, hi = _iqr_bounds(s, iqr_k)
            m, sd = s.mean(), s.std(ddof=1)
            sns.histplot(s, kde=True, bins=30, ax=ax)
            # Reference lines: mean (dashed), mean ± std (dotted), IQR fences (dash-dot).
            ax.axvline(m, linestyle="--", linewidth=1)
            ax.axvline(m + sd, linestyle=":", linewidth=1)
            ax.axvline(m - sd, linestyle=":", linewidth=1)
            ax.axvline(lo, linestyle="-.", linewidth=1)
            ax.axvline(hi, linestyle="-.", linewidth=1)
            ax.set_title(f"{col} | mean±std {m:.{digits}f} ± {sd:.{digits}f}\nIQR bounds [{lo:.{digits}f}, {hi:.{digits}f}]")
        else:  # kind == "cat"
            sns.countplot(x=df[col], ax=ax)
            ax.set_title(f"Counts of {col}")
        ax.set_xlabel(col)
        ax.set_ylabel("Count")

    # Remove the unused axes when the grid has more cells than plots.
    for spare in axes[len(plot_items):]:
        fig.delaxes(spare)

    fig.tight_layout()
    fig.savefig(out / "all_distributions.png", dpi=dpi)
    plt.close(fig)


def _plot_correlation(df: pd.DataFrame, use_cols: List[str], out: Path, dpi: int) -> None:
    """Save a standalone heatmap of the pairwise correlations between ``use_cols``."""
    # Coerce exactly like the summaries and histograms do, so object-dtype
    # columns holding numbers are neither dropped nor rejected by corr().
    numeric_frame = df[use_cols].apply(pd.to_numeric, errors="coerce")
    corr_df = numeric_frame.corr()
    if corr_df.empty:
        return

    sns, plt = _lazy_import_plotting()
    fig, ax = plt.subplots(figsize=(max(6, 1.2 * len(use_cols)), max(5, 1.0 * len(use_cols))))
    sns.heatmap(corr_df, vmin=-1, vmax=1, annot=True, fmt=".2f", square=True, cbar=True, cmap="coolwarm", ax=ax)
    ax.set_title("Correlation (numeric variables)")
    fig.tight_layout()
    fig.savefig(out / "corr_numeric.png", dpi=dpi)
    plt.close(fig)


def run_eda_v3(
    df: pd.DataFrame,
    numeric_cols: List[str],
    cat_cols: List[str],
    date_cols: Optional[List[str]] = None,   # parsed only; never plotted on the combined figure
    uid_col: Optional[str] = "UID",
    out_dir: str = "eda_outputs",
    digits: int = 2,
    iqr_k: float = 1.5,
    topk_extremes: int = 10,
    max_cols_in_grid: int = 3,
    save_fig: bool = True,
    dpi: int = 150
) -> Dict[str, pd.DataFrame]:
    """Run a compact EDA pass and write tables and figures to ``out_dir``.

    Outputs:
      1) One combined figure (``all_distributions.png``) holding every numeric
         distribution (histogram + KDE) and every categorical count plot.
      2) A separate numeric correlation heatmap (``corr_numeric.png``).
      3) Three CSV files: ``summary_numeric.csv``, ``summary_categorical.csv``
         and ``extremes_long.csv`` (the latter two only when non-empty).

    Columns named in ``numeric_cols`` / ``cat_cols`` / ``date_cols`` that are
    absent from ``df`` are skipped silently.

    NOTE: columns listed in ``date_cols`` are converted with
    ``pd.to_datetime(errors="coerce")`` IN PLACE on the DataFrame passed by the
    caller; unparseable entries become ``NaT``. Pass a copy if the original
    values must be preserved.

    Args:
        df: Input table.
        numeric_cols: Columns summarised, plotted and correlated as numbers
            (each is coerced with ``pd.to_numeric(errors="coerce")``).
        cat_cols: Columns summarised with value counts (NaN kept as a level).
        date_cols: Columns to convert to datetime; not otherwise used.
        uid_col: Identifier column reported in the extremes table. When it is
            falsy or missing from ``df``, a ``row_index`` column holding the
            row's index label is reported instead.
        out_dir: Output directory; created if needed.
        digits: Decimals used in formatted statistics and plot titles.
        iqr_k: IQR multiplier for the fences drawn on histograms.
        topk_extremes: Number of lowest and of highest rows listed per column.
        max_cols_in_grid: Column cap passed to the grid-layout helper.
        save_fig: When False, no figure is produced.
        dpi: Resolution of the saved figures.

    Returns:
        Dict with the DataFrames ``summary_numeric``, ``summary_categorical``
        and ``extremes_long``.
    """
    out = Path(out_dir)
    out.mkdir(parents=True, exist_ok=True)

    # Parse date columns (type conversion only). This mutates the caller's frame.
    for c in (date_cols or []):
        if c in df.columns:
            df[c] = pd.to_datetime(df[c], errors="coerce")

    present_numeric = [c for c in numeric_cols if c in df.columns]
    present_cat = [c for c in cat_cols if c in df.columns]

    # ====== Numeric summary ======
    numeric_summary = pd.DataFrame([
        {"variable": col, **_num_summary(df[col], digits)}
        for col in present_numeric
    ])
    numeric_summary.to_csv(out / "summary_numeric.csv", index=False, encoding="utf-8-sig")

    # ====== Categorical summary ======
    total = len(df)
    cat_rows = [
        {
            "variable": col,
            "level": lvl,
            "count": int(n),
            "percent(%)": round(n / total * 100, 2) if total else 0.0,
        }
        for col in present_cat
        for lvl, n in df[col].value_counts(dropna=False).items()
    ]
    cat_summary = pd.DataFrame(cat_rows)
    if not cat_summary.empty:
        cat_summary.to_csv(out / "summary_categorical.csv", index=False, encoding="utf-8-sig")

    # ====== Top-k extremes (lowest and highest topk_extremes rows per column) ======
    has_uid = bool(uid_col) and uid_col in df.columns
    id_key = uid_col if has_uid else "row_index"
    extremes_records: List[Dict[str, Any]] = []
    if present_numeric:
        # Positional lookup table for the row identifier, built once.
        ids = df[uid_col].to_numpy() if has_uid else [_row_label(lbl) for lbl in df.index]
        for col in present_numeric:
            values = pd.to_numeric(df[col], errors="coerce")
            extremes_records.extend(_extreme_records(values, ids, col, id_key, topk_extremes))

    extremes_long = pd.DataFrame(extremes_records)
    if not extremes_long.empty:
        extremes_long = extremes_long.sort_values(["variable", "side", "rank"])
        extremes_long.to_csv(out / "extremes_long.csv", index=False, encoding="utf-8-sig")

    # ====== Figures ======
    if save_fig:
        plot_items = [("num", c) for c in present_numeric] + [("cat", c) for c in present_cat]
        if plot_items:
            _plot_distributions(df, plot_items, out, digits, iqr_k, max_cols_in_grid, dpi)
        if present_numeric:
            _plot_correlation(df, present_numeric, out, dpi)

    # Return only the three tables that are also written to disk.
    return {
        "summary_numeric": numeric_summary,
        "summary_categorical": cat_summary,
        "extremes_long": extremes_long,
    }


In [9]:
# Example: run the EDA on the `df` loaded at the top of the notebook.
numeric_cols = ["Age", "Height", "Weight", "BMI", "ASMI", "Time_period"]
cat_cols = ["Gender", "Low_muscle_mass"]
date_cols = ["XRAY", "DXA"]  # use [] when there are no date columns

results = run_eda_v3(
    df=df,
    numeric_cols=numeric_cols,
    cat_cols=cat_cols,
    date_cols=date_cols,
    uid_col="UID",
    out_dir="eda_outputs",
    digits=2,
    iqr_k=1.5,
    topk_extremes=10,
    max_cols_in_grid=3,   # at most 3 subplots per row
    save_fig=True,
    dpi=150
)

# A cell renders only its last bare expression, so show each table explicitly
# with display() (provided by the IPython/Jupyter kernel).
display(results["summary_numeric"].head())
display(results["summary_categorical"].head())
display(results["extremes_long"].head())
# Figures are written to eda_outputs/all_distributions.png and eda_outputs/corr_numeric.png


Unnamed: 0,variable,side,rank,UID,value
90,ASMI,high,1,U0029,51.368421
91,ASMI,high,2,U0435,10.63
92,ASMI,high,3,U0769,10.261393
93,ASMI,high,4,U0325,9.74
94,ASMI,high,5,U0088,9.44


In [27]:
# Remove subject U0029: in the extremes table shown above its ASMI is
# 51.368421, while the next-highest value is 10.63.
# A boolean mask drops exactly the matching rows; drop(<index labels>) would
# also remove any other row that happens to share an index label with them.
df = df[df["UID"] != "U0029"]

In [29]:
# Write the filtered frame back to disk without the index column.
# NOTE(review): this overwrites the very file the notebook loads in its first
# cell, so the unfiltered data is no longer recoverable from here — consider
# writing to a new file name instead.
# NOTE(review): when the cells are run top to bottom, run_eda_v3 has already
# converted the date_cols of `df` in place (errors="coerce"), so those columns
# are saved in their converted form — confirm this is intended.
df.to_csv('train.csv', index=False)