## 센터별 1일 후 하수처리량 10단계 분류 모델링

1. 합계의 10분위(quantile, `pd.qcut`) 경계를 이용해서 10단계 분류 라벨링 (min~max 등간격 구간이 아니라 분위수 구간)
2. 파생변수 추가
3. 1일후, 2일후 하수처리량도 먼저 구하고 각 값에 해당하는 분류로 라벨링

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import shap

# Korean font setup for matplotlib.
# Windows alternative:
# plt.rcParams['font.family'] ='Malgun Gothic'
# Cross-platform alternative (use whichever of these fonts is available):
# plt.rcParams['font.family'] = ['AppleGothic', 'Malgun Gothic', 'NanumGothic', 'DejaVu Sans']
plt.rcParams.update({
    'font.family': 'AppleGothic',   # macOS
    'axes.unicode_minus': False,    # draw minus signs as ASCII hyphens
})

In [None]:
# Load the merged daily dataset of each treatment centre (one CSV per centre).
nanji = pd.read_csv('../data/processed/center_season/nanji/난지_merged.csv', encoding='utf-8-sig')
jungnang = pd.read_csv('../data/processed/center_season/jungnang/중랑_merged.csv', encoding='utf-8-sig')
seonam = pd.read_csv('../data/processed/center_season/seonam/서남_merged.csv', encoding='utf-8-sig')
tancheon = pd.read_csv('../data/processed/center_season/tancheon/탄천_merged.csv', encoding='utf-8-sig')

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() emits an extra "None" line after each report.
nanji.info()
jungnang.info()
seonam.info()
tancheon.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3103 entries, 0 to 3102
Data columns (total 28 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   날짜             3103 non-null   object 
 1   요일             3103 non-null   object 
 2   공휴일            3103 non-null   int64  
 3   목욕장업           3103 non-null   int64  
 4   세탁업            3103 non-null   int64  
 5   수영장업           3103 non-null   int64  
 6   종합체육시설업        3103 non-null   int64  
 7   체력단련장업         3103 non-null   int64  
 8   하천             3103 non-null   float64
 9   생활인구           3103 non-null   float64
 10  불쾌지수(DI)       3103 non-null   float64
 11  불쾌지수등급         3103 non-null   object 
 12  일_일강수량(mm)     3103 non-null   float64
 13  일_최저기온(°C)     3103 non-null   float64
 14  일_평균기온(°C)     3103 non-null   float64
 15  일_최고기온(°C)     3103 non-null   float64
 16  일_평균풍속(m/s)    3103 non-null   float64
 17  일_최대순간풍속(m/s)  3103 non-null   float64
 18  최저습도(%) 

In [4]:
# NOTE: everything between the triple quotes below is a bare string literal,
# so none of this nanji-only feature code runs. The `make_features` function
# defined in a later cell is what actually builds the features.
'''# 안전: 날짜 정렬ㅁ
nanji['날짜'] = pd.to_datetime(nanji['날짜'])
nanji = nanji.sort_values('날짜').reset_index(drop=True)
# 난지 6월 데이터 없음 -> 다 0으로 되어 있으니까 제거 
nanji = nanji[nanji["날짜"] < "2025-06-01"] 

# 날짜 파생 (순환 인코딩)
def add_cyclical_features(df, date_col='날짜'):
    d = df[date_col].dt

    # 월
    df['월_sin'] = np.sin(2*np.pi*d.month/12)
    df['월_cos'] = np.cos(2*np.pi*d.month/12)

    # 요일
    dow = d.weekday
    df['요일_sin'] = np.sin(2*np.pi*dow/7)
    df['요일_cos'] = np.cos(2*np.pi*dow/7)

    # 연중일
    doy = d.dayofyear
    df['연중일_sin'] = np.sin(2*np.pi*doy/365.25)
    df['연중일_cos'] = np.cos(2*np.pi*doy/365.25)
    return df

nanji = add_cyclical_features(nanji)

# 라그 (lag) : 과거 값
add_lag_specs = {
    '일_평균기온(°C)': [1, 3],
    '일_최고기온(°C)': [1, 3],
    '일_최저기온(°C)': [1, 3],
    '평균습도(%)': [1, 3],
    '불쾌지수(DI)': [1, 3],
    '하천': [1, 3],
    '일_평균풍속(m/s)': [1],  # 영향 약할 수 있어 가볍게
    '합계': [1,3,7],
    '생활인구': [1,3,7],
    '일_일강수량(mm)': [1,3],
}
for col, lags in add_lag_specs.items():
    for L in lags:
        nanji[f'{col}_lag{L}'] = nanji[col].shift(L)

# 롤링 (오늘까지) : 누적값
nanji['합계_roll7_mean'] = nanji['합계'].rolling(7, min_periods=1).mean()
nanji['합계_roll7_std']  = nanji['합계'].rolling(7, min_periods=2).std()
nanji['생활인구_roll7_mean'] = nanji['생활인구'].rolling(7, min_periods=1).mean()
nanji['강수량_roll3_sum'] = nanji['일_일강수량(mm)'].rolling(3, min_periods=1).sum()
nanji['강수량_roll7_sum'] = nanji['일_일강수량(mm)'].rolling(7, min_periods=1).sum()

# 변화율/증감
'''
# The triple quote above closed the first string literal, so the four lines
# below are ordinary (commented-out) code: disabled change-rate features.
# nanji['합계_전일증감률']    = nanji['합계'].pct_change()
# nanji['합계_전일변화']      = nanji['합계'].diff()
# nanji['생활인구_전일증감률'] = nanji['생활인구'].pct_change()
# nanji['강수량_전일변화']    = nanji['일_일강수량(mm)'].diff()
# A second string literal starts here. It is the last expression of the cell,
# which is why the notebook echoes its text as the cell output.
'''

# 날씨/이벤트
nanji['일교차']   = nanji['일_최고기온(°C)'] - nanji['일_최저기온(°C)']
nanji['체감온도'] = nanji['일_평균기온(°C)'] - 0.7*nanji['일_평균풍속(m/s)']
nanji['습도차이'] = nanji['최고습도(%)'] - nanji['최저습도(%)']

nanji['강수_여부'] = (nanji['일_일강수량(mm)'] > 0).astype(int)
nanji['폭우_여부'] = (nanji['일_일강수량(mm)'] >= 80).astype(int)
nanji['폭염일']    = (nanji['일_최고기온(°C)'] >= 33).astype(int)
# nanji['한파_여부'] = (nanji['일_최저기온(°C)'] <= -12).astype(int)

# 인허가 파생
# nanji['인허가'] = nanji[['목욕장업','세탁업','수영장업','종합체육시설업','체력단련장업']].sum(axis=1)

# 타깃 생성
nanji['합계_1일후'] = nanji['합계'].shift(-1)
nanji['합계_2일후'] = nanji['합계'].shift(-2)

# 요일 숫자로
nanji['요일_숫자'] = nanji['날짜'].dt.weekday # 0:월요일, 1:화요일, ... , 6:일요일

# 계절, 불쾌지수등급 숫자로
nanji = nanji.replace(
    {
        '계절': {'봄': 0, '여름' : 1, '가을' : 2, '겨울' : 3},
        '불쾌지수등급' : {'쾌적' : 0, '약간 불쾌' : 1, '불쾌' : 2, '매우 불쾌' : 3, '극심한 불쾌' : 4}
     })



nanji.columns
'''

"\n\n# 날씨/이벤트\nnanji['일교차']   = nanji['일_최고기온(°C)'] - nanji['일_최저기온(°C)']\nnanji['체감온도'] = nanji['일_평균기온(°C)'] - 0.7*nanji['일_평균풍속(m/s)']\nnanji['습도차이'] = nanji['최고습도(%)'] - nanji['최저습도(%)']\n\nnanji['강수_여부'] = (nanji['일_일강수량(mm)'] > 0).astype(int)\nnanji['폭우_여부'] = (nanji['일_일강수량(mm)'] >= 80).astype(int)\nnanji['폭염일']    = (nanji['일_최고기온(°C)'] >= 33).astype(int)\n# nanji['한파_여부'] = (nanji['일_최저기온(°C)'] <= -12).astype(int)\n\n# 인허가 파생\n# nanji['인허가'] = nanji[['목욕장업','세탁업','수영장업','종합체육시설업','체력단련장업']].sum(axis=1)\n\n# 타깃 생성\nnanji['합계_1일후'] = nanji['합계'].shift(-1)\nnanji['합계_2일후'] = nanji['합계'].shift(-2)\n\n# 요일 숫자로\nnanji['요일_숫자'] = nanji['날짜'].dt.weekday # 0:월요일, 1:화요일, ... , 6:일요일\n\n# 계절, 불쾌지수등급 숫자로\nnanji = nanji.replace(\n    {\n        '계절': {'봄': 0, '여름' : 1, '가을' : 2, '겨울' : 3},\n        '불쾌지수등급' : {'쾌적' : 0, '약간 불쾌' : 1, '불쾌' : 2, '매우 불쾌' : 3, '극심한 불쾌' : 4}\n     })\n\n\n\nnanji.columns\n"

In [5]:
def _numeric_column(df, name):
    """Return column *name* coerced to numbers, or an all-NaN Series if absent."""
    if name in df.columns:
        return pd.to_numeric(df[name], errors='coerce')
    return pd.Series(np.nan, index=df.index)


def _apparent_temperature(T, V_ms, RH):
    """Return the apparent temperature (°C) from temperature, wind and humidity.

    T is the air temperature in °C, V_ms the wind speed in m/s and RH the
    relative humidity in %. Three indices are combined with the priority
    wind chill > heat index > Steadman apparent temperature:

    * wind chill where T <= 10 °C and wind >= 4.8 km/h,
    * heat index (NOAA regression, evaluated in °F) where T >= 80 °F and
      RH >= 40 %,
    * Steadman apparent temperature everywhere else.
    """
    # Wind chill: the formula expects wind speed in km/h.
    V_kmh = V_ms * 3.6
    wind_term = np.power(V_kmh, 0.16)
    wct_raw = 13.12 + 0.6215*T - 11.37*wind_term + 0.3965*T*wind_term
    wc_valid = (T <= 10.0) & (V_kmh >= 4.8)

    # Heat index: NOAA regression in Fahrenheit, converted back to °C at the end.
    T_f = T * 9/5 + 32
    HI_f = (-42.379 + 2.04901523*T_f + 10.14333127*RH
            - 0.22475541*T_f*RH - 0.00683783*T_f**2 - 0.05481717*RH**2
            + 0.00122874*T_f**2*RH + 0.00085282*T_f*RH**2
            - 0.00000199*T_f**2*RH**2)
    # Low-humidity adjustment. The radicand is only non-negative inside the
    # masked range (80..112 °F); clipping avoids sqrt-of-negative warnings for
    # rows where the adjustment is discarded anyway.
    mask_low = (RH < 13) & (T_f >= 80) & (T_f <= 112)
    radicand = ((17 - np.abs(T_f - 95))/17).clip(lower=0)
    adj_low = ((13 - RH)/4) * np.sqrt(radicand)
    HI_f = HI_f.where(~mask_low, HI_f - adj_low)
    # High-humidity adjustment.
    mask_high = (RH > 85) & (T_f >= 80) & (T_f <= 87)
    adj_high = ((RH - 85)/10) * ((87 - T_f)/5)
    HI_f = HI_f.where(~mask_high, HI_f + adj_high)
    hi_valid = (T_f >= 80) & (RH >= 40)
    HI_c = (HI_f - 32) * 5/9

    # Steadman apparent temperature (continuous, defined for every row):
    # AT = T + 0.33*e - 0.70*v - 4.00, with vapour pressure e in hPa.
    e = (RH/100.0) * 6.105 * np.exp(17.27*T/(237.7 + T))
    apparent = T + 0.33*e - 0.70*V_ms - 4.00

    # Later assignments win, which yields wind chill > heat index > Steadman.
    apparent[hi_valid] = HI_c[hi_valid]
    apparent[wc_valid] = wct_raw[wc_valid]
    return apparent


def make_features(df, cutoff_date="2025-06-01", heavy_rain_mm=80):
    """Build the modelling table for one centre from its merged daily data.

    Parameters
    ----------
    df : DataFrame
        Must contain '날짜', '계절', '불쾌지수등급', '일_일강수량(mm)',
        '일_최고기온(°C)', '일_최저기온(°C)' and '합계'. '일_평균기온(°C)',
        '일_평균풍속(m/s)' and '평균습도(%)' are used when present; a missing
        one is treated as all-NaN.
    cutoff_date : str or Timestamp or None
        Rows dated on or after this day are removed (the source comment says
        some centres only have zero-filled placeholders from June 2025).
        None keeps every row.
    heavy_rain_mm : float
        Daily rainfall at or above this value sets '폭우_여부' to 1.

    Returns
    -------
    DataFrame
        A new, date-sorted frame (the input is not modified) with month and
        weekday numbers, numeric season / discomfort grades, rainfall lags and
        rolling sums, the daily temperature range, the heavy-rain flag,
        '체감온도(°C)' and the targets '합계_1일후' / '합계_2일후'. Every row
        containing any missing value is dropped, which includes the first two
        and last two days because of the shifts.
    """
    df = df.copy()  # keep the caller's frame untouched

    # 0) Parse and sort by date, then cut off the unusable tail.
    df['날짜'] = pd.to_datetime(df['날짜'])
    df = df.sort_values('날짜').reset_index(drop=True)
    if cutoff_date is not None:
        df = df[df["날짜"] < pd.Timestamp(cutoff_date)]

    # 1) Month / weekday numbers (0 = Monday ... 6 = Sunday).
    df['월'] = df['날짜'].dt.month
    df['요일'] = df['날짜'].dt.weekday

    # 2) Season and discomfort grade as integer codes; unmapped labels become
    #    <NA> and are removed by the final dropna.
    season_map = {'봄': 0, '여름': 1, '가을': 2, '겨울': 3}
    discomfort_map = {'쾌적': 0, '약간 불쾌': 1, '불쾌': 2, '매우 불쾌': 3, '극심한 불쾌': 4}
    df['계절'] = df['계절'].map(season_map).astype('Int64')
    df['불쾌지수등급'] = df['불쾌지수등급'].map(discomfort_map).astype('Int64')

    # 3) Rainfall lags and rolling sums (windows end on the current day).
    rain = df['일_일강수량(mm)']
    df['강수량_1일전'] = rain.shift(1)
    df['강수량_2일전'] = rain.shift(2)
    for window in (1, 2, 3, 5, 7):
        df[f'강수량_{window}일_누적'] = rain.rolling(window, min_periods=1).sum()

    df['일교차'] = df['일_최고기온(°C)'] - df['일_최저기온(°C)']
    df['폭우_여부'] = (rain >= heavy_rain_mm).astype(int)

    # 4) Apparent temperature. Only the combined value is stored; the
    #    individual indices are intermediate results.
    df['체감온도(°C)'] = _apparent_temperature(
        _numeric_column(df, '일_평균기온(°C)'),
        _numeric_column(df, '일_평균풍속(m/s)'),
        _numeric_column(df, '평균습도(%)'),
    )

    # 5) Targets: total inflow one and two days ahead.
    df['합계_1일후'] = df['합계'].shift(-1)
    df['합계_2일후'] = df['합계'].shift(-2)

    # 6) Drop incomplete rows and renumber.
    df = df.dropna().reset_index(drop=True)

    return df

# Run the shared feature builder on every centre's raw frame.
nanji, jungnang, seonam, tancheon = (
    make_features(frame) for frame in (nanji, jungnang, seonam, tancheon)
)


In [6]:
# =========================================================
# 합계 10분위 라벨링 + 경계/분포 확인 헬퍼 (1일/2일 후 모두 지원)
# ---------------------------------------------------------
# - label_with_today_deciles:
#     오늘 '합계'로 10분위 경계 생성 → 합계_class 생성
#     같은 경계로 '합계_1일후' → 합계_1일후_class
#               '합계_2일후' → 합계_2일후_class
# - inspect_deciles:
#     오늘 '합계'의 분위수 경계와 구간별 개수/비율 확인
# - inspect_future_classes:
#     (오늘 경계 기준으로) 지정한 타깃 컬럼(1일/2일 후 등)이
#     각 구간에 얼마나 들어갔는지 확인
# =========================================================
def _codes_to_nullable_int(binned):
    """Turn a categorical Series into Int64 bin codes, with -1 (no bin) as <NA>."""
    codes = binned.cat.codes
    return codes.astype("Int64").mask(codes == -1)


def label_with_today_deciles(df, q=10):
    """Label today's and future totals with today's quantile classes.

    Parameters
    ----------
    df : DataFrame
        Data of a single centre. Requires '합계' (today's total). '합계_1일후'
        and '합계_2일후' are optional; when missing they are created as
        '합계' shifted by -1 / -2 rows.
    q : int
        Number of quantile bins (10 -> deciles). Duplicate edges are dropped,
        so fewer than q classes can result.

    Returns
    -------
    DataFrame
        A copy of df with three added nullable-integer (Int64) columns:
        '합계_class', '합계_1일후_class' and '합계_2일후_class'. All three use
        the same bin edges, computed from today's '합계'. Missing values and
        future values outside today's observed [min, max] are labelled <NA>.

    Notes
    -----
    The future columns are cut with the exact edges returned by
    ``retbins=True``. The interval categories that ``qcut`` reports are
    rounded for display (``precision=3``), so cutting against them can put a
    value that sits next to an edge into a different class than ``qcut``
    itself assigned to the same value.
    """
    out = df.copy()

    # Force numeric dtypes; create the future columns when they are absent.
    out["합계"] = pd.to_numeric(out["합계"], errors="coerce")
    future_cols = (("합계_1일후", 1), ("합계_2일후", 2))
    for col, lead in future_cols:
        if col in out.columns:
            out[col] = pd.to_numeric(out[col], errors="coerce")
        else:
            out[col] = out["합계"].shift(-lead)

    # Today's classes plus the exact (unrounded) edges behind them.
    qbins, edges = pd.qcut(out["합계"], q=q, retbins=True, duplicates="drop")
    out["합계_class"] = _codes_to_nullable_int(qbins)

    # Future classes: same edges, lowest edge inclusive just like qcut.
    for col, _ in future_cols:
        binned = pd.cut(out[col], bins=edges, include_lowest=True)
        out[f"{col}_class"] = _codes_to_nullable_int(binned)

    return out


def inspect_deciles(df, q=10):
    """Summarise the quantile bins of today's total ('합계').

    Bins come from ``pd.qcut`` with ``duplicates='drop'``, so the table can
    have fewer than q rows. Columns: 'decile' (0-based bin number, matching
    the class labels), 'left' / 'right' (interval edges as reported by qcut),
    'count' (rows in the bin) and 'share' (count divided by len(df), rounded
    to four decimals).
    """
    binned = pd.qcut(df["합계"], q=q, duplicates="drop")
    intervals = binned.cat.categories              # IntervalIndex of the bins
    per_bin = binned.value_counts(sort=False)      # keeps the bin order

    table = pd.DataFrame({"decile": range(len(intervals))})
    table["left"] = intervals.left.values
    table["right"] = intervals.right.values
    table["count"] = per_bin.values
    table["share"] = table["count"].div(len(df)).round(4)
    return table


def inspect_future_classes(df, target_col, q=10):
    """Count how a target column falls into today's quantile bins.

    Parameters
    ----------
    df : DataFrame
        Must contain '합계' (defines the bins) and target_col.
    target_col : str
        Column to distribute over the bins, e.g. '합계_1일후' or '합계_2일후'.
    q : int
        Number of quantile bins; duplicate edges are dropped.

    Returns
    -------
    DataFrame
        Columns 'decile', 'left', 'right', 'count' and 'share'. 'share' uses
        the number of non-missing target values as denominator. Target values
        outside today's [min, max] are not counted in any bin.

    Notes
    -----
    Counting uses the exact edges from ``qcut(..., retbins=True)``. The
    'left' / 'right' columns show the interval categories reported by qcut,
    which are rounded for display; cutting against those rounded intervals
    could count a value lying next to an edge in the neighbouring bin.
    """
    # Today's bins: display intervals plus the exact edges behind them.
    qbins, edges = pd.qcut(df["합계"], q=q, retbins=True, duplicates="drop")
    cats = qbins.cat.categories

    # Distribute the target over the exact edges (lowest edge inclusive).
    tgt_cut = pd.cut(df[target_col], bins=edges, include_lowest=True)
    vc = tgt_cut.value_counts(sort=False)

    denom = df[target_col].notna().sum()

    tbl = pd.DataFrame({
        "decile":         range(len(cats)),
        "left":           cats.left.values,
        "right":          cats.right.values,
        "count":          vc.values,
        "share":          (vc.values / max(denom, 1)).round(4)
    })
    return tbl
# 1) Create the class labels; each centre gets its own quantile edges.
nanji, seonam, jungnang, tancheon = (
    label_with_today_deciles(frame, q=10)
    for frame in (nanji, seonam, jungnang, tancheon)
)

# Only these two centres are inspected in detail.
_inspected = {"nanji": nanji, "seonam": seonam}

# 2) Decile edges and bin frequencies of today's total.
for name, df_ in _inspected.items():
    print(f"[{name}] 오늘 합계 10분위 경계/분포")
    display(inspect_deciles(df_, q=10))

# 3) and 4) How the 1-day-ahead, then the 2-day-ahead totals fall into
#    those same bins.
for _horizon, _target_col in (("1일 후", "합계_1일후"), ("2일 후", "합계_2일후")):
    for name, df_ in _inspected.items():
        print(f"[{name}] {_horizon} 합계 분포 (오늘 경계 기준)")
        display(inspect_future_classes(df_, target_col=_target_col, q=10))

# 5) Quick look at the label columns of every centre.
_preview_cols = ["날짜", "합계", "합계_class", "합계_1일후", "합계_1일후_class",
                 "합계_2일후", "합계_2일후_class"]
for name, df_ in {"nanji": nanji, "seonam": seonam,
                  "jungnang": jungnang, "tancheon": tancheon}.items():
    print(f"[{name}] 주요 라벨 컬럼 미리보기")
    display(df_[_preview_cols].head(10))


[nanji] 오늘 합계 10분위 경계/분포


Unnamed: 0,decile,left,right,count,share
0,0,442332.799,507748.0,307,0.1
1,1,507748.0,521413.0,307,0.1
2,2,521413.0,532854.2,307,0.1
3,3,532854.2,543134.942,307,0.1
4,4,543134.942,556440.73,307,0.1
5,5,556440.73,574501.6,306,0.0997
6,6,574501.6,597499.162,307,0.1
7,7,597499.162,650692.8,307,0.1
8,8,650692.8,778260.8,307,0.1
9,9,778260.8,1381444.0,307,0.1


[seonam] 오늘 합계 10분위 경계/분포


Unnamed: 0,decile,left,right,count,share
0,0,1160336.999,1392512.4,307,0.1
1,1,1392512.4,1432768.2,307,0.1
2,2,1432768.2,1456865.6,307,0.1
3,3,1456865.6,1479430.6,307,0.1
4,4,1479430.6,1504241.0,307,0.1
5,5,1504241.0,1531060.4,306,0.0997
6,6,1531060.4,1572728.2,307,0.1
7,7,1572728.2,1653688.2,307,0.1
8,8,1653688.2,1916230.0,307,0.1
9,9,1916230.0,2780034.0,307,0.1


[nanji] 1일 후 합계 분포 (오늘 경계 기준)


Unnamed: 0,decile,left,right,count,share
0,0,442332.799,507748.0,306,0.0997
1,1,507748.0,521413.0,307,0.1
2,2,521413.0,532854.2,307,0.1
3,3,532854.2,543134.942,307,0.1
4,4,543134.942,556440.73,307,0.1
5,5,556440.73,574501.6,306,0.0997
6,6,574501.6,597499.162,308,0.1004
7,7,597499.162,650692.8,307,0.1
8,8,650692.8,778260.8,307,0.1
9,9,778260.8,1381444.0,307,0.1


[seonam] 1일 후 합계 분포 (오늘 경계 기준)


Unnamed: 0,decile,left,right,count,share
0,0,1160336.999,1392512.4,308,0.1004
1,1,1392512.4,1432768.2,307,0.1
2,2,1432768.2,1456865.6,307,0.1
3,3,1456865.6,1479430.6,306,0.0997
4,4,1479430.6,1504241.0,307,0.1
5,5,1504241.0,1531060.4,306,0.0997
6,6,1531060.4,1572728.2,307,0.1
7,7,1572728.2,1653688.2,307,0.1
8,8,1653688.2,1916230.0,307,0.1
9,9,1916230.0,2780034.0,307,0.1


[nanji] 2일 후 합계 분포 (오늘 경계 기준)


Unnamed: 0,decile,left,right,count,share
0,0,442332.799,507748.0,306,0.0997
1,1,507748.0,521413.0,306,0.0997
2,2,521413.0,532854.2,307,0.1
3,3,532854.2,543134.942,307,0.1
4,4,543134.942,556440.73,307,0.1
5,5,556440.73,574501.6,306,0.0997
6,6,574501.6,597499.162,309,0.1007
7,7,597499.162,650692.8,307,0.1
8,8,650692.8,778260.8,307,0.1
9,9,778260.8,1381444.0,307,0.1


[seonam] 2일 후 합계 분포 (오늘 경계 기준)


Unnamed: 0,decile,left,right,count,share
0,0,1160336.999,1392512.4,309,0.1007
1,1,1392512.4,1432768.2,307,0.1
2,2,1432768.2,1456865.6,307,0.1
3,3,1456865.6,1479430.6,305,0.0994
4,4,1479430.6,1504241.0,307,0.1
5,5,1504241.0,1531060.4,306,0.0997
6,6,1531060.4,1572728.2,307,0.1
7,7,1572728.2,1653688.2,307,0.1
8,8,1653688.2,1916230.0,307,0.1
9,9,1916230.0,2780034.0,307,0.1


[nanji] 주요 라벨 컬럼 미리보기


Unnamed: 0,날짜,합계,합계_class,합계_1일후,합계_1일후_class,합계_2일후,합계_2일후_class
0,2017-01-03,488454.0,0,510180.0,1,496289.0,0
1,2017-01-04,510180.0,1,496289.0,0,492958.0,0
2,2017-01-05,496289.0,0,492958.0,0,493911.0,0
3,2017-01-06,492958.0,0,493911.0,0,501165.0,0
4,2017-01-07,493911.0,0,501165.0,0,506762.0,0
5,2017-01-08,501165.0,0,506762.0,0,480426.0,0
6,2017-01-09,506762.0,0,480426.0,0,485066.0,0
7,2017-01-10,480426.0,0,485066.0,0,504841.0,0
8,2017-01-11,485066.0,0,504841.0,0,502739.0,0
9,2017-01-12,504841.0,0,502739.0,0,483969.0,0


[seonam] 주요 라벨 컬럼 미리보기


Unnamed: 0,날짜,합계,합계_class,합계_1일후,합계_1일후_class,합계_2일후,합계_2일후_class
0,2017-01-03,1466182.0,3,1460573.0,3,1455815.0,2
1,2017-01-04,1460573.0,3,1455815.0,2,1452640.0,2
2,2017-01-05,1455815.0,2,1452640.0,2,1451580.0,2
3,2017-01-06,1452640.0,2,1451580.0,2,1446570.0,2
4,2017-01-07,1451580.0,2,1446570.0,2,1453718.0,2
5,2017-01-08,1446570.0,2,1453718.0,2,1455235.0,2
6,2017-01-09,1453718.0,2,1455235.0,2,1461818.0,3
7,2017-01-10,1455235.0,2,1461818.0,3,1455187.0,2
8,2017-01-11,1461818.0,3,1455187.0,2,1454215.0,2
9,2017-01-12,1455187.0,2,1454215.0,2,1404229.0,1


[jungnang] 주요 라벨 컬럼 미리보기


Unnamed: 0,날짜,합계,합계_class,합계_1일후,합계_1일후_class,합계_2일후,합계_2일후_class
0,2017-01-03,1224533.0,5,1225081.0,5,1215632.0,4
1,2017-01-04,1225081.0,5,1215632.0,4,1249242.0,6
2,2017-01-05,1215632.0,4,1249242.0,6,1243458.0,5
3,2017-01-06,1249242.0,6,1243458.0,5,1211490.0,4
4,2017-01-07,1243458.0,5,1211490.0,4,1221105.0,4
5,2017-01-08,1211490.0,4,1221105.0,4,1191609.0,3
6,2017-01-09,1221105.0,4,1191609.0,3,1210332.0,4
7,2017-01-10,1191609.0,3,1210332.0,4,1171830.0,2
8,2017-01-11,1210332.0,4,1171830.0,2,1185414.0,3
9,2017-01-12,1171830.0,2,1185414.0,3,1224600.0,5


[tancheon] 주요 라벨 컬럼 미리보기


Unnamed: 0,날짜,합계,합계_class,합계_1일후,합계_1일후_class,합계_2일후,합계_2일후_class
0,2017-01-03,674834.0,2,671066.0,2,656065.0,1
1,2017-01-04,671066.0,2,656065.0,1,660300.0,1
2,2017-01-05,656065.0,1,660300.0,1,659687.0,1
3,2017-01-06,660300.0,1,659687.0,1,648120.0,0
4,2017-01-07,659687.0,1,648120.0,0,651859.0,0
5,2017-01-08,648120.0,0,651859.0,0,644396.0,0
6,2017-01-09,651859.0,0,644396.0,0,674222.0,2
7,2017-01-10,644396.0,0,674222.0,2,674328.0,2
8,2017-01-11,674222.0,2,674328.0,2,673495.0,2
9,2017-01-12,674328.0,2,673495.0,2,671626.0,2


In [7]:
# Show the shape and column index of each labelled frame.
for _frame in (nanji, jungnang, tancheon, seonam):
    print(_frame.shape, _frame.columns)

(3069, 44) Index(['날짜', '요일', '공휴일', '목욕장업', '세탁업', '수영장업', '종합체육시설업', '체력단련장업', '하천',
       '생활인구', '불쾌지수(DI)', '불쾌지수등급', '일_일강수량(mm)', '일_최저기온(°C)', '일_평균기온(°C)',
       '일_최고기온(°C)', '일_평균풍속(m/s)', '일_최대순간풍속(m/s)', '최저습도(%)', '평균습도(%)',
       '최고습도(%)', '습도표준편차', '1처리장', '2처리장', '정화조', '중계펌프장', '합계', '계절', '월',
       '강수량_1일전', '강수량_2일전', '강수량_1일_누적', '강수량_2일_누적', '강수량_3일_누적',
       '강수량_5일_누적', '강수량_7일_누적', '일교차', '폭우_여부', '체감온도(°C)', '합계_1일후',
       '합계_2일후', '합계_class', '합계_1일후_class', '합계_2일후_class'],
      dtype='object')
(3069, 44) Index(['날짜', '요일', '공휴일', '목욕장업', '세탁업', '수영장업', '종합체육시설업', '체력단련장업', '하천',
       '생활인구', '불쾌지수(DI)', '불쾌지수등급', '일_일강수량(mm)', '일_최저기온(°C)', '일_평균기온(°C)',
       '일_최고기온(°C)', '일_평균풍속(m/s)', '일_최대순간풍속(m/s)', '최저습도(%)', '평균습도(%)',
       '최고습도(%)', '습도표준편차', '1처리장', '2처리장', '3처리장', '4처리장', '합계', '계절', '월',
       '강수량_1일전', '강수량_2일전', '강수량_1일_누적', '강수량_2일_누적', '강수량_3일_누적',
       '강수량_5일_누적', '강수량_7일_누적', '일교차', '폭우_여부', '체감온도(°C)', '합계_1일후',
  

In [8]:
# Columns that are never used as model inputs.
not_use_col = [
    # plant-level columns and today's total
    '1처리장', '2처리장', '정화조', '중계펌프장', '합계', '시설현대화', '3처리장', '4처리장',
    # future totals (the raw targets)
    '합계_1일후', '합계_2일후',
    # class labels derived from the totals
    '합계_class', '합계_1일후_class', '합계_2일후_class',
]

In [50]:
# =========================================================
# 다중분류 파이프라인 (4개 센터 공통)
# - 타깃: '합계_1일후_class'
# - 불사용 컬럼(not_use_col) 제외
# - 모델: RF, XGB, LGBM, CatBoost, GradientBoosting + ExtraTrees, HistGB, Logistic
# - 평가/시각화: accuracy, macro-F1, macro-recall, ROC-AUC(ovr), CM, ROC, SHAP, LIME
# - 저장: ./outputs/<center>/<timestamp>/ 이하 PNG/CSV/JSON/HTML 등
# - 변경점:
#   1) predictions.csv에 클래스별 확률(proba_<class>) + pred_conf 추가
#   2) 모델의 classes_ 순서와 우리가 쓰는 classes 정렬을 맞춰 확률 열 정렬 일치
#   3) 날짜/예측/실젯값 모두 1D로 reshape해 DataFrame 생성 오류 방지
# =========================================================

import os, json, time, warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

# 시각화
import matplotlib.pyplot as plt
from itertools import cycle

# 전처리/학습/평가
from sklearn.metrics import (
    accuracy_score, f1_score, recall_score, roc_auc_score,
    classification_report, confusion_matrix
)
from sklearn.preprocessing import label_binarize, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# 모델
from sklearn.ensemble import (
    RandomForestClassifier, GradientBoostingClassifier,
    ExtraTreesClassifier, HistGradientBoostingClassifier
)
from sklearn.linear_model import LogisticRegression

# 외부 라이브러리 (없으면 설치 필요)
try:
    import xgboost as xgb
except ModuleNotFoundError:
    xgb = None
try:
    import lightgbm as lgb
except ModuleNotFoundError:
    lgb = None
try:
    from catboost import CatBoostClassifier
except ModuleNotFoundError:
    CatBoostClassifier = None

# 해석 라이브러리
try:
    import shap
except ModuleNotFoundError:
    shap = None
try:
    from lime import lime_tabular
except ModuleNotFoundError:
    lime_tabular = None

# -----------------------------
# 유틸: 출력 디렉토리 확보
# -----------------------------
def ensure_dir(path: str):
    """Create directory *path* (and missing parents) if needed and return it.

    An already existing directory is not an error.
    """
    os.makedirs(name=path, exist_ok=True)
    return path

# -----------------------------
# 데이터 준비: 피처/타깃/분할
# -----------------------------
def prepare_xy(df: pd.DataFrame,
               target_col: str = "합계_1일후_class",
               not_use_col: list = None,
               date_col: str = "날짜",
               test_size: float = 0.2,
               random_state: int = 42):
    """Split one centre's frame into chronological train / test sets.

    Parameters
    ----------
    df : DataFrame
        Must contain date_col and target_col.
    target_col : str
        Name of the class-label column used as y.
    not_use_col : list or None
        Columns excluded from the features; names absent from df are ignored.
        target_col and date_col are always excluded.
    date_col : str
        Column used for sorting; also returned as the train / test dates.
    test_size : float
        Fraction of the latest rows that goes to the test set.
    random_state : int
        Accepted for interface compatibility; the split is purely
        chronological, so it is not used.

    Returns
    -------
    tuple
        (X_train, X_test, y_train, y_test, feature_names, dates_train,
        dates_test). Every feature column is coerced to numeric, with
        unparseable entries becoming NaN; y has dtype int64.

    Notes
    -----
    Rows whose target is missing are removed before splitting, because a
    missing label cannot be converted to int64.
    """
    excluded = set(not_use_col or ()) | {target_col, date_col}

    # Sort chronologically so the tail of the frame is the most recent data.
    work = df.sort_values(date_col).reset_index(drop=True).copy()

    # Unlabelled rows would make astype("int64") fail further down.
    has_label = work[target_col].notna()
    if not has_label.all():
        work = work.loc[has_label].reset_index(drop=True)

    dates = pd.to_datetime(work[date_col])

    # Features: drop the excluded columns, then force everything to numbers.
    drop_cols = [c for c in work.columns if c in excluded]
    X_raw = work.drop(columns=drop_cols, errors="ignore")
    for c in X_raw.columns:
        X_raw[c] = pd.to_numeric(X_raw[c], errors="coerce")

    y = work[target_col].astype("int64")

    # Chronological split. Rounding first removes float noise such as
    # 10 * (1 - 0.9) == 0.9999999999999998, which int() alone truncates to 0.
    n = len(work)
    split = int(round(n * (1 - test_size), 9))
    X_train, X_test = X_raw.iloc[:split].copy(), X_raw.iloc[split:].copy()
    y_train, y_test = y.iloc[:split].copy(), y.iloc[split:].copy()
    dates_train, dates_test = dates.iloc[:split].copy(), dates.iloc[split:].copy()

    feature_names = list(X_raw.columns)
    return X_train, X_test, y_train, y_test, feature_names, dates_train, dates_test

# -----------------------------
# 모델 구성 (n_classes 필요)
# -----------------------------
def build_models(n_classes: int, random_state: int = 42):
    """Return the candidate classifiers as a dict of name -> unfitted estimator.

    RandomForest, GradientBoosting, ExtraTrees, HistGradientBoosting and
    LogisticMultinomial are always present. XGBoost, LightGBM and CatBoost
    are added only when the matching library was importable.

    Parameters
    ----------
    n_classes : int
        Number of target classes, passed as ``num_class`` to XGBoost and
        LightGBM.
    random_state : int
        Seed given to every estimator.
    """
    models = {}

    # 1) RandomForest
    models["RandomForest"] = RandomForestClassifier(
        n_estimators=500, min_samples_leaf=2, random_state=random_state, n_jobs=-1
    )

    # 2) XGBoost
    if xgb is not None:
        models["XGBoost"] = xgb.XGBClassifier(
            n_estimators=600, max_depth=6, learning_rate=0.05,
            subsample=0.8, colsample_bytree=0.8,
            objective="multi:softprob", num_class=n_classes,
            tree_method="hist", random_state=random_state, n_jobs=-1
        )

    # 3) LightGBM
    if lgb is not None:
        models["LightGBM"] = lgb.LGBMClassifier(
            n_estimators=700, learning_rate=0.05,
            subsample=0.8, colsample_bytree=0.8,
            objective="multiclass", num_class=n_classes,
            random_state=random_state, n_jobs=-1
        )

    # 4) CatBoost
    if CatBoostClassifier is not None:
        models["CatBoost"] = CatBoostClassifier(
            iterations=700, learning_rate=0.05, depth=6,
            loss_function="MultiClass", random_state=random_state,
            verbose=False
        )

    # 5) GradientBoosting
    models["GradientBoosting"] = GradientBoostingClassifier(random_state=random_state)

    # Extra) ExtraTrees
    models["ExtraTrees"] = ExtraTreesClassifier(
        n_estimators=600, min_samples_leaf=2, random_state=random_state, n_jobs=-1
    )

    # Extra) HistGradientBoosting
    models["HistGradientBoosting"] = HistGradientBoostingClassifier(
        random_state=random_state
    )

    # Extra) Multinomial logistic regression.
    # `multi_class` is intentionally not passed: scikit-learn deprecated the
    # parameter in 1.5, and with the saga solver a target with more than two
    # classes is fitted as a multinomial model by default.
    models["LogisticMultinomial"] = LogisticRegression(
        solver="saga", max_iter=2000, C=2.0, random_state=random_state, n_jobs=-1
    )

    return models

# -----------------------------
# 전처리 파이프라인 선택
#  - 트리 계열: Imputer만
#  - 로지스틱: Imputer + StandardScaler
# -----------------------------
def make_pipeline(model, model_name: str):
    """Wrap *model* in a two-step pipeline: ("pre", preprocessing) -> ("model", model).

    Preprocessing is median imputation for every model. Only the model named
    "LogisticMultinomial" additionally gets a StandardScaler after the imputer.
    """
    pre_steps = [("imputer", SimpleImputer(strategy="median"))]
    if model_name == "LogisticMultinomial":
        pre_steps.append(("scaler", StandardScaler()))

    return Pipeline(steps=[
        ("pre", Pipeline(steps=pre_steps)),
        ("model", model),
    ])

# -----------------------------
# 평가/그림 저장
# -----------------------------
def plot_and_save_confusion_matrix(cm, classes, out_png):
    """Draw the confusion matrix *cm* as an annotated heat map and save it.

    cm is a 2-D array of integer counts with true labels on the rows and
    predicted labels on the columns; classes supplies the tick labels for
    both axes; out_png is the output path (saved at 150 dpi).
    """
    ticks = np.arange(len(classes))

    fig, ax = plt.subplots(figsize=(6, 5))
    image = ax.imshow(cm, interpolation='nearest')
    ax.figure.colorbar(image, ax=ax)
    ax.set(xticks=ticks, yticks=ticks,
           xticklabels=classes, yticklabels=classes,
           ylabel='True label', xlabel='Predicted label',
           title='Confusion Matrix')

    # Write each count into its cell; white text on cells above half the
    # maximum count, black otherwise.
    half_max = cm.max() / 2.
    for (row, col), count in np.ndenumerate(cm):
        text_color = "white" if count > half_max else "black"
        ax.text(col, row, format(count, 'd'),
                ha="center", va="center", color=text_color)

    fig.tight_layout()
    fig.savefig(out_png, dpi=150)
    plt.close(fig)

def plot_and_save_multiclass_roc(y_true, y_score, classes, out_png):
    """Plot one-vs-rest ROC curves for a multiclass problem and save them.

    Parameters
    ----------
    y_true : array-like of shape (n,)
        True integer labels.
    y_score : array-like of shape (n, K)
        Class probabilities; column i must correspond to classes[i].
    classes : sequence
        The K class labels, in the column order of y_score.
    out_png : str
        Output path of the figure (saved at 150 dpi).

    Notes
    -----
    A per-class curve needs both positive and negative samples. Classes that
    do not satisfy this in y_true (for example a class missing from the test
    period) are left out of the per-class curves and of the macro average;
    including them would turn the macro curve and its AUC into NaN.
    """
    from sklearn.metrics import roc_curve, auc

    y_bin = label_binarize(y_true, classes=classes)
    y_score = np.asarray(y_score)
    n_classes = len(classes)

    # Per-class ROC, only where the curve is defined.
    fpr, tpr, roc_auc = {}, {}, {}
    plotted = []
    for i in range(n_classes):
        if np.unique(y_bin[:, i]).size < 2:
            continue
        fpr[i], tpr[i], _ = roc_curve(y_bin[:, i], y_score[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])
        plotted.append(i)

    # Micro average: pool every (sample, class) decision.
    fpr["micro"], tpr["micro"], _ = roc_curve(y_bin.ravel(), y_score.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

    fig, ax = plt.subplots(figsize=(7, 6))
    ax.plot(fpr["micro"], tpr["micro"],
            label='micro-average ROC (AUC = {0:0.3f})'.format(roc_auc["micro"]))

    # Macro average: mean of the per-class curves on a common FPR grid.
    if plotted:
        all_fpr = np.unique(np.concatenate([fpr[i] for i in plotted]))
        mean_tpr = np.zeros_like(all_fpr)
        for i in plotted:
            mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])
        mean_tpr /= len(plotted)
        fpr["macro"] = all_fpr
        tpr["macro"] = mean_tpr
        roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])
        ax.plot(fpr["macro"], tpr["macro"],
                label='macro-average ROC (AUC = {0:0.3f})'.format(roc_auc["macro"]))

    # Line colours come from matplotlib's default property cycle.
    for i in plotted:
        ax.plot(fpr[i], tpr[i], label='class {0} (AUC = {1:0.3f})'
                                   ''.format(classes[i], roc_auc[i]))

    ax.plot([0, 1], [0, 1], 'k--', lw=1)
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('Multiclass ROC (OvR)')
    ax.legend(loc="lower right", fontsize=8)
    fig.tight_layout()
    fig.savefig(out_png, dpi=150)
    plt.close(fig)

# -----------------------------
# SHAP 저장 (트리계열만)
# -----------------------------
def save_shap_summary(pipeline, X_train, X_test, feature_names, out_png, max_samples=500):
    """Save a SHAP summary plot of the fitted tree model in *pipeline*.

    Does nothing when shap is not installed or when the pipeline's "model"
    step is not one of the supported tree-based classifiers.

    Parameters
    ----------
    pipeline : fitted Pipeline with the steps "pre" and "model".
    X_train, max_samples :
        Kept for interface compatibility; not used. SHAP values are computed
        without a background sample, so the training data is not needed.
    X_test : DataFrame
        Rows to explain; they go through the "pre" step first.
    feature_names : list of str
        Names shown on the plot.
    out_png : str
        Output path (saved at 150 dpi).
    """
    if shap is None:
        return  # shap is not installed

    # Only tree-based models are explained; optional libraries are checked
    # only when they were importable.
    model = pipeline.named_steps["model"]
    tree_types = [RandomForestClassifier, ExtraTreesClassifier,
                  GradientBoostingClassifier, HistGradientBoostingClassifier]
    if xgb is not None:
        tree_types.append(xgb.XGBClassifier)
    if lgb is not None:
        tree_types.append(lgb.LGBMClassifier)
    if CatBoostClassifier is not None:
        tree_types.append(CatBoostClassifier)
    if not isinstance(model, tuple(tree_types)):
        return

    # The model was trained on the output of the "pre" step, so explain that.
    X_te = pipeline.named_steps["pre"].transform(X_test)

    # Try the path-dependent explainer first; fall back to the default
    # TreeExplainer settings if that raises.
    try:
        explainer = shap.TreeExplainer(model, feature_perturbation="tree_path_dependent")
        shap_vals = explainer.shap_values(X_te, check_additivity=False)
    except Exception:
        explainer = shap.TreeExplainer(model)
        shap_vals = explainer.shap_values(X_te)

    # Summary plot
    plt.figure()
    shap.summary_plot(shap_vals, X_te, feature_names=feature_names, show=False)
    plt.tight_layout()
    plt.savefig(out_png, dpi=150)
    plt.close()

# -----------------------------
# LIME 저장 (샘플 몇 개)
# -----------------------------
def save_lime_examples(pipeline, X_train, X_test, y_test, feature_names, class_names, out_dir, n_examples=3):
    """Write LIME explanations for the first few test rows as HTML files.

    Does nothing when lime is not installed. The explainer is built from the
    raw values of X_train; up to n_examples leading rows of X_test are
    explained and saved as out_dir/lime_example_<k>.html (k starts at 1).
    y_test is accepted but not used.
    """
    if lime_tabular is None:
        return  # lime is not installed

    def predict_proba_from_array(rows):
        # LIME passes plain arrays; the pipeline is called with a DataFrame
        # carrying the feature names.
        return pipeline.predict_proba(pd.DataFrame(rows, columns=feature_names))

    explainer = lime_tabular.LimeTabularExplainer(
        training_data=X_train.values,
        feature_names=feature_names,
        class_names=[str(c) for c in class_names],
        discretize_continuous=True,
        mode='classification'
    )

    shown_features = min(10, len(feature_names))
    for i in range(min(n_examples, len(X_test))):
        explanation = explainer.explain_instance(
            data_row=X_test.iloc[i].values,
            predict_fn=predict_proba_from_array,
            num_features=shown_features
        )
        explanation.save_to_file(os.path.join(out_dir, f"lime_example_{i+1}.html"))

# -----------------------------
# 한 모델 학습/평가/저장 루틴
# -----------------------------
def fit_evaluate_save(center_name: str,
                      df: pd.DataFrame,
                      target_col: str,
                      not_use_col: list,
                      save_root: str = "./outputs",
                      test_size: float = 0.2,
                      random_state: int = 42):
    """Train, evaluate and persist every candidate model for one center.

    Creates ``<save_root>/<center_name>/<timestamp>/`` and, for each model
    returned by ``build_models``, a sub-directory holding ``predictions.csv``,
    ``classification_report.json``, ``metrics.json``, a confusion-matrix PNG
    and (when probabilities are available) a ROC PNG. SHAP and LIME outputs
    are attempted and skipped with a printed message on failure.

    Args:
        center_name: Label used in log lines and in the output path.
        df: Input frame handed to ``prepare_xy``.
        target_col: Name of the class-label column.
        not_use_col: Columns to exclude from the features.
        save_root: Root directory for all outputs.
        test_size: Forwarded to ``prepare_xy``.
        random_state: Forwarded to ``prepare_xy`` and ``build_models``.

    Returns:
        ``(out_dir, summary_df)`` where ``summary_df`` has one row per model,
        sorted by ``macro_f1`` then ``accuracy`` in descending order. The same
        frame is written to ``_center_summary.csv`` inside ``out_dir``.
    """
    ts = time.strftime("%Y%m%d_%H%M%S")
    out_dir = ensure_dir(os.path.join(save_root, center_name, ts))

    # Prepare the data (features, targets and the matching dates)
    X_train, X_test, y_train, y_test, feature_names, dates_train, dates_test = prepare_xy(
        df, target_col=target_col, not_use_col=not_use_col,
        test_size=test_size, random_state=random_state
    )
    # Reference label order: the sorted unique labels seen in training
    classes = np.sort(np.unique(y_train))
    n_classes = len(classes)

    models = build_models(n_classes=n_classes, random_state=random_state)
    summary_rows = []

    for name, model in models.items():
        print(f"[{center_name}] Training {name} ...")
        pipe = make_pipeline(model, name)
        pipe.fit(X_train, y_train)

        # ====== Predictions / probabilities, with column order aligned ======
        y_pred = pipe.predict(X_test)
        y_proba = pipe.predict_proba(X_test) if hasattr(pipe, "predict_proba") else None

        model_classes = None
        if y_proba is not None and hasattr(pipe.named_steps["model"], "classes_"):
            model_classes = np.asarray(pipe.named_steps["model"].classes_)

        y_proba_aligned = None
        if y_proba is not None:
            y_proba_arr = np.asarray(y_proba)
            if y_proba_arr.ndim == 1:
                y_proba_arr = y_proba_arr.reshape(-1, 1)
            if model_classes is not None:
                # For each label in sorted `classes`, look up its column position
                # in the model's own `classes_` and reorder the columns to match
                idx = [int(np.where(model_classes == c)[0][0]) for c in classes]
                y_proba_aligned = y_proba_arr[:, idx]
            else:
                y_proba_aligned = y_proba_arr

        # ====== Save predictions CSV (date, prediction, truth + probabilities) ======
        to1d = lambda a: np.asarray(a).reshape(-1)

        dates_col = pd.to_datetime(dates_test).dt.strftime("%Y-%m-%d")
        y_true_1d = to1d(y_test.values).astype(int)
        y_pred_1d = to1d(y_pred).astype(int)

        pred_df = pd.DataFrame({
            "날짜": to1d(dates_col),
            "y_true": y_true_1d,
            "y_pred": y_pred_1d,
        })

        if y_proba_aligned is not None:
            # One proba_<class> column per label, plus the top probability as pred_conf
            proba_cols = {f"proba_{int(c)}": y_proba_aligned[:, i] for i, c in enumerate(classes)}
            proba_df = pd.DataFrame(proba_cols)
            pred_df = pd.concat([pred_df, proba_df], axis=1)
            pred_df["pred_conf"] = y_proba_aligned.max(axis=1)

        model_dir = ensure_dir(os.path.join(out_dir, name))
        pred_df.to_csv(os.path.join(model_dir, "predictions.csv"), index=False, encoding="utf-8-sig")

        # ====== Compute and save scores ======
        acc = accuracy_score(y_test, y_pred)
        f1m = f1_score(y_test, y_pred, average="macro")
        rec = recall_score(y_test, y_pred, average="macro")

        # ROC-AUC stays NaN when it cannot be computed (any exception is swallowed)
        roc_auc = np.nan
        if y_proba_aligned is not None:
            try:
                roc_auc = roc_auc_score(y_test, y_proba_aligned, multi_class="ovr", average="macro")
            except Exception:
                roc_auc = np.nan

        report_dict = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
        cm = confusion_matrix(y_test, y_pred, labels=classes)

        with open(os.path.join(model_dir, "classification_report.json"), "w", encoding="utf-8") as f:
            json.dump(report_dict, f, ensure_ascii=False, indent=2)
        meta = {"accuracy": acc, "macro_f1": f1m, "macro_recall": rec, "roc_auc_ovr_macro": roc_auc,
                "n_train": int(len(X_train)), "n_test": int(len(X_test)),
                "n_features": int(len(feature_names)), "classes": list(map(int, classes))}
        with open(os.path.join(model_dir, "metrics.json"), "w", encoding="utf-8") as f:
            json.dump(meta, f, ensure_ascii=False, indent=2)

        # Save figures
        plot_and_save_confusion_matrix(cm, classes, os.path.join(model_dir, "confusion_matrix.png"))
        if y_proba_aligned is not None:
            plot_and_save_multiclass_roc(y_test, y_proba_aligned, classes, os.path.join(model_dir, "roc_curve.png"))

        # (Optional) SHAP / LIME: failures are reported and do not stop the run
        try:
            save_shap_summary(pipe, X_train, X_test, feature_names,
                              os.path.join(model_dir, "shap_summary.png"), max_samples=500)
        except Exception as e:
            print(f"  (SHAP skip: {e})")
        try:
            save_lime_examples(pipe, X_train, X_test, y_test, feature_names,
                               classes, model_dir, n_examples=3)
        except Exception as e:
            print(f"  (LIME skip: {e})")

        summary_rows.append({
            "center": center_name, "model": name,
            "accuracy": acc, "macro_f1": f1m, "macro_recall": rec, "roc_auc_ovr_macro": roc_auc
        })

    # Best model first: rank by macro-F1, break ties with accuracy
    summary_df = pd.DataFrame(summary_rows).sort_values(["macro_f1", "accuracy"], ascending=False)
    summary_df.to_csv(os.path.join(out_dir, "_center_summary.csv"), index=False, encoding="utf-8-sig")
    print(f"[{center_name}] 결과 저장 완료 → {out_dir}")
    return out_dir, summary_df

# =========================================================
# 실행부
# - 각 센터 DataFrame: nanji, jungnang, tancheon, seonam
# - 타깃: '합계_1일후_class'
# - 불사용 컬럼: 사용자 지정 + 데이터에 있는 것만 드롭
# =========================================================

# Columns that must not be used as model features (only those present in a
# given frame are actually dropped).
not_use_col = (
    ['1처리장', '2처리장', '정화조', '중계펌프장', '합계', '시설현대화']
    + ['3처리장', '4처리장', '합계_1일후', '합계_2일후']
    + ['합계_class', '합계_1일후_class', '합계_2일후_class']
)

# Center name -> prepared DataFrame, in processing order.
centers = dict(zip(
    ("nanji", "jungnang", "tancheon", "seonam"),
    (nanji, jungnang, tancheon, seonam),
))

all_summaries = []
save_roots = {}

for name, df in centers.items():
    out_dir, summary_df = fit_evaluate_save(
        name, df, "합계_1일후_class", not_use_col,
        save_root="./outputs", test_size=0.2, random_state=42,
    )
    all_summaries.append(summary_df)
    save_roots[name] = out_dir

# Merge the per-center summaries and save them in one place
all_summary_df = pd.concat(all_summaries, axis=0).reset_index(drop=True)
ensure_dir("./outputs/_all_centers")
all_summary_df.to_csv("./outputs/_all_centers/_summary_all_centers.csv", index=False, encoding="utf-8-sig")

print("\n=== 저장 경로 요약 ===")
for center_key, saved_path in save_roots.items():
    print(f"{center_key}: {saved_path}")

print("\n상위 성능 모델 Top-10")
top10 = all_summary_df.sort_values(["macro_f1", "accuracy"], ascending=False).head(10)
try:
    display(top10)
except NameError:
    # `display` only exists inside a notebook; fall back to plain printing
    print(top10)


[nanji] Training RandomForest ...
[nanji] Training XGBoost ...
[nanji] Training LightGBM ...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000650 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5814
[LightGBM] [Info] Number of data points in the train set: 2455, number of used features: 33
[LightGBM] [Info] Start training from score -2.098772
[LightGBM] [Info] Start training from score -2.085570
[LightGBM] [Info] Start training from score -2.139455
[LightGBM] [Info] Start training from score -2.245200
[LightGBM] [Info] Start training from score -2.458775
[LightGBM] [Info] Start training from score -2.425985
[LightGBM] [Info] Start training from score -2.601875
[LightGBM] [Info] Start training from score -2.454024
[LightGBM] [Info] Start training from score -2.380932
[LightGBM] [Info] Start training from score -2.272493
[nanji] Trainin

: 

In [12]:
# =========================================================
# 안정화 버전: 다중분류 파이프라인 (4개 센터 공통)
#  - 스레드 제한, SHAP/LIME 기본 OFF, SHAP 샘플링, 예외 내성
#  - 평가지표 CSV 저장 + predictions.csv에 전체 클래스 확률
# =========================================================

# ---- 꼭 맨 위(NumPy/Pandas import 이전)에 두세요 ----
import os
os.environ.setdefault("OMP_NUM_THREADS", "4")
os.environ.setdefault("OPENBLAS_NUM_THREADS", "4")
os.environ.setdefault("MKL_NUM_THREADS", "4")
os.environ.setdefault("NUMEXPR_NUM_THREADS", "4")
os.environ.setdefault("OMP_WAIT_POLICY", "PASSIVE")
# -------------------------------------------------------

import json, time, warnings, gc
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib
matplotlib.use("Agg")  # 노트북 백엔드 이슈 회피
import matplotlib.pyplot as plt
from itertools import cycle
from tqdm.auto import tqdm

from sklearn.metrics import (
    accuracy_score, f1_score, recall_score, roc_auc_score,
    classification_report, confusion_matrix
)
from sklearn.preprocessing import label_binarize, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

from sklearn.ensemble import (
    RandomForestClassifier, GradientBoostingClassifier,
    ExtraTreesClassifier, HistGradientBoostingClassifier
)
from sklearn.linear_model import LogisticRegression

# 외부 라이브러리 (없으면 자동 스킵)
try:
    import xgboost as xgb
except ModuleNotFoundError:
    xgb = None
try:
    import lightgbm as lgb
except ModuleNotFoundError:
    lgb = None
try:
    from catboost import CatBoostClassifier
except ModuleNotFoundError:
    CatBoostClassifier = None

# 해석 라이브러리
try:
    import shap
except ModuleNotFoundError:
    shap = None
try:
    from lime import lime_tabular
except ModuleNotFoundError:
    lime_tabular = None

# -----------------------------
# Toggles (switch to True when needed)
# -----------------------------
# Each optional booster is enabled only if its import above succeeded.
USE_XGB  = xgb is not None
USE_LGB  = lgb is not None
USE_CAT  = CatBoostClassifier is not None
USE_SHAP = False          # off by default to keep the kernel stable
USE_LIME = False          # off by default to keep the kernel stable
SHAP_MAX_BG   = 200       # number of background samples
SHAP_MAX_TEST = 200       # number of test samples (for the summary plot)
RANDOM_STATE  = 42

def ensure_dir(path: str):
    """Create directory ``path`` (with parents) if it is missing; return ``path``."""
    if not os.path.isdir(path):
        os.makedirs(path, exist_ok=True)
    return path

def classification_report_to_df(report_dict: dict) -> pd.DataFrame:
    """Turn ``classification_report(output_dict=True)`` output into a table.

    Each key of ``report_dict`` becomes one row, with the key stored in a
    ``label`` column. Per-class rows keep their original order and are
    followed by ``macro avg``, ``weighted avg`` and ``accuracy`` (whichever
    are present). A ``support`` column filled with NaN is added when missing.
    """
    tail = ["macro avg", "weighted avg", "accuracy"]

    table = pd.DataFrame(report_dict).transpose()
    if "support" not in table.columns:
        table["support"] = np.nan

    # Stable sort: class rows (rank -1) stay in place, aggregates go last.
    ranked = sorted(table.index, key=lambda lbl: tail.index(lbl) if lbl in tail else -1)
    table = table.loc[ranked]

    table = table.reset_index()
    return table.rename(columns={"index": "label"})

def prepare_xy(df: pd.DataFrame,
               target_col: str = "합계_1일후_class",
               not_use_col: list = None,
               date_col: str = "날짜",
               test_size: float = 0.2,
               random_state: int = RANDOM_STATE):
    """Build a chronological train/test split of numeric features and labels.

    Rows are ordered by the *parsed* date, every remaining column is coerced
    to numeric (non-numeric values become NaN), and the earliest
    ``1 - test_size`` share of rows forms the training split. Nothing is
    shuffled.

    Columns that hold no numeric value at all in the training split are
    removed and reported. A median ``SimpleImputer`` (as built by
    ``make_pipeline``) discards such columns anyway, which would leave
    ``feature_names`` longer than the matrix the model is trained on.

    Args:
        df: Source frame containing ``date_col``, ``target_col`` and features.
        target_col: Column with the class label; cast to ``int64``.
        not_use_col: Column names to exclude from the features. Names not
            present in ``df`` are ignored.
        date_col: Column with dates, parsed via ``pd.to_datetime``.
        test_size: Fraction of the most recent rows held out for testing.
        random_state: Unused (the split is purely time-ordered); kept so
            existing callers keep working.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test, feature_names,
        dates_train, dates_test)``.
    """
    if not_use_col is None:
        not_use_col = []

    # Parse first, then sort: ordering the raw column would be lexicographic
    # whenever the dates are stored as strings.
    work = df.copy()
    work[date_col] = pd.to_datetime(work[date_col])
    work = work.sort_values(date_col, kind="stable").reset_index(drop=True)
    dates = work[date_col]

    excluded = set(not_use_col) | {target_col, date_col}
    X_raw = work.drop(columns=[c for c in work.columns if c in excluded])
    for c in X_raw.columns:
        X_raw[c] = pd.to_numeric(X_raw[c], errors="coerce")

    y = work[target_col].astype("int64")

    n = len(work)
    split = int(n * (1 - test_size))

    # Keep feature_names aligned with what the imputer will actually emit.
    if split > 0:
        all_missing = X_raw.iloc[:split].isna().all(axis=0)
        unusable = list(all_missing[all_missing].index)
        if unusable:
            print(f"[prepare_xy] dropping columns with no numeric values "
                  f"in the training split: {unusable}")
            X_raw = X_raw.drop(columns=unusable)

    X_train, X_test = X_raw.iloc[:split].copy(), X_raw.iloc[split:].copy()
    y_train, y_test = y.iloc[:split].copy(), y.iloc[split:].copy()
    dates_train, dates_test = dates.iloc[:split].copy(), dates.iloc[split:].copy()

    feature_names = list(X_raw.columns)
    return X_train, X_test, y_train, y_test, feature_names, dates_train, dates_test

def build_models(n_classes: int, random_state: int = RANDOM_STATE):
    """Return the candidate classifiers as an ordered ``{name: estimator}`` dict.

    XGBoost, LightGBM and CatBoost are included only when their ``USE_*``
    toggle is true. ``n_classes`` is passed to XGBoost and LightGBM as
    ``num_class``; ``random_state`` seeds every estimator.
    """
    seed = random_state
    row_col_sampling = {"subsample": 0.8, "colsample_bytree": 0.8}

    # (name, enabled, factory). Factories are lazy so that a disabled optional
    # library is never touched.
    candidates = [
        ("RandomForest", True, lambda: RandomForestClassifier(
            n_estimators=400, min_samples_leaf=2, random_state=seed, n_jobs=-1)),
        ("XGBoost", USE_XGB, lambda: xgb.XGBClassifier(
            n_estimators=500, max_depth=6, learning_rate=0.05,
            objective="multi:softprob", num_class=n_classes,
            tree_method="hist", random_state=seed, n_jobs=-1,
            verbosity=0, **row_col_sampling)),
        ("LightGBM", USE_LGB, lambda: lgb.LGBMClassifier(
            n_estimators=600, learning_rate=0.05,
            objective="multiclass", num_class=n_classes,
            random_state=seed, n_jobs=-1,
            verbosity=-1, **row_col_sampling)),
        ("CatBoost", USE_CAT, lambda: CatBoostClassifier(
            iterations=600, learning_rate=0.05, depth=6,
            loss_function="MultiClass", random_state=seed,
            verbose=False)),
        ("GradientBoosting", True, lambda: GradientBoostingClassifier(random_state=seed)),
        ("ExtraTrees", True, lambda: ExtraTreesClassifier(
            n_estimators=500, min_samples_leaf=2, random_state=seed, n_jobs=-1)),
        ("HistGradientBoosting", True, lambda: HistGradientBoostingClassifier(random_state=seed)),
        ("LogisticMultinomial", True, lambda: LogisticRegression(
            multi_class="multinomial", solver="saga", max_iter=1500, C=2.0,
            random_state=seed, n_jobs=-1)),
    ]
    return {label: factory() for label, enabled, factory in candidates if enabled}

def make_pipeline(model, model_name: str):
    """Wrap ``model`` in a two-step pipeline: ``"pre"`` then ``"model"``.

    Preprocessing always median-imputes missing values; standard scaling is
    appended only for the ``"LogisticMultinomial"`` model.
    """
    pre_steps = [("imputer", SimpleImputer(strategy="median"))]
    if model_name == "LogisticMultinomial":
        pre_steps.append(("scaler", StandardScaler()))

    preprocessing = Pipeline(steps=pre_steps)
    return Pipeline(steps=[("pre", preprocessing), ("model", model)])

def plot_and_save_confusion_matrix(cm, classes, out_png):
    """Render a confusion matrix as an annotated heat map and save it as PNG.

    Args:
        cm: 2-D confusion matrix (rows = true labels, columns = predictions).
            Integer matrices are annotated as counts; any other dtype (e.g. a
            normalised matrix) is annotated with two decimals.
        classes: Tick labels, one per row/column of ``cm``.
        out_png: Destination file path.
    """
    cm = np.asarray(cm)
    # 'd' raises on floats, so pick the cell format from the dtype.
    cell_fmt = 'd' if np.issubdtype(cm.dtype, np.integer) else '.2f'

    fig, ax = plt.subplots(figsize=(6, 5))
    try:
        im = ax.imshow(cm, interpolation='nearest')
        ax.figure.colorbar(im, ax=ax)
        ax.set(xticks=np.arange(len(classes)), yticks=np.arange(len(classes)),
               xticklabels=classes, yticklabels=classes,
               ylabel='True label', xlabel='Predicted label',
               title='Confusion Matrix')
        # Cells above half the maximum get white text for contrast.
        thresh = cm.max() / 2.
        for (i, j), value in np.ndenumerate(cm):
            ax.text(j, i, format(value, cell_fmt),
                    ha="center", va="center",
                    color="white" if value > thresh else "black")
        fig.tight_layout()
        fig.savefig(out_png, dpi=150)
    finally:
        # Always release the figure, even when drawing or saving fails.
        plt.close(fig)

def plot_and_save_multiclass_roc(y_true, y_score, classes, out_png):
    """Plot one-vs-rest ROC curves (per class, micro and macro) and save a PNG.

    Args:
        y_true: True labels.
        y_score: Array of shape ``(n_samples, len(classes))`` with one score
            column per entry of ``classes``, in the same order.
        classes: Class labels defining the column order of ``y_score``.
        out_png: Destination file path.
    """
    from sklearn.metrics import roc_curve, auc

    y_score = np.asarray(y_score)
    n_classes = len(classes)
    y_bin = label_binarize(y_true, classes=classes)
    if n_classes == 2 and y_bin.shape[1] == 1:
        # label_binarize collapses two classes into a single column;
        # expand it so that column i lines up with classes[i].
        y_bin = np.hstack([1 - y_bin, y_bin])

    fpr, tpr, roc_auc = {}, {}, {}
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_bin[:, i], y_score[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # Micro average: pool every (sample, class) decision into one curve.
    fpr["micro"], tpr["micro"], _ = roc_curve(y_bin.ravel(), y_score.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

    # Macro average: interpolate each class curve on a shared FPR grid and average.
    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
    mean_tpr = np.zeros_like(all_fpr)
    for i in range(n_classes):
        mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])
    mean_tpr /= n_classes
    roc_auc["macro"] = auc(all_fpr, mean_tpr)

    fig, ax = plt.subplots(figsize=(7, 6))
    try:
        ax.plot(fpr["micro"], tpr["micro"],
                label=f'micro-average ROC (AUC={roc_auc["micro"]:.3f})')
        ax.plot(all_fpr, mean_tpr,
                label=f'macro-average ROC (AUC={roc_auc["macro"]:.3f})')
        for i in range(n_classes):
            ax.plot(fpr[i], tpr[i], label=f'class {classes[i]} (AUC={roc_auc[i]:.3f})')
        ax.plot([0, 1], [0, 1], 'k--', lw=1)
        ax.set_xlim([0.0, 1.0])
        ax.set_ylim([0.0, 1.05])
        ax.set_xlabel('FPR')
        ax.set_ylabel('TPR')
        ax.set_title('Multiclass ROC (OvR)')
        ax.legend(loc="lower right", fontsize=8)
        fig.tight_layout()
        fig.savefig(out_png, dpi=150)
    finally:
        # Always release the figure, even when drawing or saving fails.
        plt.close(fig)

def save_shap_summary(pipeline, X_train, X_test, feature_names, out_png,
                      max_bg=SHAP_MAX_BG, max_test=SHAP_MAX_TEST):
    """Save a SHAP summary plot for tree-based models; no-op otherwise.

    Returns immediately when ``USE_SHAP`` is false, when ``shap`` is not
    installed, or when the pipeline's ``"model"`` step is not one of the
    supported tree ensembles.

    Args:
        pipeline: Fitted pipeline with ``"pre"`` and ``"model"`` steps.
        X_train: Training features; only its row count is used.
        X_test: Test features; a random subset is explained.
        feature_names: Names shown on the plot.
        out_png: Destination file path.
        max_bg: Size of the background draw. No background data is passed to
            the explainer; the draw is kept only so the sampled test rows
            stay the same as before.
        max_test: Maximum number of test rows to explain.
    """
    if not USE_SHAP or shap is None:
        return

    model = pipeline.named_steps["model"]
    tree_types = [RandomForestClassifier, ExtraTreesClassifier,
                  GradientBoostingClassifier, HistGradientBoostingClassifier]
    if xgb is not None:
        tree_types.append(xgb.XGBClassifier)
    if lgb is not None:
        tree_types.append(lgb.LGBMClassifier)
    if CatBoostClassifier is not None:
        tree_types.append(CatBoostClassifier)
    if not isinstance(model, tuple(tree_types)):
        return

    X_te = pipeline.named_steps["pre"].transform(X_test)

    # Sampling keeps the SHAP cost bounded.
    rng = np.random.RandomState(RANDOM_STATE)
    n_train = X_train.shape[0]
    # Background draw is discarded but still advances the RNG; see max_bg above.
    rng.choice(n_train, size=min(max_bg, n_train), replace=False)
    te_idx = rng.choice(X_te.shape[0], size=min(max_test, X_te.shape[0]), replace=False)
    X_te_small = X_te[te_idx]

    try:
        explainer = shap.TreeExplainer(model, feature_perturbation="tree_path_dependent")
        shap_vals = explainer.shap_values(X_te_small, check_additivity=False)
    except Exception:
        # Retry with the library defaults when the explicit options are rejected.
        explainer = shap.TreeExplainer(model)
        shap_vals = explainer.shap_values(X_te_small)

    plt.figure()
    try:
        shap.summary_plot(shap_vals, X_te_small, feature_names=feature_names, show=False)
        plt.tight_layout()
        plt.savefig(out_png, dpi=150)
    finally:
        # Release the current figure even when plotting or saving fails.
        plt.close()

def save_lime_examples(pipeline, X_train, X_test, y_test, feature_names, class_names, out_dir, n_examples=3):
    """Write LIME explanations of the leading test rows to HTML files.

    Skipped entirely unless ``USE_LIME`` is true and ``lime`` was importable.
    Output files are named ``lime_example_<k>.html`` (``k`` starting at 1)
    inside ``out_dir``. ``y_test`` is accepted for signature compatibility
    and is not used.
    """
    if lime_tabular is None or not USE_LIME:
        return

    top_features = min(10, len(feature_names))

    def proba_for(batch):
        # LIME supplies a raw array; the pipeline is fed a named DataFrame.
        named = pd.DataFrame(batch, columns=feature_names)
        return pipeline.predict_proba(named)

    explainer = lime_tabular.LimeTabularExplainer(
        training_data=X_train.values,
        feature_names=feature_names,
        class_names=list(map(str, class_names)),
        discretize_continuous=True,
        mode='classification',
    )

    for pos in range(min(n_examples, len(X_test))):
        explanation = explainer.explain_instance(
            data_row=X_test.iloc[pos].values,
            predict_fn=proba_for,
            num_features=top_features,
        )
        explanation.save_to_file(os.path.join(out_dir, f"lime_example_{pos + 1}.html"))

def fit_evaluate_save(center_name: str,
                      df: pd.DataFrame,
                      target_col: str,
                      not_use_col: list,
                      save_root: str = "./outputs",
                      test_size: float = 0.2,
                      random_state: int = RANDOM_STATE):
    """Train, evaluate and persist every candidate model for one center.

    Outputs go to ``<save_root>/<center_name>/<timestamp>/<model>/``:
    ``predictions.csv`` (date, truth, prediction, confidence and one
    ``proba_<class>`` column per class), the classification report as CSV and
    JSON, a confusion-matrix PNG, a ROC PNG, feature importances (when the
    estimator exposes them) and optional SHAP/LIME artifacts.

    A failing model does not stop the run: its traceback is written to
    ``_error.txt`` in the model directory and the next model is tried.
    Failures of optional artifacts are reported through ``tqdm.write``.

    Args:
        center_name: Label used in log lines and in the output path.
        df: Input frame handed to ``prepare_xy``.
        target_col: Name of the class-label column.
        not_use_col: Columns to exclude from the features.
        save_root: Root directory for all outputs.
        test_size: Forwarded to ``prepare_xy``.
        random_state: Forwarded to ``prepare_xy`` and ``build_models``.

    Returns:
        ``(out_dir, center_df)``. ``center_df`` has one row per successfully
        evaluated model, sorted by ``macro_f1`` then ``accuracy`` (descending);
        it is empty, with the usual columns, if every model failed. The frame
        is also written to ``_center_summary.csv``.
    """
    import traceback

    ts = time.strftime("%Y%m%d_%H%M%S")
    out_dir = ensure_dir(os.path.join(save_root, center_name, ts))

    X_train, X_test, y_train, y_test, feature_names, _dates_train, dates_test = prepare_xy(
        df, target_col=target_col, not_use_col=not_use_col,
        test_size=test_size, random_state=random_state
    )
    classes = np.sort(np.unique(y_train))
    n_classes = len(classes)

    models = build_models(n_classes=n_classes, random_state=random_state)
    summary_cols = ["center", "model", "accuracy", "macro_f1", "macro_recall", "roc_auc_ovr_macro"]
    summary_rows = []

    for name, model in tqdm(list(models.items()), desc=f"{center_name} models", leave=False):
        model_dir = ensure_dir(os.path.join(out_dir, name))
        t0 = time.perf_counter()
        try:
            pipe = make_pipeline(model, name)
            pipe.fit(X_train, y_train)

            y_pred = pipe.predict(X_test)
            y_proba = pipe.predict_proba(X_test) if hasattr(pipe, "predict_proba") else None

            # predictions.csv (date, truth/prediction, confidence, full probabilities)
            pred_df = pd.DataFrame({
                "날짜": pd.to_datetime(dates_test).dt.strftime("%Y-%m-%d").values,
                "y_true": np.asarray(y_test.values).reshape(-1).astype(int),
                "y_pred": np.asarray(y_pred).reshape(-1).astype(int),
            })
            if y_proba is not None:
                proba = np.asarray(y_proba)
                if proba.ndim == 2 and proba.shape[0] == len(pred_df):
                    pred_df["pred_conf"] = proba.max(axis=1)
                    # Column j of predict_proba belongs to the model's classes_[j].
                    model_classes = getattr(pipe.named_steps["model"], "classes_", classes)
                    for j, cls in enumerate(model_classes):
                        pred_df[f"proba_{cls}"] = proba[:, j]
            pred_df.to_csv(os.path.join(model_dir, "predictions.csv"), index=False, encoding="utf-8-sig")

            # Scores and reports
            acc = accuracy_score(y_test, y_pred)
            f1_macro = f1_score(y_test, y_pred, average="macro")
            rec_macro = recall_score(y_test, y_pred, average="macro")
            roc_auc = np.nan
            if y_proba is not None:
                try:
                    roc_auc = roc_auc_score(y_test, y_proba, multi_class="ovr", average="macro")
                except Exception as e:
                    # Keep NaN, but say why instead of hiding it.
                    tqdm.write(f"  (ROC-AUC skip: {e})")

            report_dict = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
            report_df = classification_report_to_df(report_dict)
            report_df.to_csv(os.path.join(model_dir, "classification_report.csv"), index=False, encoding="utf-8-sig")
            with open(os.path.join(model_dir, "classification_report.json"), "w", encoding="utf-8") as f:
                json.dump(report_dict, f, ensure_ascii=False, indent=2)

            cm = confusion_matrix(y_test, y_pred, labels=classes)
            plot_and_save_confusion_matrix(cm, classes, os.path.join(model_dir, "confusion_matrix.png"))
            if y_proba is not None:
                try:
                    plot_and_save_multiclass_roc(y_test, np.asarray(y_proba), classes,
                                                 os.path.join(model_dir, "roc_curve.png"))
                except Exception as e:
                    tqdm.write(f"  (ROC plot skip: {e})")

            # Feature importances / SHAP / LIME (all best-effort)
            try:
                est = pipe.named_steps["model"]
                if hasattr(est, "feature_importances_"):
                    imp = est.feature_importances_
                    imp_df = pd.DataFrame({"feature": feature_names, "importance": imp}).sort_values("importance", ascending=False)
                    imp_df.to_csv(os.path.join(model_dir, "feature_importances.csv"), index=False, encoding="utf-8-sig")
                    fig, ax = plt.subplots(figsize=(7, max(4, min(20, len(imp_df))*0.35)))
                    try:
                        topk = imp_df.head(min(20, len(imp_df)))
                        ax.barh(topk["feature"][::-1], topk["importance"][::-1])
                        ax.set_title("Top Feature Importances")
                        fig.tight_layout()
                        fig.savefig(os.path.join(model_dir, "feature_importances_top20.png"), dpi=150)
                    finally:
                        plt.close(fig)
            except Exception as e:
                tqdm.write(f"  (feature importance skip: {e})")

            try:
                save_shap_summary(pipe, X_train, X_test, feature_names,
                                  os.path.join(model_dir, "shap_summary.png"))
            except Exception as e:
                tqdm.write(f"  (SHAP skip: {e})")
            try:
                save_lime_examples(pipe, X_train, X_test, y_test, feature_names, classes, model_dir, n_examples=3)
            except Exception as e:
                tqdm.write(f"  (LIME skip: {e})")

            # Summary row
            summary_rows.append({
                "center": center_name,
                "model": name,
                "accuracy": acc,
                "macro_f1": f1_macro,
                "macro_recall": rec_macro,
                "roc_auc_ovr_macro": roc_auc
            })
            secs = time.perf_counter() - t0
            tqdm.write(f"[{center_name}] {name} ✓ {secs:.1f}s | acc={acc:.3f}, macroF1={f1_macro:.3f}")

        except Exception as e:
            # A failing model must not abort the run: record the traceback, move on.
            err_path = os.path.join(model_dir, "_error.txt")
            with open(err_path, "w", encoding="utf-8") as f:
                f.write(traceback.format_exc())
            tqdm.write(f"[{center_name}] {name} ✗ error: {e}")

        finally:
            gc.collect()

    # Explicit columns keep the sort valid even when every model failed.
    center_df = pd.DataFrame(summary_rows, columns=summary_cols).sort_values(
        ["macro_f1", "accuracy"], ascending=False
    )
    center_df.to_csv(os.path.join(out_dir, "_center_summary.csv"), index=False, encoding="utf-8-sig")
    print(f"[{center_name}] 결과 저장 완료 → {out_dir}")
    return out_dir, center_df

# ===================== 실행부 예시 =====================

# Columns that must not be used as model features (only those present in a
# given frame are actually dropped).
not_use_col = (
    ['1처리장', '2처리장', '정화조', '중계펌프장', '합계', '시설현대화']
    + ['3처리장', '4처리장', '합계_1일후', '합계_2일후']
    + ['합계_class', '합계_1일후_class', '합계_2일후_class']
)

# nanji, jungnang, tancheon, seonam = ...  # DataFrames prepared beforehand

# Center name -> prepared DataFrame, in processing order.
centers = dict(zip(
    ("nanji", "jungnang", "tancheon", "seonam"),
    (nanji, jungnang, tancheon, seonam),
))

all_summaries = []
save_roots = {}

for name, df in tqdm(list(centers.items()), desc="Centers", leave=True):
    out_dir, summary_df = fit_evaluate_save(
        name, df, "합계_1일후_class", not_use_col,
        save_root="./outputs", test_size=0.2, random_state=RANDOM_STATE,
    )
    all_summaries.append(summary_df)
    save_roots[name] = out_dir

# Merge the per-center summaries and save them in one place
all_summary_df = pd.concat(all_summaries, axis=0).reset_index(drop=True)
ensure_dir("./outputs/_all_centers")
all_summary_df.to_csv("./outputs/_all_centers/_summary_all_centers.csv", index=False, encoding="utf-8-sig")

print("\n=== 저장 경로 요약 ===")
for center_key, saved_path in save_roots.items():
    print(f"{center_key}: {saved_path}")


                                              
Centers:   0%|          | 0/4 [00:00<?, ?it/s]     

[nanji] RandomForest ✓ 0.9s | acc=0.212, macroF1=0.180


                                              
Centers:   0%|          | 0/4 [00:05<?, ?it/s]             

[nanji] XGBoost ✓ 4.3s | acc=0.218, macroF1=0.193


                                              
Centers:   0%|          | 0/4 [00:26<?, ?it/s]             

[nanji] LightGBM ✓ 21.1s | acc=0.202, macroF1=0.160


                                              
Centers:   0%|          | 0/4 [00:30<?, ?it/s]             

[nanji] CatBoost ✓ 3.6s | acc=0.208, macroF1=0.193


                                              
Centers:   0%|          | 0/4 [00:38<?, ?it/s]             

[nanji] GradientBoosting ✓ 8.6s | acc=0.248, macroF1=0.187


                                              
Centers:   0%|          | 0/4 [00:39<?, ?it/s]             

[nanji] ExtraTrees ✓ 0.7s | acc=0.212, macroF1=0.181


                                              
Centers:   0%|          | 0/4 [00:46<?, ?it/s]             

[nanji] HistGradientBoosting ✓ 6.9s | acc=0.225, macroF1=0.212


                                              
nanji models: 100%|██████████| 8/8 [00:48<00:00,  6.03s/it]
Centers:  25%|██▌       | 1/4 [00:48<02:24, 48.25s/it]

[nanji] LogisticMultinomial ✓ 1.7s | acc=0.147, macroF1=0.152
[nanji] 결과 저장 완료 → ./outputs/nanji/20250824_132719


                                                      
Centers:  25%|██▌       | 1/4 [00:49<02:24, 48.25s/it]

[jungnang] RandomForest ✓ 0.8s | acc=0.181, macroF1=0.156


                                                      
Centers:  25%|██▌       | 1/4 [00:53<02:24, 48.25s/it]        

[jungnang] XGBoost ✓ 4.7s | acc=0.176, macroF1=0.158


                                                      
Centers:  25%|██▌       | 1/4 [01:15<02:24, 48.25s/it]        

[jungnang] LightGBM ✓ 21.4s | acc=0.174, macroF1=0.158


                                                      
Centers:  25%|██▌       | 1/4 [01:19<02:24, 48.25s/it]        

[jungnang] CatBoost ✓ 3.9s | acc=0.191, macroF1=0.167


                                                      
Centers:  25%|██▌       | 1/4 [01:28<02:24, 48.25s/it]        

[jungnang] GradientBoosting ✓ 9.4s | acc=0.166, macroF1=0.130


                                                      
Centers:  25%|██▌       | 1/4 [01:29<02:24, 48.25s/it]        

[jungnang] ExtraTrees ✓ 0.8s | acc=0.195, macroF1=0.180


                                                      
Centers:  25%|██▌       | 1/4 [01:36<02:24, 48.25s/it]        

[jungnang] HistGradientBoosting ✓ 7.0s | acc=0.164, macroF1=0.145


                                                      
jungnang models: 100%|██████████| 8/8 [00:49<00:00,  6.21s/it]
Centers:  50%|█████     | 2/4 [01:37<01:38, 49.12s/it]

[jungnang] LogisticMultinomial ✓ 1.2s | acc=0.142, macroF1=0.093
[jungnang] 결과 저장 완료 → ./outputs/jungnang/20250824_132807


                                                      
Centers:  50%|█████     | 2/4 [01:38<01:38, 49.12s/it]

[tancheon] RandomForest ✓ 0.8s | acc=0.186, macroF1=0.168


                                                      
Centers:  50%|█████     | 2/4 [01:43<01:38, 49.12s/it]        

[tancheon] XGBoost ✓ 4.6s | acc=0.158, macroF1=0.134


                                                      
Centers:  50%|█████     | 2/4 [02:05<01:38, 49.12s/it]        

[tancheon] LightGBM ✓ 21.6s | acc=0.169, macroF1=0.150


                                                      
Centers:  50%|█████     | 2/4 [02:08<01:38, 49.12s/it]        

[tancheon] CatBoost ✓ 3.5s | acc=0.178, macroF1=0.154


                                                      
Centers:  50%|█████     | 2/4 [02:17<01:38, 49.12s/it]        

[tancheon] GradientBoosting ✓ 8.2s | acc=0.150, macroF1=0.117


                                                      
Centers:  50%|█████     | 2/4 [02:17<01:38, 49.12s/it]        

[tancheon] ExtraTrees ✓ 0.7s | acc=0.182, macroF1=0.165


                                                      
Centers:  50%|█████     | 2/4 [02:24<01:38, 49.12s/it]        

[tancheon] HistGradientBoosting ✓ 6.7s | acc=0.168, macroF1=0.157


                                                      
tancheon models: 100%|██████████| 8/8 [00:48<00:00,  6.07s/it]
Centers:  75%|███████▌  | 3/4 [02:26<00:48, 48.87s/it]

[tancheon] LogisticMultinomial ✓ 1.9s | acc=0.161, macroF1=0.085
[tancheon] 결과 저장 완료 → ./outputs/tancheon/20250824_132857


                                                      
Centers:  75%|███████▌  | 3/4 [02:27<00:48, 48.87s/it]

[seonam] RandomForest ✓ 0.8s | acc=0.067, macroF1=0.091


                                                      
Centers:  75%|███████▌  | 3/4 [02:32<00:48, 48.87s/it]      

[seonam] XGBoost ✓ 4.7s | acc=0.070, macroF1=0.086


                                                      
Centers:  75%|███████▌  | 3/4 [02:52<00:48, 48.87s/it]      

[seonam] LightGBM ✓ 20.5s | acc=0.065, macroF1=0.089


                                                      
Centers:  75%|███████▌  | 3/4 [02:56<00:48, 48.87s/it]      

[seonam] CatBoost ✓ 3.9s | acc=0.065, macroF1=0.083


                                                      
Centers:  75%|███████▌  | 3/4 [03:06<00:48, 48.87s/it]      

[seonam] GradientBoosting ✓ 9.6s | acc=0.075, macroF1=0.092


                                                      
Centers:  75%|███████▌  | 3/4 [03:07<00:48, 48.87s/it]      

[seonam] ExtraTrees ✓ 0.8s | acc=0.067, macroF1=0.083


                                                      
Centers:  75%|███████▌  | 3/4 [03:14<00:48, 48.87s/it]      

[seonam] HistGradientBoosting ✓ 6.9s | acc=0.070, macroF1=0.098


                                                      
seonam models: 100%|██████████| 8/8 [00:49<00:00,  6.21s/it]
Centers: 100%|██████████| 4/4 [03:16<00:00, 49.06s/it]

[seonam] LogisticMultinomial ✓ 1.9s | acc=0.080, macroF1=0.095
[seonam] 결과 저장 완료 → ./outputs/seonam/20250824_132946

=== 저장 경로 요약 ===
nanji: ./outputs/nanji/20250824_132719
jungnang: ./outputs/jungnang/20250824_132807
tancheon: ./outputs/tancheon/20250824_132857
seonam: ./outputs/seonam/20250824_132946





In [10]:
all_summary_df

Unnamed: 0,center,model,accuracy,macro_f1,macro_recall,roc_auc_ovr_macro
0,nanji,HistGradientBoosting,0.224756,0.212307,0.262086,0.760573
1,nanji,XGBoost,0.218241,0.193479,0.245417,0.729828
2,nanji,CatBoost,0.208469,0.192562,0.296884,0.735916
3,nanji,GradientBoosting,0.247557,0.187489,0.2267,0.720437
4,nanji,ExtraTrees,0.211726,0.180977,0.230077,0.736817
5,nanji,RandomForest,0.211726,0.179818,0.235965,0.756728
6,nanji,LightGBM,0.201954,0.159692,0.21372,0.718091
7,nanji,LogisticMultinomial,0.14658,0.15246,0.300648,0.685987
8,jungnang,ExtraTrees,0.19544,0.18002,0.20015,0.690929
9,jungnang,CatBoost,0.190554,0.167382,0.200558,0.685889


In [None]:
# =========================================================
# 다중분류 파이프라인 (4개 센터 공통, tqdm + 확률 저장 + DF로 평가지표 저장)
# - 타깃: '합계_1일후_class'
# - 불사용 컬럼(not_use_col) 제외
# - 모델: RF, XGB, LGBM, CatBoost, GradientBoosting + ExtraTrees, HistGB, Logistic
# - 평가/시각화: accuracy, macro-F1, macro-recall, ROC-AUC(ovr), CM, ROC, SHAP, LIME
# - 저장: ./outputs/<center>/<timestamp>/ 이하 PNG/CSV/JSON/HTML 등
# =========================================================

import os, json, time, warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

# 시각화
import matplotlib.pyplot as plt
from itertools import cycle

# tqdm 진행 바
from tqdm.auto import tqdm

# 전처리/학습/평가
from sklearn.metrics import (
    accuracy_score, f1_score, recall_score, roc_auc_score,
    classification_report, confusion_matrix
)
from sklearn.preprocessing import label_binarize, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# 모델
from sklearn.ensemble import (
    RandomForestClassifier, GradientBoostingClassifier,
    ExtraTreesClassifier, HistGradientBoostingClassifier
)
from sklearn.linear_model import LogisticRegression

# 외부 라이브러리 (없으면 스킵)
try:
    import xgboost as xgb
except ModuleNotFoundError:
    xgb = None
try:
    import lightgbm as lgb
except ModuleNotFoundError:
    lgb = None
try:
    from catboost import CatBoostClassifier
except ModuleNotFoundError:
    CatBoostClassifier = None

# 해석 라이브러리 (옵션)
try:
    import shap
except ModuleNotFoundError:
    shap = None
try:
    from lime import lime_tabular
except ModuleNotFoundError:
    lime_tabular = None

# -----------------------------
# 기본 유틸
# -----------------------------
def ensure_dir(path: str) -> str:
    """Create ``path`` (with any missing parents) and hand the same path back.

    An already existing directory is accepted silently, so the call can be
    repeated safely.
    """
    os.makedirs(path, exist_ok=True)
    return path

def classification_report_to_df(report_dict: dict) -> pd.DataFrame:
    """Convert ``classification_report(..., output_dict=True)`` into a DataFrame.

    Each key of ``report_dict`` becomes one row. Rows are ordered with the
    per-class entries first, followed by ``macro avg``, ``weighted avg`` and
    ``accuracy`` (only those that are present). The row key is returned as a
    regular ``label`` column.
    """
    summary_keys = ["macro avg", "weighted avg", "accuracy"]

    table = pd.DataFrame(report_dict).transpose()

    # Make sure a ``support`` column exists even when the input lacks one.
    if "support" not in table.columns:
        table["support"] = np.nan

    per_class_rows = [key for key in table.index if key not in summary_keys]
    summary_rows = [key for key in summary_keys if key in table.index]
    table = table.loc[per_class_rows + summary_rows]

    return table.reset_index().rename(columns={"index": "label"})

# -----------------------------
# 데이터 준비: 피처/타깃/분할 (시간순 80/20)
# -----------------------------
def prepare_xy(df: pd.DataFrame,
               target_col: str = "합계_1일후_class",
               not_use_col: list = None,
               date_col: str = "날짜",
               test_size: float = 0.2,
               random_state: int = 42):
    """Build a chronological train/test split of numeric features and target.

    Args:
        df: Input frame containing ``date_col``, ``target_col`` and features.
        target_col: Name of the class-label column; cast to ``int64``.
        not_use_col: Extra columns to exclude from the features. Names that
            are not present in ``df`` are ignored.
        date_col: Column holding the observation date (anything accepted by
            ``pd.to_datetime``).
        test_size: Fraction of the most recent rows used as the test set.
        random_state: Unused (the split is deterministic); kept so existing
            callers that pass it keep working.

    Returns:
        ``(X_train, X_test, y_train, y_test, feature_names, dates_train,
        dates_test)``. The date Series hold parsed timestamps aligned
        row-by-row with the corresponding feature frames.
    """
    if not_use_col is None:
        not_use_col = []

    # Order rows by the *parsed* timestamp. Sorting the raw column would be
    # lexicographic for string dates and therefore wrong for any format that
    # is not zero-padded ISO. A stable sort keeps same-day rows in input order.
    parsed_dates = pd.to_datetime(df[date_col]).reset_index(drop=True)
    order = parsed_dates.sort_values(kind="mergesort").index.to_numpy()
    work = df.iloc[order].reset_index(drop=True)
    dates = parsed_dates.iloc[order].reset_index(drop=True)

    # Columns to exclude (only those that actually exist in the data).
    drop_cols = [c for c in (set(not_use_col) | {target_col, date_col}) if c in work.columns]

    # Coerce every remaining feature to numeric; unparsable values become NaN.
    X_raw = work.drop(columns=drop_cols, errors="ignore")
    for c in X_raw.columns:
        X_raw[c] = pd.to_numeric(X_raw[c], errors="coerce")

    y = work[target_col].astype("int64")

    # Chronological split: the first (1 - test_size) share of rows is train.
    n = len(work)
    split = int(n * (1 - test_size))

    # Drop features that carry no numeric value at all in the training window
    # (e.g. free-text columns). A median imputer discards such columns at fit
    # time anyway, which would leave ``feature_names`` out of step with what
    # the estimator really sees.
    if split > 0:
        empty_cols = [c for c in X_raw.columns if X_raw[c].iloc[:split].isna().all()]
        if empty_cols:
            X_raw = X_raw.drop(columns=empty_cols)

    X_train, X_test = X_raw.iloc[:split].copy(), X_raw.iloc[split:].copy()
    y_train, y_test = y.iloc[:split].copy(), y.iloc[split:].copy()
    dates_train, dates_test = dates.iloc[:split].copy(), dates.iloc[split:].copy()

    feature_names = list(X_raw.columns)
    return X_train, X_test, y_train, y_test, feature_names, dates_train, dates_test

# -----------------------------
# 모델 구성 (n_classes 필요, 로그 억제)
# -----------------------------
def build_models(n_classes: int, random_state: int = 42):
    """Return the candidate classifiers keyed by display name.

    XGBoost, LightGBM and CatBoost entries are added only when the
    corresponding optional import succeeded (their module-level name is not
    ``None``).

    Args:
        n_classes: Number of target classes; forwarded as ``num_class`` to
            XGBoost and LightGBM.
        random_state: Seed shared by every estimator.

    Returns:
        dict mapping model name -> unfitted estimator, in insertion order.
    """
    models = {}

    # 1) RandomForest
    models["RandomForest"] = RandomForestClassifier(
        n_estimators=500, min_samples_leaf=2, random_state=random_state, n_jobs=-1
    )

    # 2) XGBoost
    if xgb is not None:
        models["XGBoost"] = xgb.XGBClassifier(
            n_estimators=600, max_depth=6, learning_rate=0.05,
            subsample=0.8, colsample_bytree=0.8,
            objective="multi:softprob", num_class=n_classes,
            tree_method="hist", random_state=random_state, n_jobs=-1,
            verbosity=0  # suppress training logs
        )

    # 3) LightGBM
    if lgb is not None:
        models["LightGBM"] = lgb.LGBMClassifier(
            n_estimators=700, learning_rate=0.05,
            subsample=0.8, colsample_bytree=0.8,
            objective="multiclass", num_class=n_classes,
            random_state=random_state, n_jobs=-1,
            verbosity=-1  # suppress training logs
        )

    # 4) CatBoost
    if CatBoostClassifier is not None:
        models["CatBoost"] = CatBoostClassifier(
            iterations=700, learning_rate=0.05, depth=6,
            loss_function="MultiClass", random_state=random_state,
            verbose=False  # suppress training logs
        )

    # 5) GradientBoosting
    models["GradientBoosting"] = GradientBoostingClassifier(random_state=random_state)

    # Extra) ExtraTrees
    models["ExtraTrees"] = ExtraTreesClassifier(
        n_estimators=600, min_samples_leaf=2, random_state=random_state, n_jobs=-1
    )

    # Extra) HistGradientBoosting
    models["HistGradientBoosting"] = HistGradientBoostingClassifier(
        random_state=random_state
    )

    # Extra) Logistic regression (multinomial).
    # ``multi_class`` is intentionally not passed: scikit-learn deprecated the
    # parameter in 1.5 and schedules it for removal. With the saga solver and
    # more than two classes the default already fits a multinomial model, so
    # the fitted model is unchanged for this multiclass pipeline.
    models["LogisticMultinomial"] = LogisticRegression(
        solver="saga", max_iter=2000, C=2.0,
        random_state=random_state, n_jobs=-1
    )

    return models

# -----------------------------
# 전처리 파이프라인
#  - 트리 계열: Imputer만
#  - 로지스틱: Imputer + StandardScaler
# -----------------------------
def make_pipeline(model, model_name: str):
    """Wrap ``model`` in a two-stage pipeline: ``pre`` (preprocessing) -> ``model``.

    Every model gets median imputation. The model named
    ``"LogisticMultinomial"`` additionally gets standard scaling after the
    imputer.
    """
    pre_steps = [("imputer", SimpleImputer(strategy="median"))]
    if model_name == "LogisticMultinomial":
        pre_steps.append(("scaler", StandardScaler()))

    return Pipeline(steps=[
        ("pre", Pipeline(steps=pre_steps)),
        ("model", model),
    ])

# -----------------------------
# 평가/그림 저장
# -----------------------------
def plot_and_save_confusion_matrix(cm, classes, out_png):
    """Render the confusion matrix ``cm`` as an annotated heatmap and save it.

    Args:
        cm: Square integer matrix (rows = true labels, columns = predictions).
        classes: Tick labels, in the same order as the rows/columns of ``cm``.
        out_png: Destination path of the PNG file.
    """
    tick_positions = np.arange(len(classes))

    fig, ax = plt.subplots(figsize=(6, 5))
    heatmap = ax.imshow(cm, interpolation='nearest')
    ax.figure.colorbar(heatmap, ax=ax)

    ax.set_xticks(tick_positions)
    ax.set_yticks(tick_positions)
    ax.set_xticklabels(classes)
    ax.set_yticklabels(classes)
    ax.set_ylabel('True label')
    ax.set_xlabel('Predicted label')
    ax.set_title('Confusion Matrix')

    # Cells above half of the maximum count get white text for readability.
    half_max = cm.max() / 2.
    for row, col in np.ndindex(cm.shape[0], cm.shape[1]):
        count = cm[row, col]
        text_color = "white" if count > half_max else "black"
        ax.text(col, row, format(count, 'd'),
                ha="center", va="center",
                color=text_color)

    fig.tight_layout()
    fig.savefig(out_png, dpi=150)
    plt.close(fig)

def plot_and_save_multiclass_roc(y_true, y_score, classes, out_png):
    """Plot one-vs-rest ROC curves (per class, micro and macro) and save a PNG.

    Args:
        y_true: (n,) integer class labels.
        y_score: (n, K) class probabilities; column ``i`` is treated as the
            score for ``classes[i]``.
        classes: The K class labels, in the column order of ``y_score``.
        out_png: Destination path of the PNG file.
    """
    from sklearn.metrics import roc_curve, auc

    # One indicator column per class.
    y_bin = label_binarize(y_true, classes=classes)
    n_classes = len(classes)
    # For exactly two classes label_binarize returns a single column (the
    # indicator of the second class); expand it so it lines up with the two
    # probability columns.
    if n_classes == 2 and y_bin.shape[1] == 1:
        y_bin = np.hstack([1 - y_bin, y_bin])

    # ROC curve for each class
    fpr, tpr, roc_auc = {}, {}, {}
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_bin[:, i], y_score[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # micro-average: pool every (sample, class) decision into one curve
    fpr["micro"], tpr["micro"], _ = roc_curve(y_bin.ravel(), y_score.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

    # macro-average: mean of the per-class curves on a common FPR grid
    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
    mean_tpr = np.zeros_like(all_fpr)
    for i in range(n_classes):
        mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])
    mean_tpr /= n_classes
    fpr["macro"] = all_fpr
    tpr["macro"] = mean_tpr
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

    # plot
    fig, ax = plt.subplots(figsize=(7, 6))
    ax.plot(fpr["micro"], tpr["micro"], label=f'micro-average ROC (AUC = {roc_auc["micro"]:.3f})')
    ax.plot(fpr["macro"], tpr["macro"], label=f'macro-average ROC (AUC = {roc_auc["macro"]:.3f})')

    # Line colours come from matplotlib's default property cycle. The colour
    # cycle that used to be built here was never passed to ``ax.plot``.
    for i in range(n_classes):
        ax.plot(fpr[i], tpr[i], label=f'class {classes[i]} (AUC = {roc_auc[i]:.3f})')

    ax.plot([0, 1], [0, 1], 'k--', lw=1)  # chance diagonal
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('Multiclass ROC (OvR)')
    ax.legend(loc="lower right", fontsize=8)
    fig.tight_layout()
    fig.savefig(out_png, dpi=150)
    plt.close(fig)

# -----------------------------
# SHAP 저장 (트리계열만)
# -----------------------------
def save_shap_summary(pipeline, X_train, X_test, feature_names, out_png, max_samples=500):
    """Save a SHAP summary plot of the test set for tree-based models.

    Does nothing when ``shap`` is not installed or when the pipeline's
    ``model`` step is not one of the tree-based estimators checked below.

    Args:
        pipeline: Fitted pipeline with ``pre`` and ``model`` steps.
        X_train: Unused; kept so existing callers keep working.
        X_test: Raw test features; passed through the ``pre`` step first.
        feature_names: Names shown on the plot.
        out_png: Destination path of the PNG file.
        max_samples: Unused; kept so existing callers keep working.
    """
    if shap is None:
        return
    model = pipeline.named_steps["model"]
    tree_like = (
        isinstance(model, (RandomForestClassifier, ExtraTreesClassifier,
                           GradientBoostingClassifier, HistGradientBoostingClassifier))
        or (xgb is not None and isinstance(model, xgb.XGBClassifier))
        or (lgb is not None and isinstance(model, lgb.LGBMClassifier))
        or (CatBoostClassifier is not None and isinstance(model, CatBoostClassifier))
    )
    if not tree_like:
        return

    # The estimator was trained on preprocessed data, so explain the same view.
    # Both explainers below are built without background data, so the
    # training-set sample that used to be drawn here was never used.
    X_te = pipeline.named_steps["pre"].transform(X_test)

    try:
        explainer = shap.TreeExplainer(model, feature_perturbation="tree_path_dependent")
        shap_vals = explainer.shap_values(X_te, check_additivity=False)
    except Exception:
        # Best-effort fallback: retry with the library defaults.
        explainer = shap.TreeExplainer(model)
        shap_vals = explainer.shap_values(X_te)

    plt.figure()
    shap.summary_plot(shap_vals, X_te, feature_names=feature_names, show=False)
    plt.tight_layout()
    plt.savefig(out_png, dpi=150)
    plt.close()

# -----------------------------
# LIME 저장 (샘플 몇 개)
# -----------------------------
def save_lime_examples(pipeline, X_train, X_test, y_test, feature_names, class_names, out_dir, n_examples=3):
    """Write LIME explanations for the first test rows as HTML files.

    Files are named ``lime_example_<k>.html`` (k starting at 1) inside
    ``out_dir``. Nothing happens when ``lime`` is not installed. ``y_test``
    is accepted but not used.
    """
    if lime_tabular is None:
        return

    def predict_from_array(rows):
        # LIME passes plain arrays; rebuild a frame with the training columns.
        return pipeline.predict_proba(pd.DataFrame(rows, columns=feature_names))

    explainer = lime_tabular.LimeTabularExplainer(
        training_data=X_train.values,
        feature_names=feature_names,
        class_names=[str(c) for c in class_names],
        discretize_continuous=True,
        mode='classification'
    )

    shown_features = min(10, len(feature_names))
    example_count = min(n_examples, len(X_test))
    for position in range(example_count):
        explanation = explainer.explain_instance(
            data_row=X_test.iloc[position].values,
            predict_fn=predict_from_array,
            num_features=shown_features
        )
        explanation.save_to_file(
            os.path.join(out_dir, f"lime_example_{position + 1}.html")
        )

# -----------------------------
# 한 센터: 학습/평가/저장
# -----------------------------
def _save_predictions_csv(pipe, y_test, y_pred, y_proba, dates_test, fallback_classes, model_dir):
    """Write per-row test predictions (date, truth, prediction, probabilities) to ``predictions.csv``."""
    pred_df = pd.DataFrame({
        "날짜": np.asarray(pd.to_datetime(dates_test).dt.strftime("%Y-%m-%d")).reshape(-1),
        "y_true": np.asarray(y_test.values).reshape(-1).astype(int),
        "y_pred": np.asarray(y_pred).reshape(-1).astype(int),
    })

    # Per-class probability columns follow the estimator's own ``classes_``
    # order when it exposes one.
    if y_proba is not None:
        proba = np.asarray(y_proba)
        if proba.ndim == 2 and proba.shape[0] == len(pred_df):
            model_classes = getattr(pipe.named_steps["model"], "classes_", fallback_classes)
            pred_df["pred_conf"] = proba.max(axis=1)  # top-1 confidence
            for j, cls in enumerate(model_classes):
                pred_df[f"proba_{cls}"] = proba[:, j]

    pred_df.to_csv(os.path.join(model_dir, "predictions.csv"), index=False, encoding="utf-8-sig")


def _save_feature_importances(estimator, feature_names, model_dir):
    """Export ``feature_importances_`` (CSV plus top-20 bar chart) when the estimator has them."""
    importances = getattr(estimator, "feature_importances_", None)
    if importances is None:
        return

    imp_df = pd.DataFrame({"feature": feature_names, "importance": importances}) \
                .sort_values("importance", ascending=False)
    imp_df.to_csv(os.path.join(model_dir, "feature_importances.csv"), index=False, encoding="utf-8-sig")

    topk = imp_df.head(min(20, len(imp_df)))
    fig, ax = plt.subplots(figsize=(7, max(4, len(topk)*0.35)))
    ax.barh(topk["feature"][::-1], topk["importance"][::-1])
    ax.set_title("Top Feature Importances")
    fig.tight_layout()
    fig.savefig(os.path.join(model_dir, "feature_importances_top20.png"), dpi=150)
    plt.close(fig)


def fit_evaluate_save(center_name: str,
                      df: pd.DataFrame,
                      target_col: str,
                      not_use_col: list,
                      save_root: str = "./outputs",
                      test_size: float = 0.2,
                      random_state: int = 42):
    """Train every candidate model for one center, evaluate it and save artifacts.

    Artifacts go to ``<save_root>/<center_name>/<timestamp>/<model>/``:
    predictions, classification report (JSON and CSV), metrics JSON,
    confusion-matrix and ROC plots and, where available, feature importances,
    a SHAP summary and LIME examples. The optional steps are best-effort: a
    failure is reported on stdout and the run continues.

    Args:
        center_name: Label used for the output folder and log lines.
        df: Data of the center; split by ``prepare_xy``.
        target_col: Name of the class-label column.
        not_use_col: Columns excluded from the features.
        save_root: Root directory for all outputs.
        test_size: Fraction of the most recent rows used as the test set.
        random_state: Seed forwarded to the models.

    Returns:
        ``(out_dir, summary_df)``: the run directory and one row of scores per
        model, sorted by macro-F1 then accuracy (descending).
    """
    ts = time.strftime("%Y%m%d_%H%M%S")
    out_dir = ensure_dir(os.path.join(save_root, center_name, ts))

    # Data preparation
    X_train, X_test, y_train, y_test, feature_names, dates_train, dates_test = prepare_xy(
        df, target_col=target_col, not_use_col=not_use_col,
        test_size=test_size, random_state=random_state
    )
    classes = np.sort(np.unique(y_train))
    n_classes = len(classes)

    models = build_models(n_classes=n_classes, random_state=random_state)
    summary_rows = []

    # Model loop (tqdm)
    for name, model in tqdm(list(models.items()), desc=f"{center_name} models", leave=False):
        t0 = time.perf_counter()
        pipe = make_pipeline(model, name)
        pipe.fit(X_train, y_train)

        # Predictions / probabilities. Flatten once so an estimator that
        # returns a column vector is handled the same way everywhere below.
        y_pred = np.asarray(pipe.predict(X_test)).reshape(-1)
        y_proba = pipe.predict_proba(X_test) if hasattr(pipe, "predict_proba") else None

        model_dir = ensure_dir(os.path.join(out_dir, name))
        _save_predictions_csv(pipe, y_test, y_pred, y_proba, dates_test, classes, model_dir)

        # Scores
        acc = accuracy_score(y_test, y_pred)
        f1_macro = f1_score(y_test, y_pred, average="macro")
        rec_macro = recall_score(y_test, y_pred, average="macro")

        roc_auc = np.nan
        if y_proba is not None:
            try:
                roc_auc = roc_auc_score(y_test, y_proba, multi_class="ovr", average="macro")
            except Exception as e:
                # Keep NaN in the summary, but say why instead of hiding it.
                print(f"  (ROC-AUC skip: {e})")

        # Report / confusion matrix
        report_dict = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
        cm = confusion_matrix(y_test, y_pred, labels=classes)

        # Save: report as JSON + CSV
        with open(os.path.join(model_dir, "classification_report.json"), "w", encoding="utf-8") as f:
            json.dump(report_dict, f, ensure_ascii=False, indent=2)
        report_df = classification_report_to_df(report_dict)
        report_df.to_csv(os.path.join(model_dir, "classification_report.csv"), index=False, encoding="utf-8-sig")

        # Save: headline scores and run metadata as JSON
        meta = {
            "accuracy": acc,
            "macro_f1": f1_macro,
            "macro_recall": rec_macro,
            "roc_auc_ovr_macro": roc_auc,
            "n_train": int(len(X_train)),
            "n_test": int(len(X_test)),
            "n_features": int(len(feature_names)),
            "classes": list(map(int, classes))
        }
        with open(os.path.join(model_dir, "metrics.json"), "w", encoding="utf-8") as f:
            json.dump(meta, f, ensure_ascii=False, indent=2)

        # Figures: confusion matrix, ROC
        plot_and_save_confusion_matrix(cm, classes, os.path.join(model_dir, "confusion_matrix.png"))
        if y_proba is not None:
            try:
                plot_and_save_multiclass_roc(y_test, np.asarray(y_proba), classes, os.path.join(model_dir, "roc_curve.png"))
            except Exception as e:
                print(f"  (ROC plot skip: {e})")

        # Feature importances, SHAP, LIME (only where possible). Each step is
        # optional, so a failure is reported and the loop carries on.
        try:
            _save_feature_importances(pipe.named_steps["model"], feature_names, model_dir)
        except Exception as e:
            print(f"  (feature importance skip: {e})")

        try:
            save_shap_summary(pipe, X_train, X_test, feature_names,
                              os.path.join(model_dir, "shap_summary.png"), max_samples=500)
        except Exception as e:
            print(f"  (SHAP skip: {e})")
        try:
            save_lime_examples(pipe, X_train, X_test, y_test, feature_names,
                               classes, model_dir, n_examples=3)
        except Exception as e:
            print(f"  (LIME skip: {e})")

        # Row for the summary table
        summary_rows.append({
            "center": center_name,
            "model": name,
            "accuracy": acc,
            "macro_f1": f1_macro,
            "macro_recall": rec_macro,
            "roc_auc_ovr_macro": roc_auc
        })

        secs = time.perf_counter() - t0
        tqdm.write(f"[{center_name}] {name} ✓ {secs:.1f}s | acc={acc:.3f}, macroF1={f1_macro:.3f}")

    # Per-center summary table (CSV)
    summary_df = pd.DataFrame(summary_rows).sort_values(["macro_f1", "accuracy"], ascending=False)
    summary_df.to_csv(os.path.join(out_dir, "_center_summary.csv"), index=False, encoding="utf-8-sig")

    print(f"[{center_name}] 결과 저장 완료 → {out_dir}")
    return out_dir, summary_df

# =========================================================
# 실행부 (사용 전: nanji, jungnang, tancheon, seonam DataFrame이 준비돼 있어야 함)
# =========================================================

# Columns that must never be used as features; the list is handed to
# fit_evaluate_save() as ``not_use_col`` for every center.
not_use_col = [
    '1처리장', '2처리장', '정화조', '중계펌프장', '합계', '시설현대화',
    '3처리장', '4처리장', '합계_1일후', '합계_2일후',
    '합계_class', '합계_1일후_class', '합계_2일후_class'
]

# Example: the four center DataFrames are assumed to be loaded already
# nanji, jungnang, tancheon, seonam = ...

# Center name -> DataFrame; the key is also used as the output folder name.
centers = {
    "nanji": nanji,
    "jungnang": jungnang,
    "tancheon": tancheon,
    "seonam": seonam
}

# Per-center summary tables and per-center output directories.
all_summaries, save_roots = [], {}

# Center loop (tqdm)
for name, df in tqdm(list(centers.items()), desc="Centers", leave=True):
    out_dir, summary_df = fit_evaluate_save(
        center_name=name,
        df=df,
        target_col="합계_1일후_class",
        not_use_col=not_use_col,
        save_root="./outputs",
        test_size=0.2,
        random_state=42
    )
    save_roots[name] = out_dir
    all_summaries.append(summary_df)

# Combine the results of all centers and save them (DataFrame CSV)
all_summary_df = pd.concat(all_summaries, axis=0).reset_index(drop=True)
ensure_dir("./outputs/_all_centers")
all_summary_df.to_csv("./outputs/_all_centers/_summary_all_centers.csv", index=False, encoding="utf-8-sig")

# Print where each center's artifacts were written.
print("\n=== 저장 경로 요약 ===")
for k, v in save_roots.items():
    print(f"{k}: {v}")

# In a notebook, preview the top 10 rows (optional; any failure here,
# including a missing IPython, is ignored on purpose)
try:
    from IPython.display import display
    print("\n상위 성능 모델 Top-10")
    display(all_summary_df.sort_values(["macro_f1","accuracy"], ascending=False).head(10))
except Exception:
    pass
