In [14]:
# 센터별, 회귀, 분류 모델별로 결과 나옴
# 피클 파일은 아직 저장 못함

In [None]:
# ================================================================================================
# 전체 파이프라인을 위한 선행 실행 코드 (모든 기반 함수들)
# ================================================================================================

# 필수 라이브러리 import

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedShuffleSplit, train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, accuracy_score, f1_score
from tqdm import tqdm
import os, sys, platform, random, time, json, warnings
from datetime import datetime


# 선택적 라이브러리 import
try:
    import xgboost as xgb
    HAS_XGB = True
except ImportError:
    HAS_XGB = False

try:
    import lightgbm as lgb
    HAS_LGB = True
except ImportError:
    HAS_LGB = False

try:
    import catboost as cb
    HAS_CATBOOST = True
except ImportError:
    HAS_CATBOOST = False

try:
    import shap
    HAS_SHAP = True
except ImportError:
    HAS_SHAP = False

# 한글 폰트 설정. 
warnings.filterwarnings("ignore")
try:
    plt.rcParams['font.family'] = 'AppleGothic' # 맥
except Exception:
    plt.rcParams['font.family'] ='Malgun Gothic' # 윈도우
plt.rcParams['axes.unicode_minus'] = False

# ================================================================================================
# 1. 모델 정의 함수들
# ================================================================================================
def build_regression_models():
    """회귀 모델들"""
    models = {}
    
    models["RandomForest_Reg"] = RandomForestRegressor(
        n_estimators=300, min_samples_leaf=2, random_state=42, n_jobs=-1
    )
    
    models["LinearRegression"] = LinearRegression()
    
    models["GradientBoosting_Reg"] = GradientBoostingRegressor(
        n_estimators=200, learning_rate=0.1, random_state=42
    )
    
    if HAS_XGB:
        models["XGBoost_Reg"] = xgb.XGBRegressor(
            n_estimators=400, max_depth=5, learning_rate=0.05,
            subsample=0.8, colsample_bytree=0.8,
            random_state=42, n_jobs=-1, verbosity=0
        )
    
    if HAS_LGB:
        models["LightGBM_Reg"] = lgb.LGBMRegressor(
            n_estimators=500, learning_rate=0.05,
            subsample=0.8, colsample_bytree=0.8,
            random_state=42, n_jobs=-1, verbosity=-1
        )
    
    if HAS_CATBOOST:
        models["CatBoost_Reg"] = cb.CatBoostRegressor(
            iterations=500, learning_rate=0.05, depth=6,
            random_state=42, verbose=False
        )
    
    return models

def build_classification_models():
    """분류 모델들 (4등급)"""
    models = {}
    
    models["RandomForest_Clf"] = RandomForestClassifier(
        n_estimators=300, min_samples_leaf=2, random_state=42, 
        n_jobs=-1, class_weight='balanced'
    )
    
    models["GradientBoosting_Clf"] = GradientBoostingClassifier(
        n_estimators=200, learning_rate=0.1, random_state=42
    )
    
    models["LogisticRegression_Clf"] = LogisticRegression(
        multi_class="multinomial", solver="lbfgs", max_iter=1000,
        random_state=42, class_weight='balanced'
    )
    
    if HAS_XGB:
        models["XGBoost_Clf"] = xgb.XGBClassifier(
            n_estimators=400, max_depth=5, learning_rate=0.05,
            subsample=0.8, colsample_bytree=0.8,
            objective="multi:softprob", num_class=4,
            tree_method="hist", random_state=42, n_jobs=-1, verbosity=0
        )
    
    if HAS_LGB:
        models["LightGBM_Clf"] = lgb.LGBMClassifier(
            n_estimators=500, learning_rate=0.05,
            subsample=0.8, colsample_bytree=0.8,
            objective="multiclass", num_class=4,
            random_state=42, n_jobs=-1, verbosity=-1, is_unbalance=True
        )
    
    if HAS_CATBOOST:
        models["CatBoost_Clf"] = cb.CatBoostClassifier(
            iterations=500, learning_rate=0.05, depth=6,
            random_state=42, verbose=False, auto_class_weights='Balanced'
        )
    
    return models

# ================================================================================================
# 2. 파이프라인 및 데이터 처리 함수들
# ================================================================================================
def make_pipeline_unified(model, model_name, model_type):
    """통합 전처리 파이프라인"""
    if model_name in ["LinearRegression", "LogisticRegression_Clf"]:
        pre = Pipeline(steps=[
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler()),
        ])
    else:
        pre = Pipeline(steps=[
            ("imputer", SimpleImputer(strategy="median")),
        ])
    return Pipeline(steps=[("pre", pre), ("model", model)])

def prepare_data_stratified(df, target_col, model_type, test_size=0.2, split_method='stratified'):
    """데이터 준비 - Stratified vs 시계열 분할"""
    work = df.sort_values('날짜').reset_index(drop=True).copy()
    dates = pd.to_datetime(work['날짜'])

    not_use_col = [
        '날짜',
        '1처리장','2처리장','정화조','중계펌프장','합계','시설현대화',
        '3처리장','4처리장','합계', '합계_1일후','합계_2일후',
        '등급','등급_1일후','등급_2일후'
    ]
    
    drop_cols = [c for c in (set(not_use_col) | {target_col}) if c in work.columns]
    X_raw = work.drop(columns=drop_cols, errors="ignore")
    
    for c in X_raw.columns:
        X_raw[c] = pd.to_numeric(X_raw[c], errors="coerce")

    if model_type == "regression":
        y = pd.to_numeric(work[target_col], errors="coerce")
    else:
        y = work[target_col].astype("int64")

    valid_idx = (~X_raw.isnull().all(axis=1)) & (~pd.isnull(y))
    X_raw = X_raw[valid_idx].reset_index(drop=True)
    y = y[valid_idx].reset_index(drop=True)
    dates = dates[valid_idx].reset_index(drop=True)
    
    if split_method == 'stratified':
        if model_type == "classification":
            sss = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=42)
            train_idx, test_idx = next(sss.split(X_raw, y))
        else:
            train_idx, test_idx = train_test_split(
                range(len(X_raw)), test_size=test_size, random_state=42
            )
            
        X_train, X_test = X_raw.iloc[train_idx].copy(), X_raw.iloc[test_idx].copy()
        y_train, y_test = y.iloc[train_idx].copy(), y.iloc[test_idx].copy()
        dates_train, dates_test = dates.iloc[train_idx].copy(), dates.iloc[test_idx].copy()
        
    else:  # temporal split
        n = len(X_raw)
        split = int(n * (1 - test_size))
        X_train, X_test = X_raw.iloc[:split].copy(), X_raw.iloc[split:].copy()
        y_train, y_test = y.iloc[:split].copy(), y.iloc[split:].copy()
        dates_train, dates_test = dates.iloc[:split].copy(), dates.iloc[split:].copy()

    feature_names = list(X_raw.columns)
    return X_train, X_test, y_train, y_test, feature_names, dates_train, dates_test

# ================================================================================================
# 3. 평가 함수들
# ================================================================================================
def evaluate_regression_model(model, model_name, X_train, X_test, y_train, y_test):
    """회귀 모델 평가"""
    try:
        pipe = make_pipeline_unified(model, model_name, "regression")
        pipe.fit(X_train, y_train)
        
        y_pred = pipe.predict(X_test)
        
        mae = mean_absolute_error(y_test, y_pred)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        r2 = r2_score(y_test, y_pred)
        mape = np.mean(np.abs((y_test - y_pred) / (y_test + 1e-8))) * 100
        
        return {
            'model': model_name,
            'type': 'regression',
            'mae': mae,
            'rmse': rmse,
            'r2': r2,
            'mape': mape,
            'success': True
        }, pipe, y_pred
        
    except Exception as e:
        return {
            'model': model_name,
            'type': 'regression',
            'mae': np.nan,
            'rmse': np.nan,
            'r2': np.nan,
            'mape': np.nan,
            'success': False,
            'error': str(e)
        }, None, None

def evaluate_classification_model(model, model_name, X_train, X_test, y_train, y_test):
    """분류 모델 평가"""
    try:
        pipe = make_pipeline_unified(model, model_name, "classification")
        pipe.fit(X_train, y_train)
        
        y_pred = pipe.predict(X_test)
        
        if isinstance(y_pred, np.ndarray) and y_pred.ndim > 1:
            y_pred = y_pred.ravel()
        
        acc = accuracy_score(y_test, y_pred)
        f1_macro = f1_score(y_test, y_pred, average="macro", zero_division=0)
        f1_weighted = f1_score(y_test, y_pred, average="weighted", zero_division=0)
        
        extreme_classes = [0, 3]
        y_true_extreme = pd.Series(y_test).isin(extreme_classes).astype(int)
        y_pred_extreme = pd.Series(y_pred).isin(extreme_classes).astype(int)
        extreme_f1 = f1_score(y_true_extreme, y_pred_extreme, zero_division=0)
        
        return {
            'model': model_name,
            'type': 'classification',
            'accuracy': acc,
            'macro_f1': f1_macro,
            'weighted_f1': f1_weighted,
            'extreme_f1': extreme_f1,
            'success': True
        }, pipe, y_pred
        
    except Exception as e:
        return {
            'model': model_name,
            'type': 'classification',
            'accuracy': np.nan,
            'macro_f1': np.nan,
            'weighted_f1': np.nan,
            'extreme_f1': np.nan,
            'success': False,
            'error': str(e)
        }, None, None

def comprehensive_evaluation_comparison(center_name, df):
    """Stratified vs 시계열 분할 비교 평가"""
    print(f"\n{'='*70}")
    print(f"센터: {center_name} - Stratified vs 시계열 분할 비교")
    print(f"{'='*70}")
    
    print(f"데이터 크기: {len(df)}행, {len(df.columns)}컬럼")
    
    if '등급_1일후' in df.columns:
        grade_dist = df['등급_1일후'].value_counts().sort_index()
        print(f"등급 분포: {dict(grade_dist)}")
        
        min_class = grade_dist.min()
        max_class = grade_dist.max()
        imbalance_ratio = max_class / min_class
        print(f"클래스 불균형 비율: {imbalance_ratio:.1f}:1 (최대:{max_class}, 최소:{min_class})")
    
    results = []
    
    for split_method in ['temporal', 'stratified']:
        print(f"\n{'='*50}")
        print(f"분할 방법: {split_method.upper()}")
        print(f"{'='*50}")
        
        # 회귀 모델 평가
        reg_method_name = "random_shuffle" if split_method == "stratified" else split_method
        print(f"\n--- 회귀 모델 평가 ({reg_method_name}) ---")
        
        try:
            X_train, X_test, y_train, y_test, feature_names, dates_train, dates_test = prepare_data_stratified(
                df, target_col="합계_1일후", model_type="regression", test_size=0.2, split_method=split_method
            )
            
            print(f"회귀용 데이터: 학습 {len(X_train)}행, 테스트 {len(X_test)}행")
            
            regression_models = build_regression_models()
            
            for model_name, model in tqdm(regression_models.items(), desc=f"회귀({reg_method_name})", leave=False):
                result, pipe, y_pred = evaluate_regression_model(model, model_name, X_train, X_test, y_train, y_test)
                result['center'] = center_name
                result['split_method'] = split_method
                results.append(result)
                
                if result['success']:
                    print(f"  {model_name:18s}: R²={result['r2']:.3f}, MAE={result['mae']:.0f}, MAPE={result['mape']:.1f}%")
                else:
                    print(f"  {model_name:18s}: 실패 - {result.get('error', '')[:50]}")
                    
        except Exception as e:
            print(f"회귀 모델 평가 실패 ({reg_method_name}): {e}")
        
        # 분류 모델 평가
        print(f"\n--- 분류 모델 평가 ({split_method}) ---")
        
        try:
            X_train_clf, X_test_clf, y_train_clf, y_test_clf, feature_names_clf, _, _ = prepare_data_stratified(
                df, target_col="등급_1일후", model_type="classification", test_size=0.2, split_method=split_method
            )
            
            print(f"분류용 데이터: 학습 {len(X_train_clf)}행, 테스트 {len(X_test_clf)}행")
            
            test_dist = pd.Series(y_test_clf).value_counts().sort_index()
            train_dist = pd.Series(y_train_clf).value_counts().sort_index()
            print(f"학습 세트 등급 분포: {dict(train_dist)}")
            print(f"테스트 세트 등급 분포: {dict(test_dist)}")
            
            classification_models = build_classification_models()
            
            for model_name, model in tqdm(classification_models.items(), desc=f"분류({split_method})", leave=False):
                result, pipe, y_pred = evaluate_classification_model(model, model_name, X_train_clf, X_test_clf, y_train_clf, y_test_clf)
                result['center'] = center_name
                result['split_method'] = split_method
                results.append(result)
                
                if result['success']:
                    print(f"  {model_name:18s}: ACC={result['accuracy']:.3f}, F1={result['macro_f1']:.3f}, 극값F1={result['extreme_f1']:.3f}")
                else:
                    print(f"  {model_name:18s}: 실패 - {result.get('error', '')[:50]}")
                    
        except Exception as e:
            print(f"분류 모델 평가 실패 ({split_method}): {e}")
    
    return results

# ================================================================================================
# 4. Feature Importance & SHAP 분석 함수들
# ================================================================================================
def extract_feature_importance(model, model_name, feature_names):
    """모델별 Feature Importance 추출"""
    try:
        mdl = model.named_steps['model']
        if hasattr(mdl, 'feature_importances_'):
            importance = mdl.feature_importances_
        elif hasattr(mdl, 'coef_'):
            coef = mdl.coef_
            if isinstance(coef, np.ndarray) and coef.ndim == 2:
                importance = np.mean(np.abs(coef), axis=0)
            else:
                importance = np.abs(coef)
        else:
            return None

        if len(importance) != len(feature_names):
            print(f"[경고] importance 길이({len(importance)}) != feature_names({len(feature_names)})")
            m = min(len(importance), len(feature_names))
            importance = np.asarray(importance)[:m]
            feature_names = list(feature_names)[:m]

        importance_df = pd.DataFrame({
            'feature': feature_names,
            'importance': importance
        }).sort_values('importance', ascending=False)

        return importance_df
    except Exception as e:
        print(f"Feature importance 추출 실패 ({model_name}): {e}")
        return None

def plot_feature_importance(importance_df, model_name, top_n=15):
    """Feature Importance 시각화"""
    if importance_df is None or len(importance_df) == 0:
        return None
    
    fig, ax = plt.subplots(figsize=(10, 8))
    top_features = importance_df.head(top_n)
    
    ax.barh(range(len(top_features)), top_features['importance'], color='skyblue')
    ax.set_yticks(range(len(top_features)))
    ax.set_yticklabels(top_features['feature'])
    ax.set_xlabel('Importance')
    ax.set_title(f'{model_name} - Top {top_n} Feature Importance')
    ax.invert_yaxis()
    
    for i, v in enumerate(top_features['importance']):
        ax.text(v + 0.001, i, f'{v:.3f}', va='center')
    
    fig.tight_layout()
    return fig

def analyze_model_with_shap(model, X_test, feature_names, model_name, max_samples=100):
    """SHAP 분석"""
    if not HAS_SHAP:
        print("SHAP 라이브러리가 설치되지 않았습니다.")
        return None
    
    try:
        if len(X_test) > max_samples:
            sample_idx = np.random.choice(len(X_test), max_samples, replace=False)
            X_sample = X_test.iloc[sample_idx]
        else:
            X_sample = X_test
        
        X_processed = model.named_steps['pre'].transform(X_sample)
        
        if 'RandomForest' in model_name or 'GradientBoosting' in model_name:
            explainer = shap.TreeExplainer(model.named_steps['model'])
        elif 'XGBoost' in model_name:
            explainer = shap.TreeExplainer(model.named_steps['model'])
        elif 'LightGBM' in model_name:
            explainer = shap.TreeExplainer(model.named_steps['model'])
        elif 'CatBoost' in model_name:
            explainer = shap.TreeExplainer(model.named_steps['model'])
        else:
            explainer = shap.LinearExplainer(model.named_steps['model'], X_processed)
        
        shap_values = explainer.shap_values(X_processed)
        
        if isinstance(shap_values, list):
            shap_values = shap_values[0]
        
        return shap_values, X_processed, explainer
        
    except Exception as e:
        print(f"SHAP 분석 실패 ({model_name}): {e}")
        return None

def plot_shap_summary(shap_values, X_processed, feature_names, model_name):
    """SHAP Summary Plot"""
    if shap_values is None or not HAS_SHAP:
        return []

    figs = []
    try:
        # Bar plot
        plt.figure(figsize=(10, 8))
        shap.summary_plot(shap_values, X_processed,
                          feature_names=feature_names,
                          plot_type="bar", show=False)
        ax = plt.gca()
        ax.set_title(f'{model_name} - SHAP Feature Importance')
        figs.append(plt.gcf())

        # Beeswarm plot
        plt.figure(figsize=(10, 8))
        shap.summary_plot(shap_values, X_processed,
                          feature_names=feature_names,
                          show=False)
        ax = plt.gca()
        ax.set_title(f'{model_name} - SHAP Summary Plot')
        figs.append(plt.gcf())

    except Exception as e:
        print(f"SHAP 시각화 실패 ({model_name}): {e}")

    return figs

# ================================================================================================
# 5. 피처 엔지니어링 함수들 (Data Leakage 방지 버전)
# ================================================================================================
def make_features(df, cutoff_date=None):
    """파생변수 생성 함수 - Data Leakage 방지 버전"""
    df = df.copy()
    
    df['날짜'] = pd.to_datetime(df['날짜'])
    df = df.sort_values('날짜').reset_index(drop=True)

    df['월'] = df['날짜'].dt.month
    df['요일'] = df['날짜'].dt.weekday

    season_map = {'봄': 0, '여름': 1, '가을': 2, '겨울': 3}
    discomfort_map = {'쾌적': 0, '약간 불쾌': 1, '불쾌': 2, '매우 불쾌': 3, '극심한 불쾌': 4}
    df['계절'] = df['계절'].map(season_map).astype('Int64')
    df['불쾌지수등급'] = df['불쾌지수등급'].map(discomfort_map).astype('Int64')

    # 강수량 시차 피처
    df['강수량_1일전'] = df['일_일강수량(mm)'].shift(1)
    df['강수량_2일전'] = df['일_일강수량(mm)'].shift(2)
    df['강수량_1일_누적'] = df['일_일강수량(mm)'].rolling(1, min_periods=1).sum()
    df['강수량_2일_누적'] = df['일_일강수량(mm)'].rolling(2, min_periods=1).sum()
    df['강수량_3일_누적'] = df['일_일강수량(mm)'].rolling(3, min_periods=1).sum()
    df['강수량_5일_누적'] = df['일_일강수량(mm)'].rolling(5, min_periods=1).sum()
    df['강수량_7일_누적'] = df['일_일강수량(mm)'].rolling(7, min_periods=1).sum()

    df['일교차'] = df['일_최고기온(°C)'] - df['일_최저기온(°C)']
    df['폭우_여부'] = (df['일_일강수량(mm)'] >= 80).astype(int)
    
    # 체감온도 계산
    if '일_평균기온(°C)' in df.columns:
        T = pd.to_numeric(df['일_평균기온(°C)'], errors='coerce')
    else:
        T = pd.Series(np.nan, index=df.index)
    if '일_평균풍속(m/s)' in df.columns:
        V_ms = pd.to_numeric(df['일_평균풍속(m/s)'], errors='coerce')
    else:
        V_ms = pd.Series(np.nan, index=df.index)
    if '평균습도(%)' in df.columns:
        RH = pd.to_numeric(df['평균습도(%)'], errors='coerce')
    else:
        RH = pd.Series(np.nan, index=df.index)

    # 체감온도 계산 (간단 버전)
    e = (RH/100.0) * 6.105 * np.exp(17.27*T/(237.7 + T))
    df['체감온도(°C)'] = T + 0.33*e - 0.70*V_ms - 4.00
    
    # 분류용 등급 계산
    q = df['합계'].dropna().quantile([0.15, 0.70, 0.90])
    q15, q70, q90 = float(q.loc[0.15]), float(q.loc[0.70]), float(q.loc[0.90])

    def categorize(x):
        if pd.isna(x):
            return np.nan
        if x < q15:
            return 0
        elif x < q70:
            return 1
        elif x < q90:
            return 2
        else:
            return 3

    df['등급'] = df['합계'].apply(categorize)
    
    # 타겟 변수 생성 (Data Leakage 방지)
    if cutoff_date is not None:
        cutoff = pd.to_datetime(cutoff_date)
        
        df['합계_1일후'] = np.nan
        df['합계_2일후'] = np.nan
        df['등급_1일후'] = np.nan
        df['등급_2일후'] = np.nan
        
        for i in range(len(df)):
            current_date = df.loc[i, '날짜']
            
            if i + 1 < len(df) and current_date <= cutoff:
                next_date = df.loc[i+1, '날짜']
                if next_date <= cutoff:
                    df.loc[i, '합계_1일후'] = df.loc[i+1, '합계']
                    df.loc[i, '등급_1일후'] = df.loc[i+1, '등급']
            
            if i + 2 < len(df) and current_date <= cutoff:
                next2_date = df.loc[i+2, '날짜']
                if next2_date <= cutoff:
                    df.loc[i, '합계_2일후'] = df.loc[i+2, '합계']
                    df.loc[i, '등급_2일후'] = df.loc[i+2, '등급']
    else:
        df['합계_1일후'] = df['합계'].shift(-1)
        df['합계_2일후'] = df['합계'].shift(-2)
        df['등급_1일후'] = df['등급'].shift(-1).astype('Int64')
        df['등급_2일후'] = df['등급'].shift(-2).astype('Int64')

    df.attrs['cutoffs'] = {"q15": q15, "q70": q70, "q90": q90}
    df = df.dropna().reset_index(drop=True)
    df = df[df["날짜"] < "2025-06-01"]
    
    return df

def make_features_for_prediction(historical_df, future_df):
    """새로운 데이터에 대한 파생변수 생성 (과거 데이터 활용)"""
    combined_df = pd.concat([historical_df, future_df], ignore_index=True)
    combined_df['날짜'] = pd.to_datetime(combined_df['날짜'])
    combined_df = combined_df.sort_values('날짜').reset_index(drop=True)
    
    combined_df['월'] = combined_df['날짜'].dt.month
    combined_df['요일'] = combined_df['날짜'].dt.weekday
    
    season_map = {'봄': 0, '여름': 1, '가을': 2, '겨울': 3}
    discomfort_map = {'쾌적': 0, '약간 불쾌': 1, '불쾌': 2, '매우 불쾌': 3, '극심한 불쾌': 4}
    combined_df['계절'] = combined_df['계절'].map(season_map).astype('Int64')
    combined_df['불쾌지수등급'] = combined_df['불쾌지수등급'].map(discomfort_map).astype('Int64')
    
    # 시차 변수들
    combined_df['강수량_1일전'] = combined_df['일_일강수량(mm)'].shift(1)
    combined_df['강수량_2일전'] = combined_df['일_일강수량(mm)'].shift(2)
    combined_df['강수량_1일_누적'] = combined_df['일_일강수량(mm)'].rolling(1, min_periods=1).sum()
    combined_df['강수량_2일_누적'] = combined_df['일_일강수량(mm)'].rolling(2, min_periods=1).sum()
    combined_df['강수량_3일_누적'] = combined_df['일_일강수량(mm)'].rolling(3, min_periods=1).sum()
    combined_df['강수량_5일_누적'] = combined_df['일_일강수량(mm)'].rolling(5, min_periods=1).sum()
    combined_df['강수량_7일_누적'] = combined_df['일_일강수량(mm)'].rolling(7, min_periods=1).sum()
    
    combined_df['일교차'] = combined_df['일_최고기온(°C)'] - combined_df['일_최저기온(°C)']
    combined_df['폭우_여부'] = (combined_df['일_일강수량(mm)'] >= 80).astype(int)
    
    # 체감온도 계산
    T = pd.to_numeric(combined_df.get('일_평균기온(°C)', np.nan), errors='coerce')
    V_ms = pd.to_numeric(combined_df.get('일_평균풍속(m/s)', np.nan), errors='coerce')
    RH = pd.to_numeric(combined_df.get('평균습도(%)', np.nan), errors='coerce')
    
    e = (RH/100.0) * 6.105 * np.exp(17.27*T/(237.7 + T))
    combined_df['체감온도(°C)'] = T + 0.33*e - 0.70*V_ms - 4.00
    
    # 새 데이터 부분만 반환
    historical_len = len(historical_df)
    return combined_df.iloc[historical_len:].reset_index(drop=True)

# ================================================================================================
# 6. 유틸리티 함수들
# ================================================================================================
def load_original_data():
    """원본 데이터 로드"""
    nanji_raw = pd.read_csv('../data/processed/center_season/nanji/난지_merged.csv', encoding='utf-8-sig')
    jungnang_raw = pd.read_csv('../data/processed/center_season/jungnang/중랑_merged.csv', encoding='utf-8-sig')
    seonam_raw = pd.read_csv('../data/processed/center_season/seonam/서남_merged.csv', encoding='utf-8-sig')
    tancheon_raw = pd.read_csv('../data/processed/center_season/tancheon/탄천_merged.csv', encoding='utf-8-sig')
    
    return {
        "nanji": nanji_raw,
        "jungnang": jungnang_raw,
        "seonam": seonam_raw,
        "tancheon": tancheon_raw
    }

def prepare_prediction_features(future_data, expected_features):
    """예측용 피처 준비"""
    not_use_col = [
        '날짜', '1처리장','2처리장','정화조','중계펌프장','합계','시설현대화',
        '3처리장','4처리장','합계', '합계_1일후','합계_2일후',
        '등급','등급_1일후','등급_2일후'
    ]
    
    available_cols = [col for col in future_data.columns if col not in not_use_col]
    X_future = future_data[available_cols].copy()
    
    for c in X_future.columns:
        X_future[c] = pd.to_numeric(X_future[c], errors="coerce")
    
    missing_features = set(expected_features) - set(X_future.columns)
    if missing_features:
        for feature in missing_features:
            X_future[feature] = 0
    
    X_future = X_future[expected_features].copy()
    return X_future

def create_performance_summary(results_df):
    """성능 요약 생성"""
    summary = {}
    
    for center in results_df['center'].unique():
        center_data = results_df[results_df['center'] == center]
        summary[center] = {}
        
        for task_type in ['regression', 'classification']:
            task_data = center_data[center_data['task_type'] == task_type]
            task_data_clean = task_data.dropna(subset=['actual_value', 'predicted_value'])
            
            if len(task_data_clean) > 0:
                if task_type == 'regression':
                    # 회귀 지표 계산
                    abs_errors = np.abs(task_data_clean['actual_value'] - task_data_clean['predicted_value'])
                    squared_errors = (task_data_clean['actual_value'] - task_data_clean['predicted_value']) ** 2
                    pct_errors = abs_errors / (np.abs(task_data_clean['actual_value']) + 1e-8) * 100
                    
                    # R² 계산
                    y_true = task_data_clean['actual_value'].values
                    y_pred = task_data_clean['predicted_value'].values
                    ss_res = np.sum((y_true - y_pred) ** 2)
                    ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)
                    r2 = 1 - (ss_res / (ss_tot + 1e-8))
                    
                    summary[center]['regression'] = {
                        'model_name': task_data_clean.iloc[0]['model_name'],
                        'prediction_count': len(task_data_clean),
                        'mae': abs_errors.mean(),
                        'rmse': np.sqrt(squared_errors.mean()),
                        'mape': pct_errors.mean(),
                        'r2_on_predictions': r2
                    }
                else:
                    # 분류 지표 계산
                    correct = (task_data_clean['actual_value'] == task_data_clean['predicted_value']).astype(int)
                    
                    summary[center]['classification'] = {
                        'model_name': task_data_clean.iloc[0]['model_name'],
                        'prediction_count': len(task_data_clean),
                        'accuracy': correct.mean(),
                        'correct_count': int(correct.sum()),
                        'total_count': len(task_data_clean)
                    }
    
    return summary

# ================================================================================================
# 7. 모든 기반 함수 준비 완료 메시지
# ================================================================================================
print("="*60)
print("모든 기반 함수들이 로드되었습니다!")
print("이제 센터별 완전 분석 파이프라인을 실행할 수 있습니다.")
print("="*60)
print()
print("다음 단계:")
print("1. 센터별 완전 분석 파이프라인 코드를 실행하세요")
print("2. run_complete_analysis() 함수를 실행하세요")
print()
print("필요한 라이브러리 확인:")
print(f"- XGBoost: {'✓' if HAS_XGB else '✗'}")
print(f"- LightGBM: {'✓' if HAS_LGB else '✗'}")
print(f"- CatBoost: {'✓' if HAS_CATBOOST else '✗'}")
print(f"- SHAP: {'✓' if HAS_SHAP else '✗'}")
print("="*60)

모든 기반 함수들이 로드되었습니다!
이제 센터별 완전 분석 파이프라인을 실행할 수 있습니다.

다음 단계:
1. 센터별 완전 분석 파이프라인 코드를 실행하세요
2. run_complete_analysis() 함수를 실행하세요

필요한 라이브러리 확인:
- XGBoost: ✓
- LightGBM: ✓
- CatBoost: ✓
- SHAP: ✓


In [12]:
# ================================================================================================
# 센터별 완전 분석 파이프라인 (회귀/분류 각각 + 시각화 포함)
# ================================================================================================


def run_complete_center_analysis(cutoff_date='2025-05-20', save_visualizations=True):
    """
    센터별 완전 분석 파이프라인
    - 4개 센터 x 2개 태스크(회귀/분류) = 총 8개 분석
    - 각 센터별 최고 성능 모델 선택
    - Feature Importance + SHAP 시각화 포함
    - 최종 예측 수행
    """
    
    print(f"{'='*80}")
    print(f"센터별 완전 분석 파이프라인 시작")
    print(f"학습 기간: ~ {cutoff_date}")
    print(f"예측 기간: {cutoff_date} 이후")
    print(f"{'='*80}")
    
    # 데이터 로드
    try:
        centers_data = load_original_data()
        print(f"데이터 로드 완료:")
        for name, df in centers_data.items():
            print(f"  {name}: {len(df)}행")
    except Exception as e:
        print(f"데이터 로드 실패: {e}")
        return None
    
    # 결과 저장용
    all_results = {}
    all_training_results = []
    all_predictions = []
    best_models_summary = []
    
    # 각 센터별 개별 분석
    for center_name, df_raw in centers_data.items():
        print(f"\n{'='*60}")
        print(f"센터: {center_name.upper()}")
        print(f"{'='*60}")
        
        # 센터별 분석 실행
        center_results = analyze_single_center_complete(
            center_name, df_raw, cutoff_date, save_visualizations
        )
        
        if center_results:
            all_results[center_name] = center_results
            all_training_results.extend(center_results['training_results'])
            all_predictions.extend(center_results['predictions'])
            best_models_summary.extend(center_results['best_models_info'])
    
    # 전체 결과 요약
    print(f"\n{'='*80}")
    print(f"전체 분석 결과 요약")
    print(f"{'='*80}")
    
    # 센터별 최고 성능 모델 요약
    print_best_models_summary(best_models_summary)
    
    # 예측 성능 요약
    if all_predictions:
        final_results_df = pd.DataFrame(all_predictions)
        performance_summary = create_performance_summary(final_results_df)
        print_prediction_performance(performance_summary)
        
        # 결과 저장
        save_complete_results(final_results_df, all_training_results, best_models_summary)
    
    return {
        'center_results': all_results,
        'predictions': all_predictions,
        'training_results': all_training_results,
        'best_models': best_models_summary
    }

def analyze_single_center_complete(center_name, df_raw, cutoff_date, save_visualizations=True):
    """단일 센터 완전 분석"""
    
    print(f"\n[{center_name.upper()} 센터 분석 시작]")
    
    try:
        # 1. 데이터 준비
        df_raw['날짜'] = pd.to_datetime(df_raw['날짜'])
        df_raw = df_raw.sort_values('날짜').reset_index(drop=True)
        cutoff = pd.to_datetime(cutoff_date)
        
        # 학습/예측 데이터 분할
        raw_train_data = df_raw[df_raw['날짜'] <= cutoff].copy()
        raw_future_data = df_raw[df_raw['날짜'] > cutoff].copy()
        
        print(f"  학습 데이터: {len(raw_train_data)}행")
        print(f"  예측 데이터: {len(raw_future_data)}행")
        
        if len(raw_train_data) < 50:
            print(f"  학습 데이터 부족")
            return None
        
        # 2. 피처 엔지니어링 (Data Leakage 방지)
        train_data = make_features(raw_train_data, cutoff_date=cutoff_date)
        print(f"  피처 생성 완료: {len(train_data)}행")
        
        # 3. 모델 학습 및 평가
        training_results = comprehensive_evaluation_comparison(center_name, train_data)
        
        # 4. 최고 성능 모델 선택 및 상세 분석
        best_models = select_best_models_for_center(center_name, train_data, training_results)
        best_models_info = []
        
        if best_models:
            # 각 태스크별 최고 모델 상세 분석
            for task_type, model_info in best_models.items():
                print(f"\n--- {center_name.upper()} {task_type.upper()} 최고 모델 분석 ---")
                
                # 상세 분석 수행 (Feature Importance + SHAP)
                detailed_analysis = perform_detailed_model_analysis(
                    center_name, train_data, model_info, task_type, save_visualizations
                )
                
                if detailed_analysis:
                    model_info.update(detailed_analysis)
                    best_models_info.append({
                        'center': center_name,
                        'task_type': task_type,
                        'model_name': model_info['model_name'],
                        'performance': model_info['performance'],
                        'analysis_completed': True
                    })
        
        # 5. 예측 수행
        predictions = []
        if len(raw_future_data) > 0 and best_models:
            print(f"\n--- {center_name.upper()} 예측 수행 ---")
            
            # 예측용 데이터 준비
            future_data_processed = make_features_for_prediction(raw_train_data, raw_future_data)
            
            # 각 태스크별 예측
            for task_type, model_info in best_models.items():
                task_predictions = make_predictions_for_task(
                    center_name, future_data_processed, model_info, task_type
                )
                predictions.extend(task_predictions)
        
        return {
            'training_results': training_results,
            'best_models': best_models,
            'best_models_info': best_models_info,
            'predictions': predictions
        }
        
    except Exception as e:
        print(f"  센터 분석 실패: {e}")
        return None

def select_best_models_for_center(center_name, train_data, training_results):
    """센터별 최고 성능 모델 선택"""
    
    results_df = pd.DataFrame(training_results)
    successful_results = results_df[results_df['success'] == True]
    
    if len(successful_results) == 0:
        print(f"  성공한 모델이 없습니다.")
        return None
    
    best_models = {}
    
    # 회귀 최고 성능 모델
    reg_results = successful_results[successful_results['type'] == 'regression']
    if len(reg_results) > 0:
        best_reg = reg_results.loc[reg_results['r2'].idxmax()]
        print(f"  최고 회귀 모델: {best_reg['model']} (R²={best_reg['r2']:.3f})")
        
        # 모델 재학습
        reg_model = retrain_best_model_full(
            train_data, best_reg['model'], 'regression', best_reg['split_method']
        )
        
        if reg_model:
            best_models['regression'] = {
                'model_name': best_reg['model'],
                'pipeline': reg_model['pipeline'],
                'feature_names': reg_model['feature_names'],
                'performance': dict(best_reg),
                'split_method': best_reg['split_method']
            }
    
    # 분류 최고 성능 모델
    clf_results = successful_results[successful_results['type'] == 'classification']
    if len(clf_results) > 0:
        best_clf = clf_results.loc[clf_results['macro_f1'].idxmax()]
        print(f"  최고 분류 모델: {best_clf['model']} (F1={best_clf['macro_f1']:.3f})")
        
        # 모델 재학습
        clf_model = retrain_best_model_full(
            train_data, best_clf['model'], 'classification', best_clf['split_method']
        )
        
        if clf_model:
            best_models['classification'] = {
                'model_name': best_clf['model'],
                'pipeline': clf_model['pipeline'],
                'feature_names': clf_model['feature_names'],
                'performance': dict(best_clf),
                'split_method': best_clf['split_method']
            }
    
    return best_models

def retrain_best_model_full(train_data, model_name, model_type, split_method):
    """최고 성능 모델 전체 데이터로 재학습"""
    
    try:
        target_col = "합계_1일후" if model_type == "regression" else "등급_1일후"
        
        # 전체 데이터 사용
        X_train, X_test, y_train, y_test, feature_names, _, _ = prepare_data_stratified(
            train_data, target_col=target_col, model_type=model_type, 
            test_size=0.05, split_method=split_method
        )
        
        # 전체 데이터 결합
        X_all = pd.concat([X_train, X_test], ignore_index=True)
        y_all = pd.concat([y_train, y_test], ignore_index=True)
        
        # 모델 구축
        if model_type == "regression":
            models = build_regression_models()
        else:
            models = build_classification_models()
        
        model = models[model_name]
        pipeline = make_pipeline_unified(model, model_name, model_type)
        pipeline.fit(X_all, y_all)
        
        return {
            'pipeline': pipeline,
            'feature_names': feature_names
        }
        
    except Exception as e:
        print(f"    모델 재학습 실패: {e}")
        return None

def perform_detailed_model_analysis(center_name, train_data, model_info, task_type, save_visualizations=True):
    """상세 모델 분석 (Feature Importance + SHAP)"""
    
    try:
        pipeline = model_info['pipeline']
        feature_names = model_info['feature_names']
        model_name = model_info['model_name']
        
        # 테스트 데이터 준비
        target_col = "합계_1일후" if task_type == "regression" else "등급_1일후"
        X_train, X_test, y_train, y_test, _, _, _ = prepare_data_stratified(
            train_data, target_col=target_col, model_type=task_type, 
            test_size=0.2, split_method=model_info['split_method']
        )
        
        analysis_results = {}
        
        # 1. Feature Importance 분석
        print(f"    Feature Importance 분석...")
        importance_df = extract_feature_importance(pipeline, model_name, feature_names)
        
        if importance_df is not None:
            print(f"    Top 5 중요 피처:")
            for idx, row in importance_df.head(5).iterrows():
                print(f"      {row['feature']}: {row['importance']:.3f}")
            
            # 시각화
            if save_visualizations:
                fig = plot_feature_importance(importance_df, f"{center_name}_{model_name}")
                if fig:
                    save_path = f"feature_importance_{center_name}_{model_name}_{task_type}.png"
                    fig.savefig(save_path, dpi=300, bbox_inches='tight')
                    plt.close(fig)
                    print(f"    Feature Importance 시각화 저장: {save_path}")
            
            analysis_results['feature_importance'] = importance_df
        
        # 2. SHAP 분석
        print(f"    SHAP 분석...")
        try:
            shap_result = analyze_model_with_shap(pipeline, X_test, feature_names, model_name, max_samples=50)
            
            if shap_result:
                shap_values, X_processed, explainer = shap_result
                
                # SHAP 시각화
                if save_visualizations:
                    shap_figs = plot_shap_summary(shap_values, X_processed, feature_names, 
                                                f"{center_name}_{model_name}")
                    
                    for i, fig in enumerate(shap_figs):
                        suffix = "bar" if i == 0 else "beeswarm"
                        save_path = f"shap_{suffix}_{center_name}_{model_name}_{task_type}.png"
                        fig.savefig(save_path, dpi=300, bbox_inches='tight')
                        plt.close(fig)
                        print(f"    SHAP {suffix} 시각화 저장: {save_path}")
                
                analysis_results['shap_analysis'] = {
                    'shap_values': shap_values,
                    'feature_names': feature_names
                }
        
        except Exception as e:
            print(f"    SHAP 분석 실패: {e}")
        
        return analysis_results
        
    except Exception as e:
        print(f"    상세 분석 실패: {e}")
        return None

def make_predictions_for_task(center_name, future_data, model_info, task_type):
    """태스크별 예측 수행"""
    
    predictions = []
    
    try:
        pipeline = model_info['pipeline']
        feature_names = model_info['feature_names']
        model_name = model_info['model_name']
        
        # 예측 데이터 준비
        X_future = prepare_prediction_features(future_data, feature_names)
        
        if X_future is None or len(X_future) == 0:
            return predictions
        
        # 예측 수행
        y_pred = pipeline.predict(X_future)
        target_col = "합계_1일후" if task_type == "regression" else "등급_1일후"
        
        print(f"    {task_type} 예측 완료: {len(y_pred)}개")
        
        # 결과 저장
        for i in range(len(X_future)):
            # 실제값 추출 (가능한 경우)
            actual_val = None
            if target_col in future_data.columns and i < len(future_data):
                actual_val = future_data.iloc[i].get(target_col)
                if pd.notna(actual_val):
                    actual_val = int(actual_val) if task_type == 'classification' else float(actual_val)
            
            pred_result = {
                'date': future_data.iloc[i]['날짜'],
                'center': center_name,
                'task_type': task_type,
                'model_name': model_name,
                'target_column': target_col,
                'actual_value': actual_val,
                'predicted_value': int(y_pred[i]) if task_type == 'classification' else float(y_pred[i])
            }
            predictions.append(pred_result)
            
    except Exception as e:
        print(f"    {task_type} 예측 실패: {e}")
    
    return predictions

def print_best_models_summary(best_models_summary):
    """최고 성능 모델 요약 출력"""
    
    print(f"\n--- 센터별 최고 성능 모델 요약 ---")
    
    centers = {}
    for model in best_models_summary:
        center = model['center']
        if center not in centers:
            centers[center] = {}
        centers[center][model['task_type']] = model
    
    for center, tasks in centers.items():
        print(f"\n{center.upper()} 센터:")
        
        if 'regression' in tasks:
            reg = tasks['regression']
            perf = reg['performance']
            print(f"  회귀: {reg['model_name']} (R²={perf['r2']:.3f}, MAE={perf['mae']:.1f})")
        
        if 'classification' in tasks:
            clf = tasks['classification']
            perf = clf['performance']
            print(f"  분류: {clf['model_name']} (F1={perf['macro_f1']:.3f}, ACC={perf['accuracy']:.3f})")

def print_prediction_performance(performance_summary):
    """예측 성능 출력"""
    
    print(f"\n--- 예측 성능 요약 ---")
    
    for center, perf in performance_summary.items():
        print(f"\n{center.upper()} 센터:")
        
        if 'regression' in perf:
            reg = perf['regression']
            print(f"  회귀 예측 ({reg['prediction_count']}개):")
            print(f"    MAE: {reg['mae']:.2f}")
            print(f"    RMSE: {reg['rmse']:.2f}")
            if reg.get('r2_on_predictions'):
                print(f"    R²: {reg['r2_on_predictions']:.3f}")
        
        if 'classification' in perf:
            clf = perf['classification']
            print(f"  분류 예측 ({clf['prediction_count']}개):")
            print(f"    정확도: {clf['accuracy']:.1%}")
            print(f"    정답: {clf['correct_count']}/{clf['total_count']}")

def save_complete_results(predictions_df, training_results, best_models_summary):
    """결과 저장"""
    
    timestamp = time.strftime("%Y%m%d_%H%M%S")
    
    # 1. 예측 결과
    pred_filename = f"complete_analysis_predictions_{timestamp}.csv"
    predictions_df.to_csv(pred_filename, index=False, encoding='utf-8-sig')
    print(f"\n예측 결과 저장: {pred_filename}")
    
    # 2. 학습 결과
    if training_results:
        train_df = pd.DataFrame(training_results)
        train_filename = f"complete_analysis_training_{timestamp}.csv"
        train_df.to_csv(train_filename, index=False, encoding='utf-8-sig')
        print(f"학습 결과 저장: {train_filename}")
    
    # 3. 최고 모델 요약
    if best_models_summary:
        best_df = pd.DataFrame(best_models_summary)
        best_filename = f"complete_analysis_best_models_{timestamp}.csv"
        best_df.to_csv(best_filename, index=False, encoding='utf-8-sig')
        print(f"최고 모델 요약 저장: {best_filename}")

# ================================================================================================
# 실행 함수
# ================================================================================================

def run_complete_analysis(cutoff_date='2025-05-20'):
    """완전 분석 실행"""
    
    print("센터별 완전 분석 파이프라인을 시작합니다...")
    print("이 과정은 시간이 오래 걸릴 수 있습니다.")
    
    start_time = time.time()
    
    # 분석 실행
    results = run_complete_center_analysis(cutoff_date=cutoff_date, save_visualizations=True)
    
    end_time = time.time()
    elapsed_time = end_time - start_time
    
    print(f"\n{'='*80}")
    print(f"완전 분석 완료!")
    print(f"총 소요시간: {elapsed_time:.1f}초 ({elapsed_time/60:.1f}분)")
    print(f"분석된 센터: {len(results['center_results'])}개")
    print(f"생성된 예측: {len(results['predictions'])}개")
    print(f"시각화 파일들이 현재 디렉토리에 저장되었습니다.")
    print(f"{'='*80}")
    
    return results

# 사용 예시
if __name__ == "__main__":
    print("=== 센터별 완전 분석 파이프라인 ===")
    print()
    print("이 파이프라인은 다음을 수행합니다:")
    print("1. 4개 센터별 개별 분석")
    print("2. 센터별 회귀/분류 최고 성능 모델 선택")
    print("3. Feature Importance + SHAP 시각화 생성")
    print("4. 선택된 모델로 예측 수행")
    print("5. 결과 CSV 파일 및 시각화 이미지 저장")
    print()
    print("실행 방법:")
    print("results = run_complete_analysis(cutoff_date='2025-05-20')")
    print()
    print("생성되는 파일:")
    print("- complete_analysis_predictions_YYYYMMDD_HHMMSS.csv")
    print("- complete_analysis_training_YYYYMMDD_HHMMSS.csv")
    print("- complete_analysis_best_models_YYYYMMDD_HHMMSS.csv")
    print("- feature_importance_센터명_모델명_태스크.png")
    print("- shap_bar_센터명_모델명_태스크.png")
    print("- shap_beeswarm_센터명_모델명_태스크.png")

=== 센터별 완전 분석 파이프라인 ===

이 파이프라인은 다음을 수행합니다:
1. 4개 센터별 개별 분석
2. 센터별 회귀/분류 최고 성능 모델 선택
3. Feature Importance + SHAP 시각화 생성
4. 선택된 모델로 예측 수행
5. 결과 CSV 파일 및 시각화 이미지 저장

실행 방법:
results = run_complete_analysis(cutoff_date='2025-05-20')

생성되는 파일:
- complete_analysis_predictions_YYYYMMDD_HHMMSS.csv
- complete_analysis_training_YYYYMMDD_HHMMSS.csv
- complete_analysis_best_models_YYYYMMDD_HHMMSS.csv
- feature_importance_센터명_모델명_태스크.png
- shap_bar_센터명_모델명_태스크.png
- shap_beeswarm_센터명_모델명_태스크.png


In [13]:
# 1. 선행 코드 실행 (모든 import, 모델 정의, 평가 함수들)
# (위 코드 전체 실행)

# 2. 센터별 완전 분석 파이프라인 코드 실행
# (complete_center_analysis 아티팩트의 코드 실행)

# 3. 분석 실행
results = run_complete_analysis(cutoff_date='2025-05-20')

센터별 완전 분석 파이프라인을 시작합니다...
이 과정은 시간이 오래 걸릴 수 있습니다.
센터별 완전 분석 파이프라인 시작
학습 기간: ~ 2025-05-20
예측 기간: 2025-05-20 이후
데이터 로드 완료:
  nanji: 3103행
  jungnang: 3103행
  seonam: 3103행
  tancheon: 3103행

센터: NANJI

[NANJI 센터 분석 시작]
  학습 데이터: 3062행
  예측 데이터: 41행
  피처 생성 완료: 3058행

센터: nanji - Stratified vs 시계열 분할 비교
데이터 크기: 3058행, 44컬럼
등급 분포: {0.0: 458, 1.0: 1682, 2.0: 611, 3.0: 307}
클래스 불균형 비율: 5.5:1 (최대:1682, 최소:307)

분할 방법: TEMPORAL

--- 회귀 모델 평가 (temporal) ---
회귀용 데이터: 학습 2446행, 테스트 612행


회귀(temporal):  17%|█▋        | 1/6 [00:00<00:03,  1.28it/s]

  RandomForest_Reg  : R²=0.557, MAE=49542, MAPE=7.3%
  LinearRegression  : R²=0.513, MAE=62439, MAPE=9.7%


회귀(temporal):  50%|█████     | 3/6 [00:02<00:02,  1.23it/s]

  GradientBoosting_Reg: R²=0.395, MAE=58665, MAPE=9.0%


회귀(temporal):  67%|██████▋   | 4/6 [00:02<00:01,  1.42it/s]

  XGBoost_Reg       : R²=0.480, MAE=56356, MAPE=8.4%


회귀(temporal):  83%|████████▎ | 5/6 [00:04<00:01,  1.14s/it]

  LightGBM_Reg      : R²=0.492, MAE=54005, MAPE=8.0%


                                                             

  CatBoost_Reg      : R²=0.540, MAE=48369, MAPE=7.1%

--- 분류 모델 평가 (temporal) ---
분류용 데이터: 학습 2446행, 테스트 612행
학습 세트 등급 분포: {0: 453, 1: 1309, 2: 435, 3: 249}
테스트 세트 등급 분포: {0: 5, 1: 373, 2: 176, 3: 58}


분류(temporal):  17%|█▋        | 1/6 [00:00<00:01,  3.38it/s]

  RandomForest_Clf  : ACC=0.748, F1=0.481, 극값F1=0.500


분류(temporal):  33%|███▎      | 2/6 [00:06<00:15,  3.98s/it]

  GradientBoosting_Clf: ACC=0.665, F1=0.468, 극값F1=0.386
  LogisticRegression_Clf: ACC=0.314, F1=0.308, 극값F1=0.199


분류(temporal):  67%|██████▋   | 4/6 [00:08<00:04,  2.03s/it]

  XGBoost_Clf       : ACC=0.722, F1=0.567, 극값F1=0.497


분류(temporal):  83%|████████▎ | 5/6 [00:15<00:03,  3.60s/it]

  LightGBM_Clf      : ACC=0.694, F1=0.527, 극값F1=0.432


                                                             

  CatBoost_Clf      : ACC=0.672, F1=0.508, 극값F1=0.404

분할 방법: STRATIFIED

--- 회귀 모델 평가 (random_shuffle) ---
회귀용 데이터: 학습 2446행, 테스트 612행


회귀(random_shuffle):  17%|█▋        | 1/6 [00:00<00:03,  1.34it/s]

  RandomForest_Reg  : R²=0.612, MAE=47300, MAPE=7.0%
  LinearRegression  : R²=0.592, MAE=50530, MAPE=7.5%


회귀(random_shuffle):  50%|█████     | 3/6 [00:02<00:02,  1.25it/s]

  GradientBoosting_Reg: R²=0.620, MAE=46878, MAPE=6.9%


회귀(random_shuffle):  67%|██████▋   | 4/6 [00:02<00:01,  1.45it/s]

  XGBoost_Reg       : R²=0.637, MAE=45859, MAPE=6.8%


회귀(random_shuffle):  83%|████████▎ | 5/6 [00:04<00:01,  1.02s/it]

  LightGBM_Reg      : R²=0.605, MAE=48431, MAPE=7.2%


                                                                   

  CatBoost_Reg      : R²=0.648, MAE=44875, MAPE=6.6%

--- 분류 모델 평가 (stratified) ---
분류용 데이터: 학습 2446행, 테스트 612행
학습 세트 등급 분포: {0: 366, 1: 1345, 2: 489, 3: 246}
테스트 세트 등급 분포: {0: 92, 1: 337, 2: 122, 3: 61}


분류(stratified):  17%|█▋        | 1/6 [00:00<00:01,  3.61it/s]

  RandomForest_Clf  : ACC=0.725, F1=0.664, 극값F1=0.660


분류(stratified):  33%|███▎      | 2/6 [00:06<00:15,  3.92s/it]

  GradientBoosting_Clf: ACC=0.725, F1=0.658, 극값F1=0.648
  LogisticRegression_Clf: ACC=0.613, F1=0.578, 극값F1=0.597


분류(stratified):  67%|██████▋   | 4/6 [00:08<00:04,  2.05s/it]

  XGBoost_Clf       : ACC=0.745, F1=0.667, 극값F1=0.645


분류(stratified):  83%|████████▎ | 5/6 [00:15<00:03,  3.55s/it]

  LightGBM_Clf      : ACC=0.735, F1=0.665, 극값F1=0.636


                                                               

  CatBoost_Clf      : ACC=0.675, F1=0.630, 극값F1=0.632
  최고 회귀 모델: CatBoost_Reg (R²=0.648)




  최고 분류 모델: XGBoost_Clf (F1=0.667)

--- NANJI REGRESSION 최고 모델 분석 ---
    Feature Importance 분석...
    Top 5 중요 피처:
      강수량_7일_누적: 12.240
      하천: 5.756
      강수량_1일_누적: 5.337
      세탁업: 5.043
      체감온도(°C): 4.405
    Feature Importance 시각화 저장: feature_importance_nanji_CatBoost_Reg_regression.png
    SHAP 분석...
    SHAP bar 시각화 저장: shap_bar_nanji_CatBoost_Reg_regression.png
    SHAP beeswarm 시각화 저장: shap_beeswarm_nanji_CatBoost_Reg_regression.png

--- NANJI CLASSIFICATION 최고 모델 분석 ---
    Feature Importance 분석...
    Top 5 중요 피처:
      체감온도(°C): 0.071
      일_최저기온(°C): 0.058
      강수량_7일_누적: 0.058
      강수량_2일_누적: 0.054
      목욕장업: 0.047
    Feature Importance 시각화 저장: feature_importance_nanji_XGBoost_Clf_classification.png
    SHAP 분석...
    SHAP bar 시각화 저장: shap_bar_nanji_XGBoost_Clf_classification.png
    SHAP beeswarm 시각화 저장: shap_beeswarm_nanji_XGBoost_Clf_classification.png

--- NANJI 예측 수행 ---
    regression 예측 완료: 41개
    classification 예측 완료: 41개

센터: JUNGNANG

[JUNGNANG 센터

회귀(temporal):  17%|█▋        | 1/6 [00:00<00:03,  1.30it/s]

  RandomForest_Reg  : R²=0.304, MAE=91088, MAPE=7.1%
  LinearRegression  : R²=0.055, MAE=131262, MAPE=10.7%


회귀(temporal):  50%|█████     | 3/6 [00:02<00:02,  1.20it/s]

  GradientBoosting_Reg: R²=0.137, MAE=105765, MAPE=8.4%


회귀(temporal):  67%|██████▋   | 4/6 [00:02<00:01,  1.43it/s]

  XGBoost_Reg       : R²=0.269, MAE=96690, MAPE=7.6%


회귀(temporal):  83%|████████▎ | 5/6 [00:04<00:01,  1.15s/it]

  LightGBM_Reg      : R²=0.254, MAE=96761, MAPE=7.6%


                                                             

  CatBoost_Reg      : R²=0.259, MAE=99822, MAPE=7.9%

--- 분류 모델 평가 (temporal) ---
분류용 데이터: 학습 2446행, 테스트 612행
학습 세트 등급 분포: {0: 405, 1: 1321, 2: 458, 3: 262}
테스트 세트 등급 분포: {0: 55, 1: 359, 2: 153, 3: 45}


분류(temporal):  17%|█▋        | 1/6 [00:00<00:01,  3.57it/s]

  RandomForest_Clf  : ACC=0.627, F1=0.425, 극값F1=0.311


분류(temporal):  33%|███▎      | 2/6 [00:06<00:16,  4.05s/it]

  GradientBoosting_Clf: ACC=0.547, F1=0.380, 극값F1=0.303
  LogisticRegression_Clf: ACC=0.359, F1=0.296, 극값F1=0.312


분류(temporal):  67%|██████▋   | 4/6 [00:08<00:04,  2.03s/it]

  XGBoost_Clf       : ACC=0.541, F1=0.442, 극값F1=0.342


분류(temporal):  83%|████████▎ | 5/6 [00:15<00:03,  3.55s/it]

  LightGBM_Clf      : ACC=0.531, F1=0.468, 극값F1=0.375


                                                             

  CatBoost_Clf      : ACC=0.554, F1=0.411, 극값F1=0.325

분할 방법: STRATIFIED

--- 회귀 모델 평가 (random_shuffle) ---
회귀용 데이터: 학습 2446행, 테스트 612행


회귀(random_shuffle):  17%|█▋        | 1/6 [00:00<00:03,  1.37it/s]

  RandomForest_Reg  : R²=0.576, MAE=76240, MAPE=5.5%
  LinearRegression  : R²=0.522, MAE=89786, MAPE=6.6%


회귀(random_shuffle):  50%|█████     | 3/6 [00:02<00:02,  1.20it/s]

  GradientBoosting_Reg: R²=0.572, MAE=78344, MAPE=5.7%


회귀(random_shuffle):  67%|██████▋   | 4/6 [00:02<00:01,  1.39it/s]

  XGBoost_Reg       : R²=0.575, MAE=76245, MAPE=5.5%


회귀(random_shuffle):  83%|████████▎ | 5/6 [00:04<00:01,  1.04s/it]

  LightGBM_Reg      : R²=0.562, MAE=78893, MAPE=5.7%


                                                                   

  CatBoost_Reg      : R²=0.602, MAE=75195, MAPE=5.5%

--- 분류 모델 평가 (stratified) ---
분류용 데이터: 학습 2446행, 테스트 612행
학습 세트 등급 분포: {0: 368, 1: 1344, 2: 489, 3: 245}
테스트 세트 등급 분포: {0: 92, 1: 336, 2: 122, 3: 62}


분류(stratified):  17%|█▋        | 1/6 [00:00<00:01,  3.63it/s]

  RandomForest_Clf  : ACC=0.711, F1=0.629, 극값F1=0.606


분류(stratified):  33%|███▎      | 2/6 [00:07<00:16,  4.08s/it]

  GradientBoosting_Clf: ACC=0.730, F1=0.640, 극값F1=0.603
  LogisticRegression_Clf: ACC=0.560, F1=0.548, 극값F1=0.543


분류(stratified):  67%|██████▋   | 4/6 [00:08<00:04,  2.04s/it]

  XGBoost_Clf       : ACC=0.745, F1=0.661, 극값F1=0.626


분류(stratified):  83%|████████▎ | 5/6 [00:15<00:03,  3.56s/it]

  LightGBM_Clf      : ACC=0.739, F1=0.647, 극값F1=0.632


                                                               

  CatBoost_Clf      : ACC=0.691, F1=0.653, 극값F1=0.654
  최고 회귀 모델: CatBoost_Reg (R²=0.602)




  최고 분류 모델: XGBoost_Clf (F1=0.661)

--- JUNGNANG REGRESSION 최고 모델 분석 ---
    Feature Importance 분석...
    Top 5 중요 피처:
      강수량_7일_누적: 9.430
      세탁업: 6.933
      체력단련장업: 6.412
      목욕장업: 5.363
      하천: 4.950
    Feature Importance 시각화 저장: feature_importance_jungnang_CatBoost_Reg_regression.png
    SHAP 분석...
    SHAP bar 시각화 저장: shap_bar_jungnang_CatBoost_Reg_regression.png
    SHAP beeswarm 시각화 저장: shap_beeswarm_jungnang_CatBoost_Reg_regression.png

--- JUNGNANG CLASSIFICATION 최고 모델 분석 ---
    Feature Importance 분석...
    Top 5 중요 피처:
      수영장업: 0.066
      체력단련장업: 0.058
      강수량_7일_누적: 0.058
      목욕장업: 0.053
      일_최저기온(°C): 0.047
    Feature Importance 시각화 저장: feature_importance_jungnang_XGBoost_Clf_classification.png
    SHAP 분석...
    SHAP bar 시각화 저장: shap_bar_jungnang_XGBoost_Clf_classification.png
    SHAP beeswarm 시각화 저장: shap_beeswarm_jungnang_XGBoost_Clf_classification.png

--- JUNGNANG 예측 수행 ---
    regression 예측 완료: 41개
    classification 예측 완료: 41개

센터: SEONAM

[S

회귀(temporal):  17%|█▋        | 1/6 [00:00<00:04,  1.11it/s]

  RandomForest_Reg  : R²=0.037, MAE=157633, MAPE=10.9%
  LinearRegression  : R²=0.058, MAE=156666, MAPE=10.9%


회귀(temporal):  50%|█████     | 3/6 [00:02<00:02,  1.12it/s]

  GradientBoosting_Reg: R²=0.270, MAE=124493, MAPE=8.5%


회귀(temporal):  67%|██████▋   | 4/6 [00:03<00:01,  1.32it/s]

  XGBoost_Reg       : R²=0.244, MAE=127234, MAPE=8.7%


회귀(temporal):  83%|████████▎ | 5/6 [00:04<00:01,  1.07s/it]

  LightGBM_Reg      : R²=0.280, MAE=120014, MAPE=8.1%


                                                             

  CatBoost_Reg      : R²=0.327, MAE=124794, MAPE=8.5%

--- 분류 모델 평가 (temporal) ---
분류용 데이터: 학습 2446행, 테스트 612행
학습 세트 등급 분포: {0: 136, 1: 1493, 2: 544, 3: 273}
테스트 세트 등급 분포: {0: 323, 1: 187, 2: 68, 3: 34}


분류(temporal):  17%|█▋        | 1/6 [00:00<00:01,  3.65it/s]

  RandomForest_Clf  : ACC=0.304, F1=0.323, 극값F1=0.124


분류(temporal):  33%|███▎      | 2/6 [00:07<00:16,  4.16s/it]

  GradientBoosting_Clf: ACC=0.477, F1=0.432, 극값F1=0.593
  LogisticRegression_Clf: ACC=0.271, F1=0.306, 극값F1=0.225


분류(temporal):  67%|██████▋   | 4/6 [00:08<00:04,  2.07s/it]

  XGBoost_Clf       : ACC=0.317, F1=0.307, 극값F1=0.117


분류(temporal):  83%|████████▎ | 5/6 [00:15<00:03,  3.61s/it]

  LightGBM_Clf      : ACC=0.301, F1=0.265, 극값F1=0.057


                                                             

  CatBoost_Clf      : ACC=0.252, F1=0.265, 극값F1=0.149

분할 방법: STRATIFIED

--- 회귀 모델 평가 (random_shuffle) ---
회귀용 데이터: 학습 2446행, 테스트 612행


회귀(random_shuffle):  17%|█▋        | 1/6 [00:00<00:03,  1.27it/s]

  RandomForest_Reg  : R²=0.562, MAE=91557, MAPE=5.4%
  LinearRegression  : R²=0.528, MAE=100152, MAPE=6.0%


회귀(random_shuffle):  50%|█████     | 3/6 [00:02<00:02,  1.17it/s]

  GradientBoosting_Reg: R²=0.604, MAE=86879, MAPE=5.1%


회귀(random_shuffle):  67%|██████▋   | 4/6 [00:03<00:01,  1.38it/s]

  XGBoost_Reg       : R²=0.590, MAE=87201, MAPE=5.1%


회귀(random_shuffle):  83%|████████▎ | 5/6 [00:05<00:01,  1.15s/it]

  LightGBM_Reg      : R²=0.578, MAE=89065, MAPE=5.3%


                                                                   

  CatBoost_Reg      : R²=0.594, MAE=86488, MAPE=5.1%

--- 분류 모델 평가 (stratified) ---
분류용 데이터: 학습 2446행, 테스트 612행
학습 세트 등급 분포: {0: 367, 1: 1344, 2: 489, 3: 246}
테스트 세트 등급 분포: {0: 92, 1: 336, 2: 123, 3: 61}


분류(stratified):  17%|█▋        | 1/6 [00:00<00:01,  3.54it/s]

  RandomForest_Clf  : ACC=0.727, F1=0.652, 극값F1=0.669


분류(stratified):  33%|███▎      | 2/6 [00:07<00:16,  4.22s/it]

  GradientBoosting_Clf: ACC=0.735, F1=0.655, 극값F1=0.662
  LogisticRegression_Clf: ACC=0.595, F1=0.562, 극값F1=0.575


분류(stratified):  67%|██████▋   | 4/6 [00:09<00:04,  2.13s/it]

  XGBoost_Clf       : ACC=0.745, F1=0.658, 극값F1=0.659


분류(stratified):  83%|████████▎ | 5/6 [00:16<00:03,  3.66s/it]

  LightGBM_Clf      : ACC=0.745, F1=0.652, 극값F1=0.647


                                                               

  CatBoost_Clf      : ACC=0.699, F1=0.644, 극값F1=0.673
  최고 회귀 모델: GradientBoosting_Reg (R²=0.604)




  최고 분류 모델: XGBoost_Clf (F1=0.658)

--- SEONAM REGRESSION 최고 모델 분석 ---
    Feature Importance 분석...
    Top 5 중요 피처:
      일_일강수량(mm): 0.258
      강수량_1일_누적: 0.141
      강수량_7일_누적: 0.114
      일_최저기온(°C): 0.085
      세탁업: 0.080
    Feature Importance 시각화 저장: feature_importance_seonam_GradientBoosting_Reg_regression.png
    SHAP 분석...
    SHAP bar 시각화 저장: shap_bar_seonam_GradientBoosting_Reg_regression.png
    SHAP beeswarm 시각화 저장: shap_beeswarm_seonam_GradientBoosting_Reg_regression.png

--- SEONAM CLASSIFICATION 최고 모델 분석 ---
    Feature Importance 분석...
    Top 5 중요 피처:
      일_일강수량(mm): 0.087
      강수량_1일_누적: 0.086
      체력단련장업: 0.071
      폭우_여부: 0.064
      목욕장업: 0.047
    Feature Importance 시각화 저장: feature_importance_seonam_XGBoost_Clf_classification.png
    SHAP 분석...
    SHAP bar 시각화 저장: shap_bar_seonam_XGBoost_Clf_classification.png
    SHAP beeswarm 시각화 저장: shap_beeswarm_seonam_XGBoost_Clf_classification.png

--- SEONAM 예측 수행 ---
    regression 예측 완료: 41개
    classification 예측

회귀(temporal):  17%|█▋        | 1/6 [00:00<00:03,  1.43it/s]

  RandomForest_Reg  : R²=0.275, MAE=62889, MAPE=8.3%
  LinearRegression  : R²=0.287, MAE=66726, MAPE=8.9%


회귀(temporal):  50%|█████     | 3/6 [00:02<00:02,  1.29it/s]

  GradientBoosting_Reg: R²=0.027, MAE=75273, MAPE=10.1%


회귀(temporal):  67%|██████▋   | 4/6 [00:02<00:01,  1.50it/s]

  XGBoost_Reg       : R²=0.109, MAE=72585, MAPE=9.7%


회귀(temporal):  83%|████████▎ | 5/6 [00:04<00:01,  1.04s/it]

  LightGBM_Reg      : R²=0.202, MAE=66029, MAPE=8.7%


                                                             

  CatBoost_Reg      : R²=0.197, MAE=68829, MAPE=9.2%

--- 분류 모델 평가 (temporal) ---
분류용 데이터: 학습 2446행, 테스트 612행
학습 세트 등급 분포: {0: 345, 1: 1348, 2: 496, 3: 257}
테스트 세트 등급 분포: {0: 114, 1: 332, 2: 116, 3: 50}


분류(temporal):  17%|█▋        | 1/6 [00:00<00:01,  3.59it/s]

  RandomForest_Clf  : ACC=0.557, F1=0.410, 극값F1=0.246


분류(temporal):  33%|███▎      | 2/6 [00:06<00:15,  3.81s/it]

  GradientBoosting_Clf: ACC=0.364, F1=0.361, 극값F1=0.371
  LogisticRegression_Clf: ACC=0.268, F1=0.259, 극값F1=0.444


분류(temporal):  67%|██████▋   | 4/6 [00:08<00:03,  1.93s/it]

  XGBoost_Clf       : ACC=0.477, F1=0.405, 극값F1=0.322


분류(temporal):  83%|████████▎ | 5/6 [00:16<00:03,  3.82s/it]

  LightGBM_Clf      : ACC=0.454, F1=0.382, 극값F1=0.293


                                                             

  CatBoost_Clf      : ACC=0.358, F1=0.354, 극값F1=0.369

분할 방법: STRATIFIED

--- 회귀 모델 평가 (random_shuffle) ---
회귀용 데이터: 학습 2446행, 테스트 612행


회귀(random_shuffle):  17%|█▋        | 1/6 [00:00<00:03,  1.36it/s]

  RandomForest_Reg  : R²=0.500, MAE=48022, MAPE=6.0%
  LinearRegression  : R²=0.377, MAE=57635, MAPE=7.3%


회귀(random_shuffle):  50%|█████     | 3/6 [00:02<00:02,  1.28it/s]

  GradientBoosting_Reg: R²=0.510, MAE=47588, MAPE=6.0%


회귀(random_shuffle):  67%|██████▋   | 4/6 [00:02<00:01,  1.52it/s]

  XGBoost_Reg       : R²=0.514, MAE=46528, MAPE=5.8%


회귀(random_shuffle):  83%|████████▎ | 5/6 [00:04<00:01,  1.01s/it]

  LightGBM_Reg      : R²=0.482, MAE=48315, MAPE=6.0%


                                                                   

  CatBoost_Reg      : R²=0.515, MAE=47014, MAPE=5.9%

--- 분류 모델 평가 (stratified) ---
분류용 데이터: 학습 2446행, 테스트 612행
학습 세트 등급 분포: {0: 367, 1: 1344, 2: 489, 3: 246}
테스트 세트 등급 분포: {0: 92, 1: 336, 2: 123, 3: 61}


분류(stratified):  17%|█▋        | 1/6 [00:00<00:01,  3.65it/s]

  RandomForest_Clf  : ACC=0.641, F1=0.538, 극값F1=0.509


분류(stratified):  33%|███▎      | 2/6 [00:06<00:15,  3.87s/it]

  GradientBoosting_Clf: ACC=0.636, F1=0.524, 극값F1=0.485
  LogisticRegression_Clf: ACC=0.449, F1=0.432, 극값F1=0.459


분류(stratified):  67%|██████▋   | 4/6 [00:08<00:03,  1.96s/it]

  XGBoost_Clf       : ACC=0.658, F1=0.547, 극값F1=0.488


분류(stratified):  83%|████████▎ | 5/6 [00:15<00:03,  3.50s/it]

  LightGBM_Clf      : ACC=0.647, F1=0.545, 극값F1=0.515


                                                               

  CatBoost_Clf      : ACC=0.614, F1=0.559, 극값F1=0.577
  최고 회귀 모델: CatBoost_Reg (R²=0.515)




  최고 분류 모델: CatBoost_Clf (F1=0.559)

--- TANCHEON REGRESSION 최고 모델 분석 ---
    Feature Importance 분석...
    Top 5 중요 피처:
      세탁업: 8.238
      체력단련장업: 7.269
      강수량_7일_누적: 5.583
      하천: 5.496
      일_일강수량(mm): 5.422
    Feature Importance 시각화 저장: feature_importance_tancheon_CatBoost_Reg_regression.png
    SHAP 분석...
    SHAP bar 시각화 저장: shap_bar_tancheon_CatBoost_Reg_regression.png
    SHAP beeswarm 시각화 저장: shap_beeswarm_tancheon_CatBoost_Reg_regression.png

--- TANCHEON CLASSIFICATION 최고 모델 분석 ---
    Feature Importance 분석...
    Top 5 중요 피처:
      세탁업: 8.290
      강수량_7일_누적: 6.579
      생활인구: 5.513
      월: 5.492
      체력단련장업: 4.324
    Feature Importance 시각화 저장: feature_importance_tancheon_CatBoost_Clf_classification.png
    SHAP 분석...
    SHAP bar 시각화 저장: shap_bar_tancheon_CatBoost_Clf_classification.png
    SHAP beeswarm 시각화 저장: shap_beeswarm_tancheon_CatBoost_Clf_classification.png

--- TANCHEON 예측 수행 ---
    regression 예측 완료: 41개
    classification 예측 완료: 41개

전체 분석 결과 요약

-

<Figure size 1000x800 with 0 Axes>

<Figure size 1000x800 with 0 Axes>

<Figure size 1000x800 with 0 Axes>

<Figure size 1000x800 with 0 Axes>