## 점수 : 10.13465  등수 :367

In [2]:
!pip install -U xgboost==1.7.6
!pip install -U scikit-learn

Collecting xgboost==1.7.6
  Downloading xgboost-1.7.6-py3-none-win_amd64.whl.metadata (1.9 kB)
Downloading xgboost-1.7.6-py3-none-win_amd64.whl (70.9 MB)
   ---------------------------------------- 0.0/70.9 MB ? eta -:--:--
   - -------------------------------------- 2.6/70.9 MB 14.0 MB/s eta 0:00:05
   --- ------------------------------------ 7.1/70.9 MB 17.7 MB/s eta 0:00:04
   ------- -------------------------------- 13.4/70.9 MB 21.9 MB/s eta 0:00:03
   ----------- ---------------------------- 20.2/70.9 MB 25.0 MB/s eta 0:00:03
   --------------- ------------------------ 27.8/70.9 MB 27.1 MB/s eta 0:00:02
   ------------------ --------------------- 33.0/70.9 MB 27.1 MB/s eta 0:00:02
   --------------------- ------------------ 38.5/70.9 MB 26.9 MB/s eta 0:00:02
   ------------------------- -------------- 44.6/70.9 MB 26.9 MB/s eta 0:00:01
   ---------------------------- ----------- 51.4/70.9 MB 27.5 MB/s eta 0:00:01
   -------------------------------- ------- 57.4/70.9 MB 27.7 MB/s 

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
sklearn-compat 0.1.3 requires scikit-learn<1.7,>=1.2, but you have scikit-learn 1.7.1 which is incompatible.


In [None]:
import pandas as pd
import numpy as np
import os
import random
from xgboost import XGBRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error
import warnings

warnings.filterwarnings(action='ignore')

def seed_everything(seed):
    """모든 난수 시드 고정"""
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42)

# ======================
# 1) 데이터 불러오기
# ======================
train_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/AI goorm project1/dataset/train.csv', encoding='utf-8')
test_df  = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/AI goorm project1/dataset/test.csv',  encoding='utf-8')
building_info = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/AI goorm project1/dataset/building_info.csv', encoding='utf-8')

# ======================
# 2) 빌딩 메타 전처리 (강화)
# ======================
# '-' → NaN → 0 처리(먼저 NaN으로 바꾼 뒤, 숫자형 변환이 쉬움)
building_info = building_info.replace('-', np.nan)

# 숫자형 컬럼들 명시적 변환 (데이터셋 컬럼명에 맞추세요)
num_cols = ['연면적(m2)', '냉방면적(m2)', '태양광용량(kW)', 'ESS저장용량(kWh)', 'PCS용량(kW)']
for c in num_cols:
    if c in building_info.columns:
        building_info[c] = pd.to_numeric(building_info[c], errors='coerce')

# 결측치 0 채움
building_info[num_cols] = building_info[num_cols].fillna(0)

# 건물유형 원-핫
if '건물유형' in building_info.columns:
    building_type_dummies = pd.get_dummies(building_info['건물유형'], prefix='건물유형')
    building_info = pd.concat([building_info.drop(columns=['건물유형']), building_type_dummies], axis=1)

# 면적/설비 비율 파생
if set(['연면적(m2)','냉방면적(m2)']).issubset(building_info.columns):
    building_info['냉방면적비'] = (building_info['냉방면적(m2)'] / (building_info['연면적(m2)'] + 1e-6)).clip(0, 1)

for col, newname in [
    ('태양광용량(kW)',  '태양광_면적당용량'),
    ('ESS저장용량(kWh)', 'ESS_면적당용량'),
    ('PCS용량(kW)',    'PCS_면적당용량'),
]:
    if col in building_info.columns and '연면적(m2)' in building_info.columns:
        building_info[newname] = building_info[col] / (building_info['연면적(m2)'] + 1e-6)

# ======================
# 3) 병합
# ======================
train_df = pd.merge(train_df, building_info, on='건물번호', how='left')
test_df  = pd.merge(test_df,  building_info, on='건물번호', how='left')



# ======================
# 4) 시간·날씨 파생
# ======================
train_df['일시'] = pd.to_datetime(train_df['일시'], format='%Y%m%d %H')
test_df['일시']  = pd.to_datetime(test_df['일시'],  format='%Y%m%d %H')

def add_time_weather_features(df):
    df['month'] = df['일시'].dt.month
    df['day'] = df['일시'].dt.day
    df['hour'] = df['일시'].dt.hour
    df['dayofweek'] = df['일시'].dt.dayofweek
    df['is_weekend'] = (df['dayofweek'] >= 5).astype(int)

    # 주기형 인코딩
    df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
    df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)
    df['dow_sin']  = np.sin(2 * np.pi * df['dayofweek'] / 7)
    df['dow_cos']  = np.cos(2 * np.pi * df['dayofweek'] / 7)
    df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
    df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)

    # 체감 온도 THI, 냉/난방 지표
    if set(['기온(°C)','습도(%)']).issubset(df.columns):
        df['THI'] = df['기온(°C)'] - 0.55*(1 - (df['습도(%)']/100.0))*(df['기온(°C)'] - 14.5)
        df['CDH'] = np.maximum(0, df['THI'] - 24)  # Cooling Degree Hours(임의 임계: 24)
        df['HDH'] = np.maximum(0, 18 - df['THI'])  # Heating Degree Hours(임의 임계: 18)
    else:
        df['THI'] = 0.0
        df['CDH'] = 0.0
        df['HDH'] = 0.0

    # 업무시간 플래그(대략 9~18시)
    df['is_business_hour'] = ((df['hour'] >= 9) & (df['hour'] <= 18)).astype(int)
    return df

train_df = add_time_weather_features(train_df)
test_df  = add_time_weather_features(test_df)



# ======================
# 5) 결측치 처리(안전 버전)
# ======================

# (선택) 수치형처럼 보이는 날씨 컬럼을 숫자로 강제 변환
num_like = ['일조(hr)', '일사(MJ/m2)', '기온(°C)', '습도(%)']
for df in [train_df, test_df]:
    for c in num_like:
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors='coerce')

# 수치형 컬럼 집합(둘 중 하나에라도 있으면 후보로)
num_cols_train = set(train_df.select_dtypes(include=[np.number]).columns)
num_cols_test  = set(test_df.select_dtypes(include=[np.number]).columns)
num_cols_union = sorted((num_cols_train | num_cols_test) - {'전력소비량(kWh)'})

for c in num_cols_union:
    # --- train 쪽 채움 ---
    if c in train_df.columns:
        if train_df[c].isna().any():
            train_df[c] = train_df.groupby('건물번호')[c].transform(lambda s: s.fillna(s.median()))
            train_df[c] = train_df[c].fillna(train_df[c].median())

    # --- test 쪽 채움 ---
    if c in test_df.columns:
        if test_df[c].isna().any():
            # 1) 건물번호 그룹 중앙값 -> 2) train 전역 중앙값 -> 3) test 전역 중앙값
            test_df[c] = test_df.groupby('건물번호')[c].transform(lambda s: s.fillna(s.median()))
            if c in train_df.columns:
                test_df[c] = test_df[c].fillna(train_df[c].median())
            test_df[c] = test_df[c].fillna(test_df[c].median())


# ======================
# 6) OOF 타깃 인코딩(누설 방지)
#    - 건물, 건물×월, 건물×시간 평균 전력
# ======================
def add_oof_target_mean(train, test, key_cols, target_col='전력소비량(kWh)', n_splits=5):
    name = 'TE_' + '_'.join(key_cols)
    train = train.sort_values('일시').copy()
    tr_values = np.zeros(len(train), dtype=float)
    tss = TimeSeriesSplit(n_splits=n_splits)
    for tr_idx, va_idx in tss.split(train):
        tr_part = train.iloc[tr_idx]
        va_part = train.iloc[va_idx]
        mapping = tr_part.groupby(key_cols)[target_col].mean()
        tr_values[va_idx] = va_part[key_cols].merge(
            mapping.rename(name), left_on=key_cols, right_index=True, how='left'
        )[name].values
    global_mean = train[target_col].mean()
    train[name] = pd.Series(tr_values).fillna(global_mean)

    # test는 전체 train으로 학습한 mapping 사용
    mapping_full = train.groupby(key_cols)[target_col].mean()
    test[name] = test[key_cols].merge(
        mapping_full.rename(name), left_on=key_cols, right_index=True, how='left'
    )[name].fillna(global_mean).values
    return train, test

for keys in [['건물번호'], ['건물번호','month'], ['건물번호','hour']]:
    train_df, test_df = add_oof_target_mean(train_df, test_df, keys)


# ======================
# 7) 학습/검증 분할 (최근 N일 hold-out)
# ======================
val_days = 14  # 최근 14일을 검증으로
cutoff = train_df['일시'].max() - pd.Timedelta(days=val_days)
train_part = train_df[train_df['일시'] <= cutoff].copy()
valid_part = train_df[train_df['일시'] >  cutoff].copy()


# ======================
# 8) 피처 선택/정렬  (그대로 OK)
# ======================
def safe_drop(df, cols):
    return df.drop(columns=[c for c in cols if c in df.columns], errors='ignore')

drop_common = ['num_date_time', '일시', '일조(hr)', '일사(MJ/m2)']

train_y = train_df['전력소비량(kWh)']
train_x = safe_drop(train_df, drop_common + ['전력소비량(kWh)'])
test_x  = safe_drop(test_df,  drop_common)

# test에 없는 컬럼은 0으로 채워 맞추기
missing_in_test = [c for c in train_x.columns if c not in test_x.columns]
for c in missing_in_test:
    test_x[c] = 0
test_x = test_x[train_x.columns]

# ======================
# 8.5) 시간 기반 검증 분할 (최근 14일을 검증으로)
# ======================
cutoff = train_df['일시'].max() - pd.Timedelta(days=14)
tr_mask = train_df['일시'] <= cutoff
va_mask = ~tr_mask

X_tr = train_x.loc[tr_mask].copy()
y_tr = np.log1p(train_y.loc[tr_mask])   # 로그 변환
X_va = train_x.loc[va_mask].copy()
y_va = np.log1p(train_y.loc[va_mask])

# test도 학습 컬럼 순서와 동일하게
X_te = test_x.reindex(columns=X_tr.columns, fill_value=0)

# ======================
# 9) 모델 학습 (조기 종료 + 합리적 파라미터)
# ======================
model = XGBRegressor(
    n_estimators=3000,
    learning_rate=0.03,
    max_depth=8,
    min_child_weight=6,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.0,
    reg_lambda=1.0,
    tree_method='hist',
    random_state=42,
    n_jobs=-1,
    objective='reg:squarederror',
    eval_metric='rmse',   
)

model.fit(
    X_tr, y_tr,
    eval_set=[(X_va, y_va)],
    early_stopping_rounds=200,
    verbose=100
)

# ======================
# 10) 검증 성능 & 예측
# ======================
from sklearn.metrics import mean_squared_error
import numpy as np

def rmse(y_true, y_pred):
    return float(np.sqrt(mean_squared_error(y_true, y_pred)))

# 기존 코드 교체
va_pred_log = model.predict(X_va)
va_rmse = rmse(y_va, va_pred_log)
print(f"[Val LOG RMSE] {va_rmse:.5f}")

va_pred = np.expm1(va_pred_log)
val_rmse_real = rmse(np.expm1(y_va), va_pred)
print(f"[Val RMSE(real)] {val_rmse_real:.3f}")

# ======================
# 11) 테스트 예측
# ======================
te_pred_log = model.predict(X_te)
te_pred = np.expm1(te_pred_log)

# ======================
# 12) 제출 파일 저장 + 중요도
# ======================
submission = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/AI goorm project1/dataset/sample_submission.csv')
submission['answer'] = te_pred
submission.to_csv('submission.csv', index=False)
print("모델 학습/검증/예측 완료. submission.csv 생성!")

imp = pd.Series(model.feature_importances_, index=X_tr.columns).sort_values(ascending=False)
print("\n[Top 20 Feature Importances]")
print(imp.head(20))

