# アンサンブル検討用nb（XGboost、LightGBM、Catboost）

## Mount＆modjule import

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# 目的：スペインの電力価格を予測
# 目的変数：スペインの電力価格(actual_price)
# 評価指標：RME

import numpy as np
import pandas as pd
import os
import pickle
import gc
# 分布確認
!pip install ydata-profiling
from ydata_profiling import ProfileReport
# 可視化
import matplotlib.pyplot as plt
import seaborn as sns
import polars as pl
# 前処理
from sklearn.preprocessing import StandardScaler, MinMaxScaler,LabelEncoder,OneHotEncoder
# モデリング
from sklearn.model_selection import train_test_split,KFold,StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix
from xgboost import XGBClassifier
import lightgbm as lgb
import xgboost as xgb
!pip install -U xgboost -q
!pip install -q catboost
from catboost import CatBoostRegressor, Pool

# 日本語表記
!pip install japanize-matplotlib
import japanize_matplotlib
%matplotlib inline
# パラメータ最適化
!pip install optuna
import optuna

# 評価指標
from sklearn.metrics import mean_squared_error

Collecting ydata-profiling
  Downloading ydata_profiling-4.16.1-py2.py3-none-any.whl.metadata (22 kB)
Collecting visions<0.8.2,>=0.7.5 (from visions[type_image_path]<0.8.2,>=0.7.5->ydata-profiling)
  Downloading visions-0.8.1-py3-none-any.whl.metadata (11 kB)
Collecting htmlmin==0.1.12 (from ydata-profiling)
  Downloading htmlmin-0.1.12.tar.gz (19 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting phik<0.13,>=0.11.1 (from ydata-profiling)
  Downloading phik-0.12.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)
Collecting multimethod<2,>=1.4 (from ydata-profiling)
  Downloading multimethod-1.12-py3-none-any.whl.metadata (9.6 kB)
Collecting imagehash==4.3.1 (from ydata-profiling)
  Downloading ImageHash-4.3.1-py2.py3-none-any.whl.metadata (8.0 kB)
Collecting dacite>=1.8 (from ydata-profiling)
  Downloading dacite-1.9.2-py3-none-any.whl.metadata (17 kB)
Collecting puremagic (from visions<0.8.2,>=0.7.5->visions[type_image_path]<0.8.2,>=0.7.5->

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.9/253.9 MB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting japanize-matplotlib
  Downloading japanize-matplotlib-1.1.3.tar.gz (4.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.1/4.1 MB[0m [31m37.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: japanize-matplotlib
  Building wheel for japanize-matplotlib (setup.py) ... [?25l[?25hdone
  Created wheel for japanize-matplotlib: filename=japanize_matplotlib-1.1.3-py3-none-any.whl size=4120257 sha256=2ab24ee846f163a82471a16bfa7f15b1c52abc8013a9dbd643ad371d0279ba6b
  Stored in directory: /root/.cache/pip/wheels/da/a1/71/b8faeb93276fed10edffcca20746f1ef6f8d9e071eee8425fc
Successfully built japanize-matplotlib
Installing collected packages: jap

In [3]:
file_path = '/content/drive/MyDrive/Colab Notebooks/signate/smbc/'

df_train = pd.read_csv(file_path + 'train.csv')
df_test = pd.read_csv(file_path + 'test.csv')

## 前処理

In [4]:
# 時刻をdatetimeに変換
df_train["time"] = pd.to_datetime(df_train["time"],utc=True)
df_test["time"] = pd.to_datetime(df_test["time"],utc=True)

# 年情報のカラム作成（クロスバリデーション用）
df_train["year"] = df_train["time"].dt.year
df_test["year"] = df_test["time"].dt.year

# ケルビンを摂氏に変換  - 273.15
cities = ["valencia","madrid","bilbao","barcelona","seville"]
temp_cols = [f"{c}_{t}" for c in cities for t in ["temp", "temp_min", "temp_max"]]

for df in (df_train, df_test):          # ★ 2 つとも回す
    for col in temp_cols:
        df[col] = df[col] - 273.15

print("train temp range :", df_train["seville_temp_min"].min(), df_train["seville_temp_min"].max())
print("test  temp range :", df_test ["seville_temp_min"].min(), df_test ["seville_temp_min"].max())


train temp range : -4.0 42.0
test  temp range : -3.0 42.0


## 特徴量エンジニアリング

In [5]:
# 季節性（spring:1,summer:2,autumn:3,winter:4）
def get_season(month):
    if month in [3, 4, 5]:
        return 1
    elif month in [6, 7, 8]:
        return 2
    elif month in [9, 10, 11]:
        return 3
    else:
        return 4

# 月を取得 → 季節に変換
df_train["month"] = df_train["time"].dt.month
df_train["season"] = df_train["month"].apply(get_season)

# テストデータにも
df_test["month"] = df_test["time"].dt.month
df_test["season"] = df_test["month"].apply(get_season)

In [6]:
# 時間帯（morning:1,afternoon:2,evening:3,night:4）

def get_time_of_day(hour):
    if 5 <= hour < 11:
        return 1
    elif 11 <= hour < 17:
        return 2
    elif 17 <= hour < 21:
        return 3
    else:
        return 4

df_train["hour"] = df_train["time"].dt.hour
df_train["time_of_day"] = df_train["hour"].apply(get_time_of_day)

# テストデータにも
df_test["hour"] = df_test["time"].dt.hour
df_test["time_of_day"] = df_test["hour"].apply(get_time_of_day)

In [7]:
# 時間帯：夜とそれ以外（night:1,others:0）

def get_time_of_day_only_night(hour):
    if 5 <= hour < 21:
        return 0
    else:
        return 1

df_train["hour"] = df_train["time"].dt.hour
df_train["time_of_day_only_night"] = df_train["hour"].apply(get_time_of_day)

# テストデータにも
df_test["hour"] = df_test["time"].dt.hour
df_test["time_of_day_only_night"] = df_test["hour"].apply(get_time_of_day)

In [8]:
# 週末フラグ

def is_week_end(weekday):
    if weekday in [0,1,2,3,4]:
      return 0
    elif weekday in [5,6]:
      return 1
    else:
      return 2

df_train["weekday"] = df_train["time"].dt.weekday
df_train["is_weekend"] = df_train["weekday"].apply(is_week_end)

# テストデータにも
df_test["weekday"] = df_test["time"].dt.weekday
df_test["is_weekend"] = df_test["weekday"].apply(is_week_end)

In [9]:
# 発電量の和
generation_columns = [col for col in df_train.columns if 'generation' in col]
df_train['generation_sum'] = df_train[generation_columns].sum(axis=1)
df_test['generation_sum'] = df_test[generation_columns].sum(axis=1)

In [10]:
# 発電量のうちの火力発電の構成比[fossil]再生可能エネルギー構成比[renewable]
df_train['generation_sum'] = df_train[generation_columns].sum(axis=1)
df_test['generation_sum'] = df_test[generation_columns].sum(axis=1)

# --- ② 分子になる電源グループ ---
df_train["fossil_total"]     = (
        df_train["generation_fossil_gas"]
      + df_train["generation_fossil_hard_coal"]
      + df_train["generation_fossil_oil"]
)
df_test["fossil_total"]      = (
        df_test["generation_fossil_gas"]
      + df_test["generation_fossil_hard_coal"]
      + df_test["generation_fossil_oil"]
)

df_train["renewable_total"]  = (
        df_train["generation_hydro_pumped_storage_consumption"]
      + df_train["generation_hydro_run_of_river_and_poundage"]
      + df_train["generation_hydro_water_reservoir"]
      + df_train["generation_other_renewable"]
)
df_test["renewable_total"]   = (
        df_test["generation_hydro_pumped_storage_consumption"]
      + df_test["generation_hydro_run_of_river_and_poundage"]
      + df_test["generation_hydro_water_reservoir"]
      + df_test["generation_other_renewable"]
)

# --- ③ 比率を安全に計算（0 除算防止） ---
eps = 1e-6
df_train["fossil_share"]     = df_train["fossil_total"]    / (df_train["generation_sum"] + eps)
df_test["fossil_share"]      = df_test["fossil_total"]     / (df_test["generation_sum"]  + eps)

df_train["renewable_share"]  = df_train["renewable_total"] / (df_train["generation_sum"] + eps)
df_test["renewable_share"]   = df_test["renewable_total"]  / (df_test["generation_sum"]  + eps)

In [None]:
# 祝日と3連休
import holidays, numpy as np

es_holidays = holidays.Spain(years=[2015, 2016, 2017, 2018])

for df in (df_train, df_test):
    df["is_holiday"] = df["time"].dt.date.map(lambda d: 1 if d in es_holidays else 0)
    # 前後1日
    for s in [-1, 1]:
        df[f"hol_adj{s}"] = df["is_holiday"].shift(s).fillna(0)
    # 3連休以上を flag
    df["is_long_wend"] = (
        (df["is_holiday"].rolling(3, min_periods=1).sum() >= 2).astype(int)
    )

In [11]:
# ---------------------------目的変数との相関が-0.24あったバレンシアの気温差---------------------------
# valencia_temp_diff
df_train["valencia_temp_diff"] = df_train["valencia_temp_max"] - df_train["valencia_temp_min"]
df_test["valencia_temp_diff"] = df_test["valencia_temp_max"] - df_test["valencia_temp_min"]

In [12]:
# ---------------------------チュートリアルにあった快適気温との差(valenciaのみ)---------------------------
# temp_dev
COMFORT = 23.0
df_train["temp_dev"] = (df_train["valencia_temp"] - COMFORT).abs()
df_test["temp_dev"] = (df_test["valencia_temp"] - COMFORT).abs()

# 不快指数


In [13]:
# ---------------------------交差項---------------------------
# season×is_weekend
df_train["season_isweekend"] = df_train["season"] * df_train["is_weekend"]
df_test["season_isweekend"] = df_test["season"] * df_test["is_weekend"]

# time_of_day×is_weekend
df_train["time_of_day_isweekend"] = df_train["time_of_day"] * df_train["is_weekend"]
df_test["time_of_day_isweekend"] = df_test["time_of_day"] * df_test["is_weekend"]

In [14]:
# --------------------- fossil_shareやrenewable_shareの変化量：風・太陽が落ちると火力比率↑ → 価格急騰。---------------------------------
# fossil_share_diff1
# renewable_share_diff1
# fossil_share_dev7d
# renewable_share_dev7d
share_cols = ["fossil_share", "renewable_share"]

for df in (df_train, df_test):
    # ---- ① 1 時間前との差分 ----
    for col in share_cols:
        df[f"{col}_diff1"] = df[col] - df[col].shift(1)

    # ---- ② 7日(168h) 移動平均からのずれ ----
    window = 24*7       # 168
    for col in share_cols:
        roll_mean = df[col].rolling(window, min_periods=1).mean()
        df[f"{col}_dev7d"] = df[col] - roll_mean

In [15]:
# ---------------------------price actualのラグ特徴量---------------------------
# ステップ 1：時系列順にソート
df_train = df_train.sort_values("time").reset_index(drop=True)
df_test = df_test.sort_values("time").reset_index(drop=True)

# ステップ 2：df_train にラグ追加（shift）
df_train["price_actual_lag1"]   = df_train["price_actual"].shift(1)
df_train["price_actual_lag24"]  = df_train["price_actual"].shift(24)
df_train["price_actual_lag168"] = df_train["price_actual"].shift(168)

# NaNを含む行を除外（df_train のみ）
df_train = df_train.dropna(subset=["price_actual_lag1", "price_actual_lag24", "price_actual_lag168"])

# ステップ 3：df_test にラグ追加（train の末尾を参照）
train_price = df_train.set_index("time")["price_actual"]

# 時間差からラグ生成（NaNがあっても dropna しない！）
for lag in [1, 24, 168]:
    df_test[f"price_actual_lag{lag}"] = df_test["time"].apply(
        lambda t: train_price.get(t - pd.Timedelta(hours=lag), np.nan)
    )

# ※ここでは df_test.dropna() しない → 提出用行数は保たれる

# モデルで使う test データだけ mask で抽出（あとで予測に使う）
test_mask = df_test[["price_actual_lag1", "price_actual_lag24", "price_actual_lag168"]].notna().all(axis=1)

# ✅ 表示確認（trainのみ）
display(df_train[["price_actual","price_actual_lag1","price_actual_lag24"]])

Unnamed: 0,price_actual,price_actual_lag1,price_actual_lag24
168,67.70,73.14,73.73
169,61.05,67.70,70.99
170,59.61,61.05,68.30
171,58.65,59.61,64.22
172,58.40,58.65,63.53
...,...,...,...
26275,39.90,35.37,62.92
26276,39.54,39.90,62.10
26277,32.90,39.54,60.09
26278,23.85,32.90,55.85


In [16]:
# ---------------------------totalload_actualのラグ特徴量---------------------------
lags = [1,24,168]
lag_targets = ["total_load_actual","generation_sum"]
for df in (df_train,df_test):
  for col in lag_targets:
    for i in lags:
      new_col = f"{col}_lag{i}"
      df[new_col] = df[col].shift(1)
      print(new_col)
df_train.head()

total_load_actual_lag1
total_load_actual_lag24
total_load_actual_lag168
generation_sum_lag1
generation_sum_lag24
generation_sum_lag168
total_load_actual_lag1
total_load_actual_lag24
total_load_actual_lag168
generation_sum_lag1
generation_sum_lag24
generation_sum_lag168


Unnamed: 0,time,generation_biomass,generation_fossil_brown_coal/lignite,generation_fossil_gas,generation_fossil_hard_coal,generation_fossil_oil,generation_hydro_pumped_storage_consumption,generation_hydro_run_of_river_and_poundage,generation_hydro_water_reservoir,generation_nuclear,...,renewable_share_dev7d,price_actual_lag1,price_actual_lag24,price_actual_lag168,total_load_actual_lag1,total_load_actual_lag24,total_load_actual_lag168,generation_sum_lag1,generation_sum_lag24,generation_sum_lag168
168,2015-01-08 23:00:00+00:00,546.0,571.0,4178.0,7280.0,383.0,398.0,658.0,831.0,6741.0,...,-0.104604,73.14,73.73,64.02,,,,,,
169,2015-01-09 00:00:00+00:00,516.0,566.0,3912.0,6774.0,370.0,392.0,628.0,942.0,6741.0,...,-0.098472,67.7,70.99,58.46,26788.0,26788.0,26788.0,28338.0,28338.0,28338.0
170,2015-01-09 01:00:00+00:00,508.0,455.0,3718.0,6349.0,372.0,956.0,631.0,882.0,6742.0,...,-0.077546,61.05,68.3,54.7,25146.0,25146.0,25146.0,27150.0,27150.0,27150.0
171,2015-01-09 02:00:00+00:00,509.0,369.0,3768.0,6078.0,373.0,1088.0,634.0,934.0,6743.0,...,-0.068615,59.61,64.22,54.91,23889.0,23889.0,23889.0,26577.0,26577.0,26577.0
172,2015-01-09 03:00:00+00:00,518.0,367.0,3707.0,5984.0,373.0,1027.0,636.0,799.0,6744.0,...,-0.073815,58.65,63.53,53.07,23046.0,23046.0,23046.0,26117.0,26117.0,26117.0


In [18]:
# 需要・発電量の変化率
for col in ['total_load_actual', 'generation_sum']:
    df_train[f'{col}_pct1h'] = df_train[col].pct_change()
    df_test[f'{col}_pct1h'] = df_test[col].pct_change()

# 時間の周期特徴量
for df in [df_train, df_test]:
    df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
    df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)

# 交互作用（電力需要と時間）
for df in [df_train,df_test]:
  df["total_load_actual_sin"] = df['hour_sin'] * df["total_load_actual"]
  df["total_load_actual_cos"] = df['hour_cos'] * df["total_load_actual"]

# 交互作用（需要とtime_of_day）
for df in [df_train,df_test]:
  df["total_loadactual_timeofday"] = df['total_load_actual'] * df["time_of_day"]

  df_train[f'{col}_pct1h'] = df_train[col].pct_change()
  df_test[f'{col}_pct1h'] = df_test[col].pct_change()


In [19]:
# ① Inf を NaN に置換
for df in (df_train, df_test):
    df.replace([np.inf, -np.inf], np.nan, inplace=True)

# ② まだ NaN が残っている列を簡易補完
na_cols = df_train.columns[df_train.isna().any()]
print("NaN 残列:", na_cols.tolist()[:10], "...")

for df in (df_train, df_test):
    # 平均 or 0 などシンプルで OK（まずは動かす）
    df[na_cols] = df[na_cols].fillna(df[na_cols].median())

# ③ もし極端に大きい値も疑わしいならクリップ
for df in (df_train, df_test):
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    df[numeric_cols] = df[numeric_cols].clip(-1e6, 1e6)

NaN 残列: ['generation_biomass', 'generation_fossil_brown_coal/lignite', 'generation_fossil_gas', 'generation_fossil_hard_coal', 'generation_fossil_oil', 'generation_hydro_pumped_storage_consumption', 'generation_hydro_run_of_river_and_poundage', 'generation_hydro_water_reservoir', 'generation_nuclear', 'generation_other'] ...


## データ分割＆モデル実装

### 特徴量定義

In [17]:
# 特徴量の整理

base_features = [
    'generation_fossil_gas',
    'generation_fossil_hard_coal',
    #'generation_fossil_oil',
    'generation_hydro_pumped_storage_consumption',
    'generation_hydro_run_of_river_and_poundage',
    'generation_hydro_water_reservoir',
    'generation_other_renewable',
    'total_load_actual',
]

add_features = [
    "generation_biomass",
    "generation_fossil_brown_coal/lignite",
    #"valencia_wind_speed",
    "madrid_wind_speed",
    "bilbao_pressure",
    "bilbao_wind_speed",
    #"bilbao_clouds_all",
    "barcelona_pressure",
    "barcelona_wind_speed",
    "seville_pressure",
    "seville_wind_deg",

    #"season",
    "time_of_day",
    "is_weekend",
    #"is_holiday",
    #"is_long_wend",
    #'generation_sum',
    #"fossil_share",
    #"renewable_share",
    #"time_of_day_only_night",
    "valencia_temp_diff",
    "season_isweekend",
    "time_of_day_isweekend",
    "temp_dev",
    "fossil_share_diff1",
    "renewable_share_diff1",
    "fossil_share_dev7d",
    "renewable_share_dev7d",
    "price_actual_lag1",
    #"price_actual_lag24",
    #"price_actual_lag168",
    "total_load_actual_lag1",
    "total_load_actual_lag24",
    #"total_load_actual_lag168",
    "generation_sum_lag1",
    #"generation_sum_lag24",
    #"generation_sum_lag168",
    "total_load_actual_pct1h",
    "generation_sum_pct1h",
    "hour_sin",
    "hour_cos",
    "total_load_actual_sin",
    "total_load_actual_cos",
    "total_loadactual_timeofday"
]

features = base_features + add_features

### パラメータ定義

In [20]:
params_xgb = {
    "objective":       "reg:squarederror",
    "learning_rate":   0.023378586570803068,
    "max_depth":       7,
    "n_estimators":    317,
    "subsample":       0.9665142871966073,
    "colsample_bytree":0.6519917349250606,
    "min_child_weight":1,
    "gamma":           0.02005153624696357,
    "lambda":          0.6361532102889825,  # L2 正則化
    "tree_method":     "hist",
    "random_state":    42,
    "eval_metric":     "rmse"
}

params_cb = {
    "iterations": 5000,
    "learning_rate": 0.05,
    "depth": 8,
    "loss_function": "RMSE",
    "eval_metric": "RMSE",
    "random_seed": 42,
    "od_type": "Iter",
    "od_wait": 200,   # early stopping
    "verbose": 200
}

params_lgb = {
    'boosting_type':'gbdt',
    'objective':'regression',
    'metric':'rmse',
    'learning_rate':0.05,
    'num_leaves':12,
    'max_depth':-1,
    'n_estimators':5000,
    "force_col_wise": True,
    'importance_type':'gain'
}


"""



best_params_cb = {
    'iterations': 2855,
    'learning_rate': 0.09865989811944446,
    'depth': 4,
    'l2_leaf_reg': 0.00011419243952339203,
    "loss_function": "RMSE",
    "eval_metric": "RMSE",
    "random_seed": 42,
    "od_type": "Iter",
    "od_wait": 200,   # early stopping
    "verbose": 200
    }

best_params_lgb = {
    'boosting_type':'gbdt',
    'objective':'regression',
    'metric':'rmse',
    'learning_rate': 0.019977381820598494,
    'num_leaves': 28,
    'feature_fraction': 0.9679195640722122,
    'bagging_fraction': 0.7093899186623671,
    'bagging_freq': 5,
    'lambda_l1': 0.0006846195699654229,
    'lambda_l2': 0.004414436544683086
    }
"""

'\n\n\n\nbest_params_cb = {\n    \'iterations\': 2855,\n    \'learning_rate\': 0.09865989811944446,\n    \'depth\': 4,\n    \'l2_leaf_reg\': 0.00011419243952339203,\n    "loss_function": "RMSE",\n    "eval_metric": "RMSE",\n    "random_seed": 42,\n    "od_type": "Iter",\n    "od_wait": 200,   # early stopping\n    "verbose": 200\n    }\n\nbest_params_lgb = {\n    \'boosting_type\':\'gbdt\',\n    \'objective\':\'regression\',\n    \'metric\':\'rmse\',\n    \'learning_rate\': 0.019977381820598494,\n    \'num_leaves\': 28,\n    \'feature_fraction\': 0.9679195640722122,\n    \'bagging_fraction\': 0.7093899186623671,\n    \'bagging_freq\': 5,\n    \'lambda_l1\': 0.0006846195699654229,\n    \'lambda_l2\': 0.004414436544683086\n    }\n'

### 各モデルの関数定義


In [21]:
# ------------------------------------------------------------
# XGBoost
# ------------------------------------------------------------
def run_xgb(df_train, train_mask, val_mask, features,tag,params_xgb):
    """複数の検証パターンでXGBoostを回すための関数
    Parameters
    ----------
    df_train : 学習用 DataFrame（特徴量列 + 'price_actual'）
    df_test   : 検証用 〃
    features: 学習に使う列名リスト
    tag      : ログに出力するラベル文字列
    params_xgb   : xgboost.XGBRegressor に渡すハイパーパラメータ辞書

    Returns
    -------
    model : 学習済み XGBRegressor
    rmse  : 検証 RMSE
    """
    X_tr = df_train.loc[train_mask, features]
    y_tr = df_train.loc[train_mask, "price_actual"]
    X_va = df_train.loc[val_mask,  features]
    y_va = df_train.loc[val_mask,  "price_actual"]

    model = xgb.XGBRegressor(**params_xgb)
    model.fit(
        X_tr, y_tr,
        eval_set=[(X_va, y_va)],
        verbose=False,
    )

    pred_va = model.predict(X_va)
    mse = mean_squared_error(y_va, pred_va)
    rmse = np.sqrt(mse)
    print(f"[{tag}]  val RMSE = {rmse:.4f}")
    return model, rmse


In [22]:
# ------------------------------------------------------------
# CatBoost
# ------------------------------------------------------------

def run_cb(df_train, train_mask, val_mask, features,tag,params_cb):
    """複数の検証パターンでXGBoostを回すための関数
    Parameters
    ----------
    df_train : 学習用 DataFrame（特徴量列 + 'price_actual'）
    df_test   : 検証用 〃
    features: 学習に使う列名リスト
    tag      : ログに出力するラベル文字列
    params_xgb   : xgboost.XGBRegressor に渡すハイパーパラメータ辞書

    Returns
    -------
    model : 学習済み XGBRegressor
    rmse  : 検証 RMSE
    """
    X_tr = df_train.loc[train_mask, features]
    y_tr = df_train.loc[train_mask, "price_actual"]
    X_va = df_train.loc[val_mask,  features]
    y_va = df_train.loc[val_mask,  "price_actual"]

    train_pool = Pool(X_tr, y_tr)
    valid_pool = Pool(X_va, y_va)

    model_cb = CatBoostRegressor(**params_cb)
    model_cb.fit(
        train_pool,
        eval_set=valid_pool,
        use_best_model=True,
        verbose=False
        )

    pred_va = model_cb.predict(valid_pool)
    mse_cb = mean_squared_error(y_va, pred_va)
    rmse_cb = np.sqrt(mse_cb)

    print(f"[{tag}]  val RMSE = {rmse_cb:.4f}")
    return model_cb, rmse_cb

In [23]:
# ------------------------------------------------------------
# Light GBM
# ------------------------------------------------------------

def run_lgb(df_train, train_mask, val_mask, features,tag,params_lgb):
    """複数の検証パターンでXGBoostを回すための関数
    Parameters
    ----------
    df_train : 学習用 DataFrame（特徴量列 + 'price_actual'）
    df_test   : 検証用 〃
    features: 学習に使う列名リスト
    tag      : ログに出力するラベル文字列
    params_lgb   : modelに渡すハイパーパラメータ辞書

    Returns
    -------
    model : 学習済み
    rmse  : 検証 RMSE
    """
    X_tr = df_train.loc[train_mask, features]
    y_tr = df_train.loc[train_mask, "price_actual"]
    X_va = df_train.loc[val_mask,  features]
    y_va = df_train.loc[val_mask,  "price_actual"]

    model_lgb = lgb.LGBMRegressor(**params_lgb)
    model_lgb.fit(
        X_tr, y_tr,
        eval_set=[(X_va, y_va)],
        eval_metric="rmse",
        callbacks=[lgb.early_stopping(100),
                    lgb.log_evaluation(50)]
    )

    pred_va = model_lgb.predict(X_va)
    mse = mean_squared_error(y_va, pred_va)
    rmse_lgb = np.sqrt(mse)
    print(f"[{tag}]  val RMSE = {rmse_lgb:.4f}")
    return model_lgb, rmse_lgb


In [24]:
"""
# ------------------------------------------------------------------
# 複数モデルの学習→結果、モデル3が良さそう6/19
# ------------------------------------------------------------------

# ------------------------------------------------------------------
# ① Train: 2015-01-01〜2016-12-31 / Val: 2017-01-01〜2017-12-31
# ------------------------------------------------------------------
mask_15_16 = df_train["time"].between("2015-01-01", "2016-12-31")
mask_17    = df_train["time"].between("2017-01-01", "2017-12-31")

model_1, rmse_1 = run_xgb(df_train,mask_15_16, mask_17, features, "Pattern-1_xb", params_xgb)

# ------------------------------------------------------------------
# ③ Train: 2015-01-01〜2017-09-30 / Val: 2017-10-01〜12-31
# ------------------------------------------------------------------
mask_tr_3 = df_train["time"].between("2015-01-01", "2017-09-30")
mask_va_3 = df_train["time"].between("2017-10-01", "2017-12-31")

model_3, rmse_3 = run_xgb(df_train,mask_tr_3, mask_va_3,features,"Pattern-3_xb",params_xgb)
model_cb_3, rmse_cb_3 = run_cb(df_train,mask_tr_3, mask_va_3,features,"Pattern-3_cb",params_cb)
model_lgb_3, rmse_lgb_3 = run_lgb(df_train,mask_tr_3, mask_va_3,features,"Pattern-3_lgb",params_lgb)
"""

# ------------------------------------------------------------------
# ④ Train: 2015-01-01〜2016-12-31（2015-08-01～2016-03-31を除く） / Val: 2017-01-01〜2017-12-31
# ------------------------------------------------------------------
# まず全期間（2015-01-01〜2016-12-31）のマスクを作成
mask_whole = df_train["time"].between("2015-01-01", "2016-12-31")

# カットしたい区間（2015-08-01〜2016-03-31）のマスク
mask_cut = df_train["time"].between("2015-08-01", "2016-03-31")

# “全期間” − “カット区間” ＝ 使いたい期間
mask_15_16 = mask_whole & ~mask_cut
mask_17    = df_train["time"].between("2017-01-01", "2017-12-31")

model_4, rmse_4 = run_xgb(df_train,mask_15_16, mask_17, features, "Pattern-4_xb", params_xgb)
model_cb_4, rmse_cb_4 = run_cb(df_train,mask_15_16, mask_17,features,"Pattern-4_cb",params_cb)
model_lgb_4, rmse_lgb_4 = run_lgb(df_train,mask_15_16, mask_17,features,"Pattern-4_lgb",params_lgb)

[Pattern-4_xb]  val RMSE = 4.0252
[Pattern-4_cb]  val RMSE = 3.4059
[LightGBM] [Info] Total Bins 6474
[LightGBM] [Info] Number of data points in the train set: 11497, number of used features: 37
[LightGBM] [Info] Start training from score 54.647754
Training until validation scores don't improve for 100 rounds
[50]	valid_0's rmse: 3.86428
[100]	valid_0's rmse: 3.30858
[150]	valid_0's rmse: 3.22444
[200]	valid_0's rmse: 3.19715
[250]	valid_0's rmse: 3.17353
[300]	valid_0's rmse: 3.16667
[350]	valid_0's rmse: 3.1578
[400]	valid_0's rmse: 3.15208
[450]	valid_0's rmse: 3.15347
Early stopping, best iteration is:
[375]	valid_0's rmse: 3.15086
[Pattern-4_lgb]  val RMSE = 3.1509


### 逐次予測のアンサンブル

In [27]:
from collections import deque
# ------------------------------------------------------------------------------
# 前提：以下変数が定義済み
# df_train, df_test          # pandas.DataFrame
# features                   # 予測に使う特徴量リスト
# params_xgb, params_lgb, params_cb   # 各モデルのハイパーパラメータ dict
# rmse_4, rmse_lgb_4, rmse_cb_4       # Pattern-4 の検証で得た RMSE
# file_path                  # sample_submit.csv へのパス頭
# ------------------------------------------------------------------------------

# 1) 全データ再学習
mask_full = (
    df_train["time"].between("2015-01-01", "2015-07-31") |
    df_train["time"].between("2016-04-01", "2016-12-31")
)
X_full = df_train.loc[mask_full, features]
y_full = df_train.loc[mask_full, "price_actual"]

model_full_xgb = xgb.XGBRegressor(**params_xgb).fit(X_full, y_full, verbose=False)
model_full_lgb = lgb.LGBMRegressor(**params_lgb).fit(X_full, y_full)
model_full_cb  = CatBoostRegressor(**params_cb).fit(X_full, y_full, verbose=False)

# 2) 重み計算（逆数正規化）
w_xgb = 1.0 / rmse_4
w_lgb = 1.0 / rmse_lgb_4
w_cb  = 1.0 / rmse_cb_4
w_sum = w_xgb + w_lgb + w_cb
w_xgb, w_lgb, w_cb = w_xgb/w_sum, w_lgb/w_sum, w_cb/w_sum

# 3) テストを時間順にソート
df_test_rec = df_test.sort_values("time").reset_index(drop=True)

# 4) バッファ初期化
last_price = df_train["price_actual"].iloc[-1]
buf24  = deque(df_train["price_actual"].tail(24),  maxlen=24)
buf168 = deque(df_train["price_actual"].tail(168), maxlen=168)

# 5) 逐次予測＋アンサンブル
preds_recursive = []
for _, row in df_test_rec.iterrows():
    row_feats = row.copy()
    row_feats["price_actual_lag1"]   = last_price
    row_feats["price_actual_lag24"]  = buf24[-1]
    row_feats["price_actual_lag168"] = buf168[-1]

    X_row = row_feats[features].astype(float).to_frame().T
    y1 = model_full_xgb.predict(X_row)[0]
    y2 = model_full_lgb.predict(X_row)[0]
    y3 = model_full_cb.predict(X_row)[0]

    y_ens = w_xgb*y1 + w_lgb*y2 + w_cb*y3
    preds_recursive.append(y_ens)

    last_price = y_ens
    buf24.append(y_ens)
    buf168.append(y_ens)



[LightGBM] [Info] Total Bins 6474
[LightGBM] [Info] Number of data points in the train set: 11451, number of used features: 37
[LightGBM] [Info] Start training from score 54.664645


In [28]:
# 6) 提出ファイル作成
df_sub = pd.read_csv(file_path + 'sample_submit.csv', header=None)
submission = pd.DataFrame({
    'time': df_sub.iloc[:, 0],
    'pred': preds_recursive    # 8760個の予測値
})
submission.to_csv("submission_recursive.csv", index=False, header=False)