In [14]:
%load_ext autoreload
%autoreload 2

import function_list
from pimp import *

import lightgbm as lgb
from sklearn.metrics import mean_squared_log_error



The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import optuna
from sklearn.metrics import mean_squared_log_error

train = pd.read_csv('train.csv', parse_dates=['date'])
test = pd.read_csv('test.csv', parse_dates=['date'])
stores = pd.read_csv("stores.csv")
transactions = pd.read_csv("transactions.csv", parse_dates=["date"])
oil = pd.read_csv("oil.csv", parse_dates=["date"])
holidays = pd.read_csv("holidays_events.csv", parse_dates=["date"])

# ==================
# データの結合
# ==================
df = pd.concat([train, test], axis=0, sort=False)

# [変更点 1] category化を「merge前の仮」ではなく、merge後に確定させるためここでは残してもよいが、後で再度確定する
cat_cols = ["store_nbr", "family"]
for c in cat_cols:
    df[c] = df[c].astype("category")

df = df.merge(stores, on='store_nbr', how='left', validate="m:1")
df = df.merge(transactions, on=['date', 'store_nbr'], how='left', validate="m:1")
df = df.merge(oil, on='date', how='left', validate="m:1")

# 祝日を日単位で集約
holidays_day = holidays.groupby("date").size().reset_index(name="n_holidays")
df = df.merge(holidays_day, on="date", how="left", validate="m:1")
df['n_holidays'] = df['n_holidays'].fillna(0).astype(int)

# [変更点 2] mergeでobjectに戻ることがあるので、merge後にcategoryを「確定」させる（エラー予防）
for c in cat_cols:
    df[c] = df[c].astype("category")

# [変更点 3] transactions/oil の欠損を最低限埋める（LB不安定化の予防）
if "transactions" in df.columns:
    df["transactions"] = df["transactions"].fillna(0)

# oil列名は環境で違うので存在する数値列を前方補完（dcoilwtico等を想定）
oil_cols = [c for c in oil.columns if c != "date"]
for c in oil_cols:
    if c in df.columns:
        df[c] = df[c].ffill()

# ==================
# 特徴量作成
# ==================
df['weekday'] = df['date'].dt.weekday
df['month'] = df['date'].dt.month

# [変更点 4] lag作成の前に必ず時系列順にソート（shiftは行順依存なので必須）
df = df.sort_values(["store_nbr", "family", "date"]).reset_index(drop=True)

df['lag_7'] = df.groupby(['store_nbr', 'family'])['sales'].shift(7)
df['lag_14'] = df.groupby(['store_nbr', 'family'])['sales'].shift(14)

df["roll_7"] = (
    df.groupby(["store_nbr", "family"])["sales"]
      .transform(lambda s: s.shift(1).rolling(7, min_periods=1).mean())
)
df["roll_14"] = (
    df.groupby(["store_nbr", "family"])["sales"]
      .transform(lambda s: s.shift(1).rolling(14, min_periods=1).mean())
)

# =========================
# trainとtestを分割する
# =========================
cutoff = test['date'].min()
train_df = df[df['date'] < cutoff].copy()
test_df  = df[df['date'] >= cutoff].copy()

# =========================
# 直前専用 CV（2017/08/01-15）
# =========================
valid = train_df[
    (train_df["date"] >= "2017-08-01") &
    (train_df["date"] <  "2017-08-16")
].copy()

train_cv = train_df[train_df["date"] < "2017-08-01"].copy()

features = ["store_nbr", "family", "weekday", "month", "lag_7", "lag_14", "roll_7", "roll_14"]
cat_features = ["store_nbr", "family"]

# =========================
# RMSLE 評価関数（salesスケール）: sklearn API用 (y_true, y_pred)
# =========================
def rmsle_eval(y_true, y_pred):
    y_pred = np.maximum(y_pred, 0)
    val = np.sqrt(mean_squared_log_error(y_true, y_pred))
    return "rmsle", float(val), False

# =========================
# Optuna（単目的）
# =========================
def objective(trial):
    optuna_params = {
        "objective": "regression",
        "learning_rate": 0.05,
        "num_leaves": trial.suggest_int("num_leaves", 32, 128),
        "min_child_samples": trial.suggest_int("min_child_samples", 20, 200),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.7, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.7, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 10),
        "n_estimators": 600,
        "n_jobs": -1,
        "seed": 42,
    }

    model = lgb.LGBMRegressor(**optuna_params)
    model.fit(
        train_cv[features],
        train_cv["sales"],
        categorical_feature=cat_features,
        eval_set=[(valid[features], valid["sales"])],
        eval_metric=rmsle_eval,
        callbacks=[lgb.early_stopping(50, verbose=False)],
    )

    # [変更点 5] best_iteration_ を後段で使えるように保存
    trial.set_user_attr("best_iteration", int(model.best_iteration_))

    pred = model.predict(valid[features])
    score = np.sqrt(mean_squared_log_error(valid["sales"], np.maximum(pred, 0)))
    return float(score)

study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=20)

print("BEST RMSLE:", study.best_value)
print("BEST PARAMS:", study.best_params)

# =========================
# 最終モデル（全 train で再学習）
# =========================
# [変更点 6] best_iteration が保存されている前提で、最終のn_estimatorsに反映
best_iter = int(study.best_trial.user_attrs.get("best_iteration", 600))

main_params = {
    **study.best_params,
    "objective": "regression",
    "learning_rate": 0.05,
    "n_estimators": best_iter,
    "n_jobs": -1,
    "seed": 42,
}

main_model = lgb.LGBMRegressor(**main_params)
main_model.fit(
    train_df[features],
    train_df["sales"],
    categorical_feature=cat_features
)

# =========================
# 予測（id順厳守）
# =========================
test_df = test_df.sort_values("id").reset_index(drop=True)

pred = main_model.predict(test_df[features])
pred = np.maximum(pred, 0)

submission = pd.DataFrame({
    "id": test_df["id"],
    "sales": pred
})

submission.to_csv("submission.csv", index=False)
print("submission.csv saved")


  df['lag_7'] = df.groupby(['store_nbr', 'family'])['sales'].shift(7)
  df['lag_14'] = df.groupby(['store_nbr', 'family'])['sales'].shift(14)
  df.groupby(["store_nbr", "family"])["sales"]
  df.groupby(["store_nbr", "family"])["sales"]
[I 2025-12-17 03:21:33,099] A new study created in memory with name: no-name-fb1a66c3-e00d-4a4d-af3f-751004025a64


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018535 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1129
[LightGBM] [Info] Number of data points in the train set: 2974158, number of used features: 8
[LightGBM] [Info] Start training from score 356.810778


[I 2025-12-17 03:21:54,811] Trial 0 finished with value: 0.4693177638706481 and parameters: {'num_leaves': 126, 'min_child_samples': 132, 'feature_fraction': 0.7368401117089458, 'bagging_fraction': 0.7154528963079622, 'bagging_freq': 2}. Best is trial 0 with value: 0.4693177638706481.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.015308 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1129
[LightGBM] [Info] Number of data points in the train set: 2974158, number of used features: 8
[LightGBM] [Info] Start training from score 356.810778


[I 2025-12-17 03:22:07,695] Trial 1 finished with value: 0.48608922237467506 and parameters: {'num_leaves': 93, 'min_child_samples': 36, 'feature_fraction': 0.734473907895152, 'bagging_fraction': 0.867575865674876, 'bagging_freq': 6}. Best is trial 0 with value: 0.4693177638706481.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.021879 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1129
[LightGBM] [Info] Number of data points in the train set: 2974158, number of used features: 8
[LightGBM] [Info] Start training from score 356.810778


[I 2025-12-17 03:22:25,379] Trial 2 finished with value: 0.46913263054980675 and parameters: {'num_leaves': 115, 'min_child_samples': 169, 'feature_fraction': 0.7383480385101586, 'bagging_fraction': 0.828433306149776, 'bagging_freq': 1}. Best is trial 2 with value: 0.46913263054980675.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.015035 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1129
[LightGBM] [Info] Number of data points in the train set: 2974158, number of used features: 8
[LightGBM] [Info] Start training from score 356.810778


[I 2025-12-17 03:22:49,401] Trial 3 finished with value: 0.45306487467094575 and parameters: {'num_leaves': 111, 'min_child_samples': 43, 'feature_fraction': 0.8213789727376127, 'bagging_fraction': 0.9672848481931051, 'bagging_freq': 10}. Best is trial 3 with value: 0.45306487467094575.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.016114 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1129
[LightGBM] [Info] Number of data points in the train set: 2974158, number of used features: 8
[LightGBM] [Info] Start training from score 356.810778


[I 2025-12-17 03:23:06,998] Trial 4 finished with value: 0.4597290569211028 and parameters: {'num_leaves': 97, 'min_child_samples': 66, 'feature_fraction': 0.9852070069233853, 'bagging_fraction': 0.7977163657103846, 'bagging_freq': 10}. Best is trial 3 with value: 0.45306487467094575.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.017342 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1129
[LightGBM] [Info] Number of data points in the train set: 2974158, number of used features: 8
[LightGBM] [Info] Start training from score 356.810778


[I 2025-12-17 03:23:37,930] Trial 5 finished with value: 0.4704267985551505 and parameters: {'num_leaves': 86, 'min_child_samples': 114, 'feature_fraction': 0.7127902970041498, 'bagging_fraction': 0.9035272012742631, 'bagging_freq': 2}. Best is trial 3 with value: 0.45306487467094575.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014907 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1129
[LightGBM] [Info] Number of data points in the train set: 2974158, number of used features: 8
[LightGBM] [Info] Start training from score 356.810778


[I 2025-12-17 03:23:54,748] Trial 6 finished with value: 0.46878812136058745 and parameters: {'num_leaves': 100, 'min_child_samples': 22, 'feature_fraction': 0.8549975381879883, 'bagging_fraction': 0.9128748098109511, 'bagging_freq': 3}. Best is trial 3 with value: 0.45306487467094575.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.017323 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1129
[LightGBM] [Info] Number of data points in the train set: 2974158, number of used features: 8
[LightGBM] [Info] Start training from score 356.810778


[I 2025-12-17 03:24:12,351] Trial 7 finished with value: 0.45714201099930607 and parameters: {'num_leaves': 107, 'min_child_samples': 131, 'feature_fraction': 0.9060950742274031, 'bagging_fraction': 0.7038025664408022, 'bagging_freq': 9}. Best is trial 3 with value: 0.45306487467094575.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.015434 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1129
[LightGBM] [Info] Number of data points in the train set: 2974158, number of used features: 8
[LightGBM] [Info] Start training from score 356.810778


[I 2025-12-17 03:24:29,344] Trial 8 finished with value: 0.4459250923411441 and parameters: {'num_leaves': 104, 'min_child_samples': 109, 'feature_fraction': 0.9552119708346797, 'bagging_fraction': 0.973335460554759, 'bagging_freq': 3}. Best is trial 8 with value: 0.4459250923411441.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014399 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1129
[LightGBM] [Info] Number of data points in the train set: 2974158, number of used features: 8
[LightGBM] [Info] Start training from score 356.810778


[I 2025-12-17 03:24:44,017] Trial 9 finished with value: 0.4789497717578256 and parameters: {'num_leaves': 44, 'min_child_samples': 166, 'feature_fraction': 0.8399949790394697, 'bagging_fraction': 0.7527694548890875, 'bagging_freq': 9}. Best is trial 8 with value: 0.4459250923411441.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014823 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1129
[LightGBM] [Info] Number of data points in the train set: 2974158, number of used features: 8
[LightGBM] [Info] Start training from score 356.810778


[I 2025-12-17 03:25:02,198] Trial 10 finished with value: 0.45589167176852685 and parameters: {'num_leaves': 65, 'min_child_samples': 79, 'feature_fraction': 0.9946986585744714, 'bagging_fraction': 0.9802768403348979, 'bagging_freq': 5}. Best is trial 8 with value: 0.4459250923411441.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.016127 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1129
[LightGBM] [Info] Number of data points in the train set: 2974158, number of used features: 8
[LightGBM] [Info] Start training from score 356.810778


[I 2025-12-17 03:25:19,403] Trial 11 finished with value: 0.4617276375257517 and parameters: {'num_leaves': 70, 'min_child_samples': 74, 'feature_fraction': 0.8126853739863313, 'bagging_fraction': 0.9987540794505287, 'bagging_freq': 6}. Best is trial 8 with value: 0.4459250923411441.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014002 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1129
[LightGBM] [Info] Number of data points in the train set: 2974158, number of used features: 8
[LightGBM] [Info] Start training from score 356.810778


[I 2025-12-17 03:25:41,427] Trial 12 finished with value: 0.4488609574176982 and parameters: {'num_leaves': 128, 'min_child_samples': 191, 'feature_fraction': 0.9290372540285396, 'bagging_fraction': 0.9441936257061848, 'bagging_freq': 4}. Best is trial 8 with value: 0.4459250923411441.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.015425 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1129
[LightGBM] [Info] Number of data points in the train set: 2974158, number of used features: 8
[LightGBM] [Info] Start training from score 356.810778


[I 2025-12-17 03:26:02,602] Trial 13 finished with value: 0.4502757643700438 and parameters: {'num_leaves': 127, 'min_child_samples': 196, 'feature_fraction': 0.9333025728176877, 'bagging_fraction': 0.9336788127552208, 'bagging_freq': 4}. Best is trial 8 with value: 0.4459250923411441.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.019603 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1129
[LightGBM] [Info] Number of data points in the train set: 2974158, number of used features: 8
[LightGBM] [Info] Start training from score 356.810778


[I 2025-12-17 03:26:22,990] Trial 14 finished with value: 0.45544635521760823 and parameters: {'num_leaves': 118, 'min_child_samples': 104, 'feature_fraction': 0.9324101959210682, 'bagging_fraction': 0.9440998421961397, 'bagging_freq': 4}. Best is trial 8 with value: 0.4459250923411441.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014326 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1129
[LightGBM] [Info] Number of data points in the train set: 2974158, number of used features: 8
[LightGBM] [Info] Start training from score 356.810778


[I 2025-12-17 03:26:46,437] Trial 15 finished with value: 0.4683642159393217 and parameters: {'num_leaves': 77, 'min_child_samples': 155, 'feature_fraction': 0.9072797086749813, 'bagging_fraction': 0.8858442198479173, 'bagging_freq': 7}. Best is trial 8 with value: 0.4459250923411441.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.017828 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1129
[LightGBM] [Info] Number of data points in the train set: 2974158, number of used features: 8
[LightGBM] [Info] Start training from score 356.810778


[I 2025-12-17 03:27:06,021] Trial 16 finished with value: 0.4668068442174875 and parameters: {'num_leaves': 35, 'min_child_samples': 200, 'feature_fraction': 0.9604681716943192, 'bagging_fraction': 0.9513878196919019, 'bagging_freq': 4}. Best is trial 8 with value: 0.4459250923411441.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.015944 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1129
[LightGBM] [Info] Number of data points in the train set: 2974158, number of used features: 8
[LightGBM] [Info] Start training from score 356.810778


[I 2025-12-17 03:27:21,022] Trial 17 finished with value: 0.4570598016207605 and parameters: {'num_leaves': 128, 'min_child_samples': 99, 'feature_fraction': 0.8683860939652829, 'bagging_fraction': 0.8469057088360546, 'bagging_freq': 1}. Best is trial 8 with value: 0.4459250923411441.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014499 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1129
[LightGBM] [Info] Number of data points in the train set: 2974158, number of used features: 8
[LightGBM] [Info] Start training from score 356.810778


[I 2025-12-17 03:27:44,646] Trial 18 finished with value: 0.4540816294945279 and parameters: {'num_leaves': 106, 'min_child_samples': 130, 'feature_fraction': 0.8877283268665155, 'bagging_fraction': 0.9986359904304885, 'bagging_freq': 3}. Best is trial 8 with value: 0.4459250923411441.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.015355 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1129
[LightGBM] [Info] Number of data points in the train set: 2974158, number of used features: 8
[LightGBM] [Info] Start training from score 356.810778


[I 2025-12-17 03:28:05,661] Trial 19 finished with value: 0.45199228773567385 and parameters: {'num_leaves': 89, 'min_child_samples': 149, 'feature_fraction': 0.9605451610755226, 'bagging_fraction': 0.9252004677319888, 'bagging_freq': 7}. Best is trial 8 with value: 0.4459250923411441.


BEST RMSLE: 0.4459250923411441
BEST PARAMS: {'num_leaves': 104, 'min_child_samples': 109, 'feature_fraction': 0.9552119708346797, 'bagging_fraction': 0.973335460554759, 'bagging_freq': 3}
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.021162 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1129
[LightGBM] [Info] Number of data points in the train set: 3000888, number of used features: 8
[LightGBM] [Info] Start training from score 357.775749
submission.csv saved


特徴量を作る