In [None]:
!pip install lightgbm pandas scikit-learn --quiet

import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from tqdm import tqdm

# (1) Build dummy data (in practice this would be loaded from a CSV or similar).
# One row per (store, month); rows are ordered store-major, month-minor.
n_stores = 10000
n_months = 24

# Seeded generator so the synthetic dataset, and therefore every downstream
# result, is reproducible from run to run.
rng = np.random.default_rng(42)

# Each store gets one region and one industry that stay fixed across all months.
store_region = rng.choice(['East', 'West', 'North', 'South'], size=n_stores)
store_industry = rng.choice(['FastFood', 'Steak', 'Cafe'], size=n_stores)

# Expand to one entry per (store, month): store ids repeat, month ids cycle.
store_ids = np.repeat(np.arange(n_stores), n_months)
month_ids = np.tile(np.arange(n_months), n_stores)

# sales = base level + small per-store offset + linear monthly trend
#         + Gaussian noise (mean 0, standard deviation 10)
sales = 100 + store_ids * 0.01 + month_ids * 1.2 + rng.normal(0, 10, size=store_ids.size)

df = pd.DataFrame({
    "store_id": store_ids,
    "month_idx": month_ids,
    "sales": sales,
    "region": np.repeat(store_region, n_months),
    "industry": np.repeat(store_industry, n_months),
})

# (2) Label-encode the categorical columns, replacing each one in place with
#     the integer codes produced by a fresh LabelEncoder.
categorical_cols = ('region', 'industry')
for name in categorical_cols:
    encoder = LabelEncoder()
    df[name] = encoder.fit_transform(df[name])

# (3) Lag features: each store's sales from 1, 2 and 3 months earlier.
# Rows are ordered by store and then by month first, so that shifting within a
# store's group steps back exactly one month per position.
df = df.sort_values(by=['store_id', 'month_idx'])
sales_by_store = df.groupby('store_id')['sales']
for months_back in (1, 2, 3):
    df[f'sales_lag_{months_back}'] = sales_by_store.shift(months_back)

# (4) Training rows: months 3-23. Earlier months cannot have all three lag
#     values, so they are excluded up front.
train = df[df['month_idx'] >= 3]

# (5) Fit the model on the rows that have no missing values.
features = ['region', 'industry', 'month_idx', 'sales_lag_1', 'sales_lag_2', 'sales_lag_3']
train_complete = train.dropna()
X_train = train_complete[features]
y_train = train_complete['sales']

model = lgb.LGBMRegressor(n_estimators=100, learning_rate=0.1)
model.fit(X_train, y_train)

# (6) Recursive forecast for months 24-35.
# Observed data ends at month 23, so from month 25 onward the months that the
# lag features refer to lie in the future themselves. Each month is therefore
# predicted in turn and its prediction is stored as that month's "sales", so
# later months can build their lags from it. Looking the lags up only in the
# observed data would leave them NaN and make every forecast from month 27 on
# identical.
# NOTE(review): the model never sees month_idx > 23 during training, so the
# trend is presumably carried only through the lag features - confirm this is
# acceptable for the real data.
last_observed_month = 23
forecast_horizon = 12
lags = (1, 2, 3)

# Per-store static attributes, taken from the last observed month.
store_attrs = df[df['month_idx'] == last_observed_month][['store_id', 'region', 'industry']]

# month_idx -> frame of (store_id, sales). Seeded with the observed months the
# first forecast needs; extended with predictions as the loop advances.
sales_by_month = {
    m: df.loc[df['month_idx'] == m, ['store_id', 'sales']]
    for m in range(last_observed_month + 1 - max(lags), last_observed_month + 1)
}

test_data = []
for offset in range(forecast_horizon):  # forecast months: 24-35
    month_idx = last_observed_month + 1 + offset

    base = store_attrs.copy()
    base['month_idx'] = month_idx
    for lag in lags:
        lagged = sales_by_month[month_idx - lag].rename(columns={'sales': f'sales_lag_{lag}'})
        # Left merge keeps every store even if it has no value for that month.
        base = base.merge(lagged, on='store_id', how='left')

    base['predicted_sales'] = model.predict(base[features])

    # Feed this month's prediction back in as the lag source for later months.
    sales_by_month[month_idx] = (
        base[['store_id', 'predicted_sales']].rename(columns={'predicted_sales': 'sales'})
    )
    test_data.append(base)

test = pd.concat(test_data, ignore_index=True)
X_test = test[features]

# (7) Show the forecasts for the stores whose store_id is below 5, ordered by
#     store and then by month.
first_stores = test.loc[test['store_id'] < 5]
test_result = first_stores.sort_values(by=['store_id', 'month_idx'])
display(test_result.loc[:, ['store_id', 'month_idx', 'predicted_sales']])

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006265 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 794
[LightGBM] [Info] Number of data points in the train set: 210000, number of used features: 6
[LightGBM] [Info] Start training from score 165.562496


Unnamed: 0,store_id,month_idx,predicted_sales
0,0,24,133.166594
10000,0,25,128.910966
20000,0,26,123.73332
30000,0,27,123.563468
40000,0,28,123.563468
50000,0,29,123.563468
60000,0,30,123.563468
70000,0,31,123.563468
80000,0,32,123.563468
90000,0,33,123.563468
