In [430]:
# ---------------------
# 관련 링크
# ---------------------
# https://www.kaggle.com/datasets/atomicd/retail-store-inventory-and-demand-forecasting

In [431]:
# ---------------------
# 임포트 & 전역변수
# ---------------------
import pandas as pd
import numpy as np
import plot

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error

RANDOM_SEED = 100

def print_pred(y_test, y_pred):
    """Print regression error metrics for a set of predictions.

    Computes MAE, MSE, RMSE (square root of the MSE) and MAPE between the
    ground-truth values and the predictions, then writes them to stdout,
    one metric per line. MAPE is shown as a percentage.

    Args:
        y_test: Ground-truth target values.
        y_pred: Predicted target values, aligned with ``y_test``.

    Returns:
        None. The metrics are only printed.
    """
    abs_err = mean_absolute_error(y_test, y_pred)
    sq_err = mean_squared_error(y_test, y_pred)
    root_sq_err = np.sqrt(sq_err)
    pct_err = mean_absolute_percentage_error(y_test, y_pred)

    report_lines = [
        f"MAE: {abs_err:.4f}",
        f"MSE: {sq_err:.4f}",
        f"RMSE: {root_sq_err:.4f}",
        f"MAPE: {pct_err*100:.2f}%",
    ]
    # A single print of the joined lines yields the same stdout as four prints.
    print("\n".join(report_lines))


In [432]:
# ---------------------
# Load the data and drop unneeded columns
# ---------------------
# Read the sales CSV and discard the "Date" column in one expression, so `df`
# is rebuilt from the file every time this cell runs.
df = pd.read_csv('./dataset/sales_data.csv').drop(columns=["Date"])

In [433]:
# ---------------------
# Missing-value check
# ---------------------
# Number of missing cells per column, largest count first (displayed as the
# cell's output).
df.isnull().sum().sort_values(ascending=False)

Store ID              0
Product ID            0
Category              0
Region                0
Inventory Level       0
Units Sold            0
Units Ordered         0
Price                 0
Discount              0
Weather Condition     0
Promotion             0
Competitor Pricing    0
Seasonality           0
Epidemic              0
Demand                0
dtype: int64

In [434]:
# ---------------------
# Ordinal encoding (Seasonality)
# ---------------------
order = {"Spring": 0, "Summer": 1, "Autumn": 2, "Winter": 3}
# Re-running this cell must not map already-encoded integers to NaN, so the
# mapping is skipped when every value is already one of the target codes.
if not df["Seasonality"].isin(list(order.values())).all():
    season_codes = df["Seasonality"].map(order)
    unmapped_mask = season_codes.isna()
    if unmapped_mask.any():
        # Fail loudly instead of silently feeding NaN seasons to the models.
        unknown = sorted(df.loc[unmapped_mask, "Seasonality"].astype(str).unique())
        raise ValueError(f"Unmapped Seasonality values: {unknown}")
    df["Seasonality"] = season_codes

# ---------------------
# Label encoding (categorical columns)
# ---------------------
# Each column is replaced by integer codes from a fresh LabelEncoder and then
# cast to pandas "category" dtype.
cat_cols = ["Product ID", "Store ID", "Region", "Category", "Weather Condition"]
for col in cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    df[col] = df[col].astype("category")

In [435]:
# ---------------------
# Build the train / test sets
# ---------------------
# "Demand" is the regression target; every other remaining column is a feature.
y = df["Demand"]
X = df.drop(columns=["Demand"])

# Hold out 20% of the rows for testing; the split is seeded by RANDOM_SEED.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=RANDOM_SEED,
)

In [None]:
# ============================================
# 3.1 CatBoost
# ============================================
import catboost

# CatBoost regressor trained on the GPU. The columns listed in `cat_cols` are
# declared as categorical features. fit() receives no eval_set, so only the
# training ("learn") metric is logged, once every 100 iterations.
cat_model = catboost.CatBoostRegressor(
    # boosting schedule
    iterations=4000,
    learning_rate=0.05,
    # tree shape and regularisation
    depth=8,
    l2_leaf_reg=5.0,
    border_count=128,
    bagging_temperature=0.5,
    # data / runtime settings
    cat_features=cat_cols,
    task_type="GPU",
    random_state=RANDOM_SEED,
    verbose=100,
)
cat_model.fit(X_train, y_train)

# Predictions on the held-out test features.
cat_y_pred = cat_model.predict(X_test)

0:	learn: 45.1100146	total: 54.2ms	remaining: 3m 36s
100:	learn: 17.6233519	total: 5.68s	remaining: 3m 39s
200:	learn: 16.0391175	total: 11.6s	remaining: 3m 39s
300:	learn: 14.9360413	total: 17.5s	remaining: 3m 35s
400:	learn: 14.1346526	total: 23.3s	remaining: 3m 29s
500:	learn: 13.4231916	total: 28.9s	remaining: 3m 21s
600:	learn: 12.8094621	total: 34.3s	remaining: 3m 14s
700:	learn: 12.2775067	total: 40.2s	remaining: 3m 9s
800:	learn: 11.7995686	total: 45.6s	remaining: 3m 2s
900:	learn: 11.3251837	total: 50.6s	remaining: 2m 54s
1000:	learn: 10.9074678	total: 55.7s	remaining: 2m 46s
1100:	learn: 10.5178400	total: 1m	remaining: 2m 40s
1200:	learn: 10.1513923	total: 1m 5s	remaining: 2m 31s
1300:	learn: 9.8212026	total: 1m 10s	remaining: 2m 26s
1400:	learn: 9.5153620	total: 1m 15s	remaining: 2m 20s
1500:	learn: 9.2413126	total: 1m 20s	remaining: 2m 13s
1600:	learn: 8.9826354	total: 1m 25s	remaining: 2m 8s
1700:	learn: 8.7169932	total: 1m 30s	remaining: 2m 2s
1800:	learn: 8.4811822	total

In [None]:
# ============================================
# 3.2 XGBoost
# ============================================
import xgboost

# XGBoost regressor using the histogram tree method on a CUDA device.
# enable_categorical=True is set because the encoded feature columns carry
# pandas "category" dtype. fit() receives no eval_set and no early stopping is
# configured, so all 4000 boosting rounds are trained.
xgb_model = xgboost.XGBRegressor(
    # boosting schedule
    n_estimators=4000,
    learning_rate=0.05,
    # tree shape and sampling
    max_depth=6,
    min_child_weight=2,
    subsample=0.8,
    colsample_bytree=0.8,
    # data / runtime settings
    enable_categorical=True,
    tree_method="hist",
    device="cuda",
    n_jobs=-1,
    random_state=RANDOM_SEED,
)
xgb_model.fit(X_train, y_train)

# Predictions on the held-out test features.
xgb_y_pred = xgb_model.predict(X_test)

[0]	validation_0-rmse:45.57490
[1]	validation_0-rmse:43.85332
[2]	validation_0-rmse:42.23527
[3]	validation_0-rmse:40.72003
[4]	validation_0-rmse:39.73114
[5]	validation_0-rmse:38.43176
[6]	validation_0-rmse:37.10454
[7]	validation_0-rmse:35.79108
[8]	validation_0-rmse:34.62383
[9]	validation_0-rmse:33.53275
[10]	validation_0-rmse:32.49887
[11]	validation_0-rmse:31.47224
[12]	validation_0-rmse:30.73695
[13]	validation_0-rmse:29.81009
[14]	validation_0-rmse:28.94518
[15]	validation_0-rmse:28.18991
[16]	validation_0-rmse:27.61223
[17]	validation_0-rmse:26.89787
[18]	validation_0-rmse:26.23240
[19]	validation_0-rmse:25.69564
[20]	validation_0-rmse:25.10988
[21]	validation_0-rmse:24.60337
[22]	validation_0-rmse:24.13495
[23]	validation_0-rmse:23.70479
[24]	validation_0-rmse:23.26235
[25]	validation_0-rmse:22.88957
[26]	validation_0-rmse:22.57962
[27]	validation_0-rmse:22.25404
[28]	validation_0-rmse:21.91608
[29]	validation_0-rmse:21.67195
[30]	validation_0-rmse:21.37576
[31]	validation_0-

In [None]:
# ============================================
# 3.3 LightGBM
# ============================================
import lightgbm

# LightGBM regressor. The sampling and regularisation settings use the
# scikit-learn wrapper's own parameter names rather than the native aliases
# (min_data_in_leaf, feature_fraction, bagging_fraction, bagging_freq,
# lambda_l2). Passing the aliases alongside the wrapper defaults makes
# LightGBM warn that one of the two values "will be ignored"; the
# configuration itself is unchanged.
lgbm_model = lightgbm.LGBMRegressor(
    objective="regression",
    metric="rmse",
    learning_rate=0.05,
    n_estimators=4000,
    num_leaves=63,
    max_depth=-1,            # -1 = no depth limit
    min_child_samples=50,    # alias: min_data_in_leaf
    colsample_bytree=0.8,    # alias: feature_fraction
    subsample=0.8,           # alias: bagging_fraction
    subsample_freq=1,        # alias: bagging_freq
    reg_lambda=2.0,          # alias: lambda_l2

    random_state=RANDOM_SEED
)
# No eval_set is passed, so all 4000 trees are fitted.
lgbm_model.fit(X_train, y_train)

# Predictions on the held-out test features.
lgbm_y_pred = lgbm_model.predict(X_test)

Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[4000]	valid_0's rmse: 6.78538
MAE: 4.6232
MSE: 46.0413
RMSE: 6.7854
MAPE: 5.51%


In [None]:
# Collect the three models' test-set predictions into one DataFrame and pass
# it to the project-local `plot` helper.
# NOTE(review): the predictions are ordered CatBoost / XGBoost / LightGBM,
# while `models` below is ordered LightGBM / XGBoost / CatBoost — confirm that
# plot.performance_table does not assume a matching order.
# NOTE(review): only raw predictions are passed (no y_test); presumably
# plot.performance_table obtains the ground truth some other way — verify.
results = [cat_y_pred, xgb_y_pred, lgbm_y_pred]
df_result = pd.DataFrame(data=results)
plot.performance_table(df_result)

# (display name, fitted estimator) pairs for the feature-importance plot.
models = list(
    zip(
        ("LightGBM", "XGBoost", "CatBoost"),
        (lgbm_model, xgb_model, cat_model),
    )
)
plot.feature_importance(models, X_train.columns)

KeyboardInterrupt: 