In [2]:

# 0. Google Drive  &  library installs
!pip install optuna

from google.colab import drive
import os, json, joblib, warnings, math, holidays, optuna
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb

from sklearn.metrics        import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from datetime                import datetime

import wandb
warnings.filterwarnings("ignore")


# 1.  Drive + env setup

drive.mount('/content/drive')
%cd /content/drive/MyDrive/ML_FInal_Project
!pip install -q wandb xgboost scikit-learn pandas numpy matplotlib holidays optuna
os.makedirs("models", exist_ok=True)

SEED = 42
np.random.seed(SEED)

# 2.  WandB initialisation

wandb.login()
run = wandb.init(
    project = "walmart-sales-forecasting",
    entity  = "lkata22-free-university-of-tbilisi-",
    name    = "XGBoost_train_test_v2",
    group   = "XGBoost",
    config  = {
        "random_seed": SEED,
        "n_estimators": 2500,
        "learning_rate": 0.02,
        "max_depth": 8,
        "subsample": 0.75,
        "colsample_bytree": 0.75,
        "gamma": 0.1,
        "min_child_weight": 5,
        "reg_alpha": 0.1,
        "reg_lambda": 0.1,
        "early_stopping_rounds": 100,
        "eval_metric": "mae",
        "tree_method": "hist"
    }
)
config = wandb.config


# 3.  Data loading & merge

DATA_PATH = "data"
train    = pd.read_csv(f"{DATA_PATH}/train.csv")
features = pd.read_csv(f"{DATA_PATH}/features.csv")
stores   = pd.read_csv(f"{DATA_PATH}/stores.csv")

raw_df = (train
          .merge(features, on=["Store","Date","IsHoliday"], how="left")
          .merge(stores,   on="Store",                how="left")
         )

# 4.  Feature engineering helper
us_holidays = holidays.US()

def create_features(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    # core date parts
    df["Date"] = pd.to_datetime(df["Date"])
    df["Year"]  = df.Date.dt.year
    df["Month"] = df.Date.dt.month
    df["Week"]  = df.Date.dt.isocalendar().week.astype(int)
    df["Day"]   = df.Date.dt.day
    # holiday flags
    df["IsHoliday"]    = df["IsHoliday"].astype(int)
    df["IsUSHoliday"] = df["Date"].isin(us_holidays).astype(int)
    # type one-hot
    df = pd.get_dummies(df, columns=["Type"], drop_first=True)
    # ensure chronological order for lags
    df.sort_values(["Store","Dept","Date"], inplace=True)
    # simple lags + rolling means
    for lag in [4, 52]:
        df[f"lag_{lag}"] = df.groupby(["Store","Dept"])["Weekly_Sales"].shift(lag)
    for win in [4, 52]:
        df[f"roll_mean_{win}"] = (df
             .groupby(["Store","Dept"])["Weekly_Sales"]
             .shift(1)
             .rolling(window=win, min_periods=1).mean())
    return df.fillna(0)

# apply engineering
df = create_features(raw_df)
TARGET  = "Weekly_Sales"
DROP    = ["Date", TARGET]
FEATURES= [c for c in df.columns if c not in DROP]


# 5.  Time-based 80/20 train-test split

cutoff = df["Date"].quantile(0.8)
train_df = df[df["Date"] <= cutoff]
test_df  = df[df["Date"] >  cutoff]

X_train, y_train = train_df[FEATURES], train_df[TARGET]
X_test,  y_test  = test_df[FEATURES],  test_df[TARGET]
holiday_test     = test_df["IsHoliday"].values


# 6.  XGBoost training with **live logging**

dtrain = xgb.DMatrix(X_train, label=y_train)
dvalid = xgb.DMatrix(X_test,  label=y_test)

params = {k: config[k] for k in [
    "learning_rate","max_depth","subsample","colsample_bytree",
    "gamma","min_child_weight","reg_alpha","reg_lambda",
    "tree_method","eval_metric"]}
params["objective"] = "reg:squarederror"


evals_result = {}
model = xgb.train(
    params,
    dtrain,
    num_boost_round=config.n_estimators,
    evals=[(dtrain,"train"),(dvalid,"valid")],
    early_stopping_rounds=config.early_stopping_rounds,
    evals_result=evals_result,
    verbose_eval=100
)


for i in range(len(evals_result["train"]["mae"])):
    wandb.log({
        "train_mae":  evals_result["train"]["mae"][i],
        "valid_mae":  evals_result["valid"]["mae"][i],
        "step":       i
    })


# 7.  Final evaluation
ypred = model.predict(dvalid)
mae  = mean_absolute_error(y_test, ypred)
rmse = math.sqrt(mean_squared_error(y_test, ypred))
weights = np.where(holiday_test==1, 5, 1)
wmae = np.sum(weights * np.abs(y_test - ypred)) / weights.sum()

wandb.log({"mae_final":mae, "rmse_final":rmse, "wmae_final":wmae})
print(f"MAE:{mae:.2f}  RMSE:{rmse:.2f}  WMAE:{wmae:.2f}")

# feature importance image
fig, ax = plt.subplots(figsize=(10,12))
xgb.plot_importance(model, max_num_features=40, ax=ax)
plt.tight_layout()
wandb.log({"feature_importance": wandb.Image(fig)})
plt.close(fig)


# 8.  Save artefacts

model_name = f"xgb_split_{datetime.now().strftime('%Y%m%d_%H%M')}.json"
model.save_model(os.path.join("models", model_name))
art = wandb.Artifact("xgb_split_model", type="model")
art.add_file(os.path.join("models", model_name))
wandb.log_artifact(art)

wandb.finish()
print("✅  Run complete - metrics & curves logged to WandB.")

Collecting optuna
  Downloading optuna-4.4.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.16.3-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.4.0-py3-none-any.whl (395 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m395.9/395.9 kB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.16.3-py3-none-any.whl (246 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m246.9/246.9 kB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, optuna
Successfully installed alembic-1.16.3 colorlog-6.9.0 optuna-4.4.0
Mounted at /content/drive
/content/drive/MyDrive/ML_FInal_Project


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mlkata22[0m ([33mlkata22-free-university-of-tbilisi-[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


[0]	train-mae:14907.10032	valid-mae:14804.28869
[100]	train-mae:3029.80180	valid-mae:2521.02915
[200]	train-mae:1853.33448	valid-mae:1439.26116
[300]	train-mae:1667.26870	valid-mae:1390.59631
[385]	train-mae:1590.81699	valid-mae:1398.29752
MAE:1398.30  RMSE:3074.57  WMAE:1426.32


0,1
mae_final,▁
rmse_final,▁
step,▁▁▁▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇▇███
train_mae,█▇▇▆▄▄▃▃▃▃▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
valid_mae,█▆▅▅▄▄▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
wmae_final,▁

0,1
mae_final,1398.29752
rmse_final,3074.56863
step,385.0
train_mae,1590.81699
valid_mae,1398.29752
wmae_final,1426.31645


✅  Run complete - metrics & curves logged to WandB.
