In [None]:
# Mount Google Drive so the project folder is reachable from the Colab VM.
from google.colab import drive
drive.mount('/content/drive')
# Make the project folder the working directory; the relative "data/..." and
# "models/..." paths used later in the notebook resolve against it.
%cd /content/drive/MyDrive/ML_FInal_Project

# Install the notebook's third-party dependencies quietly (-q).
# NOTE(review): versions are not pinned, so results may drift between runs.
!pip install -q wandb lightgbm scikit-learn pandas numpy matplotlib holidays joblib

Mounted at /content/drive
/content/drive/.shortcut-targets-by-id/1h3JmMNvF7pLor34P-qm2FEkIev93euuf/ML_FInal_Project


In [None]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import joblib
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder
from google.colab import drive
import warnings
import os

In [None]:
# wandb.login()

# wandb.init(
#     project="walmart-sales-forecasting",
#     entity="lkata22-free-university-of-tbilisi-",
#     name="LightGBM_experiment_v4",
#     group="LightGBM"
# )

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33masurm22[0m ([33masurm22-free-university-of-tbilisi-6158[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
# wandb.log({
#     "mae": mae,
#     "rmse": rmse,
#     "wmae": wmae,
#     "regular_mae": regular_mae,
#     "holiday_mae": holiday_mae,
#     "holiday_ratio": holiday_mae / regular_mae,
#     "feature_importance": wandb.Table(dataframe=pd.DataFrame({
#         'feature': model.feature_name(),
#         'importance': model.feature_importance()
#     }).sort_values('importance', ascending=False))
# })

# train_pred = model.predict(X_train)
# val_pred = model.predict(X_valid)

# print(f"Train MAE: {mean_absolute_error(y_train, train_pred):.2f}")
# print(f"Valid MAE: {mean_absolute_error(y_valid, val_pred):.2f}")
# print(f"Gap: {(mean_absolute_error(y_train, train_pred) - mean_absolute_error(y_valid, val_pred)):.2f}")

# joblib.dump(model, "models/lightgbm_model.pkl")

# artifact = wandb.Artifact(
#     name=f"lightgbm-model-{wandb.run.id}",
#     type="model",
#     metadata={
#         "features": list(X_train.columns),
#         "num_features": len(X_train.columns),
#         "categorical_features": list(X_train.select_dtypes('category').columns)
#     }
# )
# artifact.add_file("models/lightgbm_model.pkl")
# wandb.log_artifact(artifact)
# wandb.finish()

In [None]:
DATA_PATH = "data"
# Length of the hold-out window taken from the end of the training period.
VALIDATION_WEEKS = 30

# --- Load the raw tables -----------------------------------------------------
train_raw = pd.read_csv(f"{DATA_PATH}/train.csv")
test_raw = pd.read_csv(f"{DATA_PATH}/test.csv")
features_df_raw = pd.read_csv(f"{DATA_PATH}/features.csv")
stores_df_raw = pd.read_csv(f"{DATA_PATH}/stores.csv")

# --- Join store/week features and store metadata onto train and test ---------
# 'IsHoliday' is dropped from the features table before merging so the merged
# frame keeps a single 'IsHoliday' column (the one coming from train/test).
merged_train_df = pd.merge(
    train_raw,
    features_df_raw.drop('IsHoliday', axis=1),
    on=['Store', 'Date'], how='left'
)
df_full_train = pd.merge(merged_train_df, stores_df_raw, on=['Store'], how='left')

merged_test_df_initial = pd.merge(
    test_raw,
    features_df_raw.drop('IsHoliday', axis=1),
    on=['Store', 'Date'], how='left'
)
df_full_test = pd.merge(merged_test_df_initial, stores_df_raw, on=['Store'], how='left')

# Parse dates once so that the time-based split and the Id construction for the
# submission can both rely on a datetime column.
df_full_train['Date'] = pd.to_datetime(df_full_train['Date'])
df_full_test['Date'] = pd.to_datetime(df_full_test['Date'])

# --- Time-based train / validation split --------------------------------------
max_date = df_full_train['Date'].max()
cutoff_date = max_date - pd.Timedelta(weeks=VALIDATION_WEEKS)

train_df_split = df_full_train[df_full_train['Date'] <= cutoff_date].copy()
val_df_split   = df_full_train[df_full_train['Date'] >  cutoff_date].copy()

# Separate the target and drop the optional 'Id' column.
# 'Date' is deliberately KEPT: WalmartPreprocessor.transform derives the
# calendar features (Year, Month, Week, ...) from it and then drops it itself.
# Removing 'Date' here would make those features constant zeros during
# training while the test frame (which still has 'Date') gets real values.
X_train = train_df_split.drop(columns=['Weekly_Sales']).drop(columns=['Id'], errors='ignore')
y_train = train_df_split['Weekly_Sales']
X_val   = val_df_split.drop(columns=['Weekly_Sales']).drop(columns=['Id'], errors='ignore')
y_val   = val_df_split['Weekly_Sales']

# Independent copies handed to the preprocessing / prediction pipeline.
X_train_for_pipeline = X_train.copy()
X_val_for_pipeline   = X_val.copy()


class WalmartPreprocessor(BaseEstimator, TransformerMixin):
    """Feature engineering for the merged Walmart sales frame.

    ``fit`` learns, from the rows it is given:

    * the mean ``Weekly_Sales`` per (Store, Dept), per Store and per Dept, plus
      the global mean used as a fallback (only when ``y`` is supplied);
    * one ``LabelEncoder`` for each of ``Store``, ``Dept`` and ``Type``;
    * the median of every numeric column, used for imputation.

    ``transform`` adds calendar features when a ``Date`` column is present,
    imputes numeric columns, adds the learned sales aggregates and holiday
    interaction terms, label-encodes the categoricals and, when
    ``target_columns`` is set, returns exactly those columns in that order.

    Parameters
    ----------
    target_columns : list of str or None
        Columns (and their order) to return from ``transform``. Columns that
        could not be produced are filled with ``0.0``. ``None`` returns every
        column produced.
    """

    CATEGORICAL_COLUMNS = ('Store', 'Dept', 'Type')
    NUMERIC_COLUMNS = (
        'Temperature', 'Fuel_Price', 'CPI', 'Unemployment', 'Size',
        'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5',
    )
    INTEGER_OUTPUT_COLUMNS = (
        'Store', 'Dept', 'Type', 'Year', 'Month', 'Week', 'DayOfYear', 'Quarter',
    )
    # Label reserved for missing / never-seen categories. It is always part of
    # every fitted encoder so that transform() can fall back to it.
    UNKNOWN_LABEL = 'unknown'

    def __init__(self, target_columns=None):
        self.label_encoders = {}
        self.store_dept_means = {}
        self.store_means = {}
        self.dept_means = {}
        self.global_mean = 0.0
        self.numerical_medians = {}
        self.target_columns = target_columns

    @staticmethod
    def _string_keys(series, missing_label):
        """Return ``series`` as strings with missing values set to ``missing_label``.

        The mask is taken from the *original* series: ``astype(str)`` turns NaN
        into the literal text ``'nan'``, so a ``fillna`` applied afterwards
        would never match anything.
        """
        return series.astype(str).mask(series.isna(), missing_label)

    def fit(self, X, y=None):
        """Learn sales aggregates, label encoders and imputation medians.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain ``Store`` and ``Dept`` when ``y`` is given.
        y : pandas.Series or array-like, optional
            ``Weekly_Sales`` for the rows of ``X``. Without it the sales
            aggregates keep their previous (initially empty) values.

        Returns
        -------
        self
        """
        Xc = X.copy()
        if y is not None:
            self.global_mean = y.mean()
            sales = pd.DataFrame({
                'Store': self._string_keys(Xc['Store'], 'unknown_store'),
                'Dept': self._string_keys(Xc['Dept'], 'unknown_dept'),
            })
            sales['Weekly_Sales'] = y
            self.store_dept_means = sales.groupby(['Store', 'Dept'])['Weekly_Sales'].mean().to_dict()
            self.store_means = sales.groupby('Store')['Weekly_Sales'].mean().to_dict()
            self.dept_means = sales.groupby('Dept')['Weekly_Sales'].mean().to_dict()

        # Fit label encoders. UNKNOWN_LABEL is always registered as a class;
        # otherwise LabelEncoder.transform raises ValueError the first time
        # transform() meets a category that was absent from the training data.
        for col in self.CATEGORICAL_COLUMNS:
            labels = {self.UNKNOWN_LABEL}
            if col in Xc:
                labels.update(self._string_keys(Xc[col], self.UNKNOWN_LABEL))
            encoder = LabelEncoder()
            encoder.fit(sorted(labels))
            self.label_encoders[col] = encoder

        # Fit medians; columns that are absent or entirely missing impute to 0.0.
        for col in self.NUMERIC_COLUMNS:
            if col in Xc and not Xc[col].isnull().all():
                self.numerical_medians[col] = Xc[col].median()
            else:
                self.numerical_medians[col] = 0.0
        return self

    def transform(self, X):
        """Return the engineered feature frame for ``X`` (``X`` is not modified).

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain ``Store`` and ``Dept``. ``Date``, ``IsHoliday``,
            ``Type`` and the numeric columns are optional.

        Returns
        -------
        pandas.DataFrame
            Same index as ``X``. Restricted to ``target_columns`` when set.
        """
        Xc = X.copy()
        self._add_date_features(Xc)
        self._impute_numeric(Xc)
        self._add_sales_aggregates(Xc)
        self._add_holiday_interactions(Xc)
        self._encode_categoricals(Xc)

        # Drop target if present
        Xc.drop('Weekly_Sales', axis=1, errors='ignore', inplace=True)

        if self.target_columns:
            Xc = self._select_target_columns(Xc)
        return Xc

    def _add_date_features(self, Xc):
        """Replace ``Date`` (when present) with calendar and cyclical features."""
        if 'Date' not in Xc:
            return
        dates = pd.to_datetime(Xc['Date'])
        Xc['Year'] = dates.dt.year
        Xc['Month'] = dates.dt.month
        Xc['Week'] = dates.dt.isocalendar().week.astype(int)
        Xc['DayOfYear'] = dates.dt.dayofyear
        Xc['Quarter'] = dates.dt.quarter
        Xc['Month_sin'] = np.sin(2 * np.pi * Xc['Month'] / 12)
        Xc['Month_cos'] = np.cos(2 * np.pi * Xc['Month'] / 12)
        Xc['Week_sin'] = np.sin(2 * np.pi * Xc['Week'] / 52)
        Xc['Week_cos'] = np.cos(2 * np.pi * Xc['Week'] / 52)
        Xc.drop('Date', axis=1, inplace=True)

    def _impute_numeric(self, Xc):
        """Fill missing numeric values with the medians learned in ``fit``."""
        for col, median in self.numerical_medians.items():
            if col in Xc:
                Xc[col] = Xc[col].fillna(median)
            else:
                # A scalar broadcasts over the frame's own index. Building a
                # fresh Series here would carry a 0..n-1 index and leave NaN
                # wherever Xc is indexed differently.
                Xc[col] = median

    def _add_sales_aggregates(self, Xc):
        """Attach the learned mean-sales lookups and the sales-per-size ratio."""
        store_keys = self._string_keys(Xc['Store'], 'unknown_store')
        dept_keys = self._string_keys(Xc['Dept'], 'unknown_dept')

        # Plain dict lookup over zipped keys instead of a row-wise
        # DataFrame.apply: same values, without per-row Series construction.
        pair_means = self.store_dept_means
        fallback = self.global_mean
        Xc['Store_Dept_Mean_Sales'] = pd.Series(
            [pair_means.get(pair, fallback) for pair in zip(store_keys, dept_keys)],
            index=Xc.index, dtype=float,
        )
        Xc['Store_Mean_Sales'] = store_keys.map(self.store_means).fillna(fallback).astype(float)
        Xc['Dept_Mean_Sales'] = dept_keys.map(self.dept_means).fillna(fallback).astype(float)
        # Small epsilon guards against division by a zero store size.
        Xc['Sales_per_Size'] = Xc['Store_Mean_Sales'] / (Xc['Size'] + 1e-6)

    def _add_holiday_interactions(self, Xc):
        """Add holiday-flag x raw Dept/Store id products (0.0 without ``IsHoliday``)."""
        if 'IsHoliday' in Xc:
            holiday_flag = Xc['IsHoliday'].astype(int)
            Xc['Holiday_Dept'] = holiday_flag * Xc['Dept'].astype(float)
            Xc['Holiday_Store'] = holiday_flag * Xc['Store'].astype(float)
        else:
            Xc['Holiday_Dept'] = 0.0
            Xc['Holiday_Store'] = 0.0

    def _encode_categoricals(self, Xc):
        """Label-encode the categoricals; missing/unseen values map to UNKNOWN_LABEL."""
        for col, encoder in self.label_encoders.items():
            if col in Xc:
                labels = self._string_keys(Xc[col], self.UNKNOWN_LABEL)
            else:
                labels = pd.Series(self.UNKNOWN_LABEL, index=Xc.index, dtype=object)
            labels = labels.where(labels.isin(encoder.classes_), self.UNKNOWN_LABEL)
            Xc[col] = encoder.transform(labels).astype(int)

    def _select_target_columns(self, Xc):
        """Return exactly ``target_columns`` (in order) with normalised dtypes."""
        columns = list(self.target_columns)
        for col in columns:
            if col not in Xc.columns:
                Xc[col] = 0.0
        # .copy() so the dtype fixes below write to an independent frame
        # rather than to a slice of Xc.
        selected = Xc[columns].copy()
        # Ensure ints
        for col in self.INTEGER_OUTPUT_COLUMNS:
            if col in selected:
                selected[col] = selected[col].astype(int)
        # Every remaining numeric (including boolean) column becomes float.
        for col in selected.columns:
            is_numeric = pd.api.types.is_numeric_dtype(selected[col])
            if is_numeric and not pd.api.types.is_integer_dtype(selected[col]):
                selected[col] = selected[col].astype(float)
        return selected


class LightGBMRegressor(BaseEstimator):
    """Thin scikit-learn style wrapper around ``lgb.train`` with early stopping.

    Parameters
    ----------
    **params
        LightGBM training parameters. They are merged over the defaults in
        ``DEFAULT_PARAMS`` and stored in ``self.params``.

    Attributes
    ----------
    params : dict
        Effective LightGBM parameters passed to ``lgb.train``.
    model : lightgbm.Booster or None
        The trained booster; ``None`` until ``fit`` has been called.
    """

    DEFAULT_PARAMS = {
        'objective': 'regression', 'metric': 'mae', 'boosting_type': 'gbdt',
        'num_leaves': 64, 'learning_rate': 0.05, 'feature_fraction': 0.8,
        'bagging_fraction': 0.8, 'bagging_freq': 5, 'min_child_samples': 20,
        'min_child_weight': 0.001, 'reg_alpha': 0.1, 'reg_lambda': 0.1,
        'random_state': 42, 'verbose': -1, 'n_jobs': -1
    }

    def __init__(self, **params):
        self.params = {**self.DEFAULT_PARAMS, **params}
        self.model = None

    def get_params(self, deep=True):
        """Return the effective LightGBM parameters.

        ``BaseEstimator.get_params`` only reports named ``__init__`` arguments,
        so with a ``**params`` signature it would return ``{}`` and
        ``sklearn.base.clone`` would silently discard every custom
        hyper-parameter. ``deep`` is accepted for API compatibility.
        """
        return dict(self.params)

    def set_params(self, **params):
        """Update LightGBM parameters in place and return ``self``."""
        self.params.update(params)
        return self

    def __sklearn_is_fitted__(self):
        """Tell scikit-learn's ``check_is_fitted`` whether a booster exists."""
        return self.model is not None

    def _require_fitted(self):
        """Raise ``NotFittedError`` when no booster has been trained yet."""
        if self.model is None:
            # NotFittedError derives from both ValueError and AttributeError,
            # so code that caught the former AttributeError keeps working.
            from sklearn.exceptions import NotFittedError
            raise NotFittedError(
                "This LightGBMRegressor instance is not fitted yet. "
                "Call 'fit' before using this estimator."
            )

    def fit(self, X, y, eval_set=None, early_stopping_rounds=100, verbose=False):
        """Train the booster.

        Parameters
        ----------
        X, y : training features and target.
        eval_set : tuple (X_valid, y_valid), optional
            Validation data registered under the name ``'valid'``. The
            training data is always registered under the name ``'train'``.
        early_stopping_rounds : int
            Passed to the ``lgb.early_stopping`` callback.
        verbose : bool
            When true, evaluation results are logged every 100 rounds.

        Returns
        -------
        self
        """
        train_data = lgb.Dataset(X, label=y)
        valid_sets = [train_data]
        valid_names = ['train']
        if eval_set is not None:
            valid_data = lgb.Dataset(eval_set[0], label=eval_set[1], reference=train_data)
            valid_sets.append(valid_data)
            valid_names.append('valid')

        callbacks = [
            lgb.early_stopping(early_stopping_rounds, verbose=verbose),
            lgb.log_evaluation(100 if verbose else 0),
        ]
        self.model = lgb.train(
            self.params, train_data,
            valid_sets=valid_sets, valid_names=valid_names,
            num_boost_round=10000,
            callbacks=callbacks,
        )
        return self

    def predict(self, X):
        """Predict with the booster, passing its ``best_iteration`` as ``num_iteration``."""
        self._require_fitted()
        return self.model.predict(X, num_iteration=self.model.best_iteration)

    def get_feature_importance(self):
        """Return the booster's gain-based feature importances."""
        self._require_fitted()
        return self.model.feature_importance(importance_type='gain')

def weighted_mae(y_true, y_pred, is_holiday):
    """Weighted mean absolute error with holiday rows weighted 5x.

    Parameters
    ----------
    y_true, y_pred : array-like of equal length
        Actual and predicted values, compared position by position.
    is_holiday : array-like of equal length
        Truthy entries mark holiday rows (weight 5); all others get weight 1.

    Returns
    -------
    float
        ``sum(w * |y_true - y_pred|) / sum(w)``.
    """
    # Coerce to plain arrays so the comparison is positional. Subtracting two
    # pandas Series aligns them on their index labels instead, which yields
    # NaN (or a shape mismatch) when e.g. y_true keeps its original row index
    # and y_pred carries a fresh 0..n-1 index.
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    weights = np.where(np.asarray(is_holiday), 5, 1)
    return np.sum(weights * np.abs(actual - predicted)) / np.sum(weights)

def evaluate_model(y_true, y_pred, is_holiday_series):
    """Print and return the MAE and the holiday-weighted MAE.

    Parameters
    ----------
    y_true, y_pred : array-like
        Actual and predicted values.
    is_holiday_series : pandas.Series or array-like
        Holiday flags for the same rows, forwarded to ``weighted_mae``.

    Returns
    -------
    dict
        ``{'mae': ..., 'wmae': ...}``.
    """
    # Imported here because the notebook's import cell never brings this name
    # into scope; without it the function only works on a kernel that still
    # has the name left over from an earlier session.
    from sklearn.metrics import mean_absolute_error

    mae = mean_absolute_error(y_true, y_pred)
    # np.asarray accepts a Series as well as a plain array or list.
    wmae = weighted_mae(y_true, y_pred, np.asarray(is_holiday_series))
    print(f"MAE: {mae:.4f}, WMAE: {wmae:.4f}")
    return {'mae': mae, 'wmae': wmae}


# Output locations (relative to the working directory).
MODEL_DIR = "models"
PIPELINE_PATH = f"{MODEL_DIR}/lightgbm_full_pipeline_approach2.pkl"
SUBMISSION_PATH = 'submission_approach2.csv'

# Columns, in order, that the preprocessor must hand to the model.
expected_features_after_preprocessing = [
    'Store','Dept','IsHoliday','Size','Temperature','Fuel_Price','CPI','Unemployment',
    'MarkDown1','MarkDown2','MarkDown3','MarkDown4','MarkDown5','Type',
    'Year','Month','Week','DayOfYear','Quarter',
    'Month_sin','Month_cos','Week_sin','Week_cos',
    'Store_Dept_Mean_Sales','Store_Mean_Sales','Dept_Mean_Sales','Sales_per_Size',
    'Holiday_Dept','Holiday_Store'
]

# 7.1 Fit the preprocessor on the training split only, so that no statistics
#     from the validation window leak into the features.
pre = WalmartPreprocessor(target_columns=expected_features_after_preprocessing)
pre.fit(X_train_for_pipeline, y_train)

# 7.2 Transform both splits with the fitted preprocessor
X_train_trans = pre.transform(X_train_for_pipeline)
X_val_trans   = pre.transform(X_val_for_pipeline)

# 7.3 Train LightGBM with early stopping on the validation split
model = LightGBMRegressor()
model.fit(
    X_train_trans, y_train,
    eval_set=(X_val_trans, y_val),
    early_stopping_rounds=100,
    verbose=True
)

# 7.4 Wrap the already-fitted steps into a pipeline so raw frames can be
#     passed straight to predict().
full_prediction_pipeline = Pipeline([
    ('preprocessor', pre),
    ('lgbm_model',    model)
])

# Save pipeline. The target directory is created first so that a fresh
# checkout without a "models" folder does not fail at the very end of training.
os.makedirs(MODEL_DIR, exist_ok=True)
joblib.dump(full_prediction_pipeline, PIPELINE_PATH)


# Score both splits through the full pipeline (raw frame in, prediction out).
y_train_pred = full_prediction_pipeline.predict(X_train_for_pipeline)
y_val_pred   = full_prediction_pipeline.predict(X_val_for_pipeline)

print("Training performance:")
evaluate_model(y_train, y_train_pred, X_train['IsHoliday'])
print("Validation performance:")
evaluate_model(y_val, y_val_pred, X_val['IsHoliday'])


# Predict the test set; negative predictions are floored at zero.
y_test_pred = np.clip(full_prediction_pipeline.predict(df_full_test), 0, None)

# Submission Id format: "<Store>_<Dept>_<YYYY-MM-DD>".
submission_ids = (
    df_full_test['Store'].astype(str) + '_'
    + df_full_test['Dept'].astype(str) + '_'
    + df_full_test['Date'].dt.strftime('%Y-%m-%d')
)
submission = pd.DataFrame({
    'Id': submission_ids,
    'Weekly_Sales': y_test_pred
})

submission.to_csv(SUBMISSION_PATH, index=False)
print(submission.head())


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/.shortcut-targets-by-id/1h3JmMNvF7pLor34P-qm2FEkIev93euuf/ML_FInal_Project
Training until validation scores don't improve for 100 rounds
[100]	train's l1: 2476.96	valid's l1: 2626.57
[200]	train's l1: 2206.26	valid's l1: 2533.19
[300]	train's l1: 2068.57	valid's l1: 2502.76
[400]	train's l1: 1978.78	valid's l1: 2467.22
[500]	train's l1: 1906.2	valid's l1: 2444.3
[600]	train's l1: 1851.51	valid's l1: 2441.01
[700]	train's l1: 1802.75	valid's l1: 2426.7
[800]	train's l1: 1761.68	valid's l1: 2420
[900]	train's l1: 1727.23	valid's l1: 2419
[1000]	train's l1: 1694.46	valid's l1: 2415.07
[1100]	train's l1: 1663.29	valid's l1: 2409.82
[1200]	train's l1: 1635.04	valid's l1: 2402.11
[1300]	train's l1: 1611.72	valid's l1: 2403.17
Early stopping, best iteration is:
[1263]	train's l1: 1620.68	valid's l1: 2400.06
Training performance:
MAE: 1620.6770, WMAE: 