In [14]:
!pip install dagshub mlflow --quiet

import warnings
from statsmodels.tools.sm_exceptions import ValueWarning

warnings.filterwarnings("ignore", category=ValueWarning)
warnings.filterwarnings("ignore")

print("Done!")

Done!


In [15]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/walmart-recruiting-store-sales-forecasting/train.csv.zip
/kaggle/input/walmart-recruiting-store-sales-forecasting/sampleSubmission.csv.zip
/kaggle/input/walmart-recruiting-store-sales-forecasting/stores.csv
/kaggle/input/walmart-recruiting-store-sales-forecasting/features.csv.zip
/kaggle/input/walmart-recruiting-store-sales-forecasting/test.csv.zip


In [16]:
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error

from statsmodels.tsa.arima.model import ARIMA

import mlflow
import dagshub
import joblib
from tqdm import tqdm

# Point MLflow at the DagsHub-hosted tracking server for this repository.
dagshub.init(
    repo_owner='lkhok22',
    repo_name='ML-FinalProject-Walmart-Recruiting---Store-Sales-Forecasting',
    mlflow=True,
)
# All runs started below are recorded under the "ARIMA" experiment.
mlflow.set_experiment("ARIMA")


<Experiment: artifact_location='mlflow-artifacts:/10c60c956d2d432ebf8207329ce8cec7', creation_time=1752605185184, experiment_id='2', last_update_time=1752605185184, lifecycle_stage='active', name='ARIMA', tags={}>

In [17]:
class TemporalFeatureGenerator(BaseEstimator, TransformerMixin):
    """Add a week counter and cyclical (13- and 23-week) features from ``Date``.

    The week counter is anchored to the earliest calendar week seen by ``fit``.
    Frames transformed afterwards (validation / test) therefore share the
    training frame's time axis instead of restarting at 0, and a missing week
    still advances the counter, which keeps the sin/cos periods meaningful.

    Attributes
    ----------
    origin_week_start_ : pandas.Timestamp
        Start of the earliest calendar week in the frame passed to ``fit``.
    """

    @staticmethod
    def _week_start(dates):
        """Return the start timestamp of the calendar week containing each date."""
        return dates.dt.to_period("W").dt.start_time

    def fit(self, X, y=None):
        """Remember the first calendar week of ``X["Date"]``; returns ``self``."""
        self.origin_week_start_ = self._week_start(X["Date"]).min()
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the time features added and ``Date`` dropped.

        Adds ``week_id`` (whole weeks since the fitted origin) plus
        ``sin_13``, ``cos_13``, ``sin_23`` and ``cos_23``.
        """
        X = X.copy()
        week_start = self._week_start(X["Date"])

        # Backward compatibility: when used without fit(), anchor on this frame.
        origin = getattr(self, "origin_week_start_", None)
        if origin is None or pd.isna(origin):
            origin = week_start.min()

        # Week starts are exactly 7 days apart, so the integer division is exact.
        X["week_id"] = ((week_start - origin).dt.days // 7).astype(int)

        for period in (13, 23):
            angle = 2 * np.pi * X["week_id"] / period
            X[f"sin_{period}"] = np.sin(angle)
            X[f"cos_{period}"] = np.cos(angle)
        return X.drop(columns=["Date"])

# Names of the engineered temporal columns: the week counter followed by a
# sin/cos pair for each seasonal period (13 and 23 weeks).
time_features = ["week_id"] + [
    f"{wave}_{period}" for period in (13, 23) for wave in ("sin", "cos")
]
# Alias of the same list object: the temporal columns are the only extras.
extra_features = time_features

class FeatureCleaner(BaseEstimator, TransformerMixin):
    """Stateless transformer that removes the configured columns from a frame."""

    def __init__(self, columns):
        # Column labels to remove in transform().
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self`` to satisfy the transformer API."""
        return self

    def transform(self, X):
        """Return ``X`` without ``self.columns``; labels absent from ``X`` are skipped."""
        unwanted = self.columns
        return X.drop(columns=unwanted, errors="ignore")

class NamedColumnTransformer(ColumnTransformer):
    """ColumnTransformer variant that returns DataFrames with un-prefixed column names.

    The transformed values are wrapped in a DataFrame that reuses the input's
    index and labels each column with the text after the last ``__`` of the
    corresponding output feature name.
    """

    def get_feature_names_out(self, input_features=None):
        """Explicit pass-through to ``ColumnTransformer.get_feature_names_out``."""
        return super().get_feature_names_out(input_features)

    def _as_named_frame(self, values, row_index):
        """Wrap ``values`` in a DataFrame indexed by ``row_index`` with short column labels."""
        labels = []
        for full_name in self.get_feature_names_out():
            labels.append(full_name.split("__")[-1])
        return pd.DataFrame(values, columns=labels, index=row_index)

    def transform(self, X):
        """Transform ``X`` and return the result as a labelled DataFrame."""
        return self._as_named_frame(super().transform(X), X.index)

    def fit_transform(self, X, y=None):
        """Fit on ``X``/``y``, transform ``X`` and return a labelled DataFrame."""
        return self._as_named_frame(super().fit_transform(X, y), X.index)

class IndexRestorer(BaseEstimator, TransformerMixin):
    """Index a frame by the configured key columns while keeping them as columns.

    Parameters
    ----------
    index_columns : str or sequence of str, default ("Date", "Store", "Dept")
        Column label(s) to build the index from. The default is an immutable
        tuple so the value is not a mutable object shared by every instance.
    """

    def __init__(self, index_columns=("Date", "Store", "Dept")):
        self.index_columns = index_columns

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self``."""
        return self

    def transform(self, X):
        """Return a new frame indexed by ``index_columns`` (columns are retained)."""
        cols = self.index_columns
        # set_index() treats a tuple as ONE label, so always hand it a list;
        # a bare string is a single label and must not be split into characters.
        keys = [cols] if isinstance(cols, str) else list(cols)
        # set_index returns a new object, so no defensive copy of X is needed.
        return X.set_index(keys, drop=False)


In [18]:
class ARIMAForecaster(BaseEstimator, RegressorMixin):
    """Fit one ARIMA model per (store, dept) series and forecast each one forward.

    ``X`` must carry a MultiIndex containing a ``"Date"`` level plus the levels
    named by ``store_col`` and ``dept_col``; the feature columns themselves are
    not used. Groups that are too short, constant, or fail to fit fall back to
    their training mean.

    Parameters
    ----------
    order : tuple of int, default (2, 1, 0)
        ARIMA ``(p, d, q)`` order passed to statsmodels.
    store_col, dept_col : str
        Names of the index levels that identify a series.

    Attributes
    ----------
    models : dict
        Maps ``(store, dept)`` to the fitted statsmodels result.
    fallback : dict
        Maps ``(store, dept)`` to the constant used when no model is available.
    """

    def __init__(self, order=(2, 1, 0), store_col='Store', dept_col='Dept'):
        self.order = order
        self.store_col = store_col
        self.dept_col = dept_col

    @staticmethod
    def _require_multiindex(X):
        """Raise ``ValueError`` unless ``X`` is indexed by a MultiIndex."""
        if not isinstance(X.index, pd.MultiIndex):
            raise ValueError("Index must be MultiIndex with (Date, Store, Dept)")

    def fit(self, X, y):
        """Fit an ARIMA model (or a mean fallback) for every (store, dept) group.

        Raises
        ------
        ValueError
            If ``X`` is not indexed by a MultiIndex.
        """
        self._require_multiindex(X)

        self.models = {}
        self.fallback = {}

        # Only the target is needed per group; avoid copying the feature matrix.
        target = pd.Series(np.asarray(y), index=X.index, name="target")
        min_obs = self.order[0] + self.order[2] + 1

        for keys, grp in target.groupby(level=[self.store_col, self.dept_col]):
            store, dept = keys
            dates = pd.to_datetime(grp.index.get_level_values("Date"))
            # Rows may arrive in any order (e.g. after a shuffled split); ARIMA
            # needs the observations in chronological order.
            series = pd.Series(grp.to_numpy(), index=dates, name="target").sort_index(kind="mergesort")

            if series.nunique() < 2 or len(series) < min_obs:
                self.fallback[keys] = series.mean() if len(series) > 0 else 0.0
                continue

            try:
                self.models[keys] = ARIMA(endog=series, order=self.order).fit()
            except Exception as e:
                # Best effort: one failing series must not abort the whole fit.
                print(f"Error fitting ({store}, {dept}): {e}")
                self.fallback[keys] = series.mean() if len(series) > 0 else 0.0

        return self

    def predict(self, X):
        """Return one forecast per row of ``X`` as a NumPy array, in row order.

        For each group the model forecasts ``len(group)`` steps ahead; step 1 is
        assigned to the group's earliest date, step 2 to the next, and so on.
        Groups without a fitted model receive their fallback value (0.0 when the
        group was never seen in ``fit``).

        Raises
        ------
        ValueError
            If ``X`` is not indexed by a MultiIndex.
        """
        self._require_multiindex(X)

        predictions = np.full(len(X), np.nan, dtype=float)
        # Group row positions instead of the frame: positional writes are robust
        # to duplicated index entries and avoid slow MultiIndex .loc assignment.
        positions = pd.Series(np.arange(len(X)), index=X.index)

        for keys, pos_grp in positions.groupby(level=[self.store_col, self.dept_col]):
            pos = pos_grp.to_numpy()
            model = self.models.get(keys)
            if model is None:
                predictions[pos] = self.fallback.get(keys, 0.0)
                continue

            dates = pd.to_datetime(pos_grp.index.get_level_values("Date"))
            chrono = np.argsort(dates.to_numpy(), kind="stable")
            forecast = np.asarray(model.forecast(steps=len(pos)), dtype=float)
            predictions[pos[chrono]] = forecast

        return predictions


In [19]:
# Load the three competition tables; Date columns are parsed as datetimes.
train = pd.read_csv(
    "/kaggle/input/walmart-recruiting-store-sales-forecasting/train.csv.zip",
    parse_dates=["Date"],
)
features = pd.read_csv(
    "/kaggle/input/walmart-recruiting-store-sales-forecasting/features.csv.zip",
    parse_dates=["Date"],
)
stores = pd.read_csv("/kaggle/input/walmart-recruiting-store-sales-forecasting/stores.csv")

# Attach the per-store/week features, then the static store attributes.
merged = (
    train
    .merge(features, on=["Store", "Date", "IsHoliday"], how="left")
    .merge(stores, on="Store", how="left")
)


In [20]:
# Raw columns, grouped by the preprocessing they receive.
categorical_vars = ["Store", "Dept", "Type", "IsHoliday"]
numerical_vars = [
    "Temperature", "Fuel_Price", "CPI", "Unemployment",
    "MarkDown1", "MarkDown2", "MarkDown3", "MarkDown4", "MarkDown5",
]

numeric_pipe = SimpleImputer(strategy="mean")
categorical_pipe = OneHotEncoder(handle_unknown="ignore")

# Numeric columns (raw + engineered) are mean-imputed; categoricals are one-hot
# encoded. sparse_threshold=0.0 is passed so the combined output is dense.
column_steps = [
    ("num", numeric_pipe, numerical_vars + extra_features),
    ("cat", categorical_pipe, categorical_vars),
]
full_transformer = NamedColumnTransformer(transformers=column_steps, sparse_threshold=0.0)

# End-to-end pipeline: index -> time features -> preprocessing -> (no-op) column
# drop -> per-series ARIMA forecaster.
pipeline_steps = [
    ("index_set", IndexRestorer()),
    ("time_gen", TemporalFeatureGenerator()),
    ("preprocess", full_transformer),
    ("dropper", FeatureCleaner(columns=[])),
    ("forecast", ARIMAForecaster(order=(2, 1, 0))),
]
arima_pipe = Pipeline(steps=pipeline_steps)

# Candidate ARIMA orders for an optional search over the "forecast" step.
param_grid = dict(forecast__order=[(1, 1, 1), (2, 1, 2), (3, 1, 0), (5, 1, 0)])

# Optional: use GridSearchCV for tuning
# grid = GridSearchCV(estimator=arima_pipe, param_grid=param_grid, cv=3, scoring="neg_mean_absolute_error")


In [21]:
# Record which engineered features this experiment uses.
with mlflow.start_run(run_name="Feature_Generation"):
    mlflow.log_param("engineered_features", extra_features)

target = merged["Weekly_Sales"]
features_X = merged.drop(columns=["Weekly_Sales"])

# Chronological hold-out: the most recent ~20% of weeks form the test set.
# A random row split would place future weeks in the training data and score
# the forward-looking ARIMA forecasts against dates interleaved with training.
TEST_FRACTION = 0.2
week_dates = features_X["Date"].dropna().drop_duplicates().sort_values()
n_test_weeks = max(1, int(round(len(week_dates) * TEST_FRACTION)))
split_date = week_dates.iloc[-n_test_weeks]
in_test = features_X["Date"] >= split_date

X_tr, X_te = features_X.loc[~in_test], features_X.loc[in_test]
y_tr, y_te = target.loc[~in_test], target.loc[in_test]

# Every row must land in exactly one partition.
assert len(X_tr) + len(X_te) == len(features_X)


🏃 View run Feature_Generation at: https://dagshub.com/lkhok22/ML-FinalProject-Walmart-Recruiting---Store-Sales-Forecasting.mlflow/#/experiments/2/runs/4ae44101f33b41aeb8628913460868a5
🧪 View experiment at: https://dagshub.com/lkhok22/ML-FinalProject-Walmart-Recruiting---Store-Sales-Forecasting.mlflow/#/experiments/2


In [22]:
# Fit the pipeline, score it on the hold-out set and store metrics + model.
with mlflow.start_run(run_name="ARIMA_Training_Run"):
    arima_pipe.fit(X_tr, y_tr)
    preds = arima_pipe.predict(X_te)

    abs_err = np.abs(y_te - preds)
    mae = mean_absolute_error(y_te, preds)

    # Weighted MAE: holiday rows weigh 5, all other rows weigh 1.
    weight = pd.Series(np.where(X_te["IsHoliday"], 5, 1), index=X_te.index)
    wmae = (weight * abs_err).sum() / weight.sum()

    mlflow.log_metric("MAE", mae)
    mlflow.log_metric("WMAE", wmae)

    # Persist the fitted pipeline and attach it to the run as an artifact.
    model_filename = "arima_model.pkl"
    joblib.dump(arima_pipe, model_filename)
    mlflow.log_artifact(model_filename)


🏃 View run ARIMA_Training_Run at: https://dagshub.com/lkhok22/ML-FinalProject-Walmart-Recruiting---Store-Sales-Forecasting.mlflow/#/experiments/2/runs/93a3d57e981a4f9b96c1aeafc8c38b40
🧪 View experiment at: https://dagshub.com/lkhok22/ML-FinalProject-Walmart-Recruiting---Store-Sales-Forecasting.mlflow/#/experiments/2
