In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/walmart-recruiting-store-sales-forecasting/train.csv.zip
/kaggle/input/walmart-recruiting-store-sales-forecasting/sampleSubmission.csv.zip
/kaggle/input/walmart-recruiting-store-sales-forecasting/stores.csv
/kaggle/input/walmart-recruiting-store-sales-forecasting/features.csv.zip
/kaggle/input/walmart-recruiting-store-sales-forecasting/test.csv.zip


In [2]:
# Install required logging tools
%pip install -q dagshub 
%pip install -U mlflow

# Initialize DagsHub MLflow integration
import dagshub
dagshub.init(repo_owner='AleksandreBakhtadze', repo_name='ML-FinalProject-Walmart-Recruiting---Store-Sales-Forecasting', mlflow=True)

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.0/261.0 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m203.4/203.4 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m74.3/74.3 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
ydata-profiling 4.16.1 requires dacite>=1.8, but you have dacite 1.6.0 which is incompatible.[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.
Collecting mlflow
  Downloading mlflow-3.1.1-py3-none-any.whl.metadata (29 kB)
Collecting mlflow-skinny==3.1.1 (from mlflow)
  Downloading mlflow_skinny-3.1.1-py3-none-any.whl.metadata (30 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-an



Open the following link in your browser to authorize the client:
https://dagshub.com/login/oauth/authorize?state=614c61ad-a568-43d8-91e2-4c29582f1448&client_id=32b60ba385aa7cecf24046d8195a71c07dd345d9657977863b52e7748e0f0f28&middleman_request_id=9a6840e620bb8f960bca50f750ab69ffe81fc72ade8510be88af893cac9100ad




Output()

In [4]:
import pandas as pd
import numpy as np
import zipfile
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor
import mlflow
import mlflow.sklearn
import joblib
import os
import optuna
from sklearn.model_selection import TimeSeriesSplit

# Helper functions
def read_zipped_csv(path):
    with zipfile.ZipFile(path) as z:
        file_name = z.namelist()[0]
        return pd.read_csv(z.open(file_name))

def merge_datasets(X, features_df, stores_df):
    df = X.copy()
    df_full = df.merge(features_df, on=["Store", "Date"], how="left")
    df_full = df_full.merge(stores_df, on="Store", how="left")
    df_full = df_full.drop(columns=['IsHoliday_y'], errors='ignore')
    df_full = df_full.rename(columns={'IsHoliday_x': 'IsHoliday'})
    return df_full

def add_markdown_indicators(df):
    df = df.copy()
    for i in range(1, 6):
        df[f'MarkDown{i}_Missing'] = df[f'MarkDown{i}'].isnull().astype(int)
    return df

def feature_engineering(df, is_train=True, train_df=None):
    df = df.copy()
    df['Date'] = pd.to_datetime(df['Date'], format='%Y-%m-%d')
    df['Year'] = df['Date'].dt.year
    df['Month'] = df['Date'].dt.month
    df['Week'] = df['Date'].dt.isocalendar().week.astype(int)
    df['Day'] = df['Date'].dt.day
    df['DayOfWeek'] = df['Date'].dt.dayofweek
    df['Quarter'] = df['Date'].dt.quarter
    df['IsHoliday'] = df['IsHoliday'].astype(int)
    df['Type'] = df['Type'].map({'A': 0, 'B': 1, 'C': 2})

    # Holiday proximity features
    df['DaysToChristmas'] = (pd.to_datetime(df['Year'].astype(str) + '-12-25') - df['Date']).dt.days
    df['DaysToThanksgiving'] = (pd.to_datetime(df['Year'].astype(str) + '-11-28') - df['Date']).dt.days

    # Interaction features
    df['Store_Dept'] = df['Store'].astype(str) + '_' + df['Dept'].astype(str)

    if is_train and train_df is not None:
        # Ensure train_df['Date'] is datetime
        train_df_sorted = train_df.copy()
        train_df_sorted['Date'] = pd.to_datetime(train_df_sorted['Date'], format='%Y-%m-%d')
        train_df_sorted = train_df_sorted.sort_values(['Store', 'Dept', 'Date'])
        
        # Create lag and rolling features
        train_df_sorted['Lag1_Weekly_Sales'] = train_df_sorted.groupby(['Store', 'Dept'])['Weekly_Sales'].shift(1)
        train_df_sorted['Rolling_Mean_7'] = train_df_sorted.groupby(['Store', 'Dept'])['Weekly_Sales'].transform(
            lambda x: x.rolling(window=7, min_periods=1).mean()
        ).shift(1)
        
        # Merge lag and rolling features
        df = df.merge(
            train_df_sorted[['Store', 'Dept', 'Date', 'Lag1_Weekly_Sales', 'Rolling_Mean_7']],
            on=['Store', 'Dept', 'Date'],
            how='left'
        )

        # Impute missing lag/rolling features
        df['Lag1_Weekly_Sales'] = df['Lag1_Weekly_Sales'].fillna(df['Lag1_Weekly_Sales'].median())
        df['Rolling_Mean_7'] = df['Rolling_Mean_7'].fillna(df['Rolling_Mean_7'].median())

    df = add_markdown_indicators(df)
    return df

def feature_engineering_train(X):
    return feature_engineering(X, is_train=True, train_df=train_df)

def feature_engineering_test(X):
    return feature_engineering(X, is_train=False, train_df=None)

def convert_date(df):
    df = df.copy()
    df['Date'] = pd.to_datetime(df['Date'], format='%Y-%m-%d')
    return df

class ForwardFillImputer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X = X.copy()
        return X.ffill()

class DatasetMerger(BaseEstimator, TransformerMixin):
    def __init__(self, features_df, stores_df):
        self.features_df = features_df
        self.stores_df = stores_df
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return merge_datasets(X, self.features_df, self.stores_df)

# Load data
train_df = read_zipped_csv('/kaggle/input/walmart-recruiting-store-sales-forecasting/train.csv.zip')
features_df = read_zipped_csv('/kaggle/input/walmart-recruiting-store-sales-forecasting/features.csv.zip')
stores_df = pd.read_csv('/kaggle/input/walmart-recruiting-store-sales-forecasting/stores.csv')
test_df = read_zipped_csv('/kaggle/input/walmart-recruiting-store-sales-forecasting/test.csv.zip')

# Convert features_df['Date'] to datetime
features_df['Date'] = pd.to_datetime(features_df['Date'], format='%Y-%m-%d')

# Ensure train_df['Date'] is datetime
train_df['Date'] = pd.to_datetime(train_df['Date'], format='%Y-%m-%d')

# Feature columns
feature_cols = [
    'Store', 'Dept', 'Type', 'Size', 'Temperature', 'Fuel_Price', 'CPI', 'Unemployment',
    'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5',
    'MarkDown1_Missing', 'MarkDown2_Missing', 'MarkDown3_Missing', 'MarkDown4_Missing', 'MarkDown5_Missing',
    'IsHoliday', 'Year', 'Month', 'Week', 'DayOfWeek', 'Quarter',
    'DaysToChristmas', 'DaysToThanksgiving', 'Store_Dept', 'Lag1_Weekly_Sales', 'Rolling_Mean_7'
]

# Preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num_ffill', ForwardFillImputer(), ['CPI', 'Unemployment']),
        ('markdown_fill', SimpleImputer(strategy='median'), 
         ['MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5']),
        ('scale', StandardScaler(), ['Temperature', 'Fuel_Price', 'CPI', 'Unemployment', 'Size', 
                                    'DaysToChristmas', 'DaysToThanksgiving', 'Lag1_Weekly_Sales', 'Rolling_Mean_7']),
        ('passthrough', 'passthrough', 
         ['Store', 'Dept', 'Type', 'IsHoliday', 'Year', 'Month', 'Week', 'DayOfWeek', 'Quarter',
          'MarkDown1_Missing', 'MarkDown2_Missing', 'MarkDown3_Missing', 'MarkDown4_Missing', 'MarkDown5_Missing',
          'Store_Dept'])
    ]
)

# WMAE metric
def calculate_wmae(y_true, y_pred, is_holiday):
    weights = np.where(is_holiday, 5, 1)
    return np.sum(weights * np.abs(y_true - y_pred)) / np.sum(weights)

# Hyperparameter tuning with Optuna
def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),  # Expanded range
        'max_depth': trial.suggest_int('max_depth', 3, 12),  # Deeper trees
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.5, log=True),  # Wider range
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 15),  # Expanded range
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 1),  # L1 regularization
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 1)  # L2 regularization
    }
    
    pipeline = Pipeline([
        ('convert_date', FunctionTransformer(convert_date, validate=False)),
        ('merge', DatasetMerger(features_df=features_df, stores_df=stores_df)),
        ('feature_engineering', FunctionTransformer(feature_engineering_train, validate=False)),
        ('preprocess', preprocessor),
        ('model', XGBRegressor(**params, random_state=42, n_jobs=-1))
    ])
    
    tscv = TimeSeriesSplit(n_splits=3)
    wmae_scores = []
    
    for train_idx, val_idx in tscv.split(X_train):
        X_t, y_t = X_train.iloc[train_idx], y_train.iloc[train_idx]
        X_v, y_v = X_train.iloc[val_idx], y_train.iloc[val_idx]
        is_holiday_v = X_v['IsHoliday']
        
        pipeline.fit(X_t, y_t)
        y_pred = pipeline.predict(X_v)
        wmae = calculate_wmae(y_v, y_pred, is_holiday_v)
        wmae_scores.append(wmae)
    
    return np.mean(wmae_scores)

# Prepare training data
train_full = train_df.copy()
y = train_full['Weekly_Sales']
train_full = train_full.drop(columns=['Weekly_Sales'])
train_full['Date'] = pd.to_datetime(train_full['Date'], format='%Y-%m-%d')

# Split data
train_full = train_full.sort_values('Date')
split_date = train_full['Date'].quantile(0.9)
train_idx = train_full['Date'] < split_date
X_train = train_full[train_idx]
y_train = np.clip(y[train_idx], 0, y.quantile(0.99))
X_val = train_full[~train_idx]
y_val = y[~train_idx]
is_holiday_val = X_val['IsHoliday']

# Optimize hyperparameters
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)  # Increased trials for better search

# Best parameters
best_params = study.best_params
print("Best parameters:", best_params)

# Final pipeline with best parameters
pipeline = Pipeline([
    ('convert_date', FunctionTransformer(convert_date, validate=False)),
    ('merge', DatasetMerger(features_df=features_df, stores_df=stores_df)),
    ('feature_engineering', FunctionTransformer(feature_engineering_train, validate=False)),
    ('preprocess', preprocessor),
    ('model', XGBRegressor(**best_params, random_state=42, n_jobs=-1))
])

# MLflow setup
os.environ['MLFLOW_TRACKING_URI'] = 'https://dagshub.com/AleksandreBakhtadze/ML-FinalProject-Walmart-Recruiting---Store-Sales-Forecasting.mlflow'
os.environ['MLFLOW_TRACKING_USERNAME'] = 'AleksandreBakhtadze'
os.environ['MLFLOW_TRACKING_PASSWORD'] = '034b77b38fbceb0a45865e04299f524469d930d4'

mlflow.set_experiment("XGBoost_Improved_Pipeline")

# End any active MLflow run
mlflow.end_run()

with mlflow.start_run(run_name="XGBoost_Optimized"):
    # Fit pipeline
    pipeline.fit(X_train, y_train)
    
    # Predict and evaluate
    y_pred = pipeline.predict(X_val)
    mae = mean_absolute_error(y_val, y_pred)
    wmae = calculate_wmae(y_val, y_pred, is_holiday_val)
    print(f"Validation MAE: {mae:.2f}")
    print(f"Validation WMAE: {wmae:.2f}")
    
    # Log parameters and metrics
    for param, value in best_params.items():
        mlflow.log_param(param, value)
    mlflow.log_param("random_state", 42)
    mlflow.log_metric("val_mae", mae)
    mlflow.log_metric("val_wmae", wmae)
    
    # Save model as artifact (skip registration due to endpoint error)
    os.makedirs("/kaggle/working/models", exist_ok=True)
    model_path = "/kaggle/working/models/xgboost_optimized.joblib"
    joblib.dump(pipeline, model_path)
    mlflow.log_artifact(model_path)
    print(f"Model saved as artifact: {model_path}")

# Test predictions with modified feature engineering for test set
test_full = test_df.copy()
pipeline.named_steps['feature_engineering'] = FunctionTransformer(feature_engineering_test, validate=False)
test_preds = pipeline.predict(test_full)

# Submission
test_full_transformed = pipeline.named_steps['convert_date'].transform(test_full)
test_full_transformed = pipeline.named_steps['merge'].transform(test_full_transformed)
test_full_transformed = pipeline.named_steps['feature_engineering'].transform(test_full_transformed)
test_full_transformed['Id'] = test_full_transformed['Store'].astype(str) + '_' + test_full_transformed['Dept'].astype(str) + '_' + test_full_transformed['Date'].dt.strftime('%Y-%m-%d')
submission_df = pd.DataFrame({'Id': test_full_transformed['Id'], 'Weekly_Sales': test_preds})
submission_df.to_csv("submission.csv", index=False)
mlflow.log_artifact("submission.csv")

[I 2025-07-06 18:19:57,226] A new study created in memory with name: no-name-3bd13cb3-ceee-4e4a-9d58-5f0b8958b400
[I 2025-07-06 18:20:30,864] Trial 0 finished with value: 15858.626930233222 and parameters: {'n_estimators': 252, 'max_depth': 11, 'learning_rate': 0.04329351674961105, 'subsample': 0.6831684776872025, 'colsample_bytree': 0.7752516137814925, 'min_child_weight': 13, 'reg_alpha': 0.8619552349619864, 'reg_lambda': 0.06662380736081364}. Best is trial 0 with value: 15858.626930233222.
[I 2025-07-06 18:21:24,933] Trial 1 finished with value: 26011.873001592478 and parameters: {'n_estimators': 461, 'max_depth': 10, 'learning_rate': 0.4264227009977883, 'subsample': 0.5097233193840252, 'colsample_bytree': 0.9520237560868337, 'min_child_weight': 7, 'reg_alpha': 0.3080627279135333, 'reg_lambda': 0.7928366018646736}. Best is trial 0 with value: 15858.626930233222.
[I 2025-07-06 18:21:48,635] Trial 2 finished with value: 14850.876089265335 and parameters: {'n_estimators': 128, 'max_dept

Best parameters: {'n_estimators': 50, 'max_depth': 6, 'learning_rate': 0.010558594508774749, 'subsample': 0.9368769091423239, 'colsample_bytree': 0.8746968410442786, 'min_child_weight': 2, 'reg_alpha': 0.18504540374831363, 'reg_lambda': 0.16537426759935459}
Validation MAE: 14431.84
Validation WMAE: 14150.16
Model saved as artifact: /kaggle/working/models/xgboost_optimized.joblib
🏃 View run XGBoost_Optimized at: https://dagshub.com/AleksandreBakhtadze/ML-FinalProject-Walmart-Recruiting---Store-Sales-Forecasting.mlflow/#/experiments/4/runs/4bff9df295f94ac29899c469ed8f47e2
🧪 View experiment at: https://dagshub.com/AleksandreBakhtadze/ML-FinalProject-Walmart-Recruiting---Store-Sales-Forecasting.mlflow/#/experiments/4
