<a href="https://colab.research.google.com/github/lkhok22/ML-FinalProject-Walmart-Recruiting---Store-Sales-Forecasting/blob/main/model_experiment_SARIMA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# ‚úÖ Only install what you need for SARIMAX and logging
!pip install statsmodels wandb --quiet

In [3]:
import pandas as pd
import numpy as np
from statsmodels.tsa.statespace.sarimax import SARIMAX
import wandb
import os
from datetime import datetime
import warnings
warnings.filterwarnings("ignore")

In [4]:
# Install required libraries
!pip install pandas numpy matplotlib seaborn scikit-learn torch torchvision wandb pyyaml darts --quiet
import wandb
wandb.login(key="eccf2c915699fc032ad678daf0fd4b5ac60bf87c")

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mabakh22[0m ([33mabakh22-free-university-of-tbilisi-[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [5]:

# Mount Google Drive and extract data
from google.colab import drive
import zipfile
import os
drive.mount('/content/drive')
zip_path = '/content/drive/MyDrive/ML-FinalProject/data.zip'
extract_to = '/content/walmart_data/'
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_to)
for file_name in os.listdir(extract_to):
    if file_name.endswith('.zip'):
        with zipfile.ZipFile(os.path.join(extract_to, file_name), 'r') as zip_ref:
            zip_ref.extractall(extract_to)
print("‚úÖ Extracted files:", os.listdir(extract_to))


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
‚úÖ Extracted files: ['stores.csv', 'sampleSubmission.csv', 'train.csv.zip', 'features.csv', 'test.csv.zip', 'test.csv', 'sampleSubmission.csv.zip', 'train.csv', 'features.csv.zip']


In [18]:
# Load and preprocess data
train = pd.read_csv('/content/walmart_data/train.csv')
features = pd.read_csv('/content/walmart_data/features.csv')
stores = pd.read_csv('/content/walmart_data/stores.csv')
test = pd.read_csv('/content/walmart_data/test.csv')

# Merge train with features and stores
df = pd.merge(train, features, on=['Store', 'Date'], how='left')
df = pd.merge(df, stores, on='Store', how='left')
df = df.drop(columns=['IsHoliday_x']).rename(columns={'IsHoliday_y': 'IsHoliday'})
df['Date'] = pd.to_datetime(df['Date'])
df = df.sort_values(by=['Store', 'Dept', 'Date'])

# ‚úÖ IMPROVED PREPROCESSING
# Create IsHolidayWeight column
df['IsHolidayWeight'] = df['IsHoliday'].apply(lambda x: 5 if x else 1)

# ‚úÖ Compute department-level means for holiday and non-holiday weeks
dept_means = df.groupby(['Dept', 'IsHoliday'])['Weekly_Sales'].mean().unstack().fillna(df['Weekly_Sales'].mean())

# ‚úÖ BETTER HOLIDAY FEATURE ENGINEERING
holiday_dates = {
    'SuperBowl': ['2010-02-12', '2011-02-11', '2012-02-10', '2013-02-08'],
    'LaborDay': ['2010-09-10', '2011-09-09', '2012-09-07', '2013-09-06'],
    'Thanksgiving': ['2010-11-26', '2011-11-25', '2012-11-23', '2013-11-29'],
    'Christmas': ['2010-12-31', '2011-12-30', '2012-12-28', '2013-12-27']
}

# Add more sophisticated holiday features
for holiday, dates in holiday_dates.items():
    holiday_pd_dates = pd.to_datetime(dates)
    df[f'{holiday}_Week'] = 0
    for date in holiday_pd_dates:
        # Mark the exact holiday week
        df.loc[df['Date'] == date, f'{holiday}_Week'] = 1
        # Mark weeks before and after with decreasing weights
        for i in range(1, 4):  # 3 weeks before/after
            before_date = date - pd.Timedelta(weeks=i)
            after_date = date + pd.Timedelta(weeks=i)
            weight = 1.0 / (i + 1)  # Decreasing weight
            df.loc[df['Date'] == before_date, f'{holiday}_Week'] = weight
            df.loc[df['Date'] == after_date, f'{holiday}_Week'] = weight

# ‚úÖ ENHANCED TIME-BASED FEATURES
df['WeekOfYear'] = df['Date'].dt.isocalendar().week
df['Month'] = df['Date'].dt.month
df['Year'] = df['Date'].dt.year - df['Date'].dt.year.min()
df['Quarter'] = df['Date'].dt.quarter

# Add cyclical features
df['Week_sin'] = np.sin(2 * np.pi * df['WeekOfYear'] / 52)
df['Week_cos'] = np.cos(2 * np.pi * df['WeekOfYear'] / 52)
df['Month_sin'] = np.sin(2 * np.pi * df['Month'] / 12)
df['Month_cos'] = np.cos(2 * np.pi * df['Month'] / 12)

# ‚úÖ HANDLE MISSING VALUES BETTER
# Fill missing values with more sophisticated methods
df['Temperature'].fillna(df.groupby(['Store', 'Month'])['Temperature'].transform('mean'), inplace=True)
df['Fuel_Price'].fillna(df.groupby('Store')['Fuel_Price'].transform('mean'), inplace=True)
df['CPI'].fillna(df.groupby('Store')['CPI'].transform('mean'), inplace=True)
df['Unemployment'].fillna(df.groupby('Store')['Unemployment'].transform('mean'), inplace=True)

# Fill markdown columns with 0 (more meaningful than NaN for promotions)
markdown_cols = ['MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5']
for col in markdown_cols:
    df[col].fillna(0, inplace=True)

# ‚úÖ CREATE INTERACTION FEATURES
df['Temp_Unemployment'] = df['Temperature'] * df['Unemployment']
df['Holiday_Markdown'] = df['IsHoliday'] * (df['MarkDown1'] + df['MarkDown2'] + df['MarkDown3'] + df['MarkDown4'] + df['MarkDown5'])
df['Total_Markdown'] = df['MarkDown1'] + df['MarkDown2'] + df['MarkDown3'] + df['MarkDown4'] + df['MarkDown5']

# ‚úÖ APPLY SAME PREPROCESSING TO TEST DATA
test_df = pd.merge(test, features, on=['Store', 'Date'], how='left')
test_df = pd.merge(test_df, stores, on='Store', how='left')
test_df = test_df.drop(columns=['IsHoliday_x']).rename(columns={'IsHoliday_y': 'IsHoliday'})
test_df['Date'] = pd.to_datetime(test_df['Date'])
test_df = test_df.sort_values(by=['Store', 'Dept', 'Date'])

test_df['IsHolidayWeight'] = test_df['IsHoliday'].apply(lambda x: 5 if x else 1)

# Apply same holiday features to test
for holiday, dates in holiday_dates.items():
    holiday_pd_dates = pd.to_datetime(dates)
    test_df[f'{holiday}_Week'] = 0
    for date in holiday_pd_dates:
        test_df.loc[test_df['Date'] == date, f'{holiday}_Week'] = 1
        for i in range(1, 4):
            before_date = date - pd.Timedelta(weeks=i)
            after_date = date + pd.Timedelta(weeks=i)
            weight = 1.0 / (i + 1)
            test_df.loc[test_df['Date'] == before_date, f'{holiday}_Week'] = weight
            test_df.loc[test_df['Date'] == after_date, f'{holiday}_Week'] = weight

# Apply same time features to test
test_df['WeekOfYear'] = test_df['Date'].dt.isocalendar().week
test_df['Month'] = test_df['Date'].dt.month
test_df['Year'] = test_df['Date'].dt.year - df['Date'].dt.year.min()
test_df['Quarter'] = test_df['Date'].dt.quarter
test_df['Week_sin'] = np.sin(2 * np.pi * test_df['WeekOfYear'] / 52)
test_df['Week_cos'] = np.cos(2 * np.pi * test_df['WeekOfYear'] / 52)
test_df['Month_sin'] = np.sin(2 * np.pi * test_df['Month'] / 12)
test_df['Month_cos'] = np.cos(2 * np.pi * test_df['Month'] / 12)

# Apply same missing value handling to test
test_df['Temperature'].fillna(test_df.groupby(['Store', 'Month'])['Temperature'].transform('mean'), inplace=True)
test_df['Fuel_Price'].fillna(test_df.groupby('Store')['Fuel_Price'].transform('mean'), inplace=True)
test_df['CPI'].fillna(test_df.groupby('Store')['CPI'].transform('mean'), inplace=True)
test_df['Unemployment'].fillna(test_df.groupby('Store')['Unemployment'].transform('mean'), inplace=True)

for col in markdown_cols:
    test_df[col].fillna(0, inplace=True)

# Apply same interaction features to test
test_df['Temp_Unemployment'] = test_df['Temperature'] * test_df['Unemployment']
test_df['Holiday_Markdown'] = test_df['IsHoliday'] * (test_df['MarkDown1'] + test_df['MarkDown2'] + test_df['MarkDown3'] + test_df['MarkDown4'] + test_df['MarkDown5'])
test_df['Total_Markdown'] = test_df['MarkDown1'] + test_df['MarkDown2'] + test_df['MarkDown3'] + test_df['MarkDown4'] + test_df['MarkDown5']

In [19]:


def calculate_wmae(y_true, y_pred, weights):
    """Calculate Weighted Mean Absolute Error"""
    return np.sum(weights * np.abs(y_true - y_pred)) / np.sum(weights)

def calculate_mae(y_true, y_pred):
    """Calculate Mean Absolute Error"""
    return np.mean(np.abs(y_true - y_pred))

In [20]:
# ‚úÖ MODIFIED SARIMAX CLASS WITH REFINED EXTERNAL REGRESSORS AND REVERTED PARAMETERS
class ImprovedSARIMAX:
    def __init__(self, order=(1, 0, 2), seasonal_order=(1, 1, 1, 52)):  # ‚úÖ Reverted to original order
        self.order = order
        self.seasonal_order = seasonal_order
        self.models = {}
        self.fallback_means = {}
        # Refined exogenous variables focusing on holidays and seasonality
        self.exog_vars = [
            'Total_Markdown', 'Holiday_Markdown',
            'SuperBowl_Week', 'Thanksgiving_Week', 'Christmas_Week',
            'Week_sin', 'Week_cos'
        ]

    def _prepare_exog(self, data):
        """Prepare exogenous variables"""
        try:
            exog = data[self.exog_vars].copy()
            # Handle any remaining NaN values
            exog = exog.fillna(method='ffill').fillna(method='bfill')
            return exog.values
        except:
            return None

    def fit_predict_store_dept(self, store, dept, train_data, test_data):
        try:
            # Get training data for this store-dept
            train_group = train_data[(train_data['Store'] == store) & (train_data['Dept'] == dept)].copy()
            test_group = test_data[(test_data['Store'] == store) & (test_data['Dept'] == dept)].copy()

            if len(train_group) < 30:  # Need more data for SARIMAX with external regressors
                mean_sales = train_group['Weekly_Sales'].mean() if len(train_group) > 0 else 0
                self.fallback_means[(store, dept)] = mean_sales
                return self._create_fallback_result(mean_sales, train_group, test_group)

            # Sort by date
            train_group = train_group.sort_values('Date')
            test_group = test_group.sort_values('Date')

            # Prepare time series and exogenous variables
            y_train = train_group['Weekly_Sales'].values
            exog_train = self._prepare_exog(train_group)

            # Skip if constant or near-constant series
            if len(set(y_train)) <= 2 or np.std(y_train) < 1e-3:
                mean_sales = np.mean(y_train)
                self.fallback_means[(store, dept)] = mean_sales
                return self._create_fallback_result(mean_sales, train_group, test_group)

            # ENHANCED SARIMAX with external regressors
            model = SARIMAX(
                endog=y_train,
                exog=exog_train,
                order=self.order,
                seasonal_order=self.seasonal_order,
                enforce_stationarity=False,
                enforce_invertibility=False,
                simple_differencing=True
            )

            # Fit with more iterations for better convergence
            fitted_model = model.fit(
                disp=False,
                maxiter=100,
                method='lbfgs',
                optim_score='harvey'
            )

            self.models[(store, dept)] = fitted_model

            # Validation split
            split_idx = int(len(y_train) * 0.8)
            val_exog = exog_train[split_idx:] if exog_train is not None else None
            val_pred = fitted_model.forecast(steps=len(y_train) - split_idx, exog=val_exog)
            val_actual = y_train[split_idx:]
            val_weights = train_group['IsHolidayWeight'].iloc[split_idx:].values

            # Test prediction
            test_pred = None
            if len(test_group) > 0:
                exog_test = self._prepare_exog(test_group)
                test_pred = fitted_model.forecast(steps=len(test_group), exog=exog_test)

            return val_pred, val_actual, val_weights, test_pred, test_group

        except Exception as e:
            # Fallback to mean for problematic series
            mean_sales = train_group['Weekly_Sales'].mean() if len(train_group) > 0 else 0
            self.fallback_means[(store, dept)] = mean_sales
            return self._create_fallback_result(mean_sales, train_group, test_group)

    def _create_fallback_result(self, mean_sales, train_group, test_group):
        """Create fallback result using holiday/non-holiday means"""
        if len(train_group) == 0:
            return None, None, None, None, test_group

        split_idx = int(len(train_group) * 0.8)
        val_length = len(train_group) - split_idx

        # Calculate holiday and non-holiday means
        holiday_mean = train_group[train_group['IsHoliday']]['Weekly_Sales'].mean() if any(train_group['IsHoliday']) else mean_sales
        non_holiday_mean = train_group[~train_group['IsHoliday']]['Weekly_Sales'].mean() if any(~train_group['IsHoliday']) else mean_sales
        holiday_mean = holiday_mean if not pd.isna(holiday_mean) else mean_sales
        non_holiday_mean = non_holiday_mean if not pd.isna(non_holiday_mean) else mean_sales

        # Validation predictions using holiday/non-holiday means
        val_pred = np.where(train_group['IsHoliday'].iloc[split_idx:], holiday_mean, non_holiday_mean)
        val_actual = train_group['Weekly_Sales'].iloc[split_idx:].values
        val_weights = train_group['IsHolidayWeight'].iloc[split_idx:].values

        # Test predictions using holiday/non-holiday means
        test_pred = None
        if len(test_group) > 0:
            test_pred = np.where(test_group['IsHoliday'], holiday_mean, non_holiday_mean)

        return val_pred, val_actual, val_weights, test_pred, test_group

In [23]:
# Initialize wandb
wandb.init(project="walmart-sales-forecasting", name="sarimax-improved-missing-depts", config={
    "model": "SARIMAX_with_refined_regressors_fallback_missing_depts",
    "seasonal_period": 52,
    "order": (1, 1, 1),  # ‚úÖ Reverted to original
    "seasonal_order": (1, 1, 1, 52),
    "external_regressors": True,
    "features": "refined_holidays_cyclical_improved_fallback_missing_depts"
})

# ‚úÖ TRAINING LOOP
model = ImprovedSARIMAX(order=(1, 0, 2), seasonal_order=(1, 1, 1, 52))  # ‚úÖ Reverted to original order

val_predictions = []
val_actuals = []
val_weights = []

# Get unique store-dept combinations from TRAINING data
store_dept_combinations = df[['Store', 'Dept']].drop_duplicates()

print(f"Training improved models for {len(store_dept_combinations)} store-department combinations...")

import time
start_time = time.time()
fallback_count = 0
success_count = 0

# Dictionary to store predictions for each store-dept
predictions_dict = {}

for idx, (_, row) in enumerate(store_dept_combinations.iterrows()):
    store, dept = row['Store'], row['Dept']

    # Progress updates every 50 iterations
    if idx % 50 == 0:
        elapsed = time.time() - start_time
        rate = idx / elapsed if elapsed > 0 else 0
        eta = (len(store_dept_combinations) - idx) / rate if rate > 0 else 0
        print(f"Progress: {idx}/{len(store_dept_combinations)} ({idx/len(store_dept_combinations)*100:.1f}%) | "
              f"Rate: {rate:.1f}/sec | ETA: {eta/60:.1f} min | Success: {success_count}, Fallback: {fallback_count}")

    result = model.fit_predict_store_dept(store, dept, df, test_df)

    if result[0] is None:  # Complete failure case
        fallback_count += 1
        predictions_dict[(store, dept)] = {'type': 'failed', 'value': 0}
    else:
        val_pred, val_actual, val_w, test_pred, test_group = result

        if val_pred is not None and val_actual is not None:
            success_count += 1
            val_predictions.extend(val_pred)
            val_actuals.extend(val_actual)
            val_weights.extend(val_w)
        else:
            fallback_count += 1

        # Store predictions in dictionary
        if test_pred is not None:
            predictions_dict[(store, dept)] = {'type': 'model', 'predictions': test_pred}
        else:
            # Use historical mean as fallback
            mean_sales = df[(df['Store'] == store) & (df['Dept'] == dept)]['Weekly_Sales'].mean()
            predictions_dict[(store, dept)] = {'type': 'mean', 'value': mean_sales if not pd.isna(mean_sales) else 0}

print(f"\n‚úÖ Model training completed!")

Training improved models for 3331 store-department combinations...
Progress: 0/3331 (0.0%) | Rate: 0.0/sec | ETA: 0.0 min | Success: 0, Fallback: 0
Progress: 50/3331 (1.5%) | Rate: 107.3/sec | ETA: 0.5 min | Success: 50, Fallback: 0
Progress: 100/3331 (3.0%) | Rate: 107.8/sec | ETA: 0.5 min | Success: 100, Fallback: 0
Progress: 150/3331 (4.5%) | Rate: 109.0/sec | ETA: 0.5 min | Success: 150, Fallback: 0
Progress: 200/3331 (6.0%) | Rate: 109.8/sec | ETA: 0.5 min | Success: 200, Fallback: 0
Progress: 250/3331 (7.5%) | Rate: 108.8/sec | ETA: 0.5 min | Success: 250, Fallback: 0
Progress: 300/3331 (9.0%) | Rate: 109.8/sec | ETA: 0.5 min | Success: 300, Fallback: 0
Progress: 350/3331 (10.5%) | Rate: 109.9/sec | ETA: 0.5 min | Success: 350, Fallback: 0
Progress: 400/3331 (12.0%) | Rate: 109.9/sec | ETA: 0.4 min | Success: 400, Fallback: 0
Progress: 450/3331 (13.5%) | Rate: 109.9/sec | ETA: 0.4 min | Success: 450, Fallback: 0
Progress: 500/3331 (15.0%) | Rate: 109.8/sec | ETA: 0.4 min | Succes

In [24]:
# ‚úÖ GENERATE PREDICTIONS FOR ALL TEST DATA ROWS
print(f"Generating predictions for ALL test data rows...")

# Calculate overall mean as ultimate fallback
overall_mean = df['Weekly_Sales'].mean()

submission = []
missing_store_depts = set()

# Process EVERY row in test data
for idx, test_row in test_df.iterrows():
    store, dept = test_row['Store'], test_row['Dept']

    # Create the ID
    test_id = f"{store}_{dept}_{test_row['Date'].strftime('%Y-%m-%d')}"

    # Get prediction for this store-dept combination
    if (store, dept) in predictions_dict:
        pred_info = predictions_dict[(store, dept)]
        if pred_info['type'] == 'model':
            # Find which prediction index this test row corresponds to
            test_group = test_df[(test_df['Store'] == store) & (test_df['Dept'] == dept)].sort_values('Date')
            row_idx = test_group.index.get_loc(idx)
            if row_idx < len(pred_info['predictions']):
                pred_value = pred_info['predictions'][row_idx]
            else:
                pred_value = pred_info['predictions'][-1]  # Use last prediction
        else:
            pred_value = pred_info['value']
    else:
        # ‚úÖ Use department-specific mean based on holiday status
        is_holiday = test_row['IsHoliday']
        pred_value = dept_means.loc[dept, is_holiday] if dept in dept_means.index else overall_mean
        missing_store_depts.add((store, dept))

    # ‚úÖ ENSURE NON-NEGATIVE PREDICTIONS
    pred_value = max(0, pred_value)

    submission.append({
        'Id': test_id,
        'Weekly_Sales': pred_value
    })

print(f"‚úÖ Missing store-dept combinations (using department-level mean): {len(missing_store_depts)}")
print(f"‚úÖ Generated predictions for {len(submission)} test rows")

# Verify we have the correct number of predictions
expected_rows = len(test_df)
actual_rows = len(submission)

print(f"\nüìä SUBMISSION VERIFICATION:")
print(f"Expected test rows: {expected_rows}")
print(f"Generated predictions: {actual_rows}")
print(f"Match: {'‚úÖ YES' if expected_rows == actual_rows else '‚ùå NO'}")

total_time = time.time() - start_time
print(f"\n‚úÖ Processing completed in {total_time/60:.1f} minutes")
print(f"‚úÖ Successful SARIMAX fits: {success_count}")
print(f"‚ö†Ô∏è  Fallback predictions: {fallback_count}")

# Calculate overall WMAE
if len(val_predictions) > 0:
    overall_wmae = calculate_wmae(np.array(val_actuals), np.array(val_predictions), np.array(val_weights))
    wandb.log({
        "overall_wmae": overall_wmae,
        "success_count": success_count,
        "fallback_count": fallback_count,
        "total_time_minutes": total_time/60,
        "submission_rows": len(submission),
        "expected_rows": expected_rows
    })
    print(f"‚úÖ Overall Validation WMAE: {overall_wmae:.4f}")
else:
    print("‚ùå No validation predictions generated")

# Create submission DataFrame
submission_df = pd.DataFrame(submission)
print(f"‚úÖ Generated {len(submission_df)} predictions")
print(f"‚úÖ Sample predictions:\n{submission_df.head()}")

# ‚úÖ QUALITY CHECKS
print(f"\nüìà PREDICTION QUALITY CHECKS:")
pred_values = submission_df['Weekly_Sales'].values
print(f"Min prediction: {pred_values.min():.2f}")
print(f"Max prediction: {pred_values.max():.2f}")
print(f"Mean prediction: {pred_values.mean():.2f}")
print(f"Negative predictions: {(pred_values < 0).sum()}")

# Save submission
submission_df.to_csv('/content/sarimax_improved_missing_depts_submission.csv', index=False)
print("‚úÖ Submission saved to /content/sarimax_improved_missing_depts_submission.csv")

wandb.finish()

Generating predictions for ALL test data rows...
‚úÖ Missing store-dept combinations (using department-level mean): 11
‚úÖ Generated predictions for 115064 test rows

üìä SUBMISSION VERIFICATION:
Expected test rows: 115064
Generated predictions: 115064
Match: ‚úÖ YES

‚úÖ Processing completed in 5.3 minutes
‚úÖ Successful SARIMAX fits: 3331
‚ö†Ô∏è  Fallback predictions: 0
‚úÖ Overall Validation WMAE: 2494.3902
‚úÖ Generated 115064 predictions
‚úÖ Sample predictions:
               Id  Weekly_Sales
0  1_1_2012-11-02  22270.821353
1  1_1_2012-11-09  22270.821353
2  1_1_2012-11-16  22270.821353
3  1_1_2012-11-23  25738.594000
4  1_1_2012-11-30  22270.821353

üìà PREDICTION QUALITY CHECKS:
Min prediction: 0.00
Max prediction: 263476.20
Mean prediction: 15986.92
Negative predictions: 0
‚úÖ Submission saved to /content/sarimax_improved_missing_depts_submission.csv


0,1
expected_rows,‚ñÅ
fallback_count,‚ñÅ
overall_wmae,‚ñÅ
submission_rows,‚ñÅ
success_count,‚ñÅ
total_time_minutes,‚ñÅ

0,1
expected_rows,115064.0
fallback_count,0.0
overall_wmae,2494.39024
submission_rows,115064.0
success_count,3331.0
total_time_minutes,5.25849


# public score 4600