<a href="https://colab.research.google.com/github/lkhok22/ML-FinalProject-Walmart-Recruiting---Store-Sales-Forecasting/blob/main/model_experiment_ARIMA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import Libraries and Initialize wandb
import pandas as pd
import numpy as np
import zipfile
import glob
from google.colab import drive
import warnings
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.arima.model import ARIMA
import statsmodels.api as sm
from sklearn.metrics import mean_absolute_error
warnings.filterwarnings('ignore')

# Initialize wandb
import wandb
wandb.init(project="walmart-sales-forecasting", name="arima-enhanced")

# Mount Google Drive
drive.mount('/content/drive')

# Unzip and Load Data
zip_path = "/content/drive/MyDrive/ML-FinalProject/data.zip"
extract_path = "/content/data"
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)
for zip_file in glob.glob(f"{extract_path}/*.csv.zip"):
    with zipfile.ZipFile(zip_file, 'r') as z:
        z.extractall(extract_path)

# Load data
train = pd.read_csv(f"{extract_path}/train.csv")
test = pd.read_csv(f"{extract_path}/test.csv")
stores = pd.read_csv(f"{extract_path}/stores.csv")
features = pd.read_csv(f"{extract_path}/features.csv")

# Log data loading completion
wandb.log({"step": "data_loading", "status": "completed"})
print("Data loaded successfully.")

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mlkhok22[0m ([33mlkhok22-free-university-of-tbilisi-[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Mounted at /content/drive
Data loaded successfully.


In [2]:
# Enhanced Preprocessing
# Convert Date column to datetime
train['Date'] = pd.to_datetime(train['Date'])
test['Date'] = pd.to_datetime(test['Date'])
features['Date'] = pd.to_datetime(features['Date'])

# Merge datasets
train_merged = train.merge(features, on=['Store', 'Date', 'IsHoliday'], how='left')
train_merged = train_merged.merge(stores, on='Store', how='left')
test_merged = test.merge(features, on=['Store', 'Date', 'IsHoliday'], how='left')
test_merged = test_merged.merge(stores, on='Store', how='left')

# Handle missing values in features
for col in ['MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5', 'CPI', 'Unemployment']:
    train_merged[col] = train_merged[col].fillna(train_merged[col].median())
    test_merged[col] = test_merged[col].fillna(test_merged[col].median())

# Add specific holiday flags
holidays = {
    'Super Bowl': ['2010-02-12', '2011-02-11', '2012-02-10', '2013-02-08'],
    'Labor Day': ['2010-09-10', '2011-09-09', '2012-09-07', '2013-09-06'],
    'Thanksgiving': ['2010-11-26', '2011-11-25', '2012-11-23', '2013-11-29'],
    'Christmas': ['2010-12-31', '2011-12-30', '2012-12-28', '2013-12-27']
}
for holiday, dates in holidays.items():
    dates = pd.to_datetime(dates)
    train_merged[holiday] = train_merged['Date'].isin(dates).astype(int)
    test_merged[holiday] = test_merged['Date'].isin(dates).astype(int)

# Detect and cap outliers in Weekly_Sales
q1 = train_merged['Weekly_Sales'].quantile(0.25)
q3 = train_merged['Weekly_Sales'].quantile(0.75)
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr
train_merged['Weekly_Sales'] = train_merged['Weekly_Sales'].clip(lower_bound, upper_bound)

# Sort by Store, Dept, and Date
train_merged = train_merged.sort_values(['Store', 'Dept', 'Date'])
test_merged = test_merged.sort_values(['Store', 'Dept', 'Date'])

# Filter Store-Dept pairs with at least 10 observations
train_grouped = train_merged.groupby(['Store', 'Dept']).size().reset_index(name='count')
valid_pairs = train_grouped[train_grouped['count'] >= 10][['Store', 'Dept']]
train_filtered = train_merged.merge(valid_pairs, on=['Store', 'Dept'], how='inner')

# Handle missing Weekly_Sales
train_filtered['Weekly_Sales'] = train_filtered.groupby(['Store', 'Dept'])['Weekly_Sales'].ffill()
missing_sales = train_filtered['Weekly_Sales'].isna().sum()
if missing_sales > 0:
    mean_sales = train_filtered.groupby(['Store', 'Dept'])['Weekly_Sales'].transform('mean')
    train_filtered['Weekly_Sales'] = train_filtered['Weekly_Sales'].fillna(mean_sales)

# Log preprocessing
wandb.log({"step": "preprocessing", "missing_sales": missing_sales, "valid_pairs": len(valid_pairs)})
print(f"Train filtered shape: {train_filtered.shape}, Valid pairs: {len(valid_pairs)}")
print(f"Missing Weekly_Sales: {missing_sales}")

Train filtered shape: (420927, 20), Valid pairs: 3167
Missing Weekly_Sales: 0


In [3]:
# Stationarity Check and Time Series Preparation
# Function to check stationarity using ADF test
def check_stationarity(series, store, dept):
    try:
        # Check if series is constant
        if series.max() == series.min():
            wandb.log({
                f"stationarity_store_{store}_dept_{dept}": {
                    "p_value": 1.0,  # High p-value for constant series
                    "is_stationary": False,
                    "adf_statistic": 0.0
                }
            })
            return False
        # Run ADF test
        result = adfuller(series.dropna(), autolag='AIC')
        p_value = result[1]
        is_stationary = p_value < 0.05  # 5% significance level
        wandb.log({
            f"stationarity_store_{store}_dept_{dept}": {
                "p_value": p_value,
                "is_stationary": is_stationary,
                "adf_statistic": result[0]
            }
        })
        return is_stationary
    except Exception as e:
        print(f"Stationarity check failed for Store {store}, Dept {dept}: {e}")
        wandb.log({
            f"stationarity_store_{store}_dept_{dept}": {
                "p_value": 1.0,
                "is_stationary": False,
                "adf_statistic": 0.0
            }
        })
        return False

# Create dictionary to store time series for each Store-Dept pair
time_series_dict = {}

# Iterate over valid Store-Dept pairs
for _, row in valid_pairs.iterrows():
    store, dept = row['Store'], row['Dept']
    # Extract time series
    ts = train_filtered[(train_filtered['Store'] == store) &
                       (train_filtered['Dept'] == dept)][['Date', 'Weekly_Sales']]
    # Set Date as index and ensure weekly frequency
    ts = ts.set_index('Date')['Weekly_Sales']
    ts.index = ts.index.to_period('W-FRI').to_timestamp('W-FRI')

    # Check for sufficient data
    if len(ts) >= 10:
        # Check stationarity
        is_stationary = check_stationarity(ts, store, dept)
        time_series_dict[(store, dept)] = {
            'series': ts,
            'is_stationary': is_stationary
        }

# Count stationary and non-stationary series
stationary_count = sum(1 for v in time_series_dict.values() if v['is_stationary'])
print(f"Number of stationary time series: {stationary_count}")
print(f"Number of non-stationary time series: {len(time_series_dict) - stationary_count}")
print(f"Total Store-Dept pairs prepared: {len(time_series_dict)}")

# Log stationarity check
wandb.log({
    "step": "stationarity_check",
    "stationary_series_count": stationary_count,
    "total_series_count": len(time_series_dict)
})

Number of stationary time series: 2261
Number of non-stationary time series: 906
Total Store-Dept pairs prepared: 3167


In [4]:
# ARIMA Training and Validation
# Function to calculate WMAE
def calculate_wmae(y_true, y_pred, is_holiday):
    weights = np.where(is_holiday, 5, 1)
    return np.sum(weights * np.abs(y_true - y_pred)) / np.sum(weights)

# Initialize storage for models and predictions
val_predictions = []
arima_models = {}
batch_size = 500  # Process pairs in batches to manage runtime
arima_orders = [(1,0,2), (0,1,2), (1,1,1), (0,1,1)]  # Orders to test
subset_size = 100  # Subset for testing alternative orders

# Global mean for fallback
global_mean_sales = train_filtered['Weekly_Sales'].mean() if not train_filtered['Weekly_Sales'].isna().all() else 0.0

# Process pairs in batches
all_pairs = list(time_series_dict.keys())
print(f"Training ARIMA on {len(all_pairs)} Store-Dept pairs")

for batch_start in range(0, len(all_pairs), batch_size):
    batch_pairs = all_pairs[batch_start:batch_start + batch_size]
    print(f"Processing batch {batch_start // batch_size + 1}/{len(all_pairs) // batch_size + 1}")

    for store, dept in batch_pairs:
        ts = time_series_dict[(store, dept)]['series']

        # Split into train (80%) and validation (20%)
        dates = ts.index.sort_values()
        train_size = int(0.8 * len(dates))
        train_dates = dates[:train_size]
        val_dates = dates[train_size:]
        train_ts = ts[train_dates]
        val_ts = ts[val_dates]

        if len(train_ts) < 10 or len(val_ts) < 1:
            continue

        # Test multiple ARIMA orders on subset
        orders_to_test = arima_orders if (store, dept) in all_pairs[:subset_size] else [(1,0,2)]

        for order in orders_to_test:
            try:
                # Train ARIMA
                model = ARIMA(train_ts, order=order).fit()
                if order == (1,0,2):
                    arima_models[(store, dept)] = model

                # Forecast validation
                val_pred = model.forecast(steps=len(val_ts))
                val_actual = val_ts.values
                val_holidays = train_filtered[(train_filtered['Store'] == store) &
                                            (train_filtered['Dept'] == dept) &
                                            (train_filtered['Date'].isin(val_dates))]['IsHoliday'].values

                if len(val_holidays) != len(val_pred):
                    print(f"Holiday mismatch for Store {store}, Dept {dept}, order {order}")
                    continue

                # Calculate WMAE
                wmae = calculate_wmae(val_actual, val_pred, val_holidays)

                # Log metrics
                wandb.log({
                    f"wmae_store_{store}_dept_{dept}_order_{order}": wmae,
                    f"aic_store_{store}_dept_{dept}_order_{order}": model.aic
                })

                # Store predictions for (1,0,2)
                if order == (1,0,2):
                    for date, pred, actual, holiday in zip(val_ts.index, val_pred, val_actual, val_holidays):
                        val_predictions.append({
                            'Store': store,
                            'Dept': dept,
                            'Date': date,
                            'Weekly_Sales_Pred': pred,
                            'Weekly_Sales_Actual': actual,
                            'IsHoliday': holiday
                        })

            except Exception as e:
                print(f"ARIMA failed for Store {store}, Dept {dept}, order {order}: {e}")
                # Fallback to last value
                last_value = train_ts[-1] if len(train_ts) > 0 else global_mean_sales
                val_pred = np.full(len(val_ts), last_value)
                val_holidays = train_filtered[(train_filtered['Store'] == store) &
                                            (train_filtered['Dept'] == dept) &
                                            (train_filtered['Date'].isin(val_dates))]['IsHoliday'].values

                if len(val_holidays) != len(val_pred):
                    print(f"Holiday mismatch in fallback for Store {store}, Dept {dept}")
                    continue

                for date in zip(val_ts.index, val_pred, val_actual, val_holidays):
                    val_predictions.append({
                        'Store': store,
                        'Dept': dept,
                        'Date': date,
                        'Weekly_Sales_Pred': val_pred,
                        'Weekly_Sales_Actual': val_actual,
                        'IsHoliday': holiday
                    })

# Convert predictions to DataFrame
val_predictions_df = pd.DataFrame(val_predictions)

# Validate and compute overall WMAE
if val_predictions_df.empty:
    print("Error: No valid predictions generated.")
else:
    overall_wmae = calculate_wmae(val_predictions_df['Weekly_Sales_Actual'],
                                 val_predictions_df['Weekly_Sales_Pred'],
                                 val_predictions_df['IsHoliday'])
    print(f"Overall WMAE for ARIMA(1,0,2): {overall_wmae}")
    wandb.log({"step": "arima_validation", "arima_order": "1,0,2", "overall_wmae": overall_wmae})
    print("Validation predictions shape:", val_predictions_df.shape)

Training ARIMA on 3167 Store-Dept pairs
Processing batch 1/7
Processing batch 2/7
Processing batch 3/7
Processing batch 4/7
Processing batch 5/7
Processing batch 6/7
Processing batch 7/7
Overall WMAE for ARIMA(1,0,2): 1763.8795916807408
Validation predictions shape: (85376, 6)


In [5]:
# Test Set Predictions and Submission
# Global mean as fallback for missing data
global_mean_sales = train_filtered['Weekly_Sales'].mean() if not train_filtered['Weekly_Sales'].isna().all() else 0.0
print(f"Global mean sales (fallback): {global_mean_sales}")

# Check departments in test set not in train set
train_depts = set(train_filtered['Dept'].unique())
test_depts = set(test_merged['Dept'].unique())
missing_depts = test_depts - train_depts
print(f"Departments in test set but not in train set: {missing_depts}")

# Generate test set predictions
test_predictions = []

# Iterate over test set Store-Dept-Date triplets
test_grouped = test_merged.groupby(['Store', 'Dept'])
for (store, dept), group in test_grouped:
    test_dates = group['Date'].sort_values()

    # Initialize predictions
    pred = None

    # Check if we have a trained model
    if (store, dept) in arima_models:
        try:
            model = arima_models[(store, dept)]
            pred = model.forecast(steps=len(test_dates))
        except Exception as e:
            print(f"Forecast failed for Store {store}, Dept {dept}: {e}")

    # If no model or forecast failed, use fallback
    if pred is None or np.any(np.isnan(pred)) or np.any(np.isinf(pred)):
        train_ts = time_series_dict.get((store, dept), {}).get('series', None)
        if train_ts is not None and len(train_ts) > 0:
            pred = np.full(len(test_dates), train_ts[-1])
        else:
            dept_mean = train_filtered[train_filtered['Dept'] == dept]['Weekly_Sales'].mean()
            pred = np.full(len(test_dates), dept_mean if not np.isnan(dept_mean) else global_mean_sales)

    # Ensure valid floats and round to two decimal places
    pred = np.where(np.isnan(pred) | np.isinf(pred), global_mean_sales, pred)
    pred = np.round(pred, 2).astype(float)

    # Store predictions
    for date, pred_sales in zip(test_dates, pred):
        test_predictions.append({
            'Store': store,
            'Dept': dept,
            'Date': date,
            'Weekly_Sales': pred_sales
        })

# Convert to DataFrame
test_predictions_df = pd.DataFrame(test_predictions)

# Create Id column in the format Store_Dept_Date
test_predictions_df['Id'] = test_predictions_df.apply(
    lambda x: f"{int(x['Store'])}_{int(x['Dept'])}_{x['Date'].strftime('%Y-%m-%d')}", axis=1)

# Prepare submission file
submission = test_predictions_df[['Id', 'Weekly_Sales']]
submission['Weekly_Sales'] = submission['Weekly_Sales'].round(2)
submission['Weekly_Sales'] = submission['Weekly_Sales'].fillna(global_mean_sales)
submission['Weekly_Sales'] = submission['Weekly_Sales'].apply(lambda x: global_mean_sales if pd.isna(x) or np.isinf(x) else x)

# Verify no invalid values
print("Submission file shape:", submission.shape)
print(submission.head())
print("\nNaN values in Weekly_Sales:", submission['Weekly_Sales'].isna().sum())
print("Infinite values in Weekly_Sales:", np.isinf(submission['Weekly_Sales']).sum())
print("Empty or non-numeric values in Weekly_Sales:",
      submission['Weekly_Sales'].apply(lambda x: isinstance(x, str) and x.strip() == '').sum())

# Check problematic rows from previous submission
problem_lines = [4925, 9497, 36405, 47110, 49709, 49710, 49711, 49712, 49713, 49714]
problem_rows = [line - 2 for line in problem_lines]
print("\nChecking problematic rows in new submission:")
print(submission.iloc[problem_rows])

# Save submission file
submission.to_csv('/content/submission.csv', index=False)

# Log submission to wandb
wandb.log({"step": "test_predictions", "submission_file": "created"})
wandb.save('/content/submission.csv')
print("Submission file saved: /content/submission.csv")

Global mean sales (fallback): 13670.121618418989
Departments in test set but not in train set: {np.int64(43), np.int64(77), np.int64(39)}
Submission file shape: (115064, 2)
               Id  Weekly_Sales
0  1_1_2012-11-02      40687.77
1  1_1_2012-11-09      27693.91
2  1_1_2012-11-16      23990.66
3  1_1_2012-11-23      23291.52
4  1_1_2012-11-30      23159.53

NaN values in Weekly_Sales: 0
Infinite values in Weekly_Sales: 0
Empty or non-numeric values in Weekly_Sales: 0

Checking problematic rows in new submission:
                     Id  Weekly_Sales
4923    2_77_2013-01-11      13670.12
9495    4_39_2013-07-12      13670.12
36403  14_43_2012-12-07      13670.12
47108  18_43_2013-04-26      13670.12
49707  19_39_2012-11-02      13670.12
49708  19_39_2012-11-09      13670.12
49709  19_39_2012-11-16      13670.12
49710  19_39_2012-11-23      13670.12
49711  19_39_2012-11-30      13670.12
49712  19_39_2012-12-07      13670.12




Submission file saved: /content/submission.csv


#Score: 6765.77723