<a href="https://colab.research.google.com/github/lkhok22/ML-FinalProject-Walmart-Recruiting---Store-Sales-Forecasting/blob/main/model_experiment_ARIMA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import Libraries and Initialize wandb
import pandas as pd
import numpy as np
import zipfile
import glob
from google.colab import drive
import warnings
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.arima.model import ARIMA
import statsmodels.api as sm
from sklearn.metrics import mean_absolute_error
warnings.filterwarnings('ignore')

# Initialize wandb
import wandb
wandb.init(project="walmart-sales-forecasting", name="arima-enhanced")

# Mount Google Drive
drive.mount('/content/drive')

# Unzip and Load Data
zip_path = "/content/drive/MyDrive/ML-FinalProject/data.zip"
extract_path = "/content/data"
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)
for zip_file in glob.glob(f"{extract_path}/*.csv.zip"):
    with zipfile.ZipFile(zip_file, 'r') as z:
        z.extractall(extract_path)

# Load data
train = pd.read_csv(f"{extract_path}/train.csv")
test = pd.read_csv(f"{extract_path}/test.csv")
stores = pd.read_csv(f"{extract_path}/stores.csv")
features = pd.read_csv(f"{extract_path}/features.csv")

# Log data loading completion
wandb.log({"step": "data_loading", "status": "completed"})
print("Data loaded successfully.")

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mlkhok22[0m ([33mlkhok22-free-university-of-tbilisi-[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Mounted at /content/drive
Data loaded successfully.


In [2]:
# Preprocessing
# Convert Date column to datetime
train['Date'] = pd.to_datetime(train['Date'])
test['Date'] = pd.to_datetime(test['Date'])
features['Date'] = pd.to_datetime(features['Date'])

# Merge datasets
train_merged = train.merge(features, on=['Store', 'Date', 'IsHoliday'], how='left')
train_merged = train_merged.merge(stores, on='Store', how='left')
test_merged = test.merge(features, on=['Store', 'Date', 'IsHoliday'], how='left')
test_merged = test_merged.merge(stores, on='Store', how='left')

# Impute MarkDown columns with 0 and add missingness indicators
markdown_cols = [f'MarkDown{i}' for i in range(1, 6)]
for col in markdown_cols:
    train_merged[f'{col}_was_missing'] = train_merged[col].isna().astype(int)
    test_merged[f'{col}_was_missing'] = test_merged[col].isna().astype(int)
    train_merged[col] = train_merged[col].fillna(0)
    test_merged[col] = test_merged[col].fillna(0)

# Impute numerical columns with ffill/bfill, fallback to mean
numerical_cols = ['Temperature', 'Fuel_Price', 'CPI', 'Unemployment']
means = {col: train_merged[col].mean() for col in numerical_cols}
for col in numerical_cols:
    train_merged[col] = train_merged[col].ffill().bfill()
    test_merged[col] = test_merged[col].ffill().bfill()
    train_merged[col] = train_merged[col].fillna(means[col])
    test_merged[col] = test_merged[col].fillna(means[col])

# Sort by Store, Dept, and Date
train_merged = train_merged.sort_values(['Store', 'Dept', 'Date'])
test_merged = test_merged.sort_values(['Store', 'Dept', 'Date'])

# Filter Store-Dept pairs with at least 10 observations
train_grouped = train_merged.groupby(['Store', 'Dept']).size().reset_index(name='count')
valid_pairs = train_grouped[train_grouped['count'] >= 10][['Store', 'Dept']]
train_filtered = train_merged.merge(valid_pairs, on=['Store', 'Dept'], how='inner')

# Handle missing Weekly_Sales
train_filtered['Weekly_Sales'] = train_filtered.groupby(['Store', 'Dept'])['Weekly_Sales'].ffill().bfill()
missing_sales = train_filtered['Weekly_Sales'].isna().sum()
if missing_sales > 0:
    mean_sales = train_filtered.groupby(['Store', 'Dept'])['Weekly_Sales'].transform('mean')
    train_filtered['Weekly_Sales'] = train_filtered['Weekly_Sales'].fillna(mean_sales)

# Log preprocessing
wandb.log({"step": "preprocessing", "missing_sales": missing_sales, "valid_pairs": len(valid_pairs)})
print(f"Train filtered shape: {train_filtered.shape}, Valid pairs: {len(valid_pairs)}")
print(f"Missing Weekly_Sales: {missing_sales}")

Train filtered shape: (420927, 21), Valid pairs: 3167
Missing Weekly_Sales: 0


In [4]:
# ARIMA Training, Validation, and Submission
# Function to check stationarity
def check_stationarity(series, store, dept):
    try:
        if series.max() == series.min():
            wandb.log({f"stationarity_store_{store}_dept_{dept}": {"p_value": 1.0, "is_stationary": False, "adf_statistic": 0.0}})
            return False
        result = adfuller(series.dropna(), autolag='AIC')
        p_value = result[1]
        is_stationary = p_value < 0.05
        wandb.log({f"stationarity_store_{store}_dept_{dept}": {"p_value": p_value, "is_stationary": is_stationary, "adf_statistic": result[0]}})
        return is_stationary
    except Exception as e:
        print(f"Stationarity check failed for Store {store}, Dept {dept}: {e}")
        wandb.log({f"stationarity_store_{store}_dept_{dept}": {"p_value": 1.0, "is_stationary": False, "adf_statistic": 0.0}})
        return False

# Prepare time series
time_series_dict = {}
for _, row in valid_pairs.iterrows():
    store, dept = row['Store'], row['Dept']
    ts = train_filtered[(train_filtered['Store'] == store) &
                       (train_filtered['Dept'] == dept)][['Date', 'Weekly_Sales']]
    ts = ts.set_index('Date')['Weekly_Sales'].asfreq('W-FRI')
    if len(ts) >= 10:
        is_stationary = check_stationarity(ts, store, dept)
        time_series_dict[(store, dept)] = {'series': ts, 'is_stationary': is_stationary}

stationary_count = sum(1 for v in time_series_dict.values() if v['is_stationary'])
print(f"Stationary series: {stationary_count}, Non-stationary: {len(time_series_dict) - stationary_count}, Total pairs: {len(time_series_dict)}")
wandb.log({"step": "stationarity_check", "stationary_series_count": stationary_count, "total_series_count": len(time_series_dict)})

# Function to calculate WMAE
def calculate_wmae(y_true, y_pred, is_holiday):
    weights = np.where(is_holiday, 5, 1)
    return np.sum(weights * np.abs(y_true - y_pred)) / np.sum(weights)

# Calculate department-specific holiday multipliers
holiday_multipliers = {}
for dept in train_filtered['Dept'].unique():
    dept_data = train_filtered[train_filtered['Dept'] == dept]
    holiday_sales = dept_data[dept_data['IsHoliday'] == 1]['Weekly_Sales'].mean()
    non_holiday_sales = dept_data[dept_data['IsHoliday'] == 0]['Weekly_Sales'].mean()
    if non_holiday_sales > 0:
        holiday_multipliers[dept] = holiday_sales / non_holiday_sales if holiday_sales > 0 else 1.5
    else:
        holiday_multipliers[dept] = 1.5

# Train and validate
val_predictions = []
arima_models = {}
batch_size = 500
arima_orders = [(1,0,2), (1,0,1), (0,1,2), (1,1,1)]
subset_size = 100
global_mean_sales = train_filtered['Weekly_Sales'].mean() if not train_filtered['Weekly_Sales'].isna().all() else 0.0

all_pairs = list(time_series_dict.keys())
print(f"Training ARIMA on {len(all_pairs)} Store-Dept pairs")

# Date-based validation split
validation_cutoff_date = pd.to_datetime('2012-09-01')

for batch_start in range(0, len(all_pairs), batch_size):
    batch_pairs = all_pairs[batch_start:batch_start + batch_size]
    print(f"Processing batch {batch_start // batch_size + 1}/{len(all_pairs) // batch_size + 1}")

    for store, dept in batch_pairs:
        ts = time_series_dict[(store, dept)]['series']
        train_ts = ts[ts.index < validation_cutoff_date]
        val_ts = ts[ts.index >= validation_cutoff_date]

        if len(train_ts) < 10 or len(val_ts) < 1:
            continue

        orders_to_test = arima_orders if (store, dept) in all_pairs[:subset_size] else [(1,0,2)]

        for order in orders_to_test:
            try:
                model = ARIMA(train_ts, order=order).fit()
                if order == (1,0,2):
                    arima_models[(store, dept)] = model

                val_pred = model.forecast(steps=len(val_ts))
                val_actual = val_ts.values
                val_dates = val_ts.index
                val_holidays = train_filtered[(train_filtered['Store'] == store) &
                                            (train_filtered['Dept'] == dept) &
                                            (train_filtered['Date'].isin(val_dates))]['IsHoliday'].reindex(val_dates, fill_value=0).values

                if len(val_holidays) != len(val_pred):
                    print(f"Holiday mismatch for Store {store}, Dept {dept}, order {order}, lengths: {len(val_holidays)} vs {len(val_pred)}")
                    continue

                wmae = calculate_wmae(val_actual, val_pred, val_holidays)
                wandb.log({
                    f"wmae_store_{store}_dept_{dept}_order_{order}": wmae,
                    f"aic_store_{store}_dept_{dept}_order_{order}": model.aic
                })

                if order == (1,0,2):
                    for date, pred, actual, holiday in zip(val_dates, val_pred, val_actual, val_holidays):
                        val_predictions.append({
                            'Store': store,
                            'Dept': dept,
                            'Date': date,
                            'Weekly_Sales_Pred': pred,
                            'Weekly_Sales_Actual': actual,
                            'IsHoliday': holiday
                        })

            except Exception as e:
                print(f"ARIMA failed for Store {store}, Dept {dept}, order {order}: {e}")
                last_value = train_ts[-1] if len(train_ts) > 0 else global_mean_sales
                val_pred = np.full(len(val_ts), last_value)
                val_holidays = train_filtered[(train_filtered['Store'] == store) &
                                            (train_filtered['Dept'] == dept) &
                                            (train_filtered['Date'].isin(val_dates))]['IsHoliday'].reindex(val_dates, fill_value=0).values

                if len(val_holidays) != len(val_pred):
                    print(f"Holiday mismatch in fallback for Store {store}, Dept {dept}, lengths: {len(val_holidays)} vs {len(val_pred)}")
                    continue

                if order == (1,0,2):
                    for date, pred, actual, holiday in zip(val_dates, val_pred, val_actual, val_holidays):
                        val_predictions.append({
                            'Store': store,
                            'Dept': dept,
                            'Date': date,
                            'Weekly_Sales_Pred': pred,
                            'Weekly_Sales_Actual': actual,
                            'IsHoliday': holiday
                        })

# Validate
val_predictions_df = pd.DataFrame(val_predictions)
if not val_predictions_df.empty:
    overall_wmae = calculate_wmae(val_predictions_df['Weekly_Sales_Actual'],
                                 val_predictions_df['Weekly_Sales_Pred'],
                                 val_predictions_df['IsHoliday'])
    print(f"Overall WMAE for ARIMA(1,0,2): {overall_wmae}")
    wandb.log({"step": "arima_validation", "arima_order": "1,0,2", "overall_wmae": overall_wmae})
else:
    print("Error: No valid predictions generated.")

# Test set predictions
test_predictions = []
train_depts = set(train_filtered['Dept'].unique())
test_depts = set(test_merged['Dept'].unique())
missing_depts = test_depts - train_depts
print(f"Departments in test set but not in train set: {missing_depts}")

test_grouped = test_merged.groupby(['Store', 'Dept'])
for (store, dept), group in test_grouped:
    test_dates = group['Date'].sort_values()
    holiday_adjust = group['IsHoliday'].values
    pred = None

    if (store, dept) in arima_models:
        try:
            model = arima_models[(store, dept)]
            pred = model.forecast(steps=len(test_dates))
        except Exception as e:
            print(f"Forecast failed for Store {store}, Dept {dept}: {e}")

    if pred is None or np.any(np.isnan(pred)) or np.any(np.isinf(pred)):
        train_ts = time_series_dict.get((store, dept), {}).get('series', None)
        if train_ts is not None and len(train_ts) > 0:
            pred = np.full(len(test_dates), train_ts[-1])
        else:
            dept_median = train_filtered[train_filtered['Dept'] == dept]['Weekly_Sales'].median()
            pred = np.full(len(test_dates), dept_median if not np.isnan(dept_median) else global_mean_sales)

    # Department-specific holiday adjustment
    holiday_multiplier = holiday_multipliers.get(dept, 1.5)
    pred = np.where(holiday_adjust, pred * holiday_multiplier, pred)
    pred = np.where(np.isnan(pred) | np.isinf(pred), global_mean_sales, pred)
    pred = np.round(pred, 2).astype(float)

    for date, pred_sales in zip(test_dates, pred):
        test_predictions.append({
            'Store': store,
            'Dept': dept,
            'Date': date,
            'Weekly_Sales': pred_sales
        })

# Prepare submission
test_predictions_df = pd.DataFrame(test_predictions)
test_predictions_df['Id'] = test_predictions_df.apply(
    lambda x: f"{int(x['Store'])}_{int(x['Dept'])}_{x['Date'].strftime('%Y-%m-%d')}", axis=1)
submission = test_predictions_df[['Id', 'Weekly_Sales']]
submission['Weekly_Sales'] = submission['Weekly_Sales'].round(2)
submission['Weekly_Sales'] = submission['Weekly_Sales'].fillna(global_mean_sales)
submission['Weekly_Sales'] = submission['Weekly_Sales'].apply(lambda x: global_mean_sales if pd.isna(x) or np.isinf(x) else x)

# Verify submission
print("Submission file shape:", submission.shape)
print(submission.head())
print("\nNaN values in Weekly_Sales:", submission['Weekly_Sales'].isna().sum())
print("Infinite values in Weekly_Sales:", np.isinf(submission['Weekly_Sales']).sum())
print("Empty or non-numeric values in Weekly_Sales:",
      submission['Weekly_Sales'].apply(lambda x: isinstance(x, str) and x.strip() == '').sum())
problem_lines = [4925, 9497, 36405, 47110, 49709, 49710, 49711, 49712, 49713, 49714]
problem_rows = [line - 2 for line in problem_lines]
print("\nChecking problematic rows:")
print(submission.iloc[problem_rows])

# Save submission
submission.to_csv('/content/submission.csv', index=False)
wandb.log({"step": "arima_submission", "submission_file": "created"})
wandb.save('/content/submission.csv')
print("Submission file saved: /content/submission.csv")

Stationary series: 2275, Non-stationary: 892, Total pairs: 3167
Training ARIMA on 3167 Store-Dept pairs
Processing batch 1/7
Processing batch 2/7
Processing batch 3/7
Processing batch 4/7
Processing batch 5/7
Processing batch 6/7
Processing batch 7/7
Overall WMAE for ARIMA(1,0,2): 1931.0866113120837
Departments in test set but not in train set: {np.int64(43), np.int64(77), np.int64(39)}
Submission file shape: (115064, 2)
               Id  Weekly_Sales
0  1_1_2012-11-02      19996.11
1  1_1_2012-11-09      22016.22
2  1_1_2012-11-16      22484.27
3  1_1_2012-11-23      23178.52
4  1_1_2012-11-30      22568.45

NaN values in Weekly_Sales: 0
Infinite values in Weekly_Sales: 0
Empty or non-numeric values in Weekly_Sales: 0

Checking problematic rows:
                     Id  Weekly_Sales
4923    2_77_2013-01-11      16005.54
9495    4_39_2013-07-12      16005.54
36403  14_43_2012-12-07      16005.54
47108  18_43_2013-04-26      16005.54
49707  19_39_2012-11-02      16005.54
49708  19_39_2

#Score: 4680.49545
