<a href="https://colab.research.google.com/github/AleksandreBakhtadze/ML-abakh22-assignment-1/blob/main/model_experiment_SARIMA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
# ✅ Only install what you need for SARIMAX and logging
!pip install statsmodels wandb --quiet

In [16]:
import pandas as pd
import numpy as np
from statsmodels.tsa.statespace.sarimax import SARIMAX
import wandb
import os
from datetime import datetime
import warnings
warnings.filterwarnings("ignore")

In [17]:
# Install required libraries
!pip install pandas numpy matplotlib seaborn scikit-learn torch torchvision wandb pyyaml darts --quiet
import wandb
wandb.login(key="eccf2c915699fc032ad678daf0fd4b5ac60bf87c")

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [18]:
# Mount Google Drive and extract data
from google.colab import drive
import zipfile
import os
drive.mount('/content/drive')
zip_path = '/content/drive/MyDrive/ML-FinalProject/data.zip'
extract_to = '/content/walmart_data/'
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_to)
for file_name in os.listdir(extract_to):
    if file_name.endswith('.zip'):
        with zipfile.ZipFile(os.path.join(extract_to, file_name), 'r') as zip_ref:
            zip_ref.extractall(extract_to)
print("✅ Extracted files:", os.listdir(extract_to))

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ Extracted files: ['test.csv.zip', 'features.csv', 'train.csv.zip', 'train.csv', 'features.csv.zip', 'test.csv', 'stores.csv', 'sampleSubmission.csv.zip', 'sampleSubmission.csv']


In [19]:
# Load and preprocess data
train = pd.read_csv('/content/walmart_data/train.csv')
features = pd.read_csv('/content/walmart_data/features.csv')
stores = pd.read_csv('/content/walmart_data/stores.csv')
test = pd.read_csv('/content/walmart_data/test.csv')

# Merge train with features and stores
df = pd.merge(train, features, on=['Store', 'Date'], how='left')
df = pd.merge(df, stores, on='Store', how='left')
df = df.drop(columns=['IsHoliday_x']).rename(columns={'IsHoliday_y': 'IsHoliday'})
df['Date'] = pd.to_datetime(df['Date'])
df = df.sort_values(by=['Store', 'Dept', 'Date'])

# ✅ CREATE THE MISSING IsHolidayWeight COLUMN
df['IsHolidayWeight'] = df['IsHoliday'].apply(lambda x: 5 if x else 1)

# Add holiday-specific features
holiday_dates = {
    'SuperBowl': ['2010-02-12', '2011-02-11', '2012-02-10', '2013-02-08'],
    'LaborDay': ['2010-09-10', '2011-09-09', '2012-09-07', '2013-09-06'],
    'Thanksgiving': ['2010-11-26', '2011-11-25', '2012-11-23', '2013-11-29'],
    'Christmas': ['2010-12-31', '2011-12-30', '2012-12-28', '2013-12-27']
}
for holiday, dates in holiday_dates.items():
    df[holiday] = df['Date'].isin(pd.to_datetime(dates)).astype(int)

# Add time-based features
df['WeekOfYear'] = df['Date'].dt.isocalendar().week
df['Month'] = df['Date'].dt.month
df['Year'] = df['Date'].dt.year - df['Date'].dt.year.min()

# Add holiday proximity features
for holiday, dates in holiday_dates.items():
    for date in pd.to_datetime(dates):
        df[f'{holiday}_Before'] = ((df['Date'] < date) & (df['Date'] >= date - pd.Timedelta(weeks=2))).astype(int)
        df[f'{holiday}_After'] = ((df['Date'] > date) & (df['Date'] <= date + pd.Timedelta(weeks=2))).astype(int)

# Load and preprocess test data
test_df = pd.merge(test, features, on=['Store', 'Date'], how='left')
test_df = pd.merge(test_df, stores, on='Store', how='left')
test_df = test_df.drop(columns=['IsHoliday_x']).rename(columns={'IsHoliday_y': 'IsHoliday'})
test_df['Date'] = pd.to_datetime(test_df['Date'])
test_df = test_df.sort_values(by=['Store', 'Dept', 'Date'])

# ✅ ADD IsHolidayWeight TO TEST DATA TOO
test_df['IsHolidayWeight'] = test_df['IsHoliday'].apply(lambda x: 5 if x else 1)

# Add holiday-specific features to test
for holiday, dates in holiday_dates.items():
    test_df[holiday] = test_df['Date'].isin(pd.to_datetime(dates)).astype(int)

# Add time-based features to test
test_df['WeekOfYear'] = test_df['Date'].dt.isocalendar().week
test_df['Month'] = test_df['Date'].dt.month
# ✅ FIX: Use the same min year as training data
test_df['Year'] = test_df['Date'].dt.year - df['Date'].dt.year.min()


In [20]:
def calculate_wmae(y_true, y_pred, weights):
    """Calculate Weighted Mean Absolute Error"""
    return np.sum(weights * np.abs(y_true - y_pred)) / np.sum(weights)

def calculate_mae(y_true, y_pred):
    """Calculate Mean Absolute Error"""
    return np.mean(np.abs(y_true - y_pred))

In [21]:
# ✅ FAST SARIMAX APPROACH with optimizations
class FastSARIMAX:
    def __init__(self, order=(1, 0, 0), seasonal_order=(0, 0, 0, 52)):  # Simpler default
        self.order = order
        self.seasonal_order = seasonal_order
        self.models = {}
        self.fallback_means = {}

    def fit_predict_store_dept(self, store, dept, train_data, test_data):
        try:
            # Get training data for this store-dept
            train_group = train_data[(train_data['Store'] == store) & (train_data['Dept'] == dept)].copy()
            test_group = test_data[(test_data['Store'] == store) & (test_data['Dept'] == dept)].copy()

            if len(train_group) < 20:  # Need more data for SARIMAX
                mean_sales = train_group['Weekly_Sales'].mean() if len(train_group) > 0 else 0
                self.fallback_means[(store, dept)] = mean_sales
                return self._create_fallback_result(mean_sales, train_group, test_group)

            # Sort by date
            train_group = train_group.sort_values('Date')
            test_group = test_group.sort_values('Date')

            # Prepare time series
            y_train = train_group['Weekly_Sales'].values

            # Skip if constant or near-constant series
            if len(set(y_train)) <= 2 or np.std(y_train) < 1e-3:
                mean_sales = np.mean(y_train)
                self.fallback_means[(store, dept)] = mean_sales
                return self._create_fallback_result(mean_sales, train_group, test_group)

            # ✅ FASTER SARIMAX with reduced complexity and timeout
            model = SARIMAX(
                endog=y_train,
                order=self.order,
                seasonal_order=self.seasonal_order,
                enforce_stationarity=False,
                enforce_invertibility=False,
                simple_differencing=True  # Faster differencing
            )

            # ✅ FASTER FITTING with strict limits
            fitted_model = model.fit(
                disp=False,
                maxiter=50,  # Reduced iterations
                method='lbfgs',  # Faster optimizer
                optim_score='harvey',  # Faster scoring
                low_memory=True
            )

            self.models[(store, dept)] = fitted_model

            # Validation split
            split_idx = int(len(y_train) * 0.8)
            val_pred = fitted_model.forecast(steps=len(y_train) - split_idx)
            val_actual = y_train[split_idx:]
            val_weights = train_group['IsHolidayWeight'].iloc[split_idx:].values

            # Test prediction
            test_pred = None
            if len(test_group) > 0:
                test_pred = fitted_model.forecast(steps=len(test_group))

            return val_pred, val_actual, val_weights, test_pred, test_group

        except Exception as e:
            # Fallback to mean for problematic series
            mean_sales = train_group['Weekly_Sales'].mean() if len(train_group) > 0 else 0
            self.fallback_means[(store, dept)] = mean_sales
            return self._create_fallback_result(mean_sales, train_group, test_group)

    def _create_fallback_result(self, mean_sales, train_group, test_group):
        """Create fallback result using mean prediction"""
        if len(train_group) == 0:
            return None, None, None, None, test_group

        split_idx = int(len(train_group) * 0.8)
        val_length = len(train_group) - split_idx

        val_pred = np.full(val_length, mean_sales)
        val_actual = train_group['Weekly_Sales'].iloc[split_idx:].values
        val_weights = train_group['IsHolidayWeight'].iloc[split_idx:].values

        test_pred = None
        if len(test_group) > 0:
            test_pred = np.full(len(test_group), mean_sales)

        return val_pred, val_actual, val_weights, test_pred, test_group

In [22]:
# Initialize wandb
wandb.init(project="walmart-sales-forecasting", name="sarimax-fast-model", config={
    "model": "SARIMAX",
    "seasonal_period": 52,
    "order": (1, 0, 0),  # Simpler for speed
    "seasonal_order": (0, 0, 0, 52)  # Non-seasonal for speed
})

# ✅ FAST TRAINING LOOP with progress tracking
model = FastSARIMAX(order=(1, 0, 0), seasonal_order=(0, 0, 0, 52))  # Simpler for speed

val_predictions = []
val_actuals = []
val_weights = []

# Get unique store-dept combinations from TRAINING data
store_dept_combinations = df[['Store', 'Dept']].drop_duplicates()

print(f"Training models for {len(store_dept_combinations)} store-department combinations...")

import time
start_time = time.time()
fallback_count = 0
success_count = 0

# Dictionary to store predictions for each store-dept
predictions_dict = {}

for idx, (_, row) in enumerate(store_dept_combinations.iterrows()):
    store, dept = row['Store'], row['Dept']

    # Progress updates every 100 iterations
    if idx % 100 == 0:
        elapsed = time.time() - start_time
        rate = idx / elapsed if elapsed > 0 else 0
        eta = (len(store_dept_combinations) - idx) / rate if rate > 0 else 0
        print(f"Progress: {idx}/{len(store_dept_combinations)} ({idx/len(store_dept_combinations)*100:.1f}%) | "
              f"Rate: {rate:.1f}/sec | ETA: {eta/60:.1f} min | Success: {success_count}, Fallback: {fallback_count}")

    result = model.fit_predict_store_dept(store, dept, df, test_df)

    if result[0] is None:  # Complete failure case
        fallback_count += 1
        predictions_dict[(store, dept)] = {'type': 'failed', 'value': 0}
    else:
        val_pred, val_actual, val_w, test_pred, test_group = result

        if val_pred is not None and val_actual is not None:
            success_count += 1
            val_predictions.extend(val_pred)
            val_actuals.extend(val_actual)
            val_weights.extend(val_w)

            # Log less frequently to save time
            if idx % 500 == 0:
                val_mae_score = calculate_mae(val_actual, val_pred)
                wandb.log({
                    f"Store_{store}_Dept_{dept}_val_mae": val_mae_score,
                    "store": store,
                    "dept": dept
                })
        else:
            fallback_count += 1

        # Store predictions in dictionary
        if test_pred is not None:
            predictions_dict[(store, dept)] = {'type': 'model', 'predictions': test_pred}
        else:
            # Use historical mean as fallback
            mean_sales = df[(df['Store'] == store) & (df['Dept'] == dept)]['Weekly_Sales'].mean()
            predictions_dict[(store, dept)] = {'type': 'mean', 'value': mean_sales if not pd.isna(mean_sales) else 0}

print(f"\n✅ Model training completed!")

Training models for 3331 store-department combinations...
Progress: 0/3331 (0.0%) | Rate: 0.0/sec | ETA: 0.0 min | Success: 0, Fallback: 0
Progress: 100/3331 (3.0%) | Rate: 13.5/sec | ETA: 4.0 min | Success: 100, Fallback: 0
Progress: 200/3331 (6.0%) | Rate: 16.7/sec | ETA: 3.1 min | Success: 200, Fallback: 0
Progress: 300/3331 (9.0%) | Rate: 20.7/sec | ETA: 2.4 min | Success: 300, Fallback: 0
Progress: 400/3331 (12.0%) | Rate: 24.7/sec | ETA: 2.0 min | Success: 400, Fallback: 0
Progress: 500/3331 (15.0%) | Rate: 27.0/sec | ETA: 1.7 min | Success: 500, Fallback: 0
Progress: 600/3331 (18.0%) | Rate: 28.5/sec | ETA: 1.6 min | Success: 600, Fallback: 0
Progress: 700/3331 (21.0%) | Rate: 30.8/sec | ETA: 1.4 min | Success: 700, Fallback: 0
Progress: 800/3331 (24.0%) | Rate: 32.7/sec | ETA: 1.3 min | Success: 800, Fallback: 0
Progress: 900/3331 (27.0%) | Rate: 34.5/sec | ETA: 1.2 min | Success: 900, Fallback: 0
Progress: 1000/3331 (30.0%) | Rate: 35.7/sec | ETA: 1.1 min | Success: 1000, Fall

In [23]:
# ✅ NOW GENERATE PREDICTIONS FOR ALL TEST DATA ROWS
print(f"Generating predictions for ALL test data rows...")

# Calculate overall mean as ultimate fallback
overall_mean = df['Weekly_Sales'].mean()

submission = []
missing_store_depts = set()

# Process EVERY row in test data
for idx, test_row in test_df.iterrows():
    store, dept = test_row['Store'], test_row['Dept']

    # Create the ID
    test_id = f"{store}_{dept}_{test_row['Date'].strftime('%Y-%m-%d')}"

    # Get prediction for this store-dept combination
    if (store, dept) in predictions_dict:
        pred_info = predictions_dict[(store, dept)]
        if pred_info['type'] == 'model':
            # Find which prediction index this test row corresponds to
            test_group = test_df[(test_df['Store'] == store) & (test_df['Dept'] == dept)].sort_values('Date')
            row_idx = test_group.index.get_loc(idx)
            if row_idx < len(pred_info['predictions']):
                pred_value = pred_info['predictions'][row_idx]
            else:
                pred_value = pred_info['predictions'][-1]  # Use last prediction
        else:
            pred_value = pred_info['value']
    else:
        # This store-dept combination wasn't in training data
        missing_store_depts.add((store, dept))
        pred_value = overall_mean  # Use overall mean as fallback

    submission.append({
        'Id': test_id,
        'Weekly_Sales': pred_value
    })

print(f"✅ Missing store-dept combinations (using overall mean): {len(missing_store_depts)}")
if missing_store_depts:
    print(f"Examples: {list(missing_store_depts)[:5]}")

print(f"✅ Generated predictions for {len(submission)} test rows")

# Verify we have the correct number of predictions
expected_rows = len(test_df)
actual_rows = len(submission)

print(f"\n📊 SUBMISSION VERIFICATION:")
print(f"Expected test rows: {expected_rows}")
print(f"Generated predictions: {actual_rows}")
print(f"Match: {'✅ YES' if expected_rows == actual_rows else '❌ NO'}")

if expected_rows != actual_rows:
    print(f"❌ Missing {expected_rows - actual_rows} predictions!")

    # Find missing test rows
    test_ids = set(f"{row['Store']}_{row['Dept']}_{row['Date'].strftime('%Y-%m-%d')}"
                   for _, row in test_df.iterrows())
    submission_ids = set(row['Id'] for row in submission)
    missing_ids = test_ids - submission_ids

    print(f"Missing IDs: {len(missing_ids)}")
    if missing_ids:
        print(f"Examples: {list(missing_ids)[:5]}")

        # Add missing predictions with overall mean
        for missing_id in missing_ids:
            submission.append({
                'Id': missing_id,
                'Weekly_Sales': overall_mean
            })

        print(f"✅ Added {len(missing_ids)} missing predictions")
        print(f"✅ Final submission size: {len(submission)}")

total_time = time.time() - start_time
print(f"\n✅ Processing completed in {total_time/60:.1f} minutes")
print(f"✅ Successful SARIMAX fits: {success_count}")
print(f"⚠️  Fallback predictions: {fallback_count}")

# Calculate overall WMAE
if len(val_predictions) > 0:
    overall_wmae = calculate_wmae(np.array(val_actuals), np.array(val_predictions), np.array(val_weights))
    wandb.log({
        "overall_wmae": overall_wmae,
        "success_count": success_count,
        "fallback_count": fallback_count,
        "total_time_minutes": total_time/60,
        "submission_rows": len(submission),
        "expected_rows": expected_rows
    })
    print(f"✅ Overall Validation WMAE: {overall_wmae:.4f}")
else:
    print("❌ No validation predictions generated")

# Create submission DataFrame
submission_df = pd.DataFrame(submission)
print(f"✅ Generated {len(submission_df)} predictions")
print(f"✅ Sample predictions:\n{submission_df.head()}")

# Save submission
submission_df.to_csv('/content/sarimax_submission.csv', index=False)
print("✅ Submission saved to /content/sarimax_submission.csv")

wandb.finish()

Generating predictions for ALL test data rows...
✅ Missing store-dept combinations (using overall mean): 11
Examples: [(18, 43), (24, 43), (36, 30), (37, 29), (25, 99)]
✅ Generated predictions for 115064 test rows

📊 SUBMISSION VERIFICATION:
Expected test rows: 115064
Generated predictions: 115064
Match: ✅ YES

✅ Processing completed in 4.1 minutes
✅ Successful SARIMAX fits: 3331
⚠️  Fallback predictions: 0
✅ Overall Validation WMAE: 5034.1327
✅ Generated 115064 predictions
✅ Sample predictions:
               Id  Weekly_Sales
0  1_1_2012-11-02  25478.484605
1  1_1_2012-11-09  23699.670720
2  1_1_2012-11-16  22045.047064
3  1_1_2012-11-23  20505.943134
4  1_1_2012-11-30  19074.293766
✅ Submission saved to /content/sarimax_submission.csv


0,1
Store_14_Dept_13_val_mae,▁
Store_1_Dept_1_val_mae,▁
Store_20_Dept_51_val_mae,▁
Store_27_Dept_10_val_mae,▁
Store_33_Dept_97_val_mae,▁
Store_41_Dept_6_val_mae,▁
Store_7_Dept_52_val_mae,▁
dept,▁▅▂▅▂█▁
expected_rows,▁
fallback_count,▁

0,1
Store_14_Dept_13_val_mae,7953.76273
Store_1_Dept_1_val_mae,8880.30376
Store_20_Dept_51_val_mae,11.06424
Store_27_Dept_10_val_mae,5431.11492
Store_33_Dept_97_val_mae,1640.43571
Store_41_Dept_6_val_mae,4382.32999
Store_7_Dept_52_val_mae,475.3113
dept,6.0
expected_rows,115064.0
fallback_count,0.0


# public score 7900