<a href="https://colab.research.google.com/github/lkhok22/ML-FinalProject-Walmart-Recruiting---Store-Sales-Forecasting/blob/main/model_experiment_Ensemble_Trees.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:

# Install required libraries
!pip install pandas numpy matplotlib seaborn scikit-learn lightgbm xgboost wandb pyyaml --quiet
import wandb
wandb.login(key="eccf2c915699fc032ad678daf0fd4b5ac60bf87c")

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mabakh22[0m ([33mabakh22-free-university-of-tbilisi-[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [5]:
# Mount Google Drive and extract data
from google.colab import drive
import zipfile
import os
drive.mount('/content/drive')
zip_path = '/content/drive/MyDrive/ML-FinalProject/data.zip'
extract_to = '/content/walmart_data/'
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_to)
for file_name in os.listdir(extract_to):
    if file_name.endswith('.zip'):
        with zipfile.ZipFile(os.path.join(extract_to, file_name), 'r') as zip_ref:
            zip_ref.extractall(extract_to)
print("✅ Extracted files:", os.listdir(extract_to))

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ Extracted files: ['test.csv.zip', 'stores.csv', 'sampleSubmission.csv', 'features.csv', 'test.csv', 'train.csv.zip', 'train.csv', 'sampleSubmission.csv.zip', 'features.csv.zip']


In [6]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import TimeSeriesSplit
import lightgbm as lgb
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
import wandb
import warnings
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
np.random.seed(42)

In [7]:
# Load data
data_path = '/content/walmart_data/'
train_df = pd.read_csv(data_path + 'train.csv')
test_df = pd.read_csv(data_path + 'test.csv')
features_df = pd.read_csv(data_path + 'features.csv')
stores_df = pd.read_csv(data_path + 'stores.csv')

# Merge datasets, keeping IsHoliday from features.csv
train_df = train_df.merge(stores_df, on='Store').merge(
    features_df, on=['Store', 'Date'], suffixes=('', '_features'), how='left'
)
test_df = test_df.merge(stores_df, on='Store').merge(
    features_df, on=['Store', 'Date'], suffixes=('', '_features'), how='left'
)

# Drop redundant IsHoliday and rename
train_df = train_df.drop(columns=['IsHoliday'], errors='ignore').rename(
    columns={'IsHoliday_features': 'IsHoliday'}
)
test_df = test_df.drop(columns=['IsHoliday'], errors='ignore').rename(
    columns={'IsHoliday_features': 'IsHoliday'}
)

# Convert Date to datetime
train_df['Date'] = pd.to_datetime(train_df['Date'])
test_df['Date'] = pd.to_datetime(test_df['Date'])

# Handle missing values in MarkDown columns
markdown_cols = ['MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5']
train_df[markdown_cols] = train_df[markdown_cols].fillna(0)
test_df[markdown_cols] = test_df[markdown_cols].fillna(0)

# Convert IsHoliday to numeric
train_df['IsHoliday'] = train_df['IsHoliday'].astype(int)
test_df['IsHoliday'] = test_df['IsHoliday'].astype(int)

# Fill missing values in other columns
numeric_cols = ['Temperature', 'Fuel_Price', 'CPI', 'Unemployment']
for col in numeric_cols:
    train_df[col] = train_df[col].fillna(train_df[col].mean())
    test_df[col] = test_df[col].fillna(train_df[col].mean())

def create_features(df):
    """Create time-based and lag features for tree-based models"""
    df = df.copy()

    # Time-based features
    df['Year'] = df['Date'].dt.year
    df['Month'] = df['Date'].dt.month
    df['Week'] = df['Date'].dt.isocalendar().week
    df['Day'] = df['Date'].dt.day
    df['DayOfWeek'] = df['Date'].dt.dayofweek
    df['Quarter'] = df['Date'].dt.quarter
    df['IsYearEnd'] = (df['Date'].dt.month == 12).astype(int)
    df['IsYearStart'] = (df['Date'].dt.month == 1).astype(int)
    df['IsMonthEnd'] = df['Date'].dt.is_month_end.astype(int)
    df['IsMonthStart'] = df['Date'].dt.is_month_start.astype(int)

    # Store-Department combination
    df['Store_Dept'] = df['Store'].astype(str) + '_' + df['Dept'].astype(str)

    # Encode categorical variables
    le_type = LabelEncoder()
    df['Type_encoded'] = le_type.fit_transform(df['Type'])

    return df

# Create features for both datasets
train_df = create_features(train_df)
test_df = create_features(test_df)

def create_lag_features(df, target_col='Weekly_Sales', lags=[1, 2, 3, 4, 8, 12]):
    """Create lag features for time series"""
    df = df.copy()
    df = df.sort_values(['Store', 'Dept', 'Date'])

    for lag in lags:
        df[f'{target_col}_lag_{lag}'] = df.groupby(['Store', 'Dept'])[target_col].shift(lag)

    # Rolling statistics
    for window in [3, 4, 8]:
        df[f'{target_col}_rolling_mean_{window}'] = df.groupby(['Store', 'Dept'])[target_col].transform(
            lambda x: x.rolling(window=window, min_periods=1).mean()
        )
        df[f'{target_col}_rolling_std_{window}'] = df.groupby(['Store', 'Dept'])[target_col].transform(
            lambda x: x.rolling(window=window, min_periods=1).std()
        )

    return df

# Create lag features for training data
train_df = create_lag_features(train_df)

# For test data, we need to append it to train data to create proper lags
combined_df = pd.concat([train_df, test_df], ignore_index=True, sort=False)
combined_df = combined_df.sort_values(['Store', 'Dept', 'Date'])

# Create lag features for combined data
combined_df = create_lag_features(combined_df, target_col='Weekly_Sales')

# Split back into train and test
train_with_lags = combined_df[combined_df['Weekly_Sales'].notna()].copy()
test_with_lags = combined_df[combined_df['Weekly_Sales'].isna()].copy()

# Define feature columns
feature_cols = [
    'Store', 'Dept', 'Size', 'Type_encoded',
    'Temperature', 'Fuel_Price', 'CPI', 'Unemployment',
    'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5',
    'IsHoliday', 'Year', 'Month', 'Week', 'Day', 'DayOfWeek', 'Quarter',
    'IsYearEnd', 'IsYearStart', 'IsMonthEnd', 'IsMonthStart'
]

# Add lag features to feature columns
lag_cols = [col for col in train_with_lags.columns if 'lag_' in col or 'rolling_' in col]
feature_cols.extend(lag_cols)

# Remove rows with missing lag features for training
train_with_lags = train_with_lags.dropna(subset=lag_cols)

# Fill missing lag features in test set with 0 (or forward fill)
test_with_lags[lag_cols] = test_with_lags[lag_cols].fillna(0)

# Prepare training data
X_train = train_with_lags[feature_cols]
y_train = train_with_lags['Weekly_Sales']

# Prepare test data
X_test = test_with_lags[feature_cols]

print(f"Training data shape: {X_train.shape}")
print(f"Test data shape: {X_test.shape}")
print(f"Features: {len(feature_cols)}")

# Time series split for validation
tscv = TimeSeriesSplit(n_splits=3)
train_indices, val_indices = list(tscv.split(X_train))[-1]  # Use last split

X_train_split = X_train.iloc[train_indices]
y_train_split = y_train.iloc[train_indices]
X_val_split = X_train.iloc[val_indices]
y_val_split = y_train.iloc[val_indices]

Training data shape: (382955, 36)
Test data shape: (115064, 36)
Features: 36


In [9]:
# Initialize models
models = {
    'LightGBM': lgb.LGBMRegressor(
        n_estimators=1000,
        learning_rate=0.05,
        max_depth=8,
        num_leaves=31,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1,
        verbose=-1
    ),
    'XGBoost': xgb.XGBRegressor(
        n_estimators=500,
        learning_rate=0.05,
        max_depth=8,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_alpha=0.1,  # L1 regularization
        reg_lambda=0.1,  # L2 regularization
        random_state=42,
        n_jobs=-1,
        verbosity=0
    ),
    'RandomForest': RandomForestRegressor(
        n_estimators=200,
        max_depth=15,
        min_samples_split=5,
        min_samples_leaf=2,
        random_state=42,
        n_jobs=-1
    )
}

# Initialize wandb
wandb.init(project="walmart-sales-forecasting-trees", config={
    "models": list(models.keys()),
    "features": len(feature_cols),
    "train_size": len(X_train_split),
    "val_size": len(X_val_split)
})

print("🚀 Starting model training...")

# Train models and get validation predictions
val_predictions = {}
model_objects = {}

for name, model in models.items():
    print(f"Training {name}...")

    if name == 'LightGBM':
        model.fit(
            X_train_split, y_train_split,
            eval_set=[(X_val_split, y_val_split)],
            callbacks=[lgb.early_stopping(100), lgb.log_evaluation(0)]
        )
    elif name == 'XGBoost':
        model.fit(
            X_train_split, y_train_split,
            eval_set=[(X_val_split, y_val_split)],
            verbose=False
        )
    else:  # RandomForest
        model.fit(X_train_split, y_train_split)

    # Validation predictions
    val_pred = model.predict(X_val_split)
    val_predictions[name] = val_pred
    model_objects[name] = model

    # Calculate validation MAE
    val_mae = mean_absolute_error(y_val_split, val_pred)
    val_rmse = np.sqrt(mean_squared_error(y_val_split, val_pred))

    print(f"{name} - Validation MAE: {val_mae:.4f}, RMSE: {val_rmse:.4f}")

    # Log to wandb
    wandb.log({
        f"val_{name}_MAE": val_mae,
        f"val_{name}_RMSE": val_rmse
    })

# Create ensemble prediction (simple average)
ensemble_val_pred = np.mean(list(val_predictions.values()), axis=0)
ensemble_mae = mean_absolute_error(y_val_split, ensemble_val_pred)
ensemble_rmse = np.sqrt(mean_squared_error(y_val_split, ensemble_val_pred))

print(f"Ensemble - Validation MAE: {ensemble_mae:.4f}, RMSE: {ensemble_rmse:.4f}")
wandb.log({
    "val_ensemble_MAE": ensemble_mae,
    "val_ensemble_RMSE": ensemble_rmse
})

print("✅ Validation completed!")

# Train final models on full training data
print("🚀 Training final models on full dataset...")

final_models = {}
final_predictions = {}

for name, model in models.items():
    print(f"Final training {name}...")

    # Create fresh model instance
    if name == 'LightGBM':
        final_model = lgb.LGBMRegressor(
            n_estimators=1000,
            learning_rate=0.05,
            max_depth=8,
            num_leaves=31,
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=42,
            n_jobs=-1,
            verbose=-1
        )
        final_model.fit(X_train, y_train)
    elif name == 'XGBoost':
        final_model = xgb.XGBRegressor(
            n_estimators=500,
            learning_rate=0.05,
            max_depth=8,
            subsample=0.8,
            colsample_bytree=0.8,
            reg_alpha=0.1,
            reg_lambda=0.1,
            random_state=42,
            n_jobs=-1,
            verbosity=0
        )
        final_model.fit(X_train, y_train)
    else:  # RandomForest
        final_model = RandomForestRegressor(
            n_estimators=200,
            max_depth=15,
            min_samples_split=5,
            min_samples_leaf=2,
            random_state=42,
            n_jobs=-1
        )
        final_model.fit(X_train, y_train)

    # Generate test predictions
    test_pred = final_model.predict(X_test)
    final_predictions[name] = test_pred
    final_models[name] = final_model

# Create final ensemble prediction
final_ensemble_pred = np.mean(list(final_predictions.values()), axis=0)

# Prepare submission
submission_df = test_df[['Store', 'Dept', 'Date']].copy()

# Create submission ID
submission_df['Id'] = (submission_df['Store'].astype(str) + '_' +
                      submission_df['Dept'].astype(str) + '_' +
                      submission_df['Date'].dt.strftime('%Y-%m-%d'))

# Add ensemble predictions
submission_df['Weekly_Sales'] = final_ensemble_pred

# Ensure non-negative predictions
submission_df['Weekly_Sales'] = np.maximum(submission_df['Weekly_Sales'], 0)

# Final submission format
final_submission = submission_df[['Id', 'Weekly_Sales']].copy()

print("📊 Submission Summary:")
print(f"Total predictions: {len(final_submission)}")
print(f"Mean prediction: {final_submission['Weekly_Sales'].mean():.2f}")
print(f"Std prediction: {final_submission['Weekly_Sales'].std():.2f}")
print(f"Min prediction: {final_submission['Weekly_Sales'].min():.2f}")
print(f"Max prediction: {final_submission['Weekly_Sales'].max():.2f}")

# Feature importance analysis
print("\n📈 Feature Importance (LightGBM):")
lgb_model = final_models['LightGBM']
feature_importance = pd.DataFrame({
    'feature': feature_cols,
    'importance': lgb_model.feature_importances_
}).sort_values('importance', ascending=False)

print(feature_importance.head(10))

# Save submission
final_submission.to_csv('/content/walmart_submission_trees.csv', index=False)
print("✅ Submission saved to /content/walmart_submission_trees.csv")

# Log final metrics
wandb.log({
    "total_predictions": len(final_submission),
    "mean_prediction": final_submission['Weekly_Sales'].mean(),
    "std_prediction": final_submission['Weekly_Sales'].std(),
    "min_prediction": final_submission['Weekly_Sales'].min(),
    "max_prediction": final_submission['Weekly_Sales'].max()
})

# Display first few predictions
print("\n📋 First 10 predictions:")
print(final_submission.head(10))

# Save individual model predictions for analysis
individual_predictions = pd.DataFrame({
    'Id': submission_df['Id'],
    'LightGBM': final_predictions['LightGBM'],
    'XGBoost': final_predictions['XGBoost'],
    'RandomForest': final_predictions['RandomForest'],
    'Ensemble': final_ensemble_pred
})

individual_predictions.to_csv('/content/individual_predictions.csv', index=False)
print("✅ Individual predictions saved to /content/individual_predictions.csv")

wandb.finish()

0,1
val_LightGBM_MAE,▁
val_LightGBM_RMSE,▁

0,1
val_LightGBM_MAE,364.54116
val_LightGBM_RMSE,1367.99121


🚀 Starting model training...
Training LightGBM...
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 1.8714e+06
LightGBM - Validation MAE: 364.5412, RMSE: 1367.9912
Training XGBoost...
XGBoost - Validation MAE: 334.3273, RMSE: 1577.1431
Training RandomForest...
RandomForest - Validation MAE: 258.4051, RMSE: 1378.3604
Ensemble - Validation MAE: 283.7097, RMSE: 1329.0928
✅ Validation completed!
🚀 Training final models on full dataset...
Final training LightGBM...
Final training XGBoost...
Final training RandomForest...
📊 Submission Summary:
Total predictions: 115064
Mean prediction: 942.93
Std prediction: 6354.92
Min prediction: 0.00
Max prediction: 193788.49

📈 Feature Importance (LightGBM):
                        feature  importance
30  Weekly_Sales_rolling_mean_3        4657
25           Weekly_Sales_lag_2        3796
24           Weekly_Sales_lag_1        3768
31   Weekly_Sales_rolling_std_3        2761


0,1
max_prediction,▁
mean_prediction,▁
min_prediction,▁
std_prediction,▁
total_predictions,▁
val_LightGBM_MAE,▁
val_LightGBM_RMSE,▁
val_RandomForest_MAE,▁
val_RandomForest_RMSE,▁
val_XGBoost_MAE,▁

0,1
max_prediction,193788.49278
mean_prediction,942.92583
min_prediction,0.0
std_prediction,6354.91883
total_predictions,115064.0
val_LightGBM_MAE,364.54116
val_LightGBM_RMSE,1367.99121
val_RandomForest_MAE,258.40506
val_RandomForest_RMSE,1378.36039
val_XGBoost_MAE,334.32732


In [11]:
# Read the individual predictions file
individual_predictions = pd.read_csv('/content/individual_predictions.csv')

# Create final submission with only Id and Ensemble predictions
final_submission_clean = individual_predictions[['Id', 'Ensemble']].copy()
final_submission_clean = final_submission_clean.rename(columns={'Ensemble': 'Weekly_Sales'})

# Save the clean submission file
final_submission_clean.to_csv('/content/walmart_submission_final.csv', index=False)

print("✅ Clean submission file saved to /content/walmart_submission_final.csv")
print(f"Columns: {list(final_submission_clean.columns)}")
print(f"Shape: {final_submission_clean.shape}")


✅ Clean submission file saved to /content/walmart_submission_final.csv
Columns: ['Id', 'Weekly_Sales']
Shape: (115064, 2)
