# ML Models for MCC Aggregates Forecasting
CatBoost model with colab compatibility and per-category metrics


In [6]:
!pip install catboost -q

In [7]:
# colab setup
try:
    import google.colab
    print("Running in Google Colab")
    import subprocess
    import sys
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'catboost', '-q'])
    print("✓ CatBoost installed")
except ImportError:
    print("Running in local environment")


Running in Google Colab
✓ CatBoost installed


In [8]:
# imports
import pandas as pd
import numpy as np
import json
from datetime import datetime
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from catboost import CatBoostRegressor
import warnings
warnings.filterwarnings('ignore')

print("Loading ML features dataset...")


Loading ML features dataset...


In [9]:
# Environment setup
def detect_environment():
    try:
        import google.colab
        from google.colab import drive
        drive.mount('/content/drive/')
        return 'colab', '/content/drive/MyDrive/fcst/'
    except ImportError:
        return 'local', '..'

environment, base_path = detect_environment()
print(f"Environment: {environment}")
print(f"Base path: {base_path}")
import sys
sys.path.append(base_path+'MCC Aggregates Forecasting')
from evaluation import evaluate_and_report_mcc, evaluate_mcc_forecasting


Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).
Environment: colab
Base path: /content/drive/MyDrive/fcst/


In [10]:
# load data
train_parquet = f'{base_path}/data/features/ml_train.parquet'
test_parquet = f'{base_path}/data/features/ml_test.parquet'

# Try parquet first, fallback to CSV
try:
    train_df = pd.read_parquet(train_parquet)
    test_df = pd.read_parquet(test_parquet)
    print("✓ Loaded parquet files")
except:
    try:
        train_df = pd.read_csv(f'{base_path}/data/features/ml_train.csv')
        test_df = pd.read_csv(f'{base_path}/data/features/ml_test.csv')
        print("✓ Loaded CSV files")
    except:
        print("❌ Could not load ML feature files")
        print("Please run feature_engineering_final.py first to create the ML features")
        exit()

train_df['date'] = pd.to_datetime(train_df['date'])
test_df['date'] = pd.to_datetime(test_df['date'])

print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")


✓ Loaded parquet files
Train shape: (1836701, 72)
Test shape: (462255, 72)


In [11]:
# prepare features
exclude_cols = ['client_id', 'category', 'date', 'amount', 'log_amount', 'sqrt_amount',
                'target', 'target_log', 'target_sqrt', 'split', 'week', 'address']
feature_cols = [col for col in train_df.columns if col not in exclude_cols]

# Handle categorical features (only gender needs encoding)
if 'gender' in feature_cols:
    from sklearn.preprocessing import LabelEncoder
    le = LabelEncoder()
    train_df = train_df.copy()
    test_df = test_df.copy()
    train_df['gender'] = le.fit_transform(train_df['gender'].astype(str))
    test_df['gender'] = le.transform(test_df['gender'].astype(str))

print(f"Number of features: {len(feature_cols)}")

# Remove NaN values and prepare data
train_df = train_df.dropna(subset=feature_cols + ['amount'])
test_df = test_df.dropna(subset=feature_cols + ['amount'])

X_train = train_df[feature_cols]
y_train = train_df['amount']
X_test = test_df[feature_cols]
y_test = test_df['amount']

print(f"Train set: {X_train.shape}, Test set: {X_test.shape}")


Number of features: 60
Train set: (1523349, 60), Test set: (462255, 60)


In [12]:
# catboost model
print("\n=== CatBoost Model ===")

# Lightweight parameter search
tscv = TimeSeriesSplit(n_splits=2)
catboost_params = {
    'iterations': [100, 200],
    'depth': [4, 6],
    'learning_rate': [0.05, 0.1]
}

cb = CatBoostRegressor(random_state=42, verbose=False)
grid_cb = GridSearchCV(cb, catboost_params, cv=tscv, scoring='neg_mean_absolute_error', n_jobs=-1)
grid_cb.fit(X_train, y_train)

best_catboost = grid_cb.best_estimator_
print(f"CatBoost best params: {grid_cb.best_params_}")
print("CatBoost training completed")



=== CatBoost Model ===
CatBoost best params: {'depth': 6, 'iterations': 200, 'learning_rate': 0.1}
CatBoost training completed


In [13]:
# catboost evaluation
y_pred_cb = best_catboost.predict(X_test)
test_categories = test_df['category'].values
y_train_for_rmsse = np.concatenate(train_df.groupby(['client_id', 'category'])['amount'].apply(lambda x: x.values).tolist())

cb_metrics, cb_report = evaluate_and_report_mcc(
    'CatBoost',
    y_test.values,
    y_pred_cb,
    y_train_for_rmsse,
    test_categories,
    print_report=True
)



MCC AGGREGATES FORECASTING REPORT: CATBOOST

OVERALL PERFORMANCE:
Metric          Value          
------------------------------
sMAPE_w         48.2336        
RMSSE_w         0.5136         
MAE             71.3287        
RMSE            117.4321       

PER-CATEGORY PERFORMANCE:
Category       sMAPE_w     RMSSE_w     MAE         RMSE        
---------------------------------------------------------------
food           44.9578     0.5253      82.2198     120.0990    
retail         44.9307     0.4291      61.8586     98.1098     
services       47.8982     0.6092      74.2266     139.2803    
specialty      49.8053     0.2566      38.5581     58.6763     
transport      55.0005     0.6334      91.3880     144.8089    




In [14]:
# per-category metrics
print("\n=== CatBoost Per-Category Results ===")
categories = sorted(test_df['category'].unique())
category_metrics = {}

for cat in categories:
    mask = test_df['category'] == cat
    if mask.sum() > 0:
        y_true_cat = y_test[mask].values
        y_pred_cat = y_pred_cb[mask]

        # Get training data for this category for RMSSE
        train_cat_mask = train_df['category'] == cat
        y_train_cat = train_df[train_cat_mask]['amount'].values

        # Calculate metrics
        mae = np.mean(np.abs(y_true_cat - y_pred_cat))
        rmse = np.sqrt(np.mean((y_true_cat - y_pred_cat)**2))
        smape = 100 * np.mean(2 * np.abs(y_pred_cat - y_true_cat) / (np.abs(y_true_cat) + np.abs(y_pred_cat) + 1e-8))

        # RMSSE
        if len(y_train_cat) > 1:
            naive_error = np.mean(np.abs(np.diff(y_train_cat)))
            rmsse = rmse / (naive_error + 1e-8)
        else:
            rmsse = np.nan

        category_metrics[cat] = {
            'MAE': mae,
            'RMSE': rmse,
            'sMAPE': smape,
            'RMSSE': rmsse,
            'n_samples': int(mask.sum())
        }

        print(f"{cat:15} | MAE: {mae:8.2f} | RMSE: {rmse:8.2f} | sMAPE: {smape:6.2f}% | RMSSE: {rmsse:6.4f} | n={mask.sum()}")



=== CatBoost Per-Category Results ===
food            | MAE:    82.22 | RMSE:   120.10 | sMAPE:  44.96% | RMSSE: 0.7527 | n=117203
retail          | MAE:    61.86 | RMSE:    98.11 | sMAPE:  44.93% | RMSSE: 0.8079 | n=100439
services        | MAE:    74.23 | RMSE:   139.28 | sMAPE:  47.90% | RMSSE: 0.9822 | n=78508
specialty       | MAE:    38.56 | RMSE:    58.68 | sMAPE:  49.81% | RMSSE: 0.7717 | n=73533
transport       | MAE:    91.39 | RMSE:   144.81 | sMAPE:  55.00% | RMSSE: 0.8174 | n=92572


In [15]:
# save results
results = {
    'timestamp': datetime.now().isoformat(),
    'environment': environment,
    'train_size': len(X_train),
    'test_size': len(X_test),
    'n_features': len(feature_cols),
    'models': {
        'CatBoost': {
            'params': best_catboost.get_params(),
            'metrics': cb_metrics,
            'cv_score': -grid_cb.best_score_,
            'category_metrics': category_metrics
        }
    }
}

output_file = f'{base_path}/MCC Aggregates Forecasting/ML models/ml_models_results.json'
with open(output_file, 'w') as f:
    json.dump(results, f, indent=2)

print(f"\nResults saved to {output_file}")

print("\n=== ML MODEL SUMMARY ===")
print(f"Environment: {environment}")
print(f"CatBoost Overall sMAPE: {cb_metrics['sMAPE_w']:.2f}%")
print(f"CatBoost Overall RMSSE: {cb_metrics['RMSSE_w']:.4f}")
print("ML model evaluation completed!")


Results saved to /content/drive/MyDrive/fcst//MCC Aggregates Forecasting/ML models/ml_models_results.json

=== ML MODEL SUMMARY ===
Environment: colab
CatBoost Overall sMAPE: 48.23%
CatBoost Overall RMSSE: 0.5136
ML model evaluation completed!
