In [1]:

import numpy as np
from tqdm import tqdm
import os
import pandas as pd
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Define base directory
base_dir = '/content/drive/MyDrive/amazon_ml_challenge'

Mounted at /content/drive


In [2]:
import torch
# Load your CLIP-processed test data
test_data_path = '/content/drive/MyDrive/amazon_ml_challenge/combined_CLIP_final/clip_full_with_alignment.pt'
test_data = torch.load(test_data_path)
train_input = test_data['train_input']
val_input = test_data['val_input']
train_targets = test_data['train_targets']
val_targets = test_data['val_targets']


In [11]:
# =========================================================================
# PRICE-BASED SPECIALIST MODELS - PHASE 0 & 1
# Phase 0: Data Preparation & Binning
# Phase 1: Train Specialist Ensembles
# =========================================================================
import torch
import torch.nn as nn
import numpy as np
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import mean_absolute_error
from tqdm import tqdm
import os

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

# =========================================================================
# PHASE 0: DATA PREPARATION
# =========================================================================
print("\n" + "="*70)
print("üì¶ PHASE 0: DATA PREPARATION & PRICE BINNING")
print("="*70)

# Assuming you already have:
# train_input, val_input, train_targets, val_targets loaded

print(f"\n‚úÖ Data loaded:")
print(f"  Train: {train_input.shape}")
print(f"  Val: {val_input.shape}")

# Convert log prices to original scale
train_prices_orig = np.expm1(train_targets.cpu().numpy().squeeze())
val_prices_orig = np.expm1(val_targets.cpu().numpy().squeeze())

print(f"\nüìä Price Statistics:")
print(f"  Train prices - Min: ${train_prices_orig.min():.2f}, Max: ${train_prices_orig.max():.2f}")
print(f"  Train prices - Mean: ${train_prices_orig.mean():.2f}, Median: ${np.median(train_prices_orig):.2f}")

# Define price bins
BIN_BOUNDARIES = {
    'affordable': (0, 50),      # Bin 0
    'mid': (50, 100),           # Bin 1
    'premium': (100, float('inf'))  # Bin 2
}

print(f"\nüóÇÔ∏è  Price Bin Definitions:")
print(f"  Bin 0 (Affordable): $0 - $50")
print(f"  Bin 1 (Mid-range): $50 - $100")
print(f"  Bin 2 (Premium): $100+")

# Create masks for training data
train_affordable_mask = train_prices_orig < 40
train_mid_mask = (train_prices_orig >= 40) & (train_prices_orig < 75)
train_premium_mask = train_prices_orig >= 75

# Create masks for validation data
val_affordable_mask = val_prices_orig < 40
val_mid_mask = (val_prices_orig >= 40) & (val_prices_orig < 75)
val_premium_mask = val_prices_orig >= 75

# Print distribution
print(f"\nüìà Training Data Distribution:")
print(f"  Affordable (<$50):   {train_affordable_mask.sum():6d} samples ({train_affordable_mask.sum()/len(train_prices_orig)*100:5.1f}%)")
print(f"  Mid ($50-$100):      {train_mid_mask.sum():6d} samples ({train_mid_mask.sum()/len(train_prices_orig)*100:5.1f}%)")
print(f"  Premium ($100+):     {train_premium_mask.sum():6d} samples ({train_premium_mask.sum()/len(train_prices_orig)*100:5.1f}%)")

print(f"\nüìà Validation Data Distribution:")
print(f"  Affordable (<$50):   {val_affordable_mask.sum():6d} samples ({val_affordable_mask.sum()/len(val_prices_orig)*100:5.1f}%)")
print(f"  Mid ($50-$100):      {val_mid_mask.sum():6d} samples ({val_mid_mask.sum()/len(val_prices_orig)*100:5.1f}%)")
print(f"  Premium ($100+):     {val_premium_mask.sum():6d} samples ({val_premium_mask.sum()/len(val_prices_orig)*100:5.1f}%)")

# Split data by bins
print(f"\nüî™ Splitting data by price bins...")

# Training splits
train_affordable_input = train_input[train_affordable_mask]
train_affordable_targets = train_targets[train_affordable_mask]

train_mid_input = train_input[train_mid_mask]
train_mid_targets = train_targets[train_mid_mask]

train_premium_input = train_input[train_premium_mask]
train_premium_targets = train_targets[train_premium_mask]

# Validation splits
val_affordable_input = val_input[val_affordable_mask]
val_affordable_targets = val_targets[val_affordable_mask]

val_mid_input = val_input[val_mid_mask]
val_mid_targets = val_targets[val_mid_mask]

val_premium_input = val_input[val_premium_mask]
val_premium_targets = val_targets[val_premium_mask]

print(f"‚úÖ Data splitting complete!")

Using device: cuda

üì¶ PHASE 0: DATA PREPARATION & PRICE BINNING

‚úÖ Data loaded:
  Train: torch.Size([60000, 1046])
  Val: torch.Size([15000, 1046])

üìä Price Statistics:
  Train prices - Min: $0.13, Max: $1280.00
  Train prices - Mean: $23.60, Median: $14.09

üóÇÔ∏è  Price Bin Definitions:
  Bin 0 (Affordable): $0 - $50
  Bin 1 (Mid-range): $50 - $100
  Bin 2 (Premium): $100+

üìà Training Data Distribution:
  Affordable (<$50):    50903 samples ( 84.8%)
  Mid ($50-$100):        6056 samples ( 10.1%)
  Premium ($100+):       3041 samples (  5.1%)

üìà Validation Data Distribution:
  Affordable (<$50):    12743 samples ( 85.0%)
  Mid ($50-$100):        1494 samples ( 10.0%)
  Premium ($100+):        763 samples (  5.1%)

üî™ Splitting data by price bins...
‚úÖ Data splitting complete!


In [12]:
# =========================================================================
# MODEL DEFINITION
# =========================================================================
class RegressionMLP(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 1024),
            nn.ReLU(),
            nn.BatchNorm1d(1024),
            nn.Dropout(0.3),

            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.BatchNorm1d(512),
            nn.Dropout(0.3),

            nn.Linear(512, 256),
            nn.ReLU(),
            nn.BatchNorm1d(256),
            nn.Dropout(0.2),

            nn.Linear(256, 1)
        )

    def forward(self, x):
        return self.model(x)
# =========================================================================
# SMAPE LOSS FUNCTIONS
# =========================================================================
def smape_loss(predictions, targets):
    """SMAPE loss for training - optimizes for SMAPE metric"""
    numerator = torch.abs(predictions - targets)
    denominator = (torch.abs(predictions) + torch.abs(targets)) / 2
    return torch.mean(numerator / (denominator + 1e-8))

def weighted_smape_loss(predictions, targets, premium_threshold=np.log1p(85)):
    """
    Weighted SMAPE loss - 3x penalty for premium items (>$85)
    This forces the model to focus more on expensive items
    """
    smape = torch.abs(predictions - targets) / ((torch.abs(predictions) + torch.abs(targets))/2 + 1e-8)

    # 3x weight for premium items, 1.5x for mid-range
    weights = torch.where(targets > premium_threshold, 5.0,
                         torch.where(targets > np.log1p(40), 1.5, 1.0))

    return torch.mean(weights * smape)

def smape_metric(y_true_log, y_pred_log):
    """Calculate SMAPE in original scale for validation"""
    y_true = np.expm1(y_true_log)
    y_pred = np.expm1(y_pred_log)
    return np.mean(np.abs(y_true - y_pred) / ((np.abs(y_true) + np.abs(y_pred)) / 2 + 1e-8)) * 100

# =========================================================================
# TRAINING FUNCTION WITH SMAPE OPTIMIZATION
# =========================================================================
def train_ensemble(X_train, y_train, X_val, y_val,
                   n_models, bin_name, epochs=30, lr=1e-3, use_weighted=True,
                   model_class=None):
    """
    Train an ensemble of models optimized for SMAPE metric

    Args:
        use_weighted: If True, use weighted SMAPE (3x weight on premium items)
                     If False, use standard SMAPE loss
        model_class: Custom model class (if None, uses RegressionMLP)
    """

    print(f"\n{'='*70}")
    print(f"üöÄ Training {bin_name.upper()} Ensemble ({n_models} models)")
    print(f"   Loss: {'Weighted SMAPE' if use_weighted else 'SMAPE'}")
    print(f"{'='*70}")
    print(f"Training samples: {X_train.shape[0]}")
    print(f"Validation samples: {X_val.shape[0]}")

    models = []
    val_smapes = []
    val_maes = []

    # Move data to device
    X_train = X_train.to(device)
    y_train = y_train.to(device)
    X_val = X_val.to(device)
    y_val = y_val.to(device)

    # For validation metrics (original scale)
    val_true_orig = np.expm1(y_val.cpu().numpy().squeeze())

    for seed in range(n_models):
        torch.manual_seed(seed)
        np.random.seed(seed)

        print(f"\nüîÑ Training model {seed+1}/{n_models} (seed={seed})...")

        # Initialize model (use custom class if provided)
        ModelClass = model_class if model_class is not None else RegressionMLP
        model = ModelClass(X_train.shape[1]).to(device)
        optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=1e-5)

        # Choose loss function
        if use_weighted:
            criterion = weighted_smape_loss
        else:
            criterion = smape_loss

        # Data loader
        train_loader = DataLoader(
            TensorDataset(X_train, y_train),
            batch_size=256,
            shuffle=True
        )

        # Training loop
        best_val_smape = float('inf')
        patience_counter = 0
        patience_limit = 5

        for epoch in range(epochs):
            model.train()
            epoch_loss = 0

            for xb, yb in train_loader:
                xb, yb = xb.to(device), yb.to(device)
                optimizer.zero_grad()
                preds = model(xb)
                loss = criterion(preds, yb)
                loss.backward()
                optimizer.step()
                epoch_loss += loss.item()

            # Validation every 10 epochs
            if (epoch + 1) % 10 == 0 or epoch == epochs - 1:
                model.eval()
                with torch.no_grad():
                    val_preds = model(X_val)

                    # Clip predictions to reasonable range
                    # Min: $3 (log1p(3) ‚âà 1.39), Max: $500 (log1p(500) ‚âà 6.21)
                    val_preds_clipped = torch.clamp(val_preds,
                                                    min=np.log1p(3),
                                                    max=np.log1p(3000))

                    # Calculate SMAPE (primary metric)
                    val_smape = smape_metric(y_val.cpu().numpy().squeeze(),
                                           val_preds_clipped.cpu().numpy().squeeze())

                    # Also calculate MAE for reference
                    val_preds_orig = np.expm1(val_preds_clipped.cpu().numpy().squeeze())
                    val_mae = mean_absolute_error(val_true_orig, val_preds_orig)

                avg_loss = epoch_loss / len(train_loader)
                print(f"   Epoch {epoch+1}/{epochs}: Loss = {avg_loss:.4f}, "
                      f"Val SMAPE = {val_smape:.2f}%, Val MAE = ${val_mae:.2f}")

                # Early stopping based on SMAPE (not MAE!)
                if val_smape < best_val_smape:
                    best_val_smape = val_smape
                    patience_counter = 0
                else:
                    patience_counter += 1
                    if patience_counter >= patience_limit:
                        print(f"   ‚è∏Ô∏è  Early stopping triggered at epoch {epoch+1}")
                        break

        models.append(model)
        val_smapes.append(best_val_smape)
        val_maes.append(val_mae)
        print(f"‚úÖ Model {seed+1}/{n_models} complete - Best SMAPE: {best_val_smape:.2f}%")

    # Ensemble validation
    print(f"\nüìä Ensemble Validation for {bin_name}:")
    all_preds = []
    for model in models:
        model.eval()
        with torch.no_grad():
            pred = model(X_val)
            pred_clipped = torch.clamp(pred,
                                      min=np.log1p(3),
                                      max=np.log1p(500))
            all_preds.append(pred_clipped.cpu())

    # Average predictions
    ensemble_pred = torch.stack(all_preds).mean(dim=0)
    ensemble_pred_orig = np.expm1(ensemble_pred.numpy().squeeze())

    # Calculate both metrics
    ensemble_smape = smape_metric(y_val.cpu().numpy().squeeze(),
                                 ensemble_pred.numpy().squeeze())
    ensemble_mae = mean_absolute_error(val_true_orig, ensemble_pred_orig)

    print(f"   Individual models SMAPE range: {min(val_smapes):.2f}% - {max(val_smapes):.2f}%")
    print(f"   Individual models SMAPE mean: {np.mean(val_smapes):.2f}%")
    print(f"   üèÜ Ensemble SMAPE: {ensemble_smape:.2f}%")
    print(f"   üìä Ensemble MAE: ${ensemble_mae:.2f}")

    return models, ensemble_smape, val_smapes  # Return SMAPE instead of MAE



In [14]:

# =========================================================================
# PHASE 1: TRAIN ALL SPECIALISTS
# =========================================================================
print("\n" + "="*70)
print("üèãÔ∏è PHASE 1: TRAINING SPECIALIST ENSEMBLES")
print("="*70)

# Store all results
all_specialists = {}
all_results = {}

# -------------------------------------------------------------------------
# 1. Train AFFORDABLE Specialist (<$50)
# -------------------------------------------------------------------------
if train_affordable_input.shape[0] > 1000:  # Only if enough data
    affordable_models, affordable_smape, affordable_smapes = train_ensemble(
        X_train=train_affordable_input,
        y_train=train_affordable_targets,
        X_val=val_affordable_input,
        y_val=val_affordable_targets,
        n_models=10,  # Plenty of data
        bin_name="Affordable",
        epochs=30,
        lr=1e-3,
        use_weighted=True
    )
    all_specialists['affordable'] = affordable_models
    all_results['affordable'] = affordable_smape
else:
    print("\n‚ö†Ô∏è  Too few affordable samples, skipping...")
    all_specialists['affordable'] = None

# -------------------------------------------------------------------------
# 2. Train MID-RANGE Specialist ($50-$100)
# -------------------------------------------------------------------------
if train_mid_input.shape[0] > 500:  # Only if enough data
    mid_models, mid_mae, mid_smapes  = train_ensemble(
        X_train=train_mid_input,
        y_train=train_mid_targets,
        X_val=val_mid_input,
        y_val=val_mid_targets,
        n_models=7,  # Less data, fewer models
        bin_name="Mid-range",
        epochs=40,  # More epochs, less data
        lr=5e-4,  # Lower learning rate
        use_weighted=False
    )
    all_specialists['mid'] = mid_models
    all_results['mid'] = mid_mae
else:
    print("\n‚ö†Ô∏è  Too few mid-range samples, skipping...")
    all_specialists['mid'] = None

# -------------------------------------------------------------------------
# 3. Train PREMIUM Specialist ($85+)  # Updated threshold!
# -------------------------------------------------------------------------
if train_premium_input.shape[0] > 300:
    # Define regularized model for small dataset
    class RegularizedMLP(nn.Module):
        def __init__(self, input_dim):
            super().__init__()
            self.model = nn.Sequential(
                nn.Linear(input_dim, 1024),
                nn.ReLU(),
                nn.BatchNorm1d(1024),
                nn.Dropout(0.4),

                nn.Linear(1024, 512),
                nn.ReLU(),
                nn.BatchNorm1d(512),
                nn.Dropout(0.4),

                nn.Linear(512, 256),
                nn.ReLU(),
                nn.BatchNorm1d(256),
                nn.Dropout(0.3),

                nn.Linear(256, 1)
            )

        def forward(self, x):
            return self.model(x)

    # Pass the custom model class directly
    premium_models, premium_smape, premium_smapes = train_ensemble(
        X_train=train_premium_input,
        y_train=train_premium_targets,
        X_val=val_premium_input,
        y_val=val_premium_targets,
        n_models=5,
        bin_name="Premium",
        epochs=60,
        lr=2e-4,
        use_weighted=True,
        model_class=RegularizedMLP
    )

# Sort models by SMAPE and keep only top 3
sorted_pairs = sorted(zip(premium_smapes, premium_models))
top_3_models = [m for _, m in sorted_pairs[:3]]
top_3_smapes = [s for s, _ in sorted_pairs[:3]]

print(f"\nüîç Premium Model Selection:")
print(f"   Best 3 models SMAPE: {top_3_smapes[0]:.2f}%, {top_3_smapes[1]:.2f}%, {top_3_smapes[2]:.2f}%")
print(f"   Discarded 2 models SMAPE: {sorted_pairs[3][0]:.2f}%, {sorted_pairs[4][0]:.2f}%")

# Save only top 3
all_specialists['premium'] = top_3_models
all_results['premium'] = premium_smape  # Ensemble SMAPE for reference

# =========================================================================
# SUMMARY
# =========================================================================
print("\n" + "="*70)
print("üìä TRAINING SUMMARY")
print("="*70)

total_models = sum(len(models) for models in all_specialists.values() if models is not None)
print(f"\n‚úÖ Total models trained: {total_models}")

print(f"\nüèÜ Validation Results by Bin:")
for bin_name, mae in all_results.items():
    print(f"   {bin_name.capitalize():12} MAE: ${mae:.2f}")



üèãÔ∏è PHASE 1: TRAINING SPECIALIST ENSEMBLES

üöÄ Training AFFORDABLE Ensemble (10 models)
   Loss: Weighted SMAPE
Training samples: 50903
Validation samples: 12743

üîÑ Training model 1/10 (seed=0)...
   Epoch 10/30: Loss = 0.2100, Val SMAPE = 50.99%, Val MAE = $17.82
   Epoch 20/30: Loss = 0.1837, Val SMAPE = 50.46%, Val MAE = $26.07
   Epoch 30/30: Loss = 0.1688, Val SMAPE = 49.94%, Val MAE = $6.97
‚úÖ Model 1/10 complete - Best SMAPE: 49.94%

üîÑ Training model 2/10 (seed=1)...
   Epoch 10/30: Loss = 0.2106, Val SMAPE = 50.74%, Val MAE = $6.40
   Epoch 20/30: Loss = 0.1883, Val SMAPE = 51.09%, Val MAE = $19.57
   Epoch 30/30: Loss = 0.1712, Val SMAPE = 50.34%, Val MAE = $6.81
‚úÖ Model 2/10 complete - Best SMAPE: 50.34%

üîÑ Training model 3/10 (seed=2)...
   Epoch 10/30: Loss = 0.2102, Val SMAPE = 52.01%, Val MAE = $26.32
   Epoch 20/30: Loss = 0.1858, Val SMAPE = 51.02%, Val MAE = $11.04
   Epoch 30/30: Loss = 0.1692, Val SMAPE = 49.20%, Val MAE = $6.25
‚úÖ Model 3/10 comp

In [15]:
# =========================================================================
# SAVE MODELS (No selection needed - already done!)
# =========================================================================
print("\n" + "="*70)
print("üíæ SAVING SPECIALIST MODELS")
print("="*70)

save_dir = '/content/drive/MyDrive/amazon_ml_challenge/price_specialists'
os.makedirs(save_dir, exist_ok=True)

for bin_name, models in all_specialists.items():
    if models is not None:
        bin_dir = os.path.join(save_dir, bin_name)
        os.makedirs(bin_dir, exist_ok=True)

        for i, model in enumerate(models):
            model_path = os.path.join(bin_dir, f'model_{i}.pt')
            torch.save(model.state_dict(), model_path)

        print(f"‚úÖ Saved {len(models)} models for {bin_name} ‚Üí {bin_dir}")

# Save metadata
metadata = {
    'bin_boundaries': BIN_BOUNDARIES,
    'results': all_results,
    'input_dim': train_input.shape[1],
    'n_models': {k: len(v) if v else 0 for k, v in all_specialists.items()},
    'train_distribution': {
        'affordable': train_affordable_mask.sum(),
        'mid': train_mid_mask.sum(),
        'premium': train_premium_mask.sum()
    },
    'metric': 'SMAPE'
}

metadata_path = os.path.join(save_dir, 'metadata.pt')
torch.save(metadata, metadata_path)
print(f"‚úÖ Saved metadata ‚Üí {metadata_path}")

print("\n" + "="*70)
print("‚úÖ ALL MODELS SAVED!")
print("="*70)
print(f"   Affordable: 10 models")
print(f"   Mid-range: 7 models")
print(f"   Premium: 3 models (top performers only)")


üíæ SAVING SPECIALIST MODELS
‚úÖ Saved 10 models for affordable ‚Üí /content/drive/MyDrive/amazon_ml_challenge/price_specialists/affordable
‚úÖ Saved 7 models for mid ‚Üí /content/drive/MyDrive/amazon_ml_challenge/price_specialists/mid
‚úÖ Saved 3 models for premium ‚Üí /content/drive/MyDrive/amazon_ml_challenge/price_specialists/premium
‚úÖ Saved metadata ‚Üí /content/drive/MyDrive/amazon_ml_challenge/price_specialists/metadata.pt

‚úÖ ALL MODELS SAVED!
   Affordable: 10 models
   Mid-range: 7 models
   Premium: 3 models (top performers only)


In [16]:
# =========================================================================
# PRICE-BASED SPECIALIST MODELS - PHASE 2 & 3 (SMAPE VERSION)
# Phase 2: Validation (compare specialists vs baseline)
# Phase 3: Test Set Predictions
# =========================================================================
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import mean_absolute_error
from tqdm import tqdm
import os

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

# =========================================================================
# SMAPE METRIC
# =========================================================================
def smape_metric(y_true, y_pred):
    """Calculate SMAPE in original scale"""
    return np.mean(np.abs(y_true - y_pred) / ((np.abs(y_true) + np.abs(y_pred)) / 2 + 1e-8)) * 100

# =========================================================================
# MODEL DEFINITION (Same as training)
# =========================================================================
class RegressionMLP(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 1024),
            nn.ReLU(),
            nn.BatchNorm1d(1024),
            nn.Dropout(0.3),

            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.BatchNorm1d(512),
            nn.Dropout(0.3),

            nn.Linear(512, 256),
            nn.ReLU(),
            nn.BatchNorm1d(256),
            nn.Dropout(0.2),

            nn.Linear(256, 1)
        )

    def forward(self, x):
        return self.model(x)

# =========================================================================
# LOAD SPECIALIST MODELS
# =========================================================================
print("\n" + "="*70)
print("üìÇ LOADING SPECIALIST MODELS")
print("="*70)

save_dir = '/content/drive/MyDrive/amazon_ml_challenge/price_specialists'
metadata_path = os.path.join(save_dir, 'metadata.pt')

# Load metadata
metadata = torch.load(metadata_path, weights_only=False)
input_dim = metadata['input_dim']
bin_boundaries = metadata['bin_boundaries']

print(f"‚úÖ Metadata loaded")
print(f"   Input dimension: {input_dim}")
print(f"   Bins: {list(bin_boundaries.keys())}")

# Load all specialist models
specialists = {}

for bin_name in ['affordable', 'mid', 'premium']:
    bin_dir = os.path.join(save_dir, bin_name)

    if os.path.exists(bin_dir):
        models = []
        model_files = sorted([f for f in os.listdir(bin_dir) if f.endswith('.pt')])

        for model_file in model_files:
            model = RegressionMLP(input_dim).to(device)
            model.load_state_dict(torch.load(
                os.path.join(bin_dir, model_file),
                map_location=device
            ))
            model.eval()
            models.append(model)

        specialists[bin_name] = models
        print(f"‚úÖ Loaded {len(models)} models for {bin_name}")
    else:
        specialists[bin_name] = None
        print(f"‚ö†Ô∏è  No models found for {bin_name}")

# =========================================================================
# LOAD BASELINE ENSEMBLE (for routing)
# =========================================================================
print("\n" + "="*70)
print("üìÇ LOADING BASELINE ENSEMBLE (for routing)")
print("="*70)

baseline_dir = '/content/drive/MyDrive/amazon_ml_challenge/ensemble_CLIP_models'
baseline_models = []

for i in range(10):
    model_path = os.path.join(baseline_dir, f'clip_ensemble_model_{i}.pt')
    if os.path.exists(model_path):
        model = RegressionMLP(input_dim).to(device)
        model.load_state_dict(torch.load(model_path, map_location=device))
        model.eval()
        baseline_models.append(model)

print(f"‚úÖ Loaded {len(baseline_models)} baseline models")

# =========================================================================
# HELPER FUNCTIONS (UPDATED WITH NEW CLIPPING)
# =========================================================================
def ensemble_predict(models, x, use_clipping=True, top_k=None):
    """Average predictions with optional selective ensemble"""
    predictions = []

    min_log_value = np.log1p(3)
    max_log_value = np.log1p(3000)

    if x.ndim == 1:
        x = x.unsqueeze(0)

    # Use only top K models if specified
    models_to_use = models[:top_k] if top_k is not None else models

    for model in models_to_use:
        model.eval()
        with torch.no_grad():
            pred = model(x)
            if use_clipping:
                pred = torch.clamp(pred, min=min_log_value, max=max_log_value)
            predictions.append(pred)

    return torch.stack(predictions).mean(dim=0)

def get_price_bin(price):
    """Map price to bin index - UPDATED THRESHOLDS"""
    if price < 40:  # Changed from 50
        return 0  # affordable
    elif price < 85:  # Changed from 100
        return 1  # mid
    else:
        return 2  # premium

def predict_with_routing(x, baseline_models, specialists):
    """
    Predict using routing strategy:
    1. Use baseline to predict rough price
    2. Route to appropriate specialist
    3. Get final prediction from specialist
    """
    # Step 1: Get rough price from baseline (for routing)
    baseline_pred_log = ensemble_predict(baseline_models, x)
    baseline_pred_orig = np.expm1(baseline_pred_log.cpu().numpy().squeeze())

    # Handle single sample vs batch
    if baseline_pred_orig.ndim == 0:
        baseline_pred_orig = np.array([baseline_pred_orig])

    # Step 2: Route to bins
    bin_indices = np.array([get_price_bin(p) for p in baseline_pred_orig])

    # Step 3: Predict with specialists
    final_preds = []
    bin_names = ['affordable', 'mid', 'premium']

    for i, (sample, bin_idx) in enumerate(zip(x, bin_indices)):
        bin_name = bin_names[bin_idx]

        # Use specialist if available, else use baseline
        if specialists[bin_name] is not None:
          # Use only top 3 models for premium, all for others
          k = 3 if bin_name == 'premium' else None
          pred = ensemble_predict(specialists[bin_name], sample.unsqueeze(0), top_k=k)

        else:
            pred = baseline_pred_log[i].unsqueeze(0)

        final_preds.append(pred)

    return torch.cat(final_preds)

# =========================================================================
# PHASE 2: VALIDATION
# =========================================================================
print("\n" + "="*70)
print("üß™ PHASE 2: VALIDATION - SPECIALISTS vs BASELINE")
print("="*70)

val_input = val_input.to(device)
val_targets = val_targets.to(device)
val_prices_orig = np.expm1(val_targets.cpu().numpy().squeeze())

# Split validation by true bins (UPDATED THRESHOLDS)
val_affordable_mask = val_prices_orig < 40  # Changed from 50
val_mid_mask = (val_prices_orig >= 40) & (val_prices_orig < 85)  # Changed from 50-100
val_premium_mask = val_prices_orig >= 85  # Changed from 100

print(f"\nüìä Validation Data:")
print(f"  Total samples: {len(val_prices_orig)}")
print(f"  Affordable (<$40): {val_affordable_mask.sum()} ({val_affordable_mask.sum()/len(val_prices_orig)*100:.1f}%)")
print(f"  Mid-range ($40-$85): {val_mid_mask.sum()} ({val_mid_mask.sum()/len(val_prices_orig)*100:.1f}%)")
print(f"  Premium (>$85): {val_premium_mask.sum()} ({val_premium_mask.sum()/len(val_prices_orig)*100:.1f}%)")

# -------------------------------------------------------------------------
# Method 1: BASELINE (current approach)
# -------------------------------------------------------------------------
print("\n" + "-"*70)
print("üìä Method 1: BASELINE ENSEMBLE")
print("-"*70)

baseline_preds = []
batch_size = 512

val_loader = DataLoader(
    TensorDataset(val_input),
    batch_size=batch_size,
    shuffle=False
)

for (batch_x,) in tqdm(val_loader, desc="Baseline predictions"):
    if batch_x.ndim == 1:
        batch_x = batch_x.unsqueeze(0)
    pred = ensemble_predict(baseline_models, batch_x.to(device))
    baseline_preds.append(pred.cpu())

baseline_preds = torch.cat(baseline_preds)
baseline_preds_orig = np.expm1(baseline_preds.numpy().squeeze())

# Calculate both metrics
baseline_mae = mean_absolute_error(val_prices_orig, baseline_preds_orig)
baseline_smape = smape_metric(val_prices_orig, baseline_preds_orig)

print(f"\nüéØ Baseline Overall SMAPE: {baseline_smape:.2f}%")
print(f"üìä Baseline Overall MAE: ${baseline_mae:.2f}")

# Per-bin analysis for baseline
print(f"\nüìä Baseline Performance by Price Range:")
for bin_name, mask in [('Affordable', val_affordable_mask),
                        ('Mid-range', val_mid_mask),
                        ('Premium', val_premium_mask)]:
    if mask.sum() > 0:
        bin_smape = smape_metric(val_prices_orig[mask], baseline_preds_orig[mask])
        bin_mae = mean_absolute_error(val_prices_orig[mask], baseline_preds_orig[mask])
        print(f"  {bin_name:12}: {bin_smape:6.2f}% SMAPE, ${bin_mae:7.2f} MAE ({mask.sum()} samples)")

# -------------------------------------------------------------------------
# Method 2: SPECIALIST ENSEMBLES with Routing
# -------------------------------------------------------------------------
print("\n" + "-"*70)
print("üìä Method 2: SPECIALIST ENSEMBLES with Routing")
print("-"*70)

specialist_preds = []

for (batch_x,) in tqdm(val_loader, desc="Specialist predictions"):
    if batch_x.ndim == 1:
        batch_x = batch_x.unsqueeze(0)
    pred = predict_with_routing(batch_x.to(device), baseline_models, specialists)
    specialist_preds.append(pred.cpu())

specialist_preds = torch.cat(specialist_preds)
specialist_preds_orig = np.expm1(specialist_preds.numpy().squeeze())

# Calculate both metrics
specialist_mae = mean_absolute_error(val_prices_orig, specialist_preds_orig)
specialist_smape = smape_metric(val_prices_orig, specialist_preds_orig)

print(f"\nüéØ Specialist Overall SMAPE: {specialist_smape:.2f}%")
print(f"üìä Specialist Overall MAE: ${specialist_mae:.2f}")

# Per-bin analysis for specialists
print(f"\nüìä Specialist Performance by Price Range:")
for bin_name, mask in [('Affordable', val_affordable_mask),
                        ('Mid-range', val_mid_mask),
                        ('Premium', val_premium_mask)]:
    if mask.sum() > 0:
        bin_smape = smape_metric(val_prices_orig[mask], specialist_preds_orig[mask])
        bin_mae = mean_absolute_error(val_prices_orig[mask], specialist_preds_orig[mask])
        print(f"  {bin_name:12}: {bin_smape:6.2f}% SMAPE, ${bin_mae:7.2f} MAE ({mask.sum()} samples)")

# -------------------------------------------------------------------------
# COMPARISON (SMAPE IS PRIMARY METRIC)
# -------------------------------------------------------------------------
print("\n" + "="*70)
print("üèÜ VALIDATION RESULTS COMPARISON")
print("="*70)

improvement_smape = baseline_smape - specialist_smape
improvement_pct = (improvement_smape / baseline_smape) * 100

print(f"\n{'Method':<30} {'SMAPE':>12} {'MAE':>12}")
print("-"*70)
print(f"{'Baseline Ensemble':<30} {baseline_smape:>11.2f}% ${baseline_mae:>10.2f}")
print(f"{'Specialist Ensembles':<30} {specialist_smape:>11.2f}% ${specialist_mae:>10.2f}")
print("-"*70)
print(f"{'SMAPE Improvement':<30} {improvement_smape:>11.2f}% ({improvement_pct:+.2f}%)")

if specialist_smape < baseline_smape:
    print(f"\nüéâ SUCCESS! Specialists improve SMAPE performance!")
    print(f"‚úÖ Use specialists for test predictions")
    use_specialists = True
else:
    print(f"\n‚ö†Ô∏è  Specialists don't improve SMAPE over baseline")
    print(f"üí° Consider: Use baseline for test, or retrain with more premium focus")
    use_specialists = False

# Routing accuracy
baseline_bins = np.array([get_price_bin(p) for p in baseline_preds_orig])
true_bins = np.array([get_price_bin(p) for p in val_prices_orig])
routing_accuracy = (baseline_bins == true_bins).mean()

print(f"\nüéØ Routing Accuracy: {routing_accuracy*100:.1f}%")
print(f"   (How often baseline correctly predicts price range)")

# =========================================================================
# PREMIUM PREDICTION ANALYSIS (MOST IMPORTANT FOR SMAPE)
# =========================================================================
print("\n" + "="*70)
print("üìà PREMIUM PREDICTION ANALYSIS (Critical for SMAPE)")
print("="*70)

premium_mask = val_premium_mask

if premium_mask.sum() > 0:
    premium_preds_spec = specialist_preds_orig[premium_mask]
    premium_preds_base = baseline_preds_orig[premium_mask]
    premium_actual = val_prices_orig[premium_mask]

    print(f"\nüìä Premium Item Analysis ({premium_mask.sum()} samples, >${85}):")
    print(f"  Actual mean:              ${premium_actual.mean():.2f}")
    print(f"  Specialist predictions:   ${premium_preds_spec.mean():.2f}")
    print(f"  Baseline predictions:     ${premium_preds_base.mean():.2f}")

    premium_smape_spec = smape_metric(premium_actual, premium_preds_spec)
    premium_smape_base = smape_metric(premium_actual, premium_preds_base)
    premium_mae_spec = mean_absolute_error(premium_actual, premium_preds_spec)
    premium_mae_base = mean_absolute_error(premium_actual, premium_preds_base)

    print(f"\n  Specialist Premium SMAPE: {premium_smape_spec:.2f}%")
    print(f"  Baseline Premium SMAPE:   {premium_smape_base:.2f}%")
    print(f"  Specialist Premium MAE:   ${premium_mae_spec:.2f}")
    print(f"  Baseline Premium MAE:     ${premium_mae_base:.2f}")

    # Diagnosis
    if premium_preds_spec.mean() < premium_actual.mean() * 0.9:
        print("\n‚ö†Ô∏è  CRITICAL: Specialists under-predicting premium items by >10%")
        print("üí° This hurts SMAPE badly! Consider:")
        print("   - Increase premium weight in loss (try 5x instead of 3x)")
        print("   - Add more premium-specific features")
        print("   - Lower premium threshold to get more training data")
    elif premium_preds_spec.mean() > premium_actual.mean() * 1.1:
        print("\n‚ö†Ô∏è  Specialists over-predicting premium items by >10%")
    else:
        print("\n‚úÖ Specialist predictions are reasonably calibrated")

else:
    print("\n‚ö†Ô∏è  No premium samples found in validation set")

print("\n" + "="*70)
print("‚úÖ VALIDATION COMPLETE")
print("="*70)

Using device: cuda

üìÇ LOADING SPECIALIST MODELS
‚úÖ Metadata loaded
   Input dimension: 1046
   Bins: ['affordable', 'mid', 'premium']
‚úÖ Loaded 10 models for affordable
‚úÖ Loaded 7 models for mid
‚úÖ Loaded 5 models for premium

üìÇ LOADING BASELINE ENSEMBLE (for routing)
‚úÖ Loaded 10 baseline models

üß™ PHASE 2: VALIDATION - SPECIALISTS vs BASELINE

üìä Validation Data:
  Total samples: 15000
  Affordable (<$40): 12743 (85.0%)
  Mid-range ($40-$85): 1671 (11.1%)
  Premium (>$85): 586 (3.9%)

----------------------------------------------------------------------
üìä Method 1: BASELINE ENSEMBLE
----------------------------------------------------------------------


Baseline predictions: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [00:00<00:00, 95.49it/s]



üéØ Baseline Overall SMAPE: 53.37%
üìä Baseline Overall MAE: $12.14

üìä Baseline Performance by Price Range:
  Affordable  :  49.46% SMAPE, $   6.53 MAE (12743 samples)
  Mid-range   :  70.29% SMAPE, $  27.56 MAE (1671 samples)
  Premium     :  90.23% SMAPE, $  90.31 MAE (586 samples)

----------------------------------------------------------------------
üìä Method 2: SPECIALIST ENSEMBLES with Routing
----------------------------------------------------------------------


Specialist predictions: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [01:20<00:00,  2.67s/it]


üéØ Specialist Overall SMAPE: 54.65%
üìä Specialist Overall MAE: $13.05

üìä Specialist Performance by Price Range:
  Affordable  :  48.21% SMAPE, $   6.32 MAE (12743 samples)
  Mid-range   :  84.80% SMAPE, $  32.08 MAE (1671 samples)
  Premium     : 108.68% SMAPE, $ 105.15 MAE (586 samples)

üèÜ VALIDATION RESULTS COMPARISON

Method                                SMAPE          MAE
----------------------------------------------------------------------
Baseline Ensemble                    53.37% $     12.14
Specialist Ensembles                 54.65% $     13.05
----------------------------------------------------------------------
SMAPE Improvement                    -1.28% (-2.39%)

‚ö†Ô∏è  Specialists don't improve SMAPE over baseline
üí° Consider: Use baseline for test, or retrain with more premium focus

üéØ Routing Accuracy: 87.4%
   (How often baseline correctly predicts price range)

üìà PREMIUM PREDICTION ANALYSIS (Critical for SMAPE)

üìä Premium Item Analysis (586 s




In [17]:

# =========================================================================
# PHASE 3: TEST SET PREDICTIONS
# =========================================================================
print("\n" + "="*70)
print("üöÄ PHASE 3: TEST SET PREDICTIONS")
print("="*70)

# Load test data
test_data_path = '/content/drive/MyDrive/amazon_ml_challenge/combined_CLIP_final/clip_test_with_alignment.pt'
test_data = torch.load(test_data_path)

test_input = test_data['test_input'].to(device)
test_ids = test_data['test_ids']

print(f"‚úÖ Test data loaded: {test_input.shape}")
print(f"   Number of test samples: {len(test_ids)}")

# Choose which method to use
if use_specialists:
    print(f"\n‚ú® Using SPECIALIST ensembles for predictions")
    prediction_method = "specialists"
else:
    print(f"\n‚ú® Using BASELINE ensemble for predictions")
    prediction_method = "baseline"

# Make predictions
test_loader = DataLoader(
    TensorDataset(test_input),
    batch_size=batch_size,
    shuffle=False
)

test_preds = []

if prediction_method == "specialists":
    for (batch_x,) in tqdm(test_loader, desc="Test predictions (specialists)"):
        pred = predict_with_routing(batch_x.to(device), baseline_models, specialists)
        test_preds.append(pred.cpu())
else:
    for (batch_x,) in tqdm(test_loader, desc="Test predictions (baseline)"):
        pred = ensemble_predict(baseline_models, batch_x.to(device))
        test_preds.append(pred.cpu())

test_preds = torch.cat(test_preds)
test_preds_orig = np.expm1(test_preds.numpy().squeeze())

# Post-processing
print(f"\nüîß Post-processing predictions...")
print(f"   Before clipping - Min: ${test_preds_orig.min():.2f}, Max: ${test_preds_orig.max():.2f}")

# Clip extreme predictions
test_preds_orig = np.clip(test_preds_orig, 0, 4000)

print(f"   After clipping - Min: ${test_preds_orig.min():.2f}, Max: ${test_preds_orig.max():.2f}")
print(f"   Mean: ${test_preds_orig.mean():.2f}, Median: ${np.median(test_preds_orig):.2f}")

# =========================================================================
# CREATE SUBMISSION FILE
# =========================================================================
print("\n" + "="*70)
print("üíæ CREATING SUBMISSION FILE")
print("="*70)

submission_df = pd.DataFrame({
    'sample_id': test_ids,
    'price': test_preds_orig
})

# Save submission
submission_df.to_csv("submission_specialists.csv", index=False)


from google.colab import files
# Download the CSV file
files.download("submission_using_specialists.csv")

print(f"\nüìä Submission Statistics:")
print(f"   Total predictions: {len(submission_df)}")
print(f"   Price range: ${test_preds_orig.min():.2f} - ${test_preds_orig.max():.2f}")
print(f"   Mean price: ${test_preds_orig.mean():.2f}")
print(f"   Median price: ${np.median(test_preds_orig):.2f}")

# Distribution analysis
bins_counts = {
    'Affordable (<$50)': (test_preds_orig < 50).sum(),
    'Mid-range ($50-$100)': ((test_preds_orig >= 50) & (test_preds_orig < 100)).sum(),
    'Premium ($100+)': (test_preds_orig >= 100).sum()
}

print(f"\nüìà Test Predictions Distribution:")
for bin_name, count in bins_counts.items():
    pct = (count / len(test_preds_orig)) * 100
    print(f"   {bin_name:25} {count:6d} samples ({pct:5.1f}%)")

print(f"\nüéØ First 10 predictions:")
print(submission_df.head(10))

print("\n" + "="*70)
print("‚úÖ ALL PHASES COMPLETE!")
print("="*70)
print(f"\nüìù Summary:")
print(f"   Baseline MAE: ${baseline_mae:.2f}")
print(f"   Specialist MAE: ${specialist_mae:.2f}")
# print(f"   Improvement: ${improvement:.2f} ({improvement_pct:+.2f}%)")
# print(f"   Submission file: {submission_path}")
print(f"\nüöÄ Ready to submit!")


üöÄ PHASE 3: TEST SET PREDICTIONS
‚úÖ Test data loaded: torch.Size([75000, 1046])
   Number of test samples: 75000

‚ú® Using BASELINE ensemble for predictions


Test predictions (baseline): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 147/147 [00:01<00:00, 122.31it/s]



üîß Post-processing predictions...
   Before clipping - Min: $3.00, Max: $518.81
   After clipping - Min: $3.00, Max: $518.81
   Mean: $19.96, Median: $15.76

üíæ CREATING SUBMISSION FILE


FileNotFoundError: Cannot find file: submission_using_specialists.csv