# AMATO Production - Campaign Optimization Batch Inference Pipeline

This notebook performs batch inference for campaign success prediction and budget optimization.

**Author:** Data Science Team  
**Date:** 2024

In [4]:
# Import required libraries
import pandas as pd
import numpy as np
import yaml
import logging
import os
import sys
import joblib
from pathlib import Path
from datetime import datetime
from sklearn.preprocessing import StandardScaler
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Add project root to path for imports
# Try multiple possible paths for Jupyter notebook compatibility
possible_paths = [
    Path.cwd(),  # Current working directory
    Path.cwd().parent,  # Parent of current directory
    Path.cwd().parent.parent,  # Grandparent of current directory
]
if '__file__' in globals():
    # __file__ is only defined when this code runs as a script, not in a kernel
    possible_paths.append(Path(__file__).parent.parent.parent)

# The first candidate containing a `utils` folder wins.
# Fallback: use current directory and hope for the best
project_root = next(
    (path for path in possible_paths if (path / 'utils').exists()),
    Path.cwd(),
)

# Re-running this cell must not keep growing sys.path with duplicate entries
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))
print(f"🔧 Using project root: {project_root}")

try:
    from utils.s3_utils import get_s3_manager
    print("✅ Successfully imported utils.s3_utils")
except ImportError as e:
    print(f"❌ Failed to import utils.s3_utils: {e}")
    print("🔧 Trying alternative import...")
    try:
        # Retry with the current directory on the import path
        if '.' not in sys.path:
            sys.path.append('.')
        from utils.s3_utils import get_s3_manager
        print("✅ Successfully imported with relative path")
    except ImportError as e2:
        print(f"❌ Alternative import also failed: {e2}")
        raise

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

🔧 Using project root: /Users/priyankmavani/Desktop/apps/amato
✅ Successfully imported utils.s3_utils


## Campaign Optimization Batch Inference Class

In [5]:
class CampaignOptimizationBatchInference:
    def __init__(self):
        """Initialize the Campaign Optimization Batch Inference Pipeline"""
        self.models = {}
        self.scalers = {}
        self.metadata = {}
        
    def load_trained_models(self):
        """Load trained campaign optimization models from S3"""
        logger.info("📥 Loading trained campaign optimization models...")
        
        try:
            s3_manager = get_s3_manager()
            
            # Create models directory if it doesn't exist
            models_dir = 'models/campaign_optimization'
            os.makedirs(models_dir, exist_ok=True)
            
            # Load Campaign Success model
            success_model_path = f'{models_dir}/campaign_success_model.pkl'
            success_scaler_path = f'{models_dir}/campaign_success_scaler.pkl'
            
            # Download Campaign Success model from S3 if not exists locally
            if not os.path.exists(success_model_path):
                logger.info("📥 Downloading Campaign Success model from S3...")
                try:
                    # Use the correct S3 path structure
                    s3_manager.download_file('amato_pm/models/campaign_optimization/campaign_success_model.pkl', success_model_path)
                    logger.info("✅ Downloaded Campaign Success model from S3")
                except Exception as e:
                    logger.warning(f"⚠️  Failed to download Campaign Success model from S3: {e}")
            
            # Download Campaign Success scaler from S3 if not exists locally
            if not os.path.exists(success_scaler_path):
                logger.info("📥 Downloading Campaign Success scaler from S3...")
                try:
                    s3_manager.download_file('amato_pm/models/campaign_optimization/campaign_success_scaler.pkl', success_scaler_path)
                    logger.info("✅ Downloaded Campaign Success scaler from S3")
                except Exception as e:
                    logger.warning(f"⚠️  Failed to download Campaign Success scaler from S3: {e}")
            
            # Load Campaign Success model if available
            if os.path.exists(success_model_path):
                self.models['campaign_success'] = joblib.load(success_model_path)
                if os.path.exists(success_scaler_path):
                    self.scalers['campaign_success'] = joblib.load(success_scaler_path)
                else:
                    self.scalers['campaign_success'] = None
                
                # Create metadata with correct feature columns
                self.metadata['campaign_success'] = {
                    'feature_columns': [
                        'recency_days', 'frequency', 'monetary_value',
                        'avg_order_value', 'customer_lifetime_value',
                        'campaign_count', 'avg_roas', 'avg_ctr', 'total_campaign_revenue',
                        'campaign_response_rate', 'avg_ctr_lift', 'rfm_score',
                        'total_sessions', 'total_events',
                        'conversion_probability', 'churn_risk', 'upsell_potential'
                    ],
                    'model_type': 'RandomForestClassifier',
                    'training_date': datetime.now().isoformat()
                }
                logger.info("✅ Loaded Campaign Success model")
            else:
                logger.warning("⚠️  Campaign Success model not available")
            
            # Load Budget Optimization model
            budget_model_path = f'{models_dir}/budget_optimization_model.pkl'
            budget_scaler_path = f'{models_dir}/budget_optimization_scaler.pkl'
            
            # Download Budget Optimization model from S3 if not exists locally
            if not os.path.exists(budget_model_path):
                logger.info("📥 Downloading Budget Optimization model from S3...")
                try:
                    s3_manager.download_file('amato_pm/models/campaign_optimization/budget_optimization_model.pkl', budget_model_path)
                    logger.info("✅ Downloaded Budget Optimization model from S3")
                except Exception as e:
                    logger.warning(f"⚠️  Failed to download Budget Optimization model from S3: {e}")
            
            # Download Budget Optimization scaler from S3 if not exists locally
            if not os.path.exists(budget_scaler_path):
                logger.info("📥 Downloading Budget Optimization scaler from S3...")
                try:
                    s3_manager.download_file('amato_pm/models/campaign_optimization/budget_optimization_scaler.pkl', budget_scaler_path)
                    logger.info("✅ Downloaded Budget Optimization scaler from S3")
                except Exception as e:
                    logger.warning(f"⚠️  Failed to download Budget Optimization scaler from S3: {e}")
            
            # Load Budget Optimization model if available
            if os.path.exists(budget_model_path):
                self.models['budget_optimization'] = joblib.load(budget_model_path)
                if os.path.exists(budget_scaler_path):
                    self.scalers['budget_optimization'] = joblib.load(budget_scaler_path)
                else:
                    self.scalers['budget_optimization'] = None
                
                # Create metadata with correct feature columns
                self.metadata['budget_optimization'] = {
                    'feature_columns': [
                        'recency_days', 'frequency', 'monetary_value',
                        'avg_order_value', 'customer_lifetime_value',
                        'campaign_count', 'avg_roas', 'total_campaign_revenue',
                        'rfm_score', 'conversion_probability'
                    ],
                    'model_type': 'RandomForestRegressor',
                    'training_date': datetime.now().isoformat()
                }
                logger.info("✅ Loaded Budget Optimization model")
            else:
                logger.warning("⚠️  Budget Optimization model not available")
            
            logger.info(f"✅ Loaded {len(self.models)} models")
            
            if len(self.models) == 0:
                logger.error("❌ No models loaded. Please ensure models are available in S3.")
                raise Exception("No models available for inference")
            
        except Exception as e:
            logger.error(f"❌ Failed to load models: {e}")
            raise
    
    def load_inference_data(self, data_path=None):
        """Load recent inference data for campaign optimization"""
        logger.info("📊 Loading recent inference data...")
        
        try:
            # Load recent inference data from S3
            logger.info("🔍 Loading recent inference data from S3...")
            s3_manager = get_s3_manager()
            s3_manager.load_inference_data_from_s3()
            logger.info("✅ Recent inference data loaded from S3")
            
            if data_path is None:
                data_path = 'data_pipelines/unified_dataset/output/recent_customer_dataset.parquet'
            
            if os.path.exists(data_path):
                df = pd.read_parquet(data_path)
                logger.info(f"✅ Loaded recent inference data: {len(df)} customers")
                return df
            else:
                logger.error(f"❌ Recent inference data not found at {data_path}")
                return None
                
        except Exception as e:
            logger.error(f"❌ Failed to load recent inference data: {e}")
            return None
    
    def prepare_features(self, df, model_name):
        """Prepare features for inference"""
        logger.info(f"🔧 Preparing features for {model_name} inference...")
        
        if model_name not in self.metadata:
            logger.error(f"❌ No metadata found for {model_name}")
            return None
        
        # Get feature columns from metadata
        feature_columns = self.metadata[model_name]['feature_columns']
        
        # Filter available features
        available_features = [col for col in feature_columns if col in df.columns]
        
        if len(available_features) < 5:
            logger.warning(f"⚠️  Only {len(available_features)} features available for {model_name}")
            
        # Create feature matrix
        X = df[available_features].copy()
        
        # Handle missing values
        X = X.fillna(X.median())
        
        # Remove outliers using IQR method
        for col in X.columns:
            Q1 = X[col].quantile(0.25)
            Q3 = X[col].quantile(0.75)
            IQR = Q3 - Q1
            lower_bound = Q1 - 1.5 * IQR
            upper_bound = Q3 + 1.5 * IQR
            X[col] = X[col].clip(lower_bound, upper_bound)
        
        logger.info(f"✅ Prepared {len(X)} customers for {model_name}")
        return X
    
    def perform_campaign_success_inference(self, df_features):
        """Perform campaign success inference"""
        logger.info("🎯 Performing campaign success inference...")
        
        if 'campaign_success' not in self.models:
            logger.error("❌ Campaign Success model not loaded")
            return None
        
        # Scale features if scaler exists
        if self.scalers['campaign_success'] is not None:
            X_scaled = self.scalers['campaign_success'].transform(df_features)
        else:
            X_scaled = df_features
        
        # Predict success probability
        success_probs = self.models['campaign_success'].predict_proba(X_scaled)[:, 1]
        
        # Create results dataframe
        results = df_features.copy()
        results['success_probability'] = success_probs
        results['success_prediction'] = (success_probs > 0.5).astype(int)
        results['success_category'] = pd.cut(success_probs, 
                                           bins=[0, 0.3, 0.7, 1.0], 
                                           labels=['Low', 'Medium', 'High'])
        
        logger.info(f"✅ Campaign success inference completed: {len(results)} predictions")
        return results
    
    def perform_budget_optimization_inference(self, df_features):
        """Perform budget optimization inference"""
        logger.info("🎯 Performing budget optimization inference...")
        
        if 'budget_optimization' not in self.models:
            logger.error("❌ Budget Optimization model not loaded")
            return None
        
        # Scale features if scaler exists
        if self.scalers['budget_optimization'] is not None:
            X_scaled = self.scalers['budget_optimization'].transform(df_features)
        else:
            X_scaled = df_features
        
        # Predict optimal budget
        optimal_budgets = self.models['budget_optimization'].predict(X_scaled)
        
        # Create results dataframe
        results = df_features.copy()
        results['optimal_budget'] = optimal_budgets
        results['budget_category'] = pd.cut(optimal_budgets, 
                                          bins=5, 
                                          labels=['Very Low', 'Low', 'Medium', 'High', 'Very High'])
        
        logger.info(f"✅ Budget optimization inference completed: {len(results)} predictions")
        return results
    
    def save_inference_results(self, results, model_name):
        """Save inference results directly to S3"""
        logger.info(f"💾 Saving {model_name} inference results...")
        
        try:
            s3_manager = get_s3_manager()
            
            # Save results directly to S3
            timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
            results_key = f'models/campaign_optimization/inference_results/{model_name}_inference_results_{timestamp}.parquet'
            
            # Convert to parquet bytes and upload
            results_bytes = results.to_parquet(index=False)
            results_success = s3_manager.upload_bytes_direct(
                results_bytes, results_key, 'application/octet-stream'
            )
            
            # Generate and save report
            report = self.generate_inference_report(results, model_name)
            report_key = f'models/campaign_optimization/inference_results/{model_name}_inference_report_{timestamp}.yaml'
            
            report_success = s3_manager.upload_bytes_direct(
                yaml.dump(report, default_flow_style=False).encode('utf-8'),
                report_key, 'text/yaml'
            )
            
            if results_success and report_success:
                logger.info(f"✅ {model_name} results uploaded directly to S3")
                return results_key, report_key
            else:
                logger.warning(f"⚠️  Some {model_name} results failed to upload to S3")
                return None, None
                
        except Exception as e:
            logger.error(f"❌ Failed to save {model_name} results: {e}")
            return None, None
    
    def generate_inference_report(self, results, model_name):
        """Generate inference report"""
        logger.info(f"📋 Generating {model_name} inference report...")
        
        if model_name == 'campaign_success':
            report = {
                'model_name': model_name,
                'inference_date': datetime.now().isoformat(),
                'total_customers': len(results),
                'success_probability_stats': {
                    'mean': float(results['success_probability'].mean()),
                    'median': float(results['success_probability'].median()),
                    'std': float(results['success_probability'].std())
                },
                'success_prediction_distribution': results['success_prediction'].value_counts().to_dict(),
                'success_category_distribution': results['success_category'].value_counts().to_dict(),
                'feature_summary': {
                    'total_features': len(results.columns),
                    'numeric_features': len(results.select_dtypes(include=[np.number]).columns),
                    'categorical_features': len(results.select_dtypes(include=['object']).columns)
                }
            }
        elif model_name == 'budget_optimization':
            report = {
                'model_name': model_name,
                'inference_date': datetime.now().isoformat(),
                'total_customers': len(results),
                'optimal_budget_stats': {
                    'mean': float(results['optimal_budget'].mean()),
                    'median': float(results['optimal_budget'].median()),
                    'std': float(results['optimal_budget'].std())
                },
                'budget_category_distribution': results['budget_category'].value_counts().to_dict(),
                'feature_summary': {
                    'total_features': len(results.columns),
                    'numeric_features': len(results.select_dtypes(include=[np.number]).columns),
                    'categorical_features': len(results.select_dtypes(include=['object']).columns)
                }
            }
        else:
            report = {
                'model_name': model_name,
                'inference_date': datetime.now().isoformat(),
                'total_customers': len(results),
                'feature_summary': {
                    'total_features': len(results.columns),
                    'numeric_features': len(results.select_dtypes(include=[np.number]).columns),
                    'categorical_features': len(results.select_dtypes(include=['object']).columns)
                }
            }
        
        return report
    
    def create_inference_visualizations(self, results, model_name):
        """Create inference visualizations and upload directly to S3"""
        logger.info(f"📊 Creating {model_name} inference visualizations...")
        
        try:
            s3_manager = get_s3_manager()
            timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
            
            if model_name == 'campaign_success':
                # Success probability distribution
                fig1 = px.histogram(
                    results, x='success_probability',
                    title=f'{model_name.replace("_", " ").title()} Success Probability Distribution',
                    labels={'success_probability': 'Success Probability', 'count': 'Customer Count'}
                )
                
                # Success category distribution
                fig2 = px.pie(
                    values=results['success_category'].value_counts().values,
                    names=results['success_category'].value_counts().index,
                    title=f'{model_name.replace("_", " ").title()} Success Category Distribution'
                )
                
                # Upload visualizations directly to S3
                html1_key = f'models/campaign_optimization/inference_results/{model_name}_success_probability_{timestamp}.html'
                html2_key = f'models/campaign_optimization/inference_results/{model_name}_success_categories_{timestamp}.html'
                
            elif model_name == 'budget_optimization':
                # Optimal budget distribution
                fig1 = px.histogram(
                    results, x='optimal_budget',
                    title=f'{model_name.replace("_", " ").title()} Optimal Budget Distribution',
                    labels={'optimal_budget': 'Optimal Budget', 'count': 'Customer Count'}
                )
                
                # Budget category distribution
                fig2 = px.pie(
                    values=results['budget_category'].value_counts().values,
                    names=results['budget_category'].value_counts().index,
                    title=f'{model_name.replace("_", " ").title()} Budget Category Distribution'
                )
                
                # Upload visualizations directly to S3
                html1_key = f'models/campaign_optimization/inference_results/{model_name}_budget_distribution_{timestamp}.html'
                html2_key = f'models/campaign_optimization/inference_results/{model_name}_budget_categories_{timestamp}.html'
            
            # Convert figures to HTML and upload
            html1_bytes = fig1.to_html().encode('utf-8')
            html2_bytes = fig2.to_html().encode('utf-8')
            
            s3_manager.upload_bytes_direct(html1_bytes, html1_key, 'text/html')
            s3_manager.upload_bytes_direct(html2_bytes, html2_key, 'text/html')
            
            logger.info(f"✅ {model_name} visualizations uploaded directly to S3")
            
        except Exception as e:
            logger.error(f"❌ Failed to create {model_name} visualizations: {e}")
    
    def run_batch_inference(self, data_path=None, models=None):
        """Run batch inference for all models"""
        logger.info("🚀 Starting Campaign Optimization Batch Inference...")
        
        try:
            # Load models
            self.load_trained_models()
            
            # Load data
            df = self.load_inference_data(data_path)
            if df is None:
                raise Exception("Failed to load inference data")
            
            # Determine which models to run
            if models is None:
                models = list(self.models.keys())
            
            all_results = {}
            
            for model_name in models:
                if model_name not in self.models:
                    logger.warning(f"⚠️ Model {model_name} not found, skipping...")
                    continue
                
                # Prepare features
                df_features = self.prepare_features(df, model_name)
                
                if df_features is None or len(df_features) == 0:
                    logger.warning(f"⚠️  No features prepared for {model_name}, skipping...")
                    continue
                
                # Perform inference
                if model_name == 'campaign_success':
                    results = self.perform_campaign_success_inference(df_features)
                elif model_name == 'budget_optimization':
                    results = self.perform_budget_optimization_inference(df_features)
                else:
                    logger.warning(f"⚠️  Unknown model: {model_name}")
                    continue
                
                if results is not None:
                    # Save results
                    results_file, report_file = self.save_inference_results(results, model_name)
                    
                    # Create visualizations
                    self.create_inference_visualizations(results, model_name)
                    
                    all_results[model_name] = results
                    
                    logger.info(f"✅ {model_name} batch inference completed")
            
            logger.info("=" * 60)
            logger.info("🎉 BATCH INFERENCE COMPLETED!")
            logger.info("=" * 60)
            logger.info(f"📊 Processed {len(df)} customers")
            logger.info(f"🎯 Ran inference for {len(all_results)} models")
            
            return all_results
            
        except Exception as e:
            logger.error(f"❌ Error in batch inference: {e}")
            raise

## Run the Pipeline

In [6]:
# Build the pipeline, run every loaded model and print a closing banner.
# `inference` and `results` stay available to later cells for inspection.
if __name__ == "__main__":
    inference = CampaignOptimizationBatchInference()
    results = inference.run_batch_inference()

    # Reached only when run_batch_inference returned without raising
    for banner_line in (
        "\n🎉 Campaign Optimization Batch Inference completed successfully!",
        "📊 Results saved to models/campaign_optimization/inference_results/",
        "📈 Ready for campaign optimization analysis!",
    ):
        print(banner_line)

INFO:__main__:🚀 Starting Campaign Optimization Batch Inference...
INFO:__main__:📥 Loading trained campaign optimization models...
INFO:__main__:✅ Loaded Campaign Success model
INFO:__main__:✅ Loaded Budget Optimization model
INFO:__main__:✅ Loaded 2 models
INFO:__main__:📊 Loading recent inference data...
INFO:__main__:🔍 Loading recent inference data from S3...
INFO:utils.s3_utils:Loading recent inference data from S3 with smart caching (last 3 months)...
INFO:utils.s3_utils:Loading data newer than 2025-06-06
INFO:utils.s3_utils:File unchanged, skipping: amato_pm/data_pipelines/unified_dataset/output/recent_customer_dataset.parquet
INFO:utils.s3_utils:File unchanged, skipping: amato_pm/data_pipelines/unified_dataset/output/timeline_datasets_metadata.yaml
INFO:utils.s3_utils:File unchanged, skipping: amato_pm/data_pipelines/unified_dataset/output/unified_customer_dataset.parquet
INFO:__main__:✅ Recent inference data loaded from S3
INFO:__main__:✅ Loaded recent inference data: 1123 customers


🎉 Campaign Optimization Batch Inference completed successfully!
📊 Results saved to models/campaign_optimization/inference_results/
📈 Ready for campaign optimization analysis!
