# AMATO Production - Journey Simulation Batch Inference Pipeline

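This notebook runs batch inference with two trained models: one predicts each customer's journey stage, the other estimates their conversion probability. Trained models and recent customer data are fetched from S3, and the predictions, summary reports, and charts are uploaded back to S3.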

**Author:** Data Science Team  
**Date:** 2024

In [4]:
# Import required libraries
import pandas as pd
import numpy as np
import yaml
import logging
import os
import sys
import joblib
from pathlib import Path
from datetime import datetime
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Locate the project root so that the local `utils` package is importable.
# A notebook kernel may be started from the project root or from a nested
# notebooks directory, so the working directory and its two nearest ancestors
# are probed; `__file__` only exists when this code runs as a plain script.
possible_paths = [
    Path.cwd(),                # Current working directory
    Path.cwd().parent,         # Parent of current directory
    Path.cwd().parent.parent,  # Grandparent of current directory
]
if '__file__' in globals():
    possible_paths.append(Path(__file__).parent.parent.parent)

# First candidate that contains a `utils` entry wins; otherwise fall back to
# the current directory and let the import below report the failure.
project_root = next(
    (candidate for candidate in possible_paths if (candidate / 'utils').exists()),
    Path.cwd(),
)

# Put the project root at the FRONT of sys.path so the project's `utils`
# package takes precedence over any unrelated installed `utils` module, and
# skip the insert when it is already present so that re-running this cell
# does not keep growing sys.path.
project_root_str = str(project_root)
if project_root_str not in sys.path:
    sys.path.insert(0, project_root_str)
print(f"🔧 Using project root: {project_root}")

try:
    from utils.s3_utils import get_s3_manager
    print("✅ Successfully imported utils.s3_utils")
except ImportError as e:
    # No retry here: the current directory is already the first candidate
    # probed above, so adding '.' to sys.path cannot change the outcome.
    print(f"❌ Failed to import utils.s3_utils: {e}")
    print("🔧 Start the kernel from the project root (the folder containing 'utils/') and re-run this cell.")
    raise

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

🔧 Using project root: /Users/priyankmavani/Desktop/apps/amato
✅ Successfully imported utils.s3_utils


## Journey Simulation Batch Inference Class

In [5]:
class JourneySimulationBatchInference:
    """Batch-inference pipeline for the journey-stage and conversion-prediction models.

    Typical use is ``JourneySimulationBatchInference().run_batch_inference()``,
    which loads the trained models, loads the recent customer dataset, scores
    every customer with each model and uploads results, a YAML report and two
    HTML charts per model through the project's S3 manager.

    Attributes:
        models: Maps model name to the object loaded with ``joblib``.
        scalers: Maps model name to its scaler, or ``None`` when no scaler
            file was available.
        metadata: Maps model name to a small descriptive dict.
    """

    # Columns fed to both models, in the order they are selected from the
    # input frame. Kept in one place so feature preparation and the recorded
    # metadata cannot drift apart.
    FEATURE_COLUMNS = (
        'recency_days', 'frequency', 'monetary_value',
        'avg_order_value', 'total_orders', 'days_since_first_order',
        'customer_lifetime_value', 'avg_days_between_orders',
        'order_count_30d', 'order_count_90d', 'order_count_365d',
        'revenue_30d', 'revenue_90d', 'revenue_365d',
    )

    # Supported model names mapped to the label used in log messages.
    MODEL_LABELS = {
        'journey_stage': 'Journey Stage',
        'conversion_prediction': 'Conversion Prediction',
    }

    # Local cache directory for model artifacts and the S3 key prefix they
    # are downloaded from.
    MODELS_DIR = 'models/journey_simulation'
    S3_MODELS_PREFIX = 'amato_pm/models/journey_simulation'

    # Key prefix used for every uploaded inference artifact.
    # NOTE(review): unlike S3_MODELS_PREFIX this has no 'amato_pm/' prefix -
    # presumably the S3 manager adds it on upload; confirm.
    RESULTS_PREFIX = 'models/journey_simulation/inference_results'

    def __init__(self):
        """Initialize the Journey Simulation Batch Inference Pipeline"""
        self.models = {}
        self.scalers = {}
        self.metadata = {}

    def load_trained_models(self):
        """Load every supported model (and its scaler) into memory.

        Artifacts are read from ``MODELS_DIR``; a file that is missing locally
        is first downloaded from S3. A model whose file is still missing
        afterwards is skipped with a warning.

        Raises:
            RuntimeError: If no model at all could be loaded.
            Exception: Any other error raised while loading is logged and
                re-raised unchanged.
        """
        logger.info("📥 Loading trained journey simulation models...")

        try:
            s3_manager = get_s3_manager()

            # Create models directory if it doesn't exist
            os.makedirs(self.MODELS_DIR, exist_ok=True)

            for model_name, label in self.MODEL_LABELS.items():
                self._load_model(s3_manager, model_name, label)

            logger.info(f"✅ Loaded {len(self.models)} models")

            if not self.models:
                logger.error("❌ No models loaded. Please ensure models are available in S3.")
                raise RuntimeError("No models available for inference")

        except Exception as e:
            logger.error(f"❌ Failed to load models: {e}")
            raise

    def _ensure_local_artifact(self, s3_manager, filename, description):
        """Return the local path of ``filename``, downloading it from S3 if absent.

        A failed download is logged as a warning and not raised; the caller
        decides what to do when the file still does not exist.
        """
        local_path = f'{self.MODELS_DIR}/{filename}'
        # NOTE: an existing local file is always reused, so a newer artifact
        # in S3 is not picked up until the local copy is removed.
        if not os.path.exists(local_path):
            logger.info(f"📥 Downloading {description} from S3...")
            try:
                s3_manager.download_file(f'{self.S3_MODELS_PREFIX}/{filename}', local_path)
                logger.info(f"✅ Downloaded {description} from S3")
            except Exception as e:
                logger.warning(f"⚠️  Failed to download {description} from S3: {e}")
        return local_path

    def _load_model(self, s3_manager, model_name, label):
        """Load one model, its optional scaler and its metadata into the instance."""
        model_path = self._ensure_local_artifact(
            s3_manager, f'{model_name}_model.pkl', f'{label} model')
        scaler_path = self._ensure_local_artifact(
            s3_manager, f'{model_name}_scaler.pkl', f'{label} scaler')

        if not os.path.exists(model_path):
            logger.warning(f"⚠️  {label} model not available")
            return

        # SECURITY: joblib.load unpickles the file and can execute arbitrary
        # code; only load artifacts from a trusted bucket.
        model = joblib.load(model_path)
        self.models[model_name] = model
        self.scalers[model_name] = (
            joblib.load(scaler_path) if os.path.exists(scaler_path) else None
        )

        load_time = datetime.now().isoformat()
        self.metadata[model_name] = {
            'feature_columns': list(self.FEATURE_COLUMNS),
            # Report the class that was actually loaded instead of assuming one.
            'model_type': type(model).__name__,
            # NOTE(review): this is the time the model was loaded, not when it
            # was trained; the key is kept for backward compatibility.
            'training_date': load_time,
            'loaded_at': load_time,
        }
        logger.info(f"✅ Loaded {label} model")

    def load_inference_data(self, data_path=None):
        """Load the recent customer dataset used for inference.

        The S3 manager is always asked to refresh the recent inference data
        first; the parquet file at ``data_path`` is then read from local disk.

        Args:
            data_path: Parquet file to read. Defaults to
                ``data_pipelines/unified_dataset/output/recent_customer_dataset.parquet``.

        Returns:
            The loaded DataFrame, or ``None`` when the file does not exist or
            any step fails (the error is logged, not raised).
        """
        logger.info("📊 Loading recent inference data...")

        try:
            # Load recent inference data from S3
            logger.info("🔍 Loading recent inference data from S3...")
            get_s3_manager().load_inference_data_from_s3()
            logger.info("✅ Recent inference data loaded from S3")

            if data_path is None:
                data_path = 'data_pipelines/unified_dataset/output/recent_customer_dataset.parquet'

            if not os.path.exists(data_path):
                logger.error(f"❌ Recent inference data not found at {data_path}")
                return None

            df = pd.read_parquet(data_path)
            logger.info(f"✅ Loaded recent inference data: {len(df)} customers")
            return df

        except Exception as e:
            logger.error(f"❌ Failed to load recent inference data: {e}")
            return None

    def prepare_features(self, df, model_name):
        """Build the model input matrix from the customer dataset.

        Selects the columns of ``FEATURE_COLUMNS`` that exist in ``df``, fills
        missing values with the column median, and clips every column to
        ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]``. Rows are never dropped, so the
        result keeps the index of ``df``.

        Args:
            df: Customer-level DataFrame.
            model_name: ``'journey_stage'`` or ``'conversion_prediction'``.

        Returns:
            The prepared feature DataFrame, or ``None`` when ``model_name`` is
            unknown or none of the expected feature columns is present.
        """
        logger.info(f"🔧 Preparing features for {model_name} inference...")

        if model_name not in self.MODEL_LABELS:
            logger.error(f"❌ Unknown model: {model_name}")
            return None

        available_features = [col for col in self.FEATURE_COLUMNS if col in df.columns]
        missing_features = [col for col in self.FEATURE_COLUMNS if col not in df.columns]

        if not available_features:
            # A zero-column matrix can only fail later inside the model, which
            # would abort the whole run; report the cause here instead.
            logger.error(f"❌ None of the expected features are present for {model_name}")
            return None

        if missing_features:
            logger.warning(f"⚠️  Missing features for {model_name}: {missing_features}")

        if len(available_features) < 5:
            logger.warning(f"⚠️  Only {len(available_features)} features available for {model_name}")

        X = df[available_features].copy()

        # Handle missing values
        X = X.fillna(X.median())

        # Limit the influence of outliers by clipping (not removing) values.
        # NOTE(review): the bounds come from the inference batch itself, so
        # the same customer can be clipped differently in different batches;
        # confirm this matches the preprocessing used for training.
        for col in X.columns:
            q1 = X[col].quantile(0.25)
            q3 = X[col].quantile(0.75)
            iqr = q3 - q1
            X[col] = X[col].clip(q1 - 1.5 * iqr, q3 + 1.5 * iqr)

        logger.info(f"✅ Prepared {len(X)} customers with {len(X.columns)} features for {model_name}")
        return X

    def _scale_features(self, model_name, df_features):
        """Apply the model's scaler when one is registered, else pass through."""
        # .get(): a model may be registered without a scaler entry.
        scaler = self.scalers.get(model_name)
        if scaler is None:
            return df_features
        return scaler.transform(df_features)

    def perform_journey_stage_prediction(self, df_features):
        """Predict the journey stage of every row in ``df_features``.

        Args:
            df_features: Feature matrix as returned by ``prepare_features``.

        Returns:
            A copy of ``df_features`` with three added columns:
            ``predicted_journey_stage`` (model output), ``stage_confidence``
            (highest class probability, or 1.0 when the model cannot provide
            probabilities) and ``stage_category`` (``Low`` up to 0.5,
            ``Medium`` up to 0.8, ``High`` above). Returns ``None`` when the
            journey-stage model is not loaded.
        """
        logger.info("🎯 Performing journey stage prediction inference...")

        if 'journey_stage' not in self.models:
            logger.error("❌ Journey Stage model not loaded")
            return None

        model = self.models['journey_stage']
        X_scaled = self._scale_features('journey_stage', df_features)

        # Predict journey stage
        journey_stages = model.predict(X_scaled)

        # Probabilities are best-effort: not every estimator exposes them.
        try:
            max_probs = np.max(model.predict_proba(X_scaled), axis=1)
        except Exception as e:
            logger.warning(f"⚠️  Stage probabilities unavailable, using confidence 1.0: {e}")
            max_probs = np.ones(len(journey_stages))

        results = df_features.copy()
        results['predicted_journey_stage'] = journey_stages
        results['stage_confidence'] = max_probs
        # include_lowest keeps a value of exactly 0 inside the first bin
        # instead of turning it into NaN.
        results['stage_category'] = pd.cut(
            max_probs,
            bins=[0, 0.5, 0.8, 1.0],
            labels=['Low', 'Medium', 'High'],
            include_lowest=True,
        )

        logger.info(f"✅ Journey stage prediction completed: {len(results)} predictions")
        return results

    def perform_conversion_prediction(self, df_features):
        """Estimate the conversion probability of every row in ``df_features``.

        Args:
            df_features: Feature matrix as returned by ``prepare_features``.

        Returns:
            A copy of ``df_features`` with three added columns:
            ``conversion_probability`` (second column of ``predict_proba``),
            ``conversion_prediction`` (1 when the probability exceeds 0.5,
            else 0) and ``conversion_category`` (``Low`` up to 0.3, ``Medium``
            up to 0.7, ``High`` above). Returns ``None`` when the conversion
            model is not loaded.
        """
        logger.info("🎯 Performing conversion prediction inference...")

        if 'conversion_prediction' not in self.models:
            logger.error("❌ Conversion Prediction model not loaded")
            return None

        X_scaled = self._scale_features('conversion_prediction', df_features)

        # Column 1 is taken as the positive class.
        # NOTE(review): assumes a binary model whose second class means
        # "converted"; confirm against the training code.
        conversion_probs = self.models['conversion_prediction'].predict_proba(X_scaled)[:, 1]

        results = df_features.copy()
        results['conversion_probability'] = conversion_probs
        results['conversion_prediction'] = (conversion_probs > 0.5).astype(int)
        # include_lowest: a probability of exactly 0.0 must land in 'Low'
        # rather than become NaN.
        results['conversion_category'] = pd.cut(
            conversion_probs,
            bins=[0, 0.3, 0.7, 1.0],
            labels=['Low', 'Medium', 'High'],
            include_lowest=True,
        )

        logger.info(f"✅ Conversion prediction completed: {len(results)} predictions")
        return results

    def save_inference_results(self, results, model_name):
        """Upload the predictions (parquet) and their report (YAML) to S3.

        Args:
            results: Prediction DataFrame from one of the ``perform_*`` methods.
            model_name: Model the results belong to; used in the object keys.

        Returns:
            ``(results_key, report_key)`` when both uploads succeed, otherwise
            ``(None, None)``. Errors are logged, not raised.
        """
        logger.info(f"💾 Saving {model_name} inference results...")

        try:
            s3_manager = get_s3_manager()

            timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
            results_key = f'{self.RESULTS_PREFIX}/{model_name}_inference_results_{timestamp}.parquet'
            report_key = f'{self.RESULTS_PREFIX}/{model_name}_inference_report_{timestamp}.yaml'

            # With no path, to_parquet returns the file content as bytes.
            # NOTE(review): index=False drops the row index and the feature
            # frame carries no customer identifier column, so saved rows can
            # only be matched back to customers by position.
            results_bytes = results.to_parquet(index=False)
            results_success = s3_manager.upload_bytes_direct(
                results_bytes, results_key, 'application/octet-stream'
            )

            # Generate and save report
            report = self.generate_inference_report(results, model_name)
            report_success = s3_manager.upload_bytes_direct(
                yaml.dump(report, default_flow_style=False).encode('utf-8'),
                report_key, 'text/yaml'
            )

            if results_success and report_success:
                logger.info(f"✅ {model_name} results uploaded directly to S3")
                return results_key, report_key

            logger.warning(f"⚠️  Some {model_name} results failed to upload to S3")
            return None, None

        except Exception as e:
            logger.error(f"❌ Failed to save {model_name} results: {e}")
            return None, None

    @staticmethod
    def _value_counts_dict(series):
        """Return ``series.value_counts()`` as a dict of plain Python values.

        NumPy scalars are converted so the report serializes to clean YAML.
        """
        return {
            (key.item() if hasattr(key, 'item') else key): int(count)
            for key, count in series.value_counts().items()
        }

    @staticmethod
    def _distribution_stats(series):
        """Return mean, median and standard deviation of ``series`` as floats."""
        return {
            'mean': float(series.mean()),
            'median': float(series.median()),
            'std': float(series.std()),
        }

    @staticmethod
    def _feature_summary(results):
        """Count the columns of ``results`` in total and by broad dtype family."""
        return {
            'total_features': len(results.columns),
            'numeric_features': len(results.select_dtypes(include=[np.number]).columns),
            # 'category' is included because the *_category columns produced
            # by pd.cut are categorical, not object.
            'categorical_features': len(
                results.select_dtypes(include=['object', 'category']).columns),
        }

    def generate_inference_report(self, results, model_name):
        """Summarize one model's predictions as a YAML-serializable dict.

        Args:
            results: Prediction DataFrame from one of the ``perform_*`` methods.
            model_name: Selects the model-specific sections of the report; an
                unknown name yields only the generic sections.

        Returns:
            A dict with the model name, inference timestamp, row count and a
            column summary, plus prediction distributions and statistics for
            the two known models.
        """
        logger.info(f"📋 Generating {model_name} inference report...")

        report = {
            'model_name': model_name,
            'inference_date': datetime.now().isoformat(),
            'total_customers': len(results),
        }

        if model_name == 'journey_stage':
            report['journey_stage_distribution'] = self._value_counts_dict(
                results['predicted_journey_stage'])
            report['stage_confidence_stats'] = self._distribution_stats(
                results['stage_confidence'])
            report['stage_category_distribution'] = self._value_counts_dict(
                results['stage_category'])
        elif model_name == 'conversion_prediction':
            report['conversion_probability_stats'] = self._distribution_stats(
                results['conversion_probability'])
            report['conversion_prediction_distribution'] = self._value_counts_dict(
                results['conversion_prediction'])
            report['conversion_category_distribution'] = self._value_counts_dict(
                results['conversion_category'])

        report['feature_summary'] = self._feature_summary(results)
        return report

    def create_inference_visualizations(self, results, model_name):
        """Render two HTML charts for one model's predictions and upload them to S3.

        Errors are logged, not raised, so a charting problem never aborts the
        batch run. An unknown ``model_name`` is skipped with a warning.

        Args:
            results: Prediction DataFrame from one of the ``perform_*`` methods.
            model_name: ``'journey_stage'`` or ``'conversion_prediction'``.
        """
        logger.info(f"📊 Creating {model_name} inference visualizations...")

        try:
            s3_manager = get_s3_manager()
            timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
            title_prefix = model_name.replace("_", " ").title()

            if model_name == 'journey_stage':
                # Journey stage distribution
                stage_counts = results['predicted_journey_stage'].value_counts()
                fig1 = px.bar(
                    x=stage_counts.index,
                    y=stage_counts.values,
                    title=f'{title_prefix} Distribution',
                    labels={'x': 'Journey Stage', 'y': 'Customer Count'}
                )

                # Stage confidence distribution
                fig2 = px.histogram(
                    results, x='stage_confidence',
                    title=f'{title_prefix} Confidence Distribution',
                    labels={'stage_confidence': 'Stage Confidence', 'count': 'Customer Count'}
                )

                html1_key = f'{self.RESULTS_PREFIX}/{model_name}_stage_distribution_{timestamp}.html'
                html2_key = f'{self.RESULTS_PREFIX}/{model_name}_confidence_distribution_{timestamp}.html'

            elif model_name == 'conversion_prediction':
                # Conversion probability distribution
                fig1 = px.histogram(
                    results, x='conversion_probability',
                    title=f'{title_prefix} Probability Distribution',
                    labels={'conversion_probability': 'Conversion Probability', 'count': 'Customer Count'}
                )

                # Conversion category distribution
                category_counts = results['conversion_category'].value_counts()
                fig2 = px.pie(
                    values=category_counts.values,
                    names=category_counts.index,
                    title=f'{title_prefix} Category Distribution'
                )

                html1_key = f'{self.RESULTS_PREFIX}/{model_name}_probability_distribution_{timestamp}.html'
                html2_key = f'{self.RESULTS_PREFIX}/{model_name}_category_distribution_{timestamp}.html'

            else:
                # Without this branch the figures would be unbound below.
                logger.warning(f"⚠️  No visualizations defined for {model_name}, skipping...")
                return

            # Both uploads are attempted even when the first one fails.
            uploads = [
                s3_manager.upload_bytes_direct(fig1.to_html().encode('utf-8'), html1_key, 'text/html'),
                s3_manager.upload_bytes_direct(fig2.to_html().encode('utf-8'), html2_key, 'text/html'),
            ]

            if all(uploads):
                logger.info(f"✅ {model_name} visualizations uploaded directly to S3")
            else:
                logger.warning(f"⚠️  Some {model_name} visualizations failed to upload to S3")

        except Exception as e:
            logger.error(f"❌ Failed to create {model_name} visualizations: {e}")

    def run_batch_inference(self, data_path=None, models=None):
        """Run the full pipeline: load models and data, score, save, visualize.

        Args:
            data_path: Optional parquet path passed to ``load_inference_data``.
            models: Optional list of model names to run. Defaults to every
                model that was loaded; names that are not loaded are skipped.

        Returns:
            Dict mapping each model name that produced predictions to its
            result DataFrame. A model is included even when uploading its
            results failed (a warning is logged in that case).

        Raises:
            RuntimeError: If the inference data could not be loaded.
            Exception: Errors from model loading or prediction are logged and
                re-raised.
        """
        logger.info("🚀 Starting Journey Simulation Batch Inference...")

        try:
            self.load_trained_models()

            df = self.load_inference_data(data_path)
            if df is None:
                raise RuntimeError("Failed to load inference data")

            if models is None:
                models = list(self.models.keys())

            predictors = {
                'journey_stage': self.perform_journey_stage_prediction,
                'conversion_prediction': self.perform_conversion_prediction,
            }
            all_results = {}

            for model_name in models:
                if model_name not in self.models:
                    logger.warning(f"⚠️ Model {model_name} not found, skipping...")
                    continue

                df_features = self.prepare_features(df, model_name)
                if df_features is None or len(df_features) == 0:
                    logger.warning(f"⚠️  No features prepared for {model_name}, skipping...")
                    continue

                predictor = predictors.get(model_name)
                if predictor is None:
                    logger.warning(f"⚠️  Unknown model: {model_name}")
                    continue

                results = predictor(df_features)
                if results is None:
                    continue

                results_file, report_file = self.save_inference_results(results, model_name)
                if results_file is None or report_file is None:
                    # Predictions are still returned to the caller, but make
                    # it visible that nothing was persisted.
                    logger.warning(f"⚠️  {model_name} results were not persisted to S3")

                self.create_inference_visualizations(results, model_name)

                all_results[model_name] = results
                logger.info(f"✅ {model_name} batch inference completed")

            logger.info("=" * 60)
            logger.info("🎉 BATCH INFERENCE COMPLETED!")
            logger.info("=" * 60)
            logger.info(f"📊 Processed {len(df)} customers")
            logger.info(f"🎯 Ran inference for {len(all_results)} models")

            return all_results

        except Exception as e:
            logger.error(f"❌ Error in batch inference: {e}")
            raise

## Run the Pipeline

In [6]:
# Initialize and run the pipeline.
# `results` maps each model name that produced predictions to its result
# DataFrame; it is empty when every model was skipped.
if __name__ == "__main__":
    inference = JourneySimulationBatchInference()
    results = inference.run_batch_inference()

    if results:
        print("\n🎉 Journey Simulation Batch Inference completed successfully!")
        print("📊 Results saved to models/journey_simulation/inference_results/")
        print("🚀 Ready for customer journey analysis and conversion optimization!")
    else:
        # Do not announce success when no model produced any predictions.
        print("\n⚠️  Batch inference finished but no model produced results - check the log output above.")

INFO:__main__:🚀 Starting Journey Simulation Batch Inference...
INFO:__main__:📥 Loading trained journey simulation models...
INFO:__main__:✅ Loaded Journey Stage model
INFO:__main__:✅ Loaded Conversion Prediction model
INFO:__main__:✅ Loaded 2 models
INFO:__main__:📊 Loading recent inference data...
INFO:__main__:🔍 Loading recent inference data from S3...
INFO:utils.s3_utils:Loading recent inference data from S3 (last 3 months)...
INFO:utils.s3_utils:Loading data newer than 2025-06-03
INFO:utils.s3_utils:Downloading s3://nuscale-data-services-public/amato_pm/data_pipelines/unified_dataset/output//unified_customer_dataset.parquet to data_pipelines/unified_dataset/output/unified_customer_dataset.parquet
INFO:utils.s3_utils:Downloading s3://nuscale-data-services-public/amato_pm/data_pipelines/unified_dataset/output//unified_dataset_report.yaml to data_pipelines/unified_dataset/output/unified_dataset_report.yaml
INFO:utils.s3_utils:Downloading s3://nuscale-data-services-public/amato_pm/data_


🎉 Journey Simulation Batch Inference completed successfully!
📊 Results saved to models/journey_simulation/inference_results/
🚀 Ready for customer journey analysis and conversion optimization!
