# AMATO Production - Customer Segmentation Batch Inference Pipeline

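This notebook runs batch inference on recent customer data using previously trained segmentation models (K-means and HDBSCAN). Models and scalers are loaded from S3, and the per-model results, a YAML summary report and HTML charts are uploaded back to S3.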

**Author:** Data Science Team  
**Date:** 2024

In [1]:
# Import required libraries
import pandas as pd
import numpy as np
import yaml
import logging
import os
import sys
import joblib
from pathlib import Path
from datetime import datetime
from sklearn.preprocessing import StandardScaler
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Make the project's `utils` package importable.
# The notebook may be launched from the project root or from a sub-directory,
# so several candidate directories are probed and the first one that contains
# a `utils` folder is used as the project root.
possible_paths = [
    Path.cwd(),  # Current working directory
    Path.cwd().parent,  # Parent of current directory
    Path.cwd().parent.parent,  # Grandparent of current directory
    Path(__file__).parent.parent.parent if '__file__' in globals() else None  # Only when run as a script; `__file__` is normally undefined in a notebook kernel
]

# Skip the None placeholder and pick the first candidate that has a utils folder
project_root = None
for path in possible_paths:
    if path and (path / 'utils').exists():
        project_root = path
        break

if project_root is None:
    # Fallback: no candidate contained `utils`; use the current directory.
    # The import below will then most likely fail and report the problem.
    project_root = Path.cwd()

# Appended (not inserted at the front), so already-importable packages named
# `utils` earlier on sys.path take precedence over the project's one.
sys.path.append(str(project_root))
print(f"🔧 Using project root: {project_root}")

try:
    from utils.s3_utils import get_s3_manager
    print("✅ Successfully imported utils.s3_utils")
except ImportError as e:
    print(f"❌ Failed to import utils.s3_utils: {e}")
    print("🔧 Trying alternative import...")
    try:
        # Second attempt: add '.' (resolved against the working directory at
        # import time) to sys.path and retry the same import.
        sys.path.append('.')
        from utils.s3_utils import get_s3_manager
        print("✅ Successfully imported with relative path")
    except ImportError as e2:
        print(f"❌ Alternative import also failed: {e2}")
        # Re-raise: nothing below can work without get_s3_manager.
        raise

# Setup logging: INFO level on the root logger; `logger` is the module-level
# logger used by every method of the pipeline class below.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

🔧 Using project root: /Users/priyankmavani/Desktop/apps/amato
✅ Successfully imported utils.s3_utils


## Customer Segmentation Batch Inference Class

In [2]:
class CustomerSegmentationBatchInference:
    def __init__(self):
        """Initialize the Customer Segmentation Batch Inference Pipeline"""
        self.models = {}
        self.scalers = {}
        self.metadata = {}
        
    def load_trained_models(self):
        """Load trained segmentation models from S3 with timestamped filenames"""
        logger.info("📥 Loading trained segmentation models...")
        
        try:
            s3_manager = get_s3_manager()
            
            # Create models directory if it doesn't exist
            models_dir = 'models/customer_segmentation'
            os.makedirs(models_dir, exist_ok=True)
            
            # Find latest K-means model and scaler
            try:
                kmeans_models = s3_manager.list_files('amato_pm/models/customer_segmentation/')
                kmeans_model_files = [f for f in kmeans_models if 'kmeans_model_' in f and f.endswith('.pkl')]
                kmeans_scaler_files = [f for f in kmeans_models if 'kmeans_scaler_' in f and f.endswith('.pkl')]
                
                if kmeans_model_files and kmeans_scaler_files:
                    # Get latest by timestamp (sort by filename which includes timestamp)
                    latest_kmeans_model = sorted(kmeans_model_files)[-1]
                    latest_kmeans_scaler = sorted(kmeans_scaler_files)[-1]
                    
                    logger.info(f"📥 Found latest K-means model: {latest_kmeans_model}")
                    logger.info(f"📥 Found latest K-means scaler: {latest_kmeans_scaler}")
                    
                    # Download latest K-means model
                    kmeans_path = f'{models_dir}/kmeans_model.pkl'
                    if not os.path.exists(kmeans_path):
                        logger.info("📥 Downloading latest K-means model from S3...")
                        s3_manager.download_file(latest_kmeans_model, kmeans_path)
                        logger.info("✅ Downloaded latest K-means model from S3")
                    
                    # Download latest K-means scaler
                    kmeans_scaler_path = f'{models_dir}/kmeans_scaler.pkl'
                    if not os.path.exists(kmeans_scaler_path):
                        logger.info("📥 Downloading latest K-means scaler from S3...")
                        s3_manager.download_file(latest_kmeans_scaler, kmeans_scaler_path)
                        logger.info("✅ Downloaded latest K-means scaler from S3")
                    
                    # Load K-means model
                    if os.path.exists(kmeans_path) and os.path.exists(kmeans_scaler_path):
                        self.models['kmeans'] = joblib.load(kmeans_path)
                        self.scalers['kmeans'] = joblib.load(kmeans_scaler_path)
                        logger.info("✅ Loaded K-means model")
                    else:
                        logger.warning("⚠️  K-means model not available")
                else:
                    logger.warning("⚠️  No K-means models found in S3")
                    
            except Exception as e:
                logger.warning(f"⚠️  Failed to find/download K-means models: {e}")
            
            # Find latest HDBSCAN model and scaler
            try:
                hdbscan_model_files = [f for f in kmeans_models if 'hdbscan_model_' in f and f.endswith('.pkl')]
                hdbscan_scaler_files = [f for f in kmeans_models if 'hdbscan_scaler_' in f and f.endswith('.pkl')]
                
                if hdbscan_model_files and hdbscan_scaler_files:
                    # Get latest by timestamp
                    latest_hdbscan_model = sorted(hdbscan_model_files)[-1]
                    latest_hdbscan_scaler = sorted(hdbscan_scaler_files)[-1]
                    
                    logger.info(f"📥 Found latest HDBSCAN model: {latest_hdbscan_model}")
                    logger.info(f"📥 Found latest HDBSCAN scaler: {latest_hdbscan_scaler}")
                    
                    # Download latest HDBSCAN model
                    hdbscan_path = f'{models_dir}/hdbscan_model.pkl'
                    if not os.path.exists(hdbscan_path):
                        logger.info("📥 Downloading latest HDBSCAN model from S3...")
                        s3_manager.download_file(latest_hdbscan_model, hdbscan_path)
                        logger.info("✅ Downloaded latest HDBSCAN model from S3")
                    
                    # Download latest HDBSCAN scaler
                    hdbscan_scaler_path = f'{models_dir}/hdbscan_scaler.pkl'
                    if not os.path.exists(hdbscan_scaler_path):
                        logger.info("📥 Downloading latest HDBSCAN scaler from S3...")
                        s3_manager.download_file(latest_hdbscan_scaler, hdbscan_scaler_path)
                        logger.info("✅ Downloaded latest HDBSCAN scaler from S3")
                    
                    # Load HDBSCAN model
                    if os.path.exists(hdbscan_path) and os.path.exists(hdbscan_scaler_path):
                        self.models['hdbscan'] = joblib.load(hdbscan_path)
                        self.scalers['hdbscan'] = joblib.load(hdbscan_scaler_path)
                        logger.info("✅ Loaded HDBSCAN model")
                    else:
                        logger.warning("⚠️  HDBSCAN model not available")
                else:
                    logger.warning("⚠️  No HDBSCAN models found in S3")
                    
            except Exception as e:
                logger.warning(f"⚠️  Failed to find/download HDBSCAN models: {e}")
            
            logger.info(f"✅ Loaded {len(self.models)} models")
            
            if len(self.models) == 0:
                logger.error("❌ No models loaded. Please ensure models are available in S3.")
                raise Exception("No models available for inference")
            
        except Exception as e:
            logger.error(f"❌ Failed to load models: {e}")
            raise
    
    def load_inference_data(self, data_path=None):
        """Load recent inference data for customer segmentation"""
        logger.info("📊 Loading recent inference data...")
        
        try:
            # Load recent inference data from S3
            logger.info("🔍 Loading recent inference data from S3...")
            s3_manager = get_s3_manager()
            s3_manager.load_inference_data_from_s3()
            logger.info("✅ Recent inference data loaded from S3")
            
            if data_path is None:
                data_path = 'data_pipelines/unified_dataset/output/recent_customer_dataset.parquet'
            
            if os.path.exists(data_path):
                df = pd.read_parquet(data_path)
                logger.info(f"✅ Loaded recent inference data: {len(df)} customers")
                return df
            else:
                logger.error(f"❌ Recent inference data not found at {data_path}")
                return None
                
        except Exception as e:
            logger.error(f"❌ Failed to load recent inference data: {e}")
            return None
    
    def prepare_features(self, df, model_name):
        """Prepare features for inference"""
        logger.info(f"🔧 Preparing features for {model_name} inference...")
        
        # Select RFM and behavioral features
        feature_columns = [
            'recency_days', 'frequency', 'monetary_value',
            'avg_order_value', 'total_orders', 'days_since_first_order',
            'customer_lifetime_value', 'avg_days_between_orders',
            'order_count_30d', 'order_count_90d', 'order_count_365d',
            'revenue_30d', 'revenue_90d', 'revenue_365d'
        ]
        
        # Filter available features
        available_features = [col for col in feature_columns if col in df.columns]
        
        if len(available_features) < 5:
            logger.warning(f"⚠️  Only {len(available_features)} features available for {model_name}")
            
        # Create feature matrix
        X = df[available_features].copy()
        
        # Handle missing values
        X = X.fillna(X.median())
        
        # Remove outliers using IQR method
        for col in X.columns:
            Q1 = X[col].quantile(0.25)
            Q3 = X[col].quantile(0.75)
            IQR = Q3 - Q1
            lower_bound = Q1 - 1.5 * IQR
            upper_bound = Q3 + 1.5 * IQR
            X[col] = X[col].clip(lower_bound, upper_bound)
        
        logger.info(f"✅ Prepared {len(X)} customers for {model_name}")
        return X
    
    def perform_kmeans_inference(self, df_features):
        """Perform K-means inference"""
        logger.info("🎯 Performing kmeans inference...")
        
        if 'kmeans' not in self.models:
            logger.error("❌ K-means model not loaded")
            return None
        
        # Scale features
        X_scaled = self.scalers['kmeans'].transform(df_features)
        
        # Predict clusters
        cluster_labels = self.models['kmeans'].predict(X_scaled)
        
        # Create results dataframe
        results = df_features.copy()
        results['kmeans_segment'] = cluster_labels
        results['kmeans_segment_type'] = results['kmeans_segment'].map({
            0: 'Low Value', 1: 'Medium Value', 2: 'High Value', 3: 'Premium', 4: 'VIP'
        })
        
        logger.info(f"✅ kmeans inference completed: {len(results)} predictions")
        return results
    
    def perform_hdbscan_inference(self, df_features):
        """Perform HDBSCAN inference"""
        logger.info("🎯 Performing hdbscan inference...")
        
        if 'hdbscan' not in self.models:
            logger.error("❌ HDBSCAN model not loaded")
            return None
        
        # Scale features
        X_scaled = self.scalers['hdbscan'].transform(df_features)
        
        # For HDBSCAN, we need to use fit_predict or get the labels from the fitted model
        # Since this is a pre-trained model, we'll use the stored labels approach
        # or create a new clustering based on the existing model parameters
        
        # Get the HDBSCAN model
        hdbscan_model = self.models['hdbscan']
        
        # Check if the model has been fitted and has labels
        if hasattr(hdbscan_model, 'labels_') and hdbscan_model.labels_ is not None:
            # If model was already fitted, we need to create new clusters for new data
            # For inference, we'll use the model's parameters to create new clusters
            logger.info("📊 Creating new HDBSCAN clusters for inference data...")
            
            # Create a new HDBSCAN instance with the same parameters
            from hdbscan import HDBSCAN
            new_hdbscan = HDBSCAN(
                min_cluster_size=hdbscan_model.min_cluster_size,
                min_samples=hdbscan_model.min_samples,
                metric=hdbscan_model.metric,
                cluster_selection_method=hdbscan_model.cluster_selection_method,
                cluster_selection_epsilon=hdbscan_model.cluster_selection_epsilon,
                alpha=hdbscan_model.alpha
            )
            
            # Fit and predict on new data
            cluster_labels = new_hdbscan.fit_predict(X_scaled)
        else:
            # Use fit_predict for new data
            cluster_labels = hdbscan_model.fit_predict(X_scaled)
        
        # Create results dataframe
        results = df_features.copy()
        results['hdbscan_segment'] = cluster_labels
        
        # Map cluster labels to meaningful names
        unique_clusters = sorted(set(cluster_labels))
        cluster_names = {}
        for i, cluster_id in enumerate(unique_clusters):
            if cluster_id == -1:
                cluster_names[cluster_id] = 'Noise'
            else:
                cluster_names[cluster_id] = f'Segment_{i}'
        
        results['hdbscan_segment_type'] = results['hdbscan_segment'].map(cluster_names)
        
        logger.info(f"✅ hdbscan inference completed: {len(results)} predictions")
        return results
    
    def save_inference_results(self, results, model_name):
        """Save inference results directly to S3"""
        logger.info(f"💾 Saving {model_name} inference results...")
        
        try:
            s3_manager = get_s3_manager()
            
            # Save results directly to S3
            timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
            results_key = f'models/customer_segmentation/inference_results/{model_name}_inference_results_{timestamp}.parquet'
            
            # Convert to parquet bytes and upload
            results_bytes = results.to_parquet(index=False)
            results_success = s3_manager.upload_bytes_direct(
                results_bytes, results_key, 'application/octet-stream'
            )
            
            # Generate and save report
            report = self.generate_inference_report(results, model_name)
            report_key = f'models/customer_segmentation/inference_results/{model_name}_inference_report_{timestamp}.yaml'
            
            report_success = s3_manager.upload_bytes_direct(
                yaml.dump(report, default_flow_style=False).encode('utf-8'),
                report_key, 'text/yaml'
            )
            
            if results_success and report_success:
                logger.info(f"✅ {model_name} results uploaded directly to S3")
                return results_key, report_key
            else:
                logger.warning(f"⚠️  Some {model_name} results failed to upload to S3")
                return None, None
                
        except Exception as e:
            logger.error(f"❌ Failed to save {model_name} results: {e}")
            return None, None
    
    def generate_inference_report(self, results, model_name):
        """Generate inference report"""
        logger.info(f"📋 Generating {model_name} inference report...")
        
        segment_col = f'{model_name}_segment'
        type_col = f'{model_name}_segment_type'
        
        report = {
            'model_name': model_name,
            'inference_date': datetime.now().isoformat(),
            'total_customers': len(results),
            'segment_distribution': results[segment_col].value_counts().to_dict(),
            'segment_type_distribution': results[type_col].value_counts().to_dict(),
            'feature_summary': {
                'total_features': len(results.columns),
                'numeric_features': len(results.select_dtypes(include=[np.number]).columns),
                'categorical_features': len(results.select_dtypes(include=['object']).columns)
            }
        }
        
        return report
    
    def create_inference_visualizations(self, results, model_name):
        """Create inference visualizations and upload directly to S3"""
        logger.info(f"📊 Creating {model_name} inference visualizations...")
        
        try:
            s3_manager = get_s3_manager()
            timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
            
            # Segment distribution
            fig1 = px.bar(
                x=results[f'{model_name}_segment'].value_counts().index,
                y=results[f'{model_name}_segment'].value_counts().values,
                title=f'{model_name.upper()} Segment Distribution',
                labels={'x': 'Segment ID', 'y': 'Customer Count'}
            )
            
            # Segment type distribution
            fig2 = px.pie(
                values=results[f'{model_name}_segment_type'].value_counts().values,
                names=results[f'{model_name}_segment_type'].value_counts().index,
                title=f'{model_name.upper()} Segment Type Distribution'
            )
            
            # Upload visualizations directly to S3
            html1_key = f'models/customer_segmentation/inference_results/{model_name}_segment_distribution_{timestamp}.html'
            html2_key = f'models/customer_segmentation/inference_results/{model_name}_segment_types_{timestamp}.html'
            
            # Convert figures to HTML and upload
            html1_bytes = fig1.to_html().encode('utf-8')
            html2_bytes = fig2.to_html().encode('utf-8')
            
            s3_manager.upload_bytes_direct(html1_bytes, html1_key, 'text/html')
            s3_manager.upload_bytes_direct(html2_bytes, html2_key, 'text/html')
            
            logger.info(f"✅ {model_name} visualizations uploaded directly to S3")
            
        except Exception as e:
            logger.error(f"❌ Failed to create {model_name} visualizations: {e}")
    
    def run_batch_inference(self, data_path=None, models=None):
        """Run batch inference for all models"""
        logger.info("🚀 Starting Customer Segmentation Batch Inference...")
        
        try:
            # Load models
            self.load_trained_models()
            
            # Load data
            df = self.load_inference_data(data_path)
            if df is None:
                raise Exception("Failed to load inference data")
            
            # Determine which models to run
            if models is None:
                models = list(self.models.keys())
            
            all_results = {}
            
            for model_name in models:
                if model_name not in self.models:
                    logger.warning(f"⚠️ Model {model_name} not found, skipping...")
                    continue
                
                # Prepare features
                df_features = self.prepare_features(df, model_name)
                
                
                if df_features is None or len(df_features) == 0:
                    logger.warning(f"⚠️  No features prepared for {model_name}, skipping...")
                    continue
                
                # Perform inference
                if model_name == 'kmeans':
                    results = self.perform_kmeans_inference(df_features)
                elif model_name == 'hdbscan':
                    results = self.perform_hdbscan_inference(df_features)
                else:
                    logger.warning(f"⚠️  Unknown model: {model_name}")
                    continue
                
                if results is not None:
                    # Save results
                    results_file, report_file = self.save_inference_results(results, model_name)
                    
                    # Create visualizations
                    self.create_inference_visualizations(results, model_name)
                    
                    all_results[model_name] = results
                    
                    logger.info(f"✅ {model_name} batch inference completed")
            
            logger.info("=" * 60)
            logger.info("🎉 BATCH INFERENCE COMPLETED!")
            logger.info("=" * 60)
            logger.info(f"📊 Processed {len(df)} customers")
            logger.info(f"🎯 Ran inference for {len(all_results)} models")
            
            return all_results
            
        except Exception as e:
            logger.error(f"❌ Error in batch inference: {e}")
            raise

## Run the Pipeline

In [3]:
# Initialize and run the pipeline.
# In a notebook kernel `__name__` is "__main__", so this guard is true whenever
# the cell is executed; it only prevents the run if this code is imported as a module.
if __name__ == "__main__":
    inference = CustomerSegmentationBatchInference()
    # `results` maps each model name that produced predictions to its result
    # DataFrame. run_batch_inference re-raises on failure, so the messages
    # below are reached only when no exception occurred.
    results = inference.run_batch_inference()
    
    # NOTE(review): these messages print even if `results` is empty (every model
    # skipped) or an upload failed, since those cases are only logged — consider
    # checking `results` before announcing success.
    print("\n🎉 Customer Segmentation Batch Inference completed successfully!")
    print(f"📊 Results saved to models/customer_segmentation/inference_results/")
    print("📈 Ready for business analysis!")

INFO:__main__:🚀 Starting Customer Segmentation Batch Inference...
INFO:__main__:📥 Loading trained segmentation models...
INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials
INFO:__main__:📥 Found latest K-means model: amato_pm/models/customer_segmentation/kmeans_model_20250904_113434.pkl
INFO:__main__:📥 Found latest K-means scaler: amato_pm/models/customer_segmentation/kmeans_scaler_20250904_113435.pkl
INFO:__main__:✅ Loaded K-means model
INFO:__main__:📥 Found latest HDBSCAN model: amato_pm/models/customer_segmentation/hdbscan_model_20250904_113435.pkl
INFO:__main__:📥 Found latest HDBSCAN scaler: amato_pm/models/customer_segmentation/hdbscan_scaler_20250904_113435.pkl
INFO:__main__:✅ Loaded HDBSCAN model
INFO:__main__:✅ Loaded 2 models
INFO:__main__:📊 Loading recent inference data...
INFO:__main__:🔍 Loading recent inference data from S3...
INFO:utils.s3_utils:Loading recent inference data from S3 with smart caching (last 3 months)...
INFO:utils.s3_


🎉 Customer Segmentation Batch Inference completed successfully!
📊 Results saved to models/customer_segmentation/inference_results/
📈 Ready for business analysis!
