# AMATO Production - Customer Segmentation ML Pipeline

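This notebook trains two clustering models (K-means and HDBSCAN) for customer segmentation using RFM (recency, frequency, monetary value) and behavioral features, then uploads the fitted models, their scalers, and a metrics report to S3.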

**Author:** Data Science Team  
**Date:** 2024

In [1]:
# Import required libraries
import pandas as pd
import numpy as np
import yaml
import logging
import os
import sys
import joblib
from pathlib import Path
from datetime import datetime
from sklearn.cluster import KMeans, DBSCAN
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import silhouette_score, calinski_harabasz_score
from sklearn.model_selection import train_test_split
import hdbscan
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Make the project root importable so the local `utils` package can be found.
# The kernel may be started from the root or from a nested folder, so several
# candidate directories are probed in order and the first match wins.
possible_paths = [Path.cwd(), Path.cwd().parent, Path.cwd().parent.parent]
# `__file__` only exists when this code runs as a script, not inside a kernel
possible_paths.append(
    Path(__file__).parent.parent.parent if '__file__' in globals() else None
)

# First non-None candidate that contains a `utils` folder (None if no match)
project_root = next(
    (candidate for candidate in possible_paths
     if candidate and (candidate / 'utils').exists()),
    None,
)

# Fallback: use current directory and hope for the best
if project_root is None:
    project_root = Path.cwd()

sys.path.append(str(project_root))
print(f"🔧 Using project root: {project_root}")

# Import the S3 helper; on failure retry once with '.' added to sys.path,
# and re-raise the second ImportError if that also fails.
try:
    from utils.s3_utils import get_s3_manager
    print("✅ Successfully imported utils.s3_utils")
except ImportError as first_error:
    print(f"❌ Failed to import utils.s3_utils: {first_error}")
    print("🔧 Trying alternative import...")
    try:
        sys.path.append('.')
        from utils.s3_utils import get_s3_manager
        print("✅ Successfully imported with relative path")
    except ImportError as second_error:
        print(f"❌ Alternative import also failed: {second_error}")
        raise

# Setup logging: INFO-level root configuration plus a module-level logger
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

🔧 Using project root: /Users/priyankmavani/Desktop/apps/amato
✅ Successfully imported utils.s3_utils


## Customer Segmentation Pipeline Class

In [2]:
class CustomerSegmentationPipeline:
    """Train, evaluate and persist customer-segmentation clustering models.

    The pipeline loads the unified customer dataset, builds a cleaned feature
    matrix from RFM/behavioral columns, fits a K-means and an HDBSCAN model
    (each with its own ``StandardScaler``) and uploads the artifacts plus a
    YAML report to S3 through the project's S3 manager.

    Attributes:
        models: ``{'kmeans': ..., 'hdbscan': ...}`` after a training run.
        scalers: the scaler fitted for each model, keyed like ``models``.
        metadata: the report dictionary built by the last ``save_models`` call.
    """

    # Local parquet file read by `load_data` after the S3 sync.
    DEFAULT_DATA_PATH = 'data_pipelines/unified_dataset/output/unified_customer_dataset.parquet'

    # RFM and behavioral columns used as clustering features when present.
    FEATURE_COLUMNS = [
        'recency_days', 'frequency', 'monetary_value',
        'avg_order_value', 'total_orders', 'days_since_first_order',
        'customer_lifetime_value', 'avg_days_between_orders',
        'order_count_30d', 'order_count_90d', 'order_count_365d',
        'revenue_30d', 'revenue_90d', 'revenue_365d'
    ]

    def __init__(self):
        """Initialize the Customer Segmentation Pipeline"""
        self.models = {}
        self.scalers = {}
        self.metadata = {}

    @staticmethod
    def _to_builtin(value):
        """Recursively convert NumPy scalars/arrays inside `value` to plain Python objects.

        `yaml.dump` serializes NumPy scalars as ``!!python/object`` tags, so
        metrics must be converted before being written to the report.
        Tuples are returned as lists; unknown types are returned unchanged.
        """
        if isinstance(value, dict):
            return {key: CustomerSegmentationPipeline._to_builtin(item)
                    for key, item in value.items()}
        if isinstance(value, (list, tuple)):
            return [CustomerSegmentationPipeline._to_builtin(item) for item in value]
        if isinstance(value, np.generic):
            return value.item()
        if isinstance(value, np.ndarray):
            return value.tolist()
        return value

    @staticmethod
    def _score_clustering(X_scaled, labels):
        """Return ``(silhouette, calinski_harabasz)`` for a labelling, or zeros if undefined.

        Both scores need at least 2 distinct labels and fewer labels than
        samples; outside that range scikit-learn raises ``ValueError``, so
        ``(0.0, 0.0)`` is returned instead.
        """
        n_samples = len(labels)
        n_labels = int(np.unique(labels).size)
        if n_labels < 2 or n_labels >= n_samples:
            return 0.0, 0.0
        return (
            float(silhouette_score(X_scaled, labels)),
            float(calinski_harabasz_score(X_scaled, labels)),
        )

    def load_data(self, data_path=None):
        """Load historical training data for customer segmentation.

        Triggers the S3 manager's training-data sync, then reads the local
        parquet file.

        Args:
            data_path: optional path of the parquet file to read; defaults to
                ``DEFAULT_DATA_PATH``.

        Returns:
            The dataset as a DataFrame, or ``None`` when the sync fails, the
            file is missing, or it cannot be read (the error is logged).
        """
        path = data_path or self.DEFAULT_DATA_PATH
        try:
            # Load historical training data from S3
            logger.info("🔍 Loading historical training data from S3...")
            s3_manager = get_s3_manager()
            s3_manager.load_training_data_from_s3()
            logger.info("✅ Historical training data loaded from S3")

            # Now try to load the local file
            if not os.path.exists(path):
                logger.error(f"❌ Historical training dataset not found at {path}")
                return None

            df = pd.read_parquet(path)
            logger.info(f"✅ Loaded historical training dataset: {df.shape}")
            return df
        except Exception as e:
            # Best-effort by design: callers check for None instead of catching
            logger.error(f"❌ Failed to load historical training data: {e}")
            return None

    def prepare_features(self, df, iqr_multiplier=1.5):
        """Prepare the feature matrix for customer segmentation.

        Selects the columns of ``FEATURE_COLUMNS`` present in `df`, turns
        infinities into missing values, drops features with no usable values,
        fills remaining gaps with the column median and clips outliers to
        ``[Q1 - k*IQR, Q3 + k*IQR]``. No rows are removed and `df` itself is
        not modified.

        Args:
            df: customer-level DataFrame.
            iqr_multiplier: the ``k`` factor of the IQR fence (default 1.5).

        Returns:
            ``(X, feature_names)`` - the cleaned matrix and its column names.

        Raises:
            ValueError: if `df` is empty or contains no usable feature column.
        """
        logger.info("🔧 Preparing features for segmentation...")

        if df is None or len(df) == 0:
            raise ValueError("Cannot prepare features: the input dataset is empty")

        # Filter available features (order follows FEATURE_COLUMNS)
        available_features = [col for col in self.FEATURE_COLUMNS if col in df.columns]
        if not available_features:
            raise ValueError(
                "Cannot prepare features: none of the expected segmentation "
                "columns are present in the dataset"
            )
        if len(available_features) < 5:
            logger.warning(f"⚠️  Only {len(available_features)} features available for segmentation")

        # Create feature matrix
        X = df[available_features].copy()

        # Infinite values would make scaling fail; treat them as missing
        X = X.replace([np.inf, -np.inf], np.nan)

        # A column with no values has a NaN median and could never be filled
        empty_columns = [col for col in X.columns if X[col].isna().all()]
        if empty_columns:
            logger.warning(f"⚠️  Dropping features with no usable values: {empty_columns}")
            X = X.drop(columns=empty_columns)
            available_features = [col for col in available_features if col not in empty_columns]
            if not available_features:
                raise ValueError(
                    "Cannot prepare features: every candidate feature column is empty"
                )

        # Handle missing values
        X = X.fillna(X.median())

        # Limit outliers by clipping to the IQR fence
        for col in X.columns:
            q1 = X[col].quantile(0.25)
            q3 = X[col].quantile(0.75)
            iqr = q3 - q1
            if iqr == 0:
                # Q1 == Q3 (typical for sparse counts): both bounds would equal
                # Q1 and clipping would flatten the feature to a constant.
                continue
            X[col] = X[col].clip(q1 - iqr_multiplier * iqr, q3 + iqr_multiplier * iqr)

        logger.info(f"✅ Prepared {len(X)} customers with {len(X.columns)} features")
        return X, available_features

    def run_training_pipeline(self):
        """Run the complete customer segmentation training pipeline.

        Loads the data, prepares features, trains both models and uploads them.
        The fitted models and scalers are also kept on ``self.models`` and
        ``self.scalers``.

        Returns:
            A dict with the fitted models (``'kmeans'``, ``'hdbscan'``), their
            metrics (``'kmeans_metrics'``, ``'hdbscan_metrics'``), the cluster
            labels, the feature names and ``'save_status'`` (the value returned
            by ``save_models``).

        Raises:
            RuntimeError: if the dataset cannot be loaded.
            Exception: any error from feature preparation or training is
                logged and re-raised.
        """
        logger.info("🚀 Starting Customer Segmentation Training Pipeline...")

        try:
            # Load data
            df = self.load_data()
            if df is None:
                raise RuntimeError("Failed to load data")

            # Prepare features (raises ValueError when nothing usable is found)
            X, feature_names = self.prepare_features(df)

            # Train K-means model
            kmeans_model, kmeans_scaler, kmeans_labels, kmeans_metrics = self.train_kmeans_model(X)

            # Train HDBSCAN model
            hdbscan_model, hdbscan_scaler, hdbscan_labels, hdbscan_metrics = self.train_hdbscan_model(X)

            # Keep the fitted artifacts available on the instance
            self.models = {'kmeans': kmeans_model, 'hdbscan': hdbscan_model}
            self.scalers = {'kmeans': kmeans_scaler, 'hdbscan': hdbscan_scaler}

            # Save models
            output_dir = self.save_models(
                kmeans_model, kmeans_scaler, hdbscan_model, hdbscan_scaler,
                kmeans_metrics, hdbscan_metrics, feature_names
            )

            logger.info("=" * 60)
            logger.info("🎉 CUSTOMER SEGMENTATION TRAINING COMPLETED!")
            logger.info("=" * 60)
            logger.info(f"📊 Trained 2 models on {len(X)} customers")
            logger.info(f"🔧 Used {len(feature_names)} features")
            if output_dir == "S3":
                logger.info(f"💾 Models saved to: {output_dir}")
            elif output_dir == "S3_partial":
                logger.warning("⚠️  Training succeeded but only some artifacts were uploaded to S3")
            else:
                logger.error("❌ Training succeeded but the models were NOT saved to S3")

            return {
                'kmeans': kmeans_model,
                'hdbscan': hdbscan_model,
                'kmeans_metrics': kmeans_metrics,
                'hdbscan_metrics': hdbscan_metrics,
                'kmeans_labels': kmeans_labels,
                'hdbscan_labels': hdbscan_labels,
                'feature_names': feature_names,
                'save_status': output_dir
            }

        except Exception as e:
            logger.error(f"❌ Error in training pipeline: {e}")
            raise

    def train_kmeans_model(self, X, n_clusters=5):
        """Train a K-means clustering model on standardized features.

        Args:
            X: feature matrix (as produced by ``prepare_features``).
            n_clusters: number of clusters to fit.

        Returns:
            ``(model, scaler, labels, metrics)`` where `metrics` holds
            ``silhouette_score``, ``calinski_harabasz_score`` and
            ``n_clusters`` as plain Python numbers.
        """
        logger.info(f"🎯 Training K-means model with {n_clusters} clusters...")

        # Scale features
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)

        # Train K-means model (fixed seed for reproducible segments)
        kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
        kmeans.fit(X_scaled)

        # Labels of the training points are already available after fitting
        cluster_labels = kmeans.labels_

        # Calculate metrics
        silhouette_avg, calinski_avg = self._score_clustering(X_scaled, cluster_labels)

        logger.info(f"✅ K-means training completed")
        logger.info(f"   Silhouette Score: {silhouette_avg:.4f}")
        logger.info(f"   Calinski-Harabasz Score: {calinski_avg:.4f}")

        return kmeans, scaler, cluster_labels, {
            'silhouette_score': silhouette_avg,
            'calinski_harabasz_score': calinski_avg,
            'n_clusters': n_clusters
        }

    def train_hdbscan_model(self, X, min_cluster_size=50, min_samples=10):
        """Train an HDBSCAN clustering model on standardized features.

        Quality metrics are computed on clustered points only (label ``-1``
        marks noise) and fall back to ``0.0`` when fewer than two clusters
        are found.

        Args:
            X: feature matrix (as produced by ``prepare_features``).
            min_cluster_size: passed through to ``hdbscan.HDBSCAN``.
            min_samples: passed through to ``hdbscan.HDBSCAN``.

        Returns:
            ``(model, scaler, labels, metrics)`` where `metrics` holds
            ``silhouette_score``, ``calinski_harabasz_score``, ``n_clusters``
            and ``noise_ratio`` as plain Python numbers.
        """
        logger.info(f"🎯 Training HDBSCAN model...")

        # Scale features
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)

        # Train HDBSCAN model
        hdbscan_model = hdbscan.HDBSCAN(
            min_cluster_size=min_cluster_size,
            min_samples=min_samples,
            metric='euclidean'
        )
        hdbscan_model.fit(X_scaled)

        # Get cluster labels
        cluster_labels = hdbscan_model.labels_

        # Calculate metrics (only for non-noise points)
        is_clustered = cluster_labels != -1
        valid_labels = cluster_labels[is_clustered]

        # The helper checks the number of clusters, not the number of points:
        # a single cluster plus noise would otherwise raise in scikit-learn.
        silhouette_avg, calinski_avg = self._score_clustering(
            X_scaled[is_clustered], valid_labels
        )

        n_clusters = int(np.unique(valid_labels).size)
        noise_ratio = float((~is_clustered).mean()) if len(cluster_labels) else 0.0

        logger.info(f"✅ HDBSCAN training completed")
        logger.info(f"   Clusters found: {n_clusters}")
        logger.info(f"   Noise ratio: {noise_ratio:.4f}")
        logger.info(f"   Silhouette Score: {silhouette_avg:.4f}")
        logger.info(f"   Calinski-Harabasz Score: {calinski_avg:.4f}")

        return hdbscan_model, scaler, cluster_labels, {
            'silhouette_score': silhouette_avg,
            'calinski_harabasz_score': calinski_avg,
            'n_clusters': n_clusters,
            'noise_ratio': noise_ratio
        }

    def save_models(self, kmeans_model, kmeans_scaler, hdbscan_model, hdbscan_scaler,
                    kmeans_metrics, hdbscan_metrics, feature_names):
        """Save trained models and metadata directly to S3.

        Uploads the two models, their scalers and a YAML report. Metrics are
        converted to plain Python types first so the report contains ordinary
        YAML scalars. The report dictionary is kept on ``self.metadata``.

        Returns:
            ``"S3"`` when every upload reported success, ``"S3_partial"`` when
            at least one did not, ``"failed"`` when an exception occurred
            (the error is logged, not raised).
        """
        logger.info("💾 Saving trained models directly to S3...")

        try:
            s3_manager = get_s3_manager()

            kmeans_metrics = self._to_builtin(kmeans_metrics)
            hdbscan_metrics = self._to_builtin(hdbscan_metrics)

            # (object, artifact name, descriptive info) in upload order
            artifacts = [
                (kmeans_model, 'kmeans_model',
                 {'model_type': 'kmeans', 'metrics': kmeans_metrics}),
                (kmeans_scaler, 'kmeans_scaler',
                 {'model_type': 'scaler', 'for_model': 'kmeans'}),
                (hdbscan_model, 'hdbscan_model',
                 {'model_type': 'hdbscan', 'metrics': hdbscan_metrics}),
                (hdbscan_scaler, 'hdbscan_scaler',
                 {'model_type': 'scaler', 'for_model': 'hdbscan'}),
            ]

            # NOTE(review): upload_* helpers are assumed to return a truthy
            # value on success - confirm against utils.s3_utils.
            upload_status = {}
            for artifact, artifact_name, info in artifacts:
                upload_status[artifact_name] = bool(s3_manager.upload_model_direct(
                    artifact, artifact_name, 'customer_segmentation', info
                ))

            # Save metadata directly to S3
            metadata = {
                'kmeans': kmeans_metrics,
                'hdbscan': hdbscan_metrics,
                'feature_names': self._to_builtin(list(feature_names)),
                'training_date': datetime.now().isoformat(),
                'model_versions': {
                    'kmeans': '1.0',
                    'hdbscan': '1.0'
                }
            }
            self.metadata = metadata

            upload_status['pipeline_report'] = bool(s3_manager.upload_bytes_direct(
                yaml.dump(metadata, default_flow_style=False).encode('utf-8'),
                'models/customer_segmentation/pipeline_report.yaml',
                'text/yaml'
            ))

            # Check success
            failed_uploads = [name for name, ok in upload_status.items() if not ok]
            if not failed_uploads:
                logger.info("✅ All models and metadata uploaded directly to S3")
                return "S3"

            logger.warning("⚠️  Some models failed to upload to S3")
            logger.warning(f"   Failed uploads: {failed_uploads}")
            return "S3_partial"

        except Exception as e:
            logger.error(f"❌ Failed to upload models to S3: {e}")
            return "failed"

In [3]:
# Entry point: build the pipeline, train both models and print a short summary
if __name__ == "__main__":
    pipeline = CustomerSegmentationPipeline()
    results = pipeline.run_training_pipeline()

    # Summary lines: headline, one line per model, closing note
    summary_lines = [
        "\n🎉 Customer Segmentation Training completed successfully!",
        f"📊 K-means: {results['kmeans_metrics']['n_clusters']} clusters, Silhouette: {results['kmeans_metrics']['silhouette_score']:.4f}",
        f"📊 HDBSCAN: {results['hdbscan_metrics']['n_clusters']} clusters, Noise: {results['hdbscan_metrics']['noise_ratio']:.4f}",
        "💾 Models saved and ready for inference!",
    ]
    for summary_line in summary_lines:
        print(summary_line)

INFO:__main__:🚀 Starting Customer Segmentation Training Pipeline...
INFO:__main__:🔍 Loading historical training data from S3...
INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials
INFO:utils.s3_utils:Loading historical training data from S3 with smart caching...
INFO:utils.s3_utils:Downloading s3://nuscale-data-services-public/amato_pm/data_pipelines/unified_dataset/output/recent_customer_dataset.parquet to data_pipelines/unified_dataset/output/recent_customer_dataset.parquet
INFO:utils.s3_utils:Downloading s3://nuscale-data-services-public/amato_pm/data_pipelines/unified_dataset/output/timeline_datasets_metadata.yaml to data_pipelines/unified_dataset/output/timeline_datasets_metadata.yaml
INFO:utils.s3_utils:Downloading s3://nuscale-data-services-public/amato_pm/data_pipelines/unified_dataset/output/unified_customer_dataset.parquet to data_pipelines/unified_dataset/output/unified_customer_dataset.parquet
INFO:__main__:✅ Historical training data l


🎉 Customer Segmentation Training completed successfully!
📊 K-means: 5 clusters, Silhouette: 0.2675
📊 HDBSCAN: 8 clusters, Noise: 0.0980
💾 Models saved and ready for inference!
