# AMATO Production - Journey Simulation ML Pipeline

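This notebook trains two Random Forest classifiers for journey simulation: one predicts a customer's journey stage and the other predicts conversion. Both targets are synthetic labels derived from the unified customer dataset (`frequency` for journey stage, `monetary_value` for conversion).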

**Author:** Data Science Team  
**Date:** 2024

In [7]:
# Import required libraries
import pandas as pd
import numpy as np
import yaml
import logging
import os
import sys
import joblib
from pathlib import Path
from datetime import datetime
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Make the project's `utils` package importable from a notebook kernel.
# The kernel may have been started in the project root or one/two levels
# below it, so each of those locations is probed (plus the location derived
# from __file__ when this code runs as a script).
possible_paths = [Path.cwd(), Path.cwd().parent, Path.cwd().parent.parent]
if '__file__' in globals():
    possible_paths.append(Path(__file__).parent.parent.parent)

# First candidate containing a `utils` folder wins; otherwise fall back to
# the current working directory and let the import below report the problem.
project_root = next(
    (candidate for candidate in possible_paths if (candidate / 'utils').exists()),
    None,
)
if project_root is None:
    project_root = Path.cwd()

sys.path.append(str(project_root))
print(f"🔧 Using project root: {project_root}")

try:
    from utils.s3_utils import get_s3_manager
except ImportError as e:
    print(f"❌ Failed to import utils.s3_utils: {e}")
    print("🔧 Trying alternative import...")
    # Second attempt: resolve `utils` relative to the current directory
    sys.path.append('.')
    try:
        from utils.s3_utils import get_s3_manager
    except ImportError as e2:
        print(f"❌ Alternative import also failed: {e2}")
        raise
    else:
        print("✅ Successfully imported with relative path")
else:
    print("✅ Successfully imported utils.s3_utils")

# Notebook-wide logger at INFO level
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

🔧 Using project root: /Users/priyankmavani/Desktop/apps/amato
✅ Successfully imported utils.s3_utils


## Journey Simulation Pipeline Class

In [8]:
class JourneySimulationPipeline:
    """Train, evaluate and persist the journey-simulation classifiers.

    Two Random Forest classifiers are trained on the unified customer dataset:

    * ``journey_stage`` - a stage in ``0..4`` derived from ``frequency``
    * ``conversion``    - a 0/1 flag derived from ``monetary_value``

    Both targets are *synthetic*: they are computed from columns that are also
    used as model inputs, so the reported accuracies mostly measure how well
    the forest reproduces that rule, not real predictive power.

    Attributes:
        models:   ``{'journey_stage': model, 'conversion': model}`` after a
                  successful ``run_training_pipeline``.
        scalers:  The fitted ``StandardScaler`` for each model, same keys.
        metadata: The report dictionary written by ``save_models``.
    """

    # Candidate model inputs; only columns present in the dataset are used.
    FEATURE_COLUMNS = (
        'recency_days', 'frequency', 'monetary_value',
        'avg_order_value', 'total_orders', 'days_since_first_order',
        'customer_lifetime_value', 'avg_days_between_orders',
        'order_count_30d', 'order_count_90d', 'order_count_365d',
        'revenue_30d', 'revenue_90d', 'revenue_365d',
    )
    # Column each synthetic target is derived from.
    TARGET_SOURCE_COLUMNS = {
        'journey_stage': 'frequency',
        'conversion': 'monetary_value',
    }
    MAX_JOURNEY_STAGE = 4
    MIN_EXPECTED_FEATURES = 5

    DATA_PATH = 'data_pipelines/unified_dataset/output/unified_customer_dataset.parquet'
    OUTPUT_DIR = 'models/journey_simulation'
    S3_PREFIX = "amato_pm/models/journey_simulation"

    TEST_SIZE = 0.2
    CV_FOLDS = 5
    RANDOM_STATE = 42

    # Class-owned logger so the class does not depend on a `logger` global
    # created by another notebook cell (same logger object, same name).
    _log = logging.getLogger(__name__)

    def __init__(self):
        self.models = {}
        self.scalers = {}
        self.metadata = {}

    @classmethod
    def _to_builtin(cls, value):
        """Recursively convert NumPy scalars/arrays and tuples to plain Python.

        ``yaml.dump`` serialises NumPy scalars as ``!!python/object/apply``
        tags, which ``yaml.safe_load`` refuses to read; plain floats/ints/lists
        keep the report portable.
        """
        if isinstance(value, dict):
            return {cls._to_builtin(k): cls._to_builtin(v) for k, v in value.items()}
        if isinstance(value, (list, tuple)):
            return [cls._to_builtin(item) for item in value]
        if isinstance(value, np.ndarray):
            return value.tolist()
        if isinstance(value, np.generic):
            return value.item()
        return value

    def load_data(self):
        """Sync the dataset from S3 and load it from the local parquet file.

        Returns:
            The dataset as a ``pandas.DataFrame``, or ``None`` when the S3
            sync fails, the parquet file is missing, or it cannot be read
            (the reason is logged).
        """
        try:
            self._log.info("📥 Loading historical training data from S3...")
            s3_manager = get_s3_manager()
            # NOTE(review): the helper is named "inference data" although it is
            # used here to fetch training data; presumably it downloads the
            # parquet to DATA_PATH - confirm against utils.s3_utils.
            s3_manager.load_inference_data_from_s3()
            self._log.info("✅ Historical training data loaded from S3")

            if not os.path.exists(self.DATA_PATH):
                self._log.error(f"❌ Historical training dataset not found at {self.DATA_PATH}")
                return None

            df = pd.read_parquet(self.DATA_PATH)
            self._log.info(f"✅ Loaded historical training dataset: {df.shape}")
            return df
        except Exception as e:
            # Best-effort loader: callers test for None instead of catching.
            self._log.error(f"❌ Failed to load data: {e}")
            return None

    def prepare_features(self, df, target_col):
        """Build the feature matrix and synthetic target for one model.

        Args:
            df: Customer-level DataFrame.
            target_col: ``'journey_stage'`` or ``'conversion'``.

        Returns:
            ``(X, y, feature_names)`` where ``X`` is the median-imputed,
            IQR-clipped feature frame, ``y`` the integer target Series and
            ``feature_names`` the list of columns in ``X``. Returns
            ``(None, None, None)`` when the target is unknown, ``df`` is
            empty, or the column the target is derived from is missing.
        """
        self._log.info(f"🔧 Preparing features for {target_col} prediction...")

        source_col = self.TARGET_SOURCE_COLUMNS.get(target_col)
        if source_col is None:
            self._log.error(f"❌ Unknown target column: {target_col}")
            return None, None, None

        if df is None or df.empty:
            self._log.error(f"❌ No customer rows available for {target_col}")
            return None, None, None

        available_features = [col for col in self.FEATURE_COLUMNS if col in df.columns]
        if not available_features:
            self._log.error(f"❌ No usable feature columns found for {target_col}")
            return None, None, None

        if source_col not in df.columns:
            self._log.error(
                f"❌ Column '{source_col}' required to derive {target_col} is missing"
            )
            return None, None, None

        if len(available_features) < self.MIN_EXPECTED_FEATURES:
            self._log.warning(
                f"⚠️  Only {len(available_features)} features available for {target_col}"
            )

        if source_col in available_features:
            # The label is a deterministic function of one of the inputs.
            self._log.warning(
                f"⚠️  Target '{target_col}' is derived from feature '{source_col}'; "
                "reported metrics will be optimistic"
            )

        source = df[source_col]
        if target_col == 'journey_stage':
            # stage = frequency // 2, limited to 0..MAX_JOURNEY_STAGE. Missing
            # frequencies get the median so they do not abort the whole run.
            fallback = source.median()
            if pd.isna(fallback):
                fallback = 0
            y = (
                np.floor(source.fillna(fallback) / 2)
                .clip(lower=0, upper=self.MAX_JOURNEY_STAGE)
                .astype(int)
            )
        else:
            # converted = spends more than the median customer; a missing
            # value compares False and is therefore labelled 0.
            y = (source > source.median()).astype(int)
        y = y.rename(target_col)

        # Median-impute; columns that are entirely NaN have no median -> 0.
        X = df[available_features].copy()
        X = X.fillna(X.median()).fillna(0)

        # Limit outliers by clipping each feature to the 1.5 * IQR fences
        # (rows are kept, only the extreme values are capped).
        for col in X.columns:
            q1 = X[col].quantile(0.25)
            q3 = X[col].quantile(0.75)
            iqr = q3 - q1
            X[col] = X[col].clip(q1 - 1.5 * iqr, q3 + 1.5 * iqr)

        self._log.info(
            f"✅ Prepared {len(X)} customers with {len(X.columns)} features for {target_col}"
        )
        return X, y, available_features

    def _train_classifier(self, X, y, label, n_estimators, max_depth):
        """Fit and evaluate one scaled Random Forest classifier.

        Shared implementation behind both public ``train_*`` methods; ``label``
        is only used in log messages.
        """
        self._log.info(f"🎯 Training {label} prediction model...")

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=self.TEST_SIZE, random_state=self.RANDOM_STATE
        )

        # Scaler is fitted on the training split only and returned so that
        # inference can apply the same transformation.
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        model = RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            random_state=self.RANDOM_STATE,
            n_jobs=-1,
        )
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)

        # cross_val_score clones the estimator, so the fitted model above is
        # not modified by the cross-validation run.
        cv_scores = cross_val_score(
            model, X_train_scaled, y_train, cv=self.CV_FOLDS, scoring='accuracy'
        )

        # Plain floats keep the YAML report free of NumPy-specific tags.
        accuracy = float(accuracy_score(y_test, y_pred))
        f1 = float(f1_score(y_test, y_pred, average='weighted'))
        cv_mean = float(cv_scores.mean())
        cv_std = float(cv_scores.std())

        self._log.info(f"✅ {label.capitalize()} prediction training completed")
        self._log.info(f"   Accuracy: {accuracy:.4f}")
        self._log.info(f"   F1 Score: {f1:.4f}")
        self._log.info(f"   CV Accuracy: {cv_mean:.4f} (+/- {cv_std * 2:.4f})")

        return model, scaler, {
            'accuracy': accuracy,
            'f1_score': f1,
            'cv_accuracy_mean': cv_mean,
            'cv_accuracy_std': cv_std,
            'n_estimators': n_estimators,
            'max_depth': max_depth,
        }

    def train_journey_stage_model(self, X, y, n_estimators=100, max_depth=10):
        """Train the journey stage classifier.

        Returns:
            ``(model, scaler, metrics)``; ``metrics`` holds accuracy, weighted
            F1, cross-validation mean/std and the hyper-parameters used.
        """
        return self._train_classifier(X, y, 'journey stage', n_estimators, max_depth)

    def train_conversion_model(self, X, y, n_estimators=100, max_depth=10):
        """Train the conversion classifier.

        Returns:
            ``(model, scaler, metrics)``; ``metrics`` holds accuracy, weighted
            F1, cross-validation mean/std and the hyper-parameters used.
        """
        return self._train_classifier(X, y, 'conversion', n_estimators, max_depth)

    def save_models(self, journey_model, journey_scaler, conversion_model, conversion_scaler,
                    journey_metrics, conversion_metrics, feature_names):
        """Persist models, scalers and a YAML report locally, then upload to S3.

        The S3 upload is best-effort: a failure is logged as a warning and the
        local artefacts are kept.

        Returns:
            The local output directory.
        """
        self._log.info("💾 Saving trained models...")

        output_dir = self.OUTPUT_DIR
        os.makedirs(output_dir, exist_ok=True)

        # Insertion order is also the upload order.
        artifacts = {
            'journey_stage_model.pkl': journey_model,
            'journey_stage_scaler.pkl': journey_scaler,
            'conversion_prediction_model.pkl': conversion_model,
            'conversion_prediction_scaler.pkl': conversion_scaler,
        }
        saved_paths = []
        for filename, artifact in artifacts.items():
            artifact_path = os.path.join(output_dir, filename)
            joblib.dump(artifact, artifact_path)
            saved_paths.append(artifact_path)

        # Normalise to built-in types so the report stays loadable with
        # yaml.safe_load even if callers pass NumPy scalars in the metrics.
        metadata = self._to_builtin({
            'journey_stage': journey_metrics,
            'conversion': conversion_metrics,
            'feature_names': feature_names,
            'training_date': datetime.now().isoformat(),
            'model_versions': {
                'journey_stage': '1.0',
                'conversion': '1.0',
            },
        })
        self.metadata = metadata

        metadata_path = os.path.join(output_dir, 'pipeline_report.yaml')
        with open(metadata_path, 'w', encoding='utf-8') as f:
            yaml.dump(metadata, f, default_flow_style=False)
        saved_paths.append(metadata_path)

        try:
            s3_manager = get_s3_manager()
            for local_path in saved_paths:
                s3_manager.upload_file(local_path, self.S3_PREFIX)
            self._log.info("✅ Models uploaded to S3")
        except Exception as e:
            self._log.warning(f"⚠️  Failed to upload models to S3: {e}")

        self._log.info(f"✅ Models saved to {output_dir}")
        return output_dir

    def run_training_pipeline(self):
        """Run load -> prepare -> train -> save for both models.

        Returns:
            Dict with the fitted models (``'journey_stage'``, ``'conversion'``)
            and their metrics (``'journey_metrics'``, ``'conversion_metrics'``).

        Raises:
            RuntimeError: If the data cannot be loaded or features cannot be
                prepared. Any other error from training/saving is logged and
                re-raised unchanged.
        """
        self._log.info("🚀 Starting Journey Simulation Training Pipeline...")

        try:
            df = self.load_data()
            if df is None:
                raise RuntimeError("Failed to load data")

            X_journey, y_journey, journey_features = self.prepare_features(df, 'journey_stage')
            if X_journey is None:
                raise RuntimeError("Failed to prepare journey stage features")

            X_conversion, y_conversion, conversion_features = self.prepare_features(df, 'conversion')
            if X_conversion is None:
                raise RuntimeError("Failed to prepare conversion features")

            journey_model, journey_scaler, journey_metrics = self.train_journey_stage_model(
                X_journey, y_journey
            )
            conversion_model, conversion_scaler, conversion_metrics = self.train_conversion_model(
                X_conversion, y_conversion
            )

            # Keep the fitted objects on the instance for interactive use.
            self.models = {'journey_stage': journey_model, 'conversion': conversion_model}
            self.scalers = {'journey_stage': journey_scaler, 'conversion': conversion_scaler}

            output_dir = self.save_models(
                journey_model, journey_scaler, conversion_model, conversion_scaler,
                journey_metrics, conversion_metrics, {
                    'journey_stage': journey_features,
                    'conversion': conversion_features,
                }
            )

            self._log.info("=" * 60)
            self._log.info("🎉 JOURNEY SIMULATION TRAINING COMPLETED!")
            self._log.info("=" * 60)
            self._log.info(f"📊 Trained 2 models on {len(df)} customers")
            self._log.info(
                f"🔧 Journey Stage features: {len(journey_features)}, "
                f"Conversion features: {len(conversion_features)}"
            )
            self._log.info(f"💾 Models saved to: {output_dir}")

            return {
                'journey_stage': journey_model,
                'conversion': conversion_model,
                'journey_metrics': journey_metrics,
                'conversion_metrics': conversion_metrics,
            }

        except Exception as e:
            self._log.error(f"❌ Error in training pipeline: {e}")
            raise

## Run the Pipeline

In [9]:
# Train both models end to end and print a short summary of the metrics
if __name__ == "__main__":
    pipeline = JourneySimulationPipeline()
    results = pipeline.run_training_pipeline()

    journey_summary = results['journey_metrics']
    conversion_summary = results['conversion_metrics']

    print("\n🎉 Journey Simulation Training completed successfully!")
    print(f"📊 Journey Stage: Accuracy = {journey_summary['accuracy']:.4f}, CV Accuracy = {journey_summary['cv_accuracy_mean']:.4f}")
    print(f"📊 Conversion: Accuracy = {conversion_summary['accuracy']:.4f}, CV Accuracy = {conversion_summary['cv_accuracy_mean']:.4f}")
    print("💾 Models saved and ready for inference!")

INFO:__main__:🚀 Starting Journey Simulation Training Pipeline...
INFO:__main__:�� Loading historical training data from S3...
INFO:utils.s3_utils:Loading recent inference data from S3 (last 3 months)...
INFO:utils.s3_utils:Loading data newer than 2025-06-03
INFO:utils.s3_utils:Downloading s3://nuscale-data-services-public/amato_pm/data_pipelines/unified_dataset/output//unified_customer_dataset.parquet to data_pipelines/unified_dataset/output/unified_customer_dataset.parquet
INFO:utils.s3_utils:Downloading s3://nuscale-data-services-public/amato_pm/data_pipelines/unified_dataset/output//unified_dataset_report.yaml to data_pipelines/unified_dataset/output/unified_dataset_report.yaml
INFO:utils.s3_utils:Downloading s3://nuscale-data-services-public/amato_pm/data_pipelines/unified_dataset/output//unified_dataset_summary.yaml to data_pipelines/unified_dataset/output/unified_dataset_summary.yaml
INFO:utils.s3_utils:Downloading s3://nuscale-data-services-public/amato_pm/data_pipelines/unified


🎉 Journey Simulation Training completed successfully!
📊 Journey Stage: Accuracy = 0.9823, CV Accuracy = 0.9811
📊 Conversion: Accuracy = 0.9994, CV Accuracy = 1.0000
💾 Models saved and ready for inference!
