# AMATO Production - Forecasting ML Pipeline

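This notebook trains Random Forest forecasting models for CTR and revenue using customer behavioral data. Note that both targets are synthetic proxies computed from `frequency` and `monetary_value` (no observed CTR or future revenue is used), so the reported R² values measure how well the models recover those formulas rather than real-world forecasting accuracy.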

**Author:** Data Science Team  
**Date:** 2024

In [1]:
# Import required libraries
import pandas as pd
import numpy as np
import yaml
import logging
import os
import sys
import joblib
from pathlib import Path
from datetime import datetime
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Add project root to path for imports
# Try multiple possible paths for Jupyter notebook compatibility
possible_paths = [
    Path.cwd(),  # Current working directory
    Path.cwd().parent,  # Parent of current directory
    Path.cwd().parent.parent,  # Grandparent of current directory
    Path(__file__).parent.parent.parent if '__file__' in globals() else None  # If __file__ exists
]

# Filter out None values and find the one with utils folder
project_root = None
for path in possible_paths:
    if path and (path / 'utils').exists():
        project_root = path
        break

if project_root is None:
    # Fallback: use current directory and hope for the best
    project_root = Path.cwd()

sys.path.append(str(project_root))
print(f"🔧 Using project root: {project_root}")

try:
    from utils.s3_utils import get_s3_manager
    print("✅ Successfully imported utils.s3_utils")
except ImportError as e:
    print(f"❌ Failed to import utils.s3_utils: {e}")
    print("🔧 Trying alternative import...")
    try:
        # Try relative import
        sys.path.append('.')
        from utils.s3_utils import get_s3_manager
        print("✅ Successfully imported with relative path")
    except ImportError as e2:
        print(f"❌ Alternative import also failed: {e2}")
        raise

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

🔧 Using project root: /Users/priyankmavani/Desktop/apps/amato
✅ Successfully imported utils.s3_utils


## Forecasting Pipeline Class

In [2]:
class ForecastingPipeline:
    def __init__(self):
        self.models = {}
        self.scalers = {}
        self.metadata = {}
        
    def load_data(self):
        """Load customer data for forecasting from S3"""
        try:
            # Load historical training data from S3
            logger.info("🔍 Loading historical training data from S3...")
            s3_manager = get_s3_manager()
            s3_manager.load_inference_data_from_s3()
            logger.info("✅ Historical training data loaded from S3")
            
            # Load the historical dataset
            data_path = 'data_pipelines/unified_dataset/output/unified_customer_dataset.parquet'
            if os.path.exists(data_path):
                df = pd.read_parquet(data_path)
                logger.info(f"✅ Loaded historical training dataset: {df.shape}")
                return df
            else:
                logger.error(f"❌ Historical training dataset not found at {data_path}")
                return None
        except Exception as e:
            logger.error(f"❌ Failed to load data: {e}")
            return None
    
    def prepare_features(self, df, target_col):
        """Prepare features for forecasting"""
        logger.info(f"🔧 Preparing features for {target_col} forecasting...")
        
        # Select features based on target
        if target_col == 'ctr':
            feature_columns = [
                'recency_days', 'frequency', 'monetary_value',
                'avg_order_value', 'total_orders', 'days_since_first_order',
                'customer_lifetime_value', 'avg_days_between_orders',
                'order_count_30d', 'order_count_90d', 'order_count_365d',
                'revenue_30d', 'revenue_90d', 'revenue_365d'
            ]
        elif target_col == 'revenue':
            feature_columns = [
                'recency_days', 'frequency', 'monetary_value',
                'avg_order_value', 'total_orders', 'days_since_first_order',
                'customer_lifetime_value', 'avg_days_between_orders',
                'order_count_30d', 'order_count_90d', 'order_count_365d',
                'revenue_30d', 'revenue_90d', 'revenue_365d'
            ]
        else:
            logger.error(f"❌ Unknown target column: {target_col}")
            return None, None, None
        
        # Filter available features
        available_features = [col for col in feature_columns if col in df.columns]
        
        if len(available_features) < 5:
            logger.warning(f"⚠️  Only {len(available_features)} features available for {target_col}")
            
        # Create feature matrix
        X = df[available_features].copy()
        
        # Create synthetic target variables based on available data
        if target_col == 'revenue':
            # Create revenue forecasting target based on monetary value and frequency
            y = df.apply(lambda row: 
                row['monetary_value'] * (1 + row['frequency'] / 10), axis=1)
        elif target_col == 'ctr':
            # Create CTR forecasting target based on engagement metrics
            y = df.apply(lambda row: 
                min(0.15, max(0.01, 0.05 * (1 + row['frequency'] / 10))), axis=1)
        else:
            y = None
        
        # Handle missing values
        X = X.fillna(X.median())
        if y is not None:
            y = y.fillna(y.median())
        
        # Remove outliers using IQR method
        for col in X.columns:
            Q1 = X[col].quantile(0.25)
            Q3 = X[col].quantile(0.75)
            IQR = Q3 - Q1
            lower_bound = Q1 - 1.5 * IQR
            upper_bound = Q3 + 1.5 * IQR
            X[col] = X[col].clip(lower_bound, upper_bound)
        
        logger.info(f"✅ Prepared {len(X)} customers with {len(X.columns)} features for {target_col}")
        return X, y, available_features
    
    def train_ctr_model(self, X, y, n_estimators=100, max_depth=10):
        """Train CTR forecasting model"""
        logger.info(f"🎯 Training CTR forecasting model...")
        
        # Split data
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        
        # Scale features
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)
        
        # Train Random Forest model
        model = RandomForestRegressor(
            n_estimators=n_estimators,
            max_depth=max_depth,
            random_state=42,
            n_jobs=-1
        )
        model.fit(X_train_scaled, y_train)
        
        # Make predictions
        y_pred = model.predict(X_test_scaled)
        
        # Calculate metrics
        mse = mean_squared_error(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        
        # Cross-validation score
        cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring='r2')
        
        logger.info(f"✅ CTR forecasting training completed")
        logger.info(f"   MSE: {mse:.6f}")
        logger.info(f"   MAE: {mae:.6f}")
        logger.info(f"   R²: {r2:.4f}")
        logger.info(f"   CV R²: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")
        
        return model, scaler, {
            'mse': mse,
            'mae': mae,
            'r2': r2,
            'cv_r2_mean': cv_scores.mean(),
            'cv_r2_std': cv_scores.std(),
            'n_estimators': n_estimators,
            'max_depth': max_depth
        }
    
    def train_revenue_model(self, X, y, n_estimators=100, max_depth=10):
        """Train revenue forecasting model"""
        logger.info(f"🎯 Training revenue forecasting model...")
        
        # Split data
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        
        # Scale features
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)
        
        # Train Random Forest model
        model = RandomForestRegressor(
            n_estimators=n_estimators,
            max_depth=max_depth,
            random_state=42,
            n_jobs=-1
        )
        model.fit(X_train_scaled, y_train)
        
        # Make predictions
        y_pred = model.predict(X_test_scaled)
        
        # Calculate metrics
        mse = mean_squared_error(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        
        # Cross-validation score
        cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring='r2')
        
        logger.info(f"✅ Revenue forecasting training completed")
        logger.info(f"   MSE: {mse:.6f}")
        logger.info(f"   MAE: {mae:.6f}")
        logger.info(f"   R²: {r2:.4f}")
        logger.info(f"   CV R²: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")
        
        return model, scaler, {
            'mse': mse,
            'mae': mae,
            'r2': r2,
            'cv_r2_mean': cv_scores.mean(),
            'cv_r2_std': cv_scores.std(),
            'n_estimators': n_estimators,
            'max_depth': max_depth
        }
    
    def save_models(self, ctr_model, ctr_scaler, revenue_model, revenue_scaler, 
                    ctr_metrics, revenue_metrics, feature_names):
        """Save trained models and metadata"""
        logger.info("💾 Saving trained models...")
        
        # Create output directory
        output_dir = 'models/forecasting'
        os.makedirs(output_dir, exist_ok=True)
        
        # Save CTR model
        ctr_path = os.path.join(output_dir, 'ctr_forecasting_model.pkl')
        joblib.dump(ctr_model, ctr_path)
        
        # Save CTR scaler
        ctr_scaler_path = os.path.join(output_dir, 'ctr_forecasting_scaler.pkl')
        joblib.dump(ctr_scaler, ctr_scaler_path)
        
        # Save revenue model
        revenue_path = os.path.join(output_dir, 'revenue_forecasting_model.pkl')
        joblib.dump(revenue_model, revenue_path)
        
        # Save revenue scaler
        revenue_scaler_path = os.path.join(output_dir, 'revenue_forecasting_scaler.pkl')
        joblib.dump(revenue_scaler, revenue_scaler_path)
        
        # Save metadata
        metadata = {
            'ctr': ctr_metrics,
            'revenue': revenue_metrics,
            'feature_names': feature_names,
            'training_date': datetime.now().isoformat(),
            'model_versions': {
                'ctr': '1.0',
                'revenue': '1.0'
            }
        }
        
        metadata_path = os.path.join(output_dir, 'pipeline_report.yaml')
        with open(metadata_path, 'w') as f:
            yaml.dump(metadata, f, default_flow_style=False)
        
        # Upload to S3
        try:
            s3_manager = get_s3_manager()
            s3_manager.upload_file(ctr_path, "amato_pm/models/forecasting")
            s3_manager.upload_file(ctr_scaler_path, "amato_pm/models/forecasting")
            s3_manager.upload_file(revenue_path, "amato_pm/models/forecasting")
            s3_manager.upload_file(revenue_scaler_path, "amato_pm/models/forecasting")
            s3_manager.upload_file(metadata_path, "amato_pm/models/forecasting")
            logger.info("✅ Models uploaded to S3")
        except Exception as e:
            logger.warning(f"⚠️  Failed to upload models to S3: {e}")
        
        logger.info(f"✅ Models saved to {output_dir}")
        return output_dir
    
    def run_training_pipeline(self):
        """Run the complete forecasting training pipeline"""
        logger.info("🚀 Starting Forecasting Training Pipeline...")
        
        try:
            # Load data
            df = self.load_data()
            if df is None:
                raise Exception("Failed to load data")
            
            # Prepare features for CTR forecasting
            X_ctr, y_ctr, ctr_features = self.prepare_features(df, 'ctr')
            if X_ctr is None:
                raise Exception("Failed to prepare CTR features")
            
            # Prepare features for revenue forecasting
            X_revenue, y_revenue, revenue_features = self.prepare_features(df, 'revenue')
            if X_revenue is None:
                raise Exception("Failed to prepare revenue features")
            
            # Train CTR model
            ctr_model, ctr_scaler, ctr_metrics = self.train_ctr_model(X_ctr, y_ctr)
            
            # Train revenue model
            revenue_model, revenue_scaler, revenue_metrics = self.train_revenue_model(X_revenue, y_revenue)
            
            # Save models
            output_dir = self.save_models(
                ctr_model, ctr_scaler, revenue_model, revenue_scaler,
                ctr_metrics, revenue_metrics, {
                    'ctr': ctr_features,
                    'revenue': revenue_features
                }
            )
            
            logger.info("=" * 60)
            logger.info("🎉 FORECASTING TRAINING COMPLETED!")
            logger.info("=" * 60)
            logger.info(f"📊 Trained 2 models on {len(df)} customers")
            logger.info(f"🔧 CTR features: {len(ctr_features)}, Revenue features: {len(revenue_features)}")
            logger.info(f"💾 Models saved to: {output_dir}")
            
            return {
                'ctr': ctr_model,
                'revenue': revenue_model,
                'ctr_metrics': ctr_metrics,
                'revenue_metrics': revenue_metrics
            }
            
        except Exception as e:
            logger.error(f"❌ Error in training pipeline: {e}")
            raise

## Run the Pipeline

In [3]:
# Initialize and run the pipeline
if __name__ == "__main__":
    pipeline = ForecastingPipeline()
    results = pipeline.run_training_pipeline()
    
    print("\n🎉 Forecasting Training completed successfully!")
    print(f"📊 CTR: R² = {results['ctr_metrics']['r2']:.4f}, CV R² = {results['ctr_metrics']['cv_r2_mean']:.4f}")
    print(f"📊 Revenue: R² = {results['revenue_metrics']['r2']:.4f}, CV R² = {results['revenue_metrics']['cv_r2_mean']:.4f}")
    print("💾 Models saved and ready for inference!")

INFO:__main__:🚀 Starting Forecasting Training Pipeline...
INFO:__main__:🔍 Loading historical training data from S3...
INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials
INFO:utils.s3_utils:Loading recent inference data from S3 with smart caching (last 3 months)...
INFO:utils.s3_utils:Loading data newer than 2025-06-06
INFO:utils.s3_utils:Downloading s3://nuscale-data-services-public/amato_pm/data_pipelines/unified_dataset/output/recent_customer_dataset.parquet to data_pipelines/unified_dataset/output/recent_customer_dataset.parquet
INFO:utils.s3_utils:Downloading s3://nuscale-data-services-public/amato_pm/data_pipelines/unified_dataset/output/timeline_datasets_metadata.yaml to data_pipelines/unified_dataset/output/timeline_datasets_metadata.yaml
INFO:utils.s3_utils:Downloading s3://nuscale-data-services-public/amato_pm/data_pipelines/unified_dataset/output/unified_customer_dataset.parquet to data_pipelines/unified_dataset/output/unified_customer_


🎉 Forecasting Training completed successfully!
📊 CTR: R² = 0.9988, CV R² = 0.9991
📊 Revenue: R² = 0.9842, CV R² = 0.9912
💾 Models saved and ready for inference!
