In [4]:
import pandas as pd
import numpy as np
import yaml
import logging
import os
import sys
import joblib
from pathlib import Path
from datetime import datetime
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots


# Candidate locations for the project root, i.e. the directory that contains
# the `utils/` package imported further down.
possible_paths = [
    Path.cwd(),  # Current working directory
    Path.cwd().parent,  # Parent of current directory
    Path.cwd().parent.parent,  # Grandparent of current directory
    # __file__ is not defined in every execution context (e.g. interactive
    # sessions), so this candidate is None when it is unavailable.
    Path(__file__).parent.parent.parent if '__file__' in globals() else None
]

# Skip None candidates and pick the first one that has a utils folder
project_root = None
for path in possible_paths:
    if path is not None and (path / 'utils').exists():
        project_root = path
        break

# Add project root to path for imports. Only do so when a root was actually
# found: str(None) would put the literal entry 'None' on sys.path. The
# membership check keeps repeated runs from stacking duplicate entries.
if project_root is not None:
    if str(project_root) not in sys.path:
        sys.path.append(str(project_root))
else:
    # The logging module is deliberately not used here: it is configured
    # later in the file, and logging through the root logger first would
    # pre-empt that configuration.
    print(
        "Could not locate a project root containing 'utils/'; "
        "importing utils.s3_utils will likely fail.",
        file=sys.stderr,
    )

from utils.s3_utils import get_s3_manager

# Setup logging
# INFO level so that the progress messages emitted through logger.info()
# in the pipeline methods below are shown.
logging.basicConfig(level=logging.INFO)
# Module-level logger shared by all CampaignOptimizationPipeline methods.
logger = logging.getLogger(__name__)

class CampaignOptimizationPipeline:
    def __init__(self):
        """Initialize the Campaign Optimization Pipeline"""
        self.models = {}
        self.scalers = {}
        self.metadata = {}
        
    def load_data(self):
        """Fetch the historical training data and return it as a DataFrame.

        First asks the S3 manager to load the training data, then reads the
        unified customer dataset from its local parquet path (presumably the
        file the S3 step materializes -- confirm against utils.s3_utils).

        Returns:
            The dataset as a pandas DataFrame, or None when the parquet file
            is missing or any step raises.
        """
        dataset_file = 'data_pipelines/unified_dataset/output/unified_customer_dataset.parquet'

        try:
            # Pull the historical training data through the S3 manager
            logger.info("🔍 Loading historical training data from S3...")
            get_s3_manager().load_training_data_from_s3()
            logger.info("✅ Historical training data loaded from S3")

            # Guard clause: nothing to read if the parquet file is absent
            if not os.path.exists(dataset_file):
                logger.error(f"❌ Historical training dataset not found at {dataset_file}")
                return None

            frame = pd.read_parquet(dataset_file)
            logger.info(f"✅ Loaded historical training dataset: {frame.shape}")
            logger.info("📅 This dataset contains historical data for model training")
            return frame
        except Exception as e:
            logger.error(f"❌ Failed to load historical training data: {e}")
            return None
    
    def prepare_features(self, df, target_col):
        """Prepare features for campaign optimization"""
        logger.info(f"🔧 Preparing features for {target_col}...")
        
        # Select features based on target - use columns that actually exist
        if target_col == 'campaign_success':
            feature_columns = [
                'recency_days', 'frequency', 'monetary_value',
                'avg_order_value', 'customer_lifetime_value',
                'campaign_count', 'avg_roas', 'avg_ctr', 'total_campaign_revenue',
                'campaign_response_rate', 'avg_ctr_lift', 'rfm_score',
                'total_sessions', 'total_events',
                'conversion_probability', 'churn_risk', 'upsell_potential'
            ]
        elif target_col == 'budget_optimization':
            feature_columns = [
                'recency_days', 'frequency', 'monetary_value',
                'avg_order_value', 'customer_lifetime_value',
                'campaign_count', 'avg_roas', 'total_campaign_revenue',
                'rfm_score', 'conversion_probability'
            ]
        else:
            logger.error(f"❌ Unknown target column: {target_col}")
            return None, None
        
        # Filter available features
        available_features = [col for col in feature_columns if col in df.columns]
        
        if len(available_features) < 5:
            logger.warning(f"⚠️  Only {len(available_features)} features available for {target_col}")
            
        # Create feature matrix
        X = df[available_features].copy()
        
        # Handle missing values
        X = X.fillna(X.median())
        
        # Remove outliers using IQR method
        for col in X.columns:
            Q1 = X[col].quantile(0.25)
            Q3 = X[col].quantile(0.75)
            IQR = Q3 - Q1
            lower_bound = Q1 - 1.5 * IQR
            upper_bound = Q3 + 1.5 * IQR
            X[col] = X[col].clip(lower_bound, upper_bound)
        
        # Create synthetic target variable if it doesn't exist
        if target_col == 'campaign_success':
            if 'campaign_success' not in df.columns:
                # Create synthetic success based on campaign performance
                y = ((df['avg_roas'] > df['avg_roas'].median()) & 
                     (df['avg_ctr'] > df['avg_ctr'].median())).astype(int)
                logger.info("🔧 Created synthetic campaign_success target")
            else:
                y = df['campaign_success']
        elif target_col == 'budget_optimization':
            if 'optimal_budget' not in df.columns:
                # Create synthetic optimal budget based on customer value
                y = df['customer_lifetime_value'] * 0.1  # 10% of CLV
                logger.info("🔧 Created synthetic optimal_budget target")
            else:
                y = df['optimal_budget']
        
        logger.info(f"✅ Prepared {len(X)} customers with {len(available_features)} features for {target_col}")
        return X, y
    
    def train_campaign_success_model(self, X, y):
        """Train the campaign success classifier and record its artifacts.

        Splits the data 80/20, standardizes the features with a scaler fitted
        on the training split only, fits a 100-tree random forest classifier,
        and evaluates it on the held-out split plus a 5-fold cross-validation
        over the training split. The model, scaler and metadata are stored on
        the instance under the key ``'campaign_success'``.

        Args:
            X: Feature matrix; must expose ``.columns`` because the column
                names are recorded in the metadata.
            y: Target labels aligned with ``X``.

        Returns:
            Tuple ``(model, scaler, accuracy, f1)``, or ``(None, None, 0, 0)``
            if any step raises.
        """
        logger.info("🎯 Training Campaign Success model...")

        try:
            # Split data
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

            # Scale features (fit on the training split only)
            scaler = StandardScaler()
            X_train_scaled = scaler.fit_transform(X_train)
            X_test_scaled = scaler.transform(X_test)

            # Train model
            model = RandomForestClassifier(n_estimators=100, random_state=42)
            model.fit(X_train_scaled, y_train)

            # Evaluate. Metrics are cast to builtin floats: numpy scalars in
            # the metadata would be written by yaml.dump as Python-specific
            # object tags that yaml.safe_load cannot read back.
            y_pred = model.predict(X_test_scaled)
            accuracy = float(accuracy_score(y_test, y_pred))
            f1 = float(f1_score(y_test, y_pred, average='weighted'))

            # Cross-validation
            cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=5)
            cv_mean = float(cv_scores.mean())
            cv_std = float(cv_scores.std())

            logger.info("✅ Campaign Success model trained successfully")
            logger.info(f"   Accuracy: {accuracy:.4f}")
            logger.info(f"   F1 Score: {f1:.4f}")
            logger.info(f"   CV Score: {cv_mean:.4f} (+/- {cv_std * 2:.4f})")

            # Store model and scaler
            self.models['campaign_success'] = model
            self.scalers['campaign_success'] = scaler

            # Store metadata (plain Python types only, so it serializes cleanly)
            self.metadata['campaign_success'] = {
                'feature_columns': [str(col) for col in X.columns],
                'model_type': 'RandomForestClassifier',
                'training_date': datetime.now().isoformat(),
                'accuracy': accuracy,
                'f1_score': f1,
                'cv_score_mean': cv_mean,
                'cv_score_std': cv_std
            }

            return model, scaler, accuracy, f1

        except Exception as e:
            logger.error(f"❌ Failed to train Campaign Success model: {e}")
            return None, None, 0, 0
    
    def train_budget_optimization_model(self, X, y):
        """Train the budget optimization regressor and record its artifacts.

        Splits the data 80/20, standardizes the features with a scaler fitted
        on the training split only, fits a 100-tree random forest regressor,
        and evaluates it on the held-out split plus a 5-fold R² 
        cross-validation over the training split. The model, scaler and
        metadata are stored on the instance under ``'budget_optimization'``.

        Args:
            X: Feature matrix; must expose ``.columns`` because the column
                names are recorded in the metadata.
            y: Numeric target aligned with ``X``.

        Returns:
            Tuple ``(model, scaler, mse, r2)``, or ``(None, None, 0, 0)`` if
            any step raises.
        """
        logger.info("🎯 Training Budget Optimization model...")

        try:
            # Split data
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

            # Scale features (fit on the training split only)
            scaler = StandardScaler()
            X_train_scaled = scaler.fit_transform(X_train)
            X_test_scaled = scaler.transform(X_test)

            # Train model
            model = RandomForestRegressor(n_estimators=100, random_state=42)
            model.fit(X_train_scaled, y_train)

            # Evaluate. Metrics are cast to builtin floats: numpy scalars in
            # the metadata would be written by yaml.dump as Python-specific
            # object tags that yaml.safe_load cannot read back.
            y_pred = model.predict(X_test_scaled)
            mse = float(mean_squared_error(y_test, y_pred))
            mae = float(mean_absolute_error(y_test, y_pred))
            r2 = float(r2_score(y_test, y_pred))

            # Cross-validation
            cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring='r2')
            cv_mean = float(cv_scores.mean())
            cv_std = float(cv_scores.std())

            logger.info("✅ Budget Optimization model trained successfully")
            logger.info(f"   MSE: {mse:.4f}")
            logger.info(f"   MAE: {mae:.4f}")
            logger.info(f"   R²: {r2:.4f}")
            logger.info(f"   CV R²: {cv_mean:.4f} (+/- {cv_std * 2:.4f})")

            # Store model and scaler
            self.models['budget_optimization'] = model
            self.scalers['budget_optimization'] = scaler

            # Store metadata (plain Python types only, so it serializes cleanly)
            self.metadata['budget_optimization'] = {
                'feature_columns': [str(col) for col in X.columns],
                'model_type': 'RandomForestRegressor',
                'training_date': datetime.now().isoformat(),
                'mse': mse,
                'mae': mae,
                'r2_score': r2,
                'cv_r2_mean': cv_mean,
                'cv_r2_std': cv_std
            }

            return model, scaler, mse, r2

        except Exception as e:
            logger.error(f"❌ Failed to train Budget Optimization model: {e}")
            return None, None, 0, 0
    
    def save_models_direct(self):
        """Serialize every trained model, scaler and metadata dict to S3.

        For each entry in ``self.models`` three objects are uploaded under
        ``models/campaign_optimization/``: ``<name>_model.pkl``,
        ``<name>_scaler.pkl`` (only when a scaler is stored) and
        ``<name>_metadata.yaml``.

        Returns:
            bool: True when every upload reported success, False when any
            upload failed or an exception was raised.
        """
        # Local import keeps this method self-contained; io is standard library.
        import io

        def _joblib_bytes(obj):
            # joblib has no dumps(); dump into an in-memory buffer instead.
            buffer = io.BytesIO()
            joblib.dump(obj, buffer)
            return buffer.getvalue()

        logger.info("💾 Saving models directly to S3...")

        try:
            s3_manager = get_s3_manager()
            all_uploaded = True

            for model_name, model in self.models.items():
                # Save model
                model_key = f'models/campaign_optimization/{model_name}_model.pkl'
                model_success = s3_manager.upload_bytes_direct(
                    _joblib_bytes(model), model_key, 'application/octet-stream'
                )

                # Save scaler if exists
                scaler = self.scalers.get(model_name)
                if scaler is not None:
                    scaler_key = f'models/campaign_optimization/{model_name}_scaler.pkl'
                    scaler_success = s3_manager.upload_bytes_direct(
                        _joblib_bytes(scaler), scaler_key, 'application/octet-stream'
                    )
                else:
                    scaler_success = True

                # Save metadata. numpy scalars are converted to builtin types
                # first; yaml.dump would otherwise emit Python-specific object
                # tags for them that yaml.safe_load cannot read.
                plain_metadata = {
                    key: value.item() if isinstance(value, np.generic) else value
                    for key, value in self.metadata.get(model_name, {}).items()
                }
                metadata_key = f'models/campaign_optimization/{model_name}_metadata.yaml'
                metadata_bytes = yaml.dump(plain_metadata, default_flow_style=False).encode('utf-8')
                metadata_success = s3_manager.upload_bytes_direct(
                    metadata_bytes, metadata_key, 'text/yaml'
                )

                if model_success and scaler_success and metadata_success:
                    logger.info(f"✅ {model_name} model, scaler, and metadata uploaded to S3")
                else:
                    all_uploaded = False
                    logger.warning(f"⚠️  Some {model_name} files failed to upload to S3")

            # Only claim overall success when nothing failed above
            if all_uploaded:
                logger.info("✅ All models saved directly to S3")
            else:
                logger.warning("⚠️  Not all model files could be saved to S3")
            return all_uploaded

        except Exception as e:
            logger.error(f"❌ Failed to save models: {e}")
            return False
    
    def run_training_pipeline(self):
        """Run the complete training pipeline: load, prepare, train, save.

        Each model is trained only when its features and target could be
        prepared; a model that cannot be prepared is skipped.

        Returns:
            bool: True when at least one model was trained and the save step
            did not report a failure; False otherwise (including when any
            unexpected exception is raised).
        """
        logger.info("🚀 Starting Campaign Optimization Training Pipeline...")

        try:
            # Load data
            df = self.load_data()
            if df is None:
                # Reported in the same format as the generic handler below,
                # without raising an exception just to catch it here.
                logger.error("❌ Error in training pipeline: Failed to load training data")
                return False

            # Train Campaign Success model
            X_success, y_success = self.prepare_features(df, 'campaign_success')
            if X_success is not None and y_success is not None:
                self.train_campaign_success_model(X_success, y_success)

            # Train Budget Optimization model
            X_budget, y_budget = self.prepare_features(df, 'budget_optimization')
            if X_budget is not None and y_budget is not None:
                self.train_budget_optimization_model(X_budget, y_budget)

            if not self.models:
                logger.error("❌ No models were trained successfully")
                return False

            # Save models. An explicit False from the save step means the
            # upload failed; a None result (no status reported) is treated as
            # success.
            if self.save_models_direct() is False:
                logger.error("❌ Models were trained but could not be saved to S3")
                return False

            logger.info("=" * 60)
            logger.info("🎉 TRAINING COMPLETED!")
            logger.info("=" * 60)
            logger.info(f"📊 Trained {len(self.models)} models on {len(df)} customers")
            # The keys of self.models are model names, not feature names
            logger.info(f"🔧 Trained models: {list(self.models.keys())}")
            logger.info("💾 Models saved to: S3")

            return True

        except Exception as e:
            logger.error(f"❌ Error in training pipeline: {e}")
            return False

## Run the Training Pipeline

In [5]:
# Entry point: build the pipeline, run it, and print a short summary
if __name__ == "__main__":
    pipeline = CampaignOptimizationPipeline()
    success = pipeline.run_training_pipeline()

    if not success:
        print("\n❌ Training pipeline failed. Check logs for details.")
    else:
        # One print call, one line per message
        print(
            "\n🎉 Campaign Optimization Training completed successfully!",
            f"📊 Trained {len(pipeline.models)} models",
            "🔧 Models ready for inference!",
            sep="\n",
        )

INFO:__main__:🚀 Starting Campaign Optimization Training Pipeline...
INFO:__main__:🔍 Loading historical training data from S3...
INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials
INFO:utils.s3_utils:Loading historical training data from S3 with smart caching...
INFO:utils.s3_utils:File unchanged, skipping: amato_pm/data_pipelines/unified_dataset/output/recent_customer_dataset.parquet
INFO:utils.s3_utils:File unchanged, skipping: amato_pm/data_pipelines/unified_dataset/output/timeline_datasets_metadata.yaml
INFO:utils.s3_utils:File unchanged, skipping: amato_pm/data_pipelines/unified_dataset/output/unified_customer_dataset.parquet
INFO:__main__:✅ Historical training data loaded from S3
INFO:__main__:✅ Loaded historical training dataset: (8514, 89)
INFO:__main__:📅 This dataset contains historical data for model training
INFO:__main__:🔧 Preparing features for campaign_success...
INFO:__main__:🔧 Created synthetic campaign_success target
INFO:__main__


🎉 Campaign Optimization Training completed successfully!
📊 Trained 2 models
🔧 Models ready for inference!
