In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
import joblib
import warnings
import re
from datetime import datetime

warnings.filterwarnings('ignore')

# Custom transformers for specific data processing tasks
class DateFeatureExtractor(BaseEstimator, TransformerMixin):
    """Extract features from date columns"""
    def __init__(self, date_columns):
        self.date_columns = date_columns

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        for col in self.date_columns:
            if col in X.columns:
                X[col] = pd.to_datetime(X[col], errors='coerce')
                X[f'{col}_month'] = X[col].dt.month
                X[f'{col}_dayofweek'] = X[col].dt.dayofweek
                X[f'{col}_year'] = X[col].dt.year
                # Drop original date column
                X = X.drop(columns=[col])
        return X

class IncomeBandEncoder(BaseEstimator, TransformerMixin):
    """Encode income bands to numerical values"""
    def __init__(self):
        self.income_mapping = {
            '50,000 or Below': 0,
            '50,000 to 100,000': 1,
            '100,000 to 200,000': 2,
            '200,000 to 300,000': 3,
            '300,000 to 500,000': 4,
            '500,000 or Above': 5
        }

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        if 'Income_Band_SGD' in X.columns:
            X['Income_Band_SGD'] = X['Income_Band_SGD'].map(self.income_mapping)
        return X

class BooleanConverter(BaseEstimator, TransformerMixin):
    """Convert various boolean representations to proper booleans"""
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        bool_columns = ['Partial_Payment_Indicator', 'Repayment_Irregularity_Flags',
                       'Mobile_Number_Active_Status', 'Email_Activity', 'Do_Not_Call_Registry_Data',
                       'WhatsApp_OTT_usage_Indicator', 'Overdraft_or_Low_Balance_Flag',
                       'Delinquency_on_other_Loans']

        for col in bool_columns:
            if col in X.columns:
                X[col] = X[col].astype(str).str.lower().replace({
                    'true': True, 'false': False, '1': True, '0': False,
                    'yes': True, 'no': False
                }).astype(bool)
        return X

class AddressProcessor(BaseEstimator, TransformerMixin):
    """Extract area and pincode from address column but don't encode them"""
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        if 'Address' in X.columns:
            # Extract area and pincode from address (these will be kept as passthrough, not encoded)
            extracted_data = X['Address'].apply(self._extract_area_and_pincode)
            X['Area_From_Address'] = extracted_data.apply(lambda x: x[0])
            X['Pincode_From_Address'] = extracted_data.apply(lambda x: x[1])

        return X

    def _extract_area_and_pincode(self, address):
        """Extract area name and pincode from address"""
        if pd.isna(address):
            return (None, None)

        # Common Singapore areas (from your data generation code)
        singapore_areas = [
            "Raffles Place", "Marina Bay", "Suntec City", "Harbourfront",
            "Serangoon Garden", "Marine Parade", "Bukit Timah", "Orchard",
            "Tanjong Pagar", "Chinatown", "Little India", "Kampong Glam",
            "Bugis", "Dhoby Ghaut", "Somerset", "City Hall",
            "Lavender", "Kallang", "Geylang", "Eunos",
            "Bedok", "Tampines", "Pasir Ris", "Simei",
            "Jurong East", "Jurong West", "Clementi", "Bukit Batok",
            "Bukit Panjang", "Choa Chu Kang", "Woodlands", "Yishun",
            "Sembawang", "Ang Mo Kio", "Bishan", "Toa Payoh",
            "Serangoon", "Hougang", "Punggol", "Sengkang"
        ]

        # Look for area names in the address
        address_lower = address.lower()
        area_found = None

        for area in singapore_areas:
            if area.lower() in address_lower:
                area_found = area
                break

        # Extract pincode (6-digit Singapore pincode)
        pincode_match = re.search(r'(\d{6})', address)
        pincode = pincode_match.group(1) if pincode_match else None

        # If no area found using exact match, try to extract from address structure
        if area_found is None:
            # Try to extract the word before "Singapore" or look for common patterns
            match = re.search(r',\s*([^,]+?),\s*Singapore', address, re.IGNORECASE)
            if match:
                potential_area = match.group(1).strip()
                # Check if this potential area is in our list
                for area in singapore_areas:
                    if area.lower() in potential_area.lower():
                        area_found = area
                        break
                else:
                    area_found = potential_area  # Use as-is if not in list

        return (area_found, pincode)

class EmailValidator(BaseEstimator, TransformerMixin):
    """Validate email addresses and extract domain information"""
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        if 'Email_ID' in X.columns:
            # Validate email format
            X['Email_Valid_Format'] = X['Email_ID'].apply(self._validate_email_format)

            # Extract domain
            X['Email_Domain'] = X['Email_ID'].apply(self._extract_domain)

            # Check if domain is legitimate (common email providers)
            X['Email_Domain_Legitimate'] = X['Email_Domain'].apply(self._is_legitimate_domain)

            # Check for disposable email domains
            X['Email_Disposable'] = X['Email_Domain'].apply(self._is_disposable_domain)

        return X

    def _validate_email_format(self, email):
        """Validate basic email format"""
        if pd.isna(email) or not isinstance(email, str):
            return False

        email_pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'
        return bool(re.match(email_pattern, email))

    def _extract_domain(self, email):
        """Extract domain from email address"""
        if pd.isna(email) or not isinstance(email, str) or '@' not in email:
            return None

        return email.split('@')[1].lower()

    def _is_legitimate_domain(self, domain):
        """Check if domain is from a legitimate email provider"""
        if pd.isna(domain):
            return False

        legitimate_domains = [
            'gmail.com', 'yahoo.com', 'hotmail.com', 'outlook.com', 'live.com',
            'icloud.com', 'protonmail.com', 'zoho.com', 'aol.com', 'mail.com',
            'gmail.com.sg', 'yahoo.com.sg', 'hotmail.com.sg', 'singnet.com.sg',
            'starhub.net.sg', 'pacific.net.sg'
        ]

        return domain in legitimate_domains

    def _is_disposable_domain(self, domain):
        """Check if domain is from a disposable email service"""
        if pd.isna(domain):
            return False

        disposable_domains = [
            'tempmail.com', '10minutemail.com', 'guerrillamail.com',
            'mailinator.com', 'yopmail.com', 'trashmail.com',
            'disposableemail.com', 'fakeinbox.com', 'temp-mail.org'
        ]

        return domain in disposable_domains

class DataLoader(BaseEstimator, TransformerMixin):
    """Load and initial data processing"""
    def __init__(self, file_path, chunk_size=10000):
        self.file_path = file_path
        self.chunk_size = chunk_size

    def fit(self, X, y=None):
        return self

    def transform(self, X=None):
        """Load data from CSV file"""
        print("Loading data...")

        try:
            # Load data in chunks for memory efficiency
            chunks = []
            for i, chunk in enumerate(pd.read_csv(self.file_path, chunksize=self.chunk_size)):
                chunks.append(chunk)
                if (i + 1) % 10 == 0:  # Print progress every 10 chunks
                    print(f"Loaded {min((i+1)*self.chunk_size, self._get_total_rows()):,} rows")

            df = pd.concat(chunks, ignore_index=True)
            print(f"Successfully loaded {len(df):,} rows")
            return df
        except Exception as e:
            print(f"Error loading data: {e}")
            # Return empty DataFrame with expected structure
            return pd.DataFrame()

    def _get_total_rows(self):
        """Get total number of rows in the file"""
        try:
            return sum(1 for line in open(self.file_path)) - 1  # Subtract header
        except:
            return 0

class ColumnDropper(BaseEstimator, TransformerMixin):
    """Drop specified columns from the dataset"""
    def __init__(self, columns_to_drop):
        self.columns_to_drop = columns_to_drop

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        # Drop columns if they exist
        columns_present = [col for col in self.columns_to_drop if col in X.columns]
        X = X.drop(columns=columns_present)
        return X

class UpdatedSklearnDataPipeline:
    def __init__(self, file_path):
        self.file_path = file_path
        self.pipeline = None
        self.processed_data = None
        self.column_names = None
        self._build_pipeline()

    def _get_numeric_columns(self):
        """Define numeric columns for preprocessing"""
        return ['Age', 'Loan_Amount_SGD', 'Outstanding_Balance_SGD', 'Day_Past_Due',
                'Tenure', 'Interest_Rate', 'Current_EMI_SGD', 'Number_of_Past_Payments',
                'Amount_Paid_Each_Month_SGD', 'Missed_Payments_Count', 'Bounce_History',
                'Contact_History_Call_Attempts', 'Contact_History_SMS', 'Contact_History_WhatsApp',
                'Contact_History_EmailLogs', 'No_of_Attempts', 'Average_Handling_Time',
                'Credit_Score', 'Recent_Inquiries', 'Loan_Exposure_Across_Banks',
                'Recent_Score_Change', 'Unemployeement_rate_region', 'Inflation_Rate',
                'Interest_Rate_Trend', 'Economic_Stress_Index', 'Income_Band_SGD',
                'Utility_Spend_SGD', 'Shopping_Spend_SGD', 'Entertainment_Spend_SGD',
                'Health_Spend_SGD', 'Education_Spend_SGD', 'Travel_Spend_SGD',
                'Monthly_Spend_Trend_SGD', 'Seasonal_Spend_Variation', 'Weekend_Spend_Ratio',
                'Festive_Season_Spend_SGD', 'Total_Monthly_Spend_SGD', 'Spend_to_Income_Ratio',
                'UPI_Transaction_Count', 'Debit_Card_Transaction_Count', 'Credit_Card_Transaction_Count',
                'Cash_Withdrawal_Count', 'Recurring_Transaction_Count', 'Recurring_Payment_Ratio',
                'Savings_to_Spend_Ratio', 'Spend_Growth_Rate_YoY', 'High_Value_Transaction_Count',
                'Flight_Risk_Score', 'Financial_Stress_Score', 'AAR_Score',
                'Successful_Contacts_Count', 'Contact_Success_Rate', 'Customer_Best_Agent_Interaction_Count',
                'App_Login_Frequency', 'Online_Banking_Activity', 'Monthly_Income_SGD']

    def _get_categorical_columns(self):
        """Define categorical columns for preprocessing (Region will be encoded here)"""
        return ['Product_Type', 'Payment_Frequency', 'Settlement_History',
                'Channel_used', 'Response_Outcome', 'Urban_Rural_Tag',
                'Language_Preference', 'Smartphone_Penetration', 'Preferred_Channel',
                'Call_SMS_Activity_Patterns', 'WhatsApp_OTT_usage_Indicator',
                'Regional_Time_Restrictions', 'Communication_Complaince_Limits',
                'Gender', 'Occupation', 'Employeement_Type', 'Customer_Employment_Status',
                'Finance_Stress_Status', 'Preferred_Payment_Channel', 'Financial_Health_Status',
                'Avg_Balance_Trends', 'AAR_Risk_Level', 'Region']  # Region added for encoding

    def _get_agent_columns(self):
        """Define agent columns that should be left as-is"""
        return ['Last_Successful_Agent_ID', 'Best_Contact_Agent_IDs']

    def _get_passthrough_columns(self):
        """Define columns that should be passed through without transformation"""
        return ['Area_From_Address', 'Pincode_From_Address', 'Address']  # Updated to area and pincode

    def _get_date_columns(self):
        """Define date columns for processing"""
        return ['Installment_Due_Date', 'Last_Payment_Date']

    def _get_columns_to_drop(self):
        """Define columns to drop from the dataset"""
        return ['Primary_Phone_Number', 'Secondary_Mobile_Number', 'Landline_Phone_Number']

    def _build_pipeline(self):
        """Build the updated pipeline"""

        # Main preprocessing pipeline
        self.pipeline = Pipeline([
            # Step 1: Load data
            ('data_loader', DataLoader(self.file_path)),

            # Step 2: Drop phone number columns
            ('column_dropper', ColumnDropper(self._get_columns_to_drop())),

            # Step 3: Process address information (extract area and pincode but don't encode)
            ('address_processor', AddressProcessor()),

            # Step 4: Validate email addresses
            ('email_validator', EmailValidator()),

            # Step 5: Convert boolean columns
            ('boolean_converter', BooleanConverter()),

            # Step 6: Encode income bands
            ('income_encoder', IncomeBandEncoder()),

            # Step 7: Extract date features
            ('date_extractor', DateFeatureExtractor(self._get_date_columns())),
        ])

    def fit_transform(self, save_path=None):
        """Run the complete pipeline and return processed data"""
        print("=" * 50)
        print("RUNNING UPDATED SKLEARN DATA PIPELINE")
        print("=" * 50)

        try:
            # Run the pipeline steps manually to maintain DataFrame structure
            df = self.pipeline.named_steps['data_loader'].transform(None)
            original_shape = df.shape

            print("Applying preprocessing steps...")
            for step_name, transformer in list(self.pipeline.named_steps.items())[1:]:
                print(f"Applying {step_name}...")
                df = transformer.fit_transform(df)

            # Remove duplicates
            df = df.drop_duplicates()

            # Apply standard preprocessing to categorical columns only
            # Numeric columns will be handled in feature engineering
            categorical_features = [col for col in self._get_categorical_columns() if col in df.columns]
            agent_features = [col for col in self._get_agent_columns() if col in df.columns]
            passthrough_features = [col for col in self._get_passthrough_columns() if col in df.columns]

            # Handle categorical columns (including Region)
            if categorical_features:
                categorical_imputer = SimpleImputer(strategy='most_frequent')
                onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

                # Impute missing values
                df[categorical_features] = categorical_imputer.fit_transform(df[categorical_features])

                # One-hot encode categorical variables (including Region)
                encoded_array = onehot_encoder.fit_transform(df[categorical_features])
                encoded_columns = onehot_encoder.get_feature_names_out(categorical_features)

                # Create DataFrame for encoded features
                encoded_df = pd.DataFrame(encoded_array, columns=encoded_columns, index=df.index)

                # Drop original categorical columns and add encoded ones
                df = df.drop(columns=categorical_features)
                df = pd.concat([df, encoded_df], axis=1)

            # Agent columns, Area_From_Address, and Pincode_From_Address are left as-is (no transformation)
            # They remain in the DataFrame

            self.processed_data = df.reset_index(drop=True)

            # Save if path provided
            if save_path:
                self.save_processed_data(save_path)

            print("Pipeline completed successfully!")
            print(f"Original shape: {original_shape}")
            print(f"Processed shape: {self.processed_data.shape}")

            return self.processed_data

        except Exception as e:
            print(f"Error in pipeline: {e}")
            import traceback
            traceback.print_exc()
            return None

    def save_processed_data(self, path):
        """Save the processed data"""
        if self.processed_data is not None:
            self.processed_data.to_csv(path, index=False)
            print(f"Processed data saved to {path}")

            # Also save the pipeline for future use
            joblib.dump(self.pipeline, 'data_pipeline.pkl')
            print("Pipeline saved as 'data_pipeline.pkl'")

    def load_and_transform_new_data(self, new_data_path):
        """Load and transform new data using the fitted pipeline"""
        if self.pipeline is None:
            raise ValueError("Pipeline not fitted yet. Call fit_transform first.")

        # Load new data
        new_data_loader = DataLoader(new_data_path)
        new_df = new_data_loader.transform(None)

        # Apply the same transformations
        for step_name, transformer in list(self.pipeline.named_steps.items())[1:]:
            new_df = transformer.transform(new_df)

        return new_df

# Example usage
if __name__ == "__main__":
    # Initialize the updated sklearn pipeline
    updated_pipeline = UpdatedSklearnDataPipeline('singapore_loan_data.csv')

    # Run the complete pipeline
    processed_data = updated_pipeline.fit_transform(save_path='processed_data.csv')

    if processed_data is not None:
        # Display results
        print("\nPipeline Summary:")
        print(f"Processed data shape: {processed_data.shape}")
        print(f"Processed data columns: {len(processed_data.columns)}")

        print("\nSample of processed data:")
        print(processed_data.head())

        print("\nData types after processing:")
        print(processed_data.dtypes.value_counts())

        # Show the new columns created by the data cleaning processes
        new_columns = ['Area_From_Address', 'Pincode_From_Address', 'Email_Valid_Format', 'Email_Domain',
                       'Email_Domain_Legitimate', 'Email_Disposable']

        print("\nNew columns created by data cleaning:")
        for col in new_columns:
            if col in processed_data.columns:
                print(f"{col}: {processed_data[col].notna().sum()} non-null values")

        # Show date features created
        date_features = ['Installment_Due_Date_month', 'Installment_Due_Date_dayofweek',
                        'Installment_Due_Date_year', 'Last_Payment_Date_month',
                        'Last_Payment_Date_dayofweek', 'Last_Payment_Date_year']

        print("\nDate features created:")
        for col in date_features:
            if col in processed_data.columns:
                print(f"{col}: {processed_data[col].notna().sum()} non-null values")

        # Show agent columns are preserved as-is
        agent_columns = ['Last_Successful_Agent_ID', 'Best_Contact_Agent_IDs']
        print("\nAgent columns preserved as-is:")
        for col in agent_columns:
            if col in processed_data.columns:
                print(f"{col}: {processed_data[col].notna().sum()} non-null values")
                print(f"Sample values: {processed_data[col].head(3).tolist()}")

        # Show Region encoding
        region_encoded_columns = [col for col in processed_data.columns if col.startswith('Region_')]
        print(f"\nRegion encoded columns ({len(region_encoded_columns)}):")
        for col in region_encoded_columns[:5]:  # Show first 5
            print(f"  {col}: {processed_data[col].sum()} records")
        if len(region_encoded_columns) > 5:
            print(f"  ... and {len(region_encoded_columns) - 5} more region columns")

        # Show Area_From_Address and Pincode_From_Address are preserved as-is (not encoded)
        if 'Area_From_Address' in processed_data.columns:
            print(f"\nArea_From_Address (preserved as-is):")
            print(f"  Unique values: {processed_data['Area_From_Address'].nunique()}")
            print(f"  Sample values: {processed_data['Area_From_Address'].head(5).tolist()}")

        if 'Pincode_From_Address' in processed_data.columns:
            print(f"\nPincode_From_Address (preserved as-is):")
            print(f"  Unique values: {processed_data['Pincode_From_Address'].nunique()}")
            print(f"  Sample values: {processed_data['Pincode_From_Address'].head(5).tolist()}")

        # Verify phone number columns are removed
        phone_columns = ['Primary_Phone_Number', 'Secondary_Mobile_Number', 'Landline_Phone_Number']
        print(f"\nPhone number columns removed:")
        for col in phone_columns:
            if col not in processed_data.columns:
                print(f"  ‚úì {col} successfully removed")
    else:
        print("Pipeline failed to process data.")

RUNNING UPDATED SKLEARN DATA PIPELINE
Loading data...
Loaded 100,000 rows
Successfully loaded 100,000 rows
Applying preprocessing steps...
Applying column_dropper...
Applying address_processor...
Applying email_validator...
Applying boolean_converter...
Applying income_encoder...
Applying date_extractor...
Processed data saved to processed_data.csv
Pipeline saved as 'data_pipeline.pkl'
Pipeline completed successfully!
Original shape: (100000, 110)
Processed shape: (100000, 179)

Pipeline Summary:
Processed data shape: (100000, 179)
Processed data columns: 179

Sample of processed data:
    Customer_id  Loan_Account_id  Loan_Amount_SGD  Outstanding_Balance_SGD  \
0  SCB843421788         32765033          10000.0                  8591.56   
1  SCB998027725         87850971          70000.0                 51361.65   
2  SCB871158951         24716289          52000.0                 44262.73   
3  SCB938686930         36505419         125000.0                125000.00   
4  SCB843983697  

In [2]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
import warnings
import joblib
from scipy import stats
from sklearn.ensemble import IsolationForest
from sklearn.cluster import KMeans

warnings.filterwarnings('ignore')

class NumericPreprocessor(BaseEstimator, TransformerMixin):
    """Handle numeric columns imputation and scaling"""

    def __init__(self):
        self.numeric_imputer = None
        self.numeric_scaler = None
        self.numeric_columns_ = None

    def fit(self, X, y=None):
        # Define numeric columns
        self.numeric_columns_ = [
            'Age', 'Loan_Amount_SGD', 'Outstanding_Balance_SGD', 'Day_Past_Due',
            'Tenure', 'Interest_Rate', 'Current_EMI_SGD', 'Number_of_Past_Payments',
            'Amount_Paid_Each_Month_SGD', 'Missed_Payments_Count', 'Bounce_History',
            'Contact_History_Call_Attempts', 'Contact_History_SMS', 'Contact_History_WhatsApp',
            'Contact_History_EmailLogs', 'No_of_Attempts', 'Average_Handling_Time',
            'Credit_Score', 'Recent_Inquiries', 'Loan_Exposure_Across_Banks',
            'Recent_Score_Change', 'Unemployeement_rate_region', 'Inflation_Rate',
            'Interest_Rate_Trend', 'Economic_Stress_Index', 'Income_Band_SGD',
            'Utility_Spend_SGD', 'Shopping_Spend_SGD', 'Entertainment_Spend_SGD',
            'Health_Spend_SGD', 'Education_Spend_SGD', 'Travel_Spend_SGD',
            'Monthly_Spend_Trend_SGD', 'Seasonal_Spend_Variation', 'Weekend_Spend_Ratio',
            'Festive_Season_Spend_SGD', 'Total_Monthly_Spend_SGD', 'Spend_to_Income_Ratio',
            'UPI_Transaction_Count', 'Debit_Card_Transaction_Count', 'Credit_Card_Transaction_Count',
            'Cash_Withdrawal_Count', 'Recurring_Transaction_Count', 'Recurring_Payment_Ratio',
            'Savings_to_Spend_Ratio', 'Spend_Growth_Rate_YoY', 'High_Value_Transaction_Count',
            'Flight_Risk_Score', 'Financial_Stress_Score', 'AAR_Score',
            'Successful_Contacts_Count', 'Contact_Success_Rate', 'Customer_Best_Agent_Interaction_Count',
            'App_Login_Frequency', 'Online_Banking_Activity', 'Monthly_Income_SGD'
        ]

        # Filter to only columns that exist in the data
        available_numeric_cols = [col for col in self.numeric_columns_ if col in X.columns]

        if available_numeric_cols:
            self.numeric_imputer = SimpleImputer(strategy='median')
            self.numeric_scaler = StandardScaler()

            # Fit on available numeric data
            numeric_data = X[available_numeric_cols]
            self.numeric_imputer.fit(numeric_data)
            self.numeric_scaler.fit(numeric_data)

        return self

    def transform(self, X):
        X = X.copy()

        # Filter to only columns that exist in the data
        available_numeric_cols = [col for col in self.numeric_columns_ if col in X.columns]

        if available_numeric_cols and self.numeric_imputer is not None:
            print(f"üîß Processing {len(available_numeric_cols)} numeric columns...")

            # Impute missing values
            X[available_numeric_cols] = self.numeric_imputer.transform(X[available_numeric_cols])

            # Scale numeric features
            X[available_numeric_cols] = self.numeric_scaler.transform(X[available_numeric_cols])

            print(f"‚úÖ Numeric preprocessing completed: imputation + scaling")

        return X

class FinancialRiskCalculator(BaseEstimator, TransformerMixin):
    """Calculate comprehensive financial risk scores and ratios"""

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()

        # Debt Service Coverage Ratio (DSCR)
        if all(col in X.columns for col in ['Monthly_Income_SGD', 'Current_EMI_SGD']):
            X['DSCR'] = X['Monthly_Income_SGD'] / (X['Current_EMI_SGD'] + 1e-8)

        # Loan-to-Income Ratio
        if all(col in X.columns for col in ['Loan_Amount_SGD', 'Monthly_Income_SGD']):
            X['Loan_to_Income_Ratio'] = X['Loan_Amount_SGD'] / (X['Monthly_Income_SGD'] * 12 + 1e-8)

        # Credit Utilization Ratio
        if all(col in X.columns for col in ['Outstanding_Balance_SGD', 'Loan_Amount_SGD']):
            X['Credit_Utilization_Ratio'] = X['Outstanding_Balance_SGD'] / (X['Loan_Amount_SGD'] + 1e-8)

        # Payment Efficiency Score
        if all(col in X.columns for col in ['Number_of_Past_Payments', 'Tenure', 'Missed_Payments_Count']):
            total_expected_payments = X['Tenure'] * 12  # Assuming monthly payments
            X['Payment_Efficiency'] = (X['Number_of_Past_Payments'] - X['Missed_Payments_Count']) / (total_expected_payments + 1e-8)

        # Financial Stress Composite Score
        stress_indicators = []
        if 'Financial_Stress_Score' in X.columns:
            stress_indicators.append(X['Financial_Stress_Score'])
        if 'Day_Past_Due' in X.columns:
            stress_indicators.append(X['Day_Past_Due'] / 100)  # Normalize
        if 'Missed_Payments_Count' in X.columns:
            stress_indicators.append(X['Missed_Payments_Count'] / 12)  # Normalize by year

        if stress_indicators:
            X['Composite_Financial_Stress_Score'] = pd.concat(stress_indicators, axis=1).mean(axis=1)

        # Behavioral Spending Pattern
        if all(col in X.columns for col in ['Total_Monthly_Spend_SGD', 'Monthly_Income_SGD']):
            X['Spending_Behavior_Ratio'] = X['Total_Monthly_Spend_SGD'] / (X['Monthly_Income_SGD'] + 1e-8)

        # Liquidity Score
        if all(col in X.columns for col in ['Savings_to_Spend_Ratio', 'High_Value_Transaction_Count']):
            X['Liquidity_Score'] = (X['Savings_to_Spend_Ratio'] +
                                   X['High_Value_Transaction_Count'] / 100)  # Normalize

        return X

class CustomerSegmentationEngine(BaseEstimator, TransformerMixin):
    """Advanced customer segmentation using RFM and financial behavior"""

    def __init__(self, n_segments=5):
        self.n_segments = n_segments
        self.kmeans = None

    def fit(self, X, y=None):
        # Prepare features for segmentation
        segmentation_features = self._prepare_segmentation_features(X)
        if len(segmentation_features) > 0:
            self.kmeans = KMeans(n_clusters=self.n_segments, random_state=42)
            self.kmeans.fit(segmentation_features)
        return self

    def transform(self, X):
        X = X.copy()

        # RFM-like segments for loan customers
        # Recency: Days since last payment (inverse of Day_Past_Due)
        if 'Day_Past_Due' in X.columns:
            X['Recency_Score'] = 1 / (X['Day_Past_Due'] + 1)

        # Frequency Score (payment regularity)
        if all(col in X.columns for col in ['Number_of_Past_Payments', 'Missed_Payments_Count']):
            X['Frequency_Score'] = X['Number_of_Past_Payments'] / (X['Number_of_Past_Payments'] + X['Missed_Payments_Count'] + 1e-8)

        # Monetary Score (financial capacity)
        monetary_components = []
        if 'Loan_Amount_SGD' in X.columns:
            monetary_components.append(X['Loan_Amount_SGD'])
        if 'Monthly_Income_SGD' in X.columns:
            monetary_components.append(X['Monthly_Income_SGD'])
        if 'Total_Monthly_Spend_SGD' in X.columns:
            monetary_components.append(X['Total_Monthly_Spend_SGD'])

        if monetary_components:
            X['Monetary_Score'] = pd.concat(monetary_components, axis=1).mean(axis=1)

        # RFM Composite Score
        rfm_components = [col for col in ['Recency_Score', 'Frequency_Score', 'Monetary_Score'] if col in X.columns]
        if rfm_components:
            X['RFM_Score'] = pd.concat([X[col] for col in rfm_components], axis=1).mean(axis=1)

        # Behavioral Segments
        conditions = [
            # High Risk Delinquent
            (X.get('Day_Past_Due', 0) > 30) & (X.get('Financial_Stress_Score', 0) > 0.7),

            # Stable Payers
            (X.get('Day_Past_Due', 0) <= 0) & (X.get('Payment_Efficiency', 0) > 0.9),

            # High Value Customers
            (X.get('Loan_Amount_SGD', 0) > X['Loan_Amount_SGD'].quantile(0.75)) &
            (X.get('Monthly_Income_SGD', 0) > X['Monthly_Income_SGD'].quantile(0.75)),

            # Digital Savvy
            (X.get('App_Login_Frequency', 0) > 0.7) &
            (X.get('Online_Banking_Activity', 0) > 0.7),

            # Traditional Customers
            (X.get('App_Login_Frequency', 0) < 0.3) &
            (X.get('Online_Banking_Activity', 0) < 0.3)
        ]

        segments = ['High_Risk_Delinquent', 'Stable_Payer', 'High_Value', 'Digital_Savvy', 'Traditional']

        X['Behavioral_Segment'] = 'Standard'
        for condition, segment in zip(conditions, segments):
            if isinstance(condition, pd.Series):
                X.loc[condition, 'Behavioral_Segment'] = segment

        # Apply K-means clustering if fitted
        if self.kmeans is not None:
            segmentation_features = self._prepare_segmentation_features(X)
            if len(segmentation_features) > 0:
                X['KMeans_Cluster'] = self.kmeans.predict(segmentation_features)

        return X

    def _prepare_segmentation_features(self, X):
        features = []
        numeric_cols = ['Loan_Amount_SGD', 'Monthly_Income_SGD', 'Day_Past_Due',
                       'Credit_Score', 'Financial_Stress_Score', 'App_Login_Frequency']

        for col in numeric_cols:
            if col in X.columns:
                # Handle potential missing values
                features.append(X[col].fillna(0))

        if features:
            return pd.concat(features, axis=1)
        return pd.DataFrame()

class ChannelEffectivenessCalculator(BaseEstimator, TransformerMixin):
    """Calculate channel effectiveness and customer responsiveness"""

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()

        # Channel Response Success Rate
        channel_columns = {
            'Call': ['Contact_History_Call_Attempts', 'Channel_used_Call'],
            'SMS': ['Contact_History_SMS', 'Channel_used_SMS'],
            'WhatsApp': ['Contact_History_WhatsApp', 'Channel_used_WhatsApp'],
            'Email': ['Contact_History_EmailLogs', 'Channel_used_Email']
        }

        for channel, (attempts_col, usage_col) in channel_columns.items():
            if all(col in X.columns for col in [attempts_col, usage_col]):
                if 'Successful_Contacts_Count' in X.columns:
                    # Channel Efficiency (success per attempt)
                    X[f'{channel}_Efficiency'] = X['Successful_Contacts_Count'] / (X[attempts_col] + 1e-8)

                    # Channel Preference Strength
                    X[f'{channel}_Preference_Strength'] = X[usage_col] * X.get(f'{channel}_Efficiency', 0)

        # Multi-Channel Engagement Score
        channel_efficiency_cols = [f'{channel}_Efficiency' for channel in channel_columns.keys()
                                 if f'{channel}_Efficiency' in X.columns]
        if channel_efficiency_cols:
            X['Multi_Channel_Engagement_Score'] = pd.concat([X[col] for col in channel_efficiency_cols], axis=1).mean(axis=1)

        # Time-Based Responsiveness
        if all(col in X.columns for col in ['Contact_Success_Rate', 'Average_Handling_Time']):
            X['Time_Adjusted_Responsiveness'] = X['Contact_Success_Rate'] / (X['Average_Handling_Time'] + 1e-8)

        # Agent Effectiveness - PRESERVING AGENT ID SECTIONS
        if all(col in X.columns for col in ['Customer_Best_Agent_Interaction_Count', 'Contact_History_Call_Attempts']):
            X['Agent_Effectiveness_Score'] = X['Customer_Best_Agent_Interaction_Count'] / (
                X['Contact_History_Call_Attempts'] + 1e-8)

        # Digital vs Traditional Preference
        digital_channels = ['Channel_used_Email', 'Channel_used_WhatsApp', 'Channel_used_SMS']
        traditional_channels = ['Channel_used_Call', 'Channel_used_IVR', 'Channel_used_Field Agent']

        digital_cols = [col for col in digital_channels if col in X.columns]
        traditional_cols = [col for col in traditional_channels if col in X.columns]

        if digital_cols and traditional_cols:
            digital_pref = pd.concat([X[col] for col in digital_cols], axis=1).mean(axis=1)
            traditional_pref = pd.concat([X[col] for col in traditional_cols], axis=1).mean(axis=1)
            X['Digital_vs_Traditional_Preference'] = digital_pref - traditional_pref

        return X

class TemporalPatternEngineer(BaseEstimator, TransformerMixin):
    """Engineer temporal and seasonal patterns"""

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()

        # Payment Cycle Patterns
        if 'Installment_Due_Date_dayofweek' in X.columns:
            # Weekend vs Weekday payment preference
            X['Weekend_Payment_Preference'] = X['Installment_Due_Date_dayofweek'].isin([5, 6]).astype(int)

        # Seasonal Payment Patterns
        if 'Installment_Due_Date_month' in X.columns:
            # Quarter-based segmentation
            X['Payment_Quarter'] = (X['Installment_Due_Date_month'] - 1) // 3 + 1

            # Festive season (year-end)
            X['Festive_Season_Payment'] = X['Installment_Due_Date_month'].isin([11, 12]).astype(int)

        # Recency of Last Payment
        if 'Last_Payment_Date_month' in X.columns and 'Installment_Due_Date_month' in X.columns:
            X['Months_Since_Last_Payment'] = X['Installment_Due_Date_month'] - X['Last_Payment_Date_month']
            X['Months_Since_Last_Payment'] = X['Months_Since_Last_Payment'].apply(
                lambda x: x if x >= 0 else x + 12
            )

        # Payment Regularity Score
        if all(col in X.columns for col in ['Number_of_Past_Payments', 'Tenure']):
            expected_payments = X['Tenure'] * 12  # Assuming monthly payments
            X['Payment_Regularity_Score'] = X['Number_of_Past_Payments'] / (expected_payments + 1e-8)

        return X

class AdvancedDigitalBehaviorEngineer(BaseEstimator, TransformerMixin):
    """Engineer advanced digital behavior features"""

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()

        # Digital Engagement Composite Score
        digital_metrics = []
        if 'App_Login_Frequency' in X.columns:
            digital_metrics.append(X['App_Login_Frequency'])
        if 'Online_Banking_Activity' in X.columns:
            digital_metrics.append(X['Online_Banking_Activity'])
        if 'UPI_Transaction_Count' in X.columns:
            digital_metrics.append(X['UPI_Transaction_Count'] / 100)  # Normalize

        if digital_metrics:
            X['Digital_Engagement_Score'] = pd.concat(digital_metrics, axis=1).mean(axis=1)

        # Transaction Diversity Score
        transaction_types = ['UPI_Transaction_Count', 'Debit_Card_Transaction_Count',
                           'Credit_Card_Transaction_Count', 'Cash_Withdrawal_Count']

        available_transactions = [col for col in transaction_types if col in X.columns]
        if available_transactions:
            transaction_matrix = X[available_transactions]
            X['Transaction_Diversity_Score'] = (transaction_matrix > 0).sum(axis=1) / len(available_transactions)

        # Channel Adaptability Score
        channel_columns = ['Channel_used_Call', 'Channel_used_SMS', 'Channel_used_WhatsApp',
                         'Channel_used_Email', 'Channel_used_IVR', 'Channel_used_Field Agent']

        available_channels = [col for col in channel_columns if col in X.columns]
        if available_channels:
            X['Channel_Adaptability_Score'] = (X[available_channels] > 0).sum(axis=1) / len(available_channels)

        # Communication Responsiveness
        if all(col in X.columns for col in ['Contact_Success_Rate', 'No_of_Attempts']):
            X['Communication_Responsiveness'] = X['Contact_Success_Rate'] / (X['No_of_Attempts'] + 1e-8)

        return X

class AgentIdPreserver(BaseEstimator, TransformerMixin):
    """Preserve agent ID columns throughout the pipeline"""

    def __init__(self):
        self.agent_columns = ['Last_Successful_Agent_ID', 'Best_Contact_Agent_IDs']

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        # Agent columns are automatically preserved as they remain in the DataFrame
        return X

class FeatureOptimizer(BaseEstimator, TransformerMixin):
    """Optimize and select the best features for channel ranking prediction"""

    def __init__(self):
        self.best_features = None

    def fit(self, X, y=None):
        # Define the optimal feature set based on domain knowledge
        self.best_features = [
            # Customer Identity
            'Customer_id',

            # Financial Risk Features
            'DSCR', 'Loan_to_Income_Ratio', 'Credit_Utilization_Ratio',
            'Payment_Efficiency', 'Composite_Financial_Stress_Score',
            'Spending_Behavior_Ratio', 'Liquidity_Score',

            # Customer Segmentation
            'Recency_Score', 'Frequency_Score', 'Monetary_Score', 'RFM_Score',
            'Behavioral_Segment', 'KMeans_Cluster',

            # Channel Effectiveness
            'Call_Efficiency', 'SMS_Efficiency', 'WhatsApp_Efficiency', 'Email_Efficiency',
            'Multi_Channel_Engagement_Score', 'Time_Adjusted_Responsiveness',
            'Agent_Effectiveness_Score', 'Digital_vs_Traditional_Preference',
            'Channel_Adaptability_Score', 'Communication_Responsiveness',

            # Temporal Patterns
            'Weekend_Payment_Preference', 'Payment_Quarter',
            'Festive_Season_Payment', 'Months_Since_Last_Payment',
            'Payment_Regularity_Score',

            # Digital Behavior
            'Digital_Engagement_Score', 'Transaction_Diversity_Score',

            # Agent Information - PRESERVED
            'Last_Successful_Agent_ID', 'Best_Contact_Agent_IDs',

            # Original Important Features (filtered)
            'Age', 'Loan_Amount_SGD', 'Outstanding_Balance_SGD', 'Day_Past_Due',
            'Credit_Score', 'Monthly_Income_SGD', 'Financial_Stress_Score',
            'Contact_Success_Rate', 'App_Login_Frequency', 'Online_Banking_Activity'
        ]
        return self

    def transform(self, X):
        # Select only features that exist in the dataset
        available_features = [f for f in self.best_features if f in X.columns]

        # Ensure Customer_id is always included
        if 'Customer_id' in X.columns and 'Customer_id' not in available_features:
            available_features = ['Customer_id'] + available_features

        # Always include agent columns if they exist
        agent_columns = ['Last_Successful_Agent_ID', 'Best_Contact_Agent_IDs']
        for agent_col in agent_columns:
            if agent_col in X.columns and agent_col not in available_features:
                available_features.append(agent_col)

        return X[available_features]

class ComprehensiveFeaturePipeline:
    """Comprehensive feature engineering pipeline for channel ranking prediction"""

    def __init__(self, n_segments=5):
        self.n_segments = n_segments
        self.pipeline = None
        self.feature_names = None
        self._build_pipeline()

    def _build_pipeline(self):
        """Build the comprehensive feature engineering pipeline"""

        self.pipeline = Pipeline([
            # Step 0: Agent ID Preserver
            ('agent_preserver', AgentIdPreserver()),

            # Step 1: Numeric Preprocessing (Imputation + Scaling)
            ('numeric_preprocessor', NumericPreprocessor()),

            # Step 2: Financial Risk Assessment
            ('financial_risk_calculator', FinancialRiskCalculator()),

            # Step 3: Customer Segmentation
            ('customer_segmentation', CustomerSegmentationEngine(n_segments=self.n_segments)),

            # Step 4: Channel Effectiveness Analysis
            ('channel_effectiveness', ChannelEffectivenessCalculator()),

            # Step 5: Temporal Pattern Analysis
            ('temporal_engineer', TemporalPatternEngineer()),

            # Step 6: Digital Behavior Analysis
            ('digital_behavior_engineer', AdvancedDigitalBehaviorEngineer()),

            # Step 7: Feature Optimization
            ('feature_optimizer', FeatureOptimizer())
        ])

    def fit_transform(self, X, y=None):
        """Fit and transform the data"""
        print("=" * 60)
        print("COMPREHENSIVE FEATURE ENGINEERING PIPELINE")
        print("=" * 60)

        try:
            # Store original Customer_id and Agent IDs
            original_columns = {}
            important_columns = ['Customer_id', 'Last_Successful_Agent_ID', 'Best_Contact_Agent_IDs']

            for col in important_columns:
                if col in X.columns:
                    original_columns[col] = X[col].copy()

            # Transform data
            print("üîÑ Applying feature engineering steps...")
            X_transformed = self.pipeline.fit_transform(X)

            # Ensure important columns are preserved
            for col, values in original_columns.items():
                if col not in X_transformed.columns:
                    X_transformed[col] = values

            # Move Customer_id to first column
            if 'Customer_id' in X_transformed.columns:
                cols = ['Customer_id'] + [col for col in X_transformed.columns if col != 'Customer_id']
                X_transformed = X_transformed[cols]

            self.feature_names = list(X_transformed.columns)

            print("‚úÖ Feature engineering completed successfully!")
            print(f"üìä Original shape: {X.shape}")
            print(f"üìà Engineered shape: {X_transformed.shape}")
            print(f"üéØ Number of features: {len(self.feature_names)}")

            # Show feature categories
            self._analyze_feature_categories(X_transformed)

            return X_transformed

        except Exception as e:
            print(f"‚ùå Error in feature pipeline: {e}")
            import traceback
            traceback.print_exc()
            return None

    def _analyze_feature_categories(self, X):
        """Analyze and display feature categories"""
        feature_categories = {
            'Financial Risk': ['DSCR', 'Loan_to_Income_Ratio', 'Credit_Utilization_Ratio',
                              'Payment_Efficiency', 'Composite_Financial_Stress_Score'],
            'Customer Segmentation': ['Recency_Score', 'Frequency_Score', 'Monetary_Score',
                                    'RFM_Score', 'Behavioral_Segment', 'KMeans_Cluster'],
            'Channel Effectiveness': ['Call_Efficiency', 'SMS_Efficiency', 'WhatsApp_Efficiency',
                                     'Multi_Channel_Engagement_Score', 'Agent_Effectiveness_Score'],
            'Temporal Patterns': ['Weekend_Payment_Preference', 'Payment_Quarter',
                                'Months_Since_Last_Payment', 'Payment_Regularity_Score'],
            'Digital Behavior': ['Digital_Engagement_Score', 'Transaction_Diversity_Score',
                               'Channel_Adaptability_Score'],
            'Agent Information': ['Last_Successful_Agent_ID', 'Best_Contact_Agent_IDs']
        }

        print("\n" + "=" * 50)
        print("FEATURE CATEGORY ANALYSIS")
        print("=" * 50)

        for category, features in feature_categories.items():
            available_features = [f for f in features if f in X.columns]
            if available_features:
                print(f"üìÅ {category}: {len(available_features)} features")
                for feature in available_features[:3]:  # Show first 3
                    print(f"   ‚îú‚îÄ‚îÄ {feature}")
                if len(available_features) > 3:
                    print(f"   ‚îî‚îÄ‚îÄ ... and {len(available_features) - 3} more")

        # Show total engineered features
        engineered_keywords = ['_Score', '_Ratio', '_Efficiency', '_Segment']
        engineered_features = [col for col in X.columns if any(keyword in col for keyword in engineered_keywords)]
        print(f"\nüéØ Total engineered features: {len(engineered_features)}")

        # Verify agent columns are preserved
        agent_columns = [col for col in ['Last_Successful_Agent_ID', 'Best_Contact_Agent_IDs'] if col in X.columns]
        if agent_columns:
            print(f"üë§ Agent columns preserved: {len(agent_columns)}")
            for agent_col in agent_columns:
                print(f"   ‚úÖ {agent_col}")

    def transform(self, X):
        """Transform new data using fitted pipeline"""
        if self.pipeline is None:
            raise ValueError("Pipeline not fitted yet. Call fit_transform first.")

        # Store original important columns
        original_columns = {}
        important_columns = ['Customer_id', 'Last_Successful_Agent_ID', 'Best_Contact_Agent_IDs']

        for col in important_columns:
            if col in X.columns:
                original_columns[col] = X[col].copy()

        X_transformed = self.pipeline.transform(X)

        # Ensure important columns are preserved
        for col, values in original_columns.items():
            if col not in X_transformed.columns:
                X_transformed[col] = values

        if 'Customer_id' in X_transformed.columns:
            cols = ['Customer_id'] + [col for col in X_transformed.columns if col != 'Customer_id']
            X_transformed = X_transformed[cols]

        return X_transformed

    def get_feature_importance_analysis(self):
        """Provide domain-driven feature importance analysis"""
        importance_analysis = {
            'CRITICAL': [
                'DSCR', 'Loan_to_Income_Ratio', 'Day_Past_Due', 'Payment_Efficiency',
                'Composite_Financial_Stress_Score'
            ],
            'HIGH': [
                'RFM_Score', 'Behavioral_Segment', 'Multi_Channel_Engagement_Score',
                'Channel_Adaptability_Score'
            ],
            'MEDIUM': [
                'Digital_Engagement_Score', 'Communication_Responsiveness',
                'Payment_Regularity_Score', 'Transaction_Diversity_Score'
            ],
            'CONTEXTUAL': [
                'Weekend_Payment_Preference', 'Festive_Season_Payment',
                'Digital_vs_Traditional_Preference', 'Agent_Effectiveness_Score'
            ]
        }

        print("\n" + "=" * 50)
        print("DOMAIN EXPERT FEATURE IMPORTANCE ANALYSIS")
        print("=" * 50)

        for importance, features in importance_analysis.items():
            available_features = [f for f in features if f in self.feature_names]
            if available_features:
                print(f"\n{importance} IMPORTANCE:")
                for feature in available_features:
                    print(f"  ‚úÖ {feature}")

# Example usage with your data
if __name__ == "__main__":
    # Load your preprocessed data
    df = pd.read_csv('processed_data.csv')

    # Ensure Customer_id exists
    if 'Customer_id' not in df.columns:
        print("‚ö†Ô∏è Creating temporary Customer_id...")
        df['Customer_id'] = [f'CUST_{i+1:06d}' for i in range(len(df))]

    # Initialize comprehensive feature pipeline
    feature_pipeline = ComprehensiveFeaturePipeline(n_segments=5)

    # Run feature engineering
    engineered_features = feature_pipeline.fit_transform(df)

    if engineered_features is not None:
        print("\n" + "=" * 60)
        print("FINAL ENGINEERED FEATURES SUMMARY")
        print("=" * 60)

        print(f"‚úÖ Final dataset shape: {engineered_features.shape}")
        print(f"‚úÖ Total features: {len(engineered_features.columns)}")

        # Display sample of engineered features
        print("\nüìä Sample of engineered features (first 10 columns):")
        print(engineered_features.iloc[:, :10].head())

        # Show feature importance analysis
        feature_pipeline.get_feature_importance_analysis()

        # Save results
        engineered_features.to_csv("comprehensive_engineered_features.csv", index=False)
        print("\nüíæ Engineered features saved to 'comprehensive_engineered_features.csv'")

        joblib.dump(feature_pipeline, "comprehensive_feature_pipeline.pkl")
        print("üíæ Feature pipeline saved to 'comprehensive_feature_pipeline.pkl'")

        # Generate feature report
        print("\n" + "=" * 50)
        print("FEATURE ENGINEERING REPORT")
        print("=" * 50)

        # Count features by type
        financial_features = [col for col in engineered_features.columns if any(keyword in col for keyword in ['Ratio', 'Score', 'Efficiency'])]
        segment_features = [col for col in engineered_features.columns if 'Segment' in col or 'Cluster' in col]
        agent_features = [col for col in engineered_features.columns if 'Agent' in col]

        print(f"üí∞ Financial Risk Features: {len(financial_features)}")
        print(f"üë• Customer Segmentation Features: {len(segment_features)}")
        print(f"üì± Channel Effectiveness Features: {len([col for col in engineered_features.columns if 'Channel' in col])}")
        print(f"üïí Temporal Pattern Features: {len([col for col in engineered_features.columns if 'Payment' in col or 'Season' in col])}")
        print(f"üë§ Agent Information Features: {len(agent_features)}")

    else:
        print("‚ùå Feature engineering failed!")

COMPREHENSIVE FEATURE ENGINEERING PIPELINE
üîÑ Applying feature engineering steps...
üîß Processing 55 numeric columns...
‚úÖ Numeric preprocessing completed: imputation + scaling
‚úÖ Feature engineering completed successfully!
üìä Original shape: (100000, 179)
üìà Engineered shape: (100000, 42)
üéØ Number of features: 42

FEATURE CATEGORY ANALYSIS
üìÅ Financial Risk: 5 features
   ‚îú‚îÄ‚îÄ DSCR
   ‚îú‚îÄ‚îÄ Loan_to_Income_Ratio
   ‚îú‚îÄ‚îÄ Credit_Utilization_Ratio
   ‚îî‚îÄ‚îÄ ... and 2 more
üìÅ Customer Segmentation: 6 features
   ‚îú‚îÄ‚îÄ Recency_Score
   ‚îú‚îÄ‚îÄ Frequency_Score
   ‚îú‚îÄ‚îÄ Monetary_Score
   ‚îî‚îÄ‚îÄ ... and 3 more
üìÅ Channel Effectiveness: 5 features
   ‚îú‚îÄ‚îÄ Call_Efficiency
   ‚îú‚îÄ‚îÄ SMS_Efficiency
   ‚îú‚îÄ‚îÄ WhatsApp_Efficiency
   ‚îî‚îÄ‚îÄ ... and 2 more
üìÅ Temporal Patterns: 4 features
   ‚îú‚îÄ‚îÄ Weekend_Payment_Preference
   ‚îú‚îÄ‚îÄ Payment_Quarter
   ‚îú‚îÄ‚îÄ Months_Since_Last_Payment
   ‚îî‚îÄ‚îÄ ... and 1 more
üìÅ Digital Be

In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.base import BaseEstimator, TransformerMixin
import warnings
from sklearn.cluster import KMeans
from scipy import stats

warnings.filterwarnings('ignore')

class SmartChannelEffectivenessCalculator(BaseEstimator, TransformerMixin):
    """Calculate channel effectiveness using comprehensive feature set"""

    def __init__(self):
        self.channels = ['Call', 'SMS', 'WhatsApp', 'Email', 'IVR', 'Field_Agent']

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()

        print("üìä Calculating channel effectiveness scores...")

        # Calculate effectiveness for each channel using multiple factors
        for channel in self.channels:
            effectiveness_components = []

            # 1. Historical Efficiency (if available)
            efficiency_col = f'{channel}_Efficiency'
            if efficiency_col in X.columns:
                effectiveness_components.append(X[efficiency_col].fillna(0))
                print(f"   ‚úÖ Using {efficiency_col} for {channel}")

            # 2. Channel Adaptability
            if 'Channel_Adaptability_Score' in X.columns:
                adaptability_boost = X['Channel_Adaptability_Score'] * 0.2
                effectiveness_components.append(adaptability_boost)

            # 3. Digital Preference (for digital channels)
            if channel in ['SMS', 'WhatsApp', 'Email'] and 'Digital_vs_Traditional_Preference' in X.columns:
                digital_boost = X['Digital_vs_Traditional_Preference'] * 0.3
                effectiveness_components.append(digital_boost)

            # 4. Traditional Preference (for traditional channels)
            if channel in ['Call', 'IVR', 'Field_Agent'] and 'Digital_vs_Traditional_Preference' in X.columns:
                traditional_boost = -X['Digital_vs_Traditional_Preference'] * 0.3
                effectiveness_components.append(traditional_boost)

            # 5. Customer Segment-based adjustments
            if 'Behavioral_Segment' in X.columns:
                segment_boost = self._get_segment_boost(X['Behavioral_Segment'], channel)
                effectiveness_components.append(segment_boost)

            # 6. Time-based adjustments
            if 'Time_Adjusted_Responsiveness' in X.columns:
                time_factor = X['Time_Adjusted_Responsiveness'] * 0.1
                effectiveness_components.append(time_factor)

            # Combine all components
            if effectiveness_components:
                # Weighted combination (historical efficiency gets highest weight)
                if len(effectiveness_components) > 1:
                    weights = [0.4] + [0.6/(len(effectiveness_components)-1)] * (len(effectiveness_components)-1)
                    weighted_components = [comp * weight for comp, weight in zip(effectiveness_components, weights)]
                    X[f'{channel}_Effectiveness_Score'] = pd.concat(weighted_components, axis=1).sum(axis=1)
                else:
                    X[f'{channel}_Effectiveness_Score'] = effectiveness_components[0]

                # Normalize to 0-1 range
                min_score = X[f'{channel}_Effectiveness_Score'].min()
                max_score = X[f'{channel}_Effectiveness_Score'].max()
                if max_score > min_score:
                    X[f'{channel}_Effectiveness_Score'] = (X[f'{channel}_Effectiveness_Score'] - min_score) / (max_score - min_score)
            else:
                # Default score if no components available
                X[f'{channel}_Effectiveness_Score'] = 0.5

        return X

    def _get_segment_boost(self, segments, channel):
        """Get channel preference boost based on customer segment"""
        boost = pd.Series(0, index=segments.index)

        # Segment-specific channel preferences
        segment_rules = {
            'Digital_Savvy': {'SMS': 0.3, 'WhatsApp': 0.4, 'Email': 0.3, 'Call': -0.2, 'IVR': -0.3, 'Field_Agent': -0.4},
            'Traditional': {'Call': 0.3, 'IVR': 0.2, 'Field_Agent': 0.3, 'SMS': -0.2, 'WhatsApp': -0.3, 'Email': -0.2},
            'High_Risk_Delinquent': {'Call': 0.4, 'Field_Agent': 0.4, 'SMS': 0.1, 'WhatsApp': 0.1, 'Email': -0.1, 'IVR': -0.2},
            'High_Value': {'Call': 0.3, 'Email': 0.2, 'WhatsApp': 0.2, 'SMS': 0.1, 'IVR': -0.1, 'Field_Agent': 0.3},
            'Stable_Payer': {'SMS': 0.2, 'Email': 0.2, 'WhatsApp': 0.2, 'Call': 0.1, 'IVR': 0.1, 'Field_Agent': -0.1}
        }

        for segment, rules in segment_rules.items():
            segment_mask = segments == segment
            if channel in rules:
                boost[segment_mask] = rules[channel]

        return boost

class FinancialContextAwareRanker(BaseEstimator, TransformerMixin):
    """Rank channels considering financial context and risk factors"""

    def __init__(self):
        self.channels = ['Call', 'SMS', 'WhatsApp', 'Email', 'IVR', 'Field_Agent']
        self.loan_amount_threshold = None
        self.contact_attempts_threshold = None

    def fit(self, X, y=None):
        # Pre-calculate thresholds for the entire dataset
        if 'Loan_Amount_SGD' in X.columns:
            self.loan_amount_threshold = X['Loan_Amount_SGD'].quantile(0.75)

        if 'Contact_History_Call_Attempts' in X.columns:
            self.contact_attempts_threshold = X['Contact_History_Call_Attempts'].quantile(0.95)

        return self

    def transform(self, X):
        X = X.copy()

        print("üéØ Ranking channels with financial context...")

        # For each customer, rank channels considering financial context
        preference_orders = []
        top_channels = []
        ranking_scores = []

        for i in range(len(X)):
            customer_scores = {}
            row = X.iloc[i]

            # Base effectiveness scores
            for channel in self.channels:
                score_col = f'{channel}_Effectiveness_Score'
                if score_col in X.columns:
                    base_score = row[score_col]
                else:
                    base_score = 0.5

                # Financial context adjustments
                financial_adjustment = self._calculate_financial_adjustment(row, channel)

                # Risk-based adjustments
                risk_adjustment = self._calculate_risk_adjustment(row, channel)

                # Final score with adjustments
                final_score = base_score + financial_adjustment + risk_adjustment
                customer_scores[channel] = max(0, min(1, final_score))  # Clip to 0-1

            # Sort channels by score (highest to lowest)
            ranked_channels = sorted(customer_scores.items(), key=lambda x: x[1], reverse=True)

            # Create preference order
            preference_order = ','.join([channel for channel, score in ranked_channels])
            preference_orders.append(preference_order)
            top_channels.append(ranked_channels[0][0])
            ranking_scores.append(customer_scores)

        X['Channel_Preference_Order'] = preference_orders
        X['Top_Channel'] = top_channels
        X['Ranking_Scores'] = ranking_scores

        return X

    def _calculate_financial_adjustment(self, row, channel):
        """Adjust scores based on financial context"""
        adjustment = 0

        # High risk customers need more personal contact
        if 'Composite_Financial_Stress_Score' in row and row['Composite_Financial_Stress_Score'] > 0.7:
            if channel in ['Call', 'Field_Agent']:
                adjustment += 0.3
            elif channel in ['SMS', 'Email']:
                adjustment -= 0.2

        # High value customers prefer professional channels
        if 'Loan_Amount_SGD' in row and self.loan_amount_threshold is not None:
            if row['Loan_Amount_SGD'] > self.loan_amount_threshold:
                if channel in ['Call', 'Field_Agent', 'Email']:
                    adjustment += 0.2
                elif channel == 'IVR':
                    adjustment -= 0.3

        # Severe delinquency needs urgent personal contact
        if 'Day_Past_Due' in row and row['Day_Past_Due'] > 30:
            if channel in ['Call', 'Field_Agent']:
                adjustment += 0.4
            elif channel in ['SMS', 'Email', 'IVR']:
                adjustment -= 0.3

        return adjustment

    def _calculate_risk_adjustment(self, row, channel):
        """Adjust scores based on risk factors"""
        adjustment = 0

        # Digital engagement affects digital channel preference
        if 'Digital_Engagement_Score' in row:
            digital_engagement = row['Digital_Engagement_Score']
            if channel in ['SMS', 'WhatsApp', 'Email']:
                adjustment += digital_engagement * 0.2
            else:
                adjustment -= digital_engagement * 0.1

        # Contact history affects channel choice
        if 'Contact_History_Call_Attempts' in row and self.contact_attempts_threshold is not None:
            if row['Contact_History_Call_Attempts'] > self.contact_attempts_threshold:
                if channel in ['Call', 'Field_Agent']:  # Try different approaches for hard-to-reach
                    adjustment += 0.2
                elif channel in ['WhatsApp', 'SMS']:  # Alternative channels
                    adjustment += 0.1

        return adjustment

class MultiLevelLabelEncoder(BaseEstimator, TransformerMixin):
    """Create multiple types of labels for different modeling approaches"""

    def __init__(self):
        self.channels = ['Call', 'SMS', 'WhatsApp', 'Email', 'IVR', 'Field_Agent']
        self.label_encoder = LabelEncoder()
        self.cluster_model = KMeans(n_clusters=5, random_state=42)

    def fit(self, X, y=None):
        if 'Channel_Preference_Order' in X.columns:
            self.unique_orders = X['Channel_Preference_Order'].unique()
            self.label_encoder.fit(self.unique_orders)

        # Fit clustering for preference patterns
        if all(f'{channel}_Effectiveness_Score' in X.columns for channel in self.channels):
            cluster_features = X[[f'{channel}_Effectiveness_Score' for channel in self.channels]]
            self.cluster_model.fit(cluster_features)

        return self

    def transform(self, X):
        X = X.copy()

        print("üè∑Ô∏è Creating multiple label types...")

        if 'Channel_Preference_Order' not in X.columns:
            print("‚ùå No preference orders found. Run FinancialContextAwareRanker first.")
            return X

        # 1. Primary Channel Label (Multi-class)
        X['Primary_Channel_Label'] = self.label_encoder.transform(X['Channel_Preference_Order'])

        # 2. Top Channel Label (Simplified)
        X['Top_Channel_Label'] = X['Top_Channel']

        # 3. Top-3 Channels (Multi-label)
        for channel in self.channels:
            X[f'Prefers_{channel}_Top3'] = X['Channel_Preference_Order'].apply(
                lambda x: 1 if channel in x.split(',')[:3] else 0
            )

        # 4. Channel Preference Cluster
        if all(f'{channel}_Effectiveness_Score' in X.columns for channel in self.channels):
            cluster_features = X[[f'{channel}_Effectiveness_Score' for channel in self.channels]]
            X['Preference_Cluster'] = self.cluster_model.predict(cluster_features)

        # 5. Urgency-based Channel Label (High/Medium/Low Touch)
        X['Contact_Urgency_Level'] = X.apply(self._calculate_urgency_level, axis=1)

        # 6. Binary labels for key channels
        X['Prefers_Personal_Contact'] = X['Channel_Preference_Order'].apply(
            lambda x: 1 if any(channel in x.split(',')[0] for channel in ['Call', 'Field_Agent']) else 0
        )

        X['Prefers_Digital_Contact'] = X['Channel_Preference_Order'].apply(
            lambda x: 1 if any(channel in x.split(',')[0] for channel in ['SMS', 'WhatsApp', 'Email']) else 0
        )

        return X

    def _calculate_urgency_level(self, row):
        """Calculate contact urgency level based on financial risk"""
        urgency_score = 0

        # Financial stress increases urgency
        if 'Composite_Financial_Stress_Score' in row:
            urgency_score += row['Composite_Financial_Stress_Score'] * 2

        # Delinquency increases urgency
        if 'Day_Past_Due' in row and row['Day_Past_Due'] > 30:
            urgency_score += 1

        # High risk indicators increase urgency
        if 'Financial_Stress_Score' in row and row['Financial_Stress_Score'] > 0.7:
            urgency_score += 1

        # Categorize urgency
        if urgency_score >= 3:
            return 'High_Urgency'
        elif urgency_score >= 1.5:
            return 'Medium_Urgency'
        else:
            return 'Low_Urgency'

class AdvancedLabelAnalyzer(BaseEstimator, TransformerMixin):
    """Analyze and validate the created labels"""

    def __init__(self):
        self.analysis_results = {}

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()

        print("\n" + "="*60)
        print("üìä ADVANCED LABEL ANALYSIS")
        print("="*60)

        self._analyze_channel_distribution(X)
        self._analyze_segment_preferences(X)
        self._analyze_financial_context_patterns(X)
        self._analyze_label_quality(X)

        return X

    def _analyze_channel_distribution(self, X):
        """Analyze distribution of channel preferences"""
        print("\nüéØ CHANNEL PREFERENCE DISTRIBUTION:")

        # Top channel distribution
        if 'Top_Channel' in X.columns:
            top_dist = X['Top_Channel'].value_counts()
            print("\nüèÜ Top Channel Preferences:")
            for channel, count in top_dist.items():
                percentage = (count / len(X)) * 100
                print(f"   {channel}: {count} customers ({percentage:.1f}%)")

        # Top-3 channel presence
        if any(f'Prefers_{channel}_Top3' in X.columns for channel in ['Call', 'SMS', 'WhatsApp', 'Email']):
            print("\nüì± Channel Presence in Top 3:")
            for channel in ['Call', 'SMS', 'WhatsApp', 'Email']:
                col = f'Prefers_{channel}_Top3'
                if col in X.columns:
                    count = X[col].sum()
                    percentage = (count / len(X)) * 100
                    print(f"   {channel}: {count} customers ({percentage:.1f}%)")

    def _analyze_segment_preferences(self, X):
        """Analyze channel preferences by customer segment"""
        if 'Behavioral_Segment' in X.columns and 'Top_Channel' in X.columns:
            print("\nüë• CHANNEL PREFERENCES BY SEGMENT:")

            segment_channel_pref = X.groupby('Behavioral_Segment')['Top_Channel'].agg(
                lambda x: x.mode().iloc[0] if len(x.mode()) > 0 else 'Unknown'
            )

            for segment, preferred_channel in segment_channel_pref.items():
                segment_count = (X['Behavioral_Segment'] == segment).sum()
                print(f"   {segment}: Prefers {preferred_channel} ({segment_count} customers)")

    def _analyze_financial_context_patterns(self, X):
        """Analyze how financial context affects channel preferences"""
        print("\nüí∞ FINANCIAL CONTEXT PATTERNS:")

        # High risk customers' preferences
        if 'Composite_Financial_Stress_Score' in X.columns and 'Top_Channel' in X.columns:
            high_risk_mask = X['Composite_Financial_Stress_Score'] > 0.7
            if high_risk_mask.any():
                high_risk_pref = X[high_risk_mask]['Top_Channel'].value_counts()
                print("   High-Risk Customers prefer:")
                for channel, count in high_risk_pref.items():
                    percentage = (count / high_risk_mask.sum()) * 100
                    print(f"     {channel}: {count} ({percentage:.1f}%)")

        # High value customers' preferences
        if 'Loan_Amount_SGD' in X.columns and 'Top_Channel' in X.columns:
            high_value_threshold = X['Loan_Amount_SGD'].quantile(0.75)
            high_value_mask = X['Loan_Amount_SGD'] > high_value_threshold
            if high_value_mask.any():
                high_value_pref = X[high_value_mask]['Top_Channel'].value_counts()
                print("   High-Value Customers prefer:")
                for channel, count in high_value_pref.items():
                    percentage = (count / high_value_mask.sum()) * 100
                    print(f"     {channel}: {count} ({percentage:.1f}%)")

    def _analyze_label_quality(self, X):
        """Analyze the quality and diversity of created labels"""
        print("\nüìà LABEL QUALITY METRICS:")

        # Label diversity
        if 'Channel_Preference_Order' in X.columns:
            unique_orders = X['Channel_Preference_Order'].nunique()
            total_customers = len(X)
            diversity_ratio = unique_orders / total_customers

            print(f"   Unique preference patterns: {unique_orders}")
            print(f"   Diversity ratio: {diversity_ratio:.3f}")

            # Show most common patterns
            common_patterns = X['Channel_Preference_Order'].value_counts().head(5)
            print("\n   Most common preference patterns:")
            for pattern, count in common_patterns.items():
                percentage = (count / total_customers) * 100
                print(f"     {pattern}: {count} customers ({percentage:.1f}%)")

class BestChannelRankingLabeler(BaseEstimator, TransformerMixin):
    """Create the final best channel ranking labels for model training"""

    def __init__(self):
        self.channels = ['Call', 'SMS', 'WhatsApp', 'Email', 'IVR', 'Field_Agent']
        self.best_channel_encoder = LabelEncoder()

    def fit(self, X, y=None):
        if 'Top_Channel' in X.columns:
            self.best_channel_encoder.fit(X['Top_Channel'])
        return self

    def transform(self, X):
        X = X.copy()

        print("üèÜ Creating final best channel ranking labels...")

        # 1. Primary target: Best Channel (Multi-class classification)
        X['Best_Channel_Label'] = self.best_channel_encoder.transform(X['Top_Channel'])

        # 2. Channel Ranking Scores (Regression targets)
        for channel in self.channels:
            score_col = f'{channel}_Effectiveness_Score'
            if score_col in X.columns:
                X[f'{channel}_Ranking_Score'] = X[score_col]

        # 3. Preference Strength Indicator
        if 'Ranking_Scores' in X.columns:
            X['Preference_Strength'] = X['Ranking_Scores'].apply(
                lambda scores: max(scores.values()) - min(scores.values()) if scores else 0
            )

        # 4. Channel Switch Recommendation
        X['Recommended_Channel_Switch'] = X.apply(self._recommend_channel_switch, axis=1)

        return X

    def _recommend_channel_switch(self, row):
        """Recommend if customer should switch from current top channel"""
        if 'Top_Channel' not in row or 'Ranking_Scores' not in row:
            return 'No_Recommendation'

        current_top = row['Top_Channel']
        scores = row['Ranking_Scores']

        if not scores or current_top not in scores:
            return 'No_Recommendation'

        current_score = scores[current_top]
        max_score = max(scores.values())

        # Recommend switch if there's a significantly better channel
        if max_score - current_score > 0.2:
            best_channel = max(scores.items(), key=lambda x: x[1])[0]
            return f'Switch_to_{best_channel}'
        else:
            return 'Maintain_Current'

class IntelligentLabelingPipeline:
    """Complete pipeline for creating intelligent channel preference labels"""

    def __init__(self):
        self.pipeline = None
        self.labels_df = None
        self._build_pipeline()

    def _build_pipeline(self):
        """Build the intelligent labeling pipeline"""

        self.pipeline = Pipeline([
            # Step 1: Calculate comprehensive channel effectiveness
            ('effectiveness_calculator', SmartChannelEffectivenessCalculator()),

            # Step 2: Rank channels with financial context
            ('channel_ranker', FinancialContextAwareRanker()),

            # Step 3: Create multiple label types
            ('label_encoder', MultiLevelLabelEncoder()),

            # Step 4: Create final best channel ranking labels
            ('best_channel_labeler', BestChannelRankingLabeler()),

            # Step 5: Analyze and validate labels
            ('label_analyzer', AdvancedLabelAnalyzer())
        ])

    def fit_transform(self, X, y=None):
        """Create intelligent labels from engineered features"""
        print("=" * 60)
        print("ü§ñ INTELLIGENT CHANNEL PREFERENCE LABELING")
        print("=" * 60)

        try:
            # Store original Customer_id and Agent IDs
            original_columns = {}
            important_columns = ['Customer_id', 'Last_Successful_Agent_ID', 'Best_Contact_Agent_IDs']

            for col in important_columns:
                if col in X.columns:
                    original_columns[col] = X[col].copy()

            # Transform data
            print("üîÑ Applying feature engineering steps...")
            X_transformed = self.pipeline.fit_transform(X)

            # Ensure important columns are preserved
            for col, values in original_columns.items():
                if col not in X_transformed.columns:
                    X_transformed[col] = values

            # Move Customer_id to first column
            if 'Customer_id' in X_transformed.columns:
                cols = ['Customer_id'] + [col for col in X_transformed.columns if col != 'Customer_id']
                X_transformed = X_transformed[cols]

            # Extract label columns for easy access
            label_columns = self._get_label_columns(X_transformed)
            self.labels_df = X_transformed[label_columns]

            print("\n‚úÖ Label construction completed successfully!")
            print(f"üìä Number of customers labeled: {len(self.labels_df)}")
            print(f"üéØ Number of unique preference patterns: {self.labels_df['Channel_Preference_Order'].nunique()}")

            # Show target variable distribution
            self._show_target_distribution()

            return X_transformed, self.labels_df

        except Exception as e:
            print(f"‚ùå Error in label construction: {e}")
            import traceback
            traceback.print_exc()
            return X, None

    def _get_label_columns(self, X):
        """Get all label-related columns"""
        label_columns = ['Customer_id', 'Channel_Preference_Order', 'Top_Channel',
                       'Primary_Channel_Label', 'Top_Channel_Label', 'Best_Channel_Label',
                       'Preference_Cluster', 'Contact_Urgency_Level',
                       'Prefers_Personal_Contact', 'Prefers_Digital_Contact',
                       'Preference_Strength', 'Recommended_Channel_Switch']

        # Add binary preference columns
        binary_cols = [col for col in X.columns if 'Prefers_' in col and 'Top3' in col]
        label_columns.extend(binary_cols)

        # Add ranking score columns
        ranking_cols = [col for col in X.columns if 'Ranking_Score' in col]
        label_columns.extend(ranking_cols)

        # Add agent columns if they exist
        agent_cols = [col for col in ['Last_Successful_Agent_ID', 'Best_Contact_Agent_IDs']
                     if col in X.columns]
        label_columns.extend(agent_cols)

        return [col for col in label_columns if col in X.columns]

    def _show_target_distribution(self):
        """Show distribution of target variables"""
        if self.labels_df is None:
            return

        print("\nüéØ TARGET VARIABLE DISTRIBUTION:")

        # Best Channel Distribution
        if 'Best_Channel_Label' in self.labels_df.columns:
            channel_dist = self.labels_df['Top_Channel'].value_counts()
            print("\nüèÜ Best Channel Distribution:")
            for channel, count in channel_dist.items():
                percentage = (count / len(self.labels_df)) * 100
                print(f"   {channel}: {count} customers ({percentage:.1f}%)")

        # Urgency Level Distribution
        if 'Contact_Urgency_Level' in self.labels_df.columns:
            urgency_dist = self.labels_df['Contact_Urgency_Level'].value_counts()
            print("\nüö® Contact Urgency Distribution:")
            for level, count in urgency_dist.items():
                percentage = (count / len(self.labels_df)) * 100
                print(f"   {level}: {count} customers ({percentage:.1f}%)")

    def get_label_summary(self):
        """Get comprehensive summary of created labels"""
        if self.labels_df is None:
            print("No labels available. Run fit_transform first.")
            return

        summary = {
            'total_customers': len(self.labels_df),
            'unique_preference_patterns': self.labels_df['Channel_Preference_Order'].nunique(),
            'top_channel_distribution': self.labels_df['Top_Channel'].value_counts().to_dict(),
            'best_channel_labels': self.labels_df['Best_Channel_Label'].value_counts().to_dict() if 'Best_Channel_Label' in self.labels_df.columns else {},
            'urgency_level_distribution': self.labels_df['Contact_Urgency_Level'].value_counts().to_dict() if 'Contact_Urgency_Level' in self.labels_df.columns else {},
            'preference_clusters': self.labels_df['Preference_Cluster'].value_counts().to_dict() if 'Preference_Cluster' in self.labels_df.columns else {}
        }

        return summary

    def save_labels(self, filepath='best_channel_ranking_labels.csv'):
        """Save the labels to CSV"""
        if self.labels_df is not None:
            self.labels_df.to_csv(filepath, index=False)
            print(f"üíæ Labels saved to {filepath}")
        else:
            print("‚ùå No labels to save")

    def get_training_data(self, features_df):
        """Prepare features and targets for model training"""
        if self.labels_df is None:
            print("No labels available. Run fit_transform first.")
            return None, None

        # Merge features with labels
        training_data = features_df.merge(
            self.labels_df[['Customer_id', 'Best_Channel_Label', 'Top_Channel']],
            on='Customer_id',
            how='inner'
        )

        # Separate features and target
        feature_columns = [col for col in training_data.columns if col not in
                         ['Customer_id', 'Best_Channel_Label', 'Top_Channel',
                          'Channel_Preference_Order', 'Ranking_Scores']]

        X = training_data[feature_columns]
        y = training_data['Best_Channel_Label']

        print(f"üìö Training data prepared: {X.shape[0]} samples, {X.shape[1]} features")
        print(f"üéØ Target variable: Best_Channel_Label ({len(y.unique())} classes)")

        return X, y

# Quick usage function
def create_best_channel_labels(engineered_features_df):
    """
    One-function approach to create best channel ranking labels

    Parameters:
    -----------
    engineered_features_df : pandas.DataFrame
        Output from ComprehensiveFeaturePipeline

    Returns:
    --------
    tuple : (features_with_labels, labels_dataframe, training_data)
    """
    # Initialize pipeline
    label_pipeline = IntelligentLabelingPipeline()

    # Create labels
    features_with_labels, labels_df = label_pipeline.fit_transform(engineered_features_df)

    # Prepare training data
    X_train, y_train = None, None
    if labels_df is not None:
        X_train, y_train = label_pipeline.get_training_data(engineered_features_df)

        # Save results
        label_pipeline.save_labels('best_channel_ranking_labels.csv')
        features_with_labels.to_csv('features_with_best_channel_labels.csv', index=False)

        # Print summary
        summary = label_pipeline.get_label_summary()
        print("\nüìã LABELING SUMMARY:")
        print(f"   Total customers: {summary['total_customers']}")
        print(f"   Unique patterns: {summary['unique_preference_patterns']}")
        print(f"   Most common best channel: {max(summary['top_channel_distribution'].items(), key=lambda x: x[1])[0]}")
        print(f"   Training samples: {X_train.shape[0] if X_train is not None else 0}")
        print(f"   Features for training: {X_train.shape[1] if X_train is not None else 0}")

    return features_with_labels, labels_df, (X_train, y_train)

# Main execution
if __name__ == "__main__":
    # Load your engineered features
    print("üîç Loading engineered features...")
    df = pd.read_csv('comprehensive_engineered_features.csv')

    print(f"üìä Loaded data: {df.shape[0]} rows, {df.shape[1]} columns")
    print(f"üë§ Customer count: {df['Customer_id'].nunique() if 'Customer_id' in df.columns else 'N/A'}")

    # Create intelligent labels
    features_with_labels, labels, training_data = create_best_channel_labels(df)

    if labels is not None:
        print("\nüéâ SUCCESS! Best channel ranking labels created.")
        print("\nüìã Sample of created labels:")
        sample_columns = ['Customer_id', 'Top_Channel', 'Best_Channel_Label', 'Contact_Urgency_Level']
        available_columns = [col for col in sample_columns if col in labels.columns]
        print(labels[available_columns].head(10))

        # Show different label types available for modeling
        print("\nüè∑Ô∏è AVAILABLE TARGET VARIABLES FOR MODELING:")
        target_variables = {
            'Multi-class Classification': ['Best_Channel_Label', 'Top_Channel_Label'],
            'Multi-label Classification': [col for col in labels.columns if 'Prefers_' in col and 'Top3' in col],
            'Regression': [col for col in labels.columns if 'Ranking_Score' in col],
            'Binary Classification': ['Prefers_Personal_Contact', 'Prefers_Digital_Contact'],
            'Clustering': ['Preference_Cluster'],
            'Strategic Segmentation': ['Contact_Urgency_Level', 'Recommended_Channel_Switch']
        }

        for model_type, expected_cols in target_variables.items():
            available_cols = [col for col in expected_cols if col in labels.columns]
            if available_cols:
                print(f"   {model_type}: {', '.join(available_cols)}")

        # Show training data info
        if training_data[0] is not None:
            print(f"\nüìö TRAINING DATA READY:")
            print(f"   Features shape: {training_data[0].shape}")
            print(f"   Target shape: {training_data[1].shape}")
            print(f"   Unique target classes: {len(np.unique(training_data[1]))}")
    else:
        print("‚ùå Label creation failed!")

üîç Loading engineered features...
üìä Loaded data: 100000 rows, 42 columns
üë§ Customer count: 100000
ü§ñ INTELLIGENT CHANNEL PREFERENCE LABELING
üîÑ Applying feature engineering steps...
üìä Calculating channel effectiveness scores...
   ‚úÖ Using Call_Efficiency for Call
   ‚úÖ Using SMS_Efficiency for SMS
   ‚úÖ Using WhatsApp_Efficiency for WhatsApp
üéØ Ranking channels with financial context...
üè∑Ô∏è Creating multiple label types...
üèÜ Creating final best channel ranking labels...

üìä ADVANCED LABEL ANALYSIS

üéØ CHANNEL PREFERENCE DISTRIBUTION:

üèÜ Top Channel Preferences:
   Call: 40463 customers (40.5%)
   Email: 21365 customers (21.4%)
   Field_Agent: 14552 customers (14.6%)
   SMS: 14334 customers (14.3%)
   WhatsApp: 9123 customers (9.1%)
   IVR: 163 customers (0.2%)

üì± Channel Presence in Top 3:
   Call: 65143 customers (65.1%)
   SMS: 35537 customers (35.5%)
   WhatsApp: 37823 customers (37.8%)
   Email: 58090 customers (58.1%)

üë• CHANNEL PREFERENCES

In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.base import BaseEstimator, TransformerMixin
import warnings

warnings.filterwarnings('ignore')

class ChannelEffectivenessCalculator(BaseEstimator, TransformerMixin):
    """Calculate how effective each channel is for each customer using comprehensive features"""

    def __init__(self):
        self.channels = ['Call', 'SMS', 'WhatsApp', 'Email', 'IVR', 'Field_Agent']

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        print("üìä Calculating channel effectiveness scores...")

        # For each channel, calculate an effectiveness score using multiple factors
        for channel in self.channels:
            effectiveness_components = []

            # 1. Historical Efficiency (if available)
            efficiency_col = f'{channel}_Efficiency'
            if efficiency_col in X.columns:
                effectiveness_components.append(X[efficiency_col].fillna(0))
                print(f"   ‚úÖ Using {efficiency_col} for {channel}")

            # 2. Channel Adaptability
            if 'Channel_Adaptability_Score' in X.columns:
                adaptability_boost = X['Channel_Adaptability_Score'] * 0.2
                effectiveness_components.append(adaptability_boost)

            # 3. Digital Preference (for digital channels)
            if channel in ['SMS', 'WhatsApp', 'Email'] and 'Digital_vs_Traditional_Preference' in X.columns:
                digital_boost = X['Digital_vs_Traditional_Preference'] * 0.3
                effectiveness_components.append(digital_boost)

            # 4. Traditional Preference (for traditional channels)
            if channel in ['Call', 'IVR', 'Field_Agent'] and 'Digital_vs_Traditional_Preference' in X.columns:
                traditional_boost = -X['Digital_vs_Traditional_Preference'] * 0.3
                effectiveness_components.append(traditional_boost)

            # 5. Customer Segment-based adjustments
            if 'Behavioral_Segment' in X.columns:
                segment_boost = self._get_segment_boost(X['Behavioral_Segment'], channel)
                effectiveness_components.append(segment_boost)

            # 6. Time-based adjustments
            if 'Time_Adjusted_Responsiveness' in X.columns:
                time_factor = X['Time_Adjusted_Responsiveness'] * 0.1
                effectiveness_components.append(time_factor)

            # Combine all components
            if effectiveness_components:
                # Weighted combination (historical efficiency gets highest weight)
                if len(effectiveness_components) > 1:
                    weights = [0.4] + [0.6/(len(effectiveness_components)-1)] * (len(effectiveness_components)-1)
                    weighted_components = [comp * weight for comp, weight in zip(effectiveness_components, weights)]
                    effectiveness_score = pd.concat(weighted_components, axis=1).sum(axis=1)
                else:
                    effectiveness_score = effectiveness_components[0]

                # Normalize to 0-1 range
                min_score = effectiveness_score.min()
                max_score = effectiveness_score.max()
                if max_score > min_score:
                    effectiveness_score = (effectiveness_score - min_score) / (max_score - min_score)

                X[f'{channel}_Effectiveness'] = effectiveness_score
            else:
                # Default score if no components available
                X[f'{channel}_Effectiveness'] = 0.5

        return X

    def _get_segment_boost(self, segments, channel):
        """Get channel preference boost based on customer segment"""
        boost = pd.Series(0, index=segments.index)

        # Segment-specific channel preferences
        segment_rules = {
            'Digital_Savvy': {'SMS': 0.3, 'WhatsApp': 0.4, 'Email': 0.3, 'Call': -0.2, 'IVR': -0.3, 'Field_Agent': -0.4},
            'Traditional': {'Call': 0.3, 'IVR': 0.2, 'Field_Agent': 0.3, 'SMS': -0.2, 'WhatsApp': -0.3, 'Email': -0.2},
            'High_Risk_Delinquent': {'Call': 0.4, 'Field_Agent': 0.4, 'SMS': 0.1, 'WhatsApp': 0.1, 'Email': -0.1, 'IVR': -0.2},
            'High_Value': {'Call': 0.3, 'Email': 0.2, 'WhatsApp': 0.2, 'SMS': 0.1, 'IVR': -0.1, 'Field_Agent': 0.3},
            'Stable_Payer': {'SMS': 0.2, 'Email': 0.2, 'WhatsApp': 0.2, 'Call': 0.1, 'IVR': 0.1, 'Field_Agent': -0.1}
        }

        for segment, rules in segment_rules.items():
            segment_mask = segments == segment
            if channel in rules:
                boost[segment_mask] = rules[channel]

        return boost

class ChannelRanker(BaseEstimator, TransformerMixin):
    """Rank channels from best to worst for each customer"""

    def __init__(self):
        self.channels = ['Call', 'SMS', 'WhatsApp', 'Email', 'IVR', 'Field_Agent']

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        print("üéØ Ranking channels from best to worst...")

        # For each customer, rank channels by effectiveness
        preference_orders = []
        top_channels = []

        for i in range(len(X)):
            customer_scores = {}

            # Get scores for this customer
            for channel in self.channels:
                score_col = f'{channel}_Effectiveness'
                if score_col in X.columns:
                    customer_scores[channel] = X[score_col].iloc[i]
                else:
                    customer_scores[channel] = 0  # Default if missing

            # Sort channels by score (highest to lowest)
            ranked_channels = sorted(customer_scores.items(),
                                   key=lambda x: x[1],
                                   reverse=True)

            # Create preference order string
            preference_order = ','.join([channel for channel, score in ranked_channels])
            preference_orders.append(preference_order)
            top_channels.append(ranked_channels[0][0])

        # Add preference order to dataframe
        X['Channel_Preference_Order'] = preference_orders
        X['Top_Channel'] = top_channels

        return X

class PreferenceLabelEncoder(BaseEstimator, TransformerMixin):
    """Convert preference orders into model-friendly formats"""

    def __init__(self):
        self.label_encoder = LabelEncoder()
        self.top_channel_encoder = LabelEncoder()
        self.unique_orders = None

    def fit(self, X, y=None):
        if 'Channel_Preference_Order' in X.columns:
            # Learn all possible preference patterns
            self.unique_orders = X['Channel_Preference_Order'].unique()
            self.label_encoder.fit(self.unique_orders)

        if 'Top_Channel' in X.columns:
            # Learn all possible top channels
            self.top_channel_encoder.fit(X['Top_Channel'])

        return self

    def transform(self, X):
        X = X.copy()
        print("üè∑Ô∏è Converting preferences to model-friendly formats...")

        if 'Channel_Preference_Order' not in X.columns:
            print("No preference orders found. Run ChannelRanker first.")
            return X

        # Method 1: Encode entire order as one label (like animal classification)
        X['Preference_Label'] = self.label_encoder.transform(X['Channel_Preference_Order'])

        # Method 2: Create simple "top choice" label
        X['Top_Channel_Label'] = self.top_channel_encoder.transform(X['Top_Channel'])

        # Method 3: Binary indicators for each channel being in top 3
        for channel in ['Call', 'SMS', 'WhatsApp', 'Email']:
            X[f'Prefers_{channel}_Top3'] = X['Channel_Preference_Order'].apply(
                lambda x: 1 if channel in x.split(',')[:3] else 0
            )

        return X

class LabelConstructionPipeline:
    """Complete pipeline for creating channel preference labels"""

    def __init__(self):
        self.pipeline = None
        self.labels = None
        self._build_pipeline()

    def _build_pipeline(self):
        """Build the label construction pipeline"""

        self.pipeline = Pipeline([
            # Step 1: Calculate how good each channel is using comprehensive features
            ('effectiveness_calculator', ChannelEffectivenessCalculator()),

            # Step 2: Rank channels from best to worst
            ('channel_ranker', ChannelRanker()),

            # Step 3: Convert to model-friendly formats
            ('label_encoder', PreferenceLabelEncoder())
        ])

    def fit_transform(self, X, y=None):
        """Create labels from the engineered features"""
        print("=" * 60)
        print("BUILDING CHANNEL PREFERENCE LABELS")
        print("=" * 60)

        try:
            # Store original Customer_id and Agent IDs
            original_columns = {}
            important_columns = ['Customer_id', 'Last_Successful_Agent_ID', 'Best_Contact_Agent_IDs']

            for col in important_columns:
                if col in X.columns:
                    original_columns[col] = X[col].copy()

            # Apply the pipeline
            X_with_labels = self.pipeline.fit_transform(X)

            # Ensure important columns are preserved
            for col, values in original_columns.items():
                if col not in X_with_labels.columns:
                    X_with_labels[col] = values

            # Extract the important label columns
            label_columns = ['Customer_id', 'Channel_Preference_Order',
                           'Preference_Label', 'Top_Channel', 'Top_Channel_Label']

            # Add binary preference columns
            binary_cols = [col for col in X_with_labels.columns if 'Prefers_' in col]
            label_columns.extend(binary_cols)

            # Add agent columns if they exist
            agent_cols = [col for col in ['Last_Successful_Agent_ID', 'Best_Contact_Agent_IDs']
                         if col in X_with_labels.columns]
            label_columns.extend(agent_cols)

            self.labels = X_with_labels[label_columns]

            print("‚úÖ Label construction completed!")
            print(f"üìä Number of customers: {len(self.labels)}")
            print(f"üéØ Number of unique preference patterns: {len(self.labels['Channel_Preference_Order'].unique())}")

            # Show what we created
            self._analyze_labels()

            return X_with_labels, self.labels

        except Exception as e:
            print(f"‚ùå Error in label construction: {e}")
            import traceback
            traceback.print_exc()
            return X, None

    def _analyze_labels(self):
        """Analyze the created labels"""
        print("\n" + "="*50)
        print("LABEL ANALYSIS")
        print("="*50)

        # Top channels overall
        top_channels = self.labels['Top_Channel'].value_counts()
        print("\nüèÜ Most popular top channels:")
        for channel, count in top_channels.items():
            percentage = (count / len(self.labels)) * 100
            print(f"   {channel}: {count} customers ({percentage:.1f}%)")

        # Preference order distribution
        print(f"\nüéØ Unique preference orders: {len(self.labels['Channel_Preference_Order'].unique())}")

        # Show most common preference patterns
        common_orders = self.labels['Channel_Preference_Order'].value_counts().head(5)
        print("\nüîù Top 5 preference patterns:")
        for order, count in common_orders.items():
            print(f"   {order}: {count} customers")

        # Binary preference distribution
        print("\nüì± Channel presence in Top 3:")
        for channel in ['Call', 'SMS', 'WhatsApp', 'Email']:
            col = f'Prefers_{channel}_Top3'
            if col in self.labels.columns:
                count = self.labels[col].sum()
                percentage = (count / len(self.labels)) * 100
                print(f"   {channel}: {count} customers ({percentage:.1f}%)")

    def get_labels(self):
        """Get the constructed labels"""
        return self.labels

    def save_labels(self, filepath='channel_preference_labels.csv'):
        """Save labels to CSV file"""
        if self.labels is not None:
            self.labels.to_csv(filepath, index=False)
            print(f"üíæ Labels saved to {filepath}")
        else:
            print("‚ùå No labels to save")

    def get_training_data(self, features_df):
        """Prepare features and targets for model training"""
        if self.labels is None:
            print("No labels available. Run fit_transform first.")
            return None, None, None

        # Merge features with labels
        training_data = features_df.merge(
            self.labels[['Customer_id', 'Preference_Label', 'Top_Channel_Label', 'Top_Channel']],
            on='Customer_id',
            how='inner'
        )

        # Separate features and targets
        feature_columns = [col for col in training_data.columns if col not in
                         ['Customer_id', 'Preference_Label', 'Top_Channel_Label', 'Top_Channel',
                          'Channel_Preference_Order', 'Last_Successful_Agent_ID', 'Best_Contact_Agent_IDs']]

        X = training_data[feature_columns]
        y_preference = training_data['Preference_Label']  # Multi-class (full order)
        y_top_channel = training_data['Top_Channel_Label']  # Multi-class (top channel only)

        return X, y_preference, y_top_channel

# üöÄ **MAIN EXECUTION**

if __name__ == "__main__":
    # Load your engineered features
    print("üîç Loading engineered features...")

    try:
        # Try to load the comprehensive engineered features
        df = pd.read_csv('comprehensive_engineered_features.csv')
        print("‚úÖ Loaded comprehensive_engineered_features.csv")
    except FileNotFoundError:
        try:
            # Fallback to processed features
            df = pd.read_csv('processed_updated_data.csv')
            print("‚úÖ Loaded processed_updated_data.csv")
        except FileNotFoundError:
            print("‚ùå No feature file found. Please ensure you have run the feature engineering pipeline first.")
            exit()

    print(f"üìä Loaded data: {df.shape[0]} rows, {df.shape[1]} columns")
    print(f"üë§ Customer count: {df['Customer_id'].nunique() if 'Customer_id' in df.columns else 'N/A'}")

    # Ensure Customer_id exists
    if 'Customer_id' not in df.columns:
        print("‚ö†Ô∏è Creating temporary Customer_id...")
        df['Customer_id'] = [f'CUST_{i+1:06d}' for i in range(len(df))]

    # Use the full comprehensive pipeline
    print("\n" + "="*60)
    print("USING COMPREHENSIVE LABELING PIPELINE")
    print("="*60)

    label_pipeline = LabelConstructionPipeline()
    df_with_labels, labels = label_pipeline.fit_transform(df)

    if labels is not None:
        # Save results
        labels.to_csv('channel_preference_labels.csv', index=False)
        df_with_labels.to_csv('features_with_channel_labels.csv', index=False)

        print("\nüíæ Saved results:")
        print("   - channel_preference_labels.csv (just the labels)")
        print("   - features_with_channel_labels.csv (features + labels)")

        # Show sample of what we created
        print("\nüìã Sample of created labels:")
        sample_cols = ['Customer_id', 'Top_Channel', 'Channel_Preference_Order', 'Preference_Label']
        available_cols = [col for col in sample_cols if col in labels.columns]
        print(labels[available_cols].head(10))

        # Prepare training data (silently, without printing)
        X, y_preference, y_top_channel = label_pipeline.get_training_data(df)

    else:
        print("‚ùå Label creation failed!")

    print("\nüéâ Channel preference labeling completed successfully!")

üîç Loading engineered features...
‚úÖ Loaded comprehensive_engineered_features.csv
üìä Loaded data: 100000 rows, 42 columns
üë§ Customer count: 100000

USING COMPREHENSIVE LABELING PIPELINE
BUILDING CHANNEL PREFERENCE LABELS
üìä Calculating channel effectiveness scores...
   ‚úÖ Using Call_Efficiency for Call
   ‚úÖ Using SMS_Efficiency for SMS
   ‚úÖ Using WhatsApp_Efficiency for WhatsApp
üéØ Ranking channels from best to worst...
üè∑Ô∏è Converting preferences to model-friendly formats...
‚úÖ Label construction completed!
üìä Number of customers: 100000
üéØ Number of unique preference patterns: 352

LABEL ANALYSIS

üèÜ Most popular top channels:
   Call: 43519 customers (43.5%)
   SMS: 21552 customers (21.6%)
   WhatsApp: 17458 customers (17.5%)
   Field_Agent: 10991 customers (11.0%)
   Email: 6257 customers (6.3%)
   IVR: 223 customers (0.2%)

üéØ Unique preference orders: 352

üîù Top 5 preference patterns:
   Call,SMS,Field_Agent,IVR,Email,WhatsApp: 12832 customers
   

In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import ndcg_score
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
import joblib
import warnings
warnings.filterwarnings('ignore')

class ChannelRankingDataPreparator:
    """Prepare data for XGBRanker training"""

    def __init__(self):
        self.channels = ['Call', 'SMS', 'WhatsApp', 'Email', 'IVR', 'Field_Agent']
        self.label_encoders = {}
        self.feature_cols = None

    def prepare_ranking_data(self, df):
        """Convert preference data to ranking format"""
        print("Preparing ranking format...")

        # Get feature columns (exclude label columns and Customer_id)
        exclude_cols = ['Customer_id', 'Channel_Preference_Order', 'Preference_Label', 'Top_Channel']
        exclude_cols.extend([col for col in df.columns if 'Prefers_' in col])
        exclude_cols.extend(['Last_Successful_Agent_ID', 'Best_Contact_Agent_IDs'])

        self.feature_cols = [col for col in df.columns if col not in exclude_cols]

        # Encode categorical columns
        X = df[self.feature_cols].copy()
        categorical_cols = X.select_dtypes(include=['object']).columns.tolist()

        if categorical_cols:
            print(f"Encoding {len(categorical_cols)} categorical columns...")
            for col in categorical_cols:
                le = LabelEncoder()
                X[col] = le.fit_transform(X[col].astype(str))
                self.label_encoders[col] = le

        # Create ranking dataset
        ranking_data = []
        group_sizes = []

        print(f"Processing {len(df)} customers...")

        for idx, (_, row) in enumerate(df.iterrows()):
            if (idx + 1) % 10000 == 0:
                print(f"Processed {idx + 1:,} customers")

            customer_id = row['Customer_id']
            preference_order = row['Channel_Preference_Order'].split(',')

            # Get customer features
            customer_features = X.iloc[idx].values

            # Create one sample per channel
            for rank, channel in enumerate(preference_order):
                if channel in self.channels:
                    # Channel one-hot encoding
                    channel_features = np.zeros(len(self.channels))
                    channel_idx = self.channels.index(channel)
                    channel_features[channel_idx] = 1

                    # Combine customer and channel features
                    combined_features = np.concatenate([customer_features, channel_features])

                    # Relevance score as integer (required by XGBoost)
                    # Best channel gets highest score, worst gets lowest
                    relevance = len(self.channels) - rank

                    ranking_data.append({
                        'customer_id': customer_id,
                        'channel': channel,
                        'features': combined_features,
                        'relevance': relevance,
                        'group_id': idx
                    })

            group_sizes.append(len(self.channels))

        # Convert to arrays
        X_ranking = np.array([item['features'] for item in ranking_data])
        y_ranking = np.array([item['relevance'] for item in ranking_data])
        groups = np.array(group_sizes)

        print(f"\nRanking dataset created:")
        print(f"Total samples: {len(X_ranking):,}")
        print(f"Total customers (groups): {len(groups):,}")
        print(f"Features per sample: {X_ranking.shape[1]}")
        print(f"Group sizes (samples per customer): {groups[0]} (all should be {len(self.channels)})")
        print(f"Relevance score range: {y_ranking.min()} to {y_ranking.max()}")

        return X_ranking, y_ranking, groups

def train_xgb_ranker(X, y, groups, test_size=0.2, random_state=42):
    """Train XGBRanker model"""
    print("\n" + "="*60)
    print("TRAINING XGBRANKER MODEL")
    print("="*60)

    # Calculate split points for groups
    n_train_groups = int(len(groups) * (1 - test_size))
    train_samples = sum(groups[:n_train_groups])

    # Split data
    X_train = X[:train_samples]
    y_train = y[:train_samples]
    groups_train = groups[:n_train_groups]

    X_test = X[train_samples:]
    y_test = y[train_samples:]
    groups_test = groups[n_train_groups:]

    print(f"Feature matrix shape: {X_train.shape}")
    print(f"Label vector shape: {y_train.shape}")
    print(f"Number of groups: {len(groups_train):,}")
    print(f"\nTrain set: {len(X_train):,} samples from {len(groups_train):,} customers")
    print(f"Test set: {len(X_test):,} samples from {len(groups_test):,} customers")

    # Initialize XGBRanker
    print("\nInitializing XGBRanker...")
    model = xgb.XGBRanker(
        objective='rank:ndcg',
        learning_rate=0.1,
        max_depth=6,
        n_estimators=100,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=random_state,
        eval_metric='ndcg@6'
    )

    # Train model
    print("Training XGBRanker...")
    model.fit(
        X_train, y_train,
        group=groups_train,
        eval_set=[(X_test, y_test)],
        eval_group=[groups_test],
        verbose=True
    )

    print("\nModel training completed successfully!")

    # Make predictions
    print("\nMaking predictions...")
    y_pred = model.predict(X_test)

    # Calculate NDCG
    print("Calculating NDCG score...")
    # Reshape predictions for NDCG calculation
    y_test_reshaped = []
    y_pred_reshaped = []

    start_idx = 0
    for group_size in groups_test:
        end_idx = start_idx + group_size
        y_test_reshaped.append(y_test[start_idx:end_idx])
        y_pred_reshaped.append(y_pred[start_idx:end_idx])
        start_idx = end_idx

    ndcg = ndcg_score(y_test_reshaped, y_pred_reshaped)
    print(f"NDCG Score: {ndcg:.4f}")

    return model, ndcg

# Main execution
if __name__ == "__main__":
    print("="*60)
    print("PREPARING CHANNEL RANKING DATA FOR XGBRANKER")
    print("="*60)

    # Load data
    print("Loading features_with_channel_labels.csv...")
    try:
        df = pd.read_csv('features_with_channel_labels.csv')
        print(f"Loaded data shape: {df.shape}")
    except FileNotFoundError:
        print("Error: features_with_channel_labels.csv not found!")
        print("Please run the labeling pipeline first.")
        exit(1)

    # Prepare data
    preparator = ChannelRankingDataPreparator()
    X, y, groups = preparator.prepare_ranking_data(df)

    # Train model
    model, ndcg_score = train_xgb_ranker(X, y, groups)

    # Save model only (no preparator)
    model_filename = 'xgb_channel_ranker.pkl'
    joblib.dump(model, model_filename)
    print(f"\nModel saved as '{model_filename}'")

    print("\n" + "="*60)
    print("MODEL TRAINING COMPLETED SUCCESSFULLY!")
    print("="*60)
    print(f"Feature columns used: {X.shape[1]}")
    print(f"Model saved as: {model_filename}")
    print(f"Total customers processed: {len(df):,}")
    print(f"Final NDCG Score: {ndcg_score:.4f}")

PREPARING CHANNEL RANKING DATA FOR XGBRANKER
Loading features_with_channel_labels.csv...
Loaded data shape: (100000, 56)
Preparing ranking format...
Encoding 1 categorical columns...
Processing 100000 customers...
Processed 10,000 customers
Processed 20,000 customers
Processed 30,000 customers
Processed 40,000 customers
Processed 50,000 customers
Processed 60,000 customers
Processed 70,000 customers
Processed 80,000 customers
Processed 90,000 customers
Processed 100,000 customers

Ranking dataset created:
Total samples: 600,000
Total customers (groups): 100,000
Features per sample: 52
Group sizes (samples per customer): 6 (all should be 6)
Relevance score range: 1 to 6

TRAINING XGBRANKER MODEL
Feature matrix shape: (480000, 52)
Label vector shape: (480000,)
Number of groups: 80,000

Train set: 480,000 samples from 80,000 customers
Test set: 120,000 samples from 20,000 customers

Initializing XGBRanker...
Training XGBRanker...
[0]	validation_0-ndcg@6:0.91221
[1]	validation_0-ndcg@6:0.9

In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import ndcg_score
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import joblib
import warnings
warnings.filterwarnings('ignore')

class ChannelRankingDataPreparator:
    """Prepare data for LightGBM Ranker training"""

    def __init__(self):
        self.channels = ['Call', 'SMS', 'WhatsApp', 'Email', 'IVR', 'Field_Agent']
        self.label_encoders = {}
        self.feature_cols = None

    def prepare_ranking_data(self, df):
        """Convert preference data to ranking format"""
        print("Preparing ranking format...")

        # Get feature columns (exclude label columns and Customer_id)
        exclude_cols = ['Customer_id', 'Channel_Preference_Order', 'Preference_Label', 'Top_Channel']
        exclude_cols.extend([col for col in df.columns if 'Prefers_' in col])
        exclude_cols.extend(['Last_Successful_Agent_ID', 'Best_Contact_Agent_IDs'])

        self.feature_cols = [col for col in df.columns if col not in exclude_cols]

        # Encode categorical columns
        X = df[self.feature_cols].copy()
        categorical_cols = X.select_dtypes(include=['object']).columns.tolist()

        if categorical_cols:
            print(f"Encoding {len(categorical_cols)} categorical columns...")
            for col in categorical_cols:
                le = LabelEncoder()
                X[col] = le.fit_transform(X[col].astype(str))
                self.label_encoders[col] = le

        # Create ranking dataset
        ranking_data = []
        group_sizes = []

        print(f"Processing {len(df)} customers...")

        for idx, (_, row) in enumerate(df.iterrows()):
            if (idx + 1) % 10000 == 0:
                print(f"Processed {idx + 1:,} customers")

            customer_id = row['Customer_id']
            preference_order = row['Channel_Preference_Order'].split(',')

            # Get customer features
            customer_features = X.iloc[idx].values

            # Create one sample per channel
            for rank, channel in enumerate(preference_order):
                if channel in self.channels:
                    # Channel one-hot encoding
                    channel_features = np.zeros(len(self.channels))
                    channel_idx = self.channels.index(channel)
                    channel_features[channel_idx] = 1

                    # Combine customer and channel features
                    combined_features = np.concatenate([customer_features, channel_features])

                    # Relevance score as integer (required by LightGBM)
                    # Best channel gets highest score, worst gets lowest
                    relevance = len(self.channels) - rank

                    ranking_data.append({
                        'customer_id': customer_id,
                        'channel': channel,
                        'features': combined_features,
                        'relevance': relevance,
                        'group_id': idx
                    })

            group_sizes.append(len(self.channels))

        # Convert to arrays
        X_ranking = np.array([item['features'] for item in ranking_data])
        y_ranking = np.array([item['relevance'] for item in ranking_data])
        groups = np.array(group_sizes)

        print(f"\nRanking dataset created:")
        print(f"Total samples: {len(X_ranking):,}")
        print(f"Total customers (groups): {len(groups):,}")
        print(f"Features per sample: {X_ranking.shape[1]}")
        print(f"Group sizes (samples per customer): {groups[0]} (all should be {len(self.channels)})")
        print(f"Relevance score range: {y_ranking.min()} to {y_ranking.max()}")

        return X_ranking, y_ranking, groups

def train_lightgbm_ranker(X, y, groups, test_size=0.2, random_state=42):
    """Train LightGBM Ranker model"""
    print("\n" + "="*60)
    print("TRAINING LIGHTGBM RANKER MODEL")
    print("="*60)

    # Calculate split points for groups
    n_train_groups = int(len(groups) * (1 - test_size))
    train_samples = sum(groups[:n_train_groups])

    # Split data
    X_train = X[:train_samples]
    y_train = y[:train_samples]
    groups_train = groups[:n_train_groups]

    X_test = X[train_samples:]
    y_test = y[train_samples:]
    groups_test = groups[n_train_groups:]

    print(f"Feature matrix shape: {X_train.shape}")
    print(f"Label vector shape: {y_train.shape}")
    print(f"Number of groups: {len(groups_train):,}")
    print(f"\nTrain set: {len(X_train):,} samples from {len(groups_train):,} customers")
    print(f"Test set: {len(X_test):,} samples from {len(groups_test):,} customers")

    # Create LightGBM datasets
    print("\nCreating LightGBM datasets...")
    train_data = lgb.Dataset(
        X_train,
        label=y_train,
        group=groups_train,
        free_raw_data=False
    )

    test_data = lgb.Dataset(
        X_test,
        label=y_test,
        group=groups_test,
        free_raw_data=False,
        reference=train_data
    )

    # LightGBM parameters for ranking
    params = {
        'objective': 'lambdarank',
        'metric': 'ndcg',
        'ndcg_eval_at': [6],
        'learning_rate': 0.1,
        'num_leaves': 31,
        'max_depth': 6,
        'min_data_in_leaf': 20,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'lambda_l1': 0.1,
        'lambda_l2': 0.1,
        'random_state': random_state,
        'verbosity': -1
    }

    # Train model
    print("Training LightGBM Ranker...")
    model = lgb.train(
        params,
        train_data,
        num_boost_round=100,
        valid_sets=[train_data, test_data],
        valid_names=['train', 'valid'],
        callbacks=[
            lgb.log_evaluation(50),  # Print every 50 rounds
            lgb.early_stopping(20)   # Early stopping if no improvement for 20 rounds
        ]
    )

    print("\nModel training completed successfully!")

    # Make predictions
    print("\nMaking predictions...")
    y_pred = model.predict(X_test)

    # Calculate NDCG
    print("Calculating NDCG score...")
    # Reshape predictions for NDCG calculation
    y_test_reshaped = []
    y_pred_reshaped = []

    start_idx = 0
    for group_size in groups_test:
        end_idx = start_idx + group_size
        y_test_reshaped.append(y_test[start_idx:end_idx])
        y_pred_reshaped.append(y_pred[start_idx:end_idx])
        start_idx = end_idx

    ndcg = ndcg_score(y_test_reshaped, y_pred_reshaped)
    print(f"NDCG Score: {ndcg:.4f}")

    return model, ndcg

# Main execution
if __name__ == "__main__":
    print("="*60)
    print("PREPARING CHANNEL RANKING DATA FOR LIGHTGBM RANKER")
    print("="*60)

    # Load data
    print("Loading features_with_channel_labels.csv...")
    try:
        df = pd.read_csv('features_with_channel_labels.csv')
        print(f"Loaded data shape: {df.shape}")
    except FileNotFoundError:
        print("Error: features_with_channel_labels.csv not found!")
        print("Please run the labeling pipeline first.")
        exit(1)

    # Prepare data
    preparator = ChannelRankingDataPreparator()
    X, y, groups = preparator.prepare_ranking_data(df)

    # Train model
    model, ndcg_score = train_lightgbm_ranker(X, y, groups)

    # Save model only (no preparator)
    model_filename = 'lightgbm_channel_ranker.pkl'
    joblib.dump(model, model_filename)
    print(f"\nModel saved as '{model_filename}'")

    print("\n" + "="*60)
    print("MODEL TRAINING COMPLETED SUCCESSFULLY!")
    print("="*60)
    print(f"Feature columns used: {X.shape[1]}")
    print(f"Model saved as: {model_filename}")
    print(f"Total customers processed: {len(df):,}")
    print(f"Final NDCG Score: {ndcg_score:.4f}")

PREPARING CHANNEL RANKING DATA FOR LIGHTGBM RANKER
Loading features_with_channel_labels.csv...
Loaded data shape: (100000, 56)
Preparing ranking format...
Encoding 1 categorical columns...
Processing 100000 customers...
Processed 10,000 customers
Processed 20,000 customers
Processed 30,000 customers
Processed 40,000 customers
Processed 50,000 customers
Processed 60,000 customers
Processed 70,000 customers
Processed 80,000 customers
Processed 90,000 customers
Processed 100,000 customers

Ranking dataset created:
Total samples: 600,000
Total customers (groups): 100,000
Features per sample: 52
Group sizes (samples per customer): 6 (all should be 6)
Relevance score range: 1 to 6

TRAINING LIGHTGBM RANKER MODEL
Feature matrix shape: (480000, 52)
Label vector shape: (480000,)
Number of groups: 80,000

Train set: 480,000 samples from 80,000 customers
Test set: 120,000 samples from 20,000 customers

Creating LightGBM datasets...
Training LightGBM Ranker...
Training until validation scores don'

In [10]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import joblib
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')

class LightGBMPredictor:
    """LightGBM model predictor for channel preference ranking"""

    def __init__(self, model_path='lightgbm_channel_ranker.pkl'):
        """
        Initialize the LightGBM predictor
        """
        print("üîß Initializing LightGBM Predictor...")
        try:
            # Load the model
            self.model = joblib.load(model_path)
            print("‚úÖ LightGBM model loaded successfully!")

            # Define channels (same as training)
            self.channels = ['Call', 'SMS', 'WhatsApp', 'Email', 'IVR', 'Field_Agent']
            print(f"   Channels: {self.channels}")

        except FileNotFoundError:
            print("‚ùå Model file not found. Please ensure the model file exists.")
            raise
        except Exception as e:
            print(f"‚ùå Error loading model: {e}")
            raise

    def load_customer_data(self, data_path, customer_id=None):
        """
        Load customer data for prediction
        """
        print(f"üìÅ Loading customer data from {data_path}...")
        try:
            self.df = pd.read_csv(data_path)
            print(f"‚úÖ Loaded {len(self.df)} customers")
            print(f"‚úÖ Columns in data: {len(self.df.columns)}")

            if customer_id:
                self.customer_data = self.df[self.df['Customer_id'] == customer_id]
                if len(self.customer_data) == 0:
                    available_ids = self.df['Customer_id'].head(5).tolist()
                    raise ValueError(f"Customer ID '{customer_id}' not found! Available: {available_ids}")
                print(f"üéØ Predicting for customer: {customer_id}")
            else:
                self.customer_data = self.df
                print(f"üéØ Predicting for all {len(self.customer_data)} customers")

        except Exception as e:
            print(f"‚ùå Error loading customer data: {e}")
            raise

    def load_processed_data(self, data_path):
        """
        Load processed data for age and income band information
        """
        print(f"üìÅ Loading processed data from {data_path}...")
        try:
            self.processed_df = pd.read_csv(data_path)
            print(f"‚úÖ Loaded {len(self.processed_df)} customers from processed data")
        except Exception as e:
            print(f"‚ùå Error loading processed data: {e}")
            raise

    def prepare_features(self, customer_data):
        """
        Prepare features for prediction
        """
        # Ensure we have a DataFrame
        if isinstance(customer_data, pd.Series):
            customer_data = customer_data.to_frame().T

        # Get feature columns (exclude label and ID columns)
        exclude_cols = ['Customer_id', 'Channel_Preference_Order', 'Preference_Label', 'Top_Channel']
        exclude_cols.extend([col for col in customer_data.columns if 'Prefers_' in col])
        exclude_cols.extend(['Last_Successful_Agent_ID', 'Best_Contact_Agent_IDs'])

        feature_cols = [col for col in customer_data.columns if col not in exclude_cols]

        print(f"   üîß Preparing {len(feature_cols)} features...")

        # Extract features
        X_customer = customer_data[feature_cols].copy()

        # Encode categorical features
        categorical_cols = X_customer.select_dtypes(include=['object']).columns.tolist()

        if categorical_cols:
            print(f"   üîß Encoding {len(categorical_cols)} categorical columns...")
            for col in categorical_cols:
                try:
                    # Use LabelEncoder for categorical columns
                    le = LabelEncoder()
                    X_customer[col] = le.fit_transform(X_customer[col].astype(str))
                except Exception as e:
                    print(f"   ‚ö†Ô∏è  Encoding issue with {col}: {e}")
                    # Use default value for encoding issues
                    X_customer[col] = 0

        # Return flattened array for single customer
        return X_customer.values.flatten(), feature_cols

    def predict_single_customer(self, customer_data):
        """
        Predict channel preferences for a single customer with 0-1 normalized scores
        """
        print("   üéØ Making predictions...")

        # Ensure proper DataFrame format
        if isinstance(customer_data, pd.Series):
            customer_data = customer_data.to_frame().T

        customer_id = customer_data['Customer_id'].iloc[0]

        # Prepare customer features
        customer_features, feature_cols = self.prepare_features(customer_data)

        # Create samples for each channel
        channel_predictions = []

        for channel in self.channels:
            # Channel one-hot encoding
            channel_features = np.zeros(len(self.channels))
            channel_idx = self.channels.index(channel)
            channel_features[channel_idx] = 1

            # Combine customer and channel features
            combined_features = np.concatenate([customer_features, channel_features])

            # Make prediction
            try:
                prediction_score = self.model.predict(combined_features.reshape(1, -1))[0]
                channel_predictions.append({
                    'channel': channel,
                    'score': prediction_score,
                    'customer_id': customer_id
                })
            except Exception as e:
                print(f"   ‚ùå Prediction error for {channel}: {e}")
                channel_predictions.append({
                    'channel': channel,
                    'score': 0.0,
                    'customer_id': customer_id
                })

        # Sort channels by prediction score (descending)
        channel_predictions.sort(key=lambda x: x['score'], reverse=True)

        # Normalize scores to 0-1 range using softmax
        scores = np.array([pred['score'] for pred in channel_predictions])

        # Apply softmax to get probabilities between 0-1
        exp_scores = np.exp(scores - np.max(scores))  # Subtract max for numerical stability
        probabilities = exp_scores / np.sum(exp_scores)

        # Update scores with normalized values
        for i, pred in enumerate(channel_predictions):
            pred['score'] = probabilities[i]

        print(f"   ‚úÖ Predictions completed for {customer_id}")
        return channel_predictions

    def predict_multiple_customers(self, customer_ids=None):
        """
        Predict channel preferences for multiple customers
        """
        if customer_ids:
            customers_to_predict = self.df[self.df['Customer_id'].isin(customer_ids)]
            if len(customers_to_predict) == 0:
                print("‚ùå No matching customer IDs found!")
                return {}
            print(f"üîç Predicting for {len(customers_to_predict)} specific customers...")
        else:
            customers_to_predict = self.customer_data
            print(f"üîç Predicting for all {len(customers_to_predict)} customers...")

        all_predictions = {}

        for idx, (_, customer_row) in enumerate(customers_to_predict.iterrows()):
            customer_df = pd.DataFrame([customer_row])
            predictions = self.predict_single_customer(customer_df)
            all_predictions[customer_row['Customer_id']] = predictions

            if (idx + 1) % 100 == 0 and idx > 0:
                print(f"   üìä Processed {idx + 1} customers...")

        return all_predictions

    def format_predictions(self, predictions, top_n=6):
        """
        Format predictions in a user-friendly way with 0-1 scores
        """
        if not predictions:
            return "‚ùå No predictions available."

        if isinstance(predictions, list):
            # Single customer prediction
            result = f"\nüéØ CHANNEL PREFERENCE PREDICTION\n"
            result += "=" * 50 + "\n"
            result += f"Customer: {predictions[0]['customer_id']}\n"
            result += "=" * 50 + "\n"

            for i, pred in enumerate(predictions[:top_n], 1):
                stars = "‚òÖ" * min(i, 5)
                # Format score as percentage between 0-1 with 4 decimal places
                result += f"{i:2d}. {pred['channel']:12} Score: {pred['score']:7.4f} {stars}\n"

            # Add confidence analysis based on normalized scores
            confidence_gap = predictions[0]['score'] - predictions[1]['score']
            result += f"\nüí° Confidence: "
            if confidence_gap > 0.3:
                result += "HIGH (Clear preference)\n"
            elif confidence_gap > 0.15:
                result += "MEDIUM (Strong preference)\n"
            else:
                result += "LOW (Consider multiple channels)\n"

            # Add recommendation
            top_channel = predictions[0]['channel']
            result += f"üöÄ RECOMMENDATION: Start with {top_channel}\n"

        else:
            # Multiple customers prediction
            result = f"\nüìä BATCH PREDICTION RESULTS ({len(predictions)} customers)\n"
            result += "=" * 60 + "\n"

            for i, (customer_id, customer_predictions) in enumerate(list(predictions.items())[:10]):
                result += f"\n{i+1:2d}. üë§ {customer_id}:\n"
                result += f"    ü•á {customer_predictions[0]['channel']} "
                result += f"(Score: {customer_predictions[0]['score']:.4f})\n"
                if len(customer_predictions) > 2:
                    result += f"    üìã Strategy: {customer_predictions[0]['channel']} ‚Üí "
                    result += f"{customer_predictions[1]['channel']} ‚Üí "
                    result += f"{customer_predictions[2]['channel']}\n"

        return result

    def get_customer_insights(self, customer_id):
        """
        Get additional insights about a customer from processed_data.csv
        """
        if not hasattr(self, 'processed_df'):
            return None

        customer_row = self.processed_df[self.processed_df['Customer_id'] == customer_id]
        if customer_row.empty:
            return None

        insights = {
            'customer_id': customer_id,
            'age': customer_row['Age'].iloc[0] if 'Age' in customer_row.columns else 'N/A',
            'income_band': customer_row['Income_Band_SGD'].iloc[0] if 'Income_Band_SGD' in customer_row.columns else 'N/A'
        }

        return insights

    def save_predictions_to_csv(self, predictions, output_path='lightgbm_predictions.csv'):
        """
        Save predictions to CSV file
        """
        try:
            if isinstance(predictions, list):
                # Single customer
                df_output = pd.DataFrame(predictions)
            else:
                # Multiple customers
                all_predictions = []
                for customer_id, customer_predictions in predictions.items():
                    for i, pred in enumerate(customer_predictions, 1):
                        pred['rank'] = i
                        pred['customer_id'] = customer_id
                        all_predictions.append(pred)

                df_output = pd.DataFrame(all_predictions)

            df_output.to_csv(output_path, index=False)
            print(f"üíæ Predictions saved to {output_path}")
            return df_output
        except Exception as e:
            print(f"‚ùå Error saving predictions: {e}")
            return None

def predict_single_customer(customer_id):
    """Prediction function for a single customer"""
    print(f"\nüîç PREDICTION FOR: {customer_id}")
    print("=" * 50)

    try:
        # Initialize predictor
        predictor = LightGBMPredictor('lightgbm_channel_ranker.pkl')

        # Load customer data for prediction
        predictor.load_customer_data('features_with_channel_labels.csv', customer_id)

        # Load processed data for age and income band
        predictor.load_processed_data('processed_data.csv')

        # Predict
        predictions = predictor.predict_single_customer(predictor.customer_data)

        # Format results
        result = predictor.format_predictions(predictions)

        # Add customer insights (age and income band only)
        insights = predictor.get_customer_insights(customer_id)
        if insights:
            result += "\nüìä CUSTOMER INSIGHTS:\n"
            result += "-" * 30 + "\n"
            result += f"   Age: {insights['age']}\n"
            result += f"   Income Band: {insights['income_band']}\n"

        return result

    except FileNotFoundError as e:
        return f"‚ùå File error: {e}"
    except ValueError as e:
        return f"‚ùå Customer error: {e}"
    except Exception as e:
        return f"‚ùå Prediction error: {str(e)}"

def batch_predict(customer_ids):
    """Batch prediction function for multiple customers"""
    print(f"\nüîç BATCH PREDICTION FOR {len(customer_ids)} CUSTOMERS")
    print("=" * 50)

    try:
        predictor = LightGBMPredictor('lightgbm_channel_ranker.pkl')
        predictor.load_customer_data('features_with_channel_labels.csv')
        predictor.load_processed_data('processed_data.csv')
        predictions = predictor.predict_multiple_customers(customer_ids)
        return predictions
    except Exception as e:
        print(f"‚ùå Batch prediction error: {e}")
        return {}

def interactive_prediction():
    """Interactive interface for predicting channel preferences"""
    print("=" * 60)
    print("üîç LIGHTGBM CHANNEL PREFERENCE PREDICTOR")
    print("=" * 60)
    print("üìä Using features_with_channel_labels.csv for predictions")
    print("üìä Using processed_data.csv for age and income band information")
    print("üìà Scores are normalized to 0-1 range using softmax")

    try:
        predictor = LightGBMPredictor('lightgbm_channel_ranker.pkl')
        predictor.load_customer_data('features_with_channel_labels.csv')
        predictor.load_processed_data('processed_data.csv')
    except Exception as e:
        print(f"‚ùå Initialization failed: {e}")
        return

    # Show sample customer IDs
    sample_customers = predictor.df['Customer_id'].head(10).tolist()
    print(f"\nüìã Sample Customer IDs (first 10):")
    for i, cust_id in enumerate(sample_customers, 1):
        print(f"   {i:2d}. {cust_id}")

    while True:
        print("\n" + "-" * 60)
        print("Prediction Options:")
        print("1. üîç Predict for a specific customer")
        print("2. üìä Predict for multiple customers")
        print("3. üåê Predict for all customers")
        print("4. üíæ Save current predictions to CSV")
        print("5. ‚ùå Exit")

        choice = input("\nEnter your choice (1-5): ").strip()

        if choice == '1':
            customer_id = input("Enter Customer ID: ").strip()
            if customer_id:
                result = predict_single_customer(customer_id)
                print(result)

        elif choice == '2':
            customer_ids_input = input("Enter Customer IDs (comma-separated): ").strip()
            customer_ids = [cid.strip() for cid in customer_ids_input.split(',')]

            if customer_ids:
                predictions = batch_predict(customer_ids)
                if predictions:
                    print(predictor.format_predictions(predictions))

        elif choice == '3':
            confirm = input("Predict for ALL customers? This may take time. (y/n): ").strip().lower()
            if confirm in ['y', 'yes']:
                predictions = predictor.predict_multiple_customers()
                print(predictor.format_predictions(predictions))

        elif choice == '4':
            try:
                output_path = input("Enter output file name (default: lightgbm_predictions.csv): ").strip()
                if not output_path:
                    output_path = 'lightgbm_predictions.csv'

                # Get predictions for all customers
                predictions = predictor.predict_multiple_customers()
                saved_file = predictor.save_predictions_to_csv(predictions, output_path)
                if saved_file is not None:
                    print(f"‚úÖ Successfully saved {len(saved_file)} predictions to {output_path}")
            except Exception as e:
                print(f"‚ùå Error saving file: {e}")

        elif choice == '5':
            print("üëã Thank you for using LightGBM Predictor!")
            break

        else:
            print("‚ùå Invalid choice. Please try again.")

# üöÄ MAIN EXECUTION

if __name__ == "__main__":
    print("üöÄ LIGHTGBM CHANNEL PREFERENCE PREDICTION SYSTEM")
    print("=" * 60)
    print("üìä Using features_with_channel_labels.csv for predictions")
    print("üìä Using processed_data.csv for age and income band information")
    print("üìà Scores are normalized to 0-1 range using softmax")

    # Directly launch interactive interface
    interactive_prediction()

üöÄ LIGHTGBM CHANNEL PREFERENCE PREDICTION SYSTEM
üìä Using features_with_channel_labels.csv for predictions
üìä Using processed_data.csv for age and income band information
üìà Scores are normalized to 0-1 range using softmax
üîç LIGHTGBM CHANNEL PREFERENCE PREDICTOR
üìä Using features_with_channel_labels.csv for predictions
üìä Using processed_data.csv for age and income band information
üìà Scores are normalized to 0-1 range using softmax
üîß Initializing LightGBM Predictor...
‚úÖ LightGBM model loaded successfully!
   Channels: ['Call', 'SMS', 'WhatsApp', 'Email', 'IVR', 'Field_Agent']
üìÅ Loading customer data from features_with_channel_labels.csv...
‚úÖ Loaded 100000 customers
‚úÖ Columns in data: 56
üéØ Predicting for all 100000 customers
üìÅ Loading processed data from processed_data.csv...
‚úÖ Loaded 100000 customers from processed data

üìã Sample Customer IDs (first 10):
    1. SCB843421788
    2. SCB998027725
    3. SCB871158951
    4. SCB938686930
    5. SCB84

In [8]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import ndcg_score
from sklearn.preprocessing import LabelEncoder
import joblib
import warnings
warnings.filterwarnings('ignore')

class ChannelRankingDataPreparator:
    """Prepare data for Random Forest Ranker training"""

    def __init__(self):
        self.channels = ['Call', 'SMS', 'WhatsApp', 'Email', 'IVR', 'Field_Agent']
        self.label_encoders = {}
        self.feature_cols = None

    def prepare_ranking_data(self, df):
        """Convert preference data to ranking format"""
        print("Preparing ranking format...")

        # Get feature columns (exclude label columns and Customer_id)
        exclude_cols = ['Customer_id', 'Channel_Preference_Order', 'Preference_Label', 'Top_Channel']
        exclude_cols.extend([col for col in df.columns if 'Prefers_' in col])
        exclude_cols.extend(['Last_Successful_Agent_ID', 'Best_Contact_Agent_IDs'])

        self.feature_cols = [col for col in df.columns if col not in exclude_cols]

        # Encode categorical columns
        X = df[self.feature_cols].copy()
        categorical_cols = X.select_dtypes(include=['object']).columns.tolist()

        if categorical_cols:
            print(f"Encoding {len(categorical_cols)} categorical columns...")
            for col in categorical_cols:
                le = LabelEncoder()
                X[col] = le.fit_transform(X[col].astype(str))
                self.label_encoders[col] = le

        # Create ranking dataset
        ranking_data = []
        group_sizes = []

        print(f"Processing {len(df)} customers...")

        for idx, (_, row) in enumerate(df.iterrows()):
            if (idx + 1) % 10000 == 0:
                print(f"Processed {idx + 1:,} customers")

            customer_id = row['Customer_id']
            preference_order = row['Channel_Preference_Order'].split(',')

            # Get customer features
            customer_features = X.iloc[idx].values

            # Create one sample per channel
            for rank, channel in enumerate(preference_order):
                if channel in self.channels:
                    # Channel one-hot encoding
                    channel_features = np.zeros(len(self.channels))
                    channel_idx = self.channels.index(channel)
                    channel_features[channel_idx] = 1

                    # Combine customer and channel features
                    combined_features = np.concatenate([customer_features, channel_features])

                    # Relevance score as integer
                    relevance = len(self.channels) - rank

                    ranking_data.append({
                        'customer_id': customer_id,
                        'channel': channel,
                        'features': combined_features,
                        'relevance': relevance,
                        'group_id': idx
                    })

            group_sizes.append(len(self.channels))

        # Convert to arrays
        X_ranking = np.array([item['features'] for item in ranking_data])
        y_ranking = np.array([item['relevance'] for item in ranking_data])
        groups = np.array(group_sizes)

        print(f"\nRanking dataset created:")
        print(f"Total samples: {len(X_ranking):,}")
        print(f"Total customers (groups): {len(groups):,}")
        print(f"Features per sample: {X_ranking.shape[1]}")
        print(f"Group sizes (samples per customer): {groups[0]} (all should be {len(self.channels)})")
        print(f"Relevance score range: {y_ranking.min()} to {y_ranking.max()}")

        return X_ranking, y_ranking, groups

def train_random_forest_ranker(X, y, groups, test_size=0.2, random_state=42):
    """Train Random Forest model for ranking"""
    print("\n" + "="*60)
    print("TRAINING RANDOM FOREST RANKER MODEL")
    print("="*60)

    # Calculate split points for groups
    n_train_groups = int(len(groups) * (1 - test_size))
    train_samples = sum(groups[:n_train_groups])

    # Split data
    X_train = X[:train_samples]
    y_train = y[:train_samples]
    groups_train = groups[:n_train_groups]

    X_test = X[train_samples:]
    y_test = y[train_samples:]
    groups_test = groups[n_train_groups:]

    print(f"Feature matrix shape: {X_train.shape}")
    print(f"Label vector shape: {y_train.shape}")
    print(f"Number of groups: {len(groups_train):,}")
    print(f"\nTrain set: {len(X_train):,} samples from {len(groups_train):,} customers")
    print(f"Test set: {len(X_test):,} samples from {len(groups_test):,} customers")

    # Random Forest parameters
    rf_params = {
        'n_estimators': 100,
        'max_depth': 10,
        'min_samples_split': 5,
        'min_samples_leaf': 2,
        'max_features': 'sqrt',
        'bootstrap': True,
        'random_state': random_state,
        'n_jobs': -1,
        'verbose': 0  # Suppress training output
    }

    # Initialize and train Random Forest
    print("\nTraining Random Forest...")
    model = RandomForestRegressor(**rf_params)
    model.fit(X_train, y_train)

    print("Model training completed successfully!")

    # Make predictions
    print("\nMaking predictions...")
    y_pred = model.predict(X_test)

    # Calculate NDCG
    print("Calculating NDCG score...")
    # Reshape predictions for NDCG calculation
    y_test_reshaped = []
    y_pred_reshaped = []

    start_idx = 0
    for group_size in groups_test:
        end_idx = start_idx + group_size
        y_test_reshaped.append(y_test[start_idx:end_idx])
        y_pred_reshaped.append(y_pred[start_idx:end_idx])
        start_idx = end_idx

    ndcg = ndcg_score(y_test_reshaped, y_pred_reshaped)
    print(f"NDCG Score: {ndcg:.4f}")

    return model, ndcg

# Main execution
if __name__ == "__main__":
    print("="*60)
    print("PREPARING CHANNEL RANKING DATA FOR RANDOM FOREST RANKER")
    print("="*60)

    # Load data
    print("Loading features_with_channel_labels.csv...")
    try:
        df = pd.read_csv('features_with_channel_labels.csv')
        print(f"Loaded data shape: {df.shape}")
    except FileNotFoundError:
        print("Error: features_with_channel_labels.csv not found!")
        print("Please run the labeling pipeline first.")
        exit(1)

    # Prepare data
    preparator = ChannelRankingDataPreparator()
    X, y, groups = preparator.prepare_ranking_data(df)

    # Train model
    model, ndcg_score = train_random_forest_ranker(X, y, groups)

    # Save model only (no preparator)
    model_filename = 'random_forest_channel_ranker.pkl'
    joblib.dump(model, model_filename)
    print(f"\nModel saved as '{model_filename}'")

    print("\n" + "="*60)
    print("MODEL TRAINING COMPLETED SUCCESSFULLY!")
    print("="*60)
    print(f"Feature columns used: {X.shape[1]}")
    print(f"Model saved as: {model_filename}")
    print(f"Total customers processed: {len(df):,}")
    print(f"Final NDCG Score: {ndcg_score:.4f}")

PREPARING CHANNEL RANKING DATA FOR RANDOM FOREST RANKER
Loading features_with_channel_labels.csv...
Loaded data shape: (100000, 56)
Preparing ranking format...
Encoding 1 categorical columns...
Processing 100000 customers...
Processed 10,000 customers
Processed 20,000 customers
Processed 30,000 customers
Processed 40,000 customers
Processed 50,000 customers
Processed 60,000 customers
Processed 70,000 customers
Processed 80,000 customers
Processed 90,000 customers
Processed 100,000 customers

Ranking dataset created:
Total samples: 600,000
Total customers (groups): 100,000
Features per sample: 52
Group sizes (samples per customer): 6 (all should be 6)
Relevance score range: 1 to 6

TRAINING RANDOM FOREST RANKER MODEL
Feature matrix shape: (480000, 52)
Label vector shape: (480000,)
Number of groups: 80,000

Train set: 480,000 samples from 80,000 customers
Test set: 120,000 samples from 20,000 customers

Training Random Forest...
Model training completed successfully!

Making predictions.

In [9]:
import pandas as pd
import numpy as np
from sklearn.metrics import ndcg_score
import joblib
import warnings
warnings.filterwarnings('ignore')

class ModelComparison:
    """Compare XGBoost, LightGBM, and Random Forest models based on NDCG scores"""

    def __init__(self):
        self.models = {}
        self.results = {}

    def load_models(self):
        """Load all trained models"""
        print("üìÅ Loading trained models...")

        model_files = {
            'XGBoost': 'xgb_channel_ranker.pkl',
            'LightGBM': 'lightgbm_channel_ranker.pkl',
            'Random Forest': 'random_forest_channel_ranker.pkl'
        }

        for model_name, file_path in model_files.items():
            try:
                self.models[model_name] = joblib.load(file_path)
                print(f"‚úÖ {model_name} loaded successfully")
            except FileNotFoundError:
                print(f"‚ùå {model_name} model file not found: {file_path}")

        return len(self.models) > 0

    def load_test_data(self, data_path='features_with_channel_labels.csv'):
        """Load and prepare test data for evaluation"""
        print("\nüìä Loading test data...")
        self.df = pd.read_csv(data_path)

        # Use the same data preparation as during training
        exclude_cols = ['Customer_id', 'Channel_Preference_Order', 'Preference_Label', 'Top_Channel']
        exclude_cols.extend([col for col in self.df.columns if 'Prefers_' in col])
        exclude_cols.extend(['Last_Successful_Agent_ID', 'Best_Contact_Agent_IDs'])
        self.feature_cols = [col for col in self.df.columns if col not in exclude_cols]

        # Prepare ranking format data
        X, y, groups = self.prepare_ranking_data(self.df)

        # Split for evaluation (use last 20% as test set)
        n_test_groups = int(len(groups) * 0.2)
        test_samples = sum(groups[-n_test_groups:])

        self.X_test = X[-test_samples:]
        self.y_test = y[-test_samples:]
        self.groups_test = groups[-n_test_groups:]

        print(f"üìà Test set: {len(self.X_test):,} samples from {len(self.groups_test):,} customers")
        return True

    def prepare_ranking_data(self, df):
        """Prepare data in ranking format (same as training)"""
        from sklearn.preprocessing import LabelEncoder

        # Encode categorical features
        X = df[self.feature_cols].copy()
        categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
        label_encoders = {}

        for col in categorical_cols:
            le = LabelEncoder()
            X[col] = le.fit_transform(X[col].astype(str))
            label_encoders[col] = le

        # Create ranking dataset
        channels = ['Call', 'SMS', 'WhatsApp', 'Email', 'IVR', 'Field_Agent']
        ranking_data = []
        group_sizes = []

        for idx, (_, row) in enumerate(df.iterrows()):
            preference_order = row['Channel_Preference_Order'].split(',')
            customer_features = X.iloc[idx].values

            for rank, channel in enumerate(preference_order):
                if channel in channels:
                    channel_features = np.zeros(len(channels))
                    channel_idx = channels.index(channel)
                    channel_features[channel_idx] = 1

                    combined_features = np.concatenate([customer_features, channel_features])
                    relevance = len(channels) - rank

                    ranking_data.append({
                        'features': combined_features,
                        'relevance': relevance,
                        'group_id': idx
                    })

            group_sizes.append(len(channels))

        X_ranking = np.array([item['features'] for item in ranking_data])
        y_ranking = np.array([item['relevance'] for item in ranking_data])
        groups = np.array(group_sizes)

        return X_ranking, y_ranking, groups

    def evaluate_models(self):
        """Evaluate all models on test data"""
        print("\nüìà Evaluating models on test data...")

        for model_name, model in self.models.items():
            print(f"\nüîç Evaluating {model_name}...")

            # Make predictions
            y_pred = model.predict(self.X_test)

            # Calculate NDCG
            ndcg = self.calculate_ndcg(self.y_test, y_pred, self.groups_test)

            self.results[model_name] = {
                'ndcg': ndcg,
                'predictions': y_pred
            }

            print(f"   ‚úÖ NDCG: {ndcg:.4f}")

        return self.results

    def calculate_ndcg(self, y_true, y_pred, groups):
        """Calculate NDCG score for ranking evaluation"""
        y_true_reshaped = []
        y_pred_reshaped = []

        start_idx = 0
        for group_size in groups:
            end_idx = start_idx + group_size
            y_true_reshaped.append(y_true[start_idx:end_idx])
            y_pred_reshaped.append(y_pred[start_idx:end_idx])
            start_idx = end_idx

        return ndcg_score(y_true_reshaped, y_pred_reshaped)

    def generate_comparison_report(self):
        """Generate model comparison report sorted by NDCG score"""
        print("\n" + "="*60)
        print("üèÜ MODEL COMPARISON RESULTS")
        print("="*60)

        # Create comparison table
        comparison_data = []
        for model_name, results in self.results.items():
            comparison_data.append({
                'Model': model_name,
                'NDCG Score': results['ndcg']
            })

        # Sort by NDCG score (highest first)
        df_comparison = pd.DataFrame(comparison_data)
        df_comparison = df_comparison.sort_values('NDCG Score', ascending=False)
        df_comparison['Rank'] = range(1, len(df_comparison) + 1)

        print("\n" + df_comparison.to_string(index=False))

        # Winner announcement
        best_model = df_comparison.iloc[0]
        print(f"\nüéØ BEST PERFORMING MODEL: {best_model['Model']}")
        print(f"   ‚Ä¢ NDCG Score: {best_model['NDCG Score']:.4f}")
        print(f"   ‚Ä¢ Rank: #{best_model['Rank']}")

        # Performance gaps
        if len(df_comparison) > 1:
            best_score = best_model['NDCG Score']
            second_best = df_comparison.iloc[1]['NDCG Score']
            gap = best_score - second_best
            print(f"   ‚Ä¢ Performance gap: {gap:.4f} over second best")

        return df_comparison

# Main execution
if __name__ == "__main__":
    print("="*60)
    print("ü§ñ MODEL COMPARISON: XGBoost vs LightGBM vs Random Forest")
    print("="*60)

    # Initialize comparison
    comparator = ModelComparison()

    # Load models
    if not comparator.load_models():
        print("‚ùå Failed to load models. Please ensure all model files exist.")
        exit(1)

    # Load test data
    if not comparator.load_test_data():
        print("‚ùå Failed to load test data.")
        exit(1)

    # Evaluate models
    comparator.evaluate_models()

    # Generate comparison report
    results_df = comparator.generate_comparison_report()

    print("\n" + "="*60)
    print("‚úÖ MODEL COMPARISON COMPLETED!")
    print("="*60)

ü§ñ MODEL COMPARISON: XGBoost vs LightGBM vs Random Forest
üìÅ Loading trained models...
‚úÖ XGBoost loaded successfully
‚úÖ LightGBM loaded successfully
‚úÖ Random Forest loaded successfully

üìä Loading test data...
üìà Test set: 120,000 samples from 20,000 customers

üìà Evaluating models on test data...

üîç Evaluating XGBoost...
   ‚úÖ NDCG: 0.9991

üîç Evaluating LightGBM...
   ‚úÖ NDCG: 0.9998

üîç Evaluating Random Forest...
   ‚úÖ NDCG: 0.9977

üèÜ MODEL COMPARISON RESULTS

        Model  NDCG Score  Rank
     LightGBM    0.999807     1
      XGBoost    0.999092     2
Random Forest    0.997741     3

üéØ BEST PERFORMING MODEL: LightGBM
   ‚Ä¢ NDCG Score: 0.9998
   ‚Ä¢ Rank: #1
   ‚Ä¢ Performance gap: 0.0007 over second best

‚úÖ MODEL COMPARISON COMPLETED!
