# In [10]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random
from collections import defaultdict
import gc

# Configuration
# NOTE(review): neither value is referenced in the visible part of this file.
# Presumably num_records is the total number of rows to generate and
# batch_size the number of rows produced per batch — confirm against the
# generation loop.
num_records = 100000
batch_size = 20000

# Singapore-specific data
# Maps a two-digit district code (as a string, "01".."28") to the list of
# area names belonging to that district.
singapore_districts = {
    "01": ["Raffles Place", "Marina", "People's Park"],
    "02": ["Anson", "Tanjong Pagar"],
    "03": ["Queenstown", "Tiong Bahru"],
    "04": ["Telok Blangah", "HarbourFront"],
    "05": ["Pasir Panjang", "Hong Leong Garden", "Clementi New Town"],
    "06": ["High Street", "Beach Road"],
    "07": ["Middle Road", "Golden Mile"],
    "08": ["Little India", "Farrer Park"],
    "09": ["Orchard", "Cairnhill", "River Valley"],
    "10": ["Ardmore", "Bukit Timah", "Holland Road"],
    "11": ["Watten Estate", "Novena", "Thomson"],
    "12": ["Balestier", "Toa Payoh", "Serangoon"],
    "13": ["Macpherson", "Braddell"],
    "14": ["Geylang", "Eunos"],
    "15": ["Katong", "Joo Chiat", "Amber Road"],
    "16": ["Bedok", "Upper East Coast", "Eastwood", "Kew Drive"],
    "17": ["Loyang", "Changi"],
    "18": ["Tampines", "Pasir Ris"],
    "19": ["Serangoon Garden", "Hougang", "Punggol"],
    "20": ["Bishan", "Ang Mo Kio"],
    "21": ["Upper Bukit Timah", "Clementi Park", "Ulu Pandan"],
    "22": ["Jurong"],
    "23": ["Hillview", "Dairy Farm", "Bukit Panjang", "Choa Chu Kang"],
    "24": ["Lim Chu Kang", "Tengah"],
    "25": ["Kranji", "Woodgrove"],
    "26": ["Upper Thomson", "Springleaf"],
    "27": ["Yishun", "Sembawang"],
    "28": ["Seletar"]
}

# Pool of street names, stored as a NumPy array.
# NOTE(review): not referenced in the visible part of the file; presumably
# sampled when building addresses — confirm.
street_names = np.array([
    "Orchard Road", "North Bridge Road", "South Bridge Road", "Victoria Street",
    "Beach Road", "Serangoon Road", "Havelock Road", "Outram Road",
    "Cecil Street", "Robinson Road", "Maxwell Road", "Telok Ayer Street",
    "Bras Basah Road", "Bugis Street", "Chinatown Street", "Clarke Quay"
])

# Name pools, one first-name and one last-name array per ethnic group.
# generate_singaporean_names() draws a first and a last name from the same
# group's pair of arrays.
chinese_first_names = np.array(["Wei", "Jie", "Ming", "Li", "Xin", "Hui", "Yan", "Feng", "Jun", "Ling"])
chinese_last_names = np.array(["Tan", "Lim", "Lee", "Ng", "Ong", "Wong", "Chua", "Chan", "Koh", "Teo"])
malay_first_names = np.array(["Ahmad", "Mohamed", "Siti", "Abdullah", "Fatimah", "Ali", "Rahman", "Zainal", "Nor", "Hassan"])
# Malay "last names" are patronymic suffixes ("bin ..." / "binti ...").
malay_last_names = np.array(["bin Ahmad", "binti Mohamed", "bin Ismail", "binti Abdullah", "bin Ali", "binti Hassan"])
indian_first_names = np.array(["Raj", "Kumar", "Suresh", "Priya", "Latha", "Arjun", "Divya", "Vijay", "Anand", "Deepa"])
indian_last_names = np.array(["Kumar", "Devi", "Singh", "Rao", "Patel", "Menon", "Pillai", "Sharma", "Subramaniam", "Gopal"])
western_first_names = np.array(["John", "David", "Mary", "Sarah", "Michael", "Jennifer", "James", "Linda", "Robert", "Elizabeth"])
western_last_names = np.array(["Smith", "Johnson", "Williams", "Brown", "Jones", "Miller", "Davis", "Garcia", "Rodriguez", "Wilson"])

# Pool of e-mail domains.
# NOTE(review): not referenced in the visible part of the file; presumably
# sampled when building e-mail addresses — confirm.
email_domains = np.array(["gmail.com", "yahoo.com", "hotmail.com", "outlook.com", "singnet.com.sg"])
# Loan products; generate_loan_data() samples from this array and uses each
# value as the key into the two tables below.
product_types = np.array(["Personal loan", "Auto loan", "Credit card", "Education loan", "Business loan"])
# Annual interest rate in percent per product (calculate_emis_vectorized()
# divides by 1200 to obtain the monthly fractional rate).
interest_rates = {"Personal loan": 8.5, "Auto loan": 9.0, "Credit card": 10.0, "Education loan": 8.0, "Business loan": 9.5}
# (minimum, maximum) loan amount per product; both ends are selectable in
# generate_loan_data().
loan_amount_limits = {
    "Personal loan": (5000, 100000), "Auto loan": (5000, 200000), "Credit card": (5000, 50000),
    "Education loan": (5000, 100000), "Business loan": (5000, 500000)
}

# Income band label -> (minimum, maximum) ANNUAL income for that band;
# calculate_monthly_income() samples inside the range and divides by 12.
income_bands = {
    "50,000 or Below": (25000, 50000),
    "50,000 to 100,000": (50000, 100000),
    "100,000 to 200,000": (100000, 200000),
    "200,000 to 300,000": (200000, 300000),
    "300,000 to 500,000": (300000, 500000),
    "500,000 or Above": (500000, 2000000)
}

# Pool of city / town names. The first five entries are the ones
# determine_smartphone_penetration() treats as "urban".
singapore_cities = np.array([
    "Singapore City", "Jurong East", "Tampines", "Woodlands", "Bedok", "Sengkang",
    "Hougang", "Yishun", "Ang Mo Kio", "Bukit Merah", "Bukit Batok", "Pasir Ris",
    "Clementi", "Bishan", "Toa Payoh", "Serangoon", "Queenstown", "Punggol", "Kallang", "Bukit Panjang"
])

# 200 agent identifiers, "SCB_AG_0001" through "SCB_AG_0200"; sampled by
# generate_contact_success_data().
agent_ids = np.array([f"SCB_AG_{i:04d}" for i in range(1, 201)])

# Enhanced Customer ID generation
def generate_customer_id():
    """Build one customer ID of the form ``"SCB"`` followed by nine digits.

    The first digit is 8 or 9 and no digit ever occurs three times in a row.
    Uniqueness across calls is NOT enforced here; a caller that needs
    distinct IDs has to de-duplicate the results itself.

    Returns:
        str: the 12-character customer ID.
    """
    every_digit = [str(d) for d in range(10)]

    while True:
        sequence = [str(np.random.choice([8, 9]))]

        # Append the remaining eight digits one at a time.
        for _ in range(8):
            tail_repeats = len(sequence) >= 2 and sequence[-1] == sequence[-2]
            if tail_repeats:
                # A third copy of the doubled digit is not allowed.
                pool = [d for d in every_digit if d != sequence[-1]]
            else:
                pool = every_digit
            sequence.append(np.random.choice(pool))

        # Defensive re-check for any run of three identical digits.
        triple_found = any(
            a == b == c
            for a, b, c in zip(sequence, sequence[1:], sequence[2:])
        )
        if triple_found:
            continue

        return "SCB" + ''.join(sequence)

# Pre-generate unique customer IDs to ensure uniqueness
def generate_unique_customer_ids(n):
    """Generate ``n`` distinct customer IDs.

    IDs are drawn from ``generate_customer_id`` until ``n`` different ones
    have been collected; duplicates are discarded.  The result keeps the
    order in which the IDs were first generated, so a seeded NumPy RNG gives
    the same list on every run.  (Converting a ``set`` of strings to a list
    would order them by their per-process randomized hashes.)

    Args:
        n: Number of unique IDs wanted.  Values <= 0 yield an empty list.

    Returns:
        list[str]: ``n`` IDs with no repeated entries.
    """
    # A dict keeps insertion order and still gives O(1) duplicate detection.
    seen = {}
    while len(seen) < n:
        seen[generate_customer_id()] = None
    return list(seen)

# Enhanced Age-Occupation distribution with reduced extreme stress
def generate_age_occupation_distribution(n):
    """Draw ``n`` (age, occupation) pairs from a group-based distribution.

    Each record first gets an age group, then an age inside that group's
    range and an occupation from that group's weighted occupation list.
    Students older than 25 and retirees younger than 55 are mostly turned
    into "Employed"; a small share is kept.

    Args:
        n: Number of pairs to generate.

    Returns:
        tuple: ``(ages, occupations)`` as two NumPy arrays of length ``n``.
    """
    # group -> (lowest age, highest age + 1, occupations, probabilities)
    profiles = {
        "young_adult": (
            18, 30,
            ["Student", "Employed", "Employed", "Self-Employed", "Unemployed"],
            [0.30, 0.50, 0.10, 0.05, 0.05],
        ),
        "adult": (
            30, 45,
            ["Employed", "Self-Employed", "Homemaker", "Unemployed"],
            [0.78, 0.15, 0.05, 0.02],
        ),
        "middle_aged": (
            45, 60,
            ["Employed", "Self-Employed", "Retired", "Homemaker", "Unemployed"],
            [0.70, 0.20, 0.05, 0.04, 0.01],
        ),
        "senior": (
            60, 76,
            ["Retired", "Employed", "Self-Employed", "Homemaker", "Unemployed"],
            [0.75, 0.15, 0.05, 0.04, 0.01],
        ),
    }
    group_names = ["young_adult", "adult", "middle_aged", "senior"]
    group_weights = [0.25, 0.40, 0.25, 0.10]

    age_list, job_list = [], []
    for _ in range(n):
        group = np.random.choice(group_names, p=group_weights)
        youngest, oldest_exclusive, jobs, job_weights = profiles[group]
        years = np.random.randint(youngest, oldest_exclusive)
        job = np.random.choice(jobs, p=job_weights)

        # Plausibility rules: the two cases are mutually exclusive.
        if job == "Student" and years > 25:
            if np.random.random() < 0.02:
                years = min(years, 35)
            else:
                job = "Employed"
        elif job == "Retired" and years < 55:
            if np.random.random() < 0.03:
                years = max(years, 50)
            else:
                job = "Employed"

        age_list.append(years)
        job_list.append(job)

    return np.array(age_list), np.array(job_list)

# Enhanced financial stress score with realistic logic
def calculate_financial_stress_score(monthly_income, total_spend, savings_ratio, payment_history,
                                   missed_payments, employment_status, age, occupation, debt_to_income_ratio):
    """Score financial stress on a 0-100 scale and derive related labels.

    Five component scores (each between 0 and 1) are combined with fixed
    weights: spend-to-income 30%, savings ratio 25%, payment history 20%,
    missed payments 15%, debt-to-income 10%.  The weighted value is then
    scaled by 1.3 for unemployed customers, else 0.9 for students, else 1.2
    for customers older than 60 (capped at 1.0), and multiplied by 100.

    Returns:
        dict with keys ``finance_stress_status``, ``financial_health_status``,
        ``financial_stress_score`` (rounded to 2 decimals),
        ``avg_balance_trend`` and ``overdraft_flag``.
    """

    def _score_for_ceiling(value, steps):
        """Return the score of the first (ceiling, score) step with value <= ceiling, else 1.0."""
        for ceiling, score in steps:
            if value <= ceiling:
                return score
        return 1.0

    # A non-positive income is treated as spending twice the income.
    if monthly_income > 0:
        spend_income_ratio = total_spend / monthly_income
    else:
        spend_income_ratio = 2.0

    spend_component = _score_for_ceiling(
        spend_income_ratio, ((0.6, 0.1), (0.8, 0.3), (1.0, 0.6), (1.2, 0.8))
    )

    # Savings work the other way round: a higher ratio means less stress.
    savings_component = 1.0
    for floor, score in ((0.2, 0.1), (0.1, 0.3), (0, 0.6), (-0.1, 0.8)):
        if savings_ratio >= floor:
            savings_component = score
            break

    history_component = 1 - payment_history
    missed_component = min(1.0, missed_payments * 0.2)
    debt_component = _score_for_ceiling(
        debt_to_income_ratio, ((0.3, 0.1), (0.5, 0.3), (0.7, 0.6), (1.0, 0.8))
    )

    weighted = sum(
        component * weight
        for component, weight in (
            (spend_component, 0.30),
            (savings_component, 0.25),
            (history_component, 0.20),
            (missed_component, 0.15),
            (debt_component, 0.10),
        )
    )

    # Only the first matching adjustment applies.
    if employment_status == "Unemployed":
        multiplier = 1.3
    elif occupation == "Student":
        multiplier = 0.9
    elif age > 60:
        multiplier = 1.2
    else:
        multiplier = None
    if multiplier is not None:
        weighted = min(1.0, weighted * multiplier)

    score_0_to_100 = weighted * 100

    # Map the score onto the four stress / health labels.
    stress_label, health_label = "Extreme High stress", "Critical"
    for limit, stress_name, health_name in (
        (25, "Low stress", "Healthy"),
        (50, "Medium stress", "Moderate"),
        (75, "High stress", "Stressed"),
    ):
        if score_0_to_100 <= limit:
            stress_label, health_label = stress_name, health_name
            break

    if savings_ratio > 0.15:
        trend = "Rising"
    elif savings_ratio < -0.08:
        trend = "Falling"
    else:
        trend = "Stable"

    overdrawn = (savings_ratio < -0.12 or spend_income_ratio > 1.25)

    return {
        "finance_stress_status": stress_label,
        "financial_health_status": health_label,
        "financial_stress_score": round(score_0_to_100, 2),
        "avg_balance_trend": trend,
        "overdraft_flag": overdrawn
    }

# Enhanced AAR score calculation with proper distribution
def calculate_aar_score(upi_count, debit_count, credit_count, cash_count, recurring_count,
                       total_transactions, monthly_income, total_spend, age, occupation):
    """Draw a synthetic AAR score and classify it into a risk level.

    NOTE: the score is sampled purely at random so that the population shows
    a fixed split (about 88% low, 10% medium, 2% high).  None of the
    arguments influence the result; they are kept only so existing callers
    keep working.  An earlier version computed a transaction-based score
    from them and then unconditionally overwrote it with the random draw;
    that dead computation has been removed.

    Returns:
        dict with ``aar_score`` (float rounded to 6 decimals, between 0.3
        and 0.8) and ``aar_risk_level`` ("Low" for scores <= 0.5, "Medium"
        for scores <= 0.65, otherwise "High").
    """
    # Pick the band first, then a uniform score inside that band.
    band_draw = np.random.random()
    if band_draw <= 0.88:
        aar_score = np.random.uniform(0.3, 0.5)
    elif band_draw <= 0.98:
        aar_score = np.random.uniform(0.5, 0.65)
    else:
        aar_score = np.random.uniform(0.65, 0.75)

    # With 0.5% probability nudge the score up by 0.05, capped at 0.8.
    if np.random.random() < 0.005:
        aar_score = min(0.8, aar_score + 0.05)

    aar_score = round(aar_score, 6)

    if aar_score <= 0.5:
        aar_risk = "Low"
    elif aar_score <= 0.65:
        aar_risk = "Medium"
    else:
        aar_risk = "High"

    return {
        "aar_score": aar_score,
        "aar_risk_level": aar_risk
    }

# Enhanced payment behavior with reduced negative savings
def generate_enhanced_payment_behavior(monthly_income, total_spend, age, employment_status, occupation):
    """Generate per-channel transaction counts and related payment metrics.

    Transaction counts are drawn around ``total_spend / 80`` and skewed by
    age and occupation.  The savings ratio is drawn from a range chosen by
    employment status, occupation and age; 1.5% of records get an extreme
    value instead.

    Args:
        monthly_income: Monthly income.  A value <= 0 is tolerated: the
            high-value transaction count is then reported as 0 instead of
            dividing by zero.
        total_spend: Total monthly spend.
        age: Customer age in years.
        employment_status: Employment status label; "Unemployed" is special-cased.
        occupation: Occupation label; "Student", "Retired", "Employed" and
            "Self-Employed" are special-cased.

    Returns:
        dict with keys ``upi_count``, ``debit_count``, ``credit_count``,
        ``cash_count``, ``recurring_count``, ``preferred_channel``,
        ``recurring_ratio``, ``savings_ratio``, ``spend_growth``,
        ``high_value_count`` and ``total_transactions``.
    """
    base_intensity = total_spend / 80

    # Age skews the channel mix: younger towards UPI, older towards cash/credit.
    if age < 25:
        upi_factor, credit_factor, cash_factor = 1.4, 0.7, 0.8
    elif age > 55:
        upi_factor, credit_factor, cash_factor = 0.6, 1.2, 1.4
    else:
        upi_factor, credit_factor, cash_factor = 1.0, 1.0, 1.0

    # Occupation adjustments stack on top of the age factors.
    if occupation == "Student":
        upi_factor *= 1.3
        credit_factor *= 0.6
    elif occupation == "Retired":
        upi_factor *= 0.7
        cash_factor *= 1.3

    # Each channel has a hard minimum so no count is ever zero.
    upi_count = max(5, int(np.random.normal(base_intensity * 0.4 * upi_factor, 8)))
    debit_count = max(3, int(np.random.normal(base_intensity * 0.3, 5)))
    credit_count = max(2, int(np.random.normal(base_intensity * 0.2 * credit_factor, 4)))
    cash_count = max(1, int(np.random.normal(base_intensity * 0.1 * cash_factor, 3)))

    # Recurring transactions
    if occupation in ["Employed", "Self-Employed"] or age > 35:
        recurring_count = np.random.randint(6, 13)
    else:
        recurring_count = np.random.randint(3, 9)

    # Preferred channel is the one with the most transactions (first wins on ties).
    channels = ["UPI", "Debit Card", "Credit Card", "Cash"]
    preferred_channel = channels[np.argmax([upi_count, debit_count, credit_count, cash_count])]

    # Savings ratio range; only the first matching rule applies.
    if employment_status == "Unemployed":
        savings_ratio = np.random.uniform(-0.15, 0.05)
    elif occupation == "Student":
        savings_ratio = np.random.uniform(-0.08, 0.1)
    elif age > 50:
        savings_ratio = np.random.uniform(0.15, 0.35)
    elif age < 30:
        savings_ratio = np.random.uniform(0.05, 0.20)
    else:
        savings_ratio = np.random.uniform(0.08, 0.25)

    # 1.5% of records get an extreme value (strongly negative or very high).
    if np.random.random() < 0.015:
        savings_ratio = np.random.choice([
            np.random.uniform(-0.3, -0.15),
            np.random.uniform(0.5, 0.7)
        ])

    # Other metrics
    total_transactions = upi_count + debit_count + credit_count + cash_count
    recurring_ratio = recurring_count / total_transactions if total_transactions > 0 else 0.1

    if employment_status == "Unemployed":
        spend_growth = np.random.normal(-0.05, 0.12)
    else:
        spend_growth = np.random.normal(0.04, 0.12)

    # A transaction counts as "high value" above 7% of monthly income.  With
    # no income the threshold is zero and the rate would divide by zero, so
    # report no high-value transactions in that case.
    high_value_threshold = monthly_income * 0.07
    if high_value_threshold > 0:
        high_value_count = max(0, int(np.random.poisson(total_spend / high_value_threshold * 0.08)))
    else:
        high_value_count = 0

    return {
        "upi_count": upi_count,
        "debit_count": debit_count,
        "credit_count": credit_count,
        "cash_count": cash_count,
        "recurring_count": recurring_count,
        "preferred_channel": preferred_channel,
        "recurring_ratio": round(recurring_ratio, 2),
        "savings_ratio": round(savings_ratio, 2),
        "spend_growth": round(spend_growth, 2),
        "high_value_count": high_value_count,
        "total_transactions": total_transactions
    }

# FIXED: Enhanced spending profile with corrected normal distribution
def generate_enhanced_spend_profile(monthly_income, employment_status, age, occupation):
    """Generate a monthly spending breakdown for one customer.

    A demographic-specific split over six categories is jittered by +/-20%
    per category and re-normalised.  Total spend is ``monthly_income`` times
    a spend ratio whose range depends on employment status and occupation
    (3% of records get an extreme ratio).  Each category amount then gets a
    further +/-15% variation and a floor of 50.

    Returns:
        dict with the six ``*_spend`` amounts, ``monthly_spend_trend``,
        ``seasonal_spend_variation``, ``weekend_spend_ratio``,
        ``festive_spend``, ``total_monthly_spend`` (sum of the category
        amounts) and ``spend_ratio``; all values rounded to 2 decimals.
    """
    categories = ("utility", "shopping", "entertainment", "health", "education", "travel")

    # Baseline share per category, in the order of `categories`.
    if occupation == "Student":
        shares = (0.12, 0.25, 0.25, 0.08, 0.20, 0.10)
    elif occupation == "Retired":
        shares = (0.18, 0.22, 0.12, 0.25, 0.05, 0.18)
    elif age < 30:
        shares = (0.10, 0.28, 0.20, 0.08, 0.12, 0.22)
    elif age > 50:
        shares = (0.15, 0.25, 0.15, 0.15, 0.08, 0.22)
    else:
        shares = (0.12, 0.30, 0.15, 0.12, 0.08, 0.23)

    # Per-customer jitter, then rescale so the shares sum to 1 again.
    jittered = {
        name: share * np.random.uniform(0.8, 1.2)
        for name, share in zip(categories, shares)
    }
    share_sum = sum(jittered.values())
    normalised = {name: value / share_sum for name, value in jittered.items()}

    # Fraction of income that gets spent.
    if employment_status == "Unemployed":
        spend_ratio = np.random.uniform(0.8, 1.2)
    elif occupation == "Student":
        spend_ratio = np.random.uniform(0.8, 1.1)
    else:
        spend_ratio = np.random.uniform(0.5, 0.8)

    # 3% of records are outliers: very frugal or clearly overspending.
    if np.random.random() < 0.03:
        spend_ratio = np.random.choice([
            np.random.uniform(0.3, 0.5),
            np.random.uniform(1.1, 1.5)
        ])

    total_monthly_spend = monthly_income * spend_ratio

    amounts = {
        name: max(50, total_monthly_spend * share * np.random.uniform(0.85, 1.15))
        for name, share in normalised.items()
    }
    spent_in_total = sum(amounts.values())

    trend = max(0, np.random.normal(total_monthly_spend, total_monthly_spend * 0.12))

    # Draw the remaining random metrics in a fixed order.
    seasonal = np.random.uniform(0.08, 0.35)
    weekend = np.random.uniform(0.22, 0.48)
    festive = total_monthly_spend * np.random.uniform(1.15, 2.0)

    profile = {f"{name}_spend": round(amounts[name], 2) for name in categories}
    profile["monthly_spend_trend"] = round(trend, 2)
    profile["seasonal_spend_variation"] = round(seasonal, 2)
    profile["weekend_spend_ratio"] = round(weekend, 2)
    profile["festive_spend"] = round(festive, 2)
    profile["total_monthly_spend"] = round(spent_in_total, 2)
    profile["spend_ratio"] = round(spend_ratio, 2)
    return profile

# Proper outstanding balance calculation using amortization formula
def calculate_outstanding_balance(loan_amount, interest_rate, tenure, months_paid, months_missed, current_emi):
    """Calculate the outstanding loan balance from the amortization schedule.

    Args:
        loan_amount: Original principal.
        interest_rate: Annual interest rate in percent.  A rate of 0 is
            supported and treated as straight-line repayment.
        tenure: Loan term in months.
        months_paid: Number of instalments already paid.
        months_missed: Number of missed instalments; each adds one EMI
            compounded at the monthly rate.
        current_emi: Unused; kept so existing callers keep working.

    Returns:
        The outstanding amount.  Once ``months_paid`` reaches the tenure a
        residual of 1% of the loan amount is returned.  The result is capped
        at the loan amount compounded over the full tenure.
    """
    monthly_rate = interest_rate / 12 / 100
    total_months = tenure

    if months_paid >= total_months:
        return loan_amount * 0.01  # Small residual amount

    remaining_months = total_months - months_paid

    if monthly_rate == 0:
        # The annuity formula divides by zero at 0% interest; without
        # interest every instalment is simply an equal slice of principal.
        original_emi = loan_amount / total_months
        outstanding_principal = original_emi * remaining_months
    else:
        growth = (1 + monthly_rate) ** total_months
        original_emi = loan_amount * monthly_rate * growth / (growth - 1)
        # Present value of the instalments that are still to be paid.
        outstanding_principal = original_emi * (1 - (1 + monthly_rate) ** (-remaining_months)) / monthly_rate

    # The k-th missed instalment is carried forward with k months of interest.
    missed_penalty = sum(
        original_emi * (1 + monthly_rate) ** (k + 1) for k in range(months_missed)
    )

    total_outstanding = outstanding_principal + missed_penalty

    # Never report more than the principal compounded over the whole term.
    max_possible = loan_amount * (1 + monthly_rate) ** total_months
    return min(total_outstanding, max_possible)

# FIXED: Proper date calculations - convert numpy types to Python native types
def calculate_loan_dates(tenure, months_completed, days_past_due):
    """Derive loan start, last payment and next due dates relative to now.

    The start date lies ``30 * months_completed + days_past_due`` days
    before the current time.  The next instalment is due
    ``30 * months_completed`` days after the start.  If at least one month
    has been completed, the last payment falls 0-9 days (random) after the
    start of the most recent completed 30-day period.

    Args:
        tenure: Loan term in months; coerced to ``int`` but not used in the
            date arithmetic.
        months_completed: Number of completed months (NumPy ints accepted).
        days_past_due: Days the current instalment is overdue.

    Returns:
        tuple: ``(loan_start, last_payment, next_due)``.  ``loan_start`` and
        ``next_due`` are ``date`` objects; ``last_payment`` is a ``datetime``
        or ``None`` when no month has been completed.
    """
    # timedelta needs built-in ints, so coerce possible NumPy scalars first.
    int(tenure)
    paid_months = int(months_completed)
    overdue_days = int(days_past_due)

    start = datetime.now() - timedelta(days=30 * paid_months + overdue_days)

    last_payment = None
    if paid_months > 0:
        offset_days = 30 * (paid_months - 1) + np.random.randint(0, 10)
        last_payment = start + timedelta(days=int(offset_days))

    next_due = start + timedelta(days=30 * paid_months)

    return start.date(), last_payment, next_due.date()

# ENHANCED: Contact success data with guaranteed successful contacts and realistic distribution
def generate_contact_success_data(call_attempts, payment_history, employment_status, age, occupation, financial_stress_score):
    """Generate contact-outcome data for one customer.

    A success rate is drawn from a range chosen by payer quality, scaled by
    age, occupation and stress level, perturbed with noise and clipped to
    [0.3, 0.95].  At least one successful contact is always reported.  The
    successful contacts are split across a random pool of 3-7 agents, and
    the three agents with the most interactions are reported as the best
    agents.

    Args:
        call_attempts: Number of call attempts; must be non-zero because the
            achieved success rate divides by it.
        payment_history: On-time payment share between 0 and 1.
        employment_status: Employment status label.
        age: Customer age in years.
        occupation: Occupation label.
        financial_stress_score: Stress score on the 0-100 scale.

    Returns:
        dict with keys ``successful_contacts``, ``contact_success_rate``,
        ``last_successful_agent``, ``best_agents`` (comma-separated),
        ``avg_times_best_agents`` (comma-separated) and
        ``best_agent_interaction_count``.
    """
    # Starting range for the reach rate, by payer quality.
    if payment_history > 0.8 and employment_status == "Employed":
        low, high = 0.6, 0.9
    elif payment_history > 0.6:
        low, high = 0.4, 0.7
    else:
        low, high = 0.3, 0.6
    reach_rate = np.random.uniform(low, high)

    # Older customers answer more often, younger ones less.
    if age > 60:
        reach_rate *= 1.3
    elif age < 30:
        reach_rate *= 0.8

    # Working customers are a little harder to reach, retirees much easier.
    if occupation in ("Employed", "Self-Employed"):
        reach_rate *= 0.9
    elif occupation == "Retired":
        reach_rate *= 1.4

    # Highly stressed customers are harder to reach, relaxed ones easier.
    if financial_stress_score > 70:
        reach_rate *= 0.7
    elif financial_stress_score < 30:
        reach_rate *= 1.2

    success_rate = np.clip(np.random.normal(reach_rate, 0.1), 0.3, 0.95)

    # Never report zero successful contacts; the rate is then recomputed
    # from the count that is actually reported.
    successful_contacts = max(1, int(call_attempts * success_rate))
    actual_success_rate = successful_contacts / call_attempts

    # Random pool of agents who handled this customer.
    pool_size = min(len(agent_ids), np.random.randint(3, 8))
    agent_pool = np.random.choice(agent_ids, pool_size, replace=False)

    interactions_by_agent = {}
    if successful_contacts > 0:
        shares = np.random.dirichlet(np.ones(len(agent_pool)))
        counts = (shares * successful_contacts).astype(int)

        # Truncation loses a few contacts; hand them out to random agents.
        while sum(counts) < successful_contacts:
            counts[np.random.randint(0, len(counts))] += 1

        for agent, count in zip(agent_pool, counts):
            if count > 0:
                interactions_by_agent[agent] = count

    # Rank agents by interaction count, highest first, and keep the top three.
    ranked = sorted(interactions_by_agent.items(), key=lambda pair: pair[1], reverse=True)
    best_agents = [agent for agent, _ in ranked[:3]]

    # The expected call duration depends only on the customer, not the agent.
    base_duration = 3 + (actual_success_rate * 10)
    if financial_stress_score > 70:
        base_duration += 5
    elif financial_stress_score < 30:
        base_duration -= 1

    avg_times = [round(np.random.normal(base_duration, 2), 2) for _ in best_agents]

    # Fall back to a random agent so the field is never empty.
    last_successful_agent = best_agents[0] if best_agents else np.random.choice(agent_ids)

    best_agent_interaction_count = ranked[0][1] if ranked else successful_contacts

    return {
        "successful_contacts": successful_contacts,
        "contact_success_rate": round(actual_success_rate, 2),
        "last_successful_agent": last_successful_agent,
        "best_agents": ",".join(best_agents) if best_agents else last_successful_agent,
        "avg_times_best_agents": ",".join(map(str, avg_times)) if avg_times else str(round(np.random.uniform(3, 10), 2)),
        "best_agent_interaction_count": best_agent_interaction_count
    }

# ENHANCED: Missed payments calculation with very few zeros (less than 20000)
def calculate_missed_payments(months_completed, on_time_percentage, age, occupation, employment_status):
    """Draw a missed-payment count for one customer.

    About 17% of calls return 0 outright.  Otherwise a miss rate is chosen
    by employment status / occupation, scaled by age, and turned into an
    expected count; random variation is added and the result is kept
    between 1 and ``months_completed``.

    Returns:
        int-like count of missed payments, never above ``months_completed``.
    """
    # Share of customers that should have no missed payment at all.
    zero_share = 0.17
    if np.random.random() < zero_share:
        return 0

    if employment_status == "Unemployed":
        miss_rate = 0.4
    elif occupation == "Student":
        miss_rate = 0.3
    else:
        miss_rate = 0.2

    # Older customers miss fewer payments, the youngest miss more.
    if age > 50:
        miss_rate *= 0.6
    elif age < 25:
        miss_rate *= 1.4

    expected = int((1 - on_time_percentage) * months_completed * miss_rate)

    # Poisson noise upwards, and a coin flip that may knock one off.
    upward_noise = np.random.poisson(expected * 0.4)
    knock_off = np.random.randint(0, 2)
    at_least_one = max(1, expected + upward_noise - knock_off)

    return min(at_least_one, months_completed)

# Enhanced settlement history based on communication and payment behavior
def determine_settlement_history(missed_payments, successful_contacts, contact_success_rate,
                               payment_history, financial_stress_score):
    """Pick a settlement status from payment and contact behaviour.

    Rules are checked in order and the first match wins; when none applies
    the status is drawn at random (40% "Partial Settlement", 40%
    "Not Settled", 20% "Under Negotiation").

    Returns:
        One of "Settled", "Under Negotiation", "Partial Settlement" or
        "Not Settled".
    """
    if missed_payments == 0:
        return "Settled"

    if payment_history > 0.7 and successful_contacts > 2:
        return "Under Negotiation"

    if payment_history > 0.5 and contact_success_rate > 0.3:
        return "Partial Settlement"

    if financial_stress_score > 70 and successful_contacts == 0:
        return "Not Settled"

    fallback_statuses = ["Partial Settlement", "Not Settled", "Under Negotiation"]
    return np.random.choice(fallback_statuses, p=[0.4, 0.4, 0.2])

# Enhanced smartphone penetration logic
def determine_smartphone_penetration(age, occupation, monthly_income, city):
    """Classify smartphone penetration as "High", "Medium" or "Low".

    The level starts from the age band and is then overridden in turn by
    occupation and income; customers in one of five urban cities are finally
    moved up one level.  Later rules replace earlier ones.

    Returns:
        str: "High", "Medium" or "Low".
    """
    # Starting point: under 40 is High, under 60 Medium, otherwise Low.
    if age < 40:
        level = "High"
    elif age < 60:
        level = "Medium"
    else:
        level = "Low"

    # Occupation overrides the age-based level.
    if occupation in ("Student", "Employed", "Self-Employed"):
        level = "High"
    elif occupation == "Retired" and monthly_income > 5000:
        level = "Medium"
    elif occupation == "Unemployed":
        level = "Low"

    # Income extremes override everything decided so far.
    if monthly_income < 2000:
        level = "Low"
    elif monthly_income > 8000:
        level = "High"

    # Urban residents move up exactly one level; "High" stays "High".
    urban = ("Singapore City", "Jurong East", "Tampines", "Woodlands", "Bedok")
    if city in urban:
        one_step_up = {"Low": "Medium", "Medium": "High"}
        level = one_step_up.get(level, level)

    return level

# Enhanced preferred channel logic
def determine_preferred_channel(age, occupation, smartphone_penetration, upi_count,
                              debit_count, credit_count, cash_count):
    """Pick the preferred communication channel for a customer.

    The decision uses the age band, the share of non-cash transactions and
    the smartphone penetration level.  ``occupation`` is accepted but not
    used.

    Returns:
        str: "WhatsApp", "SMS" or "Call".
    """
    digital = upi_count + debit_count + credit_count
    overall = digital + cash_count
    digital_share = digital / overall if overall > 0 else 0

    # Under 30: WhatsApp for heavy digital users with good phones, else SMS.
    if age < 30:
        heavy_digital = digital_share > 0.7 and smartphone_penetration == "High"
        return "WhatsApp" if heavy_digital else "SMS"

    # 30-49: mostly-digital customers get a call, the rest an SMS.
    if age < 50:
        return "Call" if digital_share > 0.5 else "SMS"

    # 50 and over: SMS only with high smartphone penetration.
    return "SMS" if smartphone_penetration == "High" else "Call"

# Flight risk calculation
def calculate_flight_risk(financial_stress_score, payment_history, missed_payments, employment_status, age):
    """Calculate a flight-risk score that never exceeds 1.

    Four weighted parts are summed: stress score (40%), poor payment history
    (30%), missed payments (20%) and low engagement (10%).  Customers under
    30 get a 1.2 multiplier and those over 60 a 0.8 multiplier.  2% of
    records are replaced by a random extreme value (0.7-0.9 or 0.05-0.15).

    Args:
        financial_stress_score: Stress score on the 0-100 scale.
        payment_history: On-time payment share between 0 and 1.
        missed_payments: Number of missed payments.
        employment_status: Unused; kept so existing callers keep working.
        age: Customer age in years.

    Returns:
        float: flight risk rounded to 2 decimals.
    """
    financial_factors = financial_stress_score / 100 * 0.4
    behavioral_factors = (1 - payment_history) * 0.3
    stability_factors = min(1, missed_payments * 0.1) * 0.2
    engagement_factors = (1 - min(1, payment_history)) * 0.1

    flight_risk = min(1.0, financial_factors + behavioral_factors + stability_factors + engagement_factors)

    # Age adjustments to flight risk
    if age < 30:
        flight_risk *= 1.2
    elif age > 60:
        flight_risk *= 0.8

    # The 1.2 multiplier could push the value up to 1.2, so cap it again.
    flight_risk = min(1.0, flight_risk)

    # A small share of records gets an extreme value regardless of inputs.
    if np.random.random() < 0.02:
        flight_risk = np.random.choice([np.random.uniform(0.7, 0.9), np.random.uniform(0.05, 0.15)])

    return round(flight_risk, 2)

# Other helper functions
def generate_singaporean_names(n):
    """Generate ``n`` full names following a fixed ethnic mix.

    Ethnicities are drawn with probabilities 74% Chinese, 13% Malay,
    9% Indian and 4% Western; each name pairs a random first name with a
    random last name from that group's pools.

    Returns:
        list[str]: names formatted as ``"<first> <last>"``.
    """
    ethnicities = np.random.choice(["Chinese", "Malay", "Indian", "Western"], n, p=[0.74, 0.13, 0.09, 0.04])

    pools = {
        "Chinese": (chinese_first_names, chinese_last_names),
        "Malay": (malay_first_names, malay_last_names),
        "Indian": (indian_first_names, indian_last_names),
    }
    # Anything not listed above uses the Western pools.
    fallback = (western_first_names, western_last_names)

    full_names = []
    for ethnicity in ethnicities:
        first_pool, last_pool = pools.get(ethnicity, fallback)
        first = np.random.choice(first_pool)
        last = np.random.choice(last_pool)
        full_names.append(f"{first} {last}")

    return full_names

def generate_singapore_mobile_numbers(n, valid_ratio=0.92):
    """Generate ``n`` mobile numbers, some deliberately malformed.

    A valid number looks like ``"+65 8123 4567"`` (leading 8 or 9, eight
    digits in two groups of four).  Each record is valid with probability
    ``valid_ratio``; the others get one of three defects picked at random:
    one digit short, a leading 6 or 7, or one digit too many.

    Args:
        n: Number of phone numbers to generate.
        valid_ratio: Probability that a number is well-formed.

    Returns:
        tuple: ``(mobile_numbers, valid_mask)`` where ``mobile_numbers`` is a
        list of strings and ``valid_mask`` a boolean NumPy array marking the
        well-formed entries.
    """
    lead_digits = np.random.choice(['8', '9'], n)
    tails = [f"{value:07d}" for value in np.random.randint(0, 10000000, n)]
    valid_mask = np.random.random(n) < valid_ratio

    formatted = []
    for lead, tail, is_valid in zip(lead_digits, tails, valid_mask):
        if is_valid:
            formatted.append(f"+65 {lead}{tail[:3]} {tail[3:]}")
            continue

        # Build all three malformed variants, then keep one of them.
        broken_variants = [
            f"+65 {lead}{tail[:6]}",
            f"+65 {np.random.choice(['6', '7'])}{tail}",
            f"+65 {lead}{tail}{np.random.randint(0,10)}",
        ]
        formatted.append(np.random.choice(broken_variants))

    return formatted, valid_mask

def generate_loan_data(n):
    """Draw product, amount, rate, tenure and EMI for ``n`` loans.

    Products are sampled from ``product_types`` and tenures from
    12/24/36/48/60 months. Each amount is picked from the product's
    ``loan_amount_limits`` range on a 5000 grid for "Auto loan" and
    "Business loan" and on a 1000 grid otherwise; the rate is looked up in
    ``interest_rates``.

    Args:
        n: Number of loans to generate.

    Returns:
        ``(product_choices, loan_amounts, interest_rates_arr, tenures, emis)``:
        the product array, the list of amounts, the list of rates, the tenure
        array and the EMI array from ``calculate_emis_vectorized``.
    """
    coarse_grid_products = ("Auto loan", "Business loan")

    def _draw_amount(product):
        lower, upper = loan_amount_limits[product]
        increment = 5000 if product in coarse_grid_products else 1000
        return np.random.choice(range(lower, upper + 1, increment))

    product_choices = np.random.choice(product_types, n)
    tenures = np.random.choice([12, 24, 36, 48, 60], n)

    # Amounts consume one random draw per loan, in product order.
    loan_amounts = [_draw_amount(product) for product in product_choices]
    interest_rates_arr = [interest_rates[product] for product in product_choices]

    emis = calculate_emis_vectorized(
        np.array(loan_amounts), np.array(interest_rates_arr), tenures
    )

    return product_choices, loan_amounts, interest_rates_arr, tenures, emis

def calculate_emis_vectorized(principals, rates, tenures):
    """Compute equated monthly instalments (EMIs) for many loans at once.

    Uses the standard amortisation formula
    ``P * r * (1 + r)**n / ((1 + r)**n - 1)`` where ``r`` is the monthly rate
    (``rate / 1200``, i.e. ``rates`` are annual percentages) and ``n`` the
    tenure in months. Loans with a zero rate, for which that formula is 0/0,
    are repaid in equal parts: ``P / n``.

    Args:
        principals: Loan amounts (array-like or scalar).
        rates: Annual interest rates in percent (array-like or scalar).
        tenures: Loan tenures in months (array-like or scalar).

    Returns:
        The EMIs rounded to 2 decimals, broadcast over the inputs.
    """
    principals = np.asarray(principals, dtype=float)
    monthly_rates = np.asarray(rates, dtype=float) / 1200
    tenures = np.asarray(tenures)

    growth = (1 + monthly_rates) ** tenures
    zero_rate = monthly_rates == 0

    # Swap in a harmless denominator for zero-rate loans so the amortisation
    # formula never evaluates 0/0; those entries are replaced just below.
    denominator = np.where(zero_rate, 1.0, growth - 1)
    amortised = principals * monthly_rates * growth / denominator

    emis = np.where(zero_rate, principals / tenures, amortised)
    return np.round(emis, 2)

def calculate_monthly_income(income_band):
    """Sample a monthly income for a single income band.

    The band's bounds come from ``income_bands`` and are treated as annual
    figures: an annual value is sampled, clamped to the band, divided by 12
    and rounded to 2 decimals.

    Sampling depends on the band:
      * "50,000 or Below"  -> Beta(2, 5) scaled to the band (skewed low)
      * "500,000 or Above" -> Beta(5, 2) scaled to the band (skewed high)
      * any other band     -> normal centred on the band midpoint with a
        standard deviation of one sixth of the band width
    """
    lower, upper = income_bands[income_band]
    spread = upper - lower

    # Beta shape parameters for the two open-ended bands.
    skewed_shapes = {
        "50,000 or Below": (2, 5),
        "500,000 or Above": (5, 2),
    }
    shape = skewed_shapes.get(income_band)

    if shape is not None:
        annual = lower + spread * random.betavariate(*shape)
    else:
        annual = random.normalvariate((lower + upper) / 2, spread / 6)

    # Keep the sampled value inside the band before converting to monthly.
    bounded = max(lower, min(upper, annual))
    return round(bounded / 12, 2)

def calculate_monthly_incomes(income_bands_arr):
    """Return a NumPy array with one sampled monthly income per income band.

    Each element of ``income_bands_arr`` is passed, in order, to
    ``calculate_monthly_income``.
    """
    return np.array([calculate_monthly_income(band) for band in income_bands_arr])

def adjust_emi_for_delinquency(base_emi, days_past_due):
    """Apply a late-payment surcharge to an EMI based on days past due.

    Surcharge tiers (upper bound inclusive):
      <= 0 days  -> no change (``base_emi`` returned as-is, not rounded)
      1-15 days  -> +2%
      16-30 days -> +5%
      31-60 days -> +8%
      61-90 days -> +12%
      > 90 days  -> +15%

    Surcharged values are rounded to 2 decimals.
    """
    if days_past_due <= 0:
        return base_emi

    surcharge_tiers = ((15, 1.02), (30, 1.05), (60, 1.08), (90, 1.12))
    multiplier = 1.15  # applies once every tier limit has been exceeded
    for max_days, tier_multiplier in surcharge_tiers:
        if days_past_due <= max_days:
            multiplier = tier_multiplier
            break

    return round(base_emi * multiplier, 2)

def calculate_credit_score(payment_history, missed_payments, delinquency, partial_payments):
    """Derive a credit score in the 300-850 range.

    Starts from 650 and applies, in order:
      * ``(payment_history - 0.5) * 200``
      * a penalty of 15 per missed payment, capped at 100
      * -50 when ``delinquency`` is truthy
      * -20 when ``partial_payments`` is truthy

    The total is truncated to an integer and clamped to [300, 850].
    """
    adjustments = (
        (payment_history - 0.5) * 200,
        -min(missed_payments * 15, 100),
        -50 if delinquency else 0,
        -20 if partial_payments else 0,
    )

    running_total = 650
    for delta in adjustments:
        running_total = running_total + delta

    truncated = int(running_total)
    if truncated > 850:
        return 850
    if truncated < 300:
        return 300
    return truncated

# Enhanced response outcome based on communication
def determine_response_outcome(successful_contacts, contact_success_rate, payment_history, financial_stress_score):
    """Map communication and payment signals to a response outcome label.

    Decision order:
      1. No successful contacts          -> "Disconnected"
      2. Success rate above 0.7          -> "Connected" when payment history
         is above 0.8, otherwise "Promised to pay"
      3. Success rate above 0.4          -> "Partial paid" when the stress
         score is above 60, otherwise "Ignored"
      4. Anything lower                  -> random pick: "Ignored" (60%) or
         "Disconnected" (40%)
    """
    if successful_contacts == 0:
        return "Disconnected"

    if contact_success_rate > 0.7:
        return "Connected" if payment_history > 0.8 else "Promised to pay"

    if contact_success_rate > 0.4:
        return "Partial paid" if financial_stress_score > 60 else "Ignored"

    return np.random.choice(["Ignored", "Disconnected"], p=[0.6, 0.4])

# Enhanced channel used based on recent contact patterns
def determine_channel_used(successful_contacts, preferred_channel, age):
    """Pick the channel used for the most recent contact.

    Args:
        successful_contacts: Number of successful contacts so far.
        preferred_channel: The customer's preferred channel label.
        age: Customer age in years.

    Returns:
        A channel label drawn with ``np.random.choice``.
    """
    if successful_contacts == 0:
        # Nothing has worked yet: the preferred channel is not consulted and
        # an age-based default mix is used instead.
        if age < 40:
            return np.random.choice(["Call", "SMS", "WhatsApp"], p=[0.5, 0.3, 0.2])
        return np.random.choice(["Call", "SMS"], p=[0.6, 0.4])

    # Otherwise pick (uniformly) between the preferred channel and one
    # closely related alternative.
    channel_variants = {
        "Call": ["Call", "IVR"],
        "SMS": ["SMS", "WhatsApp"],
        "WhatsApp": ["WhatsApp", "SMS"],
        "Email": ["Email", "SMS"],
        "Field Agent": ["Field Agent", "Call"]
    }

    # Preferences without an entry above (e.g. "IVR") are treated as "Call".
    primary_channel = preferred_channel if preferred_channel in channel_variants else "Call"
    return np.random.choice(channel_variants[primary_channel])

# Main batch generation function
def generate_batch_data(batch_size, customer_ids_batch):
    """Generate a batch of synthetic customer/loan records.

    Per-batch attributes (demographics, phone numbers, loan terms, payment
    behaviour) are generated first; then one record is assembled per customer
    by combining them with the per-customer helper calculations (spend
    profile, financial stress, AAR score, flight risk, contact success, ...).

    Args:
        batch_size: Number of records to generate.
        customer_ids_batch: Pre-generated customer IDs, indexable by position;
            must provide at least ``batch_size`` entries.

    Returns:
        A list of ``batch_size`` dicts, one per customer, keyed by output
        column name.
    """

    # Generate realistic age-occupation pairs
    # NOTE(review): ``ages`` is used in array arithmetic below (``ages - 18``),
    # so the helper is assumed to return a NumPy array - confirm.
    ages, occupations = generate_age_occupation_distribution(batch_size)
    employment_status = np.array(["Employed" if occ in ["Employed", "Self-Employed"] else "Unemployed" for occ in occupations])

    # Income bands
    income_bands_arr = np.random.choice(list(income_bands.keys()), batch_size, p=[0.20, 0.35, 0.25, 0.12, 0.05, 0.03])
    monthly_incomes = calculate_monthly_incomes(income_bands_arr)

    # Generate names and contact info
    genders_arr = np.random.choice(["Male", "Female", "Others"], batch_size, p=[0.48, 0.48, 0.04])
    full_names = generate_singaporean_names(batch_size)
    first_names = [name.split()[0] for name in full_names]
    last_names = [name.split()[1] if ' ' in name else '' for name in full_names]

    # Communication data
    mobile_numbers, valid_primary_mask = generate_singapore_mobile_numbers(batch_size)
    has_secondary = np.random.random(batch_size) > 0.15
    secondary_numbers, valid_secondary_mask = generate_singapore_mobile_numbers(batch_size, 0.75)
    # Set empty secondary numbers to "NULL"
    secondary_numbers = [num if has_sec else "NULL" for num, has_sec in zip(secondary_numbers, has_secondary)]
    valid_secondary_mask = [mask if has_sec else False for mask, has_sec in zip(valid_secondary_mask, has_secondary)]

    # Count valid/invalid numbers per customer as plain Python ints
    no_of_valid_numbers = []
    no_of_invalid_numbers = []
    for i in range(batch_size):
        valid_count = int(sum([valid_primary_mask[i], valid_secondary_mask[i]])) if has_secondary[i] else int(valid_primary_mask[i])
        no_of_valid_numbers.append(valid_count)
        # Customers own 2 numbers when a secondary exists, otherwise 1
        invalid_count = (2 if has_secondary[i] else 1) - valid_count
        no_of_invalid_numbers.append(int(invalid_count))

    # Communication completion and changes
    communication_completion = []
    mobile_change_count = []
    address_change_count = []

    for i in range(batch_size):
        age = int(ages[i])  # Convert to native Python int
        occupation = occupations[i]

        if occupation in ["Employed", "Self-Employed"]:
            base_completion = 0.9
        elif occupation == "Student":
            base_completion = 0.85
        else:
            base_completion = 0.75

        if age > 60:
            base_completion -= 0.1

        # Completion is noisy around the base value but kept within [0.7, 1.0]
        completion = np.clip(np.random.normal(base_completion, 0.08), 0.7, 1.0)
        communication_completion.append(round(completion, 2))

        if occupation == "Student" or age < 25:
            changes = np.random.poisson(2.5)
        elif occupation == "Unemployed":
            changes = np.random.poisson(1.5)
        else:
            changes = np.random.poisson(0.8)

        mobile_change_count.append(max(0, int(changes)))

        if occupation == "Student" or (age < 30 and occupation == "Employed"):
            address_changes = np.random.poisson(1.2)
        elif occupation == "Unemployed":
            address_changes = np.random.poisson(0.8)
        else:
            address_changes = np.random.poisson(0.3)

        address_change_count.append(max(0, int(address_changes)))

    communication_completion = np.array(communication_completion)
    mobile_change_count = np.array(mobile_change_count)
    # "This year" counts can never exceed the lifetime counts
    mobile_change_this_year = np.minimum(mobile_change_count, np.random.randint(0, 3, batch_size))
    address_change_count = np.array(address_change_count)
    address_change_this_year = np.minimum(address_change_count, np.random.randint(0, 2, batch_size))

    # Contact change frequency: total changes per year since age 18 (minimum 5 years)
    customer_age_years = np.maximum(5, ages - 18)
    total_changes = mobile_change_count + address_change_count
    contact_change_frequency = np.round(total_changes / customer_age_years, 2)

    # Generate loan data
    product_types_arr, loan_amounts, interest_rates_arr, tenures, base_emis = generate_loan_data(batch_size)

    # Payment behavior: on-time share drawn from an occupation-specific Beta distribution
    on_time_percentages = []
    for i in range(batch_size):
        if occupations[i] in ["Employed", "Self-Employed"]:
            base_payment = np.random.beta(4, 2)
        elif occupations[i] == "Student":
            base_payment = np.random.beta(3, 3)
        else:
            base_payment = np.random.beta(2, 4)

        if ages[i] > 40:
            base_payment = min(1.0, base_payment * 1.1)
        elif ages[i] < 25:
            base_payment = max(0.1, base_payment * 0.9)

        on_time_percentages.append(base_payment)

    on_time_percentages = np.array(on_time_percentages)
    payment_frequencies = ["Regular" if p >= 0.8 else "Irregular" for p in on_time_percentages]

    # Loan dates and status
    months_completed = []
    days_past_due_arr = []

    for i in range(batch_size):
        # Months completed is always strictly less than the tenure
        max_possible_months = int(tenures[i]) - 1  # Convert to native int
        if on_time_percentages[i] > 0.8:
            months_completed.append(min(max_possible_months, np.random.randint(1, max_possible_months + 1)))
            days_past_due = 0 if np.random.random() > 0.1 else np.random.randint(1, 16)
        elif on_time_percentages[i] > 0.6:
            months_completed.append(min(max_possible_months, np.random.randint(1, max(2, max_possible_months))))
            days_past_due = np.random.randint(1, 31) if np.random.random() > 0.3 else 0
        else:
            months_completed.append(min(max_possible_months, np.random.randint(0, max(1, max_possible_months // 2))))
            days_past_due = np.random.randint(15, 91) if np.random.random() > 0.1 else np.random.randint(1, 16)

        days_past_due_arr.append(days_past_due)

    months_completed = np.array(months_completed)
    days_past_due_arr = np.array(days_past_due_arr)

    # Generate batch data
    batch_data = []
    for i in range(batch_size):
        # Use pre-generated customer ID
        customer_id = customer_ids_batch[i]

        # Convert to native Python types for calculations
        age = int(ages[i])
        tenure = int(tenures[i])
        months_completed_int = int(months_completed[i])
        days_past_due = int(days_past_due_arr[i])
        monthly_income = float(monthly_incomes[i])
        loan_amount = float(loan_amounts[i])
        interest_rate = float(interest_rates_arr[i])
        base_emi_val = float(base_emis[i])

        # Loan-specific calculations
        current_emi = adjust_emi_for_delinquency(base_emi_val, days_past_due)

        # Missed payments calculation
        missed_payments_count = calculate_missed_payments(
            months_completed_int, on_time_percentages[i], age, occupations[i], employment_status[i]
        )

        # Enhanced spending profile
        spend_profile = generate_enhanced_spend_profile(
            monthly_income, employment_status[i], age, occupations[i]
        )

        # Enhanced payment behavior
        payment_behavior = generate_enhanced_payment_behavior(
            monthly_income, spend_profile["total_monthly_spend"],
            age, employment_status[i], occupations[i]
        )

        # Calculate debt-to-income ratio for financial stress
        # (EMI plus 30% of monthly spend, relative to income; 1.0 when income is not positive)
        debt_to_income_ratio = (current_emi + spend_profile["total_monthly_spend"] * 0.3) / monthly_income if monthly_income > 0 else 1.0

        # Enhanced financial stress score
        financial_health = calculate_financial_stress_score(
            monthly_income, spend_profile["total_monthly_spend"],
            payment_behavior["savings_ratio"], on_time_percentages[i],
            missed_payments_count, employment_status[i], age, occupations[i], debt_to_income_ratio
        )

        # Calculate AAR score
        aar_data = calculate_aar_score(
            payment_behavior["upi_count"], payment_behavior["debit_count"],
            payment_behavior["credit_count"], payment_behavior["cash_count"],
            payment_behavior["recurring_count"], payment_behavior["total_transactions"],
            monthly_income, spend_profile["total_monthly_spend"], age, occupations[i]
        )

        # Calculate flight risk
        flight_risk = calculate_flight_risk(
            financial_health["financial_stress_score"], on_time_percentages[i],
            missed_payments_count, employment_status[i], age
        )

        # Calculate outstanding balance
        outstanding_balance = calculate_outstanding_balance(
            loan_amount, interest_rate, tenure,
            months_completed_int, missed_payments_count, current_emi
        )

        # Calculate loan dates
        loan_start_date, last_payment_date, installment_due_date = calculate_loan_dates(
            tenure, months_completed_int, days_past_due
        )

        # Set last payment date to "NULL" if no payments made
        last_payment_date_str = "NULL" if last_payment_date is None else last_payment_date.date()

        # Contact success data; at least 3 call attempts per customer
        call_attempts = max(3, int(np.random.poisson(5)))
        contact_success = generate_contact_success_data(
            call_attempts, on_time_percentages[i], employment_status[i], age,
            occupations[i], financial_health["financial_stress_score"]
        )

        # Enhanced settlement history
        settlement_history = determine_settlement_history(
            missed_payments_count, contact_success["successful_contacts"],
            contact_success["contact_success_rate"], on_time_percentages[i],
            financial_health["financial_stress_score"]
        )

        # Credit score. The delinquency and partial-payment flags are drawn
        # once and reused in the record below, so the stored flags are the
        # same ones the score was computed from.
        delinquency = np.random.random() < 0.15
        partial_payment = np.random.random() < 0.15
        credit_score = calculate_credit_score(
            on_time_percentages[i], missed_payments_count, delinquency,
            partial_payment
        )

        # Enhanced smartphone penetration
        city = np.random.choice(singapore_cities)
        smartphone_penetration = determine_smartphone_penetration(
            age, occupations[i], monthly_income, city
        )

        # Enhanced preferred channel
        preferred_channel = determine_preferred_channel(
            age, occupations[i], smartphone_penetration,
            payment_behavior["upi_count"], payment_behavior["debit_count"],
            payment_behavior["credit_count"], payment_behavior["cash_count"]
        )

        # Enhanced response outcome and channel used
        response_outcome = determine_response_outcome(
            contact_success["successful_contacts"], contact_success["contact_success_rate"],
            on_time_percentages[i], financial_health["financial_stress_score"]
        )

        channel_used = determine_channel_used(
            contact_success["successful_contacts"], preferred_channel, age
        )

        # Communication history columns, scaled from the successful contact count
        successful_contacts_count = contact_success["successful_contacts"]
        contact_history_call_attempts = int(successful_contacts_count * np.random.uniform(1.2, 2.0))
        contact_history_sms = int(successful_contacts_count * np.random.uniform(0.8, 1.5))
        contact_history_whatsapp = int(successful_contacts_count * np.random.uniform(0.5, 1.2))
        contact_history_emaillogs = int(successful_contacts_count * np.random.uniform(0.3, 0.8))
        no_of_attempts = contact_history_call_attempts + contact_history_sms + contact_history_whatsapp + contact_history_emaillogs
        average_handling_time = round(np.random.uniform(3, 15), 2) if successful_contacts_count > 0 else 0

        # Additional communication fields
        app_login_frequency = np.random.poisson(12)
        online_banking_activity = np.random.poisson(10)
        call_sms_activity_patterns = np.random.choice(["Low", "Medium", "High"])
        whatsapp_ott_usage_indicator = np.random.choice([True, False], p=[0.8, 0.2])
        recent_score_change = np.random.randint(-40, 41)
        unemployeement_rate_region = round(np.random.uniform(1.8, 2.2), 2)
        inflation_rate = round(np.random.uniform(3.5, 5.0), 2)
        interest_rate_trend = round(np.random.uniform(-0.1, 0.3), 2)
        economic_stress_index = round(np.random.uniform(0.1, 0.3), 2)
        do_not_call_registry_data = np.random.choice([True, False], p=[0.15, 0.85])
        regional_time_restrictions = np.random.choice(["Morning", "Afternoon", "Evening", "Night"])
        communication_complaince_limits = np.random.choice(["Daytime", "Evening", "Weekdays", "Weekends", "Holidays"])

        record = {
            # Basic loan information
            "Customer_id": customer_id,
            "Loan_Account_id": int(np.random.randint(10000000, 99999999)),
            "Product_Type": product_types_arr[i],
            "Loan_Amount_SGD": round(loan_amount, 2),
            "Outstanding_Balance_SGD": round(outstanding_balance, 2),
            "Day_Past_Due": days_past_due,
            "Tenure": tenure,
            "Interest_Rate": interest_rate,
            "Current_EMI_SGD": current_emi,
            "Installment_Due_Date": installment_due_date,
            "Last_Payment_Date": last_payment_date_str,  # "NULL" for empty cells

            # Payment behavior
            "Partial_Payment_Indicator": partial_payment,  # same flag used for the credit score
            "Number_of_Past_Payments": months_completed_int,
            "Payment_Frequency": payment_frequencies[i],
            "Amount_Paid_Each_Month_SGD": base_emi_val,
            "Missed_Payments_Count": missed_payments_count,
            "Settlement_History": settlement_history,  # Based on communication data
            "Repayment_Irregularity_Flags": on_time_percentages[i] < 0.7,

            # Customer profile
            "Customer_Employment_Status": employment_status[i],

            # Communication data
            "Communication_Data_Completion": communication_completion[i],
            "Valid_Phone_Number": valid_primary_mask[i],
            "No_of_Valid_Numbers": no_of_valid_numbers[i],
            "No_of_Invalid_Numbers": no_of_invalid_numbers[i],
            "Mobile_Number_Change_Count": int(mobile_change_count[i]),
            "Mobile_Number_Change_Count_This_Year": int(mobile_change_this_year[i]),
            "Address_Change_Count": int(address_change_count[i]),
            "Address_Change_Count_This_Year": int(address_change_this_year[i]),
            "Contact_Data_Change_Frequency": contact_change_frequency[i],

            # Spend analysis
            "Finance_Stress_Status": financial_health["finance_stress_status"],
            "Utility_Spend_SGD": spend_profile["utility_spend"],
            "Shopping_Spend_SGD": spend_profile["shopping_spend"],
            "Entertainment_Spend_SGD": spend_profile["entertainment_spend"],
            "Health_Spend_SGD": spend_profile["health_spend"],
            "Education_Spend_SGD": spend_profile["education_spend"],
            "Travel_Spend_SGD": spend_profile["travel_spend"],
            "Monthly_Spend_Trend_SGD": spend_profile["monthly_spend_trend"],
            "Seasonal_Spend_Variation": spend_profile["seasonal_spend_variation"],
            "Weekend_Spend_Ratio": spend_profile["weekend_spend_ratio"],
            "Festive_Season_Spend_SGD": spend_profile["festive_spend"],
            "Total_Monthly_Spend_SGD": spend_profile["total_monthly_spend"],
            "Spend_to_Income_Ratio": spend_profile["spend_ratio"],

            # Payment behavior
            "UPI_Transaction_Count": payment_behavior["upi_count"],
            "Debit_Card_Transaction_Count": payment_behavior["debit_count"],
            "Credit_Card_Transaction_Count": payment_behavior["credit_count"],
            "Cash_Withdrawal_Count": payment_behavior["cash_count"],
            "Recurring_Transaction_Count": payment_behavior["recurring_count"],
            "Preferred_Payment_Channel": payment_behavior["preferred_channel"],
            "Recurring_Payment_Ratio": payment_behavior["recurring_ratio"],
            "Savings_to_Spend_Ratio": payment_behavior["savings_ratio"],
            "Spend_Growth_Rate_YoY": payment_behavior["spend_growth"],
            "High_Value_Transaction_Count": payment_behavior["high_value_count"],

            # Financial health
            "Flight_Risk_Score": flight_risk,
            "Financial_Health_Status": financial_health["financial_health_status"],
            "Financial_Stress_Score": financial_health["financial_stress_score"],
            "Avg_Balance_Trends": financial_health["avg_balance_trend"],
            "Overdraft_or_Low_Balance_Flag": financial_health["overdraft_flag"],
            "AAR_Score": aar_data["aar_score"],
            "AAR_Risk_Level": aar_data["aar_risk_level"],

            # Contact success
            "Successful_Contacts_Count": contact_success["successful_contacts"],
            "Contact_Success_Rate": contact_success["contact_success_rate"],
            "Last_Successful_Agent_ID": contact_success["last_successful_agent"],
            "Best_Contact_Agent_IDs": contact_success["best_agents"],
            "Avg_Time_With_Best_Agents_Min": contact_success["avg_times_best_agents"],
            "Customer_Best_Agent_Interaction_Count": contact_success["best_agent_interaction_count"],

            # Customer demographics
            "Name": full_names[i],
            "Age": age,
            "Occupation": occupations[i],
            "Gender": genders_arr[i],
            "Primary_Phone_Number": mobile_numbers[i],
            "Secondary_Mobile_Number": secondary_numbers[i],  # "NULL" for empty cells
            "Landline_Phone_Number": f"+65 6{np.random.randint(1000000, 9999999):07d}",
            "Email_ID": f"{first_names[i].lower()}.{last_names[i].lower()}@{np.random.choice(email_domains)}",
            "Income_Band_SGD": income_bands_arr[i],
            "Monthly_Income_SGD": monthly_income,
            # NOTE(review): drawn independently of Customer_Employment_Status,
            # so the two columns can disagree - confirm this is intended.
            "Employeement_Type": np.random.choice(["Full time", "Part time", "Contract", "Freelance", "Unemployed"]),
            "Address": f"{np.random.randint(1, 999)} {np.random.choice(street_names)} #{np.random.randint(1, 50):02d}-{np.random.randint(1, 99):02d} Singapore {np.random.randint(100000, 999999)}",
            "City": city,
            "Language_Preference": np.random.choice(["English", "Regional"], p=[0.7, 0.3]),
            "Mobile_Number_Active_Status": valid_primary_mask[i],
            "Email_Activity": np.random.random() < 0.8,

            # Additional fields
            "Credit_Score": credit_score,
            "Recent_Inquiries": int(np.random.poisson(2)),
            "Loan_Exposure_Across_Banks": int(np.random.poisson(1)),
            "Delinquency_on_other_Loans": delinquency,

            # Communication history (created in batch generation)
            "Contact_History_Call_Attempts": contact_history_call_attempts,
            "Contact_History_SMS": contact_history_sms,
            "Contact_History_WhatsApp": contact_history_whatsapp,
            "Contact_History_EmailLogs": contact_history_emaillogs,
            "No_of_Attempts": no_of_attempts,
            "Average_Handling_Time": average_handling_time,
            "Channel_used": channel_used,
            "Response_Outcome": response_outcome,
            "App_Login_Frequency": app_login_frequency,
            "Online_Banking_Activity": online_banking_activity,
            "Smartphone_Penetration": smartphone_penetration,
            "Preferred_Channel": preferred_channel,
            "Call_SMS_Activity_Patterns": call_sms_activity_patterns,
            "WhatsApp_OTT_usage_Indicator": whatsapp_ott_usage_indicator,
            "Recent_Score_Change": recent_score_change,
            "Unemployeement_rate_region": unemployeement_rate_region,
            "Inflation_Rate": inflation_rate,
            "Interest_Rate_Trend": interest_rate_trend,
            "Economic_Stress_Index": economic_stress_index,
            "Do_Not_Call_Registry_Data": do_not_call_registry_data,
            "Regional_Time_Restrictions": regional_time_restrictions,
            "Communication_Complaince_Limits": communication_complaince_limits,
        }
        batch_data.append(record)

    return batch_data

# Main execution
def main():
    """Generate the full synthetic dataset and save it as a gzip-compressed CSV.

    Produces ``num_records`` records in batches of ``batch_size`` via
    ``generate_batch_data``, coerces the count columns to integers, validates
    basic loan constraints, rounds float columns and writes the result to
    ``singapore_loan_data.gz``. Progress and a timing summary are printed.

    Raises:
        AssertionError: If the generated data violates a loan constraint
            (amount range, positive outstanding balance, allowed tenures,
            interest-rate range).
    """
    print(f"Generating {num_records:,} customer records")

    # Pre-generate unique customer IDs
    print("Generating unique Customer IDs...")
    customer_ids = generate_unique_customer_ids(num_records)
    print(f"Generated {len(customer_ids):,} unique Customer IDs")

    all_data = []
    start_time = datetime.now()
    total_batches = (num_records + batch_size - 1) // batch_size

    # Process in batches
    for batch_index, batch_start in enumerate(range(0, num_records, batch_size)):
        current_batch_size = min(batch_size, num_records - batch_start)
        print(f"Processing batch {batch_index + 1}/{total_batches} "
              f"({current_batch_size} records)...")

        # Get customer IDs for this batch
        batch_customer_ids = customer_ids[batch_start:batch_start + current_batch_size]
        all_data.extend(generate_batch_data(current_batch_size, batch_customer_ids))

        # Periodically reclaim memory held by per-batch temporaries
        if batch_index % 5 == 0:
            gc.collect()

    # Create DataFrame
    df = pd.DataFrame(all_data)

    # Convert all count columns to integers
    integer_columns = [
        'No_of_Valid_Numbers', 'No_of_Invalid_Numbers', 'Mobile_Number_Change_Count',
        'Mobile_Number_Change_Count_This_Year', 'Address_Change_Count', 'Address_Change_Count_This_Year',
        'UPI_Transaction_Count', 'Debit_Card_Transaction_Count', 'Credit_Card_Transaction_Count',
        'Cash_Withdrawal_Count', 'Recurring_Transaction_Count', 'High_Value_Transaction_Count',
        'Successful_Contacts_Count', 'Customer_Best_Agent_Interaction_Count', 'Contact_History_Call_Attempts',
        'Contact_History_SMS', 'Contact_History_WhatsApp', 'Contact_History_EmailLogs', 'No_of_Attempts',
        'App_Login_Frequency', 'Online_Banking_Activity', 'Recent_Score_Change', 'Recent_Inquiries',
        'Loan_Exposure_Across_Banks', 'Missed_Payments_Count', 'Number_of_Past_Payments', 'Tenure',
        'Day_Past_Due', 'Age', 'Credit_Score'
    ]

    for col in integer_columns:
        if col in df.columns:
            df[col] = df[col].astype(int)

    # Validate data. Explicit raises (rather than ``assert`` statements) keep
    # the checks active even when Python runs with -O.
    print("Validating data constraints...")
    constraint_checks = {
        "Loan_Amount_SGD must be within [5000, 500000]": df['Loan_Amount_SGD'].between(5000, 500000),
        "Outstanding_Balance_SGD must be positive": df['Outstanding_Balance_SGD'] > 0,
        "Tenure must be one of 12/24/36/48/60": df['Tenure'].isin([12, 24, 36, 48, 60]),
        "Interest_Rate must be within [8.0, 10.0]": df['Interest_Rate'].between(8.0, 10.0),
    }
    for description, passed in constraint_checks.items():
        if not passed.all():
            raise AssertionError(
                f"Data validation failed: {description} ({int((~passed).sum())} offending rows)"
            )

    # Round float columns to 2 decimals. AAR_Score is left out here because
    # it keeps 6 decimals; rounding it to 2 first would discard that precision.
    float_columns = [
        col for col in df.select_dtypes(include=['float64']).columns
        if col != 'AAR_Score'
    ]
    df[float_columns] = df[float_columns].round(2)

    # Ensure AAR_Score has 6 decimal places
    df['AAR_Score'] = df['AAR_Score'].apply(lambda x: round(float(x), 6))

    # Save to gzip-compressed CSV
    output_file = 'singapore_loan_data.gz'
    print(f"Saving to {output_file}...")
    df.to_csv(output_file, index=False, compression='gzip')

    end_time = datetime.now()
    duration = (end_time - start_time).total_seconds()
    # Guard against a zero-length interval on very small runs
    records_per_second = len(df) / duration if duration > 0 else float('inf')

    print("\n=== GENERATION COMPLETE ===")
    print(f"Generated {len(df):,} customer records")
    print(f"Time taken: {duration:.2f} seconds")
    print(f"Records per second: {records_per_second:.0f}")
    print(f"File saved: {output_file}")

# Entry point: run the generator only when this code is executed directly
# (as a script or notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()

Generating 100,000 customer records
Generating unique Customer IDs...
Generated 100,000 unique Customer IDs
Processing batch 1/5 (20000 records)...
Processing batch 2/5 (20000 records)...
Processing batch 3/5 (20000 records)...
Processing batch 4/5 (20000 records)...
Processing batch 5/5 (20000 records)...
Validating data constraints...
Saving to singapore_loan_data.gz...

=== GENERATION COMPLETE ===
Generated 100,000 customer records
Time taken: 119.33 seconds
Records per second: 838
File saved: singapore_loan_data.gz
