In [None]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

# Configuration
num_records = 10000  # Number of records to generate

# Singapore-specific data
singapore_districts = {
    "01": ["Raffles Place", "Marina", "People's Park"],
    "02": ["Anson", "Tanjong Pagar"],
    "03": ["Queenstown", "Tiong Bahru"],
    "04": ["Telok Blangah", "HarbourFront"],
    "05": ["Pasir Panjang", "Hong Leong Garden", "Clementi New Town"],
    "06": ["High Street", "Beach Road"],
    "07": ["Middle Road", "Golden Mile"],
    "08": ["Little India", "Farrer Park"],
    "09": ["Orchard", "Cairnhill", "River Valley"],
    "10": ["Ardmore", "Bukit Timah", "Holland Road"],
    "11": ["Watten Estate", "Novena", "Thomson"],
    "12": ["Balestier", "Toa Payoh", "Serangoon"],
    "13": ["Macpherson", "Braddell"],
    "14": ["Geylang", "Eunos"],
    "15": ["Katong", "Joo Chiat", "Amber Road"],
    "16": ["Bedok", "Upper East Coast", "Eastwood", "Kew Drive"],
    "17": ["Loyang", "Changi"],
    "18": ["Tampines", "Pasir Ris"],
    "19": ["Serangoon Garden", "Hougang", "Punggol"],
    "20": ["Bishan", "Ang Mo Kio"],
    "21": ["Upper Bukit Timah", "Clementi Park", "Ulu Pandan"],
    "22": ["Jurong"],
    "23": ["Hillview", "Dairy Farm", "Bukit Panjang", "Choa Chu Kang"],
    "24": ["Lim Chu Kang", "Tengah"],
    "25": ["Kranji", "Woodgrove"],
    "26": ["Upper Thomson", "Springleaf"],
    "27": ["Yishun", "Sembawang"],
    "28": ["Seletar"]
}

street_names = [
    "Orchard Road", "North Bridge Road", "South Bridge Road", "Victoria Street",
    "Beach Road", "Serangoon Road", "Havelock Road", "Outram Road",
    "Cecil Street", "Robinson Road", "Maxwell Road", "Telok Ayer Street",
    "Bras Basah Road", "Bugis Street", "Chinatown Street", "Clarke Quay",
    "Collyer Quay", "East Coast Road", "Emerald Hill Road", "Farrer Road",
    "Geylang Road", "Holland Road", "Jalan Besar", "Joo Chiat Road",
    "Jurong East Street", "Kallang Road", "Lavender Street", "MacPherson Road",
    "Marine Parade Road", "Newton Road", "Pasir Panjang Road", "Potong Pasir Avenue",
    "Scotts Road", "Sembawang Road", "Simei Street", "Tampines Avenue",
    "Tanglin Road", "Toa Payoh Central", "Upper Serangoon Road", "Woodlands Avenue"
]

# Singaporean names (expanded)

chinese_first_names = [
    "Wei", "Jie", "Ming", "Li", "Xin", "Hui", "Yan", "Feng", "Jun", "Ling",
    "Chen", "Qian", "Tao", "Shan", "Xiao", "Ying", "Kai", "Zhi", "Rui", "Bo",
    "Guo", "Han", "Lei", "Pei", "Qing", "Shu", "Ting", "Xiang", "Yuan", "Zhen",
    "Hao", "Dong", "Cheng", "An", "Yun", "Lan", "Ke", "Ping", "Yao", "Lu"
]
chinese_last_names = [
    "Tan", "Lim", "Lee", "Ng", "Ong", "Wong", "Chua", "Chan", "Koh", "Teo",
    "Goh", "Loh", "Toh", "Chew", "Seah", "Phua", "Yeo", "Low", "Quek", "Pang",
    "Sim", "Ho", "Peh", "Ang", "Cheong", "Soon", "Neo", "Mak", "Cheng", "Foo"
]

malay_first_names = [
    "Ahmad", "Mohamed", "Siti", "Abdullah", "Fatimah", "Ali", "Rahman", "Zainal", "Nor", "Hassan",
    "Azman", "Roslan", "Fauziah", "Sharifah", "Yusof", "Nordin", "Halijah", "Ismail", "Khalid", "Rahimah",
    "Aminah", "Mustafa", "Latifah", "Rashid", "Saiful", "Najib", "Faridah", "Salim", "Halim", "Karim",
    "Jamilah", "Mazlan", "Nurul", "Zaharah", "Fadzil", "Hanisah", "Rahmat", "Syed", "Shahrul", "Ridwan"
]
malay_last_names = [
    "bin Ahmad", "binti Mohamed", "bin Ismail", "binti Abdullah", "bin Ali", "binti Hassan",
    "bin Omar", "binti Yusof", "bin Ibrahim", "binti Rahman", "bin Salleh", "binti Iskandar",
    "bin Osman", "binti Hamid", "bin Daud", "binti Idris", "bin Mustapha", "binti Latif",
    "bin Ramli", "binti Shahrin", "bin Zulkifli", "binti Jalil", "bin Khalid", "binti Karim"
]

indian_first_names = [
    "Raj", "Kumar", "Suresh", "Priya", "Latha", "Arjun", "Divya", "Vijay", "Anand", "Deepa",
    "Ravi", "Lakshmi", "Manoj", "Aishwarya", "Santosh", "Meena", "Harish", "Shalini", "Naveen", "Geetha",
    "Karthik", "Sanjay", "Pooja", "Sunita", "Balaji", "Anjali", "Mohan", "Indira", "Gopalakrishnan", "Swapna",
    "Ramesh", "Chitra", "Ajay", "Hemant", "Sangeetha", "Vinod", "Parvati", "Ashok", "Radha", "Usha"
]
indian_last_names = [
    "Kumar", "Devi", "Singh", "Rao", "Patel", "Menon", "Pillai", "Sharma", "Subramaniam", "Gopal",
    "Iyer", "Nair", "Das", "Varma", "Chandran", "Naidu", "Prasad", "Chatterjee", "Bose", "Reddy",
    "Shankar", "Krishnan", "Gupta", "Verma", "Kapoor", "Malhotra", "Mehta", "Saxena", "Banerjee", "Mishra"
]

western_first_names = [
    "John", "David", "Mary", "Sarah", "Michael", "Jennifer", "James", "Linda", "Robert", "Elizabeth",
    "William", "Thomas", "Daniel", "Emily", "Matthew", "Christopher", "Ashley", "Jessica", "Andrew", "Michelle",
    "Brian", "Kevin", "Sophia", "Olivia", "Benjamin", "Anthony", "Rebecca", "George", "Richard", "Anna",
    "Charles", "Amanda", "Paul", "Steven", "Laura", "Julie", "Karen", "Henry", "Scott", "Victoria"
]
western_last_names = [
    "Smith", "Johnson", "Williams", "Brown", "Jones", "Miller", "Davis", "Garcia", "Rodriguez", "Wilson",
    "Taylor", "Anderson", "Thomas", "Moore", "Martin", "Jackson", "Thompson", "White", "Harris", "Clark",
    "Lewis", "Walker", "Allen", "King", "Wright", "Scott", "Green", "Baker", "Hall", "Adams",
    "Nelson", "Carter", "Mitchell", "Perez", "Roberts", "Turner", "Phillips", "Campbell", "Evans", "Edwards"
]

# Email domains commonly used in Singapore
email_domains = [
    "gmail.com", "yahoo.com", "hotmail.com", "outlook.com",
    "singnet.com.sg", "starhub.net.sg", "pacific.net.sg",
    "gmail.com.sg", "yahoo.com.sg", "live.com.sg"
]

# Updated product types (removed Home and Marriage loans)
product_types = [
    "Personal loan", "Auto loan", "Credit card",
    "Education loan", "Business loan"
]
interest_rates = {
    "Personal loan": 8.5, "Auto loan": 9.0, "Credit card": 10.0,
    "Education loan": 8.0, "Business loan": 9.5
}
# Loan amounts in SGD (minimum 5000 for all)
loan_amount_limits = {
    "Personal loan": (5000, 100000),
    "Auto loan": (5000, 200000),
    "Credit card": (5000, 50000),
    "Education loan": (5000, 100000),
    "Business loan": (5000, 500000)
}

# Income bands in SGD
income_bands = [
    "50,000 or Below", "50,000 to 100,000", "100,000 to 200,000",
    "200,000 to 300,000", "300,000 to 500,000", "500,000 or Above"
]

# Helper functions
def generate_singaporean_name():
    # Singapore's ethnic distribution: ~74% Chinese, ~13% Malay, ~9% Indian, ~4% others
    ethnicity = np.random.choice(["Chinese", "Malay", "Indian", "Western"],
                                p=[0.74, 0.13, 0.09, 0.04])

    if ethnicity == "Chinese":
        first_name = random.choice(chinese_first_names)
        last_name = random.choice(chinese_last_names)
    elif ethnicity == "Malay":
        first_name = random.choice(malay_first_names)
        last_name = random.choice(malay_last_names)
    elif ethnicity == "Indian":
        first_name = random.choice(indian_first_names)
        last_name = random.choice(indian_last_names)
    else:
        first_name = random.choice(western_first_names)
        last_name = random.choice(western_last_names)

    return f"{first_name} {last_name}"

def generate_singapore_mobile_number():
    # Singapore mobile numbers: +65 8/9XXX XXXX (8 digits after +65)
    # Starts with 8 or 9, followed by 7 random digits
    prefix = random.choice(['8', '9'])
    number = ''.join([str(random.randint(0, 9)) for _ in range(7)])
    return f"+65 {prefix}{number[:3]} {number[3:]}"

def generate_singapore_landline_number():
    # Singapore landline numbers: +65 6XXX XXXX (8 digits after +65)
    # Starts with 6, followed by 7 random digits
    number = '6' + ''.join([str(random.randint(0, 9)) for _ in range(7)])
    return f"+65 {number[:4]} {number[4:]}"

def generate_email(first_name, last_name):
    # Remove spaces and special characters from names for email
    clean_first = first_name.lower().replace(' ', '')
    clean_last = last_name.lower().replace(' ', '')

    # Different email formats
    formats = [
        f"{clean_first}.{clean_last}",
        f"{clean_first}{clean_last}",
        f"{clean_first}_{clean_last}",
        f"{clean_first[0]}{clean_last}",
        f"{clean_first}{clean_last}{random.randint(10, 999)}"
    ]

    username = random.choice(formats)
    domain = random.choice(email_domains)
    return f"{username}@{domain}"

def generate_loan_amount(product_type):
    min_amt, max_amt = loan_amount_limits[product_type]
    # Ensure amount is multiple of appropriate step
    if product_type in ["Auto loan", "Business loan"]:
        step = 5000
    else:
        step = 1000
    amount = np.random.choice(range(min_amt, max_amt + 1, step))
    return round(float(amount), 2)

def calculate_emi(principal, rate, tenure):
    monthly_rate = rate / 1200  # Convert annual percentage to monthly decimal
    emi = (principal * monthly_rate * (1 + monthly_rate) ** tenure) / ((1 + monthly_rate) ** tenure - 1)
    return round(emi, 2)

def adjust_emi_for_delinquency(base_emi, days_past_due):
    if days_past_due <= 0:
        return base_emi
    elif days_past_due <= 15:
        return round(base_emi * 1.02, 2)  # 2% increase
    elif days_past_due <= 30:
        return round(base_emi * 1.05, 2)  # 5% increase
    elif days_past_due <= 60:
        return round(base_emi * 1.08, 2)  # 8% increase
    elif days_past_due <= 90:
        return round(base_emi * 1.12, 2)  # 12% increase
    else:
        return round(base_emi * 1.15, 2)  # 15% increase

def calculate_outstanding_balance(loan_amount, interest_rate, tenure, months_completed, emi_paid, days_past_due):
    if months_completed >= tenure:
        # Loan should be fully paid, but we need to ensure it's not zero
        return round(loan_amount * 0.01, 2)  # Return a small amount (1% of loan amount)

    # Calculate remaining principal using amortization formula
    monthly_rate = interest_rate / 1200
    remaining_periods = tenure - months_completed

    # Calculate the remaining principal
    remaining_principal = emi_paid * ((1 - (1 + monthly_rate) ** -remaining_periods) / monthly_rate)

    # Add penalty interest for days past due
    if days_past_due > 0:
        penalty_rate = 0
        if days_past_due <= 15:
            penalty_rate = 0.02
        elif days_past_due <= 30:
            penalty_rate = 0.05
        elif days_past_due <= 60:
            penalty_rate = 0.08
        elif days_past_due <= 90:
            penalty_rate = 0.12
        else:
            penalty_rate = 0.15

        penalty_interest = remaining_principal * (penalty_rate / 365) * days_past_due
        remaining_principal += penalty_interest

    # Ensure the outstanding balance is at least 1% of the original loan amount
    min_outstanding = loan_amount * 0.01
    return max(min_outstanding, round(remaining_principal, 2))

def generate_customer_id():
    return f"SCB{np.random.randint(100000000, 999999999)}"

def generate_singapore_address():
    district = random.choice(list(singapore_districts.keys()))
    area = random.choice(singapore_districts[district])
    street = random.choice(street_names)
    house_number = random.randint(1, 999)
    unit = f"#{random.randint(1, 50):02d}-{random.randint(1, 99):02d}"
    postal_code = random.randint(100000, 999999)
    return f"{house_number} {street} {unit} {area} Singapore {postal_code} Singapore"

def calculate_day_past_due(last_payment_date, due_date):
    if last_payment_date >= due_date:
        return 0

    # Calculate the difference in days
    days_diff = (due_date - last_payment_date).days

    # If the difference is less than 30 days, no days past due
    if days_diff <= 30:
        return 0

    # If more than 30 days, subtract 30 to get the actual days past due
    return days_diff - 30

def determine_payment_frequency(on_time_percentage):
    return "Regular" if on_time_percentage >= 0.8 else "Irregular"

def calculate_credit_score(payment_history, bounce_history, delinquency, partial_payments):
    base_score = 650
    # Payment history impact (up to ±100 points)
    payment_impact = (payment_history - 0.5) * 200
    # Bounce history impact (up to -50 points)
    bounce_impact = -min(bounce_history * 10, 50)
    # Delinquency impact (up to -100 points)
    delinquency_impact = -50 if delinquency else 0
    # Partial payment impact
    partial_impact = -20 if partial_payments else 0

    score = base_score + payment_impact + bounce_impact + delinquency_impact + partial_impact
    return max(300, min(850, int(score)))

def determine_smartphone_penetration(age, occupation):
    if age > 60:
        return "Low"  # Older people less likely to use smartphones
    elif occupation in ["Student", "Employed", "Self-Employed"]:
        return "High"  # Working people and students more likely to use smartphones
    elif occupation in ["Retired", "Homemaker"]:
        return "Medium"
    else:
        return "Low"

# Generate base data
np.random.seed(42)
data = []

for _ in range(num_records):
    product_type = np.random.choice(product_types)
    tenure = int(np.random.choice([12, 24, 36, 48, 60]))
    loan_amount = generate_loan_amount(product_type)
    interest_rate = interest_rates[product_type]
    base_emi = calculate_emi(loan_amount, interest_rate, tenure)

    # Generate realistic dates
    days_since_loan_start = int(np.random.randint(180, 1800))
    loan_start_date = datetime.now() - timedelta(days=days_since_loan_start)
    months_completed = min(tenure-1, int((datetime.now() - loan_start_date).days / 30))  # Ensure not fully paid

    # Next due date should be in the future
    next_due_date = datetime.now() + timedelta(days=int(np.random.randint(1, 30)))

    # Calculate payment behavior
    on_time_percentage = np.random.uniform(0.5, 1.0)
    payment_frequency = determine_payment_frequency(on_time_percentage)

    # Determine last payment date based on payment behavior
    if np.random.random() < on_time_percentage:
        # On-time payment: paid before or on the due date of the previous installment
        prev_due_date = next_due_date - timedelta(days=30)
        # Last payment date is before or on the previous due date
        days_before = int(np.random.randint(0, 30))
        last_payment_date = prev_due_date - timedelta(days=days_before)
    else:
        # Late payment: paid after the due date
        days_late = int(np.random.randint(1, 120))  # Up to 120 days late
        prev_due_date = next_due_date - timedelta(days=30)
        last_payment_date = prev_due_date + timedelta(days=days_late)

    # Ensure last payment date is in the past
    if last_payment_date > datetime.now():
        last_payment_date = datetime.now() - timedelta(days=int(np.random.randint(1, 30)))

    # Calculate days past due based on the new logic
    day_past_due = calculate_day_past_due(last_payment_date, next_due_date)

    # Adjust EMI based on days past due
    current_emi = adjust_emi_for_delinquency(base_emi, day_past_due)

    # Calculate outstanding balance
    amount_paid_each_month = base_emi  # Actual EMI paid
    outstanding_balance = calculate_outstanding_balance(
        loan_amount, interest_rate, tenure, months_completed,
        amount_paid_each_month, day_past_due
    )

    # Bounce history based on payment behavior
    bounce_probability = 0.3 if on_time_percentage < 0.7 else 0.1
    bounce_history = int(np.random.poisson(bounce_probability * months_completed / 12))

    # Partial payment indicator
    partial_payment = np.random.choice([True, False], p=[0.2, 0.8])

    # Credit score calculation
    delinquency = np.random.choice([True, False], p=[0.2, 0.8])
    credit_score = calculate_credit_score(on_time_percentage, bounce_history, delinquency, partial_payment)

    # Singapore economic indicators (approximate 2023 values)
    unemployment_rate = round(np.random.uniform(1.8, 2.2), 2)
    inflation_rate = round(np.random.uniform(4.0, 5.5), 2)
    interest_trend = round(np.random.uniform(-0.2, 0.5), 2)
    economic_stress = round(np.random.uniform(0.1, 0.4), 2)

    # Contact history metrics
    call_attempts = int(np.random.poisson(3))
    sms_attempts = int(np.random.poisson(5))
    whatsapp_attempts = int(np.random.poisson(2))
    email_attempts = int(np.random.poisson(1))

    # Calculate total contact attempts
    no_of_attempts = call_attempts + sms_attempts + whatsapp_attempts + email_attempts

    # Generate customer details
    age = int(np.random.randint(25, 70))
    occupation = np.random.choice(["Employed", "Self-Employed", "Student", "Retired", "Unemployed", "Homemaker"])
    smartphone_penetration = determine_smartphone_penetration(age, occupation)

    # Gender with Others being less common
    gender = np.random.choice(["Male", "Female", "Others"], p=[0.48, 0.48, 0.04])

    # Generate name and contact information
    full_name = generate_singaporean_name()
    first_name, last_name = full_name.split(' ', 1) if ' ' in full_name else (full_name, '')

    # Generate contact details
    primary_phone = generate_singapore_mobile_number()
    # Secondary mobile number - 40% chance of having one
    secondary_phone = generate_singapore_mobile_number()
    landline_phone = generate_singapore_landline_number()
    email_id = generate_email(first_name, last_name)

    record = {
        "Customer_id": generate_customer_id(),
        "Loan_Account_id": int(np.random.randint(10000000, 99999999)),
        "Product_Type": product_type,
        "Loan_Amount_SGD": loan_amount,
        "Outstanding_Balance_SGD": outstanding_balance,
        "Day_Past_Due": day_past_due,
        "Tenure": tenure,
        "Interest_Rate": interest_rate,
        "Current_EMI_SGD": current_emi,
        "Installment_Due_Date": next_due_date.date(),
        "Last_Payment_Date": last_payment_date.date(),
        "Partial_Payment_Indicator": partial_payment,
        "Number_of_Past_Payments": months_completed,
        "Payment_Frequency": payment_frequency,
        "Amount_Paid_Each_Month_SGD": amount_paid_each_month,
        "Bounce_History": bounce_history,
        "Settlement_History": np.random.choice(["Settled", "Partial Settlement", "Not Settled", "Under Negotiation"]),
        "Repayment_Irregularity_Flags": on_time_percentage < 0.7,
        "Contact_History_Call_Attempts": call_attempts,
        "Contact_History_SMS": sms_attempts,
        "Contact_History_WhatsApp": whatsapp_attempts,
        "Contact_History_EmailLogs": email_attempts,
        "Channel_used": np.random.choice(["Call", "SMS", "WhatsApp", "Email", "Field Agent", "IVR"]),
        "Response_Outcome": np.random.choice(["Connected", "Promised to pay", "Ignored", "Disconnected", "Paid fully", "Partial paid"]),
        "No_of_Attempts": no_of_attempts,
        "Average_Handling_Time": round(np.random.uniform(10, 30), 2),  # Between 10-30 seconds
        "Name": full_name,
        "Age": age,
        "Gender": gender,
        "Primary_Phone_Number": primary_phone,
        "Secondary_Mobile_Number": secondary_phone,  # New column added here
        "Landline_Phone_Number": landline_phone,
        "Email_ID": email_id,
        "Occupation": occupation,
        "Income_Band_SGD": np.random.choice(income_bands),
        "Employeement_Type": np.random.choice(["Full time", "Part time", "Contract", "Freelance", "Unemployed"]),
        "Address": generate_singapore_address(),
        "Urban_Rural_Tag": "Urban",
        "Language_Preference": np.random.choice(["English", "Regional"], p=[0.7, 0.3]),
        "Mobile_Number_Active_Status": np.random.choice([True, False], p=[0.9, 0.1]),
        "Email_Activity": np.random.choice([True, False], p=[0.8, 0.2]),
        "App_Login_Frequency": int(np.random.poisson(10)),
        "UPI_Transactions": int(np.random.poisson(15)),
        "Online_Banking_Activity": int(np.random.poisson(8)),
        "Smartphone_Penetration": smartphone_penetration,
        "Preferred_Channel": np.random.choice(["Call", "SMS", "WhatsApp", "Email", "App notification", "Field Agent", "IVR"]),
        "Call_SMS_Activity_Patterns": np.random.choice(["Low", "Medium", "High"]),
        "WhatsApp_OTT_usage_Indicator": np.random.choice([True, False], p=[0.8, 0.2]),
        "Credit_Score": credit_score,
        "Recent_Inquiries": int(np.random.poisson(2)),
        "Loan_Exposure_Across_Banks": int(np.random.poisson(1)),
        "Delinquency_on_other_Loans": delinquency,
        "Recent_Score_Change": int(np.random.randint(-50, 50)),
        "Unemployeement_rate_region": unemployment_rate,
        "Inflation_Rate": inflation_rate,
        "Interest_Rate_Trend": interest_trend,
        "Economic_Stress_Index": economic_stress,
        "Do_Not_Call_Registry_Data": np.random.choice([True, False], p=[0.2, 0.8]),
        "Regional_Time_Restrictions": np.random.choice(["Morning", "Afternoon", "Evening", "Night"]),
        "Communication_Complaince_Limits": np.random.choice(["Daytime", "Evening", "Weekdays", "Weekends", "Holidays"])
    }
    data.append(record)

# Create DataFrame
df = pd.DataFrame(data)

# Validate constraints
assert all(5000 <= amt <= 500000 for amt in df.Loan_Amount_SGD)
assert all(ob > 0 for ob in df.Outstanding_Balance_SGD)
assert all(tenure in [12,24,36,48,60] for tenure in df.Tenure)
assert all(8.0 <= rate <= 10.0 for rate in df.Interest_Rate)
assert all(10 <= aht <= 30 for aht in df.Average_Handling_Time)

# Format float columns to 2 decimal places
float_columns = df.select_dtypes(include=['float64']).columns
df[float_columns] = df[float_columns].round(2)

# Save to CSV
df.to_csv('singapore_loan_data.csv', index=False)

print(f"Generated {len(df)} records")
print("Data saved to singapore_loan_data.csv")
print(df[['Name', 'Primary_Phone_Number', 'Secondary_Mobile_Number', 'Landline_Phone_Number', 'Email_ID', 'Product_Type', 'Loan_Amount_SGD']].head(10))

Generated 1000 records
Data saved to singapore_loan_data.csv
               Name Primary_Phone_Number Secondary_Mobile_Number  \
0           Yan Mak        +65 8841 5097           +65 9009 8075   
1         Wei Cheng        +65 8638 8963           +65 9908 8371   
2         Dong Soon        +65 9864 7950           +65 8342 5666   
3          Hao Phua        +65 8492 9022           +65 8053 8734   
4           Bo Soon        +65 8853 7079           +65 8604 9455   
5        Arjun Nair        +65 8468 7780           +65 8295 6506   
6          Xin Phua        +65 9035 6542           +65 8934 3877   
7  Ashok Chatterjee        +65 8825 7647           +65 9588 3623   
8          Shu Seah        +65 8311 7873           +65 8827 3637   
9         Ling Soon        +65 8261 1532           +65 9867 2985   

  Landline_Phone_Number                          Email_ID    Product_Type  \
0         +65 6112 4297            yanmak802@gmail.com.sg  Education loan   
1         +65 6465 4057             

In [4]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random
from collections import defaultdict

# Configuration
num_records = 1000

# Singapore-specific data (same as before)
singapore_districts = {
    "01": ["Raffles Place", "Marina", "People's Park"],
    "02": ["Anson", "Tanjong Pagar"],
    "03": ["Queenstown", "Tiong Bahru"],
    "04": ["Telok Blangah", "HarbourFront"],
    "05": ["Pasir Panjang", "Hong Leong Garden", "Clementi New Town"],
    "06": ["High Street", "Beach Road"],
    "07": ["Middle Road", "Golden Mile"],
    "08": ["Little India", "Farrer Park"],
    "09": ["Orchard", "Cairnhill", "River Valley"],
    "10": ["Ardmore", "Bukit Timah", "Holland Road"],
    "11": ["Watten Estate", "Novena", "Thomson"],
    "12": ["Balestier", "Toa Payoh", "Serangoon"],
    "13": ["Macpherson", "Braddell"],
    "14": ["Geylang", "Eunos"],
    "15": ["Katong", "Joo Chiat", "Amber Road"],
    "16": ["Bedok", "Upper East Coast", "Eastwood", "Kew Drive"],
    "17": ["Loyang", "Changi"],
    "18": ["Tampines", "Pasir Ris"],
    "19": ["Serangoon Garden", "Hougang", "Punggol"],
    "20": ["Bishan", "Ang Mo Kio"],
    "21": ["Upper Bukit Timah", "Clementi Park", "Ulu Pandan"],
    "22": ["Jurong"],
    "23": ["Hillview", "Dairy Farm", "Bukit Panjang", "Choa Chu Kang"],
    "24": ["Lim Chu Kang", "Tengah"],
    "25": ["Kranji", "Woodgrove"],
    "26": ["Upper Thomson", "Springleaf"],
    "27": ["Yishun", "Sembawang"],
    "28": ["Seletar"]
}

street_names = [
    "Orchard Road", "North Bridge Road", "South Bridge Road", "Victoria Street",
    "Beach Road", "Serangoon Road", "Havelock Road", "Outram Road",
    "Cecil Street", "Robinson Road", "Maxwell Road", "Telok Ayer Street",
    "Bras Basah Road", "Bugis Street", "Chinatown Street", "Clarke Quay",
    "Collyer Quay", "East Coast Road", "Emerald Hill Road", "Farrer Road",
    "Geylang Road", "Holland Road", "Jalan Besar", "Joo Chiat Road",
    "Jurong East Street", "Kallang Road", "Lavender Street", "MacPherson Road",
    "Marine Parade Road", "Newton Road", "Pasir Panjang Road", "Potong Pasir Avenue",
    "Scotts Road", "Sembawang Road", "Simei Street", "Tampines Avenue",
    "Tanglin Road", "Toa Payoh Central", "Upper Serangoon Road", "Woodlands Avenue"
]

# Singaporean names (same as before)
chinese_first_names = ["Wei", "Jie", "Ming", "Li", "Xin", "Hui", "Yan", "Feng", "Jun", "Ling"]
chinese_last_names = ["Tan", "Lim", "Lee", "Ng", "Ong", "Wong", "Chua", "Chan", "Koh", "Teo"]
malay_first_names = ["Ahmad", "Mohamed", "Siti", "Abdullah", "Fatimah", "Ali", "Rahman", "Zainal", "Nor", "Hassan"]
malay_last_names = ["bin Ahmad", "binti Mohamed", "bin Ismail", "binti Abdullah", "bin Ali", "binti Hassan"]
indian_first_names = ["Raj", "Kumar", "Suresh", "Priya", "Latha", "Arjun", "Divya", "Vijay", "Anand", "Deepa"]
indian_last_names = ["Kumar", "Devi", "Singh", "Rao", "Patel", "Menon", "Pillai", "Sharma", "Subramaniam", "Gopal"]
western_first_names = ["John", "David", "Mary", "Sarah", "Michael", "Jennifer", "James", "Linda", "Robert", "Elizabeth"]
western_last_names = ["Smith", "Johnson", "Williams", "Brown", "Jones", "Miller", "Davis", "Garcia", "Rodriguez", "Wilson"]

email_domains = ["gmail.com", "yahoo.com", "hotmail.com", "outlook.com", "singnet.com.sg"]

product_types = ["Personal loan", "Auto loan", "Credit card", "Education loan", "Business loan"]
interest_rates = {"Personal loan": 8.5, "Auto loan": 9.0, "Credit card": 10.0, "Education loan": 8.0, "Business loan": 9.5}
loan_amount_limits = {
    "Personal loan": (5000, 100000), "Auto loan": (5000, 200000), "Credit card": (5000, 50000),
    "Education loan": (5000, 100000), "Business loan": (5000, 500000)
}

# Enhanced income bands with more realistic distributions
income_bands = {
    "50,000 or Below": (25000, 50000),
    "50,000 to 100,000": (50000, 100000),
    "100,000 to 200,000": (100000, 200000),
    "200,000 to 300,000": (200000, 300000),
    "300,000 to 500,000": (300000, 500000),
    "500,000 or Above": (500000, 1500000)  # Wider range for high earners
}

singapore_cities = [
    "Singapore City", "Jurong East", "Tampines", "Woodlands", "Bedok", "Sengkang",
    "Hougang", "Yishun", "Ang Mo Kio", "Bukit Merah", "Bukit Batok", "Pasir Ris",
    "Clementi", "Bishan", "Toa Payoh", "Serangoon", "Queenstown", "Punggol", "Kallang", "Bukit Panjang"
]

agent_ids = [f"SCB_AG_{i:04d}" for i in range(1, 51)]

# Customer lifestyle profiles for unique spending patterns
lifestyle_profiles = {
    "Young Professional": {"age_range": (22, 35), "spending_focus": ["entertainment", "shopping", "travel"]},
    "Family Focused": {"age_range": (30, 50), "spending_focus": ["education", "shopping", "health"]},
    "Established Career": {"age_range": (40, 60), "spending_focus": ["travel", "health", "entertainment"]},
    "Retirement Phase": {"age_range": (60, 75), "spending_focus": ["health", "utility", "shopping"]},
    "Student Lifestyle": {"age_range": (18, 25), "spending_focus": ["education", "entertainment", "shopping"]},
    "High Net Worth": {"age_range": (35, 65), "spending_focus": ["travel", "entertainment", "shopping"]}
}

# Helper functions
def generate_singaporean_name():
    ethnicity = np.random.choice(["Chinese", "Malay", "Indian", "Western"], p=[0.74, 0.13, 0.09, 0.04])
    if ethnicity == "Chinese":
        return f"{random.choice(chinese_first_names)} {random.choice(chinese_last_names)}"
    elif ethnicity == "Malay":
        return f"{random.choice(malay_first_names)} {random.choice(malay_last_names)}"
    elif ethnicity == "Indian":
        return f"{random.choice(indian_first_names)} {random.choice(indian_last_names)}"
    else:
        return f"{random.choice(western_first_names)} {random.choice(western_last_names)}"

def generate_singapore_mobile_number(valid=True):
    prefix = random.choice(['8', '9'])
    if valid:
        number = ''.join([str(random.randint(0, 9)) for _ in range(7)])
        return f"+65 {prefix}{number[:3]} {number[3:]}"
    else:
        invalid_types = [
            f"+65 {prefix}{''.join([str(random.randint(0, 9)) for _ in range(6)])}",
            f"+65 {random.choice(['6', '7'])}{''.join([str(random.randint(0, 9)) for _ in range(7)])}",
            f"+65 {prefix}{''.join([str(random.randint(0, 9)) for _ in range(8)])}",
            f"65 {prefix}{''.join([str(random.randint(0, 9)) for _ in range(3)])} {''.join([str(random.randint(0, 9)) for _ in range(4)])}",
        ]
        return random.choice(invalid_types)

def generate_singapore_landline_number():
    number = '6' + ''.join([str(random.randint(0, 9)) for _ in range(7)])
    return f"+65 {number[:4]} {number[4:]}"

def generate_email(first_name, last_name):
    clean_first = first_name.lower().replace(' ', '')
    clean_last = last_name.lower().replace(' ', '')
    formats = [f"{clean_first}.{clean_last}", f"{clean_first}{clean_last}", f"{clean_first}_{clean_last}"]
    username = random.choice(formats)
    domain = random.choice(email_domains)
    return f"{username}@{domain}"

def generate_loan_amount(product_type):
    min_amt, max_amt = loan_amount_limits[product_type]
    step = 5000 if product_type in ["Auto loan", "Business loan"] else 1000
    amount = np.random.choice(range(min_amt, max_amt + 1, step))
    return round(float(amount), 2)

def calculate_emi(principal, rate, tenure):
    monthly_rate = rate / 1200
    emi = (principal * monthly_rate * (1 + monthly_rate) ** tenure) / ((1 + monthly_rate) ** tenure - 1)
    return round(emi, 2)

def adjust_emi_for_delinquency(base_emi, days_past_due):
    if days_past_due <= 0: return base_emi
    elif days_past_due <= 15: return round(base_emi * 1.02, 2)
    elif days_past_due <= 30: return round(base_emi * 1.05, 2)
    elif days_past_due <= 60: return round(base_emi * 1.08, 2)
    elif days_past_due <= 90: return round(base_emi * 1.12, 2)
    else: return round(base_emi * 1.15, 2)

def calculate_outstanding_balance(loan_amount, interest_rate, tenure, months_completed, emi_paid, days_past_due):
    if months_completed >= tenure:
        return round(loan_amount * 0.01, 2)
    monthly_rate = interest_rate / 1200
    remaining_periods = tenure - months_completed
    remaining_principal = emi_paid * ((1 - (1 + monthly_rate) ** -remaining_periods) / monthly_rate)
    if days_past_due > 0:
        penalty_rate = 0.02 if days_past_due <= 15 else 0.05 if days_past_due <= 30 else 0.08 if days_past_due <= 60 else 0.12 if days_past_due <= 90 else 0.15
        penalty_interest = remaining_principal * (penalty_rate / 365) * days_past_due
        remaining_principal += penalty_interest
    min_outstanding = loan_amount * 0.01
    return max(min_outstanding, round(remaining_principal, 2))

def generate_customer_id():
    return f"SCB{np.random.randint(100000000, 999999999)}"

def generate_singapore_address():
    district = random.choice(list(singapore_districts.keys()))
    area = random.choice(singapore_districts[district])
    street = random.choice(street_names)
    house_number = random.randint(1, 999)
    unit = f"#{random.randint(1, 50):02d}-{random.randint(1, 99):02d}"
    postal_code = random.randint(100000, 999999)
    return f"{house_number} {street} {unit} {area} Singapore {postal_code} Singapore"

def calculate_day_past_due(last_payment_date, due_date):
    if last_payment_date >= due_date: return 0
    days_diff = (due_date - last_payment_date).days
    return max(0, days_diff - 30)

def determine_payment_frequency(on_time_percentage):
    return "Regular" if on_time_percentage >= 0.8 else "Irregular"

def calculate_credit_score(payment_history, missed_payments, delinquency, partial_payments):
    base_score = 650
    payment_impact = (payment_history - 0.5) * 200
    missed_payment_impact = -min(missed_payments * 15, 100)
    delinquency_impact = -50 if delinquency else 0
    partial_impact = -20 if partial_payments else 0
    score = base_score + payment_impact + missed_payment_impact + delinquency_impact + partial_impact
    return max(300, min(850, int(score)))

def determine_lifestyle_profile(age, income_band, occupation):
    """Determine customer lifestyle profile for unique spending patterns"""
    if age >= 60:
        return "Retirement Phase"
    elif age <= 25 and occupation == "Student":
        return "Student Lifestyle"
    elif income_band in ["500,000 or Above", "300,000 to 500,000"]:
        return "High Net Worth"
    elif 30 <= age <= 50 and occupation in ["Employed", "Self-Employed"]:
        return "Family Focused"
    elif 40 <= age <= 60:
        return "Established Career"
    else:
        return "Young Professional"

def calculate_monthly_income(income_band):
    """Calculate realistic monthly income with some variation within bands"""
    min_income, max_income = income_bands[income_band]

    # Create more realistic distribution within bands
    if income_band == "50,000 or Below":
        # Skew toward lower end
        annual_income = min_income + (max_income - min_income) * random.betavariate(2, 5)
    elif income_band == "500,000 or Above":
        # Skew toward higher end
        annual_income = min_income + (max_income - min_income) * random.betavariate(5, 2)
    else:
        # More normal distribution for middle bands
        annual_income = random.normalvariate(
            (min_income + max_income) / 2,
            (max_income - min_income) / 6
        )

    annual_income = max(min_income, min(max_income, annual_income))
    monthly_income = annual_income / 12

    return round(monthly_income, 2)

def generate_communication_data(age, employment_status, monthly_income):
    # More realistic communication patterns
    if employment_status == "Unemployed":
        communication_completion = random.uniform(0.6, 0.85)
        mobile_change_base = random.randint(1, 4)
    else:
        communication_completion = random.uniform(0.85, 0.98)
        mobile_change_base = random.randint(0, 2)

    # Age affects communication stability
    age_factor = max(0, (35 - age) / 15)  # Younger = more changes
    mobile_change_count = max(0, mobile_change_base + int(age_factor * 3))
    mobile_change_this_year = min(mobile_change_count, random.randint(0, 2))

    # Generate phone numbers
    primary_valid = random.random() > 0.08  # 92% valid primary
    secondary_valid = random.random() > 0.25  # 75% valid secondary

    primary_phone = generate_singapore_mobile_number(primary_valid)
    has_secondary = random.random() > 0.15  # 85% have secondary
    secondary_phone = generate_singapore_mobile_number(secondary_valid) if has_secondary else ""

    # Count valid numbers
    valid_numbers = sum([primary_valid, secondary_valid]) if has_secondary else primary_valid
    invalid_numbers = (2 - valid_numbers) if has_secondary else (1 - valid_numbers)

    # Address changes (related to employment and age stability)
    if employment_status == "Unemployed" or age < 30:
        address_change_base = random.randint(0, 3)
    else:
        address_change_base = random.randint(0, 1)

    address_change_count = address_change_base
    address_change_this_year = min(address_change_count, random.randint(0, 1))

    # Contact change frequency
    customer_age_years = max(5, age - 18)  # Years since adulthood
    total_changes = mobile_change_count + address_change_count
    change_frequency = total_changes / customer_age_years if customer_age_years > 0 else 0

    return {
        "communication_completion": round(communication_completion, 2),
        "primary_phone": primary_phone,
        "secondary_phone": secondary_phone,
        "valid_phone_number": primary_valid,
        "no_of_valid_numbers": valid_numbers,
        "no_of_invalid_numbers": invalid_numbers,
        "mobile_change_count": mobile_change_count,
        "mobile_change_this_year": mobile_change_this_year,
        "address_change_count": address_change_count,
        "address_change_this_year": address_change_this_year,
        "contact_change_frequency": round(change_frequency, 2)
    }

def generate_unique_spend_profile(lifestyle, monthly_income, employment_status, age):
    """Generate truly unique spending patterns for each customer"""

    # Base spending ratios by lifestyle with individual variation
    base_profiles = {
        "Young Professional": {"utility": 0.10, "shopping": 0.28, "entertainment": 0.20,
                              "health": 0.08, "education": 0.12, "travel": 0.22},
        "Family Focused": {"utility": 0.15, "shopping": 0.30, "entertainment": 0.10,
                          "health": 0.12, "education": 0.18, "travel": 0.15},
        "Established Career": {"utility": 0.12, "shopping": 0.25, "entertainment": 0.15,
                              "health": 0.15, "education": 0.08, "travel": 0.25},
        "Retirement Phase": {"utility": 0.18, "shopping": 0.22, "entertainment": 0.12,
                            "health": 0.25, "education": 0.05, "travel": 0.18},
        "Student Lifestyle": {"utility": 0.12, "shopping": 0.25, "entertainment": 0.25,
                             "health": 0.08, "education": 0.20, "travel": 0.10},
        "High Net Worth": {"utility": 0.08, "shopping": 0.30, "entertainment": 0.22,
                          "health": 0.12, "education": 0.08, "travel": 0.20}
    }

    # Get base profile
    base_ratios = base_profiles[lifestyle].copy()

    # Add significant individual variation (±30%)
    for category in base_ratios:
        variation = random.uniform(0.7, 1.3)
        base_ratios[category] *= variation

    # Normalize to ensure total = 1
    total = sum(base_ratios.values())
    for category in base_ratios:
        base_ratios[category] /= total

    # Determine total monthly spend with realistic variations
    if employment_status == "Unemployed":
        # Unemployed may spend savings or have irregular income
        spend_ratio = random.uniform(0.7, 1.3)  # Wide range for anomalies
    elif lifestyle == "High Net Worth":
        spend_ratio = random.uniform(0.3, 0.6)  # High earners save more
    elif lifestyle == "Student Lifestyle":
        spend_ratio = random.uniform(0.8, 1.1)  # Students often spend most income
    else:
        spend_ratio = random.uniform(0.5, 0.8)  # Normal range

    # Create some spending anomalies (5% of cases)
    if random.random() < 0.05:
        # Extreme savers or extreme spenders
        spend_ratio = random.choice([random.uniform(0.2, 0.4), random.uniform(1.2, 1.8)])

    total_monthly_spend = monthly_income * spend_ratio

    # Calculate category spends
    spends = {}
    for category, ratio in base_ratios.items():
        base_spend = total_monthly_spend * ratio
        # Add some monthly randomness (±15%)
        monthly_variation = random.uniform(0.85, 1.15)
        spends[category] = max(50, base_spend * monthly_variation)  # Minimum $50 per category

    # Recalculate total to account for variations and minimums
    actual_total_spend = sum(spends.values())

    # Other spend metrics with individual variations
    monthly_trend = random.normalvariate(total_monthly_spend, total_monthly_spend * 0.12)
    seasonal_variation = random.uniform(0.08, 0.35)
    weekend_ratio = random.uniform(0.22, 0.48)
    festive_multiplier = random.uniform(1.15, 2.0)
    festive_spend = total_monthly_spend * festive_multiplier

    return {
        "utility_spend": round(spends["utility"], 2),
        "shopping_spend": round(spends["shopping"], 2),
        "entertainment_spend": round(spends["entertainment"], 2),
        "health_spend": round(spends["health"], 2),
        "education_spend": round(spends["education"], 2),
        "travel_spend": round(spends["travel"], 2),
        "monthly_spend_trend": round(monthly_trend, 2),
        "seasonal_spend_variation": round(seasonal_variation, 2),
        "weekend_spend_ratio": round(weekend_ratio, 2),
        "festive_spend": round(festive_spend, 2),
        "total_monthly_spend": round(actual_total_spend, 2),
        "spend_ratio": round(spend_ratio, 2)
    }

def generate_payment_behavior(monthly_income, total_spend, age, employment_status, lifestyle):
    """Generate unique payment behavior patterns"""

    # Base transaction intensity
    if lifestyle == "High Net Worth":
        base_intensity = total_spend / 200  # Larger transactions
    else:
        base_intensity = total_spend / 80   # More frequent, smaller transactions

    # Age and lifestyle affect payment preferences
    if age < 35:
        upi_factor = 1.4
        credit_factor = 0.8
        cash_factor = 0.7
    elif age > 55:
        upi_factor = 0.6
        credit_factor = 1.1
        cash_factor = 1.3
    else:
        upi_factor = 1.0
        credit_factor = 1.0
        cash_factor = 1.0

    # Generate transaction counts with individual variation
    upi_count = max(5, int(np.random.normal(base_intensity * 0.4 * upi_factor, 8)))
    debit_count = max(3, int(np.random.normal(base_intensity * 0.3, 5)))
    credit_count = max(2, int(np.random.normal(base_intensity * 0.2 * credit_factor, 4)))
    cash_count = max(1, int(np.random.normal(base_intensity * 0.1 * cash_factor, 3)))

    # Recurring transactions based on lifestyle
    if lifestyle in ["Family Focused", "Established Career"]:
        recurring_base = random.randint(6, 12)
    else:
        recurring_base = random.randint(3, 8)

    recurring_count = recurring_base

    # Determine preferred channel
    channel_weights = [upi_count, debit_count, credit_count, cash_count]
    channels = ["UPI", "Debit Card", "Credit Card", "Cash"]
    preferred_channel = channels[np.argmax(channel_weights)]

    # Recurring payment ratio
    total_transactions = upi_count + debit_count + credit_count + cash_count
    recurring_ratio = recurring_count / total_transactions if total_transactions > 0 else 0.1

    # Savings behavior with individual variations
    if employment_status == "Unemployed":
        savings_ratio = random.uniform(-0.25, 0.05)  # May be using savings
    elif lifestyle == "High Net Worth":
        savings_ratio = random.uniform(0.25, 0.45)   # High savings rate
    elif lifestyle == "Student Lifestyle":
        savings_ratio = random.uniform(-0.1, 0.1)    # Minimal savings
    else:
        savings_ratio = random.uniform(0.05, 0.25)   # Normal range

    # Create some savings anomalies (3% of cases)
    if random.random() < 0.03:
        savings_ratio = random.choice([random.uniform(-0.4, -0.2), random.uniform(0.5, 0.7)])

    # Spend growth rate
    if employment_status == "Unemployed":
        spend_growth = random.normalvariate(-0.08, 0.15)
    else:
        spend_growth = random.normalvariate(0.04, 0.12)

    # High value transactions
    high_value_threshold = monthly_income * 0.07
    high_value_count = max(0, int(np.random.poisson(total_spend / high_value_threshold * 0.08)))

    return {
        "upi_count": upi_count,
        "debit_count": debit_count,
        "credit_count": credit_count,
        "cash_count": cash_count,
        "recurring_count": recurring_count,
        "preferred_channel": preferred_channel,
        "recurring_ratio": round(recurring_ratio, 2),
        "savings_ratio": round(savings_ratio, 2),
        "spend_growth": round(spend_growth, 2),
        "high_value_count": high_value_count
    }

def calculate_financial_health(monthly_income, total_spend, savings_ratio, payment_history, missed_payments, employment_status):
    """Calculate comprehensive financial health with realistic variations"""

    # Financial Stress Score with multiple factors
    spend_income_ratio = total_spend / monthly_income if monthly_income > 0 else 2.0

    # Different weighting based on employment
    if employment_status == "Unemployed":
        weights = {"spend_ratio": 0.5, "savings": 0.3, "payments": 0.2}
    else:
        weights = {"spend_ratio": 0.4, "savings": 0.3, "payments": 0.3}

    savings_factor = 1 - max(0, savings_ratio)  # Negative savings increase stress
    payment_factor = 1 - payment_history
    missed_payment_factor = min(1, missed_payments * 0.25)

    base_stress = (
        min(2.0, spend_income_ratio) * weights["spend_ratio"] +
        savings_factor * weights["savings"] +
        (payment_factor * 0.7 + missed_payment_factor * 0.3) * weights["payments"]
    )

    financial_stress_score = min(100, max(0, base_stress * 100))

    # Financial Stress Status with realistic thresholds
    if financial_stress_score >= 75:
        stress_status = "Extreme High stress"
    elif financial_stress_score >= 55:
        stress_status = "High stress"
    elif financial_stress_score >= 35:
        stress_status = "Medium stress"
    else:
        stress_status = "Low stress"

    # Financial Health Status
    if financial_stress_score <= 25:
        health_status = "Healthy"
    elif financial_stress_score <= 50:
        health_status = "Moderate"
    else:
        health_status = "Stressed"

    # Balance trends
    if savings_ratio > 0.2:
        balance_trend = "Rising"
    elif savings_ratio < -0.1:
        balance_trend = "Falling"
    else:
        balance_trend = "Stable"

    # Overdraft flag
    overdraft_flag = (savings_ratio < -0.15 or spend_income_ratio > 1.3)

    # AAR score with some anomalies
    aar_score = random.normalvariate(45, 25)
    if random.random() < 0.05:  # 5% high-risk anomalies
        aar_score = random.uniform(75, 95)
    elif random.random() < 0.08:  # 8% low-risk anomalies
        aar_score = random.uniform(5, 25)

    aar_score = max(0, min(100, aar_score))

    if aar_score <= 25:
        aar_risk = "Low"
    elif aar_score <= 70:
        aar_risk = "Medium"
    else:
        aar_risk = "High"

    # Flight Risk - more sophisticated calculation
    financial_factors = financial_stress_score / 100 * 0.4
    behavioral_factors = (1 - payment_history) * 0.3
    stability_factors = min(1, missed_payments * 0.1) * 0.2
    engagement_factors = (1 - min(1, payment_history)) * 0.1

    flight_risk = min(1.0, financial_factors + behavioral_factors + stability_factors + engagement_factors)

    # Add some flight risk anomalies
    if random.random() < 0.04:  # 4% have unexpectedly high/low flight risk
        flight_risk = random.choice([random.uniform(0.8, 0.95), random.uniform(0.05, 0.15)])

    return {
        "finance_stress_status": stress_status,
        "financial_health_status": health_status,
        "financial_stress_score": round(financial_stress_score, 2),
        "flight_risk": round(flight_risk, 2),
        "avg_balance_trend": balance_trend,
        "overdraft_flag": overdraft_flag,
        "aar_score": round(aar_score, 2),
        "aar_risk_level": aar_risk
    }

def generate_contact_success_data(call_attempts, payment_history, employment_status):
    """Generate contact success data based on customer behavior"""

    # Base success rate based on payment history and employment
    if payment_history > 0.8 and employment_status == "Employed":
        base_success_rate = random.uniform(0.5, 0.8)
    elif payment_history > 0.6:
        base_success_rate = random.uniform(0.3, 0.6)
    else:
        base_success_rate = random.uniform(0.1, 0.4)

    # Add individual variation
    success_rate = max(0.05, min(0.95, random.normalvariate(base_success_rate, 0.15)))
    successful_contacts = int(call_attempts * success_rate)

    # Agent interactions
    available_agents = random.sample(agent_ids, random.randint(2, 5))
    last_successful_agent = random.choice(available_agents) if successful_contacts > 0 else ""

    # Best agents
    num_best_agents = min(3, len(available_agents))
    best_agents = random.sample(available_agents, num_best_agents)

    # Call durations based on success rate
    avg_call_duration_base = 3 + (success_rate * 10)  # 3-13 minutes range
    avg_times = [round(random.normalvariate(avg_call_duration_base, 2), 2) for _ in best_agents]

    best_agent_interaction_count = random.randint(1, successful_contacts) if successful_contacts > 0 else 0

    return {
        "successful_contacts": successful_contacts,
        "contact_success_rate": round(success_rate, 2),
        "last_successful_agent": last_successful_agent,
        "best_agents": ",".join(best_agents),
        "avg_times_best_agents": ",".join(map(str, avg_times)),
        "best_agent_interaction_count": best_agent_interaction_count
    }

# Track unique spending patterns to ensure diversity
spending_patterns_tracker = defaultdict(int)

# Generate base data
np.random.seed(42)
random.seed(42)
data = []

for i in range(num_records):
    # Basic loan and customer data
    product_type = np.random.choice(product_types)
    tenure = int(np.random.choice([12, 24, 36, 48, 60]))
    loan_amount = generate_loan_amount(product_type)
    interest_rate = interest_rates[product_type]
    base_emi = calculate_emi(loan_amount, interest_rate, tenure)

    days_since_loan_start = int(np.random.randint(180, 1800))
    loan_start_date = datetime.now() - timedelta(days=days_since_loan_start)
    months_completed = min(tenure-1, int((datetime.now() - loan_start_date).days / 30))
    next_due_date = datetime.now() + timedelta(days=int(np.random.randint(1, 30)))

    # Payment behavior with individual variation
    on_time_percentage = random.betavariate(3, 2)  # Skewed toward better payment history
    payment_frequency = determine_payment_frequency(on_time_percentage)

    if random.random() < on_time_percentage:
        prev_due_date = next_due_date - timedelta(days=30)
        days_before = int(np.random.randint(0, 30))
        last_payment_date = prev_due_date - timedelta(days=days_before)
    else:
        days_late = int(np.random.randint(1, 120))
        prev_due_date = next_due_date - timedelta(days=30)
        last_payment_date = prev_due_date + timedelta(days=days_late)

    if last_payment_date > datetime.now():
        last_payment_date = datetime.now() - timedelta(days=int(np.random.randint(1, 30)))

    day_past_due = calculate_day_past_due(last_payment_date, next_due_date)
    current_emi = adjust_emi_for_delinquency(base_emi, day_past_due)
    amount_paid_each_month = base_emi
    outstanding_balance = calculate_outstanding_balance(
        loan_amount, interest_rate, tenure, months_completed,
        amount_paid_each_month, day_past_due
    )

    # Missed payments count
    missed_payment_probability = 0.4 if on_time_percentage < 0.7 else 0.05
    missed_payments_count = int(np.random.poisson(missed_payment_probability * months_completed / 6))

    # Partial payment indicator
    partial_payment = np.random.choice([True, False], p=[0.15, 0.85])

    # Customer details with realistic distributions
    age = int(random.normalvariate(45, 15))
    age = max(22, min(75, age))

    occupation_weights = [0.65, 0.10, 0.08, 0.07, 0.05, 0.05]  # More employed, fewer unemployed
    occupation = np.random.choice(["Employed", "Self-Employed", "Student", "Retired", "Unemployed", "Homemaker"],
                                p=occupation_weights)
    employment_status = "Employed" if occupation in ["Employed", "Self-Employed"] else "Unemployed"

    # Income band distribution (more in middle, fewer in extremes)
    income_band_probs = [0.20, 0.35, 0.25, 0.12, 0.05, 0.03]
    income_band = np.random.choice(list(income_bands.keys()), p=income_band_probs)
    monthly_income = calculate_monthly_income(income_band)

    # Determine lifestyle profile
    lifestyle = determine_lifestyle_profile(age, income_band, occupation)

    gender = np.random.choice(["Male", "Female", "Others"], p=[0.48, 0.48, 0.04])
    full_name = generate_singaporean_name()
    first_name, last_name = full_name.split(' ', 1) if ' ' in full_name else (full_name, '')

    # Generate all the new feature groups with interconnected logic
    communication_data = generate_communication_data(age, employment_status, monthly_income)
    spend_data = generate_unique_spend_profile(lifestyle, monthly_income, employment_status, age)
    payment_behavior = generate_payment_behavior(monthly_income, spend_data["total_monthly_spend"], age, employment_status, lifestyle)

    # Calculate financial health
    financial_health = calculate_financial_health(
        monthly_income, spend_data["total_monthly_spend"],
        payment_behavior["savings_ratio"], on_time_percentage, missed_payments_count, employment_status
    )

    # Contact success data
    call_attempts = max(1, int(np.random.poisson(4)))
    contact_success_data = generate_contact_success_data(call_attempts, on_time_percentage, employment_status)

    # Credit score calculation
    delinquency = np.random.choice([True, False], p=[0.15, 0.85])
    credit_score = calculate_credit_score(on_time_percentage, missed_payments_count, delinquency, partial_payment)

    # Contact methods
    sms_attempts = int(np.random.poisson(5))
    whatsapp_attempts = int(np.random.poisson(3))
    email_attempts = int(np.random.poisson(2))
    no_of_attempts = call_attempts + sms_attempts + whatsapp_attempts + email_attempts

    smartphone_penetration = "High" if age < 60 else "Medium" if age < 70 else "Low"

    # Track spending pattern uniqueness
    spend_pattern_key = f"{lifestyle}_{income_band}"
    spending_patterns_tracker[spend_pattern_key] += 1

    record = {
        # Basic loan information
        "Customer_id": generate_customer_id(),
        "Loan_Account_id": int(np.random.randint(10000000, 99999999)),
        "Product_Type": product_type,
        "Loan_Amount_SGD": loan_amount,
        "Outstanding_Balance_SGD": outstanding_balance,
        "Day_Past_Due": day_past_due,
        "Tenure": tenure,
        "Interest_Rate": interest_rate,
        "Current_EMI_SGD": current_emi,
        "Installment_Due_Date": next_due_date.date(),
        "Last_Payment_Date": last_payment_date.date(),

        # Payment behavior
        "Partial_Payment_Indicator": partial_payment,
        "Number_of_Past_Payments": months_completed,
        "Payment_Frequency": payment_frequency,
        "Amount_Paid_Each_Month_SGD": amount_paid_each_month,
        "Missed_Payments_Count": missed_payments_count,
        "Settlement_History": np.random.choice(["Settled", "Partial Settlement", "Not Settled", "Under Negotiation"]),
        "Repayment_Irregularity_Flags": on_time_percentage < 0.7,

        # Customer profile
        "Lifestyle_Profile": lifestyle,
        "Customer_Employment_Status": employment_status,

        # Communication data features
        "Communication_Data_Completion": communication_data["communication_completion"],
        "Valid_Phone_Number": communication_data["valid_phone_number"],
        "No_of_Valid_Numbers": communication_data["no_of_valid_numbers"],
        "No_of_Invalid_Numbers": communication_data["no_of_invalid_numbers"],
        "Mobile_Number_Change_Count": communication_data["mobile_change_count"],
        "Mobile_Number_Change_Count_This_Year": communication_data["mobile_change_this_year"],
        "Address_Change_Count": communication_data["address_change_count"],
        "Address_Change_Count_This_Year": communication_data["address_change_this_year"],
        "Contact_Data_Change_Frequency": communication_data["contact_change_frequency"],

        # Spend analysis features
        "Finance_Stress_Status": financial_health["finance_stress_status"],
        "Utility_Spend_SGD": spend_data["utility_spend"],
        "Shopping_Spend_SGD": spend_data["shopping_spend"],
        "Entertainment_Spend_SGD": spend_data["entertainment_spend"],
        "Health_Spend_SGD": spend_data["health_spend"],
        "Education_Spend_SGD": spend_data["education_spend"],
        "Travel_Spend_SGD": spend_data["travel_spend"],
        "Monthly_Spend_Trend_SGD": spend_data["monthly_spend_trend"],
        "Seasonal_Spend_Variation": spend_data["seasonal_spend_variation"],
        "Weekend_Spend_Ratio": spend_data["weekend_spend_ratio"],
        "Festive_Season_Spend_SGD": spend_data["festive_spend"],
        "Total_Monthly_Spend_SGD": spend_data["total_monthly_spend"],
        "Spend_to_Income_Ratio": spend_data["spend_ratio"],

        # Payment behavior features
        "UPI_Transaction_Count": payment_behavior["upi_count"],
        "Debit_Card_Transaction_Count": payment_behavior["debit_count"],
        "Credit_Card_Transaction_Count": payment_behavior["credit_count"],
        "Cash_Withdrawal_Count": payment_behavior["cash_count"],
        "Recurring_Transaction_Count": payment_behavior["recurring_count"],
        "Preferred_Payment_Channel": payment_behavior["preferred_channel"],
        "Recurring_Payment_Ratio": payment_behavior["recurring_ratio"],
        "Savings_to_Spend_Ratio": payment_behavior["savings_ratio"],
        "Spend_Growth_Rate_YoY": payment_behavior["spend_growth"],
        "High_Value_Transaction_Count": payment_behavior["high_value_count"],

        # Financial health features
        "Flight_Risk_Score": financial_health["flight_risk"],
        "Financial_Health_Status": financial_health["financial_health_status"],
        "Financial_Stress_Score": financial_health["financial_stress_score"],
        "Avg_Balance_Trends": financial_health["avg_balance_trend"],
        "Overdraft_or_Low_Balance_Flag": financial_health["overdraft_flag"],
        "AAR_Score": financial_health["aar_score"],
        "AAR_Risk_Level": financial_health["aar_risk_level"],

        # Contact success features
        "Successful_Contacts_Count": contact_success_data["successful_contacts"],
        "Contact_Success_Rate": contact_success_data["contact_success_rate"],
        "Last_Successful_Agent_ID": contact_success_data["last_successful_agent"],
        "Best_Contact_Agent_IDs": contact_success_data["best_agents"],
        "Avg_Time_With_Best_Agents_Min": contact_success_data["avg_times_best_agents"],
        "Customer_Best_Agent_Interaction_Count": contact_success_data["best_agent_interaction_count"],

        # Existing contact history
        "Contact_History_Call_Attempts": call_attempts,
        "Contact_History_SMS": sms_attempts,
        "Contact_History_WhatsApp": whatsapp_attempts,
        "Contact_History_EmailLogs": email_attempts,
        "Channel_used": np.random.choice(["Call", "SMS", "WhatsApp", "Email", "Field Agent", "IVR"]),
        "Response_Outcome": np.random.choice(["Connected", "Promised to pay", "Ignored", "Disconnected", "Paid fully", "Partial paid"]),
        "No_of_Attempts": no_of_attempts,
        "Average_Handling_Time": round(np.random.uniform(8, 25), 2),

        # Customer demographic data
        "Name": full_name,
        "Age": age,
        "Gender": gender,
        "Primary_Phone_Number": communication_data["primary_phone"],
        "Secondary_Mobile_Number": communication_data["secondary_phone"],
        "Landline_Phone_Number": generate_singapore_landline_number(),
        "Email_ID": generate_email(first_name, last_name),
        "Occupation": occupation,
        "Income_Band_SGD": income_band,
        "Monthly_Income_SGD": monthly_income,
        "Employeement_Type": np.random.choice(["Full time", "Part time", "Contract", "Freelance", "Unemployed"]),
        "Address": generate_singapore_address(),
        "City": random.choice(singapore_cities),
        "Language_Preference": np.random.choice(["English", "Regional"], p=[0.7, 0.3]),
        "Mobile_Number_Active_Status": communication_data["valid_phone_number"],
        "Email_Activity": np.random.choice([True, False], p=[0.8, 0.2]),

        # Digital activity
        "App_Login_Frequency": int(np.random.poisson(12)),
        "UPI_Transactions": payment_behavior["upi_count"],
        "Online_Banking_Activity": int(np.random.poisson(10)),
        "Smartphone_Penetration": smartphone_penetration,
        "Preferred_Channel": np.random.choice(["Call", "SMS", "WhatsApp", "Email", "App notification", "Field Agent", "IVR"]),
        "Call_SMS_Activity_Patterns": np.random.choice(["Low", "Medium", "High"]),
        "WhatsApp_OTT_usage_Indicator": np.random.choice([True, False], p=[0.8, 0.2]),

        # Credit information
        "Credit_Score": credit_score,
        "Recent_Inquiries": int(np.random.poisson(2)),
        "Loan_Exposure_Across_Banks": int(np.random.poisson(1)),
        "Delinquency_on_other_Loans": delinquency,
        "Recent_Score_Change": int(np.random.randint(-40, 40)),

        # Economic indicators
        "Unemployeement_rate_region": round(np.random.uniform(1.8, 2.2), 2),
        "Inflation_Rate": round(np.random.uniform(3.5, 5.0), 2),
        "Interest_Rate_Trend": round(np.random.uniform(-0.1, 0.3), 2),
        "Economic_Stress_Index": round(np.random.uniform(0.1, 0.3), 2),

        # Compliance data
        "Do_Not_Call_Registry_Data": np.random.choice([True, False], p=[0.15, 0.85]),
        "Regional_Time_Restrictions": np.random.choice(["Morning", "Afternoon", "Evening", "Night"]),
        "Communication_Complaince_Limits": np.random.choice(["Daytime", "Evening", "Weekdays", "Weekends", "Holidays"])
    }
    data.append(record)

# Create DataFrame
df = pd.DataFrame(data)

# Validate constraints
assert all(5000 <= amt <= 500000 for amt in df.Loan_Amount_SGD)
assert all(ob > 0 for ob in df.Outstanding_Balance_SGD)
assert all(tenure in [12,24,36,48,60] for tenure in df.Tenure)
assert all(8.0 <= rate <= 10.0 for rate in df.Interest_Rate)

# Format float columns to 2 decimal places
float_columns = df.select_dtypes(include=['float64']).columns
df[float_columns] = df[float_columns].round(2)

# Save to CSV
df.to_csv('singapore_loan_data_enhanced_unique.csv', index=False)

# Analysis of data quality and uniqueness
print(f"Generated {len(df)} records with enhanced unique features")
print("\n=== DATA QUALITY ANALYSIS ===")

# Check spending pattern uniqueness
print(f"\nUnique lifestyle-income combinations: {len(spending_patterns_tracker)}")
print("Top spending patterns:")
for pattern, count in sorted(spending_patterns_tracker.items(), key=lambda x: x[1], reverse=True)[:5]:
    print(f"  {pattern}: {count} customers")

# Check anomalies
high_stress = df[df['Finance_Stress_Status'] == 'Extreme High stress']
negative_savings = df[df['Savings_to_Spend_Ratio'] < 0]
over_spenders = df[df['Spend_to_Income_Ratio'] > 1.2]
high_flight_risk = df[df['Flight_Risk_Score'] > 0.8]

print(f"\n=== ANOMALIES AND EXTREMES ===")
print(f"Customers with extreme high stress: {len(high_stress)} ({len(high_stress)/len(df)*100:.1f}%)")
print(f"Customers with negative savings: {len(negative_savings)} ({len(negative_savings)/len(df)*100:.1f}%)")
print(f"Customers spending >120% of income: {len(over_spenders)} ({len(over_spenders)/len(df)*100:.1f}%)")
print(f"Customers with high flight risk (>0.8): {len(high_flight_risk)} ({len(high_flight_risk)/len(df)*100:.1f}%)")

# Check spending pattern diversity
spend_columns = ['Utility_Spend_SGD', 'Shopping_Spend_SGD', 'Entertainment_Spend_SGD',
                 'Health_Spend_SGD', 'Education_Spend_SGD', 'Travel_Spend_SGD']

print(f"\n=== SPENDING DIVERSITY ===")
for col in spend_columns:
    unique_ratios = len(df[col].unique())
    print(f"{col}: {unique_ratios} unique values ({unique_ratios/len(df)*100:.1f}% uniqueness)")

print(f"\nData saved to singapore_loan_data_enhanced_unique.csv")
print("\nSample of key features for first 5 customers:")
sample_cols = ['Name', 'Age', 'Occupation', 'Monthly_Income_SGD', 'Total_Monthly_Spend_SGD',
               'Spend_to_Income_Ratio', 'Savings_to_Spend_Ratio', 'Finance_Stress_Status', 'Lifestyle_Profile']
print(df[sample_cols].head().to_string(index=False))

print("\nSample spending patterns:")
spend_sample = df[['Name', 'Utility_Spend_SGD', 'Shopping_Spend_SGD', 'Entertainment_Spend_SGD',
                   'Health_Spend_SGD', 'Education_Spend_SGD', 'Travel_Spend_SGD']].head(8)
print(spend_sample.to_string(index=False))

Generated 1000 records with enhanced unique features

=== DATA QUALITY ANALYSIS ===

Unique lifestyle-income combinations: 22
Top spending patterns:
  Family Focused_50,000 to 100,000: 125 customers
  Established Career_50,000 to 100,000: 102 customers
  Family Focused_100,000 to 200,000: 87 customers
  Young Professional_50,000 to 100,000: 86 customers
  Family Focused_50,000 or Below: 66 customers

=== ANOMALIES AND EXTREMES ===
Customers with extreme high stress: 273 (27.3%)
Customers with negative savings: 219 (21.9%)
Customers spending >120% of income: 68 (6.8%)
Customers with high flight risk (>0.8): 18 (1.8%)

=== SPENDING DIVERSITY ===
Utility_Spend_SGD: 999 unique values (99.9% uniqueness)
Shopping_Spend_SGD: 998 unique values (99.8% uniqueness)
Entertainment_Spend_SGD: 998 unique values (99.8% uniqueness)
Health_Spend_SGD: 998 unique values (99.8% uniqueness)
Education_Spend_SGD: 996 unique values (99.6% uniqueness)
Travel_Spend_SGD: 999 unique values (99.9% uniqueness)

Data

In [7]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random
from collections import defaultdict
import gc

# Configuration
num_records = 100000  # 100,000 customers
batch_size = 20000    # Process in batches to manage memory

# Singapore-specific data
singapore_districts = {
    "01": ["Raffles Place", "Marina", "People's Park"],
    "02": ["Anson", "Tanjong Pagar"],
    "03": ["Queenstown", "Tiong Bahru"],
    "04": ["Telok Blangah", "HarbourFront"],
    "05": ["Pasir Panjang", "Hong Leong Garden", "Clementi New Town"],
    "06": ["High Street", "Beach Road"],
    "07": ["Middle Road", "Golden Mile"],
    "08": ["Little India", "Farrer Park"],
    "09": ["Orchard", "Cairnhill", "River Valley"],
    "10": ["Ardmore", "Bukit Timah", "Holland Road"],
    "11": ["Watten Estate", "Novena", "Thomson"],
    "12": ["Balestier", "Toa Payoh", "Serangoon"],
    "13": ["Macpherson", "Braddell"],
    "14": ["Geylang", "Eunos"],
    "15": ["Katong", "Joo Chiat", "Amber Road"],
    "16": ["Bedok", "Upper East Coast", "Eastwood", "Kew Drive"],
    "17": ["Loyang", "Changi"],
    "18": ["Tampines", "Pasir Ris"],
    "19": ["Serangoon Garden", "Hougang", "Punggol"],
    "20": ["Bishan", "Ang Mo Kio"],
    "21": ["Upper Bukit Timah", "Clementi Park", "Ulu Pandan"],
    "22": ["Jurong"],
    "23": ["Hillview", "Dairy Farm", "Bukit Panjang", "Choa Chu Kang"],
    "24": ["Lim Chu Kang", "Tengah"],
    "25": ["Kranji", "Woodgrove"],
    "26": ["Upper Thomson", "Springleaf"],
    "27": ["Yishun", "Sembawang"],
    "28": ["Seletar"]
}

# Optimized data structures
street_names = np.array([
    "Orchard Road", "North Bridge Road", "South Bridge Road", "Victoria Street",
    "Beach Road", "Serangoon Road", "Havelock Road", "Outram Road",
    "Cecil Street", "Robinson Road", "Maxwell Road", "Telok Ayer Street",
    "Bras Basah Road", "Bugis Street", "Chinatown Street", "Clarke Quay"
])

# Pre-computed arrays for performance
chinese_first_names = np.array(["Wei", "Jie", "Ming", "Li", "Xin", "Hui", "Yan", "Feng", "Jun", "Ling"])
chinese_last_names = np.array(["Tan", "Lim", "Lee", "Ng", "Ong", "Wong", "Chua", "Chan", "Koh", "Teo"])
malay_first_names = np.array(["Ahmad", "Mohamed", "Siti", "Abdullah", "Fatimah", "Ali", "Rahman", "Zainal", "Nor", "Hassan"])
malay_last_names = np.array(["bin Ahmad", "binti Mohamed", "bin Ismail", "binti Abdullah", "bin Ali", "binti Hassan"])
indian_first_names = np.array(["Raj", "Kumar", "Suresh", "Priya", "Latha", "Arjun", "Divya", "Vijay", "Anand", "Deepa"])
indian_last_names = np.array(["Kumar", "Devi", "Singh", "Rao", "Patel", "Menon", "Pillai", "Sharma", "Subramaniam", "Gopal"])
western_first_names = np.array(["John", "David", "Mary", "Sarah", "Michael", "Jennifer", "James", "Linda", "Robert", "Elizabeth"])
western_last_names = np.array(["Smith", "Johnson", "Williams", "Brown", "Jones", "Miller", "Davis", "Garcia", "Rodriguez", "Wilson"])

email_domains = np.array(["gmail.com", "yahoo.com", "hotmail.com", "outlook.com", "singnet.com.sg"])

product_types = np.array(["Personal loan", "Auto loan", "Credit card", "Education loan", "Business loan"])
interest_rates = {"Personal loan": 8.5, "Auto loan": 9.0, "Credit card": 10.0, "Education loan": 8.0, "Business loan": 9.5}
loan_amount_limits = {
    "Personal loan": (5000, 100000), "Auto loan": (5000, 200000), "Credit card": (5000, 50000),
    "Education loan": (5000, 100000), "Business loan": (5000, 500000)
}

# Enhanced income bands
income_bands = {
    "50,000 or Below": (25000, 50000),
    "50,000 to 100,000": (50000, 100000),
    "100,000 to 200,000": (100000, 200000),
    "200,000 to 300,000": (200000, 300000),
    "300,000 to 500,000": (300000, 500000),
    "500,000 or Above": (500000, 2000000)
}

singapore_cities = np.array([
    "Singapore City", "Jurong East", "Tampines", "Woodlands", "Bedok", "Sengkang",
    "Hougang", "Yishun", "Ang Mo Kio", "Bukit Merah", "Bukit Batok", "Pasir Ris",
    "Clementi", "Bishan", "Toa Payoh", "Serangoon", "Queenstown", "Punggol", "Kallang", "Bukit Panjang"
])

agent_ids = np.array([f"SCB_AG_{i:04d}" for i in range(1, 201)])  # 200 agents for 100k customers

# Customer lifestyle profiles
lifestyle_profiles = {
    "Young Professional": {"age_range": (22, 35), "spending_focus": ["entertainment", "shopping", "travel"]},
    "Family Focused": {"age_range": (30, 50), "spending_focus": ["education", "shopping", "health"]},
    "Established Career": {"age_range": (40, 60), "spending_focus": ["travel", "health", "entertainment"]},
    "Retirement Phase": {"age_range": (60, 75), "spending_focus": ["health", "utility", "shopping"]},
    "Student Lifestyle": {"age_range": (18, 25), "spending_focus": ["education", "entertainment", "shopping"]},
    "High Net Worth": {"age_range": (35, 65), "spending_focus": ["travel", "entertainment", "shopping"]}
}

# Pre-computed probability distributions
occupation_weights = np.array([0.65, 0.10, 0.08, 0.07, 0.05, 0.05])
occupations_list = np.array(["Employed", "Self-Employed", "Student", "Retired", "Unemployed", "Homemaker"])
income_band_probs = np.array([0.20, 0.35, 0.25, 0.12, 0.05, 0.03])
income_band_names = np.array(list(income_bands.keys()))
ethnicity_probs = np.array([0.74, 0.13, 0.09, 0.04])
genders = np.array(["Male", "Female", "Others"])
gender_probs = np.array([0.48, 0.48, 0.04])

# Vectorized helper functions
def generate_singaporean_names(n):
    """Generate n Singaporean names using vectorized operations"""
    ethnicities = np.random.choice(["Chinese", "Malay", "Indian", "Western"], n, p=ethnicity_probs)

    first_names = []
    last_names = []

    for i in range(n):
        ethnicity = ethnicities[i]
        if ethnicity == "Chinese":
            first_names.append(np.random.choice(chinese_first_names))
            last_names.append(np.random.choice(chinese_last_names))
        elif ethnicity == "Malay":
            first_names.append(np.random.choice(malay_first_names))
            last_names.append(np.random.choice(malay_last_names))
        elif ethnicity == "Indian":
            first_names.append(np.random.choice(indian_first_names))
            last_names.append(np.random.choice(indian_last_names))
        else:
            first_names.append(np.random.choice(western_first_names))
            last_names.append(np.random.choice(western_last_names))

    return [f"{first} {last}" for first, last in zip(first_names, last_names)]

def generate_singapore_mobile_numbers(n, valid_ratio=0.92):
    """Generate n mobile numbers with vectorized approach"""
    prefixes = np.random.choice(['8', '9'], n)
    numbers = np.random.randint(0, 10000000, n)  # 7-digit numbers
    numbers = [f"{num:07d}" for num in numbers]

    valid_mask = np.random.random(n) < valid_ratio

    mobile_numbers = []
    for i in range(n):
        if valid_mask[i]:
            mobile_numbers.append(f"+65 {prefixes[i]}{numbers[i][:3]} {numbers[i][3:]}")
        else:
            # Invalid numbers
            invalid_types = [
                f"+65 {prefixes[i]}{numbers[i][:6]}",  # Missing digit
                f"+65 {np.random.choice(['6', '7'])}{numbers[i]}",  # Wrong prefix
                f"+65 {prefixes[i]}{numbers[i]}{np.random.randint(0,10)}",  # Extra digit
            ]
            mobile_numbers.append(np.random.choice(invalid_types))

    return mobile_numbers, valid_mask

def generate_emails(first_names, last_names, n):
    """Generate emails using vectorized operations"""
    emails = []
    for i in range(n):
        clean_first = first_names[i].lower().replace(' ', '')
        clean_last = last_names[i].lower().replace(' ', '')

        formats = [
            f"{clean_first}.{clean_last}",
            f"{clean_first}{clean_last}",
            f"{clean_first}_{clean_last}",
        ]
        username = np.random.choice(formats)
        domain = np.random.choice(email_domains)
        emails.append(f"{username}@{domain}")

    return emails

def calculate_emis_vectorized(principals, rates, tenures):
    """Vectorized EMI calculation"""
    monthly_rates = rates / 1200
    emis = (principals * monthly_rates * (1 + monthly_rates) ** tenures) / ((1 + monthly_rates) ** tenures - 1)
    return np.round(emis, 2)

def generate_loan_data(n):
    """Generate loan data in bulk"""
    product_choices = np.random.choice(product_types, n)
    tenures = np.random.choice([12, 24, 36, 48, 60], n)

    loan_amounts = []
    interest_rates_arr = []

    for product in product_choices:
        min_amt, max_amt = loan_amount_limits[product]
        if product in ["Auto loan", "Business loan"]:
            step = 5000
        else:
            step = 1000
        amount = np.random.choice(range(min_amt, max_amt + 1, step))
        loan_amounts.append(amount)
        interest_rates_arr.append(interest_rates[product])

    emis = calculate_emis_vectorized(np.array(loan_amounts), np.array(interest_rates_arr), tenures)

    return product_choices, loan_amounts, interest_rates_arr, tenures, emis

def determine_lifestyle_profiles(ages, income_bands, occupations):
    """Vectorized lifestyle profile determination"""
    profiles = []
    for i in range(len(ages)):
        age = ages[i]
        income_band = income_bands[i]
        occupation = occupations[i]

        if age >= 60:
            profiles.append("Retirement Phase")
        elif age <= 25 and occupation == "Student":
            profiles.append("Student Lifestyle")
        elif income_band in ["500,000 or Above", "300,000 to 500,000"]:
            profiles.append("High Net Worth")
        elif 30 <= age <= 50 and occupation in ["Employed", "Self-Employed"]:
            profiles.append("Family Focused")
        elif 40 <= age <= 60:
            profiles.append("Established Career")
        else:
            profiles.append("Young Professional")

    return profiles

def calculate_monthly_incomes(income_bands_arr):
    """Vectorized monthly income calculation"""
    monthly_incomes = []
    for income_band in income_bands_arr:
        min_income, max_income = income_bands[income_band]

        if income_band == "50,000 or Below":
            annual_income = min_income + (max_income - min_income) * np.random.beta(2, 5)
        elif income_band == "500,000 or Above":
            annual_income = min_income + (max_income - min_income) * np.random.beta(5, 2)
        else:
            annual_income = np.random.normal(
                (min_income + max_income) / 2,
                (max_income - min_income) / 6
            )

        annual_income = np.clip(annual_income, min_income, max_income)
        monthly_incomes.append(annual_income / 12)

    return np.round(monthly_incomes, 2)

def generate_batch_data(batch_size, start_id):
    """Generate a batch of customer data"""
    # Customer demographics
    ages = np.clip(np.random.normal(45, 15, batch_size).astype(int), 22, 75)
    occupations = np.random.choice(occupations_list, batch_size, p=occupation_weights)
    employment_status = np.array(["Employed" if occ in ["Employed", "Self-Employed"] else "Unemployed" for occ in occupations])
    income_bands_arr = np.random.choice(income_band_names, batch_size, p=income_band_probs)
    monthly_incomes = calculate_monthly_incomes(income_bands_arr)
    genders_arr = np.random.choice(genders, batch_size, p=gender_probs)

    # Generate names and contact info
    full_names = generate_singaporean_names(batch_size)
    first_names = [name.split()[0] for name in full_names]
    last_names = [name.split()[1] if ' ' in name else '' for name in full_names]

    # Communication data
    mobile_numbers, valid_primary_mask = generate_singapore_mobile_numbers(batch_size)
    has_secondary = np.random.random(batch_size) > 0.15
    secondary_numbers, valid_secondary_mask = generate_singapore_mobile_numbers(batch_size, 0.75)
    secondary_numbers = [num if has_sec else "" for num, has_sec in zip(secondary_numbers, has_secondary)]
    valid_secondary_mask = [mask if has_sec else False for mask, has_sec in zip(valid_secondary_mask, has_secondary)]

    # Count valid numbers
    no_of_valid_numbers = []
    no_of_invalid_numbers = []
    for i in range(batch_size):
        valid_count = sum([valid_primary_mask[i], valid_secondary_mask[i]]) if has_secondary[i] else valid_primary_mask[i]
        no_of_valid_numbers.append(valid_count)
        no_of_invalid_numbers.append((2 if has_secondary[i] else 1) - valid_count)

    # Communication completion
    communication_completion = np.round(np.clip(
        np.random.normal(0.9, 0.08, batch_size), 0.7, 1.0
    ), 2)

    # Mobile number changes
    mobile_change_base = np.where(employment_status == "Unemployed",
                                 np.random.randint(1, 5, batch_size),
                                 np.random.randint(0, 3, batch_size))
    age_factor = np.maximum(0, (35 - ages) / 15)
    mobile_change_count = np.maximum(0, mobile_change_base + (age_factor * 3).astype(int))
    mobile_change_this_year = np.minimum(mobile_change_count, np.random.randint(0, 3, batch_size))

    # Address changes
    address_change_count = np.where(
        (employment_status == "Unemployed") | (ages < 30),
        np.random.randint(0, 4, batch_size),
        np.random.randint(0, 2, batch_size)
    )
    address_change_this_year = np.minimum(address_change_count, np.random.randint(0, 2, batch_size))

    # Contact change frequency
    customer_age_years = np.maximum(5, ages - 18)
    total_changes = mobile_change_count + address_change_count
    contact_change_frequency = np.round(total_changes / customer_age_years, 2)

    # Lifestyle profiles
    lifestyles = determine_lifestyle_profiles(ages, income_bands_arr, occupations)

    # Generate loan data
    product_types_arr, loan_amounts, interest_rates_arr, tenures, base_emis = generate_loan_data(batch_size)

    # Payment behavior
    on_time_percentages = np.random.beta(3, 2, batch_size)
    payment_frequencies = ["Regular" if p >= 0.8 else "Irregular" for p in on_time_percentages]

    # Loan dates and status
    days_since_loan_start = np.random.randint(180, 1800, batch_size)
    months_completed = np.minimum(tenures - 1, (days_since_loan_start / 30).astype(int))

    # Generate batch data
    batch_data = []
    for i in range(batch_size):
        # Loan-specific calculations
        days_past_due = np.random.randint(0, 120) if np.random.random() > on_time_percentages[i] else 0
        current_emi = adjust_emi_for_delinquency(base_emis[i], days_past_due)

        # Outstanding balance (simplified for performance)
        remaining_months = tenures[i] - months_completed[i]
        if remaining_months <= 0:
            outstanding_balance = loan_amounts[i] * 0.01
        else:
            outstanding_balance = current_emi * remaining_months * np.random.uniform(0.8, 1.2)

        outstanding_balance = max(loan_amounts[i] * 0.01, outstanding_balance)

        # Missed payments
        missed_payment_prob = 0.4 if on_time_percentages[i] < 0.7 else 0.05
        missed_payments_count = int(np.random.poisson(missed_payment_prob * months_completed[i] / 6))

        # Spending profile
        spend_profile = generate_spend_profile(
            lifestyles[i], monthly_incomes[i], employment_status[i], ages[i]
        )

        # Payment behavior
        payment_behavior = generate_payment_behavior(
            monthly_incomes[i], spend_profile["total_monthly_spend"],
            ages[i], employment_status[i], lifestyles[i]
        )

        # Financial health
        financial_health = calculate_financial_health(
            monthly_incomes[i], spend_profile["total_monthly_spend"],
            payment_behavior["savings_ratio"], on_time_percentages[i],
            missed_payments_count, employment_status[i]
        )

        # Contact success
        call_attempts = max(1, int(np.random.poisson(4)))
        contact_success = generate_contact_success_data(
            call_attempts, on_time_percentages[i], employment_status[i]
        )

        # Credit score
        delinquency = np.random.random() < 0.15
        credit_score = calculate_credit_score(
            on_time_percentages[i], missed_payments_count, delinquency,
            np.random.random() < 0.15
        )

        record = {
            # Basic loan information
            "Customer_id": f"SCB{start_id + i:09d}",
            "Loan_Account_id": int(np.random.randint(10000000, 99999999)),
            "Product_Type": product_types_arr[i],
            "Loan_Amount_SGD": round(float(loan_amounts[i]), 2),
            "Outstanding_Balance_SGD": round(outstanding_balance, 2),
            "Day_Past_Due": days_past_due,
            "Tenure": tenures[i],
            "Interest_Rate": interest_rates_arr[i],
            "Current_EMI_SGD": current_emi,
            "Installment_Due_Date": (datetime.now() + timedelta(days=np.random.randint(1, 30))).date(),
            "Last_Payment_Date": (datetime.now() - timedelta(days=np.random.randint(1, 120))).date(),

            # Payment behavior
            "Partial_Payment_Indicator": np.random.random() < 0.15,
            "Number_of_Past_Payments": months_completed[i],
            "Payment_Frequency": payment_frequencies[i],
            "Amount_Paid_Each_Month_SGD": base_emis[i],
            "Missed_Payments_Count": missed_payments_count,
            "Settlement_History": np.random.choice(["Settled", "Partial Settlement", "Not Settled", "Under Negotiation"]),
            "Repayment_Irregularity_Flags": on_time_percentages[i] < 0.7,

            # Customer profile
            "Lifestyle_Profile": lifestyles[i],
            "Customer_Employment_Status": employment_status[i],

            # Communication data
            "Communication_Data_Completion": communication_completion[i],
            "Valid_Phone_Number": valid_primary_mask[i],
            "No_of_Valid_Numbers": no_of_valid_numbers[i],
            "No_of_Invalid_Numbers": no_of_invalid_numbers[i],
            "Mobile_Number_Change_Count": mobile_change_count[i],
            "Mobile_Number_Change_Count_This_Year": mobile_change_this_year[i],
            "Address_Change_Count": address_change_count[i],
            "Address_Change_Count_This_Year": address_change_this_year[i],
            "Contact_Data_Change_Frequency": contact_change_frequency[i],

            # Spend analysis
            "Finance_Stress_Status": financial_health["finance_stress_status"],
            "Utility_Spend_SGD": spend_profile["utility_spend"],
            "Shopping_Spend_SGD": spend_profile["shopping_spend"],
            "Entertainment_Spend_SGD": spend_profile["entertainment_spend"],
            "Health_Spend_SGD": spend_profile["health_spend"],
            "Education_Spend_SGD": spend_profile["education_spend"],
            "Travel_Spend_SGD": spend_profile["travel_spend"],
            "Monthly_Spend_Trend_SGD": spend_profile["monthly_spend_trend"],
            "Seasonal_Spend_Variation": spend_profile["seasonal_spend_variation"],
            "Weekend_Spend_Ratio": spend_profile["weekend_spend_ratio"],
            "Festive_Season_Spend_SGD": spend_profile["festive_spend"],
            "Total_Monthly_Spend_SGD": spend_profile["total_monthly_spend"],
            "Spend_to_Income_Ratio": spend_profile["spend_ratio"],

            # Payment behavior
            "UPI_Transaction_Count": payment_behavior["upi_count"],
            "Debit_Card_Transaction_Count": payment_behavior["debit_count"],
            "Credit_Card_Transaction_Count": payment_behavior["credit_count"],
            "Cash_Withdrawal_Count": payment_behavior["cash_count"],
            "Recurring_Transaction_Count": payment_behavior["recurring_count"],
            "Preferred_Payment_Channel": payment_behavior["preferred_channel"],
            "Recurring_Payment_Ratio": payment_behavior["recurring_ratio"],
            "Savings_to_Spend_Ratio": payment_behavior["savings_ratio"],
            "Spend_Growth_Rate_YoY": payment_behavior["spend_growth"],
            "High_Value_Transaction_Count": payment_behavior["high_value_count"],

            # Financial health
            "Flight_Risk_Score": financial_health["flight_risk"],
            "Financial_Health_Status": financial_health["financial_health_status"],
            "Financial_Stress_Score": financial_health["financial_stress_score"],
            "Avg_Balance_Trends": financial_health["avg_balance_trend"],
            "Overdraft_or_Low_Balance_Flag": financial_health["overdraft_flag"],
            "AAR_Score": financial_health["aar_score"],
            "AAR_Risk_Level": financial_health["aar_risk_level"],

            # Contact success
            "Successful_Contacts_Count": contact_success["successful_contacts"],
            "Contact_Success_Rate": contact_success["contact_success_rate"],
            "Last_Successful_Agent_ID": contact_success["last_successful_agent"],
            "Best_Contact_Agent_IDs": contact_success["best_agents"],
            "Avg_Time_With_Best_Agents_Min": contact_success["avg_times_best_agents"],
            "Customer_Best_Agent_Interaction_Count": contact_success["customer_best_agent_interaction_count"],

            # Customer demographics
            "Name": full_names[i],
            "Age": ages[i],
            "Gender": genders_arr[i],
            "Primary_Phone_Number": mobile_numbers[i],
            "Secondary_Mobile_Number": secondary_numbers[i],
            "Landline_Phone_Number": f"+65 6{np.random.randint(1000000, 9999999):07d}",
            "Email_ID": generate_emails([first_names[i]], [last_names[i]], 1)[0],
            "Occupation": occupations[i],
            "Income_Band_SGD": income_bands_arr[i],
            "Monthly_Income_SGD": monthly_incomes[i],
            "Employeement_Type": np.random.choice(["Full time", "Part time", "Contract", "Freelance", "Unemployed"]),
            "Address": f"{np.random.randint(1, 999)} {np.random.choice(street_names)} #{np.random.randint(1, 50):02d}-{np.random.randint(1, 99):02d} Singapore {np.random.randint(100000, 999999)}",
            "City": np.random.choice(singapore_cities),
            "Language_Preference": np.random.choice(["English", "Regional"], p=[0.7, 0.3]),
            "Mobile_Number_Active_Status": valid_primary_mask[i],
            "Email_Activity": np.random.random() < 0.8,

            # Additional fields for completeness
            "Credit_Score": credit_score,
            "Recent_Inquiries": int(np.random.poisson(2)),
            "Loan_Exposure_Across_Banks": int(np.random.poisson(1)),
            "Delinquency_on_other_Loans": delinquency,
        }
        batch_data.append(record)

    return batch_data

# Individual helper functions (optimized)
def adjust_emi_for_delinquency(base_emi, days_past_due):
    if days_past_due <= 0: return base_emi
    elif days_past_due <= 15: return round(base_emi * 1.02, 2)
    elif days_past_due <= 30: return round(base_emi * 1.05, 2)
    elif days_past_due <= 60: return round(base_emi * 1.08, 2)
    elif days_past_due <= 90: return round(base_emi * 1.12, 2)
    else: return round(base_emi * 1.15, 2)

def generate_spend_profile(lifestyle, monthly_income, employment_status, age):
    """Generate spending profile for a customer"""
    base_profiles = {
        "Young Professional": {"utility": 0.10, "shopping": 0.28, "entertainment": 0.20, "health": 0.08, "education": 0.12, "travel": 0.22},
        "Family Focused": {"utility": 0.15, "shopping": 0.30, "entertainment": 0.10, "health": 0.12, "education": 0.18, "travel": 0.15},
        "Established Career": {"utility": 0.12, "shopping": 0.25, "entertainment": 0.15, "health": 0.15, "education": 0.08, "travel": 0.25},
        "Retirement Phase": {"utility": 0.18, "shopping": 0.22, "entertainment": 0.12, "health": 0.25, "education": 0.05, "travel": 0.18},
        "Student Lifestyle": {"utility": 0.12, "shopping": 0.25, "entertainment": 0.25, "health": 0.08, "education": 0.20, "travel": 0.10},
        "High Net Worth": {"utility": 0.08, "shopping": 0.30, "entertainment": 0.22, "health": 0.12, "education": 0.08, "travel": 0.20}
    }

    base_ratios = base_profiles[lifestyle].copy()

    # Individual variation
    for category in base_ratios:
        base_ratios[category] *= np.random.uniform(0.7, 1.3)

    # Normalize
    total = sum(base_ratios.values())
    for category in base_ratios:
        base_ratios[category] /= total

    # Total spend calculation
    if employment_status == "Unemployed":
        spend_ratio = np.random.uniform(0.7, 1.3)
    elif lifestyle == "High Net Worth":
        spend_ratio = np.random.uniform(0.3, 0.6)
    elif lifestyle == "Student Lifestyle":
        spend_ratio = np.random.uniform(0.8, 1.1)
    else:
        spend_ratio = np.random.uniform(0.5, 0.8)

    # Anomalies
    if np.random.random() < 0.05:
        spend_ratio = np.random.choice([np.random.uniform(0.2, 0.4), np.random.uniform(1.2, 1.8)])

    total_monthly_spend = monthly_income * spend_ratio

    # Calculate category spends
    spends = {}
    for category, ratio in base_ratios.items():
        base_spend = total_monthly_spend * ratio
        monthly_variation = np.random.uniform(0.85, 1.15)
        spends[category] = max(50, base_spend * monthly_variation)

    actual_total_spend = sum(spends.values())

    return {
        "utility_spend": round(spends["utility"], 2),
        "shopping_spend": round(spends["shopping"], 2),
        "entertainment_spend": round(spends["entertainment"], 2),
        "health_spend": round(spends["health"], 2),
        "education_spend": round(spends["education"], 2),
        "travel_spend": round(spends["travel"], 2),
        "monthly_spend_trend": round(np.random.normal(total_monthly_spend, total_monthly_spend * 0.12), 2),
        "seasonal_spend_variation": round(np.random.uniform(0.08, 0.35), 2),
        "weekend_spend_ratio": round(np.random.uniform(0.22, 0.48), 2),
        "festive_spend": round(total_monthly_spend * np.random.uniform(1.15, 2.0), 2),
        "total_monthly_spend": round(actual_total_spend, 2),
        "spend_ratio": round(spend_ratio, 2)
    }

def generate_payment_behavior(monthly_income, total_spend, age, employment_status, lifestyle):
    """Generate payment behavior"""
    if lifestyle == "High Net Worth":
        base_intensity = total_spend / 200
    else:
        base_intensity = total_spend / 80

    # Age factors
    if age < 35:
        upi_factor, credit_factor, cash_factor = 1.4, 0.8, 0.7
    elif age > 55:
        upi_factor, credit_factor, cash_factor = 0.6, 1.1, 1.3
    else:
        upi_factor, credit_factor, cash_factor = 1.0, 1.0, 1.0

    upi_count = max(5, int(np.random.normal(base_intensity * 0.4 * upi_factor, 8)))
    debit_count = max(3, int(np.random.normal(base_intensity * 0.3, 5)))
    credit_count = max(2, int(np.random.normal(base_intensity * 0.2 * credit_factor, 4)))
    cash_count = max(1, int(np.random.normal(base_intensity * 0.1 * cash_factor, 3)))

    # Recurring transactions
    if lifestyle in ["Family Focused", "Established Career"]:
        recurring_count = np.random.randint(6, 13)
    else:
        recurring_count = np.random.randint(3, 9)

    # Preferred channel
    channels = ["UPI", "Debit Card", "Credit Card", "Cash"]
    preferred_channel = channels[np.argmax([upi_count, debit_count, credit_count, cash_count])]

    # Recurring payment ratio
    total_transactions = upi_count + debit_count + credit_count + cash_count
    recurring_ratio = recurring_count / total_transactions if total_transactions > 0 else 0.1

    # Savings ratio
    if employment_status == "Unemployed":
        savings_ratio = np.random.uniform(-0.25, 0.05)
    elif lifestyle == "High Net Worth":
        savings_ratio = np.random.uniform(0.25, 0.45)
    elif lifestyle == "Student Lifestyle":
        savings_ratio = np.random.uniform(-0.1, 0.1)
    else:
        savings_ratio = np.random.uniform(0.05, 0.25)

    if np.random.random() < 0.03:
        savings_ratio = np.random.choice([np.random.uniform(-0.4, -0.2), np.random.uniform(0.5, 0.7)])

    # Spend growth rate
    if employment_status == "Unemployed":
        spend_growth = np.random.normal(-0.08, 0.15)
    else:
        spend_growth = np.random.normal(0.04, 0.12)

    high_value_threshold = monthly_income * 0.07
    high_value_count = max(0, int(np.random.poisson(total_spend / high_value_threshold * 0.08)))

    return {
        "upi_count": upi_count,
        "debit_count": debit_count,
        "credit_count": credit_count,
        "cash_count": cash_count,
        "recurring_count": recurring_count,
        "preferred_channel": preferred_channel,
        "recurring_ratio": round(recurring_ratio, 2),
        "savings_ratio": round(savings_ratio, 2),
        "spend_growth": round(spend_growth, 2),
        "high_value_count": high_value_count
    }

def calculate_financial_health(monthly_income, total_spend, savings_ratio, payment_history, missed_payments, employment_status):
    """Calculate financial health metrics"""
    spend_income_ratio = total_spend / monthly_income if monthly_income > 0 else 2.0

    if employment_status == "Unemployed":
        weights = {"spend_ratio": 0.5, "savings": 0.3, "payments": 0.2}
    else:
        weights = {"spend_ratio": 0.4, "savings": 0.3, "payments": 0.3}

    savings_factor = 1 - max(0, savings_ratio)
    payment_factor = 1 - payment_history
    missed_payment_factor = min(1, missed_payments * 0.25)

    base_stress = (
        min(2.0, spend_income_ratio) * weights["spend_ratio"] +
        savings_factor * weights["savings"] +
        (payment_factor * 0.7 + missed_payment_factor * 0.3) * weights["payments"]
    )

    financial_stress_score = min(100, max(0, base_stress * 100))

    # Stress status
    if financial_stress_score >= 75:
        stress_status = "Extreme High stress"
    elif financial_stress_score >= 55:
        stress_status = "High stress"
    elif financial_stress_score >= 35:
        stress_status = "Medium stress"
    else:
        stress_status = "Low stress"

    # Health status
    if financial_stress_score <= 25:
        health_status = "Healthy"
    elif financial_stress_score <= 50:
        health_status = "Moderate"
    else:
        health_status = "Stressed"

    # Balance trends
    if savings_ratio > 0.2:
        balance_trend = "Rising"
    elif savings_ratio < -0.1:
        balance_trend = "Falling"
    else:
        balance_trend = "Stable"

    # Other metrics
    overdraft_flag = (savings_ratio < -0.15 or spend_income_ratio > 1.3)

    aar_score = np.random.normal(45, 25)
    if np.random.random() < 0.05:
        aar_score = np.random.uniform(75, 95)
    elif np.random.random() < 0.08:
        aar_score = np.random.uniform(5, 25)
    aar_score = np.clip(aar_score, 0, 100)

    if aar_score <= 25:
        aar_risk = "Low"
    elif aar_score <= 70:
        aar_risk = "Medium"
    else:
        aar_risk = "High"

    # Flight risk
    financial_factors = financial_stress_score / 100 * 0.4
    behavioral_factors = (1 - payment_history) * 0.3
    stability_factors = min(1, missed_payments * 0.1) * 0.2
    engagement_factors = (1 - min(1, payment_history)) * 0.1

    flight_risk = min(1.0, financial_factors + behavioral_factors + stability_factors + engagement_factors)

    if np.random.random() < 0.04:
        flight_risk = np.random.choice([np.random.uniform(0.8, 0.95), np.random.uniform(0.05, 0.15)])

    return {
        "finance_stress_status": stress_status,
        "financial_health_status": health_status,
        "financial_stress_score": round(financial_stress_score, 2),
        "flight_risk": round(flight_risk, 2),
        "avg_balance_trend": balance_trend,
        "overdraft_flag": overdraft_flag,
        "aar_score": round(aar_score, 2),
        "aar_risk_level": aar_risk
    }

def generate_contact_success_data(call_attempts, payment_history, employment_status):
    """Generate contact success data"""
    if payment_history > 0.8 and employment_status == "Employed":
        base_success_rate = np.random.uniform(0.5, 0.8)
    elif payment_history > 0.6:
        base_success_rate = np.random.uniform(0.3, 0.6)
    else:
        base_success_rate = np.random.uniform(0.1, 0.4)

    success_rate = np.clip(np.random.normal(base_success_rate, 0.15), 0.05, 0.95)
    successful_contacts = int(call_attempts * success_rate)

    # Agent data
    available_agents = np.random.choice(agent_ids, np.random.randint(2, 6), replace=False)
    last_successful_agent = np.random.choice(available_agents) if successful_contacts > 0 else ""

    num_best_agents = min(3, len(available_agents))
    best_agents = np.random.choice(available_agents, num_best_agents, replace=False)

    avg_call_duration_base = 3 + (success_rate * 10)
    avg_times = [round(np.random.normal(avg_call_duration_base, 2), 2) for _ in best_agents]

    best_agent_interaction_count = np.random.randint(1, successful_contacts + 1) if successful_contacts > 0 else 0

    return {
        "successful_contacts": successful_contacts,
        "contact_success_rate": round(success_rate, 2),
        "last_successful_agent": last_successful_agent,
        "best_agents": ",".join(best_agents),
        "avg_times_best_agents": ",".join(map(str, avg_times)),
        "customer_best_agent_interaction_count": best_agent_interaction_count
    }

def calculate_credit_score(payment_history, missed_payments, delinquency, partial_payments):
    """Calculate credit score"""
    base_score = 650
    payment_impact = (payment_history - 0.5) * 200
    missed_payment_impact = -min(missed_payments * 15, 100)
    delinquency_impact = -50 if delinquency else 0
    partial_impact = -20 if partial_payments else 0

    score = base_score + payment_impact + missed_payment_impact + delinquency_impact + partial_impact
    return max(300, min(850, int(score)))

# Main execution
def main():
    print(f"Generating {num_records:,} customer records...")
    print("This may take a few minutes for 100,000 records...")

    all_data = []
    start_time = datetime.now()

    # Process in batches to manage memory
    for batch_num in range(0, num_records, batch_size):
        current_batch_size = min(batch_size, num_records - batch_num)
        print(f"Processing batch {batch_num//batch_size + 1}/{(num_records + batch_size - 1)//batch_size} "
              f"({current_batch_size} records)...")

        batch_data = generate_batch_data(current_batch_size, batch_num)
        all_data.extend(batch_data)

        # Clear memory
        if (batch_num // batch_size) % 5 == 0:
            gc.collect()

    # Create DataFrame
    df = pd.DataFrame(all_data)

    # Add remaining columns with vectorized operations
    df["Contact_History_Call_Attempts"] = np.random.poisson(4, len(df))
    df["Contact_History_SMS"] = np.random.poisson(5, len(df))
    df["Contact_History_WhatsApp"] = np.random.poisson(3, len(df))
    df["Contact_History_EmailLogs"] = np.random.poisson(2, len(df))
    df["No_of_Attempts"] = (df["Contact_History_Call_Attempts"] +
                           df["Contact_History_SMS"] +
                           df["Contact_History_WhatsApp"] +
                           df["Contact_History_EmailLogs"])
    df["Average_Handling_Time"] = np.round(np.random.uniform(8, 25, len(df)), 2)

    # Fill other categorical columns
    df["Channel_used"] = np.random.choice(["Call", "SMS", "WhatsApp", "Email", "Field Agent", "IVR"], len(df))
    df["Response_Outcome"] = np.random.choice(["Connected", "Promised to pay", "Ignored", "Disconnected", "Paid fully", "Partial paid"], len(df))
    df["App_Login_Frequency"] = np.random.poisson(12, len(df))
    df["Online_Banking_Activity"] = np.random.poisson(10, len(df))
    df["Smartphone_Penetration"] = np.where(df["Age"] < 60, "High", np.where(df["Age"] < 70, "Medium", "Low"))
    df["Preferred_Channel"] = np.random.choice(["Call", "SMS", "WhatsApp", "Email", "App notification", "Field Agent", "IVR"], len(df))
    df["Call_SMS_Activity_Patterns"] = np.random.choice(["Low", "Medium", "High"], len(df))
    df["WhatsApp_OTT_usage_Indicator"] = np.random.choice([True, False], len(df), p=[0.8, 0.2])
    df["Recent_Score_Change"] = np.random.randint(-40, 41, len(df))
    df["Unemployeement_rate_region"] = np.round(np.random.uniform(1.8, 2.2, len(df)), 2)
    df["Inflation_Rate"] = np.round(np.random.uniform(3.5, 5.0, len(df)), 2)
    df["Interest_Rate_Trend"] = np.round(np.random.uniform(-0.1, 0.3, len(df)), 2)
    df["Economic_Stress_Index"] = np.round(np.random.uniform(0.1, 0.3, len(df)), 2)
    df["Do_Not_Call_Registry_Data"] = np.random.choice([True, False], len(df), p=[0.15, 0.85])
    df["Regional_Time_Restrictions"] = np.random.choice(["Morning", "Afternoon", "Evening", "Night"], len(df))
    df["Communication_Complaince_Limits"] = np.random.choice(["Daytime", "Evening", "Weekdays", "Weekends", "Holidays"], len(df))

    # Validate data
    print("Validating data constraints...")
    assert all(5000 <= amt <= 500000 for amt in df.Loan_Amount_SGD)
    assert all(ob > 0 for ob in df.Outstanding_Balance_SGD)
    assert all(tenure in [12,24,36,48,60] for tenure in df.Tenure)
    assert all(8.0 <= rate <= 10.0 for rate in df.Interest_Rate)

    # Format float columns
    float_columns = df.select_dtypes(include=['float64']).columns
    df[float_columns] = df[float_columns].round(2)

    # Save to CSV with compression
    output_file = 'singapore_loan_data_100k.csv.gz'
    print(f"Saving to {output_file}...")
    df.to_csv(output_file, index=False, compression='gzip')

    end_time = datetime.now()
    duration = (end_time - start_time).total_seconds()

    print(f"\n=== GENERATION COMPLETE ===")
    print(f"Generated {len(df):,} customer records")
    print(f"Time taken: {duration:.2f} seconds")
    print(f"Records per second: {len(df)/duration:.0f}")
    print(f"File saved: {output_file}")

    # Data quality report
    print(f"\n=== DATA QUALITY REPORT ===")
    print(f"Total customers: {len(df):,}")
    print(f"Customers with extreme high stress: {len(df[df['Finance_Stress_Status'] == 'Extreme High stress']):,}")
    print(f"Customers with negative savings: {len(df[df['Savings_to_Spend_Ratio'] < 0]):,}")
    print(f"Customers spending >120% of income: {len(df[df['Spend_to_Income_Ratio'] > 1.2]):,}")
    print(f"Customers with high flight risk (>0.8): {len(df[df['Flight_Risk_Score'] > 0.8]):,}")

    # Memory usage
    memory_mb = df.memory_usage(deep=True).sum() / 1024**2
    print(f"DataFrame memory usage: {memory_mb:.2f} MB")

    print(f"\nFirst 5 customers:")
    sample_cols = ['Customer_id', 'Name', 'Age', 'Occupation', 'Monthly_Income_SGD',
                   'Total_Monthly_Spend_SGD', 'Finance_Stress_Status', 'Credit_Score']
    print(df[sample_cols].head().to_string(index=False))

if __name__ == "__main__":
    main()

Generating 100,000 customer records...
This may take a few minutes for 100,000 records...
Processing batch 1/5 (20000 records)...
Processing batch 2/5 (20000 records)...
Processing batch 3/5 (20000 records)...
Processing batch 4/5 (20000 records)...
Processing batch 5/5 (20000 records)...
Validating data constraints...
Saving to singapore_loan_data_100k.csv.gz...

=== GENERATION COMPLETE ===
Generated 100,000 customer records
Time taken: 102.34 seconds
Records per second: 977
File saved: singapore_loan_data_100k.csv.gz

=== DATA QUALITY REPORT ===
Total customers: 100,000
Customers with extreme high stress: 25,826
Customers with negative savings: 21,386
Customers spending >120% of income: 6,143
Customers with high flight risk (>0.8): 2,097
DataFrame memory usage: 271.47 MB

First 5 customers:
 Customer_id             Name  Age    Occupation  Monthly_Income_SGD  Total_Monthly_Spend_SGD Finance_Stress_Status  Credit_Score
SCB000000000        Ming Wong   58       Retired             2796.

In [9]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random
from collections import defaultdict
import gc

# Configuration
num_records = 100000
batch_size = 20000

# Singapore-specific data
singapore_districts = {
    "01": ["Raffles Place", "Marina", "People's Park"],
    "02": ["Anson", "Tanjong Pagar"],
    "03": ["Queenstown", "Tiong Bahru"],
    "04": ["Telok Blangah", "HarbourFront"],
    "05": ["Pasir Panjang", "Hong Leong Garden", "Clementi New Town"],
    "06": ["High Street", "Beach Road"],
    "07": ["Middle Road", "Golden Mile"],
    "08": ["Little India", "Farrer Park"],
    "09": ["Orchard", "Cairnhill", "River Valley"],
    "10": ["Ardmore", "Bukit Timah", "Holland Road"],
    "11": ["Watten Estate", "Novena", "Thomson"],
    "12": ["Balestier", "Toa Payoh", "Serangoon"],
    "13": ["Macpherson", "Braddell"],
    "14": ["Geylang", "Eunos"],
    "15": ["Katong", "Joo Chiat", "Amber Road"],
    "16": ["Bedok", "Upper East Coast", "Eastwood", "Kew Drive"],
    "17": ["Loyang", "Changi"],
    "18": ["Tampines", "Pasir Ris"],
    "19": ["Serangoon Garden", "Hougang", "Punggol"],
    "20": ["Bishan", "Ang Mo Kio"],
    "21": ["Upper Bukit Timah", "Clementi Park", "Ulu Pandan"],
    "22": ["Jurong"],
    "23": ["Hillview", "Dairy Farm", "Bukit Panjang", "Choa Chu Kang"],
    "24": ["Lim Chu Kang", "Tengah"],
    "25": ["Kranji", "Woodgrove"],
    "26": ["Upper Thomson", "Springleaf"],
    "27": ["Yishun", "Sembawang"],
    "28": ["Seletar"]
}

street_names = np.array([
    "Orchard Road", "North Bridge Road", "South Bridge Road", "Victoria Street",
    "Beach Road", "Serangoon Road", "Havelock Road", "Outram Road",
    "Cecil Street", "Robinson Road", "Maxwell Road", "Telok Ayer Street",
    "Bras Basah Road", "Bugis Street", "Chinatown Street", "Clarke Quay"
])

# Names arrays
chinese_first_names = np.array(["Wei", "Jie", "Ming", "Li", "Xin", "Hui", "Yan", "Feng", "Jun", "Ling"])
chinese_last_names = np.array(["Tan", "Lim", "Lee", "Ng", "Ong", "Wong", "Chua", "Chan", "Koh", "Teo"])
malay_first_names = np.array(["Ahmad", "Mohamed", "Siti", "Abdullah", "Fatimah", "Ali", "Rahman", "Zainal", "Nor", "Hassan"])
malay_last_names = np.array(["bin Ahmad", "binti Mohamed", "bin Ismail", "binti Abdullah", "bin Ali", "binti Hassan"])
indian_first_names = np.array(["Raj", "Kumar", "Suresh", "Priya", "Latha", "Arjun", "Divya", "Vijay", "Anand", "Deepa"])
indian_last_names = np.array(["Kumar", "Devi", "Singh", "Rao", "Patel", "Menon", "Pillai", "Sharma", "Subramaniam", "Gopal"])
western_first_names = np.array(["John", "David", "Mary", "Sarah", "Michael", "Jennifer", "James", "Linda", "Robert", "Elizabeth"])
western_last_names = np.array(["Smith", "Johnson", "Williams", "Brown", "Jones", "Miller", "Davis", "Garcia", "Rodriguez", "Wilson"])

email_domains = np.array(["gmail.com", "yahoo.com", "hotmail.com", "outlook.com", "singnet.com.sg"])
product_types = np.array(["Personal loan", "Auto loan", "Credit card", "Education loan", "Business loan"])
interest_rates = {"Personal loan": 8.5, "Auto loan": 9.0, "Credit card": 10.0, "Education loan": 8.0, "Business loan": 9.5}
loan_amount_limits = {
    "Personal loan": (5000, 100000), "Auto loan": (5000, 200000), "Credit card": (5000, 50000),
    "Education loan": (5000, 100000), "Business loan": (5000, 500000)
}

income_bands = {
    "50,000 or Below": (25000, 50000),
    "50,000 to 100,000": (50000, 100000),
    "100,000 to 200,000": (100000, 200000),
    "200,000 to 300,000": (200000, 300000),
    "300,000 to 500,000": (300000, 500000),
    "500,000 or Above": (500000, 2000000)
}

singapore_cities = np.array([
    "Singapore City", "Jurong East", "Tampines", "Woodlands", "Bedok", "Sengkang",
    "Hougang", "Yishun", "Ang Mo Kio", "Bukit Merah", "Bukit Batok", "Pasir Ris",
    "Clementi", "Bishan", "Toa Payoh", "Serangoon", "Queenstown", "Punggol", "Kallang", "Bukit Panjang"
])

agent_ids = np.array([f"SCB_AG_{i:04d}" for i in range(1, 201)])

# Enhanced Customer ID generation
def generate_customer_id():
    """Generate Customer ID starting with 8 or 9, no 3 consecutive repeating numbers, and unique"""
    while True:
        # Start with 8 or 9
        first_digit = str(np.random.choice([8, 9]))

        # Generate remaining 8 digits
        digits = [first_digit]
        for i in range(8):
            if len(digits) >= 2 and digits[-1] == digits[-2]:
                # If last two digits are same, avoid making third consecutive
                available_digits = [str(x) for x in range(10) if str(x) != digits[-1]]
            else:
                available_digits = [str(x) for x in range(10)]

            next_digit = np.random.choice(available_digits)
            digits.append(next_digit)

        customer_id = "SCB" + ''.join(digits)

        # Check for three consecutive repeating digits
        has_three_consecutive = False
        for i in range(len(digits) - 2):
            if digits[i] == digits[i+1] == digits[i+2]:
                has_three_consecutive = True
                break

        if not has_three_consecutive:
            return customer_id

# Pre-generate unique customer IDs to ensure uniqueness
def generate_unique_customer_ids(n):
    """Generate n unique customer IDs"""
    customer_ids = set()
    while len(customer_ids) < n:
        customer_id = generate_customer_id()
        customer_ids.add(customer_id)
    return list(customer_ids)

# Enhanced Age-Occupation distribution with reduced extreme stress
def generate_age_occupation_distribution(n):
    """Generate realistic age-occupation pairs with reduced extreme stress profiles"""
    ages = []
    occupations = []

    for i in range(n):
        # Age distribution with realistic peaks
        age_group = np.random.choice([
            "young_adult", "adult", "middle_aged", "senior"
        ], p=[0.25, 0.40, 0.25, 0.10])

        if age_group == "young_adult":
            age = np.random.randint(18, 30)
            occupation = np.random.choice([
                "Student", "Employed", "Employed", "Self-Employed", "Unemployed"
            ], p=[0.30, 0.50, 0.10, 0.05, 0.05])

        elif age_group == "adult":
            age = np.random.randint(30, 45)
            occupation = np.random.choice([
                "Employed", "Self-Employed", "Homemaker", "Unemployed"
            ], p=[0.78, 0.15, 0.05, 0.02])  # Reduced unemployed

        elif age_group == "middle_aged":
            age = np.random.randint(45, 60)
            occupation = np.random.choice([
                "Employed", "Self-Employed", "Retired", "Homemaker", "Unemployed"
            ], p=[0.70, 0.20, 0.05, 0.04, 0.01])  # Reduced unemployed

        else:  # senior
            age = np.random.randint(60, 76)
            occupation = np.random.choice([
                "Retired", "Employed", "Self-Employed", "Homemaker", "Unemployed"
            ], p=[0.75, 0.15, 0.05, 0.04, 0.01])  # Reduced unemployed

        # Ensure realistic constraints
        if occupation == "Student" and age > 25:
            if np.random.random() < 0.02:
                age = min(age, 35)
            else:
                occupation = "Employed"

        if occupation == "Retired" and age < 55:
            if np.random.random() < 0.03:
                age = max(age, 50)
            else:
                occupation = "Employed"

        ages.append(age)
        occupations.append(occupation)

    return np.array(ages), np.array(occupations)

# Enhanced financial health with reduced extreme stress
def calculate_financial_health(monthly_income, total_spend, savings_ratio, payment_history, missed_payments, employment_status, age, occupation):
    """Calculate financial health with reduced extreme cases"""
    spend_income_ratio = total_spend / monthly_income if monthly_income > 0 else 2.0

    # Different weighting based on employment and age
    if employment_status == "Unemployed":
        weights = {"spend_ratio": 0.5, "savings": 0.3, "payments": 0.2}
    elif occupation == "Student":
        weights = {"spend_ratio": 0.3, "savings": 0.2, "payments": 0.5}
    elif age > 60:
        weights = {"spend_ratio": 0.4, "savings": 0.4, "payments": 0.2}
    else:
        weights = {"spend_ratio": 0.4, "savings": 0.3, "payments": 0.3}

    savings_factor = 1 - max(0, savings_ratio)
    payment_factor = 1 - payment_history
    missed_payment_factor = min(1, missed_payments * 0.25)

    base_stress = (
        min(2.0, spend_income_ratio) * weights["spend_ratio"] +
        savings_factor * weights["savings"] +
        (payment_factor * 0.7 + missed_payment_factor * 0.3) * weights["payments"]
    )

    # Reduced extreme stress by capping base_stress
    base_stress = min(base_stress, 0.85)  # Cap at 85% to reduce extreme cases

    # Age and occupation adjustments to stress score
    if occupation == "Student":
        base_stress *= 0.9
    elif age > 60:
        base_stress *= 1.1

    financial_stress_score = min(100, max(0, base_stress * 100))

    # Adjusted stress thresholds to reduce extreme high stress
    if age < 25:
        if financial_stress_score >= 85:  # Increased threshold
            stress_status = "Extreme High stress"
        elif financial_stress_score >= 65:
            stress_status = "High stress"
        elif financial_stress_score >= 45:
            stress_status = "Medium stress"
        else:
            stress_status = "Low stress"
    elif age > 60:
        if financial_stress_score >= 75:  # Increased threshold
            stress_status = "Extreme High stress"
        elif financial_stress_score >= 55:
            stress_status = "High stress"
        elif financial_stress_score >= 35:
            stress_status = "Medium stress"
        else:
            stress_status = "Low stress"
    else:
        if financial_stress_score >= 80:  # Increased threshold
            stress_status = "Extreme High stress"
        elif financial_stress_score >= 60:
            stress_status = "High stress"
        elif financial_stress_score >= 40:
            stress_status = "Medium stress"
        else:
            stress_status = "Low stress"

    # Health status
    if financial_stress_score <= 25:
        health_status = "Healthy"
    elif financial_stress_score <= 50:
        health_status = "Moderate"
    else:
        health_status = "Stressed"

    # Balance trends
    if savings_ratio > 0.2:
        balance_trend = "Rising"
    elif savings_ratio < -0.1:
        balance_trend = "Falling"
    else:
        balance_trend = "Stable"

    # Overdraft flag
    overdraft_flag = (savings_ratio < -0.15 or spend_income_ratio > 1.3)

    # AAR score with reduced high-risk cases
    aar_score = np.random.normal(40, 20)  # Lower mean, less spread
    if np.random.random() < 0.03:  # Reduced from 5% to 3% high-risk anomalies
        aar_score = np.random.uniform(75, 90)
    elif np.random.random() < 0.06:  # Reduced from 8% to 6% low-risk anomalies
        aar_score = np.random.uniform(10, 30)
    aar_score = np.clip(aar_score, 0, 100)

    if aar_score <= 25:
        aar_risk = "Low"
    elif aar_score <= 70:
        aar_risk = "Medium"
    else:
        aar_risk = "High"

    # Flight risk with reduced extreme cases
    financial_factors = financial_stress_score / 100 * 0.4
    behavioral_factors = (1 - payment_history) * 0.3
    stability_factors = min(1, missed_payments * 0.1) * 0.2
    engagement_factors = (1 - min(1, payment_history)) * 0.1

    flight_risk = min(1.0, financial_factors + behavioral_factors + stability_factors + engagement_factors)

    # Age adjustments to flight risk
    if age < 30:
        flight_risk *= 1.2
    elif age > 60:
        flight_risk *= 0.8

    # Reduced extreme flight risk cases
    if np.random.random() < 0.02:  # Reduced from 4% to 2%
        flight_risk = np.random.choice([np.random.uniform(0.7, 0.9), np.random.uniform(0.05, 0.15)])

    return {
        "finance_stress_status": stress_status,
        "financial_health_status": health_status,
        "financial_stress_score": round(financial_stress_score, 2),
        "flight_risk": round(flight_risk, 2),
        "avg_balance_trend": balance_trend,
        "overdraft_flag": overdraft_flag,
        "aar_score": round(aar_score, 2),
        "aar_risk_level": aar_risk
    }

# Enhanced payment behavior with reduced negative savings
def generate_enhanced_payment_behavior(monthly_income, total_spend, age, employment_status, lifestyle, occupation):
    """Generate payment behavior with reduced negative savings"""
    if lifestyle == "High Net Worth":
        base_intensity = total_spend / 200
    else:
        base_intensity = total_spend / 80

    # Age and occupation factors
    if age < 25:
        upi_factor, credit_factor, cash_factor = 1.4, 0.7, 0.8
    elif age > 55:
        upi_factor, credit_factor, cash_factor = 0.6, 1.2, 1.4
    else:
        upi_factor, credit_factor, cash_factor = 1.0, 1.0, 1.0

    # Occupation adjustments
    if occupation == "Student":
        upi_factor *= 1.3
        credit_factor *= 0.6
    elif occupation == "Retired":
        upi_factor *= 0.7
        cash_factor *= 1.3

    upi_count = max(5, int(np.random.normal(base_intensity * 0.4 * upi_factor, 8)))
    debit_count = max(3, int(np.random.normal(base_intensity * 0.3, 5)))
    credit_count = max(2, int(np.random.normal(base_intensity * 0.2 * credit_factor, 4)))
    cash_count = max(1, int(np.random.normal(base_intensity * 0.1 * cash_factor, 3)))

    # Recurring transactions
    if occupation in ["Family Focused", "Established Career"] or age > 35:
        recurring_count = np.random.randint(6, 13)
    else:
        recurring_count = np.random.randint(3, 9)

    # Preferred channel
    channels = ["UPI", "Debit Card", "Credit Card", "Cash"]
    preferred_channel = channels[np.argmax([upi_count, debit_count, credit_count, cash_count])]

    # ENHANCED: Reduced negative savings cases
    if employment_status == "Unemployed":
        savings_ratio = np.random.uniform(-0.15, 0.05)  # Reduced from -0.25 to -0.15
    elif occupation == "Student":
        savings_ratio = np.random.uniform(-0.08, 0.1)   # Reduced from -0.15 to -0.08
    elif lifestyle == "High Net Worth":
        savings_ratio = np.random.uniform(0.25, 0.45)
    elif age > 50:
        savings_ratio = np.random.uniform(0.15, 0.35)
    elif age < 30:
        savings_ratio = np.random.uniform(0.05, 0.20)
    else:
        savings_ratio = np.random.uniform(0.08, 0.25)

    # Further reduced extreme savings anomalies
    if np.random.random() < 0.015:  # Reduced from 3% to 1.5%
        savings_ratio = np.random.choice([
            np.random.uniform(-0.3, -0.15),  # Less negative
            np.random.uniform(0.5, 0.7)
        ])

    # Other metrics
    total_transactions = upi_count + debit_count + credit_count + cash_count
    recurring_ratio = recurring_count / total_transactions if total_transactions > 0 else 0.1

    if employment_status == "Unemployed":
        spend_growth = np.random.normal(-0.05, 0.12)  # Reduced negative growth
    else:
        spend_growth = np.random.normal(0.04, 0.12)

    high_value_threshold = monthly_income * 0.07
    high_value_count = max(0, int(np.random.poisson(total_spend / high_value_threshold * 0.08)))

    return {
        "upi_count": upi_count,
        "debit_count": debit_count,
        "credit_count": credit_count,
        "cash_count": cash_count,
        "recurring_count": recurring_count,
        "preferred_channel": preferred_channel,
        "recurring_ratio": round(recurring_ratio, 2),
        "savings_ratio": round(savings_ratio, 2),
        "spend_growth": round(spend_growth, 2),
        "high_value_count": high_value_count
    }

# FIXED: Enhanced spending profile with corrected normal distribution
def generate_enhanced_spend_profile(lifestyle, monthly_income, employment_status, age, occupation):
    """Generate spending profile with reduced extreme spending"""
    base_profiles = {
        "Young Professional": {"utility": 0.10, "shopping": 0.28, "entertainment": 0.20, "health": 0.08, "education": 0.12, "travel": 0.22},
        "Family Focused": {"utility": 0.15, "shopping": 0.30, "entertainment": 0.10, "health": 0.12, "education": 0.18, "travel": 0.15},
        "Established Career": {"utility": 0.12, "shopping": 0.25, "entertainment": 0.15, "health": 0.15, "education": 0.08, "travel": 0.25},
        "Retirement Phase": {"utility": 0.18, "shopping": 0.22, "entertainment": 0.12, "health": 0.25, "education": 0.05, "travel": 0.18},
        "Student Lifestyle": {"utility": 0.12, "shopping": 0.25, "entertainment": 0.25, "health": 0.08, "education": 0.20, "travel": 0.10},
        "High Net Worth": {"utility": 0.08, "shopping": 0.30, "entertainment": 0.22, "health": 0.12, "education": 0.08, "travel": 0.20}
    }

    base_ratios = base_profiles[lifestyle].copy()

    # Occupation-specific adjustments
    if occupation == "Student":
        base_ratios["education"] += 0.10
        base_ratios["entertainment"] += 0.05
    elif occupation == "Retired":
        base_ratios["health"] += 0.05
        base_ratios["utility"] += 0.03
    elif occupation == "Homemaker":
        base_ratios["shopping"] += 0.05
        base_ratios["utility"] += 0.03

    # Age-specific adjustments
    if age < 25:
        base_ratios["entertainment"] += 0.05
        base_ratios["education"] += 0.03
    elif age > 60:
        base_ratios["health"] += 0.05
        base_ratios["travel"] -= 0.03

    # Individual variation
    for category in base_ratios:
        base_ratios[category] *= np.random.uniform(0.7, 1.3)

    # Normalize
    total = sum(base_ratios.values())
    for category in base_ratios:
        base_ratios[category] /= total

    # Total spend calculation with reduced extremes
    if employment_status == "Unemployed":
        spend_ratio = np.random.uniform(0.8, 1.2)  # Reduced extreme spending
    elif occupation == "Student":
        spend_ratio = np.random.uniform(0.8, 1.1)
    elif lifestyle == "High Net Worth":
        spend_ratio = np.random.uniform(0.3, 0.6)
    else:
        spend_ratio = np.random.uniform(0.5, 0.8)

    # Reduced extreme spending anomalies
    if np.random.random() < 0.03:  # Reduced from 5% to 3%
        spend_ratio = np.random.choice([
            np.random.uniform(0.3, 0.5),   # Extreme savers
            np.random.uniform(1.1, 1.5)    # Reduced from 1.2-1.8 to 1.1-1.5
        ])

    total_monthly_spend = monthly_income * spend_ratio

    # Calculate category spends
    spends = {}
    for category, ratio in base_ratios.items():
        base_spend = total_monthly_spend * ratio
        monthly_variation = np.random.uniform(0.85, 1.15)
        spends[category] = max(50, base_spend * monthly_variation)

    actual_total_spend = sum(spends.values())

    # FIXED: Use np.random.normal instead of np.random.normalvariate
    monthly_trend = np.random.normal(total_monthly_spend, total_monthly_spend * 0.12)
    monthly_trend = max(0, monthly_trend)  # Ensure non-negative

    return {
        "utility_spend": round(spends["utility"], 2),
        "shopping_spend": round(spends["shopping"], 2),
        "entertainment_spend": round(spends["entertainment"], 2),
        "health_spend": round(spends["health"], 2),
        "education_spend": round(spends["education"], 2),
        "travel_spend": round(spends["travel"], 2),
        "monthly_spend_trend": round(monthly_trend, 2),
        "seasonal_spend_variation": round(np.random.uniform(0.08, 0.35), 2),
        "weekend_spend_ratio": round(np.random.uniform(0.22, 0.48), 2),
        "festive_spend": round(total_monthly_spend * np.random.uniform(1.15, 2.0), 2),
        "total_monthly_spend": round(actual_total_spend, 2),
        "spend_ratio": round(spend_ratio, 2)
    }

# Other helper functions
def generate_singaporean_names(n):
    ethnicities = np.random.choice(["Chinese", "Malay", "Indian", "Western"], n, p=[0.74, 0.13, 0.09, 0.04])
    first_names, last_names = [], []

    for i in range(n):
        ethnicity = ethnicities[i]
        if ethnicity == "Chinese":
            first_names.append(np.random.choice(chinese_first_names))
            last_names.append(np.random.choice(chinese_last_names))
        elif ethnicity == "Malay":
            first_names.append(np.random.choice(malay_first_names))
            last_names.append(np.random.choice(malay_last_names))
        elif ethnicity == "Indian":
            first_names.append(np.random.choice(indian_first_names))
            last_names.append(np.random.choice(indian_last_names))
        else:
            first_names.append(np.random.choice(western_first_names))
            last_names.append(np.random.choice(western_last_names))

    return [f"{first} {last}" for first, last in zip(first_names, last_names)]

def generate_singapore_mobile_numbers(n, valid_ratio=0.92):
    prefixes = np.random.choice(['8', '9'], n)
    numbers = np.random.randint(0, 10000000, n)
    numbers = [f"{num:07d}" for num in numbers]

    valid_mask = np.random.random(n) < valid_ratio
    mobile_numbers = []

    for i in range(n):
        if valid_mask[i]:
            mobile_numbers.append(f"+65 {prefixes[i]}{numbers[i][:3]} {numbers[i][3:]}")
        else:
            invalid_types = [
                f"+65 {prefixes[i]}{numbers[i][:6]}",
                f"+65 {np.random.choice(['6', '7'])}{numbers[i]}",
                f"+65 {prefixes[i]}{numbers[i]}{np.random.randint(0,10)}",
            ]
            mobile_numbers.append(np.random.choice(invalid_types))

    return mobile_numbers, valid_mask

def generate_loan_data(n):
    product_choices = np.random.choice(product_types, n)
    tenures = np.random.choice([12, 24, 36, 48, 60], n)

    loan_amounts, interest_rates_arr = [], []

    for product in product_choices:
        min_amt, max_amt = loan_amount_limits[product]
        step = 5000 if product in ["Auto loan", "Business loan"] else 1000
        amount = np.random.choice(range(min_amt, max_amt + 1, step))
        loan_amounts.append(amount)
        interest_rates_arr.append(interest_rates[product])

    emis = calculate_emis_vectorized(np.array(loan_amounts), np.array(interest_rates_arr), tenures)

    return product_choices, loan_amounts, interest_rates_arr, tenures, emis

def calculate_emis_vectorized(principals, rates, tenures):
    monthly_rates = rates / 1200
    emis = (principals * monthly_rates * (1 + monthly_rates) ** tenures) / ((1 + monthly_rates) ** tenures - 1)
    return np.round(emis, 2)

def determine_lifestyle_profile(age, occupation, income_band):
    if occupation == "Student":
        return "Student Lifestyle"
    elif occupation == "Retired":
        return "Retirement Phase"
    elif income_band in ["500,000 or Above", "300,000 to 500,000"]:
        return "High Net Worth"
    elif occupation == "Homemaker" and 30 <= age <= 50:
        return "Family Focused"
    elif 30 <= age <= 50 and occupation in ["Employed", "Self-Employed"]:
        return "Family Focused"
    elif 40 <= age <= 60:
        return "Established Career"
    elif 22 <= age <= 35:
        return "Young Professional"
    else:
        if age < 30:
            return "Young Professional"
        elif age < 50:
            return "Family Focused"
        else:
            return "Established Career"

def calculate_monthly_income(income_band):
    """Calculate realistic monthly income with some variation within bands"""
    min_income, max_income = income_bands[income_band]

    # Create more realistic distribution within bands
    if income_band == "50,000 or Below":
        # Skew toward lower end
        annual_income = min_income + (max_income - min_income) * random.betavariate(2, 5)
    elif income_band == "500,000 or Above":
        # Skew toward higher end
        annual_income = min_income + (max_income - min_income) * random.betavariate(5, 2)
    else:
        # More normal distribution for middle bands
        annual_income = random.normalvariate(
            (min_income + max_income) / 2,
            (max_income - min_income) / 6
        )

    annual_income = max(min_income, min(max_income, annual_income))
    monthly_income = annual_income / 12

    return round(monthly_income, 2)

def calculate_monthly_incomes(income_bands_arr):
    monthly_incomes = []
    for income_band in income_bands_arr:
        monthly_income = calculate_monthly_income(income_band)
        monthly_incomes.append(monthly_income)
    return np.array(monthly_incomes)

def adjust_emi_for_delinquency(base_emi, days_past_due):
    if days_past_due <= 0: return base_emi
    elif days_past_due <= 15: return round(base_emi * 1.02, 2)
    elif days_past_due <= 30: return round(base_emi * 1.05, 2)
    elif days_past_due <= 60: return round(base_emi * 1.08, 2)
    elif days_past_due <= 90: return round(base_emi * 1.12, 2)
    else: return round(base_emi * 1.15, 2)

def generate_contact_success_data(call_attempts, payment_history, employment_status, age):
    if payment_history > 0.8 and employment_status == "Employed":
        base_success_rate = np.random.uniform(0.5, 0.8)
    elif payment_history > 0.6:
        base_success_rate = np.random.uniform(0.3, 0.6)
    else:
        base_success_rate = np.random.uniform(0.1, 0.4)

    if age > 60:
        base_success_rate *= 1.2
    elif age < 30:
        base_success_rate *= 0.9

    success_rate = np.clip(np.random.normal(base_success_rate, 0.15), 0.05, 0.95)
    successful_contacts = int(call_attempts * success_rate)

    available_agents = np.random.choice(agent_ids, np.random.randint(2, 6), replace=False)
    last_successful_agent = np.random.choice(available_agents) if successful_contacts > 0 else ""

    num_best_agents = min(3, len(available_agents))
    best_agents = np.random.choice(available_agents, num_best_agents, replace=False)

    avg_call_duration_base = 3 + (success_rate * 10)
    avg_times = [round(np.random.normal(avg_call_duration_base, 2), 2) for _ in best_agents]

    best_agent_interaction_count = np.random.randint(1, successful_contacts + 1) if successful_contacts > 0 else 0

    return {
        "successful_contacts": successful_contacts,
        "contact_success_rate": round(success_rate, 2),
        "last_successful_agent": last_successful_agent,
        "best_agents": ",".join(best_agents),
        "avg_times_best_agents": ",".join(map(str, avg_times)),
        "best_agent_interaction_count": best_agent_interaction_count
    }

def calculate_credit_score(payment_history, missed_payments, delinquency, partial_payments):
    base_score = 650
    payment_impact = (payment_history - 0.5) * 200
    missed_payment_impact = -min(missed_payments * 15, 100)
    delinquency_impact = -50 if delinquency else 0
    partial_impact = -20 if partial_payments else 0

    score = base_score + payment_impact + missed_payment_impact + delinquency_impact + partial_impact
    return max(300, min(850, int(score)))

# Main batch generation function
def generate_batch_data(batch_size, customer_ids_batch):
    """Generate a batch of customer data"""

    # Generate realistic age-occupation pairs
    ages, occupations = generate_age_occupation_distribution(batch_size)
    employment_status = np.array(["Employed" if occ in ["Employed", "Self-Employed"] else "Unemployed" for occ in occupations])

    # Income bands
    income_bands_arr = np.random.choice(list(income_bands.keys()), batch_size, p=[0.20, 0.35, 0.25, 0.12, 0.05, 0.03])
    monthly_incomes = calculate_monthly_incomes(income_bands_arr)

    # Generate names and contact info
    genders_arr = np.random.choice(["Male", "Female", "Others"], batch_size, p=[0.48, 0.48, 0.04])
    full_names = generate_singaporean_names(batch_size)
    first_names = [name.split()[0] for name in full_names]
    last_names = [name.split()[1] if ' ' in name else '' for name in full_names]

    # Communication data
    mobile_numbers, valid_primary_mask = generate_singapore_mobile_numbers(batch_size)
    has_secondary = np.random.random(batch_size) > 0.15
    secondary_numbers, valid_secondary_mask = generate_singapore_mobile_numbers(batch_size, 0.75)
    secondary_numbers = [num if has_sec else "" for num, has_sec in zip(secondary_numbers, has_secondary)]
    valid_secondary_mask = [mask if has_sec else False for mask, has_sec in zip(valid_secondary_mask, has_secondary)]

    # Count valid numbers
    no_of_valid_numbers = []
    no_of_invalid_numbers = []
    for i in range(batch_size):
        valid_count = sum([valid_primary_mask[i], valid_secondary_mask[i]]) if has_secondary[i] else valid_primary_mask[i]
        no_of_valid_numbers.append(valid_count)
        no_of_invalid_numbers.append((2 if has_secondary[i] else 1) - valid_count)

    # Communication completion and changes
    communication_completion = []
    mobile_change_count = []
    address_change_count = []

    for i in range(batch_size):
        age = ages[i]
        occupation = occupations[i]

        if occupation in ["Employed", "Self-Employed"]:
            base_completion = 0.9
        elif occupation == "Student":
            base_completion = 0.85
        else:
            base_completion = 0.75

        if age > 60:
            base_completion -= 0.1

        completion = np.clip(np.random.normal(base_completion, 0.08), 0.7, 1.0)
        communication_completion.append(round(completion, 2))

        if occupation == "Student" or age < 25:
            changes = np.random.poisson(2.5)
        elif occupation == "Unemployed":
            changes = np.random.poisson(1.5)
        else:
            changes = np.random.poisson(0.8)

        mobile_change_count.append(max(0, changes))

        if occupation == "Student" or (age < 30 and occupation == "Employed"):
            address_changes = np.random.poisson(1.2)
        elif occupation == "Unemployed":
            address_changes = np.random.poisson(0.8)
        else:
            address_changes = np.random.poisson(0.3)

        address_change_count.append(max(0, address_changes))

    communication_completion = np.array(communication_completion)
    mobile_change_count = np.array(mobile_change_count)
    mobile_change_this_year = np.minimum(mobile_change_count, np.random.randint(0, 3, batch_size))
    address_change_count = np.array(address_change_count)
    address_change_this_year = np.minimum(address_change_count, np.random.randint(0, 2, batch_size))

    # Contact change frequency
    customer_age_years = np.maximum(5, ages - 18)
    total_changes = mobile_change_count + address_change_count
    contact_change_frequency = np.round(total_changes / customer_age_years, 2)

    # Determine lifestyle profiles
    lifestyles = []
    for i in range(batch_size):
        lifestyle = determine_lifestyle_profile(ages[i], occupations[i], income_bands_arr[i])
        lifestyles.append(lifestyle)

    # Generate loan data
    product_types_arr, loan_amounts, interest_rates_arr, tenures, base_emis = generate_loan_data(batch_size)

    # Payment behavior
    on_time_percentages = []
    for i in range(batch_size):
        if occupations[i] in ["Employed", "Self-Employed"]:
            base_payment = np.random.beta(4, 2)
        elif occupations[i] == "Student":
            base_payment = np.random.beta(3, 3)
        else:
            base_payment = np.random.beta(2, 4)

        if ages[i] > 40:
            base_payment = min(1.0, base_payment * 1.1)
        elif ages[i] < 25:
            base_payment = max(0.1, base_payment * 0.9)

        on_time_percentages.append(base_payment)

    on_time_percentages = np.array(on_time_percentages)
    payment_frequencies = ["Regular" if p >= 0.8 else "Irregular" for p in on_time_percentages]

    # Loan dates and status
    days_since_loan_start = np.random.randint(180, 1800, batch_size)
    months_completed = np.minimum(tenures - 1, (days_since_loan_start / 30).astype(int))

    # Generate batch data
    batch_data = []
    for i in range(batch_size):
        # Use pre-generated customer ID
        customer_id = customer_ids_batch[i]

        # Loan-specific calculations
        days_past_due = np.random.randint(0, 120) if np.random.random() > on_time_percentages[i] else 0
        current_emi = adjust_emi_for_delinquency(base_emis[i], days_past_due)

        # Outstanding balance
        remaining_months = tenures[i] - months_completed[i]
        if remaining_months <= 0:
            outstanding_balance = loan_amounts[i] * 0.01
        else:
            outstanding_balance = current_emi * remaining_months * np.random.uniform(0.8, 1.2)

        outstanding_balance = max(loan_amounts[i] * 0.01, outstanding_balance)

        # Missed payments
        if occupations[i] == "Unemployed":
            missed_payment_prob = 0.3
        elif occupations[i] == "Student":
            missed_payment_prob = 0.2
        else:
            missed_payment_prob = 0.1

        if ages[i] > 50:
            missed_payment_prob *= 0.7
        elif ages[i] < 25:
            missed_payment_prob *= 1.3

        missed_payments_count = int(np.random.poisson(missed_payment_prob * months_completed[i] / 6))

        # Enhanced spending profile
        spend_profile = generate_enhanced_spend_profile(
            lifestyles[i], monthly_incomes[i], employment_status[i], ages[i], occupations[i]
        )

        # Enhanced payment behavior (with reduced negative savings)
        payment_behavior = generate_enhanced_payment_behavior(
            monthly_incomes[i], spend_profile["total_monthly_spend"],
            ages[i], employment_status[i], lifestyles[i], occupations[i]
        )

        # Enhanced financial health (with reduced extreme stress)
        financial_health = calculate_financial_health(
            monthly_incomes[i], spend_profile["total_monthly_spend"],
            payment_behavior["savings_ratio"], on_time_percentages[i],
            missed_payments_count, employment_status[i], ages[i], occupations[i]
        )

        # Contact success
        call_attempts = max(1, int(np.random.poisson(4)))
        contact_success = generate_contact_success_data(
            call_attempts, on_time_percentages[i], employment_status[i], ages[i]
        )

        # Credit score
        delinquency = np.random.random() < 0.15
        credit_score = calculate_credit_score(
            on_time_percentages[i], missed_payments_count, delinquency,
            np.random.random() < 0.15
        )

        record = {
            # Basic loan information
            "Customer_id": customer_id,  # Use pre-generated ID
            "Loan_Account_id": int(np.random.randint(10000000, 99999999)),
            "Product_Type": product_types_arr[i],
            "Loan_Amount_SGD": round(float(loan_amounts[i]), 2),
            "Outstanding_Balance_SGD": round(outstanding_balance, 2),
            "Day_Past_Due": days_past_due,
            "Tenure": tenures[i],
            "Interest_Rate": interest_rates_arr[i],
            "Current_EMI_SGD": current_emi,
            "Installment_Due_Date": (datetime.now() + timedelta(days=np.random.randint(1, 30))).date(),
            "Last_Payment_Date": (datetime.now() - timedelta(days=np.random.randint(1, 120))).date(),

            # Payment behavior
            "Partial_Payment_Indicator": np.random.random() < 0.15,
            "Number_of_Past_Payments": months_completed[i],
            "Payment_Frequency": payment_frequencies[i],
            "Amount_Paid_Each_Month_SGD": base_emis[i],
            "Missed_Payments_Count": missed_payments_count,
            "Settlement_History": np.random.choice(["Settled", "Partial Settlement", "Not Settled", "Under Negotiation"]),
            "Repayment_Irregularity_Flags": on_time_percentages[i] < 0.7,

            # Customer profile
            "Lifestyle_Profile": lifestyles[i],
            "Customer_Employment_Status": employment_status[i],

            # Communication data
            "Communication_Data_Completion": communication_completion[i],
            "Valid_Phone_Number": valid_primary_mask[i],
            "No_of_Valid_Numbers": no_of_valid_numbers[i],
            "No_of_Invalid_Numbers": no_of_invalid_numbers[i],
            "Mobile_Number_Change_Count": mobile_change_count[i],
            "Mobile_Number_Change_Count_This_Year": mobile_change_this_year[i],
            "Address_Change_Count": address_change_count[i],
            "Address_Change_Count_This_Year": address_change_this_year[i],
            "Contact_Data_Change_Frequency": contact_change_frequency[i],

            # Spend analysis
            "Finance_Stress_Status": financial_health["finance_stress_status"],
            "Utility_Spend_SGD": spend_profile["utility_spend"],
            "Shopping_Spend_SGD": spend_profile["shopping_spend"],
            "Entertainment_Spend_SGD": spend_profile["entertainment_spend"],
            "Health_Spend_SGD": spend_profile["health_spend"],
            "Education_Spend_SGD": spend_profile["education_spend"],
            "Travel_Spend_SGD": spend_profile["travel_spend"],
            "Monthly_Spend_Trend_SGD": spend_profile["monthly_spend_trend"],
            "Seasonal_Spend_Variation": spend_profile["seasonal_spend_variation"],
            "Weekend_Spend_Ratio": spend_profile["weekend_spend_ratio"],
            "Festive_Season_Spend_SGD": spend_profile["festive_spend"],
            "Total_Monthly_Spend_SGD": spend_profile["total_monthly_spend"],
            "Spend_to_Income_Ratio": spend_profile["spend_ratio"],

            # Payment behavior
            "UPI_Transaction_Count": payment_behavior["upi_count"],
            "Debit_Card_Transaction_Count": payment_behavior["debit_count"],
            "Credit_Card_Transaction_Count": payment_behavior["credit_count"],
            "Cash_Withdrawal_Count": payment_behavior["cash_count"],
            "Recurring_Transaction_Count": payment_behavior["recurring_count"],
            "Preferred_Payment_Channel": payment_behavior["preferred_channel"],
            "Recurring_Payment_Ratio": payment_behavior["recurring_ratio"],
            "Savings_to_Spend_Ratio": payment_behavior["savings_ratio"],
            "Spend_Growth_Rate_YoY": payment_behavior["spend_growth"],
            "High_Value_Transaction_Count": payment_behavior["high_value_count"],

            # Financial health
            "Flight_Risk_Score": financial_health["flight_risk"],
            "Financial_Health_Status": financial_health["financial_health_status"],
            "Financial_Stress_Score": financial_health["financial_stress_score"],
            "Avg_Balance_Trends": financial_health["avg_balance_trend"],
            "Overdraft_or_Low_Balance_Flag": financial_health["overdraft_flag"],
            "AAR_Score": financial_health["aar_score"],
            "AAR_Risk_Level": financial_health["aar_risk_level"],

            # Contact success
            "Successful_Contacts_Count": contact_success["successful_contacts"],
            "Contact_Success_Rate": contact_success["contact_success_rate"],
            "Last_Successful_Agent_ID": contact_success["last_successful_agent"],
            "Best_Contact_Agent_IDs": contact_success["best_agents"],
            "Avg_Time_With_Best_Agents_Min": contact_success["avg_times_best_agents"],
            "Customer_Best_Agent_Interaction_Count": contact_success["best_agent_interaction_count"],

            # Customer demographics
            "Name": full_names[i],
            "Age": ages[i],
            "Occupation": occupations[i],
            "Gender": genders_arr[i],
            "Primary_Phone_Number": mobile_numbers[i],
            "Secondary_Mobile_Number": secondary_numbers[i],
            "Landline_Phone_Number": f"+65 6{np.random.randint(1000000, 9999999):07d}",
            "Email_ID": f"{first_names[i].lower()}.{last_names[i].lower()}@{np.random.choice(email_domains)}",
            "Income_Band_SGD": income_bands_arr[i],
            "Monthly_Income_SGD": monthly_incomes[i],
            "Employeement_Type": np.random.choice(["Full time", "Part time", "Contract", "Freelance", "Unemployed"]),
            "Address": f"{np.random.randint(1, 999)} {np.random.choice(street_names)} #{np.random.randint(1, 50):02d}-{np.random.randint(1, 99):02d} Singapore {np.random.randint(100000, 999999)}",
            "City": np.random.choice(singapore_cities),
            "Language_Preference": np.random.choice(["English", "Regional"], p=[0.7, 0.3]),
            "Mobile_Number_Active_Status": valid_primary_mask[i],
            "Email_Activity": np.random.random() < 0.8,

            # Additional fields
            "Credit_Score": credit_score,
            "Recent_Inquiries": int(np.random.poisson(2)),
            "Loan_Exposure_Across_Banks": int(np.random.poisson(1)),
            "Delinquency_on_other_Loans": delinquency,
        }
        batch_data.append(record)

    return batch_data

# Main execution
def main():
    print(f"Generating {num_records:,} customer records with enhanced Customer IDs and reduced extremes...")

    # Pre-generate unique customer IDs
    print("Generating unique Customer IDs...")
    customer_ids = generate_unique_customer_ids(num_records)
    print(f"Generated {len(customer_ids):,} unique Customer IDs")

    all_data = []
    start_time = datetime.now()

    # Track distributions for validation
    age_occupation_tracker = defaultdict(lambda: defaultdict(int))
    stress_tracker = defaultdict(int)
    savings_tracker = defaultdict(int)

    # Process in batches
    for batch_num in range(0, num_records, batch_size):
        current_batch_size = min(batch_size, num_records - batch_num)
        print(f"Processing batch {batch_num//batch_size + 1}/{(num_records + batch_size - 1)//batch_size} "
              f"({current_batch_size} records)...")

        # Get customer IDs for this batch
        batch_customer_ids = customer_ids[batch_num:batch_num + current_batch_size]

        batch_data = generate_batch_data(current_batch_size, batch_customer_ids)
        all_data.extend(batch_data)

        # Track distributions
        for record in batch_data:
            age_group = f"{(record['Age'] // 10) * 10}s"
            age_occupation_tracker[age_group][record['Occupation']] += 1
            stress_tracker[record['Finance_Stress_Status']] += 1
            savings_tracker["Negative" if record['Savings_to_Spend_Ratio'] < 0 else "Positive"] += 1

        if (batch_num // batch_size) % 5 == 0:
            gc.collect()

    # Create DataFrame
    df = pd.DataFrame(all_data)

    # Add remaining columns
    df["Contact_History_Call_Attempts"] = np.random.poisson(4, len(df))
    df["Contact_History_SMS"] = np.random.poisson(5, len(df))
    df["Contact_History_WhatsApp"] = np.random.poisson(3, len(df))
    df["Contact_History_EmailLogs"] = np.random.poisson(2, len(df))
    df["No_of_Attempts"] = (df["Contact_History_Call_Attempts"] +
                           df["Contact_History_SMS"] +
                           df["Contact_History_WhatsApp"] +
                           df["Contact_History_EmailLogs"])
    df["Average_Handling_Time"] = np.round(np.random.uniform(8, 25, len(df)), 2)

    # Fill other categorical columns
    df["Channel_used"] = np.random.choice(["Call", "SMS", "WhatsApp", "Email", "Field Agent", "IVR"], len(df))
    df["Response_Outcome"] = np.random.choice(["Connected", "Promised to pay", "Ignored", "Disconnected", "Paid fully", "Partial paid"], len(df))
    df["App_Login_Frequency"] = np.random.poisson(12, len(df))
    df["Online_Banking_Activity"] = np.random.poisson(10, len(df))
    df["Smartphone_Penetration"] = np.where(df["Age"] < 60, "High", np.where(df["Age"] < 70, "Medium", "Low"))
    df["Preferred_Channel"] = np.random.choice(["Call", "SMS", "WhatsApp", "Email", "App notification", "Field Agent", "IVR"], len(df))
    df["Call_SMS_Activity_Patterns"] = np.random.choice(["Low", "Medium", "High"], len(df))
    df["WhatsApp_OTT_usage_Indicator"] = np.random.choice([True, False], len(df), p=[0.8, 0.2])
    df["Recent_Score_Change"] = np.random.randint(-40, 41, len(df))
    df["Unemployeement_rate_region"] = np.round(np.random.uniform(1.8, 2.2, len(df)), 2)
    df["Inflation_Rate"] = np.round(np.random.uniform(3.5, 5.0, len(df)), 2)
    df["Interest_Rate_Trend"] = np.round(np.random.uniform(-0.1, 0.3, len(df)), 2)
    df["Economic_Stress_Index"] = np.round(np.random.uniform(0.1, 0.3, len(df)), 2)
    df["Do_Not_Call_Registry_Data"] = np.random.choice([True, False], len(df), p=[0.15, 0.85])
    df["Regional_Time_Restrictions"] = np.random.choice(["Morning", "Afternoon", "Evening", "Night"], len(df))
    df["Communication_Complaince_Limits"] = np.random.choice(["Daytime", "Evening", "Weekdays", "Weekends", "Holidays"], len(df))

    # Validate data
    print("Validating data constraints...")
    assert all(5000 <= amt <= 500000 for amt in df.Loan_Amount_SGD)
    assert all(ob > 0 for ob in df.Outstanding_Balance_SGD)
    assert all(tenure in [12,24,36,48,60] for tenure in df.Tenure)
    assert all(8.0 <= rate <= 10.0 for rate in df.Interest_Rate)

    # Format float columns
    float_columns = df.select_dtypes(include=['float64']).columns
    df[float_columns] = df[float_columns].round(2)

    # Save to CSV
    output_file = 'singapore_loan_data_100k_enhanced.csv.gz'
    print(f"Saving to {output_file}...")
    df.to_csv(output_file, index=False, compression='gzip')

    end_time = datetime.now()
    duration = (end_time - start_time).total_seconds()

    print(f"\n=== GENERATION COMPLETE ===")
    print(f"Generated {len(df):,} customer records")
    print(f"Time taken: {duration:.2f} seconds")
    print(f"Records per second: {len(df)/duration:.0f}")
    print(f"File saved: {output_file}")

    # Enhanced reporting
    print(f"\n=== ENHANCED DATA QUALITY REPORT ===")
    print(f"Total customers: {len(df):,}")

    # Stress distribution
    print(f"\nFinancial Stress Distribution:")
    for stress_level, count in sorted(stress_tracker.items(), key=lambda x: x[1], reverse=True):
        percentage = (count / len(df)) * 100
        print(f"  {stress_level}: {count:,} ({percentage:.1f}%)")

    # Savings distribution
    print(f"\nSavings Distribution:")
    for savings_type, count in sorted(savings_tracker.items(), key=lambda x: x[1], reverse=True):
        percentage = (count / len(df)) * 100
        print(f"  {savings_type}: {count:,} ({percentage:.1f}%)")

    # Customer ID validation
    customer_id_starts = df['Customer_id'].str[3].value_counts()
    print(f"\nCustomer ID Starting Digits:")
    for digit, count in customer_id_starts.items():
        percentage = (count / len(df)) * 100
        print(f"  Starts with {digit}: {count:,} ({percentage:.1f}%)")

    # Check for three consecutive digits
    def has_three_consecutive(s):
        for i in range(len(s) - 2):
            if s[i] == s[i+1] == s[i+2]:
                return True
        return False

    three_consecutive_count = df['Customer_id'].apply(lambda x: has_three_consecutive(x[3:])).sum()
    print(f"Customer IDs with three consecutive digits: {three_consecutive_count} (should be 0)")

    # Check uniqueness
    unique_customer_ids = df['Customer_id'].nunique()
    print(f"Unique Customer IDs: {unique_customer_ids:,} (should be {len(df):,})")

    print(f"\nFirst 5 customers with new Customer IDs:")
    sample_cols = ['Customer_id', 'Name', 'Age', 'Occupation', 'Monthly_Income_SGD',
                   'Finance_Stress_Status', 'Savings_to_Spend_Ratio']
    print(df[sample_cols].head().to_string(index=False))

if __name__ == "__main__":
    main()

Generating 100,000 customer records with enhanced Customer IDs and reduced extremes...
Generating unique Customer IDs...
Generated 100,000 unique Customer IDs
Processing batch 1/5 (20000 records)...
Processing batch 2/5 (20000 records)...
Processing batch 3/5 (20000 records)...
Processing batch 4/5 (20000 records)...
Processing batch 5/5 (20000 records)...
Validating data constraints...
Saving to singapore_loan_data_100k_enhanced.csv.gz...

=== GENERATION COMPLETE ===
Generated 100,000 customer records
Time taken: 118.38 seconds
Records per second: 845
File saved: singapore_loan_data_100k_enhanced.csv.gz

=== ENHANCED DATA QUALITY REPORT ===
Total customers: 100,000

Financial Stress Distribution:
  Medium stress: 51,739 (51.7%)
  High stress: 32,738 (32.7%)
  Extreme High stress: 13,771 (13.8%)
  Low stress: 1,752 (1.8%)

Savings Distribution:
  Positive: 85,813 (85.8%)
  Negative: 14,187 (14.2%)

Customer ID Starting Digits:
  Starts with 8: 50,012 (50.0%)
  Starts with 9: 49,988 (50

In [11]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random
from collections import defaultdict
import gc

# Configuration
num_records = 100000
batch_size = 20000

# Singapore-specific data
singapore_districts = {
    "01": ["Raffles Place", "Marina", "People's Park"],
    "02": ["Anson", "Tanjong Pagar"],
    "03": ["Queenstown", "Tiong Bahru"],
    "04": ["Telok Blangah", "HarbourFront"],
    "05": ["Pasir Panjang", "Hong Leong Garden", "Clementi New Town"],
    "06": ["High Street", "Beach Road"],
    "07": ["Middle Road", "Golden Mile"],
    "08": ["Little India", "Farrer Park"],
    "09": ["Orchard", "Cairnhill", "River Valley"],
    "10": ["Ardmore", "Bukit Timah", "Holland Road"],
    "11": ["Watten Estate", "Novena", "Thomson"],
    "12": ["Balestier", "Toa Payoh", "Serangoon"],
    "13": ["Macpherson", "Braddell"],
    "14": ["Geylang", "Eunos"],
    "15": ["Katong", "Joo Chiat", "Amber Road"],
    "16": ["Bedok", "Upper East Coast", "Eastwood", "Kew Drive"],
    "17": ["Loyang", "Changi"],
    "18": ["Tampines", "Pasir Ris"],
    "19": ["Serangoon Garden", "Hougang", "Punggol"],
    "20": ["Bishan", "Ang Mo Kio"],
    "21": ["Upper Bukit Timah", "Clementi Park", "Ulu Pandan"],
    "22": ["Jurong"],
    "23": ["Hillview", "Dairy Farm", "Bukit Panjang", "Choa Chu Kang"],
    "24": ["Lim Chu Kang", "Tengah"],
    "25": ["Kranji", "Woodgrove"],
    "26": ["Upper Thomson", "Springleaf"],
    "27": ["Yishun", "Sembawang"],
    "28": ["Seletar"]
}

street_names = np.array([
    "Orchard Road", "North Bridge Road", "South Bridge Road", "Victoria Street",
    "Beach Road", "Serangoon Road", "Havelock Road", "Outram Road",
    "Cecil Street", "Robinson Road", "Maxwell Road", "Telok Ayer Street",
    "Bras Basah Road", "Bugis Street", "Chinatown Street", "Clarke Quay"
])

# Names arrays
chinese_first_names = np.array(["Wei", "Jie", "Ming", "Li", "Xin", "Hui", "Yan", "Feng", "Jun", "Ling"])
chinese_last_names = np.array(["Tan", "Lim", "Lee", "Ng", "Ong", "Wong", "Chua", "Chan", "Koh", "Teo"])
malay_first_names = np.array(["Ahmad", "Mohamed", "Siti", "Abdullah", "Fatimah", "Ali", "Rahman", "Zainal", "Nor", "Hassan"])
malay_last_names = np.array(["bin Ahmad", "binti Mohamed", "bin Ismail", "binti Abdullah", "bin Ali", "binti Hassan"])
indian_first_names = np.array(["Raj", "Kumar", "Suresh", "Priya", "Latha", "Arjun", "Divya", "Vijay", "Anand", "Deepa"])
indian_last_names = np.array(["Kumar", "Devi", "Singh", "Rao", "Patel", "Menon", "Pillai", "Sharma", "Subramaniam", "Gopal"])
western_first_names = np.array(["John", "David", "Mary", "Sarah", "Michael", "Jennifer", "James", "Linda", "Robert", "Elizabeth"])
western_last_names = np.array(["Smith", "Johnson", "Williams", "Brown", "Jones", "Miller", "Davis", "Garcia", "Rodriguez", "Wilson"])

email_domains = np.array(["gmail.com", "yahoo.com", "hotmail.com", "outlook.com", "singnet.com.sg"])
product_types = np.array(["Personal loan", "Auto loan", "Credit card", "Education loan", "Business loan"])
interest_rates = {"Personal loan": 8.5, "Auto loan": 9.0, "Credit card": 10.0, "Education loan": 8.0, "Business loan": 9.5}
loan_amount_limits = {
    "Personal loan": (5000, 100000), "Auto loan": (5000, 200000), "Credit card": (5000, 50000),
    "Education loan": (5000, 100000), "Business loan": (5000, 500000)
}

income_bands = {
    "50,000 or Below": (25000, 50000),
    "50,000 to 100,000": (50000, 100000),
    "100,000 to 200,000": (100000, 200000),
    "200,000 to 300,000": (200000, 300000),
    "300,000 to 500,000": (300000, 500000),
    "500,000 or Above": (500000, 2000000)
}

singapore_cities = np.array([
    "Singapore City", "Jurong East", "Tampines", "Woodlands", "Bedok", "Sengkang",
    "Hougang", "Yishun", "Ang Mo Kio", "Bukit Merah", "Bukit Batok", "Pasir Ris",
    "Clementi", "Bishan", "Toa Payoh", "Serangoon", "Queenstown", "Punggol", "Kallang", "Bukit Panjang"
])

agent_ids = np.array([f"SCB_AG_{i:04d}" for i in range(1, 201)])

# Enhanced Customer ID generation
def generate_customer_id():
    """Generate Customer ID starting with 8 or 9, no 3 consecutive repeating numbers, and unique"""
    while True:
        # Start with 8 or 9
        first_digit = str(np.random.choice([8, 9]))

        # Generate remaining 8 digits
        digits = [first_digit]
        for i in range(8):
            if len(digits) >= 2 and digits[-1] == digits[-2]:
                # If last two digits are same, avoid making third consecutive
                available_digits = [str(x) for x in range(10) if str(x) != digits[-1]]
            else:
                available_digits = [str(x) for x in range(10)]

            next_digit = np.random.choice(available_digits)
            digits.append(next_digit)

        customer_id = "SCB" + ''.join(digits)

        # Check for three consecutive repeating digits
        has_three_consecutive = False
        for i in range(len(digits) - 2):
            if digits[i] == digits[i+1] == digits[i+2]:
                has_three_consecutive = True
                break

        if not has_three_consecutive:
            return customer_id

# Pre-generate unique customer IDs to ensure uniqueness
def generate_unique_customer_ids(n):
    """Generate n unique customer IDs"""
    customer_ids = set()
    while len(customer_ids) < n:
        customer_id = generate_customer_id()
        customer_ids.add(customer_id)
    return list(customer_ids)

# Enhanced Age-Occupation distribution with reduced extreme stress
def generate_age_occupation_distribution(n):
    """Generate realistic age-occupation pairs with reduced extreme stress profiles"""
    ages = []
    occupations = []

    for i in range(n):
        # Age distribution with realistic peaks
        age_group = np.random.choice([
            "young_adult", "adult", "middle_aged", "senior"
        ], p=[0.25, 0.40, 0.25, 0.10])

        if age_group == "young_adult":
            age = np.random.randint(18, 30)
            occupation = np.random.choice([
                "Student", "Employed", "Employed", "Self-Employed", "Unemployed"
            ], p=[0.30, 0.50, 0.10, 0.05, 0.05])

        elif age_group == "adult":
            age = np.random.randint(30, 45)
            occupation = np.random.choice([
                "Employed", "Self-Employed", "Homemaker", "Unemployed"
            ], p=[0.78, 0.15, 0.05, 0.02])  # Reduced unemployed

        elif age_group == "middle_aged":
            age = np.random.randint(45, 60)
            occupation = np.random.choice([
                "Employed", "Self-Employed", "Retired", "Homemaker", "Unemployed"
            ], p=[0.70, 0.20, 0.05, 0.04, 0.01])  # Reduced unemployed

        else:  # senior
            age = np.random.randint(60, 76)
            occupation = np.random.choice([
                "Retired", "Employed", "Self-Employed", "Homemaker", "Unemployed"
            ], p=[0.75, 0.15, 0.05, 0.04, 0.01])  # Reduced unemployed

        # Ensure realistic constraints
        if occupation == "Student" and age > 25:
            if np.random.random() < 0.02:
                age = min(age, 35)
            else:
                occupation = "Employed"

        if occupation == "Retired" and age < 55:
            if np.random.random() < 0.03:
                age = max(age, 50)
            else:
                occupation = "Employed"

        ages.append(age)
        occupations.append(occupation)

    return np.array(ages), np.array(occupations)

# Enhanced financial health with reduced extreme stress
def calculate_financial_health(monthly_income, total_spend, savings_ratio, payment_history, missed_payments, employment_status, age, occupation):
    """Calculate financial health with reduced extreme cases"""
    spend_income_ratio = total_spend / monthly_income if monthly_income > 0 else 2.0

    # Different weighting based on employment and age
    if employment_status == "Unemployed":
        weights = {"spend_ratio": 0.5, "savings": 0.3, "payments": 0.2}
    elif occupation == "Student":
        weights = {"spend_ratio": 0.3, "savings": 0.2, "payments": 0.5}
    elif age > 60:
        weights = {"spend_ratio": 0.4, "savings": 0.4, "payments": 0.2}
    else:
        weights = {"spend_ratio": 0.4, "savings": 0.3, "payments": 0.3}

    savings_factor = 1 - max(0, savings_ratio)
    payment_factor = 1 - payment_history
    missed_payment_factor = min(1, missed_payments * 0.25)

    base_stress = (
        min(2.0, spend_income_ratio) * weights["spend_ratio"] +
        savings_factor * weights["savings"] +
        (payment_factor * 0.7 + missed_payment_factor * 0.3) * weights["payments"]
    )

    # Reduced extreme stress by capping base_stress
    base_stress = min(base_stress, 0.85)  # Cap at 85% to reduce extreme cases

    # Age and occupation adjustments to stress score
    if occupation == "Student":
        base_stress *= 0.9
    elif age > 60:
        base_stress *= 1.1

    financial_stress_score = min(100, max(0, base_stress * 100))

    # Adjusted stress thresholds to reduce extreme high stress
    if age < 25:
        if financial_stress_score >= 85:  # Increased threshold
            stress_status = "Extreme High stress"
        elif financial_stress_score >= 65:
            stress_status = "High stress"
        elif financial_stress_score >= 45:
            stress_status = "Medium stress"
        else:
            stress_status = "Low stress"
    elif age > 60:
        if financial_stress_score >= 75:  # Increased threshold
            stress_status = "Extreme High stress"
        elif financial_stress_score >= 55:
            stress_status = "High stress"
        elif financial_stress_score >= 35:
            stress_status = "Medium stress"
        else:
            stress_status = "Low stress"
    else:
        if financial_stress_score >= 80:  # Increased threshold
            stress_status = "Extreme High stress"
        elif financial_stress_score >= 60:
            stress_status = "High stress"
        elif financial_stress_score >= 40:
            stress_status = "Medium stress"
        else:
            stress_status = "Low stress"

    # Health status
    if financial_stress_score <= 25:
        health_status = "Healthy"
    elif financial_stress_score <= 50:
        health_status = "Moderate"
    else:
        health_status = "Stressed"

    # Balance trends
    if savings_ratio > 0.2:
        balance_trend = "Rising"
    elif savings_ratio < -0.1:
        balance_trend = "Falling"
    else:
        balance_trend = "Stable"

    # Overdraft flag
    overdraft_flag = (savings_ratio < -0.15 or spend_income_ratio > 1.3)

    # AAR score with reduced high-risk cases
    aar_score = np.random.normal(40, 20)  # Lower mean, less spread
    if np.random.random() < 0.03:  # Reduced from 5% to 3% high-risk anomalies
        aar_score = np.random.uniform(75, 90)
    elif np.random.random() < 0.06:  # Reduced from 8% to 6% low-risk anomalies
        aar_score = np.random.uniform(10, 30)
    aar_score = np.clip(aar_score, 0, 100)

    if aar_score <= 25:
        aar_risk = "Low"
    elif aar_score <= 70:
        aar_risk = "Medium"
    else:
        aar_risk = "High"

    # Flight risk with reduced extreme cases
    financial_factors = financial_stress_score / 100 * 0.4
    behavioral_factors = (1 - payment_history) * 0.3
    stability_factors = min(1, missed_payments * 0.1) * 0.2
    engagement_factors = (1 - min(1, payment_history)) * 0.1

    flight_risk = min(1.0, financial_factors + behavioral_factors + stability_factors + engagement_factors)

    # Age adjustments to flight risk
    if age < 30:
        flight_risk *= 1.2
    elif age > 60:
        flight_risk *= 0.8

    # Reduced extreme flight risk cases
    if np.random.random() < 0.02:  # Reduced from 4% to 2%
        flight_risk = np.random.choice([np.random.uniform(0.7, 0.9), np.random.uniform(0.05, 0.15)])

    return {
        "finance_stress_status": stress_status,
        "financial_health_status": health_status,
        "financial_stress_score": round(financial_stress_score, 2),
        "flight_risk": round(flight_risk, 2),
        "avg_balance_trend": balance_trend,
        "overdraft_flag": overdraft_flag,
        "aar_score": round(aar_score, 2),
        "aar_risk_level": aar_risk
    }

# Enhanced payment behavior with reduced negative savings
def generate_enhanced_payment_behavior(monthly_income, total_spend, age, employment_status, lifestyle, occupation):
    """Generate payment behavior with reduced negative savings"""
    if lifestyle == "High Net Worth":
        base_intensity = total_spend / 200
    else:
        base_intensity = total_spend / 80

    # Age and occupation factors
    if age < 25:
        upi_factor, credit_factor, cash_factor = 1.4, 0.7, 0.8
    elif age > 55:
        upi_factor, credit_factor, cash_factor = 0.6, 1.2, 1.4
    else:
        upi_factor, credit_factor, cash_factor = 1.0, 1.0, 1.0

    # Occupation adjustments
    if occupation == "Student":
        upi_factor *= 1.3
        credit_factor *= 0.6
    elif occupation == "Retired":
        upi_factor *= 0.7
        cash_factor *= 1.3

    upi_count = max(5, int(np.random.normal(base_intensity * 0.4 * upi_factor, 8)))
    debit_count = max(3, int(np.random.normal(base_intensity * 0.3, 5)))
    credit_count = max(2, int(np.random.normal(base_intensity * 0.2 * credit_factor, 4)))
    cash_count = max(1, int(np.random.normal(base_intensity * 0.1 * cash_factor, 3)))

    # Recurring transactions
    if occupation in ["Family Focused", "Established Career"] or age > 35:
        recurring_count = np.random.randint(6, 13)
    else:
        recurring_count = np.random.randint(3, 9)

    # Preferred channel
    channels = ["UPI", "Debit Card", "Credit Card", "Cash"]
    preferred_channel = channels[np.argmax([upi_count, debit_count, credit_count, cash_count])]

    # ENHANCED: Reduced negative savings cases
    if employment_status == "Unemployed":
        savings_ratio = np.random.uniform(-0.15, 0.05)  # Reduced from -0.25 to -0.15
    elif occupation == "Student":
        savings_ratio = np.random.uniform(-0.08, 0.1)   # Reduced from -0.15 to -0.08
    elif lifestyle == "High Net Worth":
        savings_ratio = np.random.uniform(0.25, 0.45)
    elif age > 50:
        savings_ratio = np.random.uniform(0.15, 0.35)
    elif age < 30:
        savings_ratio = np.random.uniform(0.05, 0.20)
    else:
        savings_ratio = np.random.uniform(0.08, 0.25)

    # Further reduced extreme savings anomalies
    if np.random.random() < 0.015:  # Reduced from 3% to 1.5%
        savings_ratio = np.random.choice([
            np.random.uniform(-0.3, -0.15),  # Less negative
            np.random.uniform(0.5, 0.7)
        ])

    # Other metrics
    total_transactions = upi_count + debit_count + credit_count + cash_count
    recurring_ratio = recurring_count / total_transactions if total_transactions > 0 else 0.1

    if employment_status == "Unemployed":
        spend_growth = np.random.normal(-0.05, 0.12)  # Reduced negative growth
    else:
        spend_growth = np.random.normal(0.04, 0.12)

    high_value_threshold = monthly_income * 0.07
    high_value_count = max(0, int(np.random.poisson(total_spend / high_value_threshold * 0.08)))

    return {
        "upi_count": upi_count,
        "debit_count": debit_count,
        "credit_count": credit_count,
        "cash_count": cash_count,
        "recurring_count": recurring_count,
        "preferred_channel": preferred_channel,
        "recurring_ratio": round(recurring_ratio, 2),
        "savings_ratio": round(savings_ratio, 2),
        "spend_growth": round(spend_growth, 2),
        "high_value_count": high_value_count
    }

# FIXED: Enhanced spending profile with corrected normal distribution
def generate_enhanced_spend_profile(lifestyle, monthly_income, employment_status, age, occupation):
    """Generate spending profile with reduced extreme spending"""
    base_profiles = {
        "Young Professional": {"utility": 0.10, "shopping": 0.28, "entertainment": 0.20, "health": 0.08, "education": 0.12, "travel": 0.22},
        "Family Focused": {"utility": 0.15, "shopping": 0.30, "entertainment": 0.10, "health": 0.12, "education": 0.18, "travel": 0.15},
        "Established Career": {"utility": 0.12, "shopping": 0.25, "entertainment": 0.15, "health": 0.15, "education": 0.08, "travel": 0.25},
        "Retirement Phase": {"utility": 0.18, "shopping": 0.22, "entertainment": 0.12, "health": 0.25, "education": 0.05, "travel": 0.18},
        "Student Lifestyle": {"utility": 0.12, "shopping": 0.25, "entertainment": 0.25, "health": 0.08, "education": 0.20, "travel": 0.10},
        "High Net Worth": {"utility": 0.08, "shopping": 0.30, "entertainment": 0.22, "health": 0.12, "education": 0.08, "travel": 0.20}
    }

    base_ratios = base_profiles[lifestyle].copy()

    # Occupation-specific adjustments
    if occupation == "Student":
        base_ratios["education"] += 0.10
        base_ratios["entertainment"] += 0.05
    elif occupation == "Retired":
        base_ratios["health"] += 0.05
        base_ratios["utility"] += 0.03
    elif occupation == "Homemaker":
        base_ratios["shopping"] += 0.05
        base_ratios["utility"] += 0.03

    # Age-specific adjustments
    if age < 25:
        base_ratios["entertainment"] += 0.05
        base_ratios["education"] += 0.03
    elif age > 60:
        base_ratios["health"] += 0.05
        base_ratios["travel"] -= 0.03

    # Individual variation
    for category in base_ratios:
        base_ratios[category] *= np.random.uniform(0.7, 1.3)

    # Normalize
    total = sum(base_ratios.values())
    for category in base_ratios:
        base_ratios[category] /= total

    # Total spend calculation with reduced extremes
    if employment_status == "Unemployed":
        spend_ratio = np.random.uniform(0.8, 1.2)  # Reduced extreme spending
    elif occupation == "Student":
        spend_ratio = np.random.uniform(0.8, 1.1)
    elif lifestyle == "High Net Worth":
        spend_ratio = np.random.uniform(0.3, 0.6)
    else:
        spend_ratio = np.random.uniform(0.5, 0.8)

    # Reduced extreme spending anomalies
    if np.random.random() < 0.03:  # Reduced from 5% to 3%
        spend_ratio = np.random.choice([
            np.random.uniform(0.3, 0.5),   # Extreme savers
            np.random.uniform(1.1, 1.5)    # Reduced from 1.2-1.8 to 1.1-1.5
        ])

    total_monthly_spend = monthly_income * spend_ratio

    # Calculate category spends
    spends = {}
    for category, ratio in base_ratios.items():
        base_spend = total_monthly_spend * ratio
        monthly_variation = np.random.uniform(0.85, 1.15)
        spends[category] = max(50, base_spend * monthly_variation)

    actual_total_spend = sum(spends.values())

    # FIXED: Use np.random.normal instead of np.random.normalvariate
    monthly_trend = np.random.normal(total_monthly_spend, total_monthly_spend * 0.12)
    monthly_trend = max(0, monthly_trend)  # Ensure non-negative

    return {
        "utility_spend": round(spends["utility"], 2),
        "shopping_spend": round(spends["shopping"], 2),
        "entertainment_spend": round(spends["entertainment"], 2),
        "health_spend": round(spends["health"], 2),
        "education_spend": round(spends["education"], 2),
        "travel_spend": round(spends["travel"], 2),
        "monthly_spend_trend": round(monthly_trend, 2),
        "seasonal_spend_variation": round(np.random.uniform(0.08, 0.35), 2),
        "weekend_spend_ratio": round(np.random.uniform(0.22, 0.48), 2),
        "festive_spend": round(total_monthly_spend * np.random.uniform(1.15, 2.0), 2),
        "total_monthly_spend": round(actual_total_spend, 2),
        "spend_ratio": round(spend_ratio, 2)
    }

# Other helper functions
def generate_singaporean_names(n):
    ethnicities = np.random.choice(["Chinese", "Malay", "Indian", "Western"], n, p=[0.74, 0.13, 0.09, 0.04])
    first_names, last_names = [], []

    for i in range(n):
        ethnicity = ethnicities[i]
        if ethnicity == "Chinese":
            first_names.append(np.random.choice(chinese_first_names))
            last_names.append(np.random.choice(chinese_last_names))
        elif ethnicity == "Malay":
            first_names.append(np.random.choice(malay_first_names))
            last_names.append(np.random.choice(malay_last_names))
        elif ethnicity == "Indian":
            first_names.append(np.random.choice(indian_first_names))
            last_names.append(np.random.choice(indian_last_names))
        else:
            first_names.append(np.random.choice(western_first_names))
            last_names.append(np.random.choice(western_last_names))

    return [f"{first} {last}" for first, last in zip(first_names, last_names)]

def generate_singapore_mobile_numbers(n, valid_ratio=0.92):
    prefixes = np.random.choice(['8', '9'], n)
    numbers = np.random.randint(0, 10000000, n)
    numbers = [f"{num:07d}" for num in numbers]

    valid_mask = np.random.random(n) < valid_ratio
    mobile_numbers = []

    for i in range(n):
        if valid_mask[i]:
            mobile_numbers.append(f"+65 {prefixes[i]}{numbers[i][:3]} {numbers[i][3:]}")
        else:
            invalid_types = [
                f"+65 {prefixes[i]}{numbers[i][:6]}",
                f"+65 {np.random.choice(['6', '7'])}{numbers[i]}",
                f"+65 {prefixes[i]}{numbers[i]}{np.random.randint(0,10)}",
            ]
            mobile_numbers.append(np.random.choice(invalid_types))

    return mobile_numbers, valid_mask

def generate_loan_data(n):
    product_choices = np.random.choice(product_types, n)
    tenures = np.random.choice([12, 24, 36, 48, 60], n)

    loan_amounts, interest_rates_arr = [], []

    for product in product_choices:
        min_amt, max_amt = loan_amount_limits[product]
        step = 5000 if product in ["Auto loan", "Business loan"] else 1000
        amount = np.random.choice(range(min_amt, max_amt + 1, step))
        loan_amounts.append(amount)
        interest_rates_arr.append(interest_rates[product])

    emis = calculate_emis_vectorized(np.array(loan_amounts), np.array(interest_rates_arr), tenures)

    return product_choices, loan_amounts, interest_rates_arr, tenures, emis

def calculate_emis_vectorized(principals, rates, tenures):
    monthly_rates = rates / 1200
    emis = (principals * monthly_rates * (1 + monthly_rates) ** tenures) / ((1 + monthly_rates) ** tenures - 1)
    return np.round(emis, 2)

def determine_lifestyle_profile(age, occupation, income_band):
    if occupation == "Student":
        return "Student Lifestyle"
    elif occupation == "Retired":
        return "Retirement Phase"
    elif income_band in ["500,000 or Above", "300,000 to 500,000"]:
        return "High Net Worth"
    elif occupation == "Homemaker" and 30 <= age <= 50:
        return "Family Focused"
    elif 30 <= age <= 50 and occupation in ["Employed", "Self-Employed"]:
        return "Family Focused"
    elif 40 <= age <= 60:
        return "Established Career"
    elif 22 <= age <= 35:
        return "Young Professional"
    else:
        if age < 30:
            return "Young Professional"
        elif age < 50:
            return "Family Focused"
        else:
            return "Established Career"

def calculate_monthly_income(income_band):
    """Calculate realistic monthly income with some variation within bands"""
    min_income, max_income = income_bands[income_band]

    # Create more realistic distribution within bands
    if income_band == "50,000 or Below":
        # Skew toward lower end
        annual_income = min_income + (max_income - min_income) * random.betavariate(2, 5)
    elif income_band == "500,000 or Above":
        # Skew toward higher end
        annual_income = min_income + (max_income - min_income) * random.betavariate(5, 2)
    else:
        # More normal distribution for middle bands
        annual_income = random.normalvariate(
            (min_income + max_income) / 2,
            (max_income - min_income) / 6
        )

    annual_income = max(min_income, min(max_income, annual_income))
    monthly_income = annual_income / 12

    return round(monthly_income, 2)

def calculate_monthly_incomes(income_bands_arr):
    monthly_incomes = []
    for income_band in income_bands_arr:
        monthly_income = calculate_monthly_income(income_band)
        monthly_incomes.append(monthly_income)
    return np.array(monthly_incomes)

def adjust_emi_for_delinquency(base_emi, days_past_due):
    if days_past_due <= 0: return base_emi
    elif days_past_due <= 15: return round(base_emi * 1.02, 2)
    elif days_past_due <= 30: return round(base_emi * 1.05, 2)
    elif days_past_due <= 60: return round(base_emi * 1.08, 2)
    elif days_past_due <= 90: return round(base_emi * 1.12, 2)
    else: return round(base_emi * 1.15, 2)

def generate_contact_success_data(call_attempts, payment_history, employment_status, age):
    if payment_history > 0.8 and employment_status == "Employed":
        base_success_rate = np.random.uniform(0.5, 0.8)
    elif payment_history > 0.6:
        base_success_rate = np.random.uniform(0.3, 0.6)
    else:
        base_success_rate = np.random.uniform(0.1, 0.4)

    if age > 60:
        base_success_rate *= 1.2
    elif age < 30:
        base_success_rate *= 0.9

    success_rate = np.clip(np.random.normal(base_success_rate, 0.15), 0.05, 0.95)
    successful_contacts = int(call_attempts * success_rate)

    available_agents = np.random.choice(agent_ids, np.random.randint(2, 6), replace=False)
    last_successful_agent = np.random.choice(available_agents) if successful_contacts > 0 else ""

    num_best_agents = min(3, len(available_agents))
    best_agents = np.random.choice(available_agents, num_best_agents, replace=False)

    avg_call_duration_base = 3 + (success_rate * 10)
    avg_times = [round(np.random.normal(avg_call_duration_base, 2), 2) for _ in best_agents]

    best_agent_interaction_count = np.random.randint(1, successful_contacts + 1) if successful_contacts > 0 else 0

    return {
        "successful_contacts": successful_contacts,
        "contact_success_rate": round(success_rate, 2),
        "last_successful_agent": last_successful_agent,
        "best_agents": ",".join(best_agents),
        "avg_times_best_agents": ",".join(map(str, avg_times)),
        "best_agent_interaction_count": best_agent_interaction_count
    }

def calculate_credit_score(payment_history, missed_payments, delinquency, partial_payments):
    base_score = 650
    payment_impact = (payment_history - 0.5) * 200
    missed_payment_impact = -min(missed_payments * 15, 100)
    delinquency_impact = -50 if delinquency else 0
    partial_impact = -20 if partial_payments else 0

    score = base_score + payment_impact + missed_payment_impact + delinquency_impact + partial_impact
    return max(300, min(850, int(score)))

# Main batch generation function
def generate_batch_data(batch_size, customer_ids_batch):
    """Generate a batch of customer data"""

    # Generate realistic age-occupation pairs
    ages, occupations = generate_age_occupation_distribution(batch_size)
    employment_status = np.array(["Employed" if occ in ["Employed", "Self-Employed"] else "Unemployed" for occ in occupations])

    # Income bands
    income_bands_arr = np.random.choice(list(income_bands.keys()), batch_size, p=[0.20, 0.35, 0.25, 0.12, 0.05, 0.03])
    monthly_incomes = calculate_monthly_incomes(income_bands_arr)

    # Generate names and contact info
    genders_arr = np.random.choice(["Male", "Female", "Others"], batch_size, p=[0.48, 0.48, 0.04])
    full_names = generate_singaporean_names(batch_size)
    first_names = [name.split()[0] for name in full_names]
    last_names = [name.split()[1] if ' ' in name else '' for name in full_names]

    # Communication data
    mobile_numbers, valid_primary_mask = generate_singapore_mobile_numbers(batch_size)
    has_secondary = np.random.random(batch_size) > 0.15
    secondary_numbers, valid_secondary_mask = generate_singapore_mobile_numbers(batch_size, 0.75)
    secondary_numbers = [num if has_sec else "" for num, has_sec in zip(secondary_numbers, has_secondary)]
    valid_secondary_mask = [mask if has_sec else False for mask, has_sec in zip(valid_secondary_mask, has_secondary)]

    # FIXED: Ensure No_of_Valid_Numbers is integer only
    no_of_valid_numbers = []
    no_of_invalid_numbers = []
    for i in range(batch_size):
        valid_count = int(sum([valid_primary_mask[i], valid_secondary_mask[i]])) if has_secondary[i] else int(valid_primary_mask[i])
        no_of_valid_numbers.append(valid_count)
        invalid_count = (2 if has_secondary[i] else 1) - valid_count
        no_of_invalid_numbers.append(int(invalid_count))

    # Communication completion and changes
    communication_completion = []
    mobile_change_count = []
    address_change_count = []

    for i in range(batch_size):
        age = ages[i]
        occupation = occupations[i]

        if occupation in ["Employed", "Self-Employed"]:
            base_completion = 0.9
        elif occupation == "Student":
            base_completion = 0.85
        else:
            base_completion = 0.75

        if age > 60:
            base_completion -= 0.1

        completion = np.clip(np.random.normal(base_completion, 0.08), 0.7, 1.0)
        communication_completion.append(round(completion, 2))

        if occupation == "Student" or age < 25:
            changes = np.random.poisson(2.5)
        elif occupation == "Unemployed":
            changes = np.random.poisson(1.5)
        else:
            changes = np.random.poisson(0.8)

        mobile_change_count.append(max(0, int(changes)))

        if occupation == "Student" or (age < 30 and occupation == "Employed"):
            address_changes = np.random.poisson(1.2)
        elif occupation == "Unemployed":
            address_changes = np.random.poisson(0.8)
        else:
            address_changes = np.random.poisson(0.3)

        address_change_count.append(max(0, int(address_changes)))

    communication_completion = np.array(communication_completion)
    mobile_change_count = np.array(mobile_change_count)
    mobile_change_this_year = np.minimum(mobile_change_count, np.random.randint(0, 3, batch_size))
    address_change_count = np.array(address_change_count)
    address_change_this_year = np.minimum(address_change_count, np.random.randint(0, 2, batch_size))

    # Contact change frequency
    customer_age_years = np.maximum(5, ages - 18)
    total_changes = mobile_change_count + address_change_count
    contact_change_frequency = np.round(total_changes / customer_age_years, 2)

    # Determine lifestyle profiles
    lifestyles = []
    for i in range(batch_size):
        lifestyle = determine_lifestyle_profile(ages[i], occupations[i], income_bands_arr[i])
        lifestyles.append(lifestyle)

    # Generate loan data
    product_types_arr, loan_amounts, interest_rates_arr, tenures, base_emis = generate_loan_data(batch_size)

    # Payment behavior
    on_time_percentages = []
    for i in range(batch_size):
        if occupations[i] in ["Employed", "Self-Employed"]:
            base_payment = np.random.beta(4, 2)
        elif occupations[i] == "Student":
            base_payment = np.random.beta(3, 3)
        else:
            base_payment = np.random.beta(2, 4)

        if ages[i] > 40:
            base_payment = min(1.0, base_payment * 1.1)
        elif ages[i] < 25:
            base_payment = max(0.1, base_payment * 0.9)

        on_time_percentages.append(base_payment)

    on_time_percentages = np.array(on_time_percentages)
    payment_frequencies = ["Regular" if p >= 0.8 else "Irregular" for p in on_time_percentages]

    # Loan dates and status
    days_since_loan_start = np.random.randint(180, 1800, batch_size)
    months_completed = np.minimum(tenures - 1, (days_since_loan_start / 30).astype(int))

    # Generate batch data
    batch_data = []
    for i in range(batch_size):
        # Use pre-generated customer ID
        customer_id = customer_ids_batch[i]

        # Loan-specific calculations
        days_past_due = np.random.randint(0, 120) if np.random.random() > on_time_percentages[i] else 0
        current_emi = adjust_emi_for_delinquency(base_emis[i], days_past_due)

        # Outstanding balance
        remaining_months = tenures[i] - months_completed[i]
        if remaining_months <= 0:
            outstanding_balance = loan_amounts[i] * 0.01
        else:
            outstanding_balance = current_emi * remaining_months * np.random.uniform(0.8, 1.2)

        outstanding_balance = max(loan_amounts[i] * 0.01, outstanding_balance)

        # Missed payments
        if occupations[i] == "Unemployed":
            missed_payment_prob = 0.3
        elif occupations[i] == "Student":
            missed_payment_prob = 0.2
        else:
            missed_payment_prob = 0.1

        if ages[i] > 50:
            missed_payment_prob *= 0.7
        elif ages[i] < 25:
            missed_payment_prob *= 1.3

        missed_payments_count = int(np.random.poisson(missed_payment_prob * months_completed[i] / 6))

        # Enhanced spending profile
        spend_profile = generate_enhanced_spend_profile(
            lifestyles[i], monthly_incomes[i], employment_status[i], ages[i], occupations[i]
        )

        # Enhanced payment behavior (with reduced negative savings)
        payment_behavior = generate_enhanced_payment_behavior(
            monthly_incomes[i], spend_profile["total_monthly_spend"],
            ages[i], employment_status[i], lifestyles[i], occupations[i]
        )

        # Enhanced financial health (with reduced extreme stress)
        financial_health = calculate_financial_health(
            monthly_incomes[i], spend_profile["total_monthly_spend"],
            payment_behavior["savings_ratio"], on_time_percentages[i],
            missed_payments_count, employment_status[i], ages[i], occupations[i]
        )

        # Contact success
        call_attempts = max(1, int(np.random.poisson(4)))
        contact_success = generate_contact_success_data(
            call_attempts, on_time_percentages[i], employment_status[i], ages[i]
        )

        # Credit score
        delinquency = np.random.random() < 0.15
        credit_score = calculate_credit_score(
            on_time_percentages[i], missed_payments_count, delinquency,
            np.random.random() < 0.15
        )

        record = {
            # Basic loan information
            "Customer_id": customer_id,  # Use pre-generated ID
            "Loan_Account_id": int(np.random.randint(10000000, 99999999)),
            "Product_Type": product_types_arr[i],
            "Loan_Amount_SGD": round(float(loan_amounts[i]), 2),
            "Outstanding_Balance_SGD": round(outstanding_balance, 2),
            "Day_Past_Due": days_past_due,
            "Tenure": tenures[i],
            "Interest_Rate": interest_rates_arr[i],
            "Current_EMI_SGD": current_emi,
            "Installment_Due_Date": (datetime.now() + timedelta(days=np.random.randint(1, 30))).date(),
            "Last_Payment_Date": (datetime.now() - timedelta(days=np.random.randint(1, 120))).date(),

            # Payment behavior
            "Partial_Payment_Indicator": np.random.random() < 0.15,
            "Number_of_Past_Payments": months_completed[i],
            "Payment_Frequency": payment_frequencies[i],
            "Amount_Paid_Each_Month_SGD": base_emis[i],
            "Missed_Payments_Count": missed_payments_count,
            "Settlement_History": np.random.choice(["Settled", "Partial Settlement", "Not Settled", "Under Negotiation"]),
            "Repayment_Irregularity_Flags": on_time_percentages[i] < 0.7,

            # Customer profile
            "Lifestyle_Profile": lifestyles[i],
            "Customer_Employment_Status": employment_status[i],

            # Communication data
            "Communication_Data_Completion": communication_completion[i],
            "Valid_Phone_Number": valid_primary_mask[i],
            "No_of_Valid_Numbers": no_of_valid_numbers[i],  # Now integer only
            "No_of_Invalid_Numbers": no_of_invalid_numbers[i],  # Now integer only
            "Mobile_Number_Change_Count": int(mobile_change_count[i]),  # Ensure integer
            "Mobile_Number_Change_Count_This_Year": int(mobile_change_this_year[i]),  # Ensure integer
            "Address_Change_Count": int(address_change_count[i]),  # Ensure integer
            "Address_Change_Count_This_Year": int(address_change_this_year[i]),  # Ensure integer
            "Contact_Data_Change_Frequency": contact_change_frequency[i],

            # Spend analysis
            "Finance_Stress_Status": financial_health["finance_stress_status"],
            "Utility_Spend_SGD": spend_profile["utility_spend"],
            "Shopping_Spend_SGD": spend_profile["shopping_spend"],
            "Entertainment_Spend_SGD": spend_profile["entertainment_spend"],
            "Health_Spend_SGD": spend_profile["health_spend"],
            "Education_Spend_SGD": spend_profile["education_spend"],
            "Travel_Spend_SGD": spend_profile["travel_spend"],
            "Monthly_Spend_Trend_SGD": spend_profile["monthly_spend_trend"],
            "Seasonal_Spend_Variation": spend_profile["seasonal_spend_variation"],
            "Weekend_Spend_Ratio": spend_profile["weekend_spend_ratio"],
            "Festive_Season_Spend_SGD": spend_profile["festive_spend"],
            "Total_Monthly_Spend_SGD": spend_profile["total_monthly_spend"],
            "Spend_to_Income_Ratio": spend_profile["spend_ratio"],

            # Payment behavior
            "UPI_Transaction_Count": payment_behavior["upi_count"],
            "Debit_Card_Transaction_Count": payment_behavior["debit_count"],
            "Credit_Card_Transaction_Count": payment_behavior["credit_count"],
            "Cash_Withdrawal_Count": payment_behavior["cash_count"],
            "Recurring_Transaction_Count": payment_behavior["recurring_count"],
            "Preferred_Payment_Channel": payment_behavior["preferred_channel"],
            "Recurring_Payment_Ratio": payment_behavior["recurring_ratio"],
            "Savings_to_Spend_Ratio": payment_behavior["savings_ratio"],
            "Spend_Growth_Rate_YoY": payment_behavior["spend_growth"],
            "High_Value_Transaction_Count": payment_behavior["high_value_count"],

            # Financial health
            "Flight_Risk_Score": financial_health["flight_risk"],
            "Financial_Health_Status": financial_health["financial_health_status"],
            "Financial_Stress_Score": financial_health["financial_stress_score"],
            "Avg_Balance_Trends": financial_health["avg_balance_trend"],
            "Overdraft_or_Low_Balance_Flag": financial_health["overdraft_flag"],
            "AAR_Score": financial_health["aar_score"],
            "AAR_Risk_Level": financial_health["aar_risk_level"],

            # Contact success
            "Successful_Contacts_Count": contact_success["successful_contacts"],
            "Contact_Success_Rate": contact_success["contact_success_rate"],
            "Last_Successful_Agent_ID": contact_success["last_successful_agent"],
            "Best_Contact_Agent_IDs": contact_success["best_agents"],
            "Avg_Time_With_Best_Agents_Min": contact_success["avg_times_best_agents"],
            "Customer_Best_Agent_Interaction_Count": contact_success["best_agent_interaction_count"],

            # Customer demographics
            "Name": full_names[i],
            "Age": ages[i],
            "Occupation": occupations[i],
            "Gender": genders_arr[i],
            "Primary_Phone_Number": mobile_numbers[i],
            "Secondary_Mobile_Number": secondary_numbers[i],
            "Landline_Phone_Number": f"+65 6{np.random.randint(1000000, 9999999):07d}",
            "Email_ID": f"{first_names[i].lower()}.{last_names[i].lower()}@{np.random.choice(email_domains)}",
            "Income_Band_SGD": income_bands_arr[i],
            "Monthly_Income_SGD": monthly_incomes[i],
            "Employeement_Type": np.random.choice(["Full time", "Part time", "Contract", "Freelance", "Unemployed"]),
            "Address": f"{np.random.randint(1, 999)} {np.random.choice(street_names)} #{np.random.randint(1, 50):02d}-{np.random.randint(1, 99):02d} Singapore {np.random.randint(100000, 999999)}",
            "City": np.random.choice(singapore_cities),
            "Language_Preference": np.random.choice(["English", "Regional"], p=[0.7, 0.3]),
            "Mobile_Number_Active_Status": valid_primary_mask[i],
            "Email_Activity": np.random.random() < 0.8,

            # Additional fields
            "Credit_Score": credit_score,
            "Recent_Inquiries": int(np.random.poisson(2)),
            "Loan_Exposure_Across_Banks": int(np.random.poisson(1)),
            "Delinquency_on_other_Loans": delinquency,
        }
        batch_data.append(record)

    return batch_data

# Main execution
def main():
    print(f"Generating {num_records:,} customer records")

    # Pre-generate unique customer IDs
    print("Generating unique Customer IDs...")
    customer_ids = generate_unique_customer_ids(num_records)
    print(f"Generated {len(customer_ids):,} unique Customer IDs")

    all_data = []
    start_time = datetime.now()

    # Track distributions for validation
    age_occupation_tracker = defaultdict(lambda: defaultdict(int))
    stress_tracker = defaultdict(int)
    savings_tracker = defaultdict(int)

    # Process in batches
    for batch_num in range(0, num_records, batch_size):
        current_batch_size = min(batch_size, num_records - batch_num)
        print(f"Processing batch {batch_num//batch_size + 1}/{(num_records + batch_size - 1)//batch_size} "
              f"({current_batch_size} records)...")

        # Get customer IDs for this batch
        batch_customer_ids = customer_ids[batch_num:batch_num + current_batch_size]

        batch_data = generate_batch_data(current_batch_size, batch_customer_ids)
        all_data.extend(batch_data)

        # Track distributions
        for record in batch_data:
            age_group = f"{(record['Age'] // 10) * 10}s"
            age_occupation_tracker[age_group][record['Occupation']] += 1
            stress_tracker[record['Finance_Stress_Status']] += 1
            savings_tracker["Negative" if record['Savings_to_Spend_Ratio'] < 0 else "Positive"] += 1

        if (batch_num // batch_size) % 5 == 0:
            gc.collect()

    # Create DataFrame
    df = pd.DataFrame(all_data)

    # Add remaining columns
    df["Contact_History_Call_Attempts"] = np.random.poisson(4, len(df)).astype(int)
    df["Contact_History_SMS"] = np.random.poisson(5, len(df)).astype(int)
    df["Contact_History_WhatsApp"] = np.random.poisson(3, len(df)).astype(int)
    df["Contact_History_EmailLogs"] = np.random.poisson(2, len(df)).astype(int)
    df["No_of_Attempts"] = (df["Contact_History_Call_Attempts"] +
                           df["Contact_History_SMS"] +
                           df["Contact_History_WhatsApp"] +
                           df["Contact_History_EmailLogs"]).astype(int)
    df["Average_Handling_Time"] = np.round(np.random.uniform(8, 25, len(df)), 2)

    # Fill other categorical columns
    df["Channel_used"] = np.random.choice(["Call", "SMS", "WhatsApp", "Email", "Field Agent", "IVR"], len(df))
    df["Response_Outcome"] = np.random.choice(["Connected", "Promised to pay", "Ignored", "Disconnected", "Paid fully", "Partial paid"], len(df))
    df["App_Login_Frequency"] = np.random.poisson(12, len(df)).astype(int)
    df["Online_Banking_Activity"] = np.random.poisson(10, len(df)).astype(int)
    df["Smartphone_Penetration"] = np.where(df["Age"] < 60, "High", np.where(df["Age"] < 70, "Medium", "Low"))
    df["Preferred_Channel"] = np.random.choice(["Call", "SMS", "WhatsApp", "Email", "App notification", "Field Agent", "IVR"], len(df))
    df["Call_SMS_Activity_Patterns"] = np.random.choice(["Low", "Medium", "High"], len(df))
    df["WhatsApp_OTT_usage_Indicator"] = np.random.choice([True, False], len(df), p=[0.8, 0.2])
    df["Recent_Score_Change"] = np.random.randint(-40, 41, len(df))
    df["Unemployeement_rate_region"] = np.round(np.random.uniform(1.8, 2.2, len(df)), 2)
    df["Inflation_Rate"] = np.round(np.random.uniform(3.5, 5.0, len(df)), 2)
    df["Interest_Rate_Trend"] = np.round(np.random.uniform(-0.1, 0.3, len(df)), 2)
    df["Economic_Stress_Index"] = np.round(np.random.uniform(0.1, 0.3, len(df)), 2)
    df["Do_Not_Call_Registry_Data"] = np.random.choice([True, False], len(df), p=[0.15, 0.85])
    df["Regional_Time_Restrictions"] = np.random.choice(["Morning", "Afternoon", "Evening", "Night"], len(df))
    df["Communication_Complaince_Limits"] = np.random.choice(["Daytime", "Evening", "Weekdays", "Weekends", "Holidays"], len(df))

    # ENSURE INTEGER COLUMNS: Convert all count columns to integers
    integer_columns = [
        'No_of_Valid_Numbers', 'No_of_Invalid_Numbers', 'Mobile_Number_Change_Count',
        'Mobile_Number_Change_Count_This_Year', 'Address_Change_Count', 'Address_Change_Count_This_Year',
        'UPI_Transaction_Count', 'Debit_Card_Transaction_Count', 'Credit_Card_Transaction_Count',
        'Cash_Withdrawal_Count', 'Recurring_Transaction_Count', 'High_Value_Transaction_Count',
        'Successful_Contacts_Count', 'Customer_Best_Agent_Interaction_Count', 'Contact_History_Call_Attempts',
        'Contact_History_SMS', 'Contact_History_WhatsApp', 'Contact_History_EmailLogs', 'No_of_Attempts',
        'App_Login_Frequency', 'Online_Banking_Activity', 'Recent_Score_Change', 'Recent_Inquiries',
        'Loan_Exposure_Across_Banks', 'Missed_Payments_Count', 'Number_of_Past_Payments', 'Tenure',
        'Day_Past_Due', 'Age', 'Credit_Score'
    ]

    for col in integer_columns:
        if col in df.columns:
            df[col] = df[col].astype(int)

    # Validate data
    print("Validating data constraints...")
    assert all(5000 <= amt <= 500000 for amt in df.Loan_Amount_SGD)
    assert all(ob > 0 for ob in df.Outstanding_Balance_SGD)
    assert all(tenure in [12,24,36,48,60] for tenure in df.Tenure)
    assert all(8.0 <= rate <= 10.0 for rate in df.Interest_Rate)

    # Format float columns
    float_columns = df.select_dtypes(include=['float64']).columns
    df[float_columns] = df[float_columns].round(2)

    # Save to CSV
    output_file = 'singapore_loan_data_100k_enhanced.csv.gz'
    print(f"Saving to {output_file}...")
    df.to_csv(output_file, index=False, compression='gzip')

    end_time = datetime.now()
    duration = (end_time - start_time).total_seconds()

    print(f"\n=== GENERATION COMPLETE ===")
    print(f"Generated {len(df):,} customer records")
    print(f"Time taken: {duration:.2f} seconds")
    print(f"Records per second: {len(df)/duration:.0f}")
    print(f"File saved: {output_file}")

    # Enhanced reporting
    print(f"\n=== ENHANCED DATA QUALITY REPORT ===")
    print(f"Total customers: {len(df):,}")

    # Stress distribution
    print(f"\nFinancial Stress Distribution:")
    for stress_level, count in sorted(stress_tracker.items(), key=lambda x: x[1], reverse=True):
        percentage = (count / len(df)) * 100
        print(f"  {stress_level}: {count:,} ({percentage:.1f}%)")

    # Savings distribution
    print(f"\nSavings Distribution:")
    for savings_type, count in sorted(savings_tracker.items(), key=lambda x: x[1], reverse=True):
        percentage = (count / len(df)) * 100
        print(f"  {savings_type}: {count:,} ({percentage:.1f}%)")

    # Customer ID validation
    customer_id_starts = df['Customer_id'].str[3].value_counts()
    print(f"\nCustomer ID Starting Digits:")
    for digit, count in customer_id_starts.items():
        percentage = (count / len(df)) * 100
        print(f"  Starts with {digit}: {count:,} ({percentage:.1f}%)")

    # Check for three consecutive digits
    def has_three_consecutive(s):
        for i in range(len(s) - 2):
            if s[i] == s[i+1] == s[i+2]:
                return True
        return False

    three_consecutive_count = df['Customer_id'].apply(lambda x: has_three_consecutive(x[3:])).sum()
    print(f"Customer IDs with three consecutive digits: {three_consecutive_count} (should be 0)")

    # Check uniqueness
    unique_customer_ids = df['Customer_id'].nunique()
    print(f"Unique Customer IDs: {unique_customer_ids:,} (should be {len(df):,})")

    # Check No_of_Valid_Numbers data type and values
    print(f"\nNo_of_Valid_Numbers validation:")
    print(f"Data type: {df['No_of_Valid_Numbers'].dtype}")
    print(f"Unique values: {sorted(df['No_of_Valid_Numbers'].unique())}")
    print(f"Value counts:")
    for value, count in df['No_of_Valid_Numbers'].value_counts().sort_index().items():
        print(f"  {value}: {count:,} customers")

    print(f"\nFirst 5 customers with new Customer IDs:")
    sample_cols = ['Customer_id', 'Name', 'Age', 'Occupation', 'Monthly_Income_SGD',
                   'Finance_Stress_Status', 'Savings_to_Spend_Ratio', 'No_of_Valid_Numbers']
    print(df[sample_cols].head().to_string(index=False))

if __name__ == "__main__":
    main()

Generating 100,000 customer records
Generating unique Customer IDs...
Generated 100,000 unique Customer IDs
Processing batch 1/5 (20000 records)...
Processing batch 2/5 (20000 records)...
Processing batch 3/5 (20000 records)...
Processing batch 4/5 (20000 records)...
Processing batch 5/5 (20000 records)...
Validating data constraints...
Saving to singapore_loan_data_100k_enhanced.csv.gz...

=== GENERATION COMPLETE ===
Generated 100,000 customer records
Time taken: 109.18 seconds
Records per second: 916
File saved: singapore_loan_data_100k_enhanced.csv.gz

=== ENHANCED DATA QUALITY REPORT ===
Total customers: 100,000

Financial Stress Distribution:
  Medium stress: 51,929 (51.9%)
  High stress: 32,508 (32.5%)
  Extreme High stress: 13,833 (13.8%)
  Low stress: 1,730 (1.7%)

Savings Distribution:
  Positive: 85,877 (85.9%)
  Negative: 14,123 (14.1%)

Customer ID Starting Digits:
  Starts with 9: 50,122 (50.1%)
  Starts with 8: 49,878 (49.9%)
Customer IDs with three consecutive digits: 0 