# Generate Customer Account Sample Data

## Overview
This notebook generates sample data for the CustomerAccount table with specific business rules and distributions.

## Business Rules
- **513 customers** with CustomerId values CID-001 to CID-513
- **Two ParentAccountId values**: PA-1 (account name "Fabric") and PA-2 (account name "ADB")
- **About 80% of customers** have two accounts (one for each ParentAccountId)
- **About 20% of customers** have only one account (randomly assigned to PA-1 with 70% probability or to PA-2 with 30% probability)

## Output
- File: `C:\temp\samples\output\CustomerAccount_Samples.csv`
- Contains CustomerAccountId, ParentAccountId, CustomerAccountName, CustomerId, IsoCurrencyCode fields

---

In [None]:

import pandas as pd
import numpy as np
import random
import os
from datetime import datetime, date

# Seed both random sources so repeated runs yield the same data set
random.seed(42)
np.random.seed(42)

# Configuration
CUSTOMER_COUNT = 513  # how many customers to generate
OUTPUT_FOLDER = "C:\\temp\\samples\\output"  # destination directory
OUTPUT_FILE = "CustomerAccount_Samples.csv"

# Make sure the destination directory is present
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# Start clean: drop any output file left behind by a previous run
output_path = os.path.join(OUTPUT_FOLDER, OUTPUT_FILE)
if os.path.exists(output_path):
    os.remove(output_path)
    print(f"🗑️ Removed existing file: {output_path}")

# Run banner
print(
    "🎯 GENERATING CUSTOMER ACCOUNT SAMPLE DATA",
    f"Customer Count: {CUSTOMER_COUNT}",
    f"Output: {OUTPUT_FOLDER}\\{OUTPUT_FILE}",
    "=" * 50,
    sep="\n",
)

# Generate customer account data based on business rules
def generate_customer_accounts(customer_count=None, two_account_rate=0.8,
                               pa1_single_rate=0.7, rng=None):
    """Generate customer account records according to the business rules.

    Each customer either gets two accounts (one under PA-1 and one under
    PA-2, in that order) or a single account under one of the two parents.
    Account ids are assigned sequentially (CA-0001, CA-0002, ...) in the
    order the records are produced.

    Args:
        customer_count: Number of customers to generate (CID-001 ...).
            Defaults to the module-level ``CUSTOMER_COUNT`` when omitted.
        two_account_rate: Probability that a customer gets both accounts.
        pa1_single_rate: For single-account customers, probability that
            the account belongs to PA-1 (otherwise PA-2).
        rng: Object exposing ``random()`` returning a float in [0, 1).
            Defaults to the global ``random`` module, so ``random.seed``
            controls reproducibility.

    Returns:
        A list of dicts with keys CustomerAccountId, ParentAccountId,
        CustomerAccountName, CustomerId and IsoCurrencyCode.

    Raises:
        ValueError: If either rate lies outside the range [0, 1].
    """
    if customer_count is None:
        customer_count = CUSTOMER_COUNT
    for label, rate in (("two_account_rate", two_account_rate),
                        ("pa1_single_rate", pa1_single_rate)):
        if not 0.0 <= rate <= 1.0:
            raise ValueError(f"{label} must be between 0 and 1, got {rate!r}")
    if rng is None:
        rng = random

    # CustomerAccountName is fully determined by the parent account
    account_names = {"PA-1": "Fabric", "PA-2": "ADB"}
    accounts = []

    for customer_number in range(1, customer_count + 1):
        customer_id = f"CID-{customer_number:03d}"

        # One draw decides two-vs-one account; a second draw is consumed
        # only for single-account customers. Keeping this draw order means
        # a seeded run produces the same records as before.
        if rng.random() < two_account_rate:
            parent_ids = ("PA-1", "PA-2")
        elif rng.random() < pa1_single_rate:
            parent_ids = ("PA-1",)
        else:
            parent_ids = ("PA-2",)

        for parent_id in parent_ids:
            accounts.append({
                # Next sequential id: one more than the records emitted so far
                'CustomerAccountId': f"CA-{len(accounts) + 1:04d}",
                'ParentAccountId': parent_id,
                'CustomerAccountName': account_names[parent_id],
                'CustomerId': customer_id,
                'IsoCurrencyCode': 'USD'
            })

    return accounts

print("🔄 Generating customer account data...")

# Produce every account record, then load the records into a DataFrame
accounts_data = generate_customer_accounts()
df = pd.DataFrame(data=accounts_data)

print("✅ Customer account generation complete!")

🎯 GENERATING CUSTOMER ACCOUNT SAMPLE DATA
Customer Count: 513
Output: C:\temp\samples\output\CustomerAccount_Samples.csv
🔄 Generating customer account data...
✅ Customer account generation complete!


In [2]:
# Report distributions, verify the business rules, then write the CSV.
# Relies on `df`, `OUTPUT_FOLDER` and `OUTPUT_FILE` from the generation cell.
print("\n📊 CUSTOMER ACCOUNT DATA ANALYSIS")
print("="*50)

# Basic counts
total_accounts = len(df)
unique_customers = df['CustomerId'].nunique()
print(f"📊 Total Accounts: {total_accounts}")
print(f"👥 Unique Customers: {unique_customers}")
print(f"📈 Average Accounts per Customer: {total_accounts/unique_customers:.2f}")

# ParentAccountId distribution over all accounts
print("\n🎯 ParentAccountId Distribution:")
parent_dist = df['ParentAccountId'].value_counts()
parent_pct = df['ParentAccountId'].value_counts(normalize=True) * 100
for parent_id in ['PA-1', 'PA-2']:
    count = parent_dist.get(parent_id, 0)
    percent = parent_pct.get(parent_id, 0)
    print(f"  {parent_id}: {count:3d} accounts ({percent:5.1f}%)")

# How many customers hold 1, 2, ... accounts
print("\n🎯 Accounts per Customer Distribution:")
customer_account_counts = df['CustomerId'].value_counts()
accounts_per_customer_dist = customer_account_counts.value_counts().sort_index()

for account_count, customer_count in accounts_per_customer_dist.items():
    percent = (customer_count / unique_customers) * 100
    print(f"  {account_count} account(s): {customer_count:3d} customers ({percent:5.1f}%)")

# Verify business rules
print("\n✅ BUSINESS RULE VERIFICATION:")

# Split customers by number of accounts held
customers_with_2_accounts = len(customer_account_counts[customer_account_counts == 2])
customers_with_1_account = len(customer_account_counts[customer_account_counts == 1])

print(f"  Customers with 2 accounts: {customers_with_2_accounts:3d} ({customers_with_2_accounts/unique_customers*100:5.1f}%)")
print(f"  Customers with 1 account : {customers_with_1_account:3d} ({customers_with_1_account/unique_customers*100:5.1f}%)")

# Verify that customers with 2 accounts have one PA-1 and one PA-2
print("\n🔍 Verifying two-account customers have both PA-1 and PA-2:")
two_account_customers = customer_account_counts[customer_account_counts == 2].index

# Collect each customer's parent ids in a single grouping pass instead of
# re-filtering the whole DataFrame once per customer.
parents_by_customer = df.groupby('CustomerId')['ParentAccountId'].apply(set)
required_parents = {'PA-1', 'PA-2'}
valid_two_account_customers = sum(
    required_parents <= parents_by_customer[customer_id]
    for customer_id in two_account_customers
)

print(f"  Valid two-account customers: {valid_two_account_customers}/{len(two_account_customers)}")

# Check distribution for single-account customers
print("\n🔍 Single-account customer distribution:")
single_account_customers = customer_account_counts[customer_account_counts == 1].index
single_account_parents = df[df['CustomerId'].isin(single_account_customers)]['ParentAccountId'].value_counts()

for parent_id in ['PA-1', 'PA-2']:
    count = single_account_parents.get(parent_id, 0)
    # Guard against a data set that has no single-account customers at all
    percent = (count / len(single_account_customers)) * 100 if len(single_account_customers) > 0 else 0
    print(f"  Single accounts in {parent_id}: {count:3d} ({percent:5.1f}%)")

# Currency verification: report the real share of each currency code rather
# than a fixed 100%, so the line stays correct if more currencies are added.
print("\n💰 Currency Distribution:")
currency_dist = df['IsoCurrencyCode'].value_counts()
currency_pct = df['IsoCurrencyCode'].value_counts(normalize=True) * 100
for currency, count in currency_dist.items():
    print(f"  {currency}: {count:3d} accounts ({currency_pct[currency]:5.1f}%)")

# Display sample records
print("\n📋 Sample Records (First 15):")
sample_df = df.head(15)
print(sample_df.to_string(index=False))

# Verify CustomerAccountName distribution
print("\n🎯 CustomerAccountName Distribution:")
account_name_dist = df['CustomerAccountName'].value_counts()
account_name_pct = df['CustomerAccountName'].value_counts(normalize=True) * 100
for account_name in ['Fabric', 'ADB']:
    count = account_name_dist.get(account_name, 0)
    percent = account_name_pct.get(account_name, 0)
    print(f"  {account_name}: {count:3d} accounts ({percent:5.1f}%)")

# Save to CSV (path recomputed here so the cell does not depend on the
# `output_path` variable from the setup cell)
output_path = os.path.join(OUTPUT_FOLDER, OUTPUT_FILE)
df.to_csv(output_path, index=False)

print(f"\n💾 SAVED TO: {output_path}")
print(f"📊 Total Records: {len(df)}")
print(f"📈 Columns: {', '.join(df.columns)}")
print("\n✅ Customer account sample data generation complete!")


📊 CUSTOMER ACCOUNT DATA ANALYSIS
📊 Total Accounts: 925
👥 Unique Customers: 513
📈 Average Accounts per Customer: 1.80

🎯 ParentAccountId Distribution:
  PA-1: 475 accounts ( 51.4%)
  PA-2: 450 accounts ( 48.6%)

🎯 Accounts per Customer Distribution:
  1 account(s): 101 customers ( 19.7%)
  2 account(s): 412 customers ( 80.3%)

✅ BUSINESS RULE VERIFICATION:
  Customers with 2 accounts: 412 ( 80.3%)
  Customers with 1 account : 101 ( 19.7%)

🔍 Verifying two-account customers have both PA-1 and PA-2:
  Valid two-account customers: 412/412

🔍 Single-account customer distribution:
  Single accounts in PA-1:  63 ( 62.4%)
  Single accounts in PA-2:  38 ( 37.6%)

💰 Currency Distribution:
  USD: 925 accounts (100.0%)

📋 Sample Records (First 15):
CustomerAccountId ParentAccountId CustomerAccountName CustomerId IsoCurrencyCode
          CA-0001            PA-1              Fabric    CID-001             USD
          CA-0002            PA-2                 ADB    CID-001             USD
         