# Finance Sample Data Generation - Summary

This notebook generates synthetic sample data for the Finance domain, including invoices, payments, and account tables, based on customer and order data for both ADB and Fabric channels.

**Inputs:**
- Customer, Account, Product, and Location sample CSV files
- Order and OrderPayment sample CSV files for ADB and Fabric

**Outputs:**
- Invoice, Payment, and Account sample CSV files for ADB and Fabric channels
- Each record is linked to its source order and customer, with traceable identifiers
- Invoice and Payment numbers use prefixes (IN-, PM-) and are derived from Order numbers
- Account tables are generated per channel, reflecting balances and statuses

**Logic:**
- Loads input data and prints summary statistics
- Generates invoices and payments with correct numbering and channel separation
- Aggregates account balances and statuses per channel
- Writes all outputs to the specified output directory

Use this notebook to create realistic finance sample data for analytics, testing, or demonstration purposes.

In [14]:
import pandas as pd
import uuid
import os
import glob
from datetime import datetime

base_dir = r'C:\Users\v-yamini3\Documents\GitRepos\MAAG-Repo-Latest\MaagDataFoundationForAI\architecture\Code_for_Architecture_Design\generate_samples'
output_dir = os.path.join(base_dir, 'finance_generation_output')
os.makedirs(output_dir, exist_ok=True)

# Load customer data
customer_file = os.path.join(base_dir, 'order_generation_input', 'Customer_Samples.csv')
customers = pd.read_csv(customer_file)

# Find all order files for Fabric and ADB
order_files = glob.glob(os.path.join(base_dir, 'order_generation_output', 'Order_Samples_*.csv'))
orderpayment_files = glob.glob(os.path.join(base_dir, 'order_generation_output', 'OrderPayment_*.csv'))

# Status mapping
invoice_status_map = {
    'Completed': 'Issued',
    'Shipped': 'Issued',
    'Pending': 'Issued',
    'Cancelled': 'Cancelled',
    'Returned': 'Refunded'
}
payment_status_map = {
    'Completed': 'Completed',
    'Shipped': 'Completed',
    'Pending': 'Pending',
    'Cancelled': 'Failed',
    'Returned': 'Refunded'
}

# Prepare account aggregation per channel
account_balances = {'ADB': {}, 'Fabric': {}}

for order_file in order_files:
    if 'ADB' in order_file:
        channel = 'ADB'
        channel_code = 'A'
    elif 'F' in order_file:
        channel = 'Fabric'
        channel_code = 'F'
    elif 'ADB' in order_file:
        channel = 'ADB'
        channel_code = 'A'
    else:
        channel = 'Other'
        channel_code = 'X'
    orderpayment_file = [f for f in orderpayment_files if channel in f]

    orders = pd.read_csv(order_file)
    orderpayments = pd.read_csv(orderpayment_file[0]) if orderpayment_file else pd.DataFrame()

    invoice_list = []
    payment_list = []

    for idx, row in orders.iterrows():
        order_status = row.get('OrderStatus', 'Completed')
        customer_id = row.get('CustomerId', '')
        order_id = row.get('OrderId', '')
        order_number = row.get('OrderNumber', f"{channel_code}{1000+idx}")
        invoice_date = pd.to_datetime(row.get('OrderDate', datetime(2018, 1, 10))).date()
        due_date = invoice_date
        subtotal = row.get('SubTotal', 0.0)
        tax_amount = row.get('TaxAmount', 0.0)
        total_amount = row.get('OrderTotal', 0.0)
        invoice_status = invoice_status_map.get(order_status, 'Issued')

        invoice_id = str(uuid.uuid4())
        invoice_number = f"IN-{order_number}"

        invoice_list.append({
            "InvoiceId": invoice_id,
            "InvoiceNumber": invoice_number,
            "CustomerId": customer_id,
            "OrderId": order_id,
            "InvoiceDate": invoice_date,
            "DueDate": due_date,
            "SubTotal": subtotal,
            "TaxAmount": tax_amount,
            "TotalAmount": total_amount,
            "InvoiceStatus": invoice_status
        })

        # Find payment(s) for this order
        payments_for_order = orderpayments[orderpayments['OrderId'] == order_id] if not orderpayments.empty else pd.DataFrame()
        if not payments_for_order.empty:
            for _, pay in payments_for_order.iterrows():
                payment_id = str(uuid.uuid4())
                payment_status = payment_status_map.get(pay.get('PaymentStatus', order_status), 'Completed')
                payment_method = pay.get('PaymentMethod', 'VISA')
                payment_amount = pay.get('PaymentAmount', total_amount)
                payment_date = pd.to_datetime(pay.get('PaymentDate', invoice_date)).date()
                payment_number = f"PM-{order_number}"
                payment_list.append({
                    "PaymentId": payment_id,
                    "PaymentNumber": payment_number,
                    "InvoiceId": invoice_id,
                    "CustomerId": customer_id,
                    "PaymentDate": payment_date,
                    "PaymentAmount": payment_amount,
                    "PaymentMethod": payment_method,
                    "PaymentStatus": payment_status
                })
                if payment_status == "Completed":
                    account_balances[channel].setdefault(customer_id, 0.0)
                    account_balances[channel][customer_id] -= payment_amount
        else:
            payment_id = str(uuid.uuid4())
            payment_status = payment_status_map.get(order_status, 'Completed')
            payment_method = "VISA"
            payment_amount = total_amount
            payment_date = invoice_date
            payment_number = f"PM-{order_number}"
            payment_list.append({
                "PaymentId": payment_id,
                "PaymentNumber": payment_number,
                "InvoiceId": invoice_id,
                "CustomerId": customer_id,
                "PaymentDate": payment_date,
                "PaymentAmount": payment_amount,
                "PaymentMethod": payment_method,
                "PaymentStatus": payment_status
            })
            if payment_status == "Completed":
                account_balances[channel].setdefault(customer_id, 0.0)
                account_balances[channel][customer_id] -= payment_amount

        account_balances[channel].setdefault(customer_id, 0.0)
        account_balances[channel][customer_id] += total_amount

    invoices_df = pd.DataFrame(invoice_list)
    payments_df = pd.DataFrame(payment_list)
    invoices_df.to_csv(os.path.join(output_dir, f'Invoice_Samples_{channel}.csv'), index=False)
    payments_df.to_csv(os.path.join(output_dir, f'Payment_Samples_{channel}.csv'), index=False)
    print(f"✅ {channel} Invoice and Payment sample data generated!")

# Find earliest order date for each customer per channel
customer_first_order = {'ADB': {}, 'Fabric': {}}
for order_file in order_files:
    if 'F' in order_file:
        channel = 'Fabric'
    elif 'A' in order_file:
        channel = 'ADB'
    else:
        channel = 'Other'
    orders = pd.read_csv(order_file)
    for _, row in orders.iterrows():
        customer_id = row['CustomerId']
        order_date = pd.to_datetime(row['OrderDate'], errors='coerce')
        if pd.notnull(order_date):
            order_date_val = order_date.date()
            if customer_id not in customer_first_order[channel] or (
                isinstance(customer_first_order[channel][customer_id], datetime) and order_date_val < customer_first_order[channel][customer_id]
            ) or (
                not isinstance(customer_first_order[channel][customer_id], datetime)
            ):
                customer_first_order[channel][customer_id] = order_date_val

# Generate Account tables for ADB and Fabric
for channel in ['ADB', 'Fabric']:
    accounts = []
    for idx, row in customers.iterrows():
        customer_id = row['CustomerId']
        balance = account_balances[channel].get(customer_id, 0.0)
        account_status = "Active" if balance == 0 else "Overdue"
        account_id = str(uuid.uuid4())
        account_number = f"ACC-{channel}-{1000+idx}"
        account_type = "Receivable"
        created_date = customer_first_order[channel].get(customer_id, datetime(2018, 1, 10).date())
        closed_date = None
        currency = "USD"
        description = f"Customer receivable account ({channel})"
        accounts.append({
            "AccountId": account_id,
            "AccountNumber": account_number,
            "CustomerId": customer_id,
            "AccountType": account_type,
            "AccountStatus": account_status,
            "CreatedDate": created_date,
            "ClosedDate": closed_date,
            "Balance": balance,
            "Currency": currency,
            "Description": description
        })
    accounts_df = pd.DataFrame(accounts)
    accounts_df.to_csv(os.path.join(output_dir, f'Account_Samples_{channel}.csv'), index=False)
    print(f"✅ Account sample data generated for {channel}!")

  invoice_date = pd.to_datetime(row.get('OrderDate', datetime(2018, 1, 10))).date()


✅ ADB Invoice and Payment sample data generated!
✅ Fabric Invoice and Payment sample data generated!
✅ Fabric Invoice and Payment sample data generated!


  order_date = pd.to_datetime(row['OrderDate'], errors='coerce')


✅ Account sample data generated for ADB!
✅ Account sample data generated for Fabric!
