In [30]:
from faker import Faker
import random
import pandas as pd
from datetime import datetime, timedelta
from sqlalchemy import create_engine

fake = Faker()

# Generate Customers
def generate_customers(n=100000):
    customers = []
    for i in range(n):
        customers.append({
            "customer_id": fake.uuid4(),
            "first_name": fake.first_name(),
            "last_name": fake.last_name(),
            "email": fake.unique.email(),
            "phone": fake.phone_number(),
            "dob": fake.date_of_birth(minimum_age=18, maximum_age=65).strftime("%Y-%m-%d"),
            "gender": random.choice(["Male", "Female", "Other"]),
            "created_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        })
        if (i + 1) % 100000 == 0:
            print(f"✅ Generated {i + 1} / {n} customers")
    df = pd.DataFrame(customers)
    df.to_csv("customers.csv", index=False)
    print(f"🎉 Customers generation completed: {len(df)} rows")
    return df

# Generate Accounts
def generate_accounts(customers_df):
    accounts = []
    for i, customer in enumerate(customers_df.itertuples(index=False)):
        accounts.append({
            "account_id": fake.uuid4(),
            "customer_id": customer.customer_id,
            "account_type": random.choice(["Savings", "Checking", "Business"]),
            "account_balance": round(random.uniform(100, 100000), 2),
            "currency": "USD",
            "created_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        })
        if (i + 1) % 100000 == 0:
            print(f"✅ Generated {i + 1} / {len(customers_df)} accounts")
    df = pd.DataFrame(accounts)
    df.to_csv("accounts.csv", index=False)
    print(f"🎉 Accounts generation completed: {len(df)} rows")
    return df

# Run the Generators (Smaller sample sizes for testing)
customers_df = generate_customers(100000)
accounts_df = generate_accounts(customers_df)

✅ Generated 100000 / 100000 customers
🎉 Customers generation completed: 100000 rows
✅ Generated 100000 / 100000 accounts
🎉 Accounts generation completed: 100000 rows


In [28]:
import os
# Check if the file exists
if os.path.exists(file_path):
    os.remove("customers.csv")  # Delete the file
    print(f"✅ {file_path} has been deleted.")
else:
    print("❌ File does not exist.")

✅ transactions.csv has been deleted.


In [29]:
import os
# Check if the file exists
if os.path.exists(file_path):
    os.remove("accounts.csv")  # Delete the file
    print(f"✅ {file_path} has been deleted.")
else:
    print("❌ File does not exist.")

✅ transactions.csv has been deleted.


In [9]:
# Check accounts_df before passing it to the function
print(accounts_df.head())  # Ensure that the dataframe contains account_id


                             account_id                           customer_id  \
0  3cd69138-cb49-4467-b756-a2d61b8040f2  09cc5550-3361-4683-99bc-0cba777ebaa6   
1  44a739fc-74ba-4c41-a4c9-ddb76f0c99ff  13265d4b-1851-46c4-9f0a-296af6beaf89   
2  ecf4c747-f48f-41d9-b070-4c4d2897e691  e42b8c5e-fc14-4d4c-9ea1-8564ea6381c5   
3  0463f8d7-f76c-4943-92d8-3496b8f8b265  b5342ae2-fcb2-4c1d-9ace-bdfbb3a11081   
4  b64cc214-b9fb-48e0-b28a-f8b0116c599b  0d64f834-1492-4905-b394-6f350e235816   

  account_type  account_balance currency           created_at  
0      Savings         71172.92      USD  2025-03-21 13:00:16  
1      Savings         92456.74      USD  2025-03-21 13:00:16  
2      Savings          6416.11      USD  2025-03-21 13:00:16  
3     Business         25404.55      USD  2025-03-21 13:00:16  
4     Business         24279.80      USD  2025-03-21 13:00:16  


In [36]:
def generate_transactions(accounts_df, n=400000, batch_size=100000):
    transactions = []
    account_ids = accounts_df["account_id"].values  # Directly access the numpy array
    for i in range(n):
        transactions.append({
            "transaction_id": fake.uuid4(),
            "account_id": random.choice(account_ids),  # Use numpy array for faster random selection
            "transaction_type": random.choice(["Deposit", "Withdrawal", "Transfer", "Payment"]),
            "amount": round(random.uniform(5, 5000), 2),
            "transaction_date": fake.date_time_this_year().strftime("%Y-%m-%d %H:%M:%S"),
            "status": random.choice(["Completed", "Pending", "Failed"])
        })
        
        # Write to CSV every batch_size records
        if (i + 1) % batch_size == 0:
            print(f"✅ Generated {i + 1} / {n} transactions")
            df = pd.DataFrame(transactions)
            df.to_csv("transactions.csv", mode='a', header=(i == 0), index=False)  # Append to CSV
            transactions.clear()  # Clear the list to free memory
    
    # Final write if the remaining transactions are less than batch_size
    if transactions:
        df = pd.DataFrame(transactions)
        df.to_csv("transactions.csv", mode='a', header=False, index=False)
    
    return df

# Run the function with 100,000 transactions (adjust the number as needed)
transactions_df = generate_transactions(accounts_df, 400000)

✅ Generated 100000 / 400000 transactions
✅ Generated 200000 / 400000 transactions
✅ Generated 300000 / 400000 transactions
✅ Generated 400000 / 400000 transactions


In [32]:
import os
# Check if the file exists
if os.path.exists(file_path):
    os.remove("transactions.csv")  # Delete the file
    print(f"✅ {file_path} has been deleted.")
else:
    print("❌ File does not exist.")

❌ File does not exist.


In [37]:
def generate_loans(customers_df, n=60000, batch_size=10000):
    loans = []
    for i in range(n):
        loans.append({
            "loan_id": fake.uuid4(),
            "customer_id": random.choice(customers_df["customer_id"].tolist()),  # Link to existing customers
            "loan_type": random.choice(["Mortgage", "Personal", "Auto Loan"]),
            "loan_amount": round(random.uniform(5000, 500000), 2),
            "interest_rate": round(random.uniform(1.5, 10), 2),
            "loan_status": random.choice(["Active", "Closed", "Defaulted"]),
            "due_date": fake.future_date(end_date="+1y").strftime("%Y-%m-%d")
        })
        
        # Write to CSV every batch_size records
        if (i + 1) % batch_size == 0:
            print(f"✅ Generated {i + 1} / {n} loans")
            df = pd.DataFrame(loans)
            df.to_csv("loans.csv", mode='a', header=(i == 0), index=False)  # Append to CSV
            loans.clear()  # Clear the list to free memory
    
    # Final write if the remaining loans are less than batch_size
    if loans:
        df = pd.DataFrame(loans)
        df.to_csv("loans.csv", mode='a', header=False, index=False)
    
    return df

# Run the function with 100,000 loans
loans_df = generate_loans(customers_df, 60000)

✅ Generated 10000 / 60000 loans
✅ Generated 20000 / 60000 loans
✅ Generated 30000 / 60000 loans
✅ Generated 40000 / 60000 loans
✅ Generated 50000 / 60000 loans
✅ Generated 60000 / 60000 loans


In [26]:
data = pd.read_csv("loans.csv")

In [35]:
import os
# Check if the file exists
if os.path.exists(file_path):
    os.remove("loans.csv")  # Delete the file
    print(f"✅ {file_path} has been deleted.")
else:
    print("❌ File does not exist.")

❌ File does not exist.


In [38]:
def generate_support_tickets(customers_df, n=80000, batch_size=10000):
    tickets = []
    for i in range(n):
        tickets.append({
            "ticket_id": fake.uuid4(),
            "customer_id": random.choice(customers_df["customer_id"].tolist()),  # Link to existing customers
            "issue_type": random.choice(["Fraud", "Loan Inquiry", "Transaction Issue"]),
            "status": random.choice(["Open", "Closed", "In Progress"]),
            "created_at": fake.date_time_this_year().strftime("%Y-%m-%d %H:%M:%S")
        })
        
        # Write to CSV every batch_size records
        if (i + 1) % batch_size == 0:
            print(f"✅ Generated {i + 1} / {n} support tickets")
            df = pd.DataFrame(tickets)
            df.to_csv("support_tickets.csv", mode='a', header=(i == 0), index=False)  # Append to CSV
            tickets.clear()  # Clear the list to free memory
    
    # Final write if the remaining tickets are less than batch_size
    if tickets:
        df = pd.DataFrame(tickets)
        df.to_csv("support_tickets.csv", mode='a', header=False, index=False)
    
    return df

# Run the function with 80,000 support tickets
support_tickets_df = generate_support_tickets(customers_df, 80000)

✅ Generated 10000 / 80000 support tickets
✅ Generated 20000 / 80000 support tickets
✅ Generated 30000 / 80000 support tickets
✅ Generated 40000 / 80000 support tickets
✅ Generated 50000 / 80000 support tickets
✅ Generated 60000 / 80000 support tickets
✅ Generated 70000 / 80000 support tickets
✅ Generated 80000 / 80000 support tickets


In [39]:
def generate_fraud_incidents(customers_df, n=40000, batch_size=10000):
    incidents = []
    for i in range(n):
        incidents.append({
            "incident_id": fake.uuid4(),
            "customer_id": random.choice(customers_df["customer_id"].tolist()),  # Link to existing customers
            "incident_type": random.choice(["Phishing", "Unauthorized Transaction", "Account Breach"]),
            "incident_date": fake.date_time_this_year().strftime("%Y-%m-%d %H:%M:%S"),
            "status": random.choice(["Investigating", "Resolved", "Pending"])
        })
        
        # Write to CSV every batch_size records
        if (i + 1) % batch_size == 0:
            print(f"✅ Generated {i + 1} / {n} fraud incidents")
            df = pd.DataFrame(incidents)
            df.to_csv("fraud_incidents.csv", mode='a', header=(i == 0), index=False)  # Append to CSV
            incidents.clear()  # Clear the list to free memory
    
    # Final write if the remaining incidents are less than batch_size
    if incidents:
        df = pd.DataFrame(incidents)
        df.to_csv("fraud_incidents.csv", mode='a', header=False, index=False)
    
    return df

# Run the function with 40,000 fraud incidents
fraud_df = generate_fraud_incidents(customers_df, 40000)


✅ Generated 10000 / 40000 fraud incidents
✅ Generated 20000 / 40000 fraud incidents
✅ Generated 30000 / 40000 fraud incidents
✅ Generated 40000 / 40000 fraud incidents
