In [33]:
# importing necessary libraries
import random
import sqlite3
from datetime import date, datetime, timedelta

from faker import Faker

# Set seed for reproducibility.
# Two independent generators are seeded: the stdlib `random` module (used for
# choices, amounts and counts) and this Faker instance (used for names, dates
# and sentences).
# NOTE(review): later cells ask Faker for dates relative to 'today'
# (e.g. '-10y', '-5y'), so generated dates presumably still shift with the day
# the notebook is run even with a fixed seed - confirm if exact date
# reproducibility is required.
SEED = 123
random.seed(SEED)        # Seed Python's built-in random module
fake = Faker()
fake.seed_instance(SEED) # Seed Faker's random generator


In [34]:
# Initialize database connection
conn = sqlite3.connect('bank_data.db')
cursor = conn.cursor()

# SQLite does not enforce FOREIGN KEY clauses unless this pragma is switched on
# for each connection; enable it so the references declared below are checked
# whenever rows are inserted.
cursor.execute("PRAGMA foreign_keys = ON")

# The implicit datetime.date adapter shipped with sqlite3 is deprecated since
# Python 3.12. Register an explicit one so dates are stored as ISO 8601 text
# (YYYY-MM-DD), which is the same on-disk format the default adapter produced.
sqlite3.register_adapter(date, lambda value: value.isoformat())

# --------------------------
# Create tables in the database
# --------------------------

# Drop tables if they already exist.
# Child tables go first: with foreign keys enforced, dropping a parent table
# that is still referenced by existing child rows would fail.
cursor.execute("DROP TABLE IF EXISTS transactions")
cursor.execute("DROP TABLE IF EXISTS loans")
cursor.execute("DROP TABLE IF EXISTS accounts")
cursor.execute("DROP TABLE IF EXISTS customers")

# Create customers table with province instead of country
cursor.execute("""
CREATE TABLE IF NOT EXISTS customers (
    customer_id INTEGER PRIMARY KEY,
    first_name TEXT,
    last_name TEXT,
    gender TEXT,
    date_of_birth DATE,
    join_date DATE,
    income NUMERIC,
    city TEXT,
    province TEXT
)
""")

# Accounts belong to a customer (customer_id -> customers.customer_id)
cursor.execute("""
CREATE TABLE IF NOT EXISTS accounts (
    account_id INTEGER PRIMARY KEY,
    customer_id INTEGER,
    account_type TEXT,
    balance NUMERIC,
    open_date DATE,
    status TEXT,
    FOREIGN KEY (customer_id) REFERENCES customers(customer_id)
)
""")

# Transactions belong to an account (account_id -> accounts.account_id)
cursor.execute("""
CREATE TABLE IF NOT EXISTS transactions (
    transaction_id INTEGER PRIMARY KEY,
    account_id INTEGER,
    transaction_date DATE,
    transaction_type TEXT,
    amount NUMERIC,
    description TEXT,
    FOREIGN KEY (account_id) REFERENCES accounts(account_id)
)
""")

# Loans belong to a customer (customer_id -> customers.customer_id)
cursor.execute("""
CREATE TABLE IF NOT EXISTS loans (
    loan_id INTEGER PRIMARY KEY,
    customer_id INTEGER,
    loan_type TEXT,
    loan_amount NUMERIC,
    issue_date DATE,
    due_date DATE,
    balance NUMERIC,
    status TEXT,
    FOREIGN KEY (customer_id) REFERENCES customers(customer_id)
)
""")

conn.commit()

### 👥 Customer Data Generation

- Creates **100 fake customers** using the `Faker` library.
- Each customer includes:
  - Name, gender, birth date, join date
  - Annual income, city, and province
- This table is the foundation for accounts and loans.

In [35]:

# Dictionary of Canadian provinces and a few corresponding cities.
# Keys are province names; each value is the list of cities that may be paired
# with that province. generate_customers() picks a province from the keys and
# then a city from that province's list, so the two always match.
# Only the ten provinces are listed here; the territories are not included.
canada_provinces = {
    "Ontario": ["Toronto", "Ottawa", "Mississauga", "Hamilton"],
    "Quebec": ["Montreal", "Quebec City", "Laval", "Gatineau"],
    "British Columbia": ["Vancouver", "Victoria", "Surrey", "Kelowna"],
    "Alberta": ["Calgary", "Edmonton", "Red Deer", "Lethbridge"],
    "Manitoba": ["Winnipeg", "Brandon", "Steinbach"],
    "Saskatchewan": ["Saskatoon", "Regina", "Prince Albert"],
    "Nova Scotia": ["Halifax", "Sydney", "Dartmouth"],
    "New Brunswick": ["Moncton", "Saint John", "Fredericton"],
    "Newfoundland and Labrador": ["St. John's", "Corner Brook"],
    "Prince Edward Island": ["Charlottetown", "Summerside"]
}

# Generate fake customer data with valid Canadian provinces and cities
def generate_customers(n=100):
    """Build ``n`` synthetic customer rows ready for a bulk insert.

    Args:
        n: Number of customer rows to produce.

    Returns:
        A list of tuples ordered as
        (first_name, last_name, gender, date_of_birth, join_date, income,
        city, province). The city is always drawn from the list that
        ``canada_provinces`` holds for the chosen province.
    """
    province_names = list(canada_provinces.keys())
    rows = []

    for _ in range(n):
        gender = random.choice(['Male', 'Female'])

        # Pick the Faker name generator that matches the gender, then call it
        make_first_name = fake.first_name_male if gender == 'Male' else fake.first_name_female
        given_name = make_first_name()
        family_name = fake.last_name()

        born_on = fake.date_of_birth(minimum_age=18, maximum_age=75)
        joined_on = fake.date_between(start_date='-10y', end_date='today')
        annual_income = round(random.uniform(20000, 150000), 2)

        # Province first, then one of that province's cities
        home_province = random.choice(province_names)
        home_city = random.choice(canada_provinces[home_province])

        rows.append((
            given_name,
            family_name,
            gender,
            born_on,
            joined_on,
            annual_income,
            home_city,
            home_province,
        ))

    return rows

# Insert customers into the database
customers = generate_customers(100)

# Using the connection as a context manager commits when the bulk insert
# succeeds and rolls back if it raises, so a failed run never leaves a
# half-written batch sitting in an open transaction.
with conn:
    cursor.executemany("""
INSERT INTO customers (first_name, last_name, gender, date_of_birth, join_date, income, city, province)
VALUES (?, ?, ?, ?, ?, ?, ?, ?)
""", customers)


### 🏦 Account Data Generation

- Each customer is assigned **1 or 2 bank accounts**.
- Account attributes include:
  - Type (`savings` or `checking`)
  - Balance (random between \$0 and \$50,000)
  - Open date and account status (`active`, `closed`, `frozen`)
- This simulates how customers may hold multiple account types with different balances and states.

In [36]:
# --------------------------
# Generate and insert fake accounts
# --------------------------

def generate_accounts():
    """Build one or two synthetic account rows for every customer in the database.

    Reads ``customer_id`` and ``join_date`` from the ``customers`` table through
    the shared ``cursor``. An account's open date is drawn between the
    customer's join date and today, so no account predates its owner joining
    the bank.

    Returns:
        A list of tuples ordered as
        (customer_id, account_type, balance, open_date, status).
    """
    account_types = ['savings', 'checking']
    statuses = ['active', 'closed', 'frozen']
    accounts = []

    # ORDER BY pins the iteration order, so a fixed seed always assigns the
    # same random draws to the same customers.
    cursor.execute("SELECT customer_id, join_date FROM customers ORDER BY customer_id")
    customer_rows = cursor.fetchall()

    for customer_id, join_date in customer_rows:
        # Dates are stored as ISO text (YYYY-MM-DD) and come back as strings
        # unless the connection was opened with a converter.
        if isinstance(join_date, str):
            earliest_open = datetime.strptime(join_date[:10], "%Y-%m-%d").date()
        elif join_date is None:
            earliest_open = '-10y'  # no join date recorded: fall back to the 10-year window
        else:
            earliest_open = join_date

        num_accounts = random.choice([1, 2])  # Each customer has 1 or 2 accounts
        for _ in range(num_accounts):
            account_type = random.choice(account_types)
            balance = round(random.uniform(0, 50000), 2)
            open_date = fake.date_between(start_date=earliest_open, end_date='today')
            # Weighted so that most accounts are active
            status = random.choices(statuses, weights=[0.8, 0.15, 0.05])[0]
            accounts.append((customer_id, account_type, balance, open_date, status))

    return accounts

# Insert accounts into the database
accounts = generate_accounts()

# Commit on success, roll back on failure: a partial batch is never left
# pending in an open transaction.
with conn:
    cursor.executemany("""
INSERT INTO accounts (customer_id, account_type, balance, open_date, status)
VALUES (?, ?, ?, ?, ?)
""", accounts)

### 🧾 Transaction Data Generation

This section creates fake transaction records for each active bank account in the database.

- Each active account receives between **10 and 50 transactions**, simulating typical customer activity.
- The transaction types include:
  - `deposit` (most frequent)
  - `withdrawal`
  - `payment`
  - `transfer`
- **Weights** are applied to prioritize deposits and withdrawals, mimicking realistic banking behavior.
- Each transaction has:
  - A **random amount** between \$10 and \$5,000
  - A **random description** (a short placeholder sentence generated by Faker)
  - A **date within the last 5 years**

This data will later be used for behavioral and financial pattern analysis.


In [37]:
# --------------------------
# Generate and insert fake transactions
# --------------------------

def generate_transactions():
    """Build 10 to 50 synthetic transaction rows for every active account.

    Reads ``account_id`` and ``open_date`` of active accounts from the
    ``accounts`` table through the shared ``cursor``. Transaction dates fall
    within the last five years but never before the account was opened.

    Returns:
        A list of tuples ordered as
        (account_id, transaction_date, transaction_type, amount, description).
    """
    transaction_types = ['deposit', 'withdrawal', 'payment', 'transfer']
    transactions = []

    # Start of the five-year window (5 x 365 days back from today)
    window_start = (datetime.now() - timedelta(days=5 * 365)).date()

    # ORDER BY pins the iteration order, so a fixed seed always assigns the
    # same random draws to the same accounts.
    cursor.execute(
        "SELECT account_id, open_date FROM accounts WHERE status = 'active' ORDER BY account_id"
    )
    account_rows = cursor.fetchall()

    for account_id, open_date in account_rows:
        # Dates are stored as ISO text (YYYY-MM-DD) and come back as strings
        # unless the connection was opened with a converter.
        if isinstance(open_date, str):
            opened_on = datetime.strptime(open_date[:10], "%Y-%m-%d").date()
        else:
            opened_on = open_date

        # An account cannot have activity before it existed
        if opened_on is None:
            earliest = window_start
        else:
            earliest = max(opened_on, window_start)

        num_transactions = random.randint(10, 50)
        for _ in range(num_transactions):
            transaction_date = fake.date_between(start_date=earliest, end_date='today')
            transaction_type = random.choices(
                transaction_types,
                weights=[0.4, 0.3, 0.2, 0.1]  # More deposits and withdrawals
            )[0]
            amount = round(random.uniform(10, 5000), 2)
            description = fake.sentence(nb_words=4)
            transactions.append((account_id, transaction_date, transaction_type, amount, description))

    return transactions

# Insert transactions into the database
transactions = generate_transactions()

# Commit on success, roll back on failure: a partial batch is never left
# pending in an open transaction.
with conn:
    cursor.executemany("""
INSERT INTO transactions (account_id, transaction_date, transaction_type, amount, description)
VALUES (?, ?, ?, ?, ?)
""", transactions)


### 💸 Loan Data Generation

- About 50% of customers receive **1 or 2 loans** each.
- Loan types include:
  - `personal`, `mortgage`, `auto`
- Loan fields:
  - Loan amount, issue date, due date
  - Status (`current`, `delinquent`, `paid`)
  - Remaining balance (calculated based on status)
- This simulates realistic debt profiles for customers.

In [38]:
# --------------------------
# Generate and insert fake loans
# --------------------------

def generate_loans():
    """Build synthetic loan rows for roughly half of the customers.

    Reads every ``customer_id`` from the ``customers`` table through the shared
    ``cursor``. Each customer has a 50% chance of holding loans; those who do
    get one or two. The amount range depends on the loan type and the
    remaining balance depends on the loan status.

    Returns:
        A list of tuples ordered as
        (customer_id, loan_type, loan_amount, issue_date, due_date, balance, status).
    """
    loan_types = ['personal', 'mortgage', 'auto']
    loan_statuses = ['current', 'delinquent', 'paid']

    # (low, high) amount range per loan type; anything else is a personal loan
    amount_bounds = {
        'mortgage': (200000, 500000),  # Higher range for mortgage
        'auto': (10000, 50000),        # Medium range for auto loans
    }
    personal_bounds = (5000, 15000)    # Smaller range for personal loans

    # Share of the original amount still owed; anything not 'current' here is delinquent
    current_share = (0.3, 0.9)
    delinquent_share = (0.8, 1.1)

    cursor.execute("SELECT customer_id FROM customers")
    loans = []

    for (customer_id,) in cursor.fetchall():
        # Half of the customers hold no loan at all
        if random.random() >= 0.5:
            continue

        for _ in range(random.choice([1, 2])):
            loan_type = random.choice(loan_types)
            low, high = amount_bounds.get(loan_type, personal_bounds)
            loan_amount = round(random.uniform(low, high), 2)

            issue_date = fake.date_between(start_date='-10y', end_date='-1y')
            term_days = random.randint(365, 365 * 5)
            due_date = issue_date + timedelta(days=term_days)
            status = random.choices(loan_statuses, weights=[0.7, 0.1, 0.2])[0]

            # Balance depends on the status
            if status == 'paid':
                balance = 0.0
            else:
                share_low, share_high = current_share if status == 'current' else delinquent_share
                balance = round(loan_amount * random.uniform(share_low, share_high), 2)

            loans.append((customer_id, loan_type, loan_amount, issue_date, due_date, balance, status))

    return loans

# Insert loans into the database
loans = generate_loans()

# Commit on success, roll back on failure: a partial batch is never left
# pending in an open transaction.
with conn:
    cursor.executemany("""
INSERT INTO loans (customer_id, loan_type, loan_amount, issue_date, due_date, balance, status)
VALUES (?, ?, ?, ?, ?, ?, ?)
""", loans)