In [1]:
"""
generate_dataset.py
───────────────────
Synthetic Dutch bank-transaction generator
Date range: 2022-01-25 → 2025-05-24
"""

from __future__ import annotations
import argparse, random, calendar, math
from datetime import date, datetime, timedelta

import numpy as np
import pandas as pd
from faker import Faker


# ── public holidays NL 2022-2025 ─────────────────────────────────────────
HOLIDAYS_NL = {
    # 2022
    date(2022, 1, 1),  date(2022, 4,15), date(2022, 4,17), date(2022, 4,18),
    date(2022, 4,27),  date(2022, 5, 5), date(2022, 5,26),
    date(2022, 6, 5),  date(2022, 6, 6),
    date(2022,12,25),  date(2022,12,26),
    # 2023
    date(2023, 1, 1),  date(2023, 4, 7), date(2023, 4, 9), date(2023, 4,10),
    date(2023, 4,27),  date(2023, 5, 5), date(2023, 5,18),
    date(2023, 5,28),  date(2023, 5,29),
    date(2023,12,25),  date(2023,12,26),
    # 2024
    date(2024, 1, 1),  date(2024, 3,29), date(2024, 3,31), date(2024, 4, 1),
    date(2024, 4,27),  date(2024, 5, 5), date(2024, 5, 9),
    date(2024, 5,19),  date(2024, 5,20),
    date(2024,12,25),  date(2024,12,26),
    # 2025 (dataset stops 24 May)
    date(2025, 1, 1),  date(2025, 4,18), date(2025, 4,20), date(2025, 4,21),
    date(2025, 4,27),  date(2025, 5, 5),
}

# ── amount ranges by category (lo, hi) ───────────────────────────────────
CATEGORY_RULES = {
    "Rent":                (-4000,  -700),
    "Groceries":              (-300,    -2),
    "Entertainment":          (-60,   -10),
    "Transport":              (-20,    -5),
    "Utilities":              (-100,   -50),
    "Income":                (+2000, +6000),
    "Gift":                    (+50,   +300),
    "Loan repayment":         (-400,  -200),
    "Mortgage payment":       (-4000, -1000),
    "Big purchase":          (-3000,  -500),
    "Salary bonus":           (+300,   +800),
}

SEASONS = {1:"Winter",2:"Winter",3:"Spring",4:"Spring",5:"Spring",6:"Summer",
           7:"Summer",8:"Summer",9:"Autumn",10:"Autumn",11:"Autumn",12:"Winter"}


# ── helpers ──────────────────────────────────────────────────────────────
def initials(full: str) -> str:
    parts = full.split()
    return f'{".".join(p[0].upper() for p in parts[:-1])}. {parts[-1].upper()}'


def age_bracket(age: int, width: int = 10) -> str:
    lo = (age // width) * width
    return f"{lo}-{lo + width - 1}"


def rand(lo_hi):  # uniform helper
    return round(random.uniform(*lo_hi), 2)


# ── core generator ───────────────────────────────────────────────────────
def build(
    n_users: int,
    seed: int = 42,
    start: date = date(2022, 1, 1),   # ← NEW default start
    end: date = date(2025, 5, 31)      # ← NEW default end
) -> pd.DataFrame:

    r = random.Random(seed);         np.random.seed(seed)
    fake = Faker("nl_NL");           Faker.seed(seed)

    # 25-35 % loan prevalence
    loan_prob = r.uniform(0.25, 0.35)

    # 1 · user profiles ---------------------------------------------------
    profiles = []
    for uid in range(1, n_users + 1):
        full = fake.name()
        age  = r.randint(18, 90)

        has_mortgage = r.random() < 0.30
        rent_amt     = None if has_mortgage else r.randint(800, 2500)

        # 28-day vacation period (static, not used in new rules but kept)
        vac_start = start + timedelta(days=r.randint(0, (end - start).days - 28))

        profiles.append({
            "user_id": uid,
            "name": initials(full),
            "age": age,
            "age_bracket": age_bracket(age),
            "risk_score": r.randint(1, 5),
            "country": "Netherlands",
            "city": fake.city(),
            "iban": fake.iban(),
            "base_salary": r.randint(2500, 5000),
            "rent": rent_amt,
            "mortgage_payment": r.randint(900, 1800) if has_mortgage else None,
            "has_loan": r.random() < loan_prob,
            "has_mortgage": has_mortgage,
            "birthday": fake.date_of_birth(minimum_age=age, maximum_age=age),
            "vac_days": {vac_start + timedelta(d) for d in range(28)},
        })

    rows = []

    # 2 · monthly fixed events -------------------------------------------
    month_iter = pd.period_range(start, end, freq="M")
    for p in profiles:
        salary = p["base_salary"]
        for period in month_iter:
            y, m = period.year, period.month

            # salary increase every January (compound)
            if m == 1 and period != month_iter[0]:
                salary = round(salary * 1.02, 2)

            # salary on 1st
            day = date(y, m, min(1, calendar.monthrange(y, m)[1]))
            rows.append([day, p["user_id"], salary,
                         "Monthly salary", "Income"])

            # salary bonus (June & December) on 25th
            if m in (6, 12):
                rows.append([day, p["user_id"],
                             rand(CATEGORY_RULES["Salary bonus"]),
                             "Salary bonus", "Salary bonus"])

            # rent or mortgage on 1st
            day1 = date(y, m, 1)
            if p["has_mortgage"]:
                rows.append([day1, p["user_id"],
                             rand(CATEGORY_RULES["Mortgage payment"]),
                             "Mortgage", "Mortgage payment"])
            else:
                rows.append([day1, p["user_id"], -p["rent"],
                             "Rent", "Rent"])

            # utilities on 10th
            if calendar.monthrange(y, m)[1] >= 10:
                rows.append([date(y, m, 10), p["user_id"],
                             rand(CATEGORY_RULES["Utilities"]),
                             "Utilities", "Utilities"])

            # loan repayment on 15th (loan holders only)
            if p["has_loan"] and calendar.monthrange(y, m)[1] >= 15:
                rows.append([date(y, m, 15), p["user_id"],
                             rand(CATEGORY_RULES["Loan repayment"]),
                             "Personal loan", "Loan repayment"])

    # 3 · gifts -----------------------------------------------------------
    for p in profiles:
        for y in range(start.year, end.year + 1):
            # birthday
            bday = p["birthday"].replace(year=y)
            if start <= bday <= end:
                rows.append([bday, p["user_id"],
                             rand(CATEGORY_RULES["Gift"]),
                             "Birthday gift", "Gift"])
            # Christmas
            xmas = date(y, 12, 25)
            if start <= xmas <= end:
                rows.append([xmas, p["user_id"],
                             rand(CATEGORY_RULES["Gift"]),
                             "Christmas gift", "Gift"])

    # 4 · big purchase (once every 2 years) ------------------------------
    total_days = (end - start).days
    for p in profiles:
        n_big = max(1, math.floor((total_days / 365.25) / 2))
        for _ in range(n_big):
            rand_day = start + timedelta(days=r.randint(0, total_days))
            rows.append([rand_day, p["user_id"],
                         rand(CATEGORY_RULES["Big purchase"]),
                         fake.company(), "Big purchase"])

    # 5 · weekly groceries & transport -----------------------------------
    weeks = pd.period_range(start, end, freq="W-SUN")
    for p in profiles:
        for w in weeks:
            days = pd.date_range(w.start_time, w.end_time, freq="D")
            # groceries 1-2×/week
            for d in r.sample(list(days), k=r.randint(1, 2)):
                rows.append([d.date(), p["user_id"],
                             rand(CATEGORY_RULES["Groceries"]),
                             fake.company(), "Groceries"])
            # transport: 2-6 days, each with two transactions
            for d in r.sample(list(days), k=r.randint(2, 6)):
                for _ in range(2):
                    rows.append([d.date(), p["user_id"],
                                 rand(CATEGORY_RULES["Transport"]),
                                 fake.company(), "Transport"])

    # 6 · daily entertainment (≤1 per day) -------------------------------
    for current in pd.date_range(start, end, freq="D"):
        for p in profiles:
            if r.random() < 0.25:  # 25 % chance
                rows.append([current.date(), p["user_id"],
                             rand(CATEGORY_RULES["Entertainment"]),
                             fake.company(), "Entertainment"])

    # --------------------------------------------------------------------
    df = pd.DataFrame(rows, columns=["date", "user_id", "amount",
                                     "description", "category"])
    df["date"] = pd.to_datetime(df["date"])

    # drop 1 % rows for realism
    df = df.sample(frac=0.99, random_state=seed).reset_index(drop=True)

    # merge static attributes (remove private cols)
    stat = (pd.DataFrame(profiles)
              .drop(columns=["base_salary", "rent", "mortgage_payment",
                             "has_loan", "has_mortgage", "vac_days",
                             "birthday"]))
    df = df.merge(stat, on="user_id", how="left")

    # derived flags
    df["season"]            = df["date"].dt.month.map(SEASONS)
    df["is_weekend"]        = df["date"].dt.weekday >= 5
    df["is_public_holiday"] = df["date"].dt.normalize().isin(HOLIDAYS_NL)

    return df.sort_values(["user_id", "date"]).reset_index(drop=True)


# ── CLI (works in terminal & notebooks) ──────────────────────────────────
if __name__ == "__main__":
    ap = argparse.ArgumentParser(description="Generate synthetic NL transactions")
    ap.add_argument("--n_users", type=int, default=100)
    ap.add_argument("--seed",    type=int, default=42)
    ap.add_argument("--out",     type=str, default="synthetic_transactions.csv")
    args, _ = ap.parse_known_args()          # ignore Jupyter’s extra -f flag

    out_df = build(n_users=args.n_users, seed=args.seed)
    out_df.to_csv(args.out, index=False)
    print(f"✔  {args.out} saved  —  {len(out_df):,} rows, {args.n_users} users")



✔  synthetic_transactions.csv saved  —  214,781 rows, 100 users
