In [None]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

# --------------------------
# PARAMETERS
# --------------------------
# Simulation window (both years inclusive) and the row cap for the export.
START_YEAR, END_YEAR = 2015, 2025
TARGET_ROWS = 150_000

# A single seed feeds both random number generators used by the script
# (the stdlib `random` module and NumPy's global generator).
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# Donor-count anchors for the long-term trend, keyed by year.
# Noise is layered on top of these values elsewhere in the script.
anchor_counts = {
    2015: 15000,
    2019: 20000,
    2020: 18000,
    2022: 21000,
    2023: 23000,
    2024: 24000,
    2025: 23000,
}

# Giving cadence: labels, their sampling weights (same order), and the
# nominal number of days between two gifts for each label.
FREQUENCIES = ["monthly", "biyearly", "yearly"]
FREQ_WEIGHTS = [0.88, 0.04, 0.08]
FREQ_TO_DAYS = dict(zip(FREQUENCIES, (30, 185, 365)))

# Categorical values attached to each donor.
DONOR_SEGMENTS = [
    "General Recurring Donor",
    "Emergency Response Supporter",
    "Education Fund Supporter",
    "Community Development Supporter",
]

PAYMENT_METHODS = [
    "Credit Card",
    "ACH Bank Transfer",
    "PayPal",
    "Google Pay",
    "Apple Pay",
]

ACQUISITION_CHANNELS = [
    "Website",
    "Email Campaign",
    "Facebook Ad",
    "Instagram Ad",
    "Google Search",
    "Peer Referral",
    "Event Signup",
]

CANCEL_REASONS = [
    "Financial hardship",
    "Not interested anymore",
    "Payment failure",
    "Moved to different cause",
    "No reason given",
]

# Month-of-year multiplier applied to donation amounts (January .. December).
SEASONALITY = dict(zip(range(1, 13), (
    1.00, 1.00, 1.02, 1.03, 1.01, 0.96,
    0.95, 0.97, 1.00, 1.03, 1.10, 1.15,
)))

# Target shares of donors in the long-term and medium-term loyalty bands.
PCT_LONG_5PLUS = 0.08
PCT_MEDIUM_3PLUS = 0.22

# --------------------------
# Realistic Yearly Shocks
# --------------------------
# Per-year multiplier on the number of newly acquired donors.
# Years that are not listed are treated as 1.0 by the consumer.
EVENT_SHOCKS = {
    2016: 1.05,
    2017: 1.12,  # strong year
    2018: 0.95,
    2019: 0.90,  # soft year
    2020: 0.60,  # pandemic crash
    2021: 0.80,  # partial recovery, still below baseline
    2022: 0.88,
    2023: 1.10,
    2024: 0.95,
}

# --------------------------
# Monthly Acquisition Weights (base)
# --------------------------
# Relative likelihood of a donor being acquired in each month
# (January .. December); the last quarter carries the year-end spike.
BASE_MONTH_ACQ = dict(zip(range(1, 13), (
    0.85, 0.80, 1.05,
    1.10, 1.12, 0.88,
    0.78, 0.90, 1.05,
    1.20, 1.45, 1.60,
)))

# Jittered month weights per calendar year, filled lazily by
# weighted_start_date so each year keeps one consistent seasonal profile.
_YEAR_ACQ_WEIGHTS = {}

def weighted_start_date(year):
    """Draw a random acquisition date inside ``year``.

    The month is sampled from ``BASE_MONTH_ACQ`` after every month's weight has
    been multiplied by a uniform factor in [0.6, 1.4). That jitter is drawn
    once per calendar year and reused for every donor acquired in that year,
    so the seasonal profile really differs from year to year. Re-drawing the
    jitter on every call would make the noise average out across donors and
    leave every year with essentially the same base profile.

    The day is restricted to 1-28 so the date is valid in every month.

    Args:
        year: Calendar year of the acquisition.

    Returns:
        A ``datetime`` (midnight) on the sampled day.
    """
    months = list(BASE_MONTH_ACQ)
    weights = _YEAR_ACQ_WEIGHTS.get(year)
    if weights is None:
        # First donor of this year: fix the year's jittered profile.
        weights = [BASE_MONTH_ACQ[m] * np.random.uniform(0.6, 1.4) for m in months]
        _YEAR_ACQ_WEIGHTS[year] = weights
    month = random.choices(months, weights=weights, k=1)[0]
    day = random.randint(1, 28)
    return datetime(year, month, day)

# --------------------------
# Churn Variability
# --------------------------
# Month-of-year churn factor (January .. December), looked up by the month of
# a donor's end date. Values above 1.0 tend to push the end date later and
# values below 1.0 tend to pull it earlier.
BASE_MONTH_CHURN = dict(zip(range(1, 13), (
    1.35, 1.20, 1.12,
    1.05, 0.95, 1.00,
    0.90, 0.88, 0.92,
    0.85, 0.95, 1.10,
)))

def adjust_end_date(end_date):
    """Shift a donor's end date by a noisy, month-dependent number of days.

    The churn factor of the end date's month is multiplied by a normal draw
    (mean 1.0, sd 0.25). The distance of that product from 1.0 is scaled by a
    random integer in [10, 59] and truncated toward zero to get the shift in
    days, which may be negative.

    Args:
        end_date: The unadjusted end date, or ``None``.

    Returns:
        The shifted ``datetime``, or ``None`` when ``end_date`` is ``None``.
    """
    if end_date is not None:
        jitter = np.random.normal(1.0, 0.25)
        pressure = BASE_MONTH_CHURN[end_date.month] * jitter
        window = np.random.randint(10, 60)
        offset = int((pressure - 1.0) * window)
        return end_date + timedelta(days=offset)
    return None

# --------------------------
# Inflation Model
# --------------------------
# Cumulative price level per year: every year compounds a random inflation
# rate drawn uniformly from [0.01, 0.025) on top of the previous year's level.
# The first year already includes its own draw, so year_inflation[START_YEAR]
# is slightly above 1.0.
year_inflation = {}
cumulative = 1.0  # running product of (1 + rate) over the years processed so far
for y in range(START_YEAR, END_YEAR + 1):
    rate = np.random.uniform(0.01, 0.025)  # this year's inflation rate
    cumulative *= (1 + rate)
    year_inflation[y] = cumulative

# --------------------------
# Helper Functions
# --------------------------
def interp_year_counts(anchors, start, end, noise_sd=500):
    """Build a noisy per-year count from a sparse set of anchor values.

    Years that are anchors use the anchor value directly. Years between two
    anchors are linearly interpolated between the nearest anchor on each side.
    Years before the first anchor or after the last one reuse the nearest
    anchor's value (flat extrapolation) instead of failing. Each year then
    gets independent normal noise, and the result is truncated to a
    non-negative integer.

    Args:
        anchors: Mapping of year -> base count.
        start: First year to produce (inclusive).
        end: Last year to produce (inclusive).
        noise_sd: Standard deviation of the normal noise added per year.

    Returns:
        Dict mapping every year in ``start..end`` to its noisy integer count.

    Raises:
        ValueError: If ``anchors`` is empty while at least one year is requested.
    """
    years = range(start, end + 1)
    if years and not anchors:
        raise ValueError("anchors must contain at least one year")

    anchor_years = sorted(anchors)
    result = {}
    for y in years:
        if y in anchors:
            base = anchors[y]
        elif y < anchor_years[0]:
            # Before the first anchor: hold the first anchor's value.
            base = anchors[anchor_years[0]]
        elif y > anchor_years[-1]:
            # After the last anchor: hold the last anchor's value.
            base = anchors[anchor_years[-1]]
        else:
            prev_y = max(a for a in anchor_years if a < y)
            next_y = min(a for a in anchor_years if a > y)
            ratio = (y - prev_y) / (next_y - prev_y)
            base = anchors[prev_y] + ratio * (anchors[next_y] - anchors[prev_y])
        # Exactly one noise draw per year, in year order.
        noisy = int(max(0, base + np.random.normal(0, noise_sd)))
        result[y] = noisy
    return result

def base_amount_for_frequency(freq):
    """Pick a donor's base gift size in whole dollars.

    The amount is a uniformly random integer from 20 to 50 inclusive.
    ``freq`` is currently ignored: every frequency uses the same range.
    """
    lowest, highest = 20, 50
    return np.random.randint(lowest, highest + 1)

def lifespan_years_by_loyalty():
    """Sample how long a donor keeps giving, in years.

    One uniform draw selects a band, then a second draw picks the lifespan:
      * below 0.30: early dropout, uniform in [0.05, 0.25) years;
      * the next ``PCT_MEDIUM_3PLUS`` share: uniform in [2.5, 4.5) years;
      * the next ``PCT_LONG_5PLUS`` share: uniform in [5, 8) years;
      * everyone else: exponential (scale 1.3), capped at 5.0 years and
        rounded to two decimals.
    """
    draw = np.random.rand()

    if draw < 0.30:
        return np.random.uniform(0.05, 0.25)

    medium_cut = 0.30 + PCT_MEDIUM_3PLUS
    if draw < medium_cut:
        return np.random.uniform(2.5, 4.5)

    if draw < medium_cut + PCT_LONG_5PLUS:
        return np.random.uniform(5, 8)

    capped = min(np.random.exponential(scale=1.3), 5.0)
    return float(round(capped, 2))

def apply_seasonality_and_inflation(base_amt, dt):
    """Turn a donor's base amount into the amount of one specific gift.

    The base amount is multiplied by the seasonality factor of ``dt``'s month,
    by a normal noise factor (mean 1.0, sd 0.08) and by the cumulative
    inflation level of ``dt``'s year, then rounded to a whole number.

    Args:
        base_amt: The donor's base gift amount.
        dt: Date of the gift; its month and year select the factors.

    Returns:
        The adjusted amount as an ``int``, never less than 5.
    """
    seasonal = SEASONALITY[dt.month]
    jitter = np.random.normal(1.0, 0.08)
    price_level = year_inflation[dt.year]
    adjusted = round(base_amt * seasonal * jitter * price_level)
    return int(adjusted) if adjusted > 5 else 5

# --------------------------
# Build Donors (High Realism)
# --------------------------
# Noisy per-year trend of newly acquired donors.
year_counts = interp_year_counts(anchor_counts, START_YEAR, END_YEAR, noise_sd=700)

donors = []
donor_id_counter = 1

for y in range(START_YEAR, END_YEAR + 1):

    # Cohort size: the trend value for the year scaled by a random macro
    # factor and by that year's event shock (1.0 when none is listed).
    macro_multiplier = np.random.normal(1.0, 0.20)
    macro_multiplier *= EVENT_SHOCKS.get(y, 1.0)
    n = max(0, int(year_counts[y] * macro_multiplier))

    for _ in range(n):
        start_dt = weighted_start_date(y)

        # Raw end date from the sampled lifespan. Donors who are not early
        # dropouts additionally get a shift of -45..44 days.
        lifespan = lifespan_years_by_loyalty()
        end_dt = start_dt + timedelta(days=int(lifespan * 365))
        if lifespan >= 0.3:
            end_dt += timedelta(days=np.random.randint(-45, 45))

        frequency = random.choices(FREQUENCIES, weights=FREQ_WEIGHTS, k=1)[0]
        base_amount = base_amount_for_frequency(frequency)
        payment_method = random.choices(PAYMENT_METHODS, weights=[0.5,0.15,0.15,0.1,0.1])[0]
        donor_segment = random.choice(DONOR_SEGMENTS)
        acquisition_channel = random.choices(
            ACQUISITION_CHANNELS,
            weights=[0.20,0.20,0.18,0.14,0.12,0.12,0.04]
        )[0]

        # A donor whose end date falls after END_YEAR, before or after the
        # churn adjustment, is still active and has no end date.
        if end_dt.year > END_YEAR:
            end_dt_effective = None
        else:
            end_dt_effective = adjust_end_date(end_dt)
            if end_dt_effective is not None and end_dt_effective.year > END_YEAR:
                end_dt_effective = None

        if end_dt_effective is None:
            status = "active"
            cancel_reason = ""
        else:
            # The churn adjustment can be negative and larger than a short
            # lifespan. Never let a donor end before they started, otherwise
            # they would produce no donations at all.
            end_dt_effective = max(end_dt_effective, start_dt)
            status = "cancelled"
            cancel_reason = random.choice(CANCEL_REASONS)

        donors.append({
            "donor_id": donor_id_counter,
            "start_date": start_dt,
            "end_date": end_dt_effective,
            "frequency": frequency,
            "base_amount": base_amount,
            "payment_method": payment_method,
            "donor_segment": donor_segment,
            "acquisition_channel": acquisition_channel,
            "cancel_reason": cancel_reason,
            "status": status
        })

        donor_id_counter += 1

# --------------------------
# Generate Donation Events (High Realism)
# --------------------------
records = []
donation_id_counter = 1

for d in donors:
    end_dt = d["end_date"]
    base_amt = d["base_amount"]
    step_days = FREQ_TO_DAYS[d["frequency"]]

    # Donor-level date columns are identical on every row of this donor,
    # so they are formatted once up front.
    start_str = d["start_date"].strftime("%Y-%m-%d")
    end_str = end_dt.strftime("%Y-%m-%d") if end_dt else ""

    # Emit gifts from the start date until the donor's end date (if any)
    # or the end of the simulation window, whichever comes first.
    dt = d["start_date"]
    while dt.year <= END_YEAR and (end_dt is None or dt <= end_dt):
        amt = apply_seasonality_and_inflation(base_amt, dt)

        records.append({
            "donation_id": f"D{donation_id_counter:06d}",
            "donor_id": d["donor_id"],
            "start_date": start_str,
            "end_date": end_str,
            "amount": amt,
            "frequency": d["frequency"],
            # A row is only emitted while dt is within the donor's lifetime,
            # so the donor-level status applies unchanged.
            "status": d["status"],
            "donor_segment": d["donor_segment"],
            "payment_method": d["payment_method"],
            "acquisition_channel": d["acquisition_channel"],
            "cancel_reason": d["cancel_reason"]
        })
        donation_id_counter += 1

        # Next gift: the nominal interval stretched or shrunk by a random
        # factor in [0.6, 1.5), truncated to whole days.
        drift = np.random.uniform(0.6, 1.5)
        dt = dt + timedelta(days=int(step_days * drift))

# --------------------------
# Save CSV (same schema)
# --------------------------
# Column order of the exported file.
final_cols = [
    "donation_id",
    "donor_id",
    "start_date",
    "end_date",
    "amount",
    "frequency",
    "status",
    "donor_segment",
    "payment_method",
    "acquisition_channel",
    "cancel_reason",
]
output_path = "/Users/noahmancione/OneDrive/Noah Python Projects/realistic_nonprofit_donors_capstone.csv"

df = pd.DataFrame(records)

# Keep at most TARGET_ROWS rows, chosen by a seeded random sample.
too_many_rows = len(df) > TARGET_ROWS
if too_many_rows:
    sampled = df.sample(n=TARGET_ROWS, random_state=RANDOM_SEED)
    df = sampled.reset_index(drop=True)

df["amount"] = df["amount"].astype(int)
df = df[final_cols]

df.to_csv(output_path, index=False)
print("High realism CSV saved:", output_path)


High realism CSV saved: /Users/noahmancione/OneDrive/Noah Python Projects/realistic_nonprofit_donors_capstone.csv
