In [4]:
import numpy as np
import pandas as pd

def generate_pcard_data(
    n_cluster1=200,
    n_cluster2=300,
    n_cluster3=200,
    n_outliers=25,
    random_state=42,
    save_path=None,
    delimiter=",",
):
    """
    Build a synthetic per-cardholder P-Card summary table.

    The table stacks three behavioral segments and a block of scattered rows,
    in this order:

    1. n_cluster1 rows: amounts ~ N(700, 100), ~5 transactions/month,
       weekend share ~ N(0.35, 0.10); department drawn from
       Field Marketing / Public Outreach.
    2. n_cluster2 rows: amounts ~ N(100, 30), ~25 transactions/month,
       weekend share ~ N(0.05, 0.02); department drawn from
       Administration / Finance / Public Works.
    3. n_cluster3 rows: amounts ~ N(90, 25), ~28 transactions/month,
       weekend share ~ N(0.6, 0.08); department is always Parks & Rec.
    4. n_outliers rows: every numeric feature drawn uniformly over a wide
       range; department drawn from Parks & Rec / Field Marketing /
       Administration.

    Normal draws are clipped to a per-feature range before rounding.

    Parameters
    ----------
    n_cluster1, n_cluster2, n_cluster3, n_outliers : int
        Row counts for the three segments and the scattered block.
    random_state
        Seed handed to ``np.random.default_rng``; the same seed and sizes
        give the same table.
    save_path
        When truthy, the table is also written there as CSV (no index column).
    delimiter : str
        Field separator used for the CSV, e.g. "|" for a pipe-delimited file.

    Returns
    -------
    pandas.DataFrame
        Columns: cardholder_id, department, avg_transaction_amount,
        transactions_per_month, pct_weekend_transactions.
    """
    gen = np.random.default_rng(random_state)

    def clipped_normal(mean, spread, size, low, high, decimals):
        # Gaussian sample limited to [low, high], then rounded.
        return gen.normal(mean, spread, size).clip(low, high).round(decimals)

    def as_frame(amounts, monthly_counts, weekend_share, departments):
        # Map the four feature vectors onto the shared column names.
        return pd.DataFrame({
            "avg_transaction_amount": amounts,
            "transactions_per_month": monthly_counts,
            "pct_weekend_transactions": weekend_share,
            "department": departments,
        })

    # NOTE: every sampling call below consumes generator state, so the order
    # (amount, frequency, weekend share, department -- segment by segment)
    # must stay fixed for a given seed to keep producing the same table.

    # Segment 1: large tickets, few transactions per month.
    big_ticket = as_frame(
        clipped_normal(700, 100, n_cluster1, 300, 1200, 2),
        clipped_normal(5, 2, n_cluster1, 1, 12, 0),
        clipped_normal(0.35, 0.10, n_cluster1, 0, 0.9, 2),
        gen.choice(["Field Marketing", "Public Outreach"], n_cluster1),
    )

    # Segment 2: small tickets, many transactions, almost no weekend activity.
    weekday_office = as_frame(
        clipped_normal(100, 30, n_cluster2, 10, 200, 2),
        clipped_normal(25, 5, n_cluster2, 10, 40, 0),
        clipped_normal(0.05, 0.02, n_cluster2, 0, 0.15, 2),
        gen.choice(["Administration", "Finance", "Public Works"], n_cluster2),
    )

    # Segment 3: small tickets, many transactions, most activity on weekends.
    # The department is a constant label, so no random draw is made for it.
    parks = as_frame(
        clipped_normal(90, 25, n_cluster3, 10, 200, 2),
        clipped_normal(28, 5, n_cluster3, 10, 45, 0),
        clipped_normal(0.6, 0.08, n_cluster3, .2, 0.8, 2),
        "Parks & Rec",
    )

    # Scattered rows: uniform draws spread over the whole feature space.
    strays = as_frame(
        gen.uniform(50, 2000, n_outliers).round(2),
        gen.uniform(1, 50, n_outliers).round(0),
        gen.uniform(0, 1, n_outliers).round(2),
        gen.choice(["Parks & Rec", "Field Marketing", "Administration"], n_outliers),
    )

    # Stack the pieces and number the cardholders sequentially from 1.
    combined = pd.concat([big_ticket, weekday_office, parks, strays], ignore_index=True)
    total_rows = len(combined)
    combined["cardholder_id"] = [f"CARD_{seq:04d}" for seq in range(1, total_rows + 1)]

    # Identifier and department first, numeric features after.
    column_order = [
        "cardholder_id",
        "department",
        "avg_transaction_amount",
        "transactions_per_month",
        "pct_weekend_transactions",
    ]
    combined = combined[column_order]

    if not save_path:
        return combined

    # Optional export; pass delimiter="|" for a pipe-delimited file.
    combined.to_csv(save_path, sep=delimiter, index=False)
    return combined


# Dataset settings: adjust the counts to make the assignment bigger/smaller
# or to change the balance between the groups.
pcard_params = {
    "n_cluster1": 200,   # field marketing
    "n_cluster2": 300,   # admin
    "n_cluster3": 200,   # parks & rec
    "n_outliers": 25,
    "random_state": 42,
    "save_path": "DC_PCard_Summary_G.csv",
    "delimiter": ",",    # switch to "|" for a pipe-delimited file
}

# Generate the table (this also writes the CSV named above), then show a
# short preview and the total row count.
df = generate_pcard_data(**pcard_params)
print(df.head())
print(f"Generated {len(df)} rows.")

  cardholder_id       department  avg_transaction_amount  \
0     CARD_0001  Field Marketing                  730.47   
1     CARD_0002  Field Marketing                  596.00   
2     CARD_0003  Field Marketing                  775.05   
3     CARD_0004  Public Outreach                  794.06   
4     CARD_0005  Public Outreach                  504.90   

   transactions_per_month  pct_weekend_transactions  
0                     6.0                      0.33  
1                     8.0                      0.37  
2                     5.0                      0.43  
3                     6.0                      0.31  
4                     1.0                      0.40  
Generated 725 rows.
