# In [2]:
"""
Feature-engineering for synthetic_transactions.csv
==================================================
Adds:
    • Rolling 7/30/90-day stats
    • Days since last transaction
    • Cumulative in-month spend
    • Time since last payday
    • Spending-category ratios
    • Day-of-month & payday flag

2025-06-19
"""

import pandas as pd
import numpy as np
from pathlib import Path

# ------------------------------------------------------------------
# 0.  PATHS - the only two lines to edit when the files move
# ------------------------------------------------------------------
CSV_PATH  = Path(r"C:/Users/loics/OneDrive/Documents/1. BAM/BLOCK 5/Assignment coding/synthetic_transactions.csv")
SAVE_PATH = Path(r"C:/Users/loics/OneDrive/Documents/1. BAM/BLOCK 5/Assignment coding/synthetic_transactions_enriched.csv")

# ------------------------------------------------------------------
# 1.  LOAD  ->  PARSE DATES  ->  ORDER BY USER AND TIME
# ------------------------------------------------------------------
# Single pipeline:
#   * read the raw CSV;
#   * convert "date" to datetime, reading ambiguous strings as DD/MM
#     (dayfirst=True) and failing loudly on anything unparseable
#     (errors="raise");
#   * order rows by user, then chronologically, and renumber the index
#     0..n-1 so later steps can rely on a clean positional index.
# NOTE(review): ISO-style dates ("2022-01-13") are expected to parse with
# the same call - confirm against the installed pandas version.
df = (
    pd.read_csv(CSV_PATH)
      .assign(
          date=lambda frame: pd.to_datetime(
              frame["date"], dayfirst=True, errors="raise"
          )
      )
      .sort_values(["user_id", "date"])
      .reset_index(drop=True)
)

# ------------------------------------------------------------------
# 2.  ROLLING USER-LEVEL STATISTICS (7 / 30 / 90 days)
# ------------------------------------------------------------------
# Look-back window lengths, in calendar days.
WINDOWS = [7, 30, 90]

# The date-indexed, per-user view of "amount" is the same for every window,
# so build it once instead of re-indexing and re-grouping on each iteration.
amount_by_user = df.set_index("date").groupby("user_id")["amount"]

for w in WINDOWS:
    # Time-based window of `w` days; min_periods=1 yields a value from the
    # very first transaction of each user.
    rolled = amount_by_user.rolling(f"{w}D", min_periods=1)

    # The rolling result is grouped by user and ordered by date within each
    # user, so it is written back by position (`.values`).  This relies on
    # `df` already being sorted by user_id and then date.
    df[f"rolling_{w}d_sum"]  = rolled.sum().values
    df[f"rolling_{w}d_mean"] = rolled.mean().values
    # Population std (ddof=0); any remaining NaN is replaced by 0.
    df[f"rolling_{w}d_std"]  = rolled.std(ddof=0).fillna(0).values

# ------------------------------------------------------------------
# 3.  DAYS SINCE LAST TRANSACTION
# ------------------------------------------------------------------
# Date of the same user's previous row (NaT on each user's first row).
_prev_tx_date = df.groupby("user_id")["date"].shift()

# Whole-day gap to that previous transaction.
_gap_to_prev = df["date"] - _prev_tx_date

# A user's first transaction has no predecessor; it is marked with -1.
df["days_since_last_tx"] = _gap_to_prev.dt.days.fillna(-1)

# ------------------------------------------------------------------
# 4.  CUMULATIVE MONTHLY SPEND  (DEBITS ONLY)
# ------------------------------------------------------------------
# Calendar-month key and debit mask; both are reused further down.
df["year_month"] = df["date"].dt.to_period("M")
mask_spend = df["amount"].lt(0)                   # debits = negative amounts

# Running total of debit amounts within each (user, month), computed on the
# debit rows only.
_debit_running = (
    df.loc[mask_spend]
      .groupby(["user_id", "year_month"])["amount"]
      .cumsum()
)

# Spread back over every row by index label; non-debit rows get 0.0.
# NOTE(review): a credit row therefore shows 0.0 rather than the spend
# accumulated so far in that month - confirm this is the intended feature.
df["cum_month_spend"] = (
    _debit_running.reindex(df.index, fill_value=0.0).astype("float64")
)

# ------------------------------------------------------------------
# 5.  TIME SINCE LAST PAYDAY  (+ flag)
# ------------------------------------------------------------------
def is_salary(row, threshold=1_000):
    """Heuristic salary detector.

    Looks up ``row["amount"]`` and reports whether it is strictly greater
    than ``threshold`` (default 1 000), i.e. a sufficiently large inflow.
    """
    amount = row["amount"]
    return amount > threshold

# Payday flag.  `is_salary` only indexes ["amount"] and compares it with the
# threshold, so handing it the whole frame produces the boolean column in a
# single vectorised comparison instead of one Python call per row.
df["is_payday"] = is_salary(df)

# Date of the most recent payday for the same user, up to and including the
# current row: keep the date on payday rows only, then forward-fill within
# each user.  The grouped ffill keeps the original row index, so no index
# surgery is needed afterwards.
_last_payday = (
    df["date"]
      .where(df["is_payday"])
      .groupby(df["user_id"])
      .ffill()
)

# Whole days elapsed since that payday (0 on the payday itself);
# −1 means “no payday yet” for that user.
df["time_since_payday"] = (df["date"] - _last_payday).dt.days.fillna(-1)

# ------------------------------------------------------------------
# 6.  SPENDING-CATEGORY RATIOS  (user × month)
# ------------------------------------------------------------------
# For every (user, month): share of that month's debit volume that went to
# each category.  Only debit rows (mask_spend) take part.
ratio_keys = ["user_id", "year_month"]

# Absolute debit total per user and month (denominator).
spend_month_tot = (
    df[mask_spend]
      .groupby(ratio_keys)["amount"]
      .sum()
      .abs()
      .rename("month_total_abs")
)

# Absolute debit total per user, month and category (numerator), with the
# monthly total joined alongside.
spend_month_cat = (
    df[mask_spend]
      .groupby(ratio_keys + ["category"])["amount"]
      .sum()
      .abs()
      .rename("cat_total_abs")
      .reset_index()
      .merge(spend_month_tot, on=ratio_keys)
)

spend_month_cat["cat_ratio"] = (
    spend_month_cat["cat_total_abs"] / spend_month_cat["month_total_abs"]
)

# One row per (user, month), one "ratio_<category>" column per category;
# categories with no spend in that month get ratio 0.
ratio_wide = (
    spend_month_cat
      .pivot(index=ratio_keys,
             columns="category",
             values="cat_ratio")
      .fillna(0)
      .add_prefix("ratio_")
      .reset_index()
)

ratio_cols = [col for col in ratio_wide.columns if col not in ratio_keys]

# Attach the monthly ratios to every transaction row.
df = df.merge(ratio_wide, on=ratio_keys, how="left")

# Months without any debit have no match in ratio_wide; their ratios are 0.
# Only the ratio columns are filled: a frame-wide fillna(0) would also
# overwrite genuine missing values in every other column with 0.
if ratio_cols:
    df[ratio_cols] = df[ratio_cols].fillna(0)

# ------------------------------------------------------------------
# 7.  CLEAN-UP,  CALENDAR FEATURE  &  SAVE
# ------------------------------------------------------------------
# "year_month" was only a helper key for the monthly features above;
# remove it in place so it is not written to the output file.
del df["year_month"]

# Day of the month taken from the transaction date.
df["day_of_month"] = df["date"].dt.day

# Write the enriched table without the row index and report the location.
df.to_csv(SAVE_PATH, index=False)
print(f"Enriched dataset saved to: {SAVE_PATH}")



# Output: Enriched dataset saved to: C:\Users\loics\OneDrive\Documents\1. BAM\BLOCK 5\Assignment coding\synthetic_transactions_enriched.csv
