PHASE 2
Customer State Construction (RFM + Temporal Signals)

In [58]:
# STEP 2.1 — Load Phase 1 Artifact (Immutable)

In [59]:
import pandas as pd

df = pd.read_parquet("phase1_clean_transactions.parquet")
print(df.shape)
df.head()


(824364, 11)


Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country,is_cancelled,revenue,event_index
0,491725,TEST001,This is a test product.,10,2009-12-14 08:34:00,4.5,12346.0,United Kingdom,False,45.0,0
1,491742,TEST001,This is a test product.,5,2009-12-14 11:00:00,4.5,12346.0,United Kingdom,False,22.5,1
2,491744,TEST001,This is a test product.,5,2009-12-14 11:02:00,4.5,12346.0,United Kingdom,False,22.5,2
3,492718,TEST001,This is a test product.,5,2009-12-18 10:47:00,4.5,12346.0,United Kingdom,False,22.5,3
4,492722,TEST002,This is a test product.,1,2009-12-18 10:55:00,1.0,12346.0,United Kingdom,False,1.0,4


In [60]:
# STEP 2.2 — Define the Decision Timeline (CRITICAL)

In [61]:
# State is defined using history strictly before current invoice
# Current invoice revenue is NOT used to influence its own state


In [62]:
# STEP 2.3 — Invoice-Level Aggregation (Minimal)

In [63]:
invoice_df = (
    df.groupby(["Customer ID", "Invoice", "InvoiceDate"])
      .agg(
          total_revenue=("revenue", "sum"),
          total_quantity=("Quantity", "sum"),
          is_cancelled=("is_cancelled", "max")
      )
      .reset_index()
)

invoice_df.head()


Unnamed: 0,Customer ID,Invoice,InvoiceDate,total_revenue,total_quantity,is_cancelled
0,12346.0,491725,2009-12-14 08:34:00,45.0,10,False
1,12346.0,491742,2009-12-14 11:00:00,22.5,5,False
2,12346.0,491744,2009-12-14 11:02:00,22.5,5,False
3,12346.0,492718,2009-12-18 10:47:00,22.5,5,False
4,12346.0,492722,2009-12-18 10:55:00,1.0,1,False


In [64]:
invoice_df = invoice_df[~invoice_df["is_cancelled"]].copy()


In [65]:
# STEP 2.4 — Build Basic RFM (Time-Causal)

In [66]:
# Step 2.4.1 — Sort correctly (again, no trust)

In [67]:
invoice_df = invoice_df.sort_values(
    by=["Customer ID", "InvoiceDate"]
).reset_index(drop=True)


In [68]:
# Step 2.4.2 — Recency (days since last purchase)

In [69]:
invoice_df["prev_invoice_date"] = (
    invoice_df.groupby("Customer ID")["InvoiceDate"]
              .shift(1)
)

invoice_df["recency_days"] = (
    (invoice_df["InvoiceDate"] - invoice_df["prev_invoice_date"])
    .dt.days
)

invoice_df.head()


Unnamed: 0,Customer ID,Invoice,InvoiceDate,total_revenue,total_quantity,is_cancelled,prev_invoice_date,recency_days
0,12346.0,491725,2009-12-14 08:34:00,45.0,10,False,NaT,
1,12346.0,491742,2009-12-14 11:00:00,22.5,5,False,2009-12-14 08:34:00,0.0
2,12346.0,491744,2009-12-14 11:02:00,22.5,5,False,2009-12-14 11:00:00,0.0
3,12346.0,492718,2009-12-18 10:47:00,22.5,5,False,2009-12-14 11:02:00,3.0
4,12346.0,492722,2009-12-18 10:55:00,1.0,1,False,2009-12-18 10:47:00,0.0


In [70]:
# Step 2.4.3 — Frequency (purchase count so far)

In [71]:
invoice_df["frequency"] = (
    invoice_df.groupby("Customer ID")
              .cumcount()
)


In [72]:
# Step 2.4.4 — Monetary (historical average spend)

In [73]:
invoice_df["cum_revenue"] = (
    invoice_df.groupby("Customer ID")["total_revenue"]
              .cumsum()
)

invoice_df["monetary_avg"] = (
    invoice_df["cum_revenue"] /
    (invoice_df["frequency"] + 1)
)


In [74]:
# STEP 2.5 — Temporal Dynamics (Small but Powerful)

In [75]:
invoice_df["prev_revenue"] = (
    invoice_df.groupby("Customer ID")["total_revenue"]
              .shift(1)
)

invoice_df["delta_revenue"] = (
    invoice_df["total_revenue"] - invoice_df["prev_revenue"]
)


In [76]:
# Purchase acceleration (Δ recency)

In [77]:
invoice_df["prev_recency"] = (
    invoice_df.groupby("Customer ID")["recency_days"]
              .shift(1)
)

invoice_df["delta_recency"] = (
    invoice_df["recency_days"] - invoice_df["prev_recency"]
)


In [78]:
# STEP 2.6 — Define the Customer State Vector

In [79]:
state_cols = [
    "recency_days",
    "frequency",
    "monetary_avg",
    "delta_revenue",
    "delta_recency"
]

state_df = invoice_df[
    ["Customer ID", "InvoiceDate"] + state_cols
].copy()

state_df.head()


Unnamed: 0,Customer ID,InvoiceDate,recency_days,frequency,monetary_avg,delta_revenue,delta_recency
0,12346.0,2009-12-14 08:34:00,,0,45.0,,
1,12346.0,2009-12-14 11:00:00,0.0,1,33.75,-22.5,
2,12346.0,2009-12-14 11:02:00,0.0,2,30.0,0.0,0.0
3,12346.0,2009-12-18 10:47:00,3.0,3,28.125,0.0,3.0
4,12346.0,2009-12-18 10:55:00,0.0,4,22.7,-21.5,-3.0


In [80]:
# STEP 2.7 — Sanity Checks (Must Pass)

In [81]:
state_df.isna().mean()


Unnamed: 0,0
Customer ID,0.0
InvoiceDate,0.0
recency_days,0.158779
frequency,0.0
monetary_avg,0.0
delta_revenue,0.158779
delta_recency,0.273738


In [82]:
# ✔ Frequency grows monotonically

In [83]:
check_freq = (
    state_df.groupby("Customer ID")["frequency"]
            .apply(lambda x: x.is_monotonic_increasing)
)

check_freq.all()


np.True_

In [84]:
# STEP 2.8 — Save Phase 2 Artifact

In [85]:
state_df.to_parquet("phase2_customer_state.parquet", index=False)
