PHASE 3
Latent Churn & Survival Modeling (Non-Contractual Setting)

In [40]:
# STEP 3.1 — Load Phase 2 Artifact (Immutable)

In [41]:
import pandas as pd

# Read the customer-state table produced by Phase 2. The cells in this
# section only read from `state_df`; none of them assign back into it.
state_df = pd.read_parquet(path="phase2_customer_state.parquet")

# Quick look at the size and the first few rows.
print(state_df.shape)
state_df.head()


(37039, 7)


Unnamed: 0,Customer ID,InvoiceDate,recency_days,frequency,monetary_avg,delta_revenue,delta_recency
0,12346.0,2009-12-14 08:34:00,,0,45.0,,
1,12346.0,2009-12-14 11:00:00,0.0,1,33.75,-22.5,
2,12346.0,2009-12-14 11:02:00,0.0,2,30.0,0.0,0.0
3,12346.0,2009-12-18 10:47:00,3.0,3,28.125,0.0,3.0
4,12346.0,2009-12-18 10:55:00,0.0,4,22.7,-21.5,-3.0


In [42]:
# STEP 3.2 — Define “Observation End” (Critical Concept)

In [43]:
# Latest invoice timestamp present in `state_df`. It is the reference point
# from which `days_since_last_purchase` is measured further down.
END_DATE = state_df.loc[:, "InvoiceDate"].max()
END_DATE


Timestamp('2011-12-09 12:50:00')

In [44]:
# STEP 3.3 — Define Inactivity Threshold (Censoring Rule)

In [45]:
# Number of days without a purchase (measured back from END_DATE) beyond which
# a customer is no longer flagged alive. 6 x 30 = 180 days, roughly six months.
INACTIVITY_THRESHOLD_DAYS = 6 * 30


In [46]:
# STEP 3.4 — Compute “Time Since Last Purchase”

In [47]:
# One row per customer: the timestamp of that customer's most recent invoice.
last_purchase = (
    state_df
    .groupby("Customer ID", as_index=False)
    .agg(last_invoice_date=("InvoiceDate", "max"))
)

# Gap between the end of the observation window and the last invoice,
# keeping only the whole-day component of the timedelta.
gap_to_end = END_DATE - last_purchase["last_invoice_date"]
last_purchase["days_since_last_purchase"] = gap_to_end.dt.days

last_purchase.head()


Unnamed: 0,Customer ID,last_invoice_date,days_since_last_purchase
0,12346.0,2011-01-18 10:01:00,325
1,12347.0,2011-12-07 15:52:00,1
2,12348.0,2011-09-25 13:13:00,74
3,12349.0,2011-11-21 09:51:00,18
4,12350.0,2011-02-02 16:01:00,309


In [48]:
# STEP 3.5 — Define Survival Status (Alive/Censored vs Churned)

In [49]:
# A customer is flagged alive when the gap to END_DATE does not exceed the
# inactivity threshold; a gap of exactly the threshold still counts as alive.
gap_days = last_purchase["days_since_last_purchase"]
last_purchase["is_alive"] = gap_days.le(INACTIVITY_THRESHOLD_DAYS)

# Customer-level share of alive vs. not-alive.
last_purchase["is_alive"].value_counts(normalize=True)


Unnamed: 0_level_0,proportion
is_alive,Unnamed: 1_level_1
True,0.591906
False,0.408094


In [50]:
# STEP 3.6 — Create Survival Table (Customer-Level)

In [51]:
# Build the customer-level survival table from `last_purchase` so that this
# cell works on a fresh kernel. Previously `survival_df` was assigned into
# without ever being created, and the `duration` column that the merge cell
# selects was never defined.
survival_df = last_purchase.copy()

# NOTE(review): `duration` is set to days_since_last_purchase because the
# recorded merge output shows the same value in both columns (325 for
# customer 12346). Confirm this is the intended duration definition.
survival_df["duration"] = survival_df["days_since_last_purchase"]

# event = 1 when the customer has been inactive for longer than the threshold,
# 0 otherwise.
survival_df["event"] = (
    survival_df["days_since_last_purchase"]
    > INACTIVITY_THRESHOLD_DAYS
).astype(int)

# Invariants: exactly one row per customer, and `event` is the complement of
# `is_alive` (both are derived from the same threshold).
assert survival_df["Customer ID"].is_unique, "duplicate customers in survival_df"
assert survival_df["event"].eq((~survival_df["is_alive"]).astype(int)).all(), (
    "event and is_alive disagree"
)


In [52]:
# STEP 3.7 — Merge Survival Target Back to State (Time-Indexed)

In [53]:
# Attach the customer-level survival labels to every row of the time-indexed
# state table. `validate="many_to_one"` makes pandas raise a MergeError if
# `survival_df` ever holds more than one row per customer, which would
# otherwise silently duplicate state rows in a left join.
state_survival_df = state_df.merge(
    survival_df[["Customer ID", "is_alive", "duration", "event"]],
    on="Customer ID",
    how="left",
    validate="many_to_one",
)

state_survival_df.head()


Unnamed: 0,Customer ID,InvoiceDate,recency_days,frequency,monetary_avg,delta_revenue,delta_recency,is_alive,duration,event
0,12346.0,2009-12-14 08:34:00,,0,45.0,,,False,325,1
1,12346.0,2009-12-14 11:00:00,0.0,1,33.75,-22.5,,False,325,1
2,12346.0,2009-12-14 11:02:00,0.0,2,30.0,0.0,0.0,False,325,1
3,12346.0,2009-12-18 10:47:00,3.0,3,28.125,0.0,3.0,False,325,1
4,12346.0,2009-12-18 10:55:00,0.0,4,22.7,-21.5,-3.0,False,325,1


In [54]:
# STEP 3.8 — Sanity Checks (Must Pass)

In [55]:
# No missing survival labels.
# Count NaNs per label column and fail the run if any are present, so that a
# broken merge cannot flow through to the saved artifact unnoticed.
missing_labels = state_survival_df[["is_alive", "duration", "event"]].isna().sum()
assert missing_labels.eq(0).all(), (
    f"Missing survival labels:\n{missing_labels[missing_labels > 0]}"
)

# Still display the per-column counts, as before.
missing_labels



Unnamed: 0,0
is_alive,0
duration,0
event,0


In [56]:
# Reasonable alive ratio.
# This is the share of *rows* in the state table whose customer is flagged
# alive. Customers contribute one row per entry in `state_df`, so the figure
# is weighted by row count and is not the customer-level proportion.
alive_row_share = state_survival_df["is_alive"].mean()
alive_row_share


np.float64(0.8323118874699641)

In [57]:
# STEP 3.9 — Save Phase 3 Artifact

In [58]:
# Write the labelled state table to disk for the next phase; the DataFrame
# index is not written to the file.
state_survival_df.to_parquet("phase3_state_with_survival.parquet", index=False)
