PHASE 5
Expected Survival × Conditional Value (CLV Emerges)

In [36]:
# STEP 5.1 — Load Phase 4 Artifacts

In [37]:
import pandas as pd
import numpy as np

person_period_df = pd.read_parquet("phase4_person_period_dataset.parquet")
print(person_period_df.shape)
person_period_df.head()


(133690, 8)


Unnamed: 0,Customer ID,time_bin,event,recency_days,frequency,monetary_avg,delta_revenue,delta_recency
0,12346.0,0,0,,0,45.0,,
1,12346.0,1,0,,0,45.0,,
2,12346.0,2,0,,0,45.0,,
3,12346.0,3,0,,0,45.0,,
4,12346.0,4,0,,0,45.0,,


In [38]:
# STEP 5.1.1 — Refit Hazard Model

In [39]:
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression

features = [
    "recency_days",
    "frequency",
    "monetary_avg",
    "delta_revenue",
    "delta_recency",
    "time_bin"
]

X = person_period_df[features]
y = person_period_df["event"]

# Impute missing values
imputer = SimpleImputer(strategy="median")
X_imputed = imputer.fit_transform(X)

# Refit hazard model
hazard_model = LogisticRegression(max_iter=1000)
hazard_model.fit(X_imputed, y)



In [40]:
# STEP 5.2 — Recompute Hazard Probabilities (Explicitly)

In [41]:
# Recreate feature matrix
features = [
    "recency_days",
    "frequency",
    "monetary_avg",
    "delta_revenue",
    "delta_recency",
    "time_bin"
]

X = person_period_df[features]

from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy="median")
X_imputed = imputer.fit_transform(X)

# Predict hazard (P(event at t | alive until t))
hazard_prob = hazard_model.predict_proba(X_imputed)[:, 1]

person_period_df["hazard"] = hazard_prob
person_period_df[["Customer ID", "time_bin", "hazard"]].head()


Unnamed: 0,Customer ID,time_bin,hazard
0,12346.0,0,0.00563
1,12346.0,1,0.007274
2,12346.0,2,0.009393
3,12346.0,3,0.012122
4,12346.0,4,0.015632


In [42]:
# STEP 5.3 — Convert Hazard → Survival Probability

In [43]:
# Code (group-wise cumulative product)
person_period_df = person_period_df.sort_values(
    by=["Customer ID", "time_bin"]
)

person_period_df["survival_prob"] = (
    person_period_df.groupby("Customer ID")["hazard"]
    .transform(lambda x: (1 - x).cumprod())
)


In [44]:
# Quick sanity checks (run these)
# Survival prob range
person_period_df["survival_prob"].describe()

Unnamed: 0,survival_prob
count,133690.0
mean,0.6977414
std,0.3162008
min,3.74917e-49
25%,0.5166298
50%,0.8402659
75%,0.9475724
max,0.9965548


In [45]:
# Monotonic decrease per customer
(
    person_period_df
    .groupby("Customer ID")["survival_prob"]
    .apply(lambda x: x.is_monotonic_decreasing)
    .value_counts()
)


Unnamed: 0_level_0,count
survival_prob,Unnamed: 1_level_1
True,5881


In [46]:
# STEP 5.4 — Define Expected Conditional Revenue

In [47]:
person_period_df["expected_revenue"] = person_period_df["monetary_avg"]


In [48]:
# STEP 5.5 — Choose Discount Factor & Horizon

In [49]:
DISCOUNT_RATE = 0.95   # monthly
MAX_HORIZON = 12       # months


In [50]:
# STEP 5.6 — Compute Expected CLV Contribution per Period

In [51]:
person_period_df = person_period_df[
    person_period_df["time_bin"] < MAX_HORIZON
].copy()

person_period_df["discount"] = DISCOUNT_RATE ** person_period_df["time_bin"]

person_period_df["clv_contribution"] = (
    person_period_df["survival_prob"]
    * person_period_df["expected_revenue"]
    * person_period_df["discount"]
)


In [52]:
# STEP 5.7 — Aggregate to Customer-Level Expected CLV

In [53]:
clv_df = (
    person_period_df.groupby("Customer ID")["clv_contribution"]
    .sum()
    .reset_index()
    .rename(columns={"clv_contribution": "expected_clv"})
)

clv_df.describe()


Unnamed: 0,Customer ID,expected_clv
count,5881.0,5881.0
mean,15314.674205,5772.733497
std,1715.429759,13338.156007
min,12346.0,0.0
25%,13833.0,1291.182677
50%,15313.0,2877.532143
75%,16797.0,6143.483476
max,18287.0,401760.565363


In [54]:
# STEP 5.8 — Sanity & Intuition Checks

In [55]:
# High-frequency customers → higher CLV
clv_df.merge(
    person_period_df[["Customer ID", "frequency"]].drop_duplicates(),
    on="Customer ID"
).corr()


Unnamed: 0,Customer ID,expected_clv,frequency
Customer ID,1.0,0.01994,-0.064267
expected_clv,0.01994,1.0,0.378959
frequency,-0.064267,0.378959,1.0


In [56]:
# Survival decreases over horizon (spot check)
(
    person_period_df
    .groupby("time_bin")["survival_prob"]
    .mean()
    .head(10)
)


Unnamed: 0_level_0,survival_prob
time_bin,Unnamed: 1_level_1
0,0.914805
1,0.900121
2,0.850939
3,0.795134
4,0.738548
5,0.685236
6,0.628153
7,0.568123
8,0.50488
9,0.436036


In [57]:
# STEP 5.9 — Save Phase 5 Artifact

In [58]:
clv_df.to_parquet("phase5_expected_clv.parquet", index=False)
