PHASE 7
Counterfactual Uplift & Policy Evaluation

In [39]:
# STEP 7.1 — Load Phase 6 Artifacts

In [40]:
import pandas as pd
import numpy as np

clv_df = pd.read_parquet("phase5_expected_clv.parquet")
state_df = pd.read_parquet("phase2_customer_state.parquet")

# Use latest state per customer
latest_state = (
    state_df.sort_values("InvoiceDate")
            .groupby("Customer ID")
            .tail(1)
            .reset_index(drop=True)
)

df = clv_df.merge(latest_state, on="Customer ID")
df.head()


Unnamed: 0,Customer ID,expected_clv,InvoiceDate,recency_days,frequency,monetary_avg,delta_revenue,delta_recency
0,12346.0,28353.198765,2011-01-18 10:01:00,203.0,11,6463.038333,77041.29,85.0
1,12347.0,5959.262516,2011-12-07 15:52:00,37.0,7,704.165,-1069.5,-53.0
2,12348.0,5518.680179,2011-09-25 13:13:00,173.0,4,403.88,-57.0,103.0
3,12349.0,3631.677336,2011-11-21 09:51:00,389.0,3,1107.1725,354.93,227.0
4,12350.0,2648.932017,2011-02-02 16:01:00,,0,334.4,,


In [41]:
# STEP 7.2 — Define Treatment & Outcome (Synthetic but Defensible)

Treatment

T = 1 → retention action

T = 0 → no action

In [42]:
# STEP 7.3 — Construct Proxy Treatment Effect

In [43]:
# We simulate heterogeneous uplift using behavior


np.random.seed(42)

df["treatment"] = np.random.binomial(1, 0.5, size=len(df))


In [44]:
# Customers with high recency + low frequency benefit more
base_risk = (
    0.6 * (df["recency_days"] / df["recency_days"].max())
    - 0.4 * (df["frequency"] / df["frequency"].max())
)

treatment_effect = 0.2 * (1 - base_risk)

df["outcome"] = (
    1 - base_risk
    + df["treatment"] * treatment_effect
    + np.random.normal(0, 0.02, size=len(df))
)


In [45]:
# STEP 7.3.1 — Impute state variables before outcome generation

In [46]:
from sklearn.impute import SimpleImputer

state_features = [
    "recency_days",
    "frequency",
    "monetary_avg",
    "delta_revenue",
    "delta_recency"
]

imputer = SimpleImputer(strategy="median")
df[state_features] = imputer.fit_transform(df[state_features])


In [47]:
base_risk = (
    0.6 * (df["recency_days"] / df["recency_days"].max())
    - 0.4 * (df["frequency"] / df["frequency"].max())
)

treatment_effect = 0.2 * (1 - base_risk)

df["outcome"] = (
    1 - base_risk
    + df["treatment"] * treatment_effect
    + np.random.normal(0, 0.02, size=len(df))
)


In [48]:
df["outcome"].isna().sum()


np.int64(0)

In [49]:
treated = df[df["treatment"] == 1]
control = df[df["treatment"] == 0]

model_t.fit(treated[features], treated["outcome"])
model_c.fit(control[features], control["outcome"])


In [50]:
# STEP 7.4 — Uplift Modeling (Two-Model Approach)

In [51]:
# Split treated / control
features = [
    "recency_days",
    "frequency",
    "monetary_avg",
    "delta_revenue",
    "delta_recency"
]

treated = df[df["treatment"] == 1]
control = df[df["treatment"] == 0]


In [52]:
# Fit outcome models
from sklearn.ensemble import RandomForestRegressor

model_t = RandomForestRegressor(
    n_estimators=100,
    max_depth=6,
    random_state=42
)

model_c = RandomForestRegressor(
    n_estimators=100,
    max_depth=6,
    random_state=42
)

model_t.fit(treated[features], treated["outcome"])
model_c.fit(control[features], control["outcome"])


In [53]:
# Predict counterfactuals
mu_1 = model_t.predict(df[features])
mu_0 = model_c.predict(df[features])

df["uplift"] = mu_1 - mu_0
df["uplift"].describe()


Unnamed: 0,uplift
count,5881.0
mean,0.1849
std,0.020072
min,0.035772
25%,0.182714
50%,0.191182
75%,0.196223
max,0.358401


In [54]:
# STEP 7.5 — Convert Uplift → Incremental CLV (KEY STEP)

In [55]:
df["incremental_clv"] = df["uplift"] * df["expected_clv"]


In [56]:
# STEP 7.6 — Decision Optimization (Re-run with True Uplift)

In [57]:
ACTION_COST = 100
TOTAL_BUDGET = 50000
K = TOTAL_BUDGET // ACTION_COST

decision_uplift = (
    df.sort_values("incremental_clv", ascending=False)
      .head(K)
)

uplift_value = decision_uplift["incremental_clv"].sum()
uplift_value


np.float64(2997894.3647211646)

In [58]:
 # STEP 7.7 — Policy Evaluation vs Heuristics (MANDATORY)

In [59]:
# Heuristic 1 — Frequency only
freq_policy = (
    df.sort_values("frequency", ascending=False)
      .head(K)
)

freq_value = (
    freq_policy["uplift"] * freq_policy["expected_clv"]
).sum()


In [60]:
# Heuristic 2 — CLV only
clv_policy = (
    df.sort_values("expected_clv", ascending=False)
      .head(K)
)

clv_value = (
    clv_policy["uplift"] * clv_policy["expected_clv"]
).sum()


In [61]:
# Comparison table
comparison = pd.DataFrame({
    "Policy": ["Uplift-Optimized (CLV 4.0)", "CLV Only", "Frequency Only"],
    "Total Incremental Value": [uplift_value, clv_value, freq_value]
})

comparison


Unnamed: 0,Policy,Total Incremental Value
0,Uplift-Optimized (CLV 4.0),2997894.0
1,CLV Only,2989997.0
2,Frequency Only,1989192.0
