🧠 What We’re Doing in This Notebook:

Load the feature-engineered dataset from curated/criteo-1m-features

Split into train/test using stratified sampling on treatment

Train a LightGBM uplift model using scikit-uplift's SoloModel (single-model approach with the treatment-interaction method)

Track everything via MLflow — metrics, parameters, model


In [0]:
# Azure storage account that hosts the data-lake containers
# (change this if your account has a different name).
storage_account = "stcampaigntp"

# Spark setting under which the access key for this storage account is stored.
account_key_setting = f"fs.azure.account.key.{storage_account}.dfs.core.windows.net"

# Give the Spark session access to the account. Nothing is mounted here: the
# key is read from the secret scope at call time and passed straight into the
# Spark configuration, so it is never held in a notebook variable.
spark.conf.set(
    account_key_setting,
    dbutils.secrets.get(scope="local-scope", key="storage-account-key"),
)

# Base abfss:// URI of the `curated` container.
path_prefix = f"abfss://curated@{storage_account}.dfs.core.windows.net/"


In [0]:
import mlflow
import pandas as pd
from pyspark.sql.functions import col
from sklearn.model_selection import train_test_split

# Read the curated, feature-engineered dataset from the data lake.
df = spark.read.parquet(f"abfss://curated@{storage_account}.dfs.core.windows.net/criteo-1m-features")

# Collect to the driver as pandas; the modelling cells below are fed pandas objects.
pdf = df.toPandas()

# Model inputs: every column whose name begins with "f".
features = []
for column_name in pdf.columns:
    if column_name.startswith("f"):
        features.append(column_name)

X = pdf[features]
y = pdf["conversion"]          # outcome label
treatment = pdf["treatment"]   # treatment-assignment flag

# 80/20 hold-out split, stratified on the treatment flag so both partitions
# keep the same treated/control proportions; the seed makes it repeatable.
split_parts = train_test_split(
    X, y, treatment, test_size=0.2, stratify=treatment, random_state=42
)
X_train, X_test, y_train, y_test, t_train, t_test = split_parts


🎯 NEXT STEP: Causal Uplift Modeling with scikit-uplift (sklift)

We’ll now train an uplift model to estimate the individual treatment effect (ITE):

“How likely is this person to convert because they were treated?”

In [0]:
from sklift.models import SoloModel
from lightgbm import LGBMClassifier
from sklift.metrics import uplift_at_k

# Uplift learner: scikit-uplift's SoloModel wrapping a single LightGBM
# classifier, using the 'treatment_interaction' method.
base_classifier = LGBMClassifier(random_state=42)
model = SoloModel(estimator=base_classifier, method='treatment_interaction')

# Train on the training partition; the treatment flag is passed separately
# from the feature matrix.
model.fit(X_train, y_train, treatment=t_train)

# Score the hold-out rows.
uplift_scores = model.predict(X_test)

# Uplift measured on the top 30% of hold-out rows ranked by predicted score.
score = uplift_at_k(y_test, uplift_scores, t_test, strategy='overall', k=0.3)
print(f"Uplift@30%: {score:.4f}")


In [0]:
from sklift.metrics import uplift_at_k

# Uplift among the 30% of hold-out rows with the highest predicted scores.
score = uplift_at_k(
    y_true=y_test,
    uplift=uplift_scores,
    treatment=t_test,
    strategy="overall",
    k=0.3,
)
print(f"Uplift@30%: {score:.4f}")


In [0]:
import pandas as pd

# Per-row scoring table for the hold-out set: predicted uplift next to the
# observed treatment flag and conversion outcome. `.values` strips the pandas
# index so the three columns line up positionally.
scored_columns = {
    "uplift_score": uplift_scores,
    "treatment": t_test.values,
    "conversion": y_test.values,
}
scored_df = pd.DataFrame(scored_columns)

# Spark copy of the same table, ready to be written to storage.
spark_df = spark.createDataFrame(scored_df)


In [0]:
# Persist the scored hold-out rows to the `scored` container as Parquet,
# replacing whatever a previous run left at this location.
scored_output_uri = f"abfss://scored@{storage_account}.dfs.core.windows.net/criteo-1m-scored"
spark_df.write.mode("overwrite").parquet(scored_output_uri)


In [0]:
# Log the evaluation metric, descriptive tags and the fitted model to MLflow.
import mlflow
import mlflow.sklearn

# Derive the run description from the objects that were actually trained so
# the tags cannot drift from the code. The fallbacks keep the cell working if
# `model` is swapped for an estimator without `.estimator` / `.method`.
base_estimator_name = type(getattr(model, "estimator", model)).__name__
framework_name = "causalml" if 'causalml' in model.__module__ else "sklift"

with mlflow.start_run(run_name="uplift-solomodel-lightgbm"):
    # MLflow metric names may only contain alphanumerics, underscores, dashes,
    # periods, spaces and slashes; a '%' in the name is rejected, so the
    # percentage is spelled out.
    mlflow.log_metric("uplift_at_30pct", score)

    mlflow.log_param("uplift_method", getattr(model, "method", "unknown"))

    mlflow.set_tag("model_type", f"{type(model).__name__}({base_estimator_name})")
    mlflow.set_tag("framework", framework_name)

    # Store the fitted model as a run artifact so it can be reloaded or
    # registered later. NOTE(review): no registered-model name is set here;
    # registration remains a separate, explicit step.
    mlflow.sklearn.log_model(model, "model")

    print("MLflow logging done.")


In [0]:
# The 100 hold-out rows with the largest predicted uplift, best first.
ranked_by_score = scored_df.sort_values("uplift_score", ascending=False)
top_100_df = ranked_by_score.iloc[:100]

# Preview the first few rows.
top_100_df.head()


In [0]:
# Persist the top-100 table to the `scored` container as Parquet,
# replacing any previous output at this location.
top100_output_uri = f"abfss://scored@{storage_account}.dfs.core.windows.net/criteo-1m-top100"
top100_spark_df = spark.createDataFrame(top_100_df)
top100_spark_df.write.mode("overwrite").parquet(top100_output_uri)


Step 7: Evaluate Uplift Results & Create Business-Friendly Outputs

We now translate the uplift scores into business-facing numbers and tables. None of these outputs depends on plots, so this step works even if you skip plotting.

In [0]:
# Attach the predicted uplift and the observed treatment/outcome to a copy of
# the hold-out features, leaving X_test itself untouched.
X_test_copy = X_test.copy()
X_test_copy["uplift"] = uplift_scores
X_test_copy["treatment"] = t_test.values
X_test_copy["conversion"] = y_test.values

# Rank rows from highest to lowest predicted uplift.
ranked = X_test_copy.sort_values(by="uplift", ascending=False)

# Top 1% target strategy. Keep at least one row so a hold-out set with fewer
# than 100 rows does not yield an empty group (and NaN rates).
top_1pct = max(1, int(0.01 * len(ranked)))
top_group = ranked.head(top_1pct)

# Conversion rate in the targeted group vs. the whole hold-out set.
conv_top = top_group["conversion"].mean()
conv_all = ranked["conversion"].mean()

# Observed uplift inside the targeted group: treated minus control conversion
# rate. Assumes treatment is coded 1 = treated, 0 = control (TODO confirm
# against the curated dataset). If either arm is absent the result is NaN.
arm_rates = top_group.groupby("treatment")["conversion"].mean()
uplift_top = arm_rates.get(1, float("nan")) - arm_rates.get(0, float("nan"))

print(f"📈 Conversion Rate in Top 1%: {conv_top:.4f}")
print(f"📉 Baseline Conversion Rate: {conv_all:.4f}")
# This gap compares targeted rows with the baseline regardless of treatment,
# so it is a conversion-rate lift rather than a treatment effect.
print(f"📊 Conversion Lift vs Baseline in Top 1%: {conv_top - conv_all:.4f}")
print(f"🎯 Observed Uplift in Top 1% (treated - control): {uplift_top:.4f}")


In [0]:
# Persist the ranked hold-out table to the `curated` container as Parquet,
# replacing any previous output at this location.
uplift_scored_uri = f"abfss://curated@{storage_account}.dfs.core.windows.net/criteo-1m-uplift-scored"

# Spark copy of the ranked pandas frame.
spark_df = spark.createDataFrame(ranked)
spark_df.write.mode("overwrite").parquet(uplift_scored_uri)


In [0]:
# Gap between the targeted group's conversion rate and the baseline. This is a
# conversion-rate lift, not a treated-vs-control uplift, and is labelled as such.
conversion_lift = conv_top - conv_all

# With a zero baseline there is no meaningful relative figure; report NaN
# instead of dividing by zero.
relative_lift_pct = 100 * conversion_lift / conv_all if conv_all else float("nan")

summary = {
    "Top 1% Conversion": conv_top,
    "Overall Conversion": conv_all,
    "Absolute Conversion Lift": conversion_lift,
    "Relative Conversion Lift (%)": relative_lift_pct,
}

# One-row table, rendered as the cell output.
pd.DataFrame([summary])


In [0]:
import pandas as pd

# Final export table: predicted uplift next to the observed outcome and
# treatment flag for every hold-out row (index dropped via `.values`).
export_columns = {
    "uplift_score": uplift_scores,
    "conversion": y_test.values,
    "treatment": t_test.values,
}
uplift_df = pd.DataFrame(export_columns)

# Write to the curated layer as Parquet, replacing any previous output.
uplift_spark_df = spark.createDataFrame(uplift_df)
uplift_spark_df.write.mode("overwrite").parquet(f"abfss://curated@{storage_account}.dfs.core.windows.net/uplift-output/")
