🧠 What We’re Doing in This Notebook:

Load the feature-engineered dataset from curated/criteo-1m-features

Split into train/test using stratified sampling on treatment

Train a LightGBM-based uplift model using scikit-uplift (sklift) — a single-model (SoloModel) approach

Track everything via MLflow — metrics, parameters, model


In [0]:
# Storage account that hosts the project's data-lake containers.
storage_account = "stcampaigntp"  # update if your storage account is different

# Register the account key with this Spark session so abfss:// paths on the
# account can be read and written. The key is fetched from the secret scope
# at run time and passed straight through, so it never appears in the notebook.
account_key_conf = f"fs.azure.account.key.{storage_account}.dfs.core.windows.net"
spark.conf.set(
    account_key_conf,
    dbutils.secrets.get(scope="local-scope", key="storage-account-key"),
)

# Root URI of the "curated" container (ends with "/"); append a dataset name.
path_prefix = f"abfss://curated@{storage_account}.dfs.core.windows.net/"


In [0]:
import mlflow
import pandas as pd
from pyspark.sql.functions import col
from sklearn.model_selection import train_test_split

# Load the curated feature table. The container URI comes from `path_prefix`
# (defined in the storage-setup cell) so it is spelled out in one place only.
df = spark.read.parquet(f"{path_prefix}criteo-1m-features")

# Collect to the driver as pandas: the uplift model trained later in this
# notebook works on in-memory pandas data, not on Spark DataFrames.
pdf = df.toPandas()

# Fail fast with a clear message if the curated schema is not what the rest
# of the notebook expects, instead of erroring deep inside model training.
missing_columns = {"treatment", "conversion"} - set(pdf.columns)
if missing_columns:
    raise KeyError(f"Curated dataset is missing required columns: {sorted(missing_columns)}")

# Feature columns are selected by name prefix: every column starting with "f".
# NOTE(review): any non-feature column that also starts with "f" would be
# picked up here — confirm against the curated schema.
features = [c for c in pdf.columns if c.startswith("f")]
if not features:
    raise ValueError("No feature columns (names starting with 'f') found in the curated dataset")

X = pdf[features]
treatment = pdf["treatment"]
y = pdf["conversion"]

# 80/20 train/test split, stratified on the treatment flag so both splits keep
# the same treated/control ratio; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test, t_train, t_test = train_test_split(
    X, y, treatment, test_size=0.2, stratify=treatment, random_state=42
)


🎯 NEXT STEP: Causal Uplift Modeling with scikit-uplift (sklift)

We’ll now train an uplift model to estimate the individual treatment effect (ITE):

“How much more likely is this person to convert because they were treated?”

In [0]:
from sklift.models import SoloModel
from lightgbm import LGBMClassifier
from sklift.metrics import uplift_at_k

# Base learner wrapped by the uplift model; the seed is fixed for reproducibility.
base_estimator = LGBMClassifier(random_state=42)

# Single-model ("solo") uplift approach: one classifier is fitted with the
# treatment flag supplied alongside the features.
# NOTE(review): 'treatment_interaction' presumably also adds
# feature-by-treatment interaction terms — confirm in the sklift docs.
model = SoloModel(
    estimator=base_estimator,
    method='treatment_interaction',
)

# Fit on the training split.
model.fit(X_train, y_train, treatment=t_train)

# Predicted uplift for every row of the hold-out split.
uplift_scores = model.predict(X_test)

# Headline metric: uplift at the top 30% of the hold-out split.
score = uplift_at_k(y_test, uplift_scores, t_test, strategy='overall', k=0.3)
print(f"Uplift@30%: {score:.4f}")


In [0]:
from sklift.metrics import uplift_at_k

# Recompute uplift@k on the hold-out scores; `score` is read again by the
# MLflow logging cell further down.
top_fraction = 0.3  # share of the hold-out split to evaluate
score = uplift_at_k(
    y_test,
    uplift_scores,
    t_test,
    strategy='overall',
    k=top_fraction,
)
print(f"Uplift@30%: {score:.4f}")


In [0]:
import pandas as pd

# Assemble the hold-out results column by column. `.values` drops the pandas
# index left over from the train/test split, so the three columns line up by
# position with the score array.
scored_columns = {
    "uplift_score": uplift_scores,
    "treatment": t_test.values,
    "conversion": y_test.values,
}
scored_df = pd.DataFrame(scored_columns)

# Hand the result back to Spark so it can be written to the data lake.
spark_df = spark.createDataFrame(scored_df)


In [0]:
# Destination folder inside the "scored" container of the same storage account.
scored_output_path = f"abfss://scored@{storage_account}.dfs.core.windows.net/criteo-1m-scored"

# Overwrite mode replaces any previous output at this path, so re-running the
# notebook does not fail on an existing folder.
scored_writer = spark_df.write.mode("overwrite")
scored_writer.parquet(scored_output_path)


In [0]:
import mlflow

# Describe the run from the fitted model object itself so the MLflow metadata
# cannot drift from what was actually trained (the model is a wrapper around a
# base estimator, not an XGBoost regressor).
wrapper_name = type(model).__name__
base_learner = getattr(model, "estimator", None)
base_learner_name = type(base_learner).__name__ if base_learner is not None else "unknown"
framework = "causalml" if "causalml" in type(model).__module__ else "sklift"

with mlflow.start_run(run_name=f"uplift-{wrapper_name}-{base_learner_name}".lower()):
    # MLflow metric keys may only contain alphanumerics, underscores, dashes,
    # periods, spaces and slashes; the original key contained "%", which is
    # rejected, so the percentage is spelled "pct".
    mlflow.log_metric("uplift_at_30pct", score)
    # Record how the uplift wrapper was configured, when the attribute exists.
    mlflow.log_param("uplift_method", getattr(model, "method", "unknown"))
    mlflow.set_tag("model_type", f"{wrapper_name}({base_learner_name})")
    mlflow.set_tag("framework", framework)
    print("MLflow logging done.")
