In [0]:
# Step 1: Name of the ADLS storage account that holds the data
# (change it if your account is named differently)
storage_account = "stcampaigntp"

# Step 2: Register the account key with Spark. The key is fetched from a secret
# scope at call time and passed straight through, so it is never kept in a variable.
account_key_conf = f"fs.azure.account.key.{storage_account}.dfs.core.windows.net"
spark.conf.set(
    account_key_conf,
    dbutils.secrets.get(scope="local-scope", key="storage-account-key"),
)


In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.types import IntegerType

# Reuse the active Spark session, or start one if none exists yet
spark = SparkSession.builder.getOrCreate()

# Read the feature-enhanced dataset from the "curated" container
curated_features_path = (
    f"abfss://curated@{storage_account}.dfs.core.windows.net/criteo-1m-features"
)
df = spark.read.parquet(curated_features_path)

# Print the schema as a quick check of the loaded columns
df.printSchema()


In [0]:
from pyspark.sql.functions import when, col

# Add two label columns derived from (treatment, conversion).
#
# uplift_label – 4-class code, used further down to stratify the train/test split:
#     0 = control, no conversion      1 = control, conversion
#     2 = treated, no conversion      3 = treated, conversion
#   A row matching none of the four branches gets NULL (there is no otherwise()).
#
# uplift_binary_label – class-transformation target:
#     1 when the row is (treated & converted) or (control & not converted),
#     0 otherwise.
#   It used to be 1 for treated converters only, which is not the class
#   transformation this notebook describes and implements elsewhere.
is_treated = col("treatment") == 1
is_control = col("treatment") == 0
converted = col("conversion") == 1
not_converted = col("conversion") == 0

df = df.withColumn(
    "uplift_label",
    when(is_control & not_converted, 0)
    .when(is_control & converted, 1)
    .when(is_treated & not_converted, 2)
    .when(is_treated & converted, 3)
)

df = df.withColumn(
    "uplift_binary_label",
    when((is_treated & converted) | (is_control & not_converted), 1).otherwise(0)
)

# Confirm added columns
df.select("treatment", "conversion", "uplift_label", "uplift_binary_label").show(5)


In [0]:
# ⚠️ toPandas() collects the whole dataset into a single pandas frame.
# If it is too large, sample first, e.g.: df = df.sample(fraction=0.2)

# Spark DataFrame -> pandas DataFrame
pandas_df = df.toPandas()
print(f"✅ Converted to pandas with shape: {pandas_df.shape}")

# Class balance of the column the split is stratified on
strata = pandas_df['uplift_label']
print("\n🎯 Value counts of uplift_label:")
print(strata.value_counts())

# 70/30 train/test split that keeps the uplift_label proportions in both parts
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(
    pandas_df,
    stratify=strata,
    test_size=0.3,
    random_state=42,
)

print(f"\n✅ Train shape: {train_df.shape} | Test shape: {test_df.shape}")


In [0]:
# Feature columns used for modeling: I1 … I12
feature_cols = [f"I{i}" for i in range(1, 13)]

# Fail fast when the training frame lacks any of them
missing = [name for name in feature_cols if name not in train_df.columns]
if missing:
    raise ValueError(f"Missing expected feature columns: {missing}")

print("✅ Feature columns defined:", feature_cols)


In [0]:
from sklearn.model_selection import train_test_split

# 80/20 split of the training data into train and validation parts.
# Stratified on uplift_label, like the train/test split, so that every
# treatment × conversion group keeps its share in both parts (an unstratified
# split can shift the proportions of small groups by chance).
# NOTE: this cell rebinds train_df; running it a second time without re-running
# the train/test split shrinks the training set again.
train_df, valid_df = train_test_split(
    train_df,
    test_size=0.2,
    random_state=42,
    stratify=train_df['uplift_label'],
)

print(f"✅ Train set shape: {train_df.shape}")
print(f"✅ Validation set shape: {valid_df.shape}")


In [0]:
import numpy as np

# 🟩 Class transformation label
# Z = 1 when the row is (treated & converted) or (control & not converted),
# Z = 0 otherwise. Every row receives a label, so the class-transformation
# model is trained on the whole population. The previous version left Z as NaN
# for all non-converters, which reduced the task to "was this converter treated?".
def _class_transformation_label(frame):
    """Return the class-transformation target for `frame` as a 0/1 integer Series.

    Reads the `treatment` and `conversion` columns; the result shares the
    index of `frame`.
    """
    treated = frame['treatment'] == 1
    control = frame['treatment'] == 0
    converted = frame['conversion'] == 1
    not_converted = frame['conversion'] == 0
    return ((treated & converted) | (control & not_converted)).astype(int)


# assign() builds a new frame instead of writing a column into the object
# returned by train_test_split (which can trigger SettingWithCopyWarning).
train_df = train_df.assign(uplift_binary_label=_class_transformation_label(train_df))
valid_df = valid_df.assign(uplift_binary_label=_class_transformation_label(valid_df))

# 🟦 Two-Model Approach: split treated and control groups
train_treated = train_df[train_df['treatment'] == 1]
train_control = train_df[train_df['treatment'] == 0]

valid_treated = valid_df[valid_df['treatment'] == 1]
valid_control = valid_df[valid_df['treatment'] == 0]

# 🧪 Diagnostic printout
print(f"📦 Prepared modeling datasets:")
print(f"Two-Model: Treated → {train_treated.shape} | Control → {train_control.shape}")
print(f"Class Transformation: {train_df[['uplift_binary_label']].dropna().shape}")


In [0]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# One seed shared by every estimator, for reproducibility
seed = 42

# ---------------------------------------------------------------
# Two-Model approach: one conversion model per group
#   *_treat   is fitted on treated rows
#   *_control is fitted on control rows
# ---------------------------------------------------------------
logit_treat = LogisticRegression(max_iter=1000, random_state=seed)
logit_control = LogisticRegression(max_iter=1000, random_state=seed)
rf_treat = RandomForestClassifier(random_state=seed, n_estimators=100)
rf_control = RandomForestClassifier(random_state=seed, n_estimators=100)

two_model_fits = (
    (logit_treat, train_treated),
    (logit_control, train_control),
    (rf_treat, train_treated),
    (rf_control, train_control),
)
for estimator, group_df in two_model_fits:
    estimator.fit(group_df[feature_cols], group_df['conversion'])

# ---------------------------------------------------------------
# Class Transformation approach: one model on the transformed label
# (rows without a transformed label are dropped first)
# ---------------------------------------------------------------
train_ct = train_df.dropna(subset=['uplift_binary_label'])

logit_ct = LogisticRegression(max_iter=1000, random_state=seed)
rf_ct = RandomForestClassifier(random_state=seed, n_estimators=100)

for estimator in (logit_ct, rf_ct):
    estimator.fit(train_ct[feature_cols], train_ct['uplift_binary_label'])

print("✅ All models trained successfully!")


In [0]:
# Evaluation set: the held-out test split.
# This cell used to score train_df, i.e. the very rows the models were fitted
# on, which makes the Qini curves and scores below in-sample and optimistic.
X_test = test_df[feature_cols]
T_test = test_df['treatment']
Y_test = test_df['conversion']

# -------------------------------
# 🟩 Logistic Regression – Two-Model
# uplift = P(convert | treated model) - P(convert | control model)
# -------------------------------
p_treat_logit = logit_treat.predict_proba(X_test)[:, 1]
p_control_logit = logit_control.predict_proba(X_test)[:, 1]
uplift_logit_twomodel = p_treat_logit - p_control_logit

# -------------------------------
# 🟩 Logistic Regression – Class Transformation
# the probability of the positive transformed class is used as the uplift score
# -------------------------------
uplift_logit_ct = logit_ct.predict_proba(X_test)[:, 1]

# -------------------------------
# 🟦 Random Forest – Two-Model
# -------------------------------
p_treat_rf = rf_treat.predict_proba(X_test)[:, 1]
p_control_rf = rf_control.predict_proba(X_test)[:, 1]
uplift_rf_twomodel = p_treat_rf - p_control_rf

# -------------------------------
# 🟦 Random Forest – Class Transformation
# -------------------------------
uplift_rf_ct = rf_ct.predict_proba(X_test)[:, 1]

print("✅ Uplift scores predicted for all models.")


In [0]:
from sklift.metrics import qini_curve
from sklift.viz import plot_qini_curve
import matplotlib.pyplot as plt

# Uplift scores per model, keyed by the name shown in plots and reports
uplift_scores = {
    "LR (Two-Model)": uplift_logit_twomodel,
    "LR (Class Trans.)": uplift_logit_ct,
    "RF (Two-Model)": uplift_rf_twomodel,
    "RF (Class Trans.)": uplift_rf_ct
}

# One shared axes for all models. Without an explicit `ax`, plot_qini_curve
# draws each curve on a figure of its own and the figure prepared here stays empty.
# NOTE(review): relies on the `ax`, `random` and `perfect` parameters of
# sklift.viz.plot_qini_curve — confirm against the installed scikit-uplift version.
fig, ax = plt.subplots(figsize=(10, 6))

# Plot each model’s Qini curve; the random/perfect baselines are drawn once only
for position, (model_name, uplift) in enumerate(uplift_scores.items()):
    draw_baselines = position == 0
    plot_qini_curve(
        Y_test.values,
        uplift,
        T_test.values,
        random=draw_baselines,
        perfect=draw_baselines,
        ax=ax,
        label=model_name,
    )

# Final plot touches — applied after plotting so they are not overwritten
ax.set_title("🎯 Qini Curves – Uplift Model Comparison", fontsize=14)
ax.legend()
ax.set_xlabel("Share of Population Targeted (%)")
ax.set_ylabel("Incremental Conversions (Qini)")
ax.grid(True)
fig.tight_layout()
plt.show()


In [0]:
from sklift.metrics import qini_auc_score

# Qini AUC for every model, keyed by model name
qini_scores = {
    model_name: qini_auc_score(y_true=Y_test.values, uplift=uplift, treatment=T_test.values)
    for model_name, uplift in uplift_scores.items()
}

# Report the scores, best model first.
# The loop variables `model` and `score` stay bound after the loop.
print("📊 Qini Coefficients (Area Under Qini Curve):")
ranked_scores = sorted(qini_scores.items(), key=lambda item: item[1], reverse=True)
for model, score in ranked_scores:
    print(f"{model}: {score:.4f}")


In [0]:
import numpy as np
import pandas as pd

def simulate_gain_curve(y_true, treatment, uplift_scores, model_name):
    """Observed conversion-rate lift among the top-scored share of the population.

    Rows are ranked by `uplift_scores` (highest first). For each share
    10%, 20%, …, 100% the top rows are selected and the lift is computed as
    (conversion rate of treated rows) - (conversion rate of control rows);
    a group with no rows in the selection contributes a rate of 0.

    Parameters:
        y_true: conversion outcome per row.
        treatment: treatment flag per row (1 = treated, 0 = control).
        uplift_scores: model score per row, used only for ranking.
        model_name: accepted for the caller's convenience; not used here.

    Returns:
        (percents, lifts): the array of shares (0.1 … 1.0) and a list with
        one lift value per share.
    """
    ranked = (
        pd.DataFrame({
            'treatment': treatment,
            'conversion': y_true,
            'uplift_score': uplift_scores,
        })
        .sort_values(by='uplift_score', ascending=False)
        .reset_index(drop=True)
    )
    n_rows = len(ranked)

    def _conversion_rate(group):
        # Mean conversion of the group, or 0 when the group has no rows
        return group['conversion'].mean() if not group.empty else 0

    # Shares of the population to evaluate, in 10% steps
    percents = np.arange(0.1, 1.1, 0.1)

    lifts = []
    for share in percents:
        top_rows = ranked.iloc[:int(share * n_rows)]
        treated_rate = _conversion_rate(top_rows[top_rows['treatment'] == 1])
        control_rate = _conversion_rate(top_rows[top_rows['treatment'] == 0])
        lifts.append(treated_rate - control_rate)

    return percents, lifts

# One lift curve per model on a shared axes
fig, ax = plt.subplots(figsize=(10, 6))
for model_name, uplift_scores_model in uplift_scores.items():
    x, y = simulate_gain_curve(Y_test.values, T_test.values, uplift_scores_model, model_name)
    # x holds shares in 0.1 … 1.0; scale to percent for the axis
    ax.plot(x * 100, y, label=model_name)

ax.set_title("📈 Simulated Incremental Lift vs % Targeted")
ax.set_xlabel("% of Population Targeted (Top Uplift Score)")
ax.set_ylabel("Incremental Conversion Lift")
ax.legend()
ax.grid(True)
plt.show()


### Step 4: Train/Test Split using Uplift Labels

In uplift modeling, it’s essential that the relationship between treatment and outcome is preserved in both the training and testing data.

To do this, we use stratified splitting based on the uplift_label that we previously created:

0: Control group – did not convert

1: Control group – converted

2: Treated group – did not convert

3: Treated group – converted

By stratifying on this label, we maintain the same distribution of user behavior across training and test sets. This ensures our model learns and is evaluated on balanced group proportions.

In [0]:
from sklearn.model_selection import train_test_split

# Confirm label column
assert 'uplift_label' in pandas_df.columns, "uplift_label column is missing."

# Feature columns = every column that is not the treatment flag, the outcome
# or a derived label. The previous filter, startswith('feature_'), matches none
# of the I1 … I12 feature columns this notebook models on, which left X with
# only treatment and conversion.
# NOTE(review): any other non-feature column present in pandas_df (ids, extra
# outcomes) must be added to this list — confirm against the printed schema.
non_feature_cols = ['treatment', 'conversion', 'uplift_label', 'uplift_binary_label']
feature_cols = [name for name in pandas_df.columns if name not in non_feature_cols]
assert feature_cols, "No feature columns found in pandas_df."

# Keep treatment and conversion columns as we need them later
X = pandas_df[feature_cols + ['treatment', 'conversion']]
y = pandas_df['uplift_label']

# Stratified Train/Test Split — preserves uplift label distribution
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    stratify=y,
    random_state=42
)

# Merge back uplift_label for analysis (assign() returns new frames, so
# X_train / X_test stay free of the label column)
train_df = X_train.assign(uplift_label=y_train)
test_df = X_test.assign(uplift_label=y_test)

print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)

# Optional: visualise how the four uplift classes are distributed
def plot_uplift_distribution(df, title):
    """Draw a bar chart with the number of rows of `df` in each uplift_label class.

    Parameters:
        df: frame with an `uplift_label` column holding the codes 0-3.
        title: chart title.
    """
    readable_names = {
        0: 'Control - No Conversion',
        1: 'Control - Conversion',
        2: 'Treated - No Conversion',
        3: 'Treated - Conversion'
    }

    # Count rows per class, ordered by class code, then swap codes for names
    class_counts = df['uplift_label'].value_counts().sort_index()
    class_counts.index = class_counts.index.map(readable_names)

    plt.figure(figsize=(10, 5))
    rects = plt.bar(class_counts.index, class_counts.values)
    plt.title(title)
    plt.ylabel("Count")

    # Write the count just above each bar, horizontally centred
    for rect in rects:
        top = rect.get_height()
        centre_x = rect.get_x() + rect.get_width() / 2
        plt.annotate(
            f'{top:,}',
            xy=(centre_x, top),
            xytext=(0, 3),
            textcoords="offset points",
            ha='center',
            va='bottom',
        )

    plt.xticks(rotation=20)
    plt.tight_layout()
    plt.show()

# Same chart for both splits, to compare their class balance side by side
for split_df, chart_title in (
    (train_df, "Uplift Label Distribution - Training Set"),
    (test_df, "Uplift Label Distribution - Testing Set"),
):
    plot_uplift_distribution(split_df, chart_title)


In [0]:
# Quick look at the collected frame: column names, shape, then the first rows
print(pandas_df.columns.tolist(), pandas_df.shape, sep="\n")
pandas_df.head()


In [0]:
# 🛠️ Fix: derive the feature list from the data itself — every column except
# the treatment flag, the outcome and the derived labels
exclude_cols = ['treatment', 'conversion', 'uplift_label', 'uplift_binary_label']
excluded = set(exclude_cols)
feature_cols = [name for name in pandas_df.columns if name not in excluded]

print("✅ Final feature columns used:", feature_cols)


In [0]:
# Column names currently present in the training frame
print(list(train_df.columns))


In uplift modeling, we don’t just predict conversion — we try to predict the incremental effect of treatment.

There are several modeling strategies. We’ll focus on two common ones (just like the Kaggle reference):

## 1. Two-Model Approach

We train:

One model on the treatment group

Another model on the control group

Then we estimate uplift by:

Uplift(x) = P(convert | treated, x) - P(convert | control, x)

## 2. Class Transformation Approach

This converts the dataset into a binary classification problem using this rule:

| Treatment | Conversion | Transformed Label   |
|-----------|------------|---------------------|
| 1         | 1          | 1 (Positive uplift) |
| 0         | 0          | 1 (Positive uplift) |
| 1         | 0          | 0 (Negative uplift) |
| 0         | 1          | 0 (Negative uplift) |

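We can then use any binary classifier (e.g., LightGBM, Logistic Regression) to predict the transformed label. When the treatment and control groups are the same size, uplift equals $2 \cdot P(\text{transformed label} = 1 \mid x) - 1$, so the predicted probability can be used directly to rank customers by uplift; with unbalanced groups the rows should be reweighted first.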

--------------------------------------------------------------

### Step 5: Model Preparation for Uplift Modeling

Now that we have stratified training and test sets, we will structure the data for uplift modeling using two approaches:

## Two-Model Strategy

Train two separate models: one on treated users, another on control users.

At prediction time, compute uplift as the difference between predicted probabilities.

## Class Transformation Strategy

Convert treatment + outcome combinations into a binary label.

This allows training a single binary classifier to directly learn uplift behavior.

Both are widely used in industry and research. We’ll prepare the data for both.

In [0]:
# Make sure treatment and conversion are stored as integers in both splits
train_df = train_df.astype({'treatment': int, 'conversion': int})
test_df = test_df.astype({'treatment': int, 'conversion': int})

# 📌 Prepare for Two-Model Approach
# Derived label columns must never reach the feature matrices. Both are dropped
# here (errors='ignore' covers the case where one is absent): uplift_binary_label
# is added to train_df further down in this cell, so a re-run would otherwise
# feed it to the models as a feature.
# The drop is done without a `for df in ...` loop, which used to rebind `df`
# (the Spark DataFrame loaded at the top of the notebook) to a pandas frame.
label_cols = ['uplift_label', 'uplift_binary_label']

train_treated = train_df[train_df['treatment'] == 1].drop(columns=label_cols, errors='ignore')
train_control = train_df[train_df['treatment'] == 0].drop(columns=label_cols, errors='ignore')

# Separate features and labels
X_treated = train_treated.drop(columns=['conversion', 'treatment'])
y_treated = train_treated['conversion']

X_control = train_control.drop(columns=['conversion', 'treatment'])
y_control = train_control['conversion']

# ✅ Ready for Two-Model training

# 📌 Prepare for Class Transformation Strategy
def transform_classification_label(treatment, conversion):
    """Return the class-transformation label for one row.

    1 when the row is (treated and converted) or (control and not converted),
    0 for every other combination.
    """
    if treatment == 1 and conversion == 1:
        return 1
    if treatment == 0 and conversion == 0:
        return 1
    return 0

# Class-transformation target, computed column-wise instead of with a
# row-by-row apply(): 1 for (treated & converted) or (control & not converted),
# 0 otherwise.
treated_converted = (train_df['treatment'] == 1) & (train_df['conversion'] == 1)
control_not_converted = (train_df['treatment'] == 0) & (train_df['conversion'] == 0)
train_df = train_df.assign(
    uplift_binary_label=(treated_converted | control_not_converted).astype(int)
)

# Features for the class-transformation model. `treatment` is dropped together
# with the outcome and label columns: the target is built from it, and the
# two-model matrices exclude it as well.
X_class_trans = train_df.drop(
    columns=['treatment', 'conversion', 'uplift_label', 'uplift_binary_label']
)
y_class_trans = train_df['uplift_binary_label']

# ✅ Ready for Class Transformation training

print("Prepared datasets:")
print("Two-Model: Treated →", X_treated.shape, "| Control →", X_control.shape)
print("Class Transformation:", X_class_trans.shape)


In [0]:
# Log the Qini AUC of every model compared in this notebook to MLflow.
#
# This cell previously logged `score` and inspected `model.__module__`. Both
# names are only bound as loop variables of the Qini report (a float and a
# model *name* string), so the attribute lookup failed; the metric name also
# contained '%', and the tags described an XGBoost model this notebook does
# not train.
import mlflow


def _metric_key(model_name):
    """Build an MLflow metric key from a display name such as "LR (Two-Model)".

    Every character that is not alphanumeric or '-' becomes '_', and runs of
    '_' are collapsed, e.g. "LR (Two-Model)" -> "qini_auc_LR_Two-Model".
    """
    cleaned = "".join(ch if ch.isalnum() or ch == "-" else "_" for ch in model_name)
    return "qini_auc_" + "_".join(part for part in cleaned.split("_") if part)


with mlflow.start_run(run_name="uplift-model-comparison"):
    for display_name, qini_value in qini_scores.items():
        mlflow.log_metric(_metric_key(display_name), float(qini_value))
    mlflow.set_tag("model_type", "LogisticRegression, RandomForestClassifier")
    mlflow.set_tag("framework", "scikit-learn")
    print("MLflow logging done.")
