In [0]:
# Read the service-principal credentials and the storage account name from
# the "local-scope" secret scope. Do NOT print or display these values.
secret_scope = "local-scope"

client_id = dbutils.secrets.get(scope=secret_scope, key="sp-client-id")
tenant_id = dbutils.secrets.get(scope=secret_scope, key="sp-tenant-id")
client_secret = dbutils.secrets.get(scope=secret_scope, key="sp-client-secret")
storage_account = dbutils.secrets.get(scope=secret_scope, key="storage-account-name")






In [0]:
# Configure OAuth (service principal) for ADLS Gen2.
# Every setting is keyed by the storage account's DFS host name, so the
# settings are listed once and applied in order by the loop below.
adls_host = f"{storage_account}.dfs.core.windows.net"
oauth_settings = [
    ("fs.azure.account.auth.type", "OAuth"),
    ("fs.azure.account.oauth.provider.type",
     "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider"),
    ("fs.azure.account.oauth2.client.id", client_id),
    ("fs.azure.account.oauth2.client.secret", client_secret),
    ("fs.azure.account.oauth2.client.endpoint",
     f"https://login.microsoftonline.com/{tenant_id}/oauth2/token"),
]
for conf_prefix, conf_value in oauth_settings:
    spark.conf.set(f"{conf_prefix}.{adls_host}", conf_value)

In [0]:
# abfss:// location of the dataset inside the "raw" container.
# NOTE(review): the trailing slash suggests dataset.csv is a directory of
# part files rather than a single file — confirm against the listing.
raw_path = "abfss://raw@" + storage_account + ".dfs.core.windows.net/dataset.csv/"

# List the entries found at that location and render them as a table.
raw_listing = dbutils.fs.ls(raw_path)
display(raw_listing)

In [0]:
# Source to read. Adjust if your file name differs (e.g. "criteo_uplift.csv").
src = raw_path

# Load the CSV: the first row is the header, and Spark infers column types.
df_raw = spark.read.csv(src, header=True, inferSchema=True)

# Report the size of the raw frame, then preview the first ten rows.
n_rows = df_raw.count()
n_cols = len(df_raw.columns)
print("Row count:", n_rows)
print("Column count:", n_cols)
display(df_raw.limit(10))


In [0]:
# Lower-case the raw column names so the matching below ignores case.
cols = list(map(str.lower, df_raw.columns))

# Names that count as label / treatment candidates when present in the data.
known_label_names = ("conversion", "response", "visit", "label", "click")
known_treat_names = ("treatment", "exposed", "exposure", "test_group")

# Keep the matches in the same order as the columns appear in df_raw.
candidate_labels = [name for name in cols if name in known_label_names]
candidate_treat = [name for name in cols if name in known_treat_names]

print("Label candidates:", candidate_labels)
print("Treatment candidates:", candidate_treat)


In [0]:
from pyspark.sql.functions import col

LABEL_COL = "conversion"  # <-- change to the actual label name found above
TREAT_COL = "treatment"   # <-- if present; else set to None

df = df_raw.select(*df_raw.columns)  # start from raw

# Validate LABEL_COL case-insensitively on BOTH sides. Comparing the raw
# LABEL_COL against lower-cased column names would wrongly reject a
# correctly-cased name such as "Conversion".
columns_by_lower = {c.lower(): c for c in df.columns}
if not isinstance(LABEL_COL, str) or LABEL_COL.lower() not in columns_by_lower:
    raise ValueError("Set LABEL_COL to an existing label column name.")

# Use the column name exactly as it appears in the frame, so the lookup
# below does not depend on Spark's case-sensitivity setting.
label_name = columns_by_lower[LABEL_COL.lower()]

# Basic EDA: class balance.
# Each row becomes 1 when the label cast to int equals 1, 0 when it does not,
# and null when the label is null; avg skips nulls.
pos_rate = (
    df.select((col(label_name).cast("int") == 1).cast("int").alias("pos"))
      .agg({"pos": "avg"})
      .collect()[0][0]
)

# avg yields None when there is nothing to average (empty frame or all-null
# labels); formatting None with ":.4f" would raise TypeError.
if pos_rate is None:
    print("Positive rate (approx): n/a (no rows with a usable label value)")
else:
    print(f"Positive rate (approx): {pos_rate:.4f}")
