In [2]:
bureau_raw_df = spark.read.format("csv").option("header","true").load("Files/raw_data/bureau_customer_raw.csv")
# df now is a Spark DataFrame containing CSV data from "Files/raw_data/bureau_customer_raw.csv".
display(bureau_raw_df)

StatementMeta(, c936e209-3e79-4488-b065-c7bdeef1c631, 4, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 779f8eb3-5018-4d09-84d8-dbad426bdeab)

In [9]:
from pyspark.sql.functions import sha2, col, concat_ws

bureau_tokenized_df = bureau_raw_df.withColumn("TokenizedID", sha2(concat_ws("SSN", "Name"), 256)) \
                             .select("TokenizedID", "CreditScore", "NumDefaults", "InquiriesLast6Mo")

StatementMeta(, c936e209-3e79-4488-b065-c7bdeef1c631, 11, Finished, Available, Finished)

In [10]:
display(bureau_tokenized_df)

StatementMeta(, c936e209-3e79-4488-b065-c7bdeef1c631, 12, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, e33d004c-2c1a-4b10-bd06-4b05a579b905)

## Differential Privacy on Credit Bureau Data

In [8]:
from pyspark.sql.functions import col, udf
from pyspark.sql.types import DoubleType
import numpy as np

# Parameters for Differential Privacy
epsilon = 1.0  # privacy budget (lower = more private)
sensitivity = {
    "CreditScore": 50,      # assumed max change if one row changes
    "NumDefaults": 1,
    "InquiriesLast6Mo": 2
}

# Laplace mechanism
def add_laplace_noise(value, sensitivity, epsilon):
    scale = sensitivity / epsilon
    noise = np.random.laplace(0, scale)
    return float(value) + noise

StatementMeta(, c936e209-3e79-4488-b065-c7bdeef1c631, 10, Finished, Available, Finished)

In [11]:
# UDFs for noisy columns
laplace_udf_score = udf(lambda x: add_laplace_noise(x, sensitivity["CreditScore"], epsilon), DoubleType())
laplace_udf_defaults = udf(lambda x: add_laplace_noise(x, sensitivity["NumDefaults"], epsilon), DoubleType())
laplace_udf_inquiries = udf(lambda x: add_laplace_noise(x, sensitivity["InquiriesLast6Mo"], epsilon), DoubleType())

# Apply differential privacy
dp_df = bureau_tokenized_df.withColumn("DP_CreditScore", laplace_udf_score(col("CreditScore"))) \
                 .withColumn("DP_NumDefaults", laplace_udf_defaults(col("NumDefaults"))) \
                 .withColumn("DP_InquiriesLast6Mo", laplace_udf_inquiries(col("InquiriesLast6Mo")))

StatementMeta(, c936e209-3e79-4488-b065-c7bdeef1c631, 13, Finished, Available, Finished)

In [12]:
# Drop original sensitive columns (optional)
dp_sanitized_df = dp_df.drop("CreditScore", "NumDefaults", "InquiriesLast6Mo")

StatementMeta(, c936e209-3e79-4488-b065-c7bdeef1c631, 14, Finished, Available, Finished)

In [13]:
# Save or use this as a clean room view
dp_sanitized_df.show()

StatementMeta(, c936e209-3e79-4488-b065-c7bdeef1c631, 15, Finished, Available, Finished)

+--------------------+------------------+--------------------+-------------------+
|         TokenizedID|    DP_CreditScore|      DP_NumDefaults|DP_InquiriesLast6Mo|
+--------------------+------------------+--------------------+-------------------+
|79b2e757d7c0df9fa...|488.62546811254845|   2.992230968348187|-0.7924719895250356|
|1561be5562449fd1f...| 623.3407989324556|  0.4443780952995523|  4.410320721796793|
|fbcd1ad1239c94c83...| 728.6089369741393| -1.2515634510707552| 10.175557366732889|
|8207b3040e516aea8...|   696.68456744055|   1.893537222128773|  7.836413586094627|
|63cbeac1b8212f5ad...| 668.5918631816483|  0.7794475686714584| 3.0016266411103514|
|ca3d1c648666bea2b...| 660.0865139288923|    3.39190393952105| 0.8732418328047649|
|6ef7257c45a2b57f0...| 633.6302353076231|  1.8297785438325174| 0.2562770937510682|
|4e1de4ce054d8ca03...| 553.4532136366339|   3.060041657531441|  6.706110810102823|
|42d11be07de98435f...|  650.081881385375|  2.3331769501971498| -6.748022290618345|
|165

In [14]:
dp_sanitized_df.write.mode("overwrite").saveAsTable("dp_creditscore_data")

StatementMeta(, c936e209-3e79-4488-b065-c7bdeef1c631, 16, Finished, Available, Finished)

In [15]:
df = spark.sql("SELECT * FROM BureauLH.dp_creditscore_data LIMIT 1000")
display(df)

StatementMeta(, c936e209-3e79-4488-b065-c7bdeef1c631, 17, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, a3ae5460-0155-48c0-8eef-59035900eb82)