In [53]:
from pyspark.sql import SparkSession
from faker import Faker



# Start (or reuse) a Spark session on a "local" master for this notebook.
spark = SparkSession.builder.master("local").appName("anonymise_data").getOrCreate()

# Read the Parquet data under ./data/sample-prod-data/ into a DataFrame.
df = spark.read.load("./data/sample-prod-data/", format="parquet")

In [54]:
# Fraction of rows to sample from the source data; 1.0 is expected to keep
# every row (Spark treats the fraction as a per-row probability, not an
# exact count).
DATA_SAMPLE = 1.0
# Fixed seed so that a run with DATA_SAMPLE < 1.0 selects the same rows each
# time the notebook is re-executed on the same input.
SAMPLE_SEED = 42

df_sampled = df.sample(
    withReplacement=False,
    fraction=DATA_SAMPLE,
    seed=SAMPLE_SEED,
)

# NOTE(review): this prints rows that have not been anonymised yet; clear the
# cell output before sharing the notebook if the source data is sensitive.
df_sampled.show()

+---------+-------------------+----------+---------+-------------------+----------+--------------------+----------------------+----------+
|client_id|         event_date|first_name|last_name|              email|country_cd|      transaction_id|transaction_amount_usd|product_cd|
+---------+-------------------+----------+---------+-------------------+----------+--------------------+----------------------+----------+
|      100|2024-05-03 19:17:53|      Jack|     Paul|jack.paul@gmail.com|        US|5c93a148-13ac-4ed...|                  83.5|    hoodie|
|      100|2024-04-20 10:20:01|      Jack|     Paul|jack.paul@gmail.com|        US|91db632b-cfa9-47c...|                  72.3|  trousers|
|      200|2023-12-03 11:10:42|     Maria|   Sharak|  m.rak@hotmail.com|        ES|83d8bd8d-89ff-488...|                 120.1|    jacket|
|      300|2023-09-07 18:02:29|    Maciej|     Wilk|      djmac@onet.pl|        PL|419fde3d-c243-4b2...|                  25.9|    beanie|
+---------+----------------

In [43]:
fake = Faker()

# Preview one generated value per Faker provider. Providers are called in the
# listed order, and each line is printed as "<label>: <value>".
provider_previews = [
    ("Fake name", fake.name),
    ("Fake adress", fake.address),
    ("Fake email", fake.ascii_email),
    ("Fake bank account", fake.bban),
    ("Fake date of birth", fake.date_of_birth),
    ("Fake phone number", fake.basic_phone_number),
]
for label, make_value in provider_previews:
    print(f"{label}: {make_value()}")

Fake name: Matthew Marshall
Fake adress: 31005 Schmidt Circle
Port Richardville, PR 65626
Fake email: khill@garcia.com
Fake bank account: MIFC30070869587364
Fake date of birth: 1963-02-24
Fake phone number: 7479909080


In [55]:
# Reset Faker's record of already-issued unique values so this cell can be
# re-run on the same kernel without gradually exhausting the value pools
# (which would eventually raise faker's UniquenessException).
fake.unique.clear()

df_masked = df_sampled

# One entry per column to anonymise:
#   col_name  - column whose values are replaced
#   faker_fun - Faker provider called once per distinct original value
#   args      - keyword arguments forwarded to faker_fun
# NOTE(review): client_id and transaction_amount_usd use the same
# fake.unique.random_int call with identical arguments, so they presumably
# draw from one shared pool of 401 unique integers (100..500) - confirm this
# is large enough for the real data volume.
anyonymization_configs = [
    {
        "col_name": "client_id",
        "faker_fun": fake.unique.random_int,
        "args": {"min": 100, "max": 500},
    },
    {
        "col_name": "first_name",
        "faker_fun": fake.unique.first_name,
        "args": {},
    },
    {
        "col_name": "last_name",
        "faker_fun": fake.unique.last_name,
        "args": {},
    },
    {
        "col_name": "email",
        "faker_fun": fake.unique.email,
        "args": {},
    },
    {
        "col_name": "country_cd",
        "faker_fun": fake.unique.country_code,
        "args": {},
    },
    {
        "col_name": "transaction_id",
        "faker_fun": fake.unique.uuid4,
        "args": {},
    },
    {
        "col_name": "transaction_amount_usd",
        "faker_fun": fake.unique.random_int,
        "args": {"min": 100, "max": 500},
    },
]

for anyonymization_config in anyonymization_configs:

    col_name = anyonymization_config["col_name"]
    faker_fun = anyonymization_config["faker_fun"]
    args = anyonymization_config["args"]

    # Pull the distinct values of this column to the driver. Nulls are
    # skipped: there is nothing to anonymise in them, and a None key in the
    # replacement mapping is rejected by DataFrame.replace.
    distinct_col_values = [
        value[0]
        for value in df_masked.select(col_name).drop_duplicates().collect()
        if value[0] is not None
    ]
    if not distinct_col_values:
        # Empty or all-null column: nothing to replace.
        continue

    # Map every original value to one fake value, so repeated occurrences of
    # the same original stay consistent across rows.
    replace_vals = {dcv: faker_fun(**args) for dcv in distinct_col_values}

    df_masked = df_masked.replace(replace_vals, subset=col_name)


df_masked.show(truncate=False)

+---------+-------------------+----------+---------+-------------------------+----------+------------------------------------+----------------------+----------+
|client_id|event_date         |first_name|last_name|email                    |country_cd|transaction_id                      |transaction_amount_usd|product_cd|
+---------+-------------------+----------+---------+-------------------------+----------+------------------------------------+----------------------+----------+
|303      |2024-05-03 19:17:53|Alicia    |Baker    |xmahoney@example.net     |PS        |444ed2b1-91f2-4172-b5ee-b4b31619037f|175.0                 |hoodie    |
|303      |2024-04-20 10:20:01|Alicia    |Baker    |xmahoney@example.net     |PS        |3db4b9af-596b-4140-a689-22f731768b54|387.0                 |trousers  |
|429      |2023-12-03 11:10:42|Natalie   |Bailey   |stevensimmons@example.com|CH        |b1ddf8eb-74ee-41f5-819e-63e98faf1243|156.0                 |jacket    |
|403      |2023-09-07 18:02:29|Joh