## Overview

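This notebook shows how to handle PII data in Delta Lake: masking with salted hashes, encryption with AWS KMS, tokenization, and applying updates and deletes to PII tables.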

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark import Row
from pyspark.sql.functions import col, sha2, udf, array
import base64
import boto3
import hashlib
import os
from delta.tables import *

In [0]:
# Start from a clean slate: remove /tmp/employee (second argument True = recursive),
# the directory under which every Delta path written by this notebook lives.
dbutils.fs.rm("/tmp/employee", True)

Out[2]: True

In [0]:
# Sample "bronze" employee records as one JSON array string (the trailing backslashes join the
# lines into a single string literal). Each record carries both PII fields (address, phone, ssn,
# email, credit_card) and non-PII fields.
employeeBronzeData = '[ \
                {"employee_id": "1", "name": "Bailey Sullivan", "address": "962 Carr Neck Suite 674 Rodriguezshire, DC 38907", \
                 "phone": "(792)372-8891", "ssn": "173-20-2537", "birthdate": "2002-06-27", "sex": "F", "job_title": "Web designer",  \
                 "company": "Walker Inc", "email": "ebutler@example.org", "credit_card": "30483664588532"}, \
                {"employee_id": "2", "name": "Kelly Walsh", "address": "36640 Jennifer Crest Katherineport, OH 65499", \
                 "phone": "(939)593-1101x3096", "ssn": "704-46-4736", "birthdate": "2002-09-13", "sex": "NA", "job_title": "Horticulturist, commercial", \
                 "company": "Bailey, Garcia and Becker", "email": "testrada@example.com", "credit_card": "4598140055698512"}, \
                {"employee_id": "3", "name": "Jessica Morgan", "address": "Unit 6357 Box 1287 DPO AP 37713", \
                 "phone": "031-516-7147", "ssn": "858-96-6093", "birthdate": "2012-12-16", "sex": "M", "job_title": "Systems developer", \
                 "company": "Henderson, Morales and Adams", "email": "elizabethdavis@example.com", "credit_card": "377646699843129"}, \
                {"employee_id": "4", "name": "Cory Garcia", "address": "0983 David Valleys Apt. 301 Pattersonchester, NC 64044", \
                 "phone": "678.888.3537", "ssn": "306-95-7897", "birthdate": "2020-11-27", "sex": "M", "job_title": "Insurance risk surveyor", \
                 "company": "Brandt LLC", "email": "ingrambruce@example.net", "credit_card": "4911243531845687347"}, \
                {"employee_id": "5", "name": "Annette Brock", "address": "021 Austin Field Lake Gregoryborough, RI 71294", \
                 "phone": "001-610-131-3233x8327", "ssn": "579-93-6202", "birthdate": "2005-08-29", "sex": "F", "job_title": "Firefighter", \
                 "company": "Rodriguez-Allen", "email": "stephaniejones@example.net", "credit_card": "4887590753378603"}, \
                {"employee_id": "6", "name": "Mark Sanchez", "address": "846 Huff Spur Apt. 726 West Jessica, PA 48561", \
                 "phone": "216.905.2262", "ssn": "303-33-9126", "birthdate": "2007-01-15", "sex": "M", "job_title": "Insurance claims handler", \
                 "company": "Schmidt-Adams", "email": "blevinssarah@example.com", "credit_card": "3577795263152276"}, \
                {"employee_id": "7", "name": "Henry Garrett", "address": "33968 Mason Plaza Suite 181 West Heather, UT 74877", \
                 "phone": "001-835-323-7131x9985", "ssn": "539-40-8913", "birthdate": "2012-07-21", "sex": "M", "job_title": "Artist", \
                 "company": "Smith, Kelly and Walters", "email": "vevans@example.com", "credit_card": "373314503936423"}, \
                {"employee_id": "8", "name": "James Mcmahon", "address": "363 Spence Mountain Jamesfurt, IN 58404", \
                 "phone": "001-816-407-9074x25533", "ssn": "376-25-0277", "birthdate": "2005-07-26", "sex": "F", "job_title": "Human resources officer", \
                 "company": "Elliott Inc", "email": "rgardner@example.com", "credit_card": "4424457845976019723"} \
           ]'
# Parse the JSON string into a DataFrame by wrapping it in a one-element RDD.
employeeBronzeDF = spark.read.json(sc.parallelize([employeeBronzeData]))

In [0]:
# Control list naming the columns this notebook treats as PII. get_piicol_list() reads it back,
# so changing which columns are masked/encrypted only requires editing this list.
PIIControlList = '[ \
                   {"PII_Column": "address"}, \
                   {"PII_Column": "phone"}, \
                   {"PII_Column": "ssn"}, \
                   {"PII_Column": "email"}, \
                   {"PII_Column": "credit_card"} \
                  ]'
# Parse the JSON string into a single-column DataFrame (column PII_Column) and show it.
PIIControlDF = spark.read.json(sc.parallelize([PIIControlList]))
display(PIIControlDF)

def get_piicol_list():
    """Return the values of the ``PII_Column`` column of ``PIIControlDF`` as a new Python list.

    The control DataFrame is collected to the driver on every call, and a fresh list is
    returned each time, so callers may append to or remove from the result freely.
    """
    pii_rows = PIIControlDF.select(PIIControlDF['PII_Column']).collect()
    # Each collected Row has a single field; position 0 is the column name.
    return [pii_row[0] for pii_row in pii_rows]

PII_Column
address
phone
ssn
email
credit_card


In [0]:
def salt_key():
  """Return 32 fresh random bytes from ``os.urandom`` for use as a salt."""
  return os.urandom(32)

# salt_key() returns different bytes on every call, so the UDF is declared non-deterministic.
# Without this Spark treats it as deterministic and is free to duplicate or re-order
# invocations during optimization.
salt_key_udf = udf(salt_key, BinaryType()).asNondeterministic()

In [0]:
# Generate one random salt per employee row and persist it immediately: the Delta table is the
# single source of truth for the salts.
employeeBronzeDF.select('employee_id') \
                .withColumn("salt_key", salt_key_udf()) \
                .write.format("delta").mode("overwrite").save("dbfs:/tmp/employee/employee-salt-key-control")

# Read the persisted salts back instead of re-using the generating DataFrame. That frame is not
# cached, so every action on it would call salt_key_udf() again and produce salts that differ
# from the ones just written.
employeeSaltDF = spark.read.format("delta").load("dbfs:/tmp/employee/employee-salt-key-control")
display(employeeSaltDF)

employee_id,salt_key
1,tP+gwH7zHVKRn+A8W1isqIYzgcSNlCkhxcuGzw9TBa4=
2,gVj/e4iExgmHEbMFKwa3k+ew7a7oB5Lxeu48OLKcGpA=
3,1ULAfKniCGvxswXvKS0P50qlR4iajA51iEFlbl4xzZg=
4,tPRj/oEejgbSyaupcTuo2VCLVYA04r1Du+xsz65MmyM=
5,LmjNJPVR685ANfICqlrmVnXMhfwJ78aZBJkpbRiMZSw=
6,AOPoXLFMRZFhf0kFC1GYADoTQvrdiMtVKtXXINh0FD8=
7,BtnrKPr1z64J+oF9nfRnDy9wwOzLFIxG6j91jd3Ldv8=
8,fcvvPw+EdX6lJZHZjf5J2qHnwX72/3QYclJ8rBQJJ9I=


In [0]:
def mask_col(col_val, salt):
    """Derive a 64-byte PBKDF2-HMAC-SHA256 key from a column value and a salt.

    Args:
        col_val: value to mask; it is converted with ``str()`` and UTF-8 encoded before hashing.
        salt: bytes-like salt for the key derivation.

    Returns:
        The derived key as ``bytes`` (100000 iterations, 64 bytes long), or ``None`` when
        ``col_val`` is ``None``.
    """
    # A null must stay null: str(None) would otherwise be hashed as the literal text "None",
    # turning a missing value into something that looks like real masked data.
    if col_val is None:
        return None
    return hashlib.pbkdf2_hmac('sha256', str(col_val).encode(), salt, 100000, dklen=64)

# Spark UDF wrapper around mask_col; the derived key comes back as a binary column.
mask_udf = udf(mask_col, BinaryType())

def mask_dataframe(df, piicol):
    """Replace every column of ``df`` named in ``piicol`` with its masked value.

    Each matching column is hashed with ``mask_udf`` using the row's ``salt_key`` column, so
    ``df`` must contain ``salt_key``. The masked column is added under a temporary
    ``masked_<name>`` name, the original is dropped, and the temporary column is renamed
    back, which moves the column to the end of the schema.

    Args:
        df: DataFrame holding the columns to mask plus ``salt_key``.
        piicol: collection of column names to mask.

    Returns:
        A DataFrame with the listed columns masked; ``salt_key`` is left in place.
    """
    masked_df = df
    for name in [c for c in df.columns if c in piicol]:
        tmp_name = 'masked_' + name
        masked_df = (
            masked_df
            .withColumn(tmp_name, mask_udf(masked_df[name], masked_df['salt_key']))
            .drop(name)
            .withColumnRenamed(tmp_name, name)
        )
    return masked_df

In [0]:
# Load the per-employee salts from the Delta table written by the salt-generation cell.
silverEmployeeSaltDF = spark.read.format("delta").load("dbfs:/tmp/employee/employee-salt-key-control")
# Attach each employee's salt to the bronze record by joining on employee_id.
silverEmployeeJoinedDF = employeeBronzeDF.join(silverEmployeeSaltDF, "employee_id") \
                                         .drop(silverEmployeeSaltDF.employee_id)

# Hash every column named in the PII control list with the row's salt.
silverEmployeeMaskedDF = mask_dataframe(silverEmployeeJoinedDF, get_piicol_list())

# The salt must not be stored next to the masked values, so drop it before writing.
silverEmployeeMaskedDF=silverEmployeeMaskedDF.drop('salt_key')
silverEmployeeMaskedDF.write.format("delta").mode("overwrite").save("dbfs:/tmp/employee/employee-silver-masked")
display(silverEmployeeMaskedDF.select('employee_id', 'ssn', 'credit_card', 'address', 'email', 'phone'))

employee_id,ssn,credit_card,address,email,phone
1,h5FjPcK70STvpFsQEYvkGETRlLaq+xWagJiK/ApPm/X+PGAuFqJKocfvSauoCSIsFVbU0FDSb6L6QdG73T59Fw==,E6xmz8ZXXPwW+DEjwAaN9mgMaEIlRDtKa/V/f+BhypUkhKIB+GGd9Z9WqECbO3feKB5lrhs7WwKCCjW4qGG8AA==,Tl/8HqkfjdMVKtokMvijD6vUj7htPHgOiX9s/BjjnQDNRTgbHUO4hhC1Muhoa8zM7pFhAiGeOtSaQ69Fo5Xmog==,+V/fJKPUTn23ywEWj8FPVQT6U3GDB7kxSB3J6SfXApz91J/R50JZgSJgqI9N4PW68EHyqFZABdgf48tAC2CbNg==,1Xdsom5IN20BeGcHT3vxuzJqqPrruoCW39+GRJDe5YTjSqQP1/tc5M9SOsnjq+a8H/GmGmyG+djCCH0FQieFgg==
2,IdFFUr4ykTcO1ZICaQ3/R3nDWYIHgWBb5mDNyOHqZskTHX3qMWzSmn7Ko4CNeWLVKvTGKyUn9qd4NMfLHRAneg==,O+JQOjAcsS8M7x5vG62IFNDShKA6uEutfcqMEGxlpCJNl+odsO1wdm820MxIvj3JIc+zj7b4G30MReuWlKln7g==,wU1iO+ZE2oDMyRGrvyERNFu+ye0tsgciJuvUhXMKayuc6Ngc5h1ZSjjYpAxmBBgotyd3kmU0E1nh/Gyy1GEIvQ==,u1Ycjrg3Yyh0REw56RrrlfgltVY+HulHEHSxWnNko5tMGu0CtT38f+dvf62wUdvIxL90ZztsKfLwCcwjHAui3A==,5RsOvrnMZntw+o1zEIzWmRLyl5pVvV5JKqHndp0CnyqBiYPuXom/TkNt5XBTl0vA4ahUAzBH9VtXCuwjKXa12w==
3,Go1ofZKpuCPfCmAWJEkRMWJkF3A4vrZPf/DwMdK68mKwxYoLpL/Ccu9Blc3bg5RiGBvC4RLPKL6tHVtpPcc1eg==,8nkv73G9eTdc/qekyrfHN/RGqmkFdJjSmxe0ZQws902k6nqBNd8H0+As8QszTOFXv/QeByjwDiQdhhjB1B+9qg==,zFSFAV5Ht03QUnGLhRqlxTd35ysr5muBBj814YyWSNHE3hXoEVzB3ea9h+qHjWXypKTVUmSmRpY2X32VrB/Eng==,i0cDQ9oXAAJBIQUBCQZZR8Qr3hnj2aA1yxx6VBpxu/9y9UXeEV0O8DsVkrWMdGwZYclkjI7XGyKMNBaDd78pTg==,fgvaD27+pI85NajmPg2bPIVKOlHMFIrEjXv+QZaYRpkHuLUA+NSm3xUY1Vlxi8/ls3i8wRckwaA7ZtaLbAN3zw==
4,W9MHbAmCK9BadTyZ7TbUCZMxEDKhnwy5y10QhkIXy+5xtZ5A2XiamDtADKvW7FhGfkRqzfB5sKz/11pu3HcFyw==,Zf3K19yA3ZqJ/eWxpdgKWSro9GoPkIGO06XuUXa8x8Ng/A1oIfYfjiB+NYla5SqoYeavFlRF8N5LtFDPCccg9g==,UfCPQEcqkUyxEqnbfcZNVMrrW5/95yIYnBTxg96NW0gLQUf96lhLe3dJikQ6vWwNOSs5gc5OXfpuySF8/aw1Gg==,kRWEDdtd6zuGDdvZBh4rPwxuEWZWFI6XD1isjXGFeK0euuyloNoAEP9+7onLzH6jDr2NNOrkegJj73N84bPYhg==,PO88wLoVne/kYR5guyApip6Abd03qDGoZYS88Hj9NQGoJaOjuSFpkdfz8TiKWCgnGk6RapWc6Qw+oxnH2VNjOQ==
5,+hgwcqUaLjwB9Lo5U5S3bnKM8pEbZpuo/9tDb9Ss4vi4HuBjOZUcd63Sin4dh60OJyIo4089StT6D8k3ZeALyg==,HmHMefl4mlmv9/oqSTC5qAjndr7Srb0iInfmsm01ptphT8/JMqf1ntmZAapcFhfByG7xvwYFNMS8tcvvOtXAGw==,pOvp0Pjh1B3VwOftVqjdseO9qpSy9MMTDntZ86BRGTmipJcvreW6v02tqWKS78Nl8jbaWbKnC7QUbov/ATqSMw==,lrr7sP5WEhpH3uWGGxG7RHx2BxVVLQCdPwaNxj1NVKV3BWyXLpSD7V7RycfD1vh2FUgwlNzCyXpAZcW1OCWwPw==,DtLyxJwle+DL6MfB26kwTZsMOazd0PzbkH4mHn6qFDihgtmpTXbwaxzGnBtmUAlbutlSGWrYAncpIgrfaxRJ+A==
6,3sWIeFEVLfMfKpDXOiPtJg017Zhm8a446JdkIIWP4yEBGRGyNkKOj4XGGOpcgzQYakZm452E3Pd7+3wBKJp1dA==,A1S+Z2LcnXpsx3wDrr2nBpOezYlzOlef5l2g99aTJDBxe4sQ7YP8Ke6HKDw6Ra5CnJZY9f3Q6lo2ihXfWroxUQ==,tLFCRx9pVXyTsm5zZEmACjsSKa/ZBLjkhrDNKmIOLTJ8R/Irq6PdOe8trogLlXYrjujG1QV6Su1kbwkTUwoIbQ==,1gg28Hct1tSomhYCbYxUnJjQ2YIVLIhK8mKE3vdm1rzToCANY/2hPsLXNpF+lSeOdFr30/JWfFvMd0FGoYSiwg==,CTHsUx9ugB9KRoYteLGd3MSZSYegKeQXi0hFQvssobpMKU+wnGX9OEDGHoW/svB9Ks0QsmWrdN8kFqb1nCckfA==
7,iycw71A34VdNg5ZNDF7IB5hibdphbuVk1Msbyc5/sxRULWRiCtPNXx9oDp19ZBDEje0zHkanClXY+ET79tMJew==,YyHYEPStpvQulgWWOowUxUjV6kHLbqQ1F/+15nZ/JKfk/7P9CKPUsTVCQD9dv5RBfnGv4Q3qrEiCAp8jX3Y8tA==,pK6WrtbJPgVvgxaDYe6AuzRN4sYKwIKDRVBknRC5KedT1HZPHtH9eCc02M/RDiFZZXRyHQvKVDdy+iXI69aiWw==,0E4WrTsvWsMmjyvGFU+X80pwCFxllYmoAmcEiq+0DU8sKlIY1Y/eHay83ThycNeNxTTMIyo9H7/4ortYUNgSUQ==,pSYHNL9MY73iSoyhmEHMVH+6lMZvU1L6fu/hfDFBEuauHdwavJlvqtSnNfRD7HmBW4qIFZPgOJsg+XbSg279bw==
8,7MuI7wblXK5HOb+p8zocy3U2oVwwclJnGADVaa4su/fI9wRGrnuCe6Nz2UbH8FAbIGypD3gd8k7MGDcRpuWjFg==,uHckWZbGNkzeuf256UQIgaJz1pZVEq9WUYSCYFI7t7SOXGI4X3JLbJje01gBOH5HtoGXXPV/EsinqGdiqGox6A==,9MI6+dnCPlkL5uayi6eczRwlZRKF7SdmzwHoU8rgB8P8aQ87hH0KXuVA8w6c8QYs4oO5839/v5Jg01OiWxVFcQ==,tzmNiWv8UVqIN+UeH0ouAqQfnuMB4nZtEi7w3xlDdOaJDz/lbNVCGomq1H7tkd/1ayU6TCXW5wOYA5BgFsk2Pg==,yhwVIsNNSqXcIxKvm+zA7DifIyuHpNZUO+sktX4Um0rhJBXRxChGEnwpwzvx748wdM+bIMc9RQNLoiraeEK/0w==


In [0]:
# AWS settings used by the KMS encrypt/decrypt UDFs below.
# Credentials are read from the environment so that real keys never have to be typed into (or
# committed with) the notebook; on Databricks, cluster environment variables can be backed by a
# secret scope. The placeholder values are only a fallback and will be rejected by AWS.
awsAccessKeyId = os.environ.get("AWS_ACCESS_KEY_ID", "xxxxxxxxxx")
awsSecretKey = os.environ.get("AWS_SECRET_ACCESS_KEY", "xxxxxxxxxx")
# Alias of the KMS key used to encrypt the PII columns.
kmskeyid = 'alias/gdpr-key'
awsRegion = os.environ.get("AWS_DEFAULT_REGION", "us-west-2")

In [0]:
# udf to encrypt the pii columns using aws kms
def encrypt_col(col_val):
    """Encrypt one value with AWS KMS and return the ciphertext as base64 text.

    A new KMS client is created on every call from the notebook-level AWS settings
    (``awsAccessKeyId``, ``awsSecretKey``, ``awsRegion``), and the value is encrypted under
    ``kmskeyid``. The value is passed through ``str()`` first, so a ``None`` input is
    encrypted as the text "None".

    Args:
        col_val: value to encrypt.

    Returns:
        The base64-encoded ``CiphertextBlob`` as a ``str``.
    """
    kms = boto3.client(
        'kms',
        aws_access_key_id=awsAccessKeyId,
        aws_secret_access_key=awsSecretKey,
        region_name=awsRegion,
    )
    response = kms.encrypt(KeyId=kmskeyid, Plaintext=str(col_val))
    # The blob is raw bytes; base64 makes it storable in a string column.
    return base64.b64encode(response[u'CiphertextBlob']).decode()

# Spark UDF wrapper around encrypt_col; the base64 ciphertext comes back as a string column.
encrypt_udf = udf(encrypt_col, StringType())

def encrypt_df_col(df, piicol):
    """Replace every column of ``df`` named in ``piicol`` with its KMS-encrypted value.

    Each matching column is encrypted with ``encrypt_udf`` under a temporary
    ``encrypted_<name>`` name, the original is dropped, and the temporary column is renamed
    back, which moves the column to the end of the schema.

    Args:
        df: DataFrame holding the columns to encrypt.
        piicol: collection of column names to encrypt.

    Returns:
        A DataFrame with the listed columns encrypted.
    """
    encrypted_df = df
    for name in [c for c in df.columns if c in piicol]:
        tmp_name = 'encrypted_' + name
        encrypted_df = (
            encrypted_df
            .withColumn(tmp_name, encrypt_udf(encrypted_df[name]))
            .drop(name)
            .withColumnRenamed(tmp_name, name)
        )
    return encrypted_df

In [0]:
# Encrypt every column named in the PII control list and persist the result as a Delta table.
# NOTE(review): the frame is not cached, so the write and the display below presumably each run
# the encryption UDF (and therefore call KMS) separately -- confirm if call volume matters.
silverEmployeeEncryptedDF = encrypt_df_col(employeeBronzeDF, get_piicol_list())
silverEmployeeEncryptedDF.write.format("delta").mode("overwrite").save("dbfs:/tmp/employee/employee-silver-encrypted")
display(silverEmployeeEncryptedDF.select('employee_id', 'ssn', 'credit_card', 'address', 'email', 'phone'))

employee_id,ssn,credit_card,address,email,phone
1,AQICAHh4jXytp626JHUSi983W/bLHNsLZQ/8mb+FIvPCebI/UQGG6aaeNiQDNDS9xcC07338AAAAaTBnBgkqhkiG9w0BBwagWjBYAgEAMFMGCSqGSIb3DQEHATAeBglghkgBZQMEAS4wEQQM+snvJtMMhjQtReXHAgEQgCb9QyjD/YzTFTstCey+ZvqWZpcEPu/FuaZrYu3jqqMBYGy5DCCseQ==,AQICAHh4jXytp626JHUSi983W/bLHNsLZQ/8mb+FIvPCebI/UQHHrLP5BWVhzPblUD6qg8wFAAAAbDBqBgkqhkiG9w0BBwagXTBbAgEAMFYGCSqGSIb3DQEHATAeBglghkgBZQMEAS4wEQQMzwsY+K3f37ELMtNnAgEQgCl+bfSTwF1M/jXB5PX0AQ/FM+82Z598bBNy/P/v/d9iW7vImEZlobJKvw==,AQICAHh4jXytp626JHUSi983W/bLHNsLZQ/8mb+FIvPCebI/UQEeqAdweCFOBo6Tnr/KPj1PAAAAjzCBjAYJKoZIhvcNAQcGoH8wfQIBADB4BgkqhkiG9w0BBwEwHgYJYIZIAWUDBAEuMBEEDJuPG1rwXLBmz1QIOgIBEIBL1e3OPBbNtfyxnInTGu9RlRnO8n35p5YjPF1B2ywYhsUMkupIv2MZWxueQKjS0VqOuqElesJDyrqKdB39DL1K8sK7uTqA5ediCa5d,AQICAHh4jXytp626JHUSi983W/bLHNsLZQ/8mb+FIvPCebI/UQEx1qxqszc+j75SXpWKnYjVAAAAcTBvBgkqhkiG9w0BBwagYjBgAgEAMFsGCSqGSIb3DQEHATAeBglghkgBZQMEAS4wEQQM+thOk0xe5xN7RP17AgEQgC48rCWyOm1CvsN5jllQMg34iKQkaTWF+S1Y9QXd8TdW+hhwuZUb01ivDJY7s8b3,AQICAHh4jXytp626JHUSi983W/bLHNsLZQ/8mb+FIvPCebI/UQFK+8eIBVhbH8O3FI0jRcgMAAAAazBpBgkqhkiG9w0BBwagXDBaAgEAMFUGCSqGSIb3DQEHATAeBglghkgBZQMEAS4wEQQMe72hGsYyGp/0E3bgAgEQgCivYJPbKBSQN7RgWf1VcciSIQO1LQZZpom6VMhLvxEfonqn8ftvrfG/
2,AQICAHh4jXytp626JHUSi983W/bLHNsLZQ/8mb+FIvPCebI/UQGr94ewj6+aWpS2kp0KYKIsAAAAaTBnBgkqhkiG9w0BBwagWjBYAgEAMFMGCSqGSIb3DQEHATAeBglghkgBZQMEAS4wEQQMAhs3fHAiQzrvvgenAgEQgCZFQUFF/Vto846ZGgyGVz0A8hl2IVVTESvWsUpx71cMW3B5tb1AUw==,AQICAHh4jXytp626JHUSi983W/bLHNsLZQ/8mb+FIvPCebI/UQGGkYGpAFTE4kSiHIBqTeGIAAAAbjBsBgkqhkiG9w0BBwagXzBdAgEAMFgGCSqGSIb3DQEHATAeBglghkgBZQMEAS4wEQQMScbMf5sWrm7l/N06AgEQgCsH6KnaQCQVqYck9rv3WbfO+OQW2CAJ/LCbMBaifxuP6j7meFduiuR9m4V5,AQICAHh4jXytp626JHUSi983W/bLHNsLZQ/8mb+FIvPCebI/UQE0/ieRA4yCGxr68pxuqJfOAAAAizCBiAYJKoZIhvcNAQcGoHsweQIBADB0BgkqhkiG9w0BBwEwHgYJYIZIAWUDBAEuMBEEDJZRyFTLvPlek85rVQIBEIBHizZ7NJBPHtmLQgrK9mBVMxf436V20owWJDKkeVG2yt4rXs1LHiOTLkswwEWGveh5soDTUagbiaa8XatrInk06WpHXHTljiI=,AQICAHh4jXytp626JHUSi983W/bLHNsLZQ/8mb+FIvPCebI/UQF92Mge5305g6MGdZMnVqSTAAAAcjBwBgkqhkiG9w0BBwagYzBhAgEAMFwGCSqGSIb3DQEHATAeBglghkgBZQMEAS4wEQQMnl1dwP8T31sSr/W6AgEQgC9FUZewcFUHPll9IT/iOn5PIAKG+zwbBuotuNfxafPaVCKJMd8yruvfyIXTKglpXQ==,AQICAHh4jXytp626JHUSi983W/bLHNsLZQ/8mb+FIvPCebI/UQFeeG7DAl51E1DDPh9GrvL+AAAAcDBuBgkqhkiG9w0BBwagYTBfAgEAMFoGCSqGSIb3DQEHATAeBglghkgBZQMEAS4wEQQMR+ttpnio4O0myPKMAgEQgC0B1PpzjRgyIbD/dXNrGYYdkXgIgHPDOkwxyw4Lhch3YO+e0ZWhhLsBTLSRlwc=
3,AQICAHh4jXytp626JHUSi983W/bLHNsLZQ/8mb+FIvPCebI/UQEIc2R6wP41jA+FaGdvVJuWAAAAaTBnBgkqhkiG9w0BBwagWjBYAgEAMFMGCSqGSIb3DQEHATAeBglghkgBZQMEAS4wEQQM9LcwwFC6qGZYgADcAgEQgCaDjRaWWmLXEGaVSkCVl0+Hhnw3NXNN9dXdeKH7z/zLyd9q/Lcu7g==,AQICAHh4jXytp626JHUSi983W/bLHNsLZQ/8mb+FIvPCebI/UQESKBjMqib1L+6b7cvonrumAAAAbTBrBgkqhkiG9w0BBwagXjBcAgEAMFcGCSqGSIb3DQEHATAeBglghkgBZQMEAS4wEQQM3UgvGy06ioCooQ/PAgEQgCpj5EVW0IsjYdJCbJj72Q8tgK7L50w2TntKNucXMvhUPPJrQGpxF4bfEw8=,AQICAHh4jXytp626JHUSi983W/bLHNsLZQ/8mb+FIvPCebI/UQFJuWEe+uGybw/vnrO/b9ujAAAAfTB7BgkqhkiG9w0BBwagbjBsAgEAMGcGCSqGSIb3DQEHATAeBglghkgBZQMEAS4wEQQM5iLFUBs6MKJJ9QkvAgEQgDpmQYbY8b7V5s/EcTRfIX1tjWOZHomyI7OOsgELaLSNDHyN7vug09Skl646hVPf3QNXbd+3LTsMOlBr,AQICAHh4jXytp626JHUSi983W/bLHNsLZQ/8mb+FIvPCebI/UQH26xO25M1OX+wmULdtbNbFAAAAeDB2BgkqhkiG9w0BBwagaTBnAgEAMGIGCSqGSIb3DQEHATAeBglghkgBZQMEAS4wEQQM8ysRR0LA1yFBXC7SAgEQgDXc3/VTzXoZ251gg62xDDZcLAuMP1FAX+Mm+9/muYbD4fT3tAfKOGo5ZevT+fw7uP2moAjI7A==,AQICAHh4jXytp626JHUSi983W/bLHNsLZQ/8mb+FIvPCebI/UQFBXr/j0XRnFw4RZjdDwBPlAAAAajBoBgkqhkiG9w0BBwagWzBZAgEAMFQGCSqGSIb3DQEHATAeBglghkgBZQMEAS4wEQQMbJ3Sk4H2r1hFBIZgAgEQgCdRpTiGlEzSFLnBpVLxxE6ULi+gf2gpYMUO0V5OdLZNv69Giduujeg=
4,AQICAHh4jXytp626JHUSi983W/bLHNsLZQ/8mb+FIvPCebI/UQH8HrNZ2KWvGZQq1XHEAJeJAAAAaTBnBgkqhkiG9w0BBwagWjBYAgEAMFMGCSqGSIb3DQEHATAeBglghkgBZQMEAS4wEQQMkngykNXkecxQ/HxgAgEQgCZDvvvhp9v8aeMhOatgAAH3nO4aSwbsb8BJ3SVodBp5/6f+vFbedg==,AQICAHh4jXytp626JHUSi983W/bLHNsLZQ/8mb+FIvPCebI/UQHzq4cdrPxsHByKhSfe63brAAAAcTBvBgkqhkiG9w0BBwagYjBgAgEAMFsGCSqGSIb3DQEHATAeBglghkgBZQMEAS4wEQQME44KkOtBL20TYc2lAgEQgC6j1zAk8slySQcuqe+9xeF6pwVLPjhpXoBC2ng0esXwpOCk9ymRIMaZDiL6QfXh,AQICAHh4jXytp626JHUSi983W/bLHNsLZQ/8mb+FIvPCebI/UQEtkOt2eXiKjdMnjPSfi5CPAAAAlzCBlAYJKoZIhvcNAQcGoIGGMIGDAgEAMH4GCSqGSIb3DQEHATAeBglghkgBZQMEAS4wEQQMd17RzTlSfgybu+lRAgEQgFEYdyuDfnD5w52qzn8VQ4H7TrOF5ieBolFtQdwqPv4JZvd1ZZA42le4zmCwbbxt8hdD2ExG7x/s8NG0kopf/LLjQLqsM47gWkerfZV09VD1Z5s=,AQICAHh4jXytp626JHUSi983W/bLHNsLZQ/8mb+FIvPCebI/UQFi3G5F+xC1W8Hx1loLZRKrAAAAdTBzBgkqhkiG9w0BBwagZjBkAgEAMF8GCSqGSIb3DQEHATAeBglghkgBZQMEAS4wEQQM0SFhsUNTIyV3Q+XoAgEQgDJncW44ymD2xnbOTR1LxPheuLl0DkK9i5/1G5n9BcKc2HqEJROFOic2KpoL5N2yHRdW8w==,AQICAHh4jXytp626JHUSi983W/bLHNsLZQ/8mb+FIvPCebI/UQFhLzrm6eMtec5aHeuIxC2oAAAAajBoBgkqhkiG9w0BBwagWzBZAgEAMFQGCSqGSIb3DQEHATAeBglghkgBZQMEAS4wEQQMsJn5uvt1e/Sj5iCiAgEQgCeSpiPfM0E2bQYtOF9z0TyVlC/HVtBpBvlzrf+3YJ3zYbxeekkL+gQ=
5,AQICAHh4jXytp626JHUSi983W/bLHNsLZQ/8mb+FIvPCebI/UQFbIoldPHyxnMd4XEjAQAu7AAAAaTBnBgkqhkiG9w0BBwagWjBYAgEAMFMGCSqGSIb3DQEHATAeBglghkgBZQMEAS4wEQQMYBgldgi/rq84L2q3AgEQgCZJPnq+vhUCwvPFMFEQTt4xNpYsLAxR7uuBVTjHGQ+aKGyNwnl/vw==,AQICAHh4jXytp626JHUSi983W/bLHNsLZQ/8mb+FIvPCebI/UQF4Y8Xiy9QRjZHEImvTSItJAAAAbjBsBgkqhkiG9w0BBwagXzBdAgEAMFgGCSqGSIb3DQEHATAeBglghkgBZQMEAS4wEQQMIOc4S6ykqB9S/bHCAgEQgCsWo2DROR1RjFMK5nER+D9vx6QH8YgHIfZpG/ivkwkoWIhkC0STW0qzRWsH,AQICAHh4jXytp626JHUSi983W/bLHNsLZQ/8mb+FIvPCebI/UQFzoNKZDIqVSfN9tmnsSsQYAAAAjTCBigYJKoZIhvcNAQcGoH0wewIBADB2BgkqhkiG9w0BBwEwHgYJYIZIAWUDBAEuMBEEDGUnObyg0zm0Nkvm1QIBEIBJvP6+qE7EYU+a9yhOvJSqgTqxfaspXCS3+8N9DDGv/9k6v9L2nH+1cA6l9av63RUeQ2DAxMkTeKt4SM5k4GXxNcOE7jyenDJQdg==,AQICAHh4jXytp626JHUSi983W/bLHNsLZQ/8mb+FIvPCebI/UQG1joFNKRDbhaOWYjfN7+A1AAAAeDB2BgkqhkiG9w0BBwagaTBnAgEAMGIGCSqGSIb3DQEHATAeBglghkgBZQMEAS4wEQQMoLzV75jX+If5nua5AgEQgDWlDCvgyiVY4wOFcogJqGvBGWbV8iIDANAi1vzanqcwWJlBD3Qm/tg3VNc1wL7hmGl7y1TtGg==,AQICAHh4jXytp626JHUSi983W/bLHNsLZQ/8mb+FIvPCebI/UQHdwKFbxWSLkb5OYwq6av3PAAAAczBxBgkqhkiG9w0BBwagZDBiAgEAMF0GCSqGSIb3DQEHATAeBglghkgBZQMEAS4wEQQMBQX6U+wzjMzR5467AgEQgDDSUMdXPsg7LLWTG4+R50XGHFZED63aynAtnHyjscVwn6ACSl/6MrvO1NVU1cOcupw=
6,AQICAHh4jXytp626JHUSi983W/bLHNsLZQ/8mb+FIvPCebI/UQGMwL0Jcy8F7Fqi1ZUllG3OAAAAaTBnBgkqhkiG9w0BBwagWjBYAgEAMFMGCSqGSIb3DQEHATAeBglghkgBZQMEAS4wEQQMEYD1Ruz8am0PZ+Q+AgEQgCYrXy/QX9RwaKV3dQbqT/kUsYjpmHSA4ez+PyzRzPDiTwap31RJUA==,AQICAHh4jXytp626JHUSi983W/bLHNsLZQ/8mb+FIvPCebI/UQHv7UgtrDxjxzdTMid/6tjPAAAAbjBsBgkqhkiG9w0BBwagXzBdAgEAMFgGCSqGSIb3DQEHATAeBglghkgBZQMEAS4wEQQMa7mYPoP//SKPG1dMAgEQgCsul2hDGDfcep9AiQryCr3qH3IytTi5dS9scCN+mR55m019cZQVC/c192of,AQICAHh4jXytp626JHUSi983W/bLHNsLZQ/8mb+FIvPCebI/UQHILCBJn2PLj7cxz3ZX1rC8AAAAjDCBiQYJKoZIhvcNAQcGoHwwegIBADB1BgkqhkiG9w0BBwEwHgYJYIZIAWUDBAEuMBEEDIb8ZfDphvo1hf7vKwIBEIBIuwoCdiC7Ct0WOMd38jYDjmh6tRof5PfQPhsKXLyPx242WRjyyUfHPZlUhFjFDBYL1PjAF3Ug3w++pxkx9y28hcGQqoVaJyNi,AQICAHh4jXytp626JHUSi983W/bLHNsLZQ/8mb+FIvPCebI/UQHKsdWGCM3HSABbbyecBZBsAAAAdjB0BgkqhkiG9w0BBwagZzBlAgEAMGAGCSqGSIb3DQEHATAeBglghkgBZQMEAS4wEQQMx5zS4NQO1UpmN0MWAgEQgDM7QijuN1Wfm88Pv3PmH6bFf6p+ePfuDIdV1xyCOt9KniRhHQfZKdLkMPGFCif1wvbZqtk=,AQICAHh4jXytp626JHUSi983W/bLHNsLZQ/8mb+FIvPCebI/UQFYT30Q8u7UHvxQyuGVGI2cAAAAajBoBgkqhkiG9w0BBwagWzBZAgEAMFQGCSqGSIb3DQEHATAeBglghkgBZQMEAS4wEQQM6B5zF/VlRvwzTGIxAgEQgCc6C/UOE8IQc7SYIo4FFcxClDrWVYiDJar+issEBGf0k5V0bx4KO38=
7,AQICAHh4jXytp626JHUSi983W/bLHNsLZQ/8mb+FIvPCebI/UQGgYHgnOUZB5FCUAW7JXWi8AAAAaTBnBgkqhkiG9w0BBwagWjBYAgEAMFMGCSqGSIb3DQEHATAeBglghkgBZQMEAS4wEQQMj5LBuRPkJUssztyXAgEQgCa5Aw8QdXPWNskv8GsieJEID9MeMyicnYu+LlEHslky+zX2C8oW4A==,AQICAHh4jXytp626JHUSi983W/bLHNsLZQ/8mb+FIvPCebI/UQF8KQTAg3ezKHAn9MpUxrifAAAAbTBrBgkqhkiG9w0BBwagXjBcAgEAMFcGCSqGSIb3DQEHATAeBglghkgBZQMEAS4wEQQMq74XXOmFW0CzY/rDAgEQgCpP1NZMDBfUNPtzXsjCDyiaNHQij121fsEoVnz94anHx3FiH/6W9GcHZog=,AQICAHh4jXytp626JHUSi983W/bLHNsLZQ/8mb+FIvPCebI/UQF3w69MJ4ZOaLThWpy2fRwGAAAAkjCBjwYJKoZIhvcNAQcGoIGBMH8CAQAwegYJKoZIhvcNAQcBMB4GCWCGSAFlAwQBLjARBAwebHnAAz/xit2lnYoCARCATbSib0Tt03qspSgTacgAlQ2VzRtf3rHjuYfqvcCCjTy6HlMjhKnNVDAkRdUhkgSMOrxQ8epun3TxYUngF7CtGx1O8PxZ729Zrsdfjk77,AQICAHh4jXytp626JHUSi983W/bLHNsLZQ/8mb+FIvPCebI/UQESA0UjpKlvn3n5CUcvYeYqAAAAcDBuBgkqhkiG9w0BBwagYTBfAgEAMFoGCSqGSIb3DQEHATAeBglghkgBZQMEAS4wEQQM1iGs9KtqsJ85/m2UAgEQgC3bqCp0hSOimk4WNyQ4qkYhd8YoP6kB++88bN1fFIRSrmN7xapbK9EHeJ2kqOs=,AQICAHh4jXytp626JHUSi983W/bLHNsLZQ/8mb+FIvPCebI/UQF/DkczssAnwY68gujs45zrAAAAczBxBgkqhkiG9w0BBwagZDBiAgEAMF0GCSqGSIb3DQEHATAeBglghkgBZQMEAS4wEQQMfQg+WxDcMTuLJEblAgEQgDBePYxWibmbZv/dnTy4A5TeCaoq+vQ8mmeEjgZ5o8jsYEXFdKqVr0EsNo4dBDkKaRw=
8,AQICAHh4jXytp626JHUSi983W/bLHNsLZQ/8mb+FIvPCebI/UQEg1HN/FKBrZ7spjAYqw59MAAAAaTBnBgkqhkiG9w0BBwagWjBYAgEAMFMGCSqGSIb3DQEHATAeBglghkgBZQMEAS4wEQQMbmvUInv2M0BtpyMcAgEQgCaw2zc1Bfi8+Nrby4oLq7dAM+69Qu16DsWlEpaLALFeFh5mDCcJpw==,AQICAHh4jXytp626JHUSi983W/bLHNsLZQ/8mb+FIvPCebI/UQG+w7dQ/mEn8RUMNtKTl9pPAAAAcTBvBgkqhkiG9w0BBwagYjBgAgEAMFsGCSqGSIb3DQEHATAeBglghkgBZQMEAS4wEQQMrbSPUGY0dhh9UQ6uAgEQgC4Pl9sKt9ODDKajwkKOv8WjgfR9frty/bAeppkRj+rCgQCeOdMEn+6yEOKR1Edj,AQICAHh4jXytp626JHUSi983W/bLHNsLZQ/8mb+FIvPCebI/UQFjh0e+tmBru032mrxGr30sAAAAhjCBgwYJKoZIhvcNAQcGoHYwdAIBADBvBgkqhkiG9w0BBwEwHgYJYIZIAWUDBAEuMBEEDF+me1UF/nAkLBbfxAIBEIBCqgOi2RU5DNDaKzG2YuMnM3OVykdmRke3XdNpvkKx3PhBBfce/0ffXGuJ40lwwmOKJ+/X6KNDkCdYIBPQtTylK8Nl,AQICAHh4jXytp626JHUSi983W/bLHNsLZQ/8mb+FIvPCebI/UQEUBrbm5yOF/n+IDSWLsK8eAAAAcjBwBgkqhkiG9w0BBwagYzBhAgEAMFwGCSqGSIb3DQEHATAeBglghkgBZQMEAS4wEQQMUyFbLT5NBI/rugo8AgEQgC/hfTDta0KevOWkusE2gRxnLp7CdIgzxaBeqa352DPGUQ4e9JtVtqxKJtZ9Pifycg==,AQICAHh4jXytp626JHUSi983W/bLHNsLZQ/8mb+FIvPCebI/UQF1g5mp7f4dPlDMSY6bgqyRAAAAdDByBgkqhkiG9w0BBwagZTBjAgEAMF4GCSqGSIb3DQEHATAeBglghkgBZQMEAS4wEQQML7V33Y3uhr/ycfCMAgEQgDEyaZa+yd47CsJXkGkb6e66m/N0lNnEcwSeaQorD1+DO7Bma4dr6adwepUtA72PiBt2


In [0]:
# udf to decrypt the pii columns using aws kms
def decrypt_col(col_val):
    """Decrypt one base64-encoded KMS ciphertext and return the plaintext.

    A new KMS client is created on every call from the notebook-level AWS settings
    (``awsAccessKeyId``, ``awsSecretKey``, ``awsRegion``).

    Args:
        col_val: base64 text as produced by ``encrypt_col``, or ``None``.

    Returns:
        The decrypted plaintext as ``str``, or ``None`` when ``col_val`` is ``None``.
    """
    # A null ciphertext has nothing to decrypt; return null instead of failing in b64decode.
    if col_val is None:
        return None
    kmsclient = boto3.client('kms', aws_access_key_id=awsAccessKeyId, aws_secret_access_key=awsSecretKey, region_name=awsRegion)
    # b64decode already returns bytes, which is what CiphertextBlob expects.
    plaintext = kmsclient.decrypt(CiphertextBlob=base64.b64decode(col_val))
    return plaintext["Plaintext"].decode()

# Spark UDF wrapper around decrypt_col; the plaintext comes back as a string column.
decrypt_udf = udf(decrypt_col, StringType())

def decrypt_df_col(df, piicol):
    """Replace every column of ``df`` named in ``piicol`` with its KMS-decrypted value.

    Columns are overwritten in place with ``withColumn`` under their own name.

    Args:
        df: DataFrame whose PII columns hold base64 KMS ciphertext.
        piicol: collection of column names to decrypt.

    Returns:
        A DataFrame with the listed columns decrypted.
    """
    decrypted_df = df
    for name in df.columns:
        if name not in piicol:
            continue
        decrypted_df = decrypted_df.withColumn(name, decrypt_udf(decrypted_df[name]))
    return decrypted_df

In [0]:
# Round-trip check: reload the encrypted table and decrypt the PII columns again.
# The display below shows clear-text PII; clear the cell output before sharing the notebook.
silverEmployeeEncryptedDF = spark.read.format("delta").load("dbfs:/tmp/employee/employee-silver-encrypted")
silverEmployeeDecryptedDF = decrypt_df_col(silverEmployeeEncryptedDF, get_piicol_list())
display(silverEmployeeDecryptedDF.select('employee_id', 'ssn', 'credit_card', 'address', 'email', 'phone'))

employee_id,ssn,credit_card,address,email,phone
1,173-20-2537,30483664588532,"962 Carr Neck Suite 674 Rodriguezshire, DC 38907",ebutler@example.org,(792)372-8891
2,704-46-4736,4598140055698512,"36640 Jennifer Crest Katherineport, OH 65499",testrada@example.com,(939)593-1101x3096
3,858-96-6093,377646699843129,Unit 6357 Box 1287 DPO AP 37713,elizabethdavis@example.com,031-516-7147
4,306-95-7897,4911243531845687347,"0983 David Valleys Apt. 301 Pattersonchester, NC 64044",ingrambruce@example.net,678.888.3537
5,579-93-6202,4887590753378603,"021 Austin Field Lake Gregoryborough, RI 71294",stephaniejones@example.net,001-610-131-3233x8327
6,303-33-9126,3577795263152276,"846 Huff Spur Apt. 726 West Jessica, PA 48561",blevinssarah@example.com,216.905.2262
7,539-40-8913,373314503936423,"33968 Mason Plaza Suite 181 West Heather, UT 74877",vevans@example.com,001-835-323-7131x9985
8,376-25-0277,4424457845976019723,"363 Spence Mountain Jamesfurt, IN 58404",rgardner@example.com,001-816-407-9074x25533


In [0]:
# function to create a dataframe of employees with only the pii columns
def employee_pii_df(df, piicol):
    """Return ``df`` restricted to the columns named in ``piicol``.

    Every column whose name is not in ``piicol`` is dropped; the remaining columns keep
    their original order.

    Args:
        df: input DataFrame.
        piicol: collection of column names to keep.

    Returns:
        A DataFrame containing only the columns listed in ``piicol``.
    """
    non_pii_names = [name for name in df.columns if name not in piicol]
    pii_only = df
    for name in non_pii_names:
        pii_only = pii_only.drop(name)
    return pii_only

# function to create a Dataframe of employees without the pii columns
def employee_non_pii_df(df, piicol):
    """Return ``df`` with the columns named in ``piicol`` removed.

    Every column whose name is in ``piicol`` is dropped; the remaining columns keep their
    original order.

    Args:
        df: input DataFrame.
        piicol: collection of column names to remove.

    Returns:
        A DataFrame without the columns listed in ``piicol``.
    """
    pii_names = [name for name in df.columns if name in piicol]
    without_pii = df
    for name in pii_names:
        without_pii = without_pii.drop(name)
    return without_pii

In [0]:
%sql
-- Drop table registrations left over from a previous run so the next cell can create them again.
DROP TABLE IF EXISTS employee_silver_pii;
DROP TABLE IF EXISTS employee_silver_non_pii;

In [0]:
# --- PII table: the PII columns only, with email replaced by its salted hash ---
# employee_id is appended to the keep-list, but it is dropped from the frame passed to
# mask_dataframe below, so it does not appear in the resulting table.
piicol=get_piicol_list()
piicol.append('employee_id')
# Only email is masked here; the other PII columns are kept in clear text.
silverEmployeePIIDF = mask_dataframe(silverEmployeeJoinedDF.drop('employee_id'), ['email'])
# Keep only the columns in piicol; this also removes salt_key, which is not in the list.
silverEmployeePIIDF = employee_pii_df(silverEmployeePIIDF, piicol)
# Register the table at an explicit path so it can also be opened by path later on.
silverEmployeePIIDF.write.format("delta").option("path", "dbfs:/tmp/employee/employee-silver-pii").saveAsTable("employee_silver_pii")
display(silverEmployeePIIDF)

# --- Non-PII table: every PII column removed except the masked email ---
# email is taken out of the drop-list so the masked email remains as a column shared by both tables.
piicol=get_piicol_list()
piicol.remove('email')
silverEmployeeNotPIIDF = mask_dataframe(silverEmployeeJoinedDF.drop('employee_id'), ['email'])
silverEmployeeNotPIIDF = employee_non_pii_df(silverEmployeeNotPIIDF.drop('salt_key'), piicol)
silverEmployeeNotPIIDF.write.format("delta").mode("overwrite").option("path", "dbfs:/tmp/employee/employee-silver-non-pii").saveAsTable("employee_silver_non_pii")
display(silverEmployeeNotPIIDF)

address,credit_card,phone,ssn,email
"962 Carr Neck Suite 674 Rodriguezshire, DC 38907",30483664588532,(792)372-8891,173-20-2537,+V/fJKPUTn23ywEWj8FPVQT6U3GDB7kxSB3J6SfXApz91J/R50JZgSJgqI9N4PW68EHyqFZABdgf48tAC2CbNg==
"36640 Jennifer Crest Katherineport, OH 65499",4598140055698512,(939)593-1101x3096,704-46-4736,u1Ycjrg3Yyh0REw56RrrlfgltVY+HulHEHSxWnNko5tMGu0CtT38f+dvf62wUdvIxL90ZztsKfLwCcwjHAui3A==
Unit 6357 Box 1287 DPO AP 37713,377646699843129,031-516-7147,858-96-6093,i0cDQ9oXAAJBIQUBCQZZR8Qr3hnj2aA1yxx6VBpxu/9y9UXeEV0O8DsVkrWMdGwZYclkjI7XGyKMNBaDd78pTg==
"0983 David Valleys Apt. 301 Pattersonchester, NC 64044",4911243531845687347,678.888.3537,306-95-7897,kRWEDdtd6zuGDdvZBh4rPwxuEWZWFI6XD1isjXGFeK0euuyloNoAEP9+7onLzH6jDr2NNOrkegJj73N84bPYhg==
"021 Austin Field Lake Gregoryborough, RI 71294",4887590753378603,001-610-131-3233x8327,579-93-6202,lrr7sP5WEhpH3uWGGxG7RHx2BxVVLQCdPwaNxj1NVKV3BWyXLpSD7V7RycfD1vh2FUgwlNzCyXpAZcW1OCWwPw==
"846 Huff Spur Apt. 726 West Jessica, PA 48561",3577795263152276,216.905.2262,303-33-9126,1gg28Hct1tSomhYCbYxUnJjQ2YIVLIhK8mKE3vdm1rzToCANY/2hPsLXNpF+lSeOdFr30/JWfFvMd0FGoYSiwg==
"33968 Mason Plaza Suite 181 West Heather, UT 74877",373314503936423,001-835-323-7131x9985,539-40-8913,0E4WrTsvWsMmjyvGFU+X80pwCFxllYmoAmcEiq+0DU8sKlIY1Y/eHay83ThycNeNxTTMIyo9H7/4ortYUNgSUQ==
"363 Spence Mountain Jamesfurt, IN 58404",4424457845976019723,001-816-407-9074x25533,376-25-0277,tzmNiWv8UVqIN+UeH0ouAqQfnuMB4nZtEi7w3xlDdOaJDz/lbNVCGomq1H7tkd/1ayU6TCXW5wOYA5BgFsk2Pg==


birthdate,company,job_title,name,sex,email
2002-06-27,Walker Inc,Web designer,Bailey Sullivan,F,+V/fJKPUTn23ywEWj8FPVQT6U3GDB7kxSB3J6SfXApz91J/R50JZgSJgqI9N4PW68EHyqFZABdgf48tAC2CbNg==
2002-09-13,"Bailey, Garcia and Becker","Horticulturist, commercial",Kelly Walsh,,u1Ycjrg3Yyh0REw56RrrlfgltVY+HulHEHSxWnNko5tMGu0CtT38f+dvf62wUdvIxL90ZztsKfLwCcwjHAui3A==
2012-12-16,"Henderson, Morales and Adams",Systems developer,Jessica Morgan,M,i0cDQ9oXAAJBIQUBCQZZR8Qr3hnj2aA1yxx6VBpxu/9y9UXeEV0O8DsVkrWMdGwZYclkjI7XGyKMNBaDd78pTg==
2020-11-27,Brandt LLC,Insurance risk surveyor,Cory Garcia,M,kRWEDdtd6zuGDdvZBh4rPwxuEWZWFI6XD1isjXGFeK0euuyloNoAEP9+7onLzH6jDr2NNOrkegJj73N84bPYhg==
2005-08-29,Rodriguez-Allen,Firefighter,Annette Brock,F,lrr7sP5WEhpH3uWGGxG7RHx2BxVVLQCdPwaNxj1NVKV3BWyXLpSD7V7RycfD1vh2FUgwlNzCyXpAZcW1OCWwPw==
2007-01-15,Schmidt-Adams,Insurance claims handler,Mark Sanchez,M,1gg28Hct1tSomhYCbYxUnJjQ2YIVLIhK8mKE3vdm1rzToCANY/2hPsLXNpF+lSeOdFr30/JWfFvMd0FGoYSiwg==
2012-07-21,"Smith, Kelly and Walters",Artist,Henry Garrett,M,0E4WrTsvWsMmjyvGFU+X80pwCFxllYmoAmcEiq+0DU8sKlIY1Y/eHay83ThycNeNxTTMIyo9H7/4ortYUNgSUQ==
2005-07-26,Elliott Inc,Human resources officer,James Mcmahon,F,tzmNiWv8UVqIN+UeH0ouAqQfnuMB4nZtEi7w3xlDdOaJDz/lbNVCGomq1H7tkd/1ayU6TCXW5wOYA5BgFsk2Pg==


In [0]:
# dataframe to have key value pair of all pii columns in a column 'key_val'
tokenColumns = ['address', 'phone', 'ssn', 'email', 'credit_card']

# One row per employee: the raw PII columns plus an array of (token_name, token_value) structs,
# one struct per PII column.
silverTokenKeyValDF = employeeBronzeDF.select('employee_id', *tokenColumns) \
                                      .withColumn('key_val', array(*[
                                          struct(lit(name).alias('token_name'),
                                                 employeeBronzeDF[name].alias('token_value'))
                                          for name in tokenColumns
                                      ]))

# Explode the array into one row per (employee, PII column), keep a single row per distinct
# (token_name, token_value) pair, and number the rows in random order to obtain the token.
silverTokenDF = silverTokenKeyValDF.select(silverTokenKeyValDF.employee_id, explode(silverTokenKeyValDF.key_val)) \
                                   .select(silverTokenKeyValDF.employee_id, "col.*") \
                                   .dropDuplicates(['token_name','token_value']) \
                                   .withColumn('token', expr('row_number() over (order by rand())')).orderBy('employee_id')

# Persist the token mapping itself. The original cell wrote silverTokenKeyValDF here, i.e. the
# raw PII plus the struct array, so the assigned tokens were never stored.
silverTokenDF.write.format("delta").mode("overwrite").save("dbfs:/tmp/employee/employee-silver-token")

# Continue from the persisted mapping: tokens come from a rand()-ordered row_number, so
# re-evaluating the unsaved plan is not guaranteed to hand out the same numbers again.
silverTokenDF = spark.read.format("delta").load("dbfs:/tmp/employee/employee-silver-token").orderBy('employee_id')
display(silverTokenDF)

employee_id,token_name,token_value,token
1,phone,(792)372-8891,22
1,email,ebutler@example.org,1
1,address,"962 Carr Neck Suite 674 Rodriguezshire, DC 38907",24
1,ssn,173-20-2537,15
1,credit_card,30483664588532,33
2,ssn,704-46-4736,31
2,email,testrada@example.com,38
2,credit_card,4598140055698512,18
2,phone,(939)593-1101x3096,17
2,address,"36640 Jennifer Crest Katherineport, OH 65499",32


In [0]:
# function to extract the pii column and its token
def tokendim(df, key):
    """Return the rows of a token mapping that belong to one PII column.

    Rows are filtered to ``token_name == key``, ``token_value`` is renamed to ``key`` and
    ``token`` to ``<key>_token``, and the ``token_name`` column is dropped.

    Args:
        df: token mapping with ``token_name``, ``token_value`` and ``token`` columns.
        key: name of the PII column to extract, e.g. ``'ssn'``.

    Returns:
        The filtered and renamed DataFrame.
    """
    token_column = key + '_token'
    rows_for_key = df.filter(df.token_name == key)
    renamed = rows_for_key.withColumnRenamed('token_value', key)
    renamed = renamed.withColumnRenamed('token', token_column)
    return renamed.drop(df.token_name)

In [0]:
# dataframe of the pii columns represented by its token 

# Join the per-employee frame once per PII column with that column's token lookup, matching on
# both the raw value and employee_id, then keep only employee_id and the five *_token columns.
# NOTE(review): silverTokenDF was de-duplicated on (token_name, token_value), so if two employees
# shared a value only one employee_id would survive there and these inner joins would presumably
# drop the other employee -- confirm values are unique per employee.
silverEmployeeTokenDF = silverTokenKeyValDF.join(tokendim(silverTokenDF, 'address'), ['address', 'employee_id']) \
                                           .join(tokendim(silverTokenDF, 'phone'), ['phone', 'employee_id']) \
                                           .join(tokendim(silverTokenDF, 'ssn'), ['ssn', 'employee_id']) \
                                           .join(tokendim(silverTokenDF, 'email'), ['email', 'employee_id']) \
                                           .join(tokendim(silverTokenDF, 'credit_card'), ['credit_card', 'employee_id']) \
                                           .select('employee_id', 'address_token', 'phone_token', 'ssn_token', 'email_token', 'credit_card_token').orderBy('employee_id')  

# Persist the tokenized view; it contains no raw PII values.
silverEmployeeTokenDF.write.format("delta").mode("overwrite").save("dbfs:/tmp/employee/employee-silver-tokenized")
display(silverEmployeeTokenDF)

employee_id,address_token,phone_token,ssn_token,email_token,credit_card_token
1,24,22,15,1,33
2,32,17,31,38,18
3,5,26,35,11,6
4,12,28,21,40,37
5,23,20,16,14,34
6,2,9,13,10,29
7,25,3,36,30,8
8,39,4,27,19,7


In [0]:
%sql
-- Look up one employee's row by SSN before the change is merged. ssn is stored in clear text in
-- this table; only email was masked when the table was built.
select * from employee_silver_pii WHERE ssn = "303-33-9126"

address,credit_card,phone,ssn,email
"846 Huff Spur Apt. 726 West Jessica, PA 48561",3577795263152276,216.905.2262,303-33-9126,1gg28Hct1tSomhYCbYxUnJjQ2YIVLIhK8mKE3vdm1rzToCANY/2hPsLXNpF+lSeOdFr30/JWfFvMd0FGoYSiwg==


In [0]:
# Incoming change record for employee 6: address and phone differ from the original bronze data,
# the remaining fields are unchanged.
employeeBronzeChangeData = '[ \
                {"employee_id": "6", "name": "Mark Sanchez", "address": "999 Main St, Apt. 726 West Jessica, PA 48745", \
                 "phone": "216.435.6545", "ssn": "303-33-9126", "birthdate": "2007-01-15", "sex": "M", "job_title": "Insurance claims handler", \
                 "company": "Schmidt-Adams", "email": "blevinssarah@example.com", "credit_card": "3577795263152276"} \
           ]'
# Parse the JSON string into a DataFrame by wrapping it in a one-element RDD.
employeeBronzeChangeDF = spark.read.json(sc.parallelize([employeeBronzeChangeData]))
display(employeeBronzeChangeDF)

address,birthdate,company,credit_card,email,employee_id,job_title,name,phone,sex,ssn
"999 Main St, Apt. 726 West Jessica, PA 48745",2007-01-15,Schmidt-Adams,3577795263152276,blevinssarah@example.com,6,Insurance claims handler,Mark Sanchez,216.435.6545,M,303-33-9126


In [0]:
# Shape the change record exactly like the rows of employee_silver_pii: attach the employee's
# stored salt, mask email with it, and keep only the PII columns.
piicol=get_piicol_list()
piicol.append('employee_id')
silverEmployeeChangeJoinedDF = employeeBronzeChangeDF.join(silverEmployeeSaltDF, "employee_id") \
                                                     .drop(silverEmployeeSaltDF.employee_id)
bronzeEmployeeChangePIIDF = mask_dataframe(silverEmployeeChangeJoinedDF.drop('employee_id'), ['email'])
bronzeEmployeeChangePIIDF = employee_pii_df(bronzeEmployeeChangePIIDF, piicol)

# Upsert into the PII table by path. Rows are matched on the masked email, which lines up with
# the stored value because the same per-employee salt is used on both sides.
deltaTable = DeltaTable.forPath(spark, "dbfs:/tmp/employee/employee-silver-pii")
deltaTable.alias("employee_pii").merge(
bronzeEmployeeChangePIIDF.alias("employee_change"), \
                      "employee_pii.email = employee_change.email") \
                      .whenMatchedUpdateAll() \
                      .whenNotMatchedInsertAll() \
                      .execute()
display(bronzeEmployeeChangePIIDF)

address,credit_card,phone,ssn,email
"999 Main St, Apt. 726 West Jessica, PA 48745",3577795263152276,216.435.6545,303-33-9126,1gg28Hct1tSomhYCbYxUnJjQ2YIVLIhK8mKE3vdm1rzToCANY/2hPsLXNpF+lSeOdFr30/JWfFvMd0FGoYSiwg==


In [0]:
# Incoming delete request: identifies the employee to remove by employee_id and clear-text email.
employeeBronzeDeleteData = '[ \
                {"employee_id": "8", "email": "rgardner@example.com"} \
           ]'
# Parse the JSON string into a DataFrame by wrapping it in a one-element RDD.
employeeBronzeDeleteDF = spark.read.json(sc.parallelize([employeeBronzeDeleteData]))
display(employeeBronzeDeleteDF)

email,employee_id
rgardner@example.com,8


In [0]:
# Shape the delete request like the rows of employee_silver_pii: attach the employee's stored
# salt, mask email with it, and keep only the PII columns (here just the masked email).
piicol=get_piicol_list()
piicol.append('employee_id')
silverEmployeeDeleteJoinedDF = employeeBronzeDeleteDF.join(silverEmployeeSaltDF, "employee_id") \
                                                     .drop(silverEmployeeSaltDF.employee_id)
# Bind the masked frame to its own name instead of overwriting employeeBronzeDeleteDF. The join
# above reads employeeBronzeDeleteDF.employee_id, so rebinding that name to a frame without
# employee_id made this cell fail when it was run a second time.
silverEmployeeDeleteMaskedDF = employee_pii_df(
    mask_dataframe(silverEmployeeDeleteJoinedDF.drop('employee_id'), ['email']),
    piicol,
)
# Expose the masked emails to SQL for the DELETE statement in the next cell.
silverEmployeeDeleteMaskedDF.createOrReplaceTempView('employee_delete')
# Disable Delta's retention safety check; it would otherwise reject the VACUUM ... RETAIN 0 HOURS
# used in the next cell. Do not leave this disabled on tables with concurrent readers or writers.
spark.sql("SET spark.databricks.delta.retentionDurationCheck.enabled = false")

Out[22]: DataFrame[key: string, value: string]

In [0]:
%sql
-- Delete the rows whose masked email appears in the employee_delete view, then vacuum with zero
-- retention so the data files that still contain the deleted rows are physically removed rather
-- than kept for time travel. This relies on the retention duration check having been disabled.
DELETE FROM employee_silver_pii where email in (select email from employee_delete);
VACUUM employee_silver_pii RETAIN 0 HOURS;

path
dbfs:/tmp/employee/employee-silver-pii


In [0]:
%sql
-- Show the table's commit history, one row per table version (create, merge, delete).
DESCRIBE HISTORY employee_silver_pii;

version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata
2,2022-03-06T22:18:42.000+0000,883777857817571,manoj.kukreja@northbaysolutions.com,DELETE,"Map(predicate -> [""(spark_catalog.default.employee_silver_pii.`email` IN (listquery()))""])",,List(1166981717273972),0306-185347-u5luvl2s,1.0,WriteSerializable,False,"Map(numRemovedFiles -> 1, numCopiedRows -> 7, numAddedChangeFiles -> 0, executionTimeMs -> 4309, numDeletedRows -> 1, scanTimeMs -> 2047, numAddedFiles -> 1, rewriteTimeMs -> 2262)",
1,2022-03-06T22:18:32.000+0000,883777857817571,manoj.kukreja@northbaysolutions.com,MERGE,"Map(predicate -> (employee_pii.`email` = employee_change.`email`), matchedPredicates -> [{""actionType"":""update""}], notMatchedPredicates -> [{""actionType"":""insert""}])",,List(1166981717273972),0306-185347-u5luvl2s,0.0,WriteSerializable,False,"Map(numTargetRowsCopied -> 7, numTargetRowsDeleted -> 0, numTargetFilesAdded -> 1, executionTimeMs -> 4984, numTargetRowsInserted -> 0, scanTimeMs -> 2755, numTargetRowsUpdated -> 1, numOutputRows -> 8, numTargetChangeFilesAdded -> 0, numSourceRows -> 1, numTargetFilesRemoved -> 1, rewriteTimeMs -> 2133)",
0,2022-03-06T22:17:48.000+0000,883777857817571,manoj.kukreja@northbaysolutions.com,CREATE TABLE AS SELECT,"Map(isManaged -> false, description -> null, partitionBy -> [], properties -> {})",,List(1166981717273972),0306-185347-u5luvl2s,,WriteSerializable,True,"Map(numFiles -> 1, numOutputBytes -> 3107, numOutputRows -> 8)",
