# Anonymize PII Entities in text files

<br>Using Presidio, anonymize PII content of files in an Azure Storage account.

<br>The following code sample will:
<ol>
<li>Import the content of text files located in an Azure Storage blob folder</li>
<li>Anonymize the content using Presidio</li>
<li>Write the anonymized content back to an output folder in the same Azure Storage blob container</li>
</ol>

In [None]:
from azure.storage.blob import  BlobServiceClient
from presidio_analyzer import AnalyzerEngine, PatternRecognizer
from presidio_anonymizer import AnonymizerEngine
from presidio_anonymizer.entities import AnonymizerConfig
from pyspark.sql.types import StringType
from pyspark.sql.functions import input_file_name, regexp_replace

# Notebook parameters, declared as (widget name, default value, label) and
# registered in this order as free-text widgets.
widget_specs = [
  ("storage_account_name", "", "Blob Storage Account Name"),
  ("storage_container_name", "", "Blob Container Name"),
  ("storage_account_access_key", "", "Storage Account Access Key"),
  ("storage_input_folder", "input", "Input Folder"),
  ("storage_output_folder", "output", "Output Folder"),
]
for widget_name, widget_default, widget_label in widget_specs:
  dbutils.widgets.text(widget_name, widget_default, widget_label)


# Import the text files from Azure Blob storage


In [None]:
# Connection settings are supplied through the notebook widgets.
storage_account_name = dbutils.widgets.get("storage_account_name")
storage_container_name = dbutils.widgets.get("storage_container_name")
storage_account_access_key = dbutils.widgets.get("storage_account_access_key")

# Host name shared by the SDK endpoint, the wasbs:// paths and the Spark key setting.
blob_host = storage_account_name + ".blob.core.windows.net"

blob_service_client = BlobServiceClient(account_url="https://" + blob_host + "/", credential=storage_account_access_key)
container_client = blob_service_client.get_container_client(storage_container_name)

# List every blob under the input folder and turn each name into a wasbs:// path
# that Spark can read.
blob_names = container_client.list_blobs(name_starts_with=dbutils.widgets.get("storage_input_folder") + "/")
blobs = ["wasbs://" + storage_container_name + "@" + blob_host + "/" + blob.name for blob in blob_names]

# Let Spark authenticate against the storage account with the same access key.
spark.conf.set("fs.azure.account.key." + blob_host, storage_account_access_key)

# wholetext=True loads each file as a single row. The default (one row per line)
# would make the later write-back upload every line separately to the same blob
# name, so a multi-line file could never be written back intact.
# NOTE: despite its name, input_rdd is a DataFrame with columns "value" and "filename".
input_rdd = spark.read.text(blobs, wholetext=True).withColumn("filename", input_file_name())

input_rdd.show()

# Anonymize Text using Presidio


In [None]:
def anonymize_text(text, analyzer=None, anonymizer=None):
  """Replace every PII entity Presidio detects in `text` with "<ANONYMIZED>".

  Args:
    text: The text to analyze (analyzed as English).
    analyzer: Optional AnalyzerEngine to reuse. A new one is created when omitted.
    anonymizer: Optional AnonymizerEngine to reuse. A new one is created when omitted.

  Returns:
    Whatever AnonymizerEngine.anonymize returns for the text.
  """
  # Building the engines is expensive, so callers processing many texts should
  # pass in shared instances instead of relying on these per-call defaults.
  if analyzer is None:
    analyzer = AnalyzerEngine()
  if anonymizer is None:
    anonymizer = AnonymizerEngine()
  analyzer_results = analyzer.analyze(text=text, language='en')
  return anonymizer.anonymize(
    text=text,
    analyzer_results=analyzer_results,
    anonymizers_config={"DEFAULT": AnonymizerConfig("replace", {"new_value": "<ANONYMIZED>"})}
  )


def _anonymize_partition(rows):
  """Yield (text, filename, anonymized_text) for each row, sharing one pair of engines."""
  # The engines are created here, on the executor, once per partition rather
  # than once per row.
  analyzer = AnalyzerEngine()
  anonymizer = AnonymizerEngine()
  for row in rows:
    yield (row["value"], row["filename"], anonymize_text(row["value"], analyzer, anonymizer))


anonymized_rdd = input_rdd.rdd.mapPartitions(_anonymize_partition).toDF(["text", "filename", "anonymized_text"])

anonymized_rdd.show()


# Write the Anonymized content back to Azure Blob storage

In [None]:
import re

input_folder = dbutils.widgets.get("storage_input_folder")
output_folder = dbutils.widgets.get("storage_output_folder")

# Rewrite each source path to its destination blob name: everything up to and
# including "/<input folder>/" is replaced by "<output folder>/".
# The folder names are user input, so the input folder is escaped before being
# embedded in the regex, and "\" and "$" are escaped in the replacement because
# the regex engine would otherwise treat them as escapes / group references.
filename_pattern = "^.*(/" + re.escape(input_folder) + "/)"
filename_replacement = output_folder.replace("\\", "\\\\").replace("$", "\\$") + "/"
output_rdd = anonymized_rdd.withColumn('filename', regexp_replace('filename', filename_pattern, filename_replacement))

# `udf` is used below but is not among the file's imports; import it explicitly
# so the cell does not depend on the notebook environment predefining it.
from pyspark.sql.functions import udf


def upload_to_blob(text, file_name):
  """Upload `text` as the blob `file_name` in the configured storage container.

  Args:
    text: Content to store in the blob.
    file_name: Name (path inside the container) of the destination blob.

  Returns:
    The string "SAVED" once the upload call has returned.
  """
  blob_client = blob_service_client.get_blob_client(container=storage_container_name, blob=file_name)
  # overwrite=True makes the upload repeatable. Without it the call raises once
  # the blob exists, which happens as soon as this UDF is evaluated a second
  # time (show() followed by collect() below, or a re-run of the notebook).
  blob_client.upload_blob(text, overwrite=True)
  return "SAVED"


save_udf = udf(upload_to_blob, StringType())
# Invoke the UDF for each row of the DataFrame.
out_df = output_rdd.withColumn("processed", save_udf(output_rdd.anonymized_text, output_rdd.filename))

# Preview the result; "processed" holds "SAVED" for rows whose upload returned.
out_df.show()

# Force evaluation of every row so that all files are uploaded.
out_df.collect()
