# Identify PII in Structured and Unstructured Data with Presidio and Replace It with Faker-Generated Synthetic Data

In [None]:
# Install the large English model (en_core_web_lg) from a wheel stored in the lakehouse,
# because it exceeds the size limit for custom libraries in the Fabric environment.
# Replace the path below with the location of your own wheel file.
%pip install /lakehouse/default/Files/presidio/models/en_core_web_lg-3.8.0-py3-none-any.whl

In [None]:
# Pin numpy to a version that is compatible with spaCy.
%pip install "numpy==1.26.3"

In [None]:
# Faker supplies the synthetic replacement values used by the anonymization UDF.
# NOTE(review): the version is not pinned (numpy is) — consider pinning it for reproducible runs.
%pip install Faker


In [4]:
from pyspark.sql.functions import (
    array, lit, explode, col, monotonically_increasing_id, concat
)
from presidio_analyzer import AnalyzerEngine
from presidio_analyzer.nlp_engine import NlpEngineProvider
from presidio_anonymizer import AnonymizerEngine
from pyspark.sql.functions import udf, col
from pyspark.sql.types import ArrayType, StringType
from pyspark.sql.functions import pandas_udf, PandasUDFType
from presidio_anonymizer.entities import OperatorConfig
import pandas as pd
from faker import Faker

StatementMeta(, 3f9ce763-b1a0-4834-9d42-e84635e16a05, 25, Finished, Available, Finished)

# This configuration sets up a PII detection and anonymization pipeline using Microsoft Presidio with a spaCy NLP engine and Apache Spark for distributed processing.

```
nlp_engine_name: Specifies the NLP engine to use (spacy).
models: A list of language models to load. Here, it uses the large English model en_core_web_lg for better accuracy in entity recognition.
```

In [5]:
# NLP engine settings handed to Presidio: use spaCy with the large English model
# (en_core_web_lg), which is installed from the lakehouse wheel in the first cell.
configuration = {
    "nlp_engine_name": "spacy",
    "models": [
        {"lang_code": "en", "model_name": "en_core_web_lg"},
    ]
}

# Build the NLP engine described by the configuration above.
provider = NlpEngineProvider(nlp_configuration=configuration)
nlp_engine = provider.create_engine()

# The analyzer detects PII entities (English only); the anonymizer rewrites the detected spans.
analyzer = AnalyzerEngine(
    nlp_engine=nlp_engine, supported_languages=["en"]
)
anonymizer = AnonymizerEngine()

# Broadcast the analyzer and anonymizer so each object is serialized only once,
# sent to each worker node once, and reused by all tasks on that node.
# The UDF cell reads them back through `.value`.
# NOTE(review): `spark` is not defined in this notebook; presumably the session is
# provided by the Fabric notebook runtime — confirm.
broadcasted_analyzer = spark.sparkContext.broadcast(analyzer)
broadcasted_anonymizer = spark.sparkContext.broadcast(anonymizer)

StatementMeta(, 3f9ce763-b1a0-4834-9d42-e84635e16a05, 26, Finished, Available, Finished)

# Identify PII Data and replace with synthetic data for a Column in DataFrames
Use this method when a DataFrame column contains PII that needs to be identified and replaced with synthetic data.

```
Args:
    dataframe – Spark DataFrame containing the dataset
    column_name – Name of the column to scan for PII
    output_column_name (optional) – Name of the output column. Default: {column_name}_redacted

Returns:
    The input DataFrame with an additional column holding the redacted text

```
**Typical Use Cases**

- **Unstructured Data:**
Useful for processing columns extracted from unstructured sources such as .txt or .pdf files, where free-form text may contain PII.

- **Structured Data:**
Can be applied to columns like comments, feedback, or notes in structured tables, where text fields may also include PII.

**How It Works**
- **Detection**:
The method scans the specified column for PII entities using an NLP-based analyzer.

- **Replace with Synthetic data**:
Detected PII is replaced with synthetic data ensuring sensitive information is not exposed.


In [6]:
def identify_and_replace_pii_column_with_SyntheticData(dataframe, column_name, output_column_name=None):
    """Add a column in which PII found in ``column_name`` is replaced by synthetic data.

    Args:
        dataframe: Spark DataFrame holding the dataset.
        column_name: Name of the column whose values are passed to ``synthetic_udf``.
        output_column_name: Name of the column that receives the result.
            When omitted, ``"<column_name>_redacted"`` is used.

    Returns:
        The result of ``dataframe.withColumn`` with the redacted column added
        (or replaced, if a column of that name already exists).
    """
    target_column = (
        f"{column_name}_redacted" if output_column_name is None else output_column_name
    )
    # synthetic_udf is defined in a later cell; it is only looked up when this function runs.
    redacted_values = synthetic_udf(dataframe[column_name])
    return dataframe.withColumn(target_column, redacted_values)

StatementMeta(, 3f9ce763-b1a0-4834-9d42-e84635e16a05, 27, Finished, Available, Finished)

# PySpark Pandas UDF (synthetic_udf)

This code defines a **PySpark Pandas UDF (synthetic_udf)** that takes a column of text data, detects PII (Personally Identifiable Information) in each row, and replaces detected PII with synthetic (fake) values using the Faker library.

In [7]:
@pandas_udf(StringType())
def synthetic_udf(texts: pd.Series) -> pd.Series:
    """Replace PII detected in each text value with Faker-generated synthetic data.

    Series-to-Series pandas UDF. Every value is analyzed with the broadcast
    Presidio analyzer, and each detected entity is replaced by its own freshly
    generated value from ``get_synthetic_value``. Two different entities of the
    same type in one row (e.g. two person names) therefore no longer collapse
    into a single replacement.

    Args:
        texts: Batch of text values; ``None`` entries are passed through.

    Returns:
        A Series of the same length holding the anonymized text. Values in
        which no PII was detected are returned unchanged.
    """
    analyzer = broadcasted_analyzer.value
    anonymizer = broadcasted_anonymizer.value
    fake = Faker()

    def make_generator(entity_type):
        # Bind entity_type now: a bare lambda created in a loop would late-bind
        # to the last entity type seen.
        def generate(_original_text):
            # The "custom" operator passes the matched text; it is intentionally ignored.
            return get_synthetic_value(entity_type, fake)
        return generate

    def synthesize(text):
        if text is None:
            return None
        # NOTE(review): an empty `entities` list appears to be treated like None
        # (detect every supported entity type) — confirm for the installed
        # presidio-analyzer version.
        results = analyzer.analyze(text=text, entities=[], language='en')
        if not results:
            # Nothing detected: there is nothing to replace, so skip the anonymizer.
            return text
        # One "custom" operator per detected entity type; the anonymizer invokes
        # its callable once per occurrence, giving each occurrence its own value.
        operators = {
            entity_type: OperatorConfig("custom", {"lambda": make_generator(entity_type)})
            for entity_type in {result.entity_type for result in results}
        }
        # Keep the original fallback text for any entity type without an explicit operator.
        operators["DEFAULT"] = OperatorConfig("replace", {"new_value": "SYNTHETIC_DATA"})
        anonymized_result = anonymizer.anonymize(
            text=text,
            analyzer_results=results,
            operators=operators
        )
        return anonymized_result.text

    # Always return a pandas.Series
    return texts.apply(synthesize)


StatementMeta(, 3f9ce763-b1a0-4834-9d42-e84635e16a05, 28, Finished, Available, Finished)

# Synthetic Value Generator Function

This method returns a synthetic value for a given PII entity type using Faker.

In [8]:
def get_synthetic_value(entity_type, fake):
    """Return a synthetic replacement for one PII entity type.

    Args:
        entity_type: Entity label to replace (e.g. ``"PERSON"``).
        fake: Faker-like object exposing the provider methods listed below.

    Returns:
        The value produced by the matching provider method (the ``DATE_TIME``
        value is converted with ``str``), or ``"SYNTHETIC_DATA"`` when the
        entity type has no mapping.
    """
    # (entity label, provider method name on `fake`, convert result with str()?)
    # Add more entity types as needed.
    provider_table = (
        ("PERSON", "name", False),
        ("EMAIL_ADDRESS", "email", False),
        ("PHONE_NUMBER", "phone_number", False),
        ("LOCATION", "city", False),
        ("DATE_TIME", "date_time", True),
        ("CREDIT_CARD", "credit_card_number", False),
    )
    for label, provider_name, stringify in provider_table:
        if entity_type == label:
            generated = getattr(fake, provider_name)()
            return str(generated) if stringify else generated
    return "SYNTHETIC_DATA"

StatementMeta(, 3f9ce763-b1a0-4834-9d42-e84635e16a05, 29, Finished, Available, Finished)

# Usage Example

In [9]:
# Read the sample customer CSV, treating the first row as the header.
df = (
    spark.read.format("csv")
    .option("header", "true")
    .load("Files/data/customer-profile-sample-data/customers-sampledata.csv")
)

# Add a column with the synthetic replacement for the "Email" column.
df = identify_and_replace_pii_column_with_SyntheticData(df, "Email", "Email_redacted_RK")

# Show the original and the redacted column side by side.
display(df.select("Email", "Email_redacted_RK"))

StatementMeta(, 3f9ce763-b1a0-4834-9d42-e84635e16a05, 30, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 2f92b1dc-4b31-426c-be9d-2387b71b885e)