# Notebook to identify and redact PII data in Structured and Unstructured data using Presidio

In [None]:
# Install the large English model (en_core_web_lg) from the lakehouse, because the wheel
# exceeds the size limit for custom libraries in the Fabric environment.
# Replace the path below with the location of your own wheel file.
%pip install /lakehouse/default/Files/presidio/models/en_core_web_lg-3.8.0-py3-none-any.whl

In [None]:
# Pin numpy to a version that is compatible with spaCy.
%pip install "numpy==1.26.3"

In [15]:
from pyspark.sql.functions import (
    array, lit, explode, col, monotonically_increasing_id, concat
)
from presidio_analyzer import AnalyzerEngine
from presidio_analyzer.nlp_engine import NlpEngineProvider
from presidio_anonymizer import AnonymizerEngine
from pyspark.sql.functions import udf, col
from pyspark.sql.types import ArrayType, StringType
from pyspark.sql.functions import pandas_udf, PandasUDFType
from presidio_anonymizer.entities import OperatorConfig
import pandas as pd

StatementMeta(, bf2b6c99-4ef9-4c57-bed9-28aaab77d989, 58, Finished, Available, Finished)

# This configuration sets up a PII detection and anonymization pipeline using Microsoft Presidio with a SpaCy NLP engine and Apache Spark for distributed processing.
# 
```
nlp_engine_name: Specifies the NLP engine to use (spacy).
models: A list of language models to load. Here, it uses the large English model en_core_web_lg for better accuracy in entity recognition.

```

In [16]:
# NLP engine settings: spaCy backed by the large English model.
configuration = dict(
    nlp_engine_name="spacy",
    models=[dict(lang_code="en", model_name="en_core_web_lg")],
)

# Build the NLP engine from the settings above.
provider = NlpEngineProvider(nlp_configuration=configuration)
nlp_engine = provider.create_engine()

# Detection (analyzer) and redaction (anonymizer) engines, created once on the driver.
analyzer = AnalyzerEngine(supported_languages=["en"], nlp_engine=nlp_engine)
anonymizer = AnonymizerEngine()

# Broadcast both engines so each is serialized a single time, shipped to every
# worker node once, and reused by all tasks running on that node.
broadcasted_analyzer, broadcasted_anonymizer = (
    spark.sparkContext.broadcast(analyzer),
    spark.sparkContext.broadcast(anonymizer),
)

StatementMeta(, bf2b6c99-4ef9-4c57-bed9-28aaab77d989, 59, Finished, Available, Finished)

# Redact PII Data for a Column in DataFrames
This method is designed to process DataFrames containing columns with potentially sensitive personally identifiable information (PII). It identifies and redacts PII by replacing detected entities with a placeholder text.

```
Args:
    df – Spark Dataframe consisting of dataset
    column_name – Column name for which PII data needs to be redacted
    replacement_txt(Optional) – Replacement text. Default: "" (each detected entity is replaced with its entity-type placeholder, e.g. <EMAIL_ADDRESS>)
    output_column_name(Optional) – Output column name. Default: {column_name}_redacted

Returns:
    anonymizedDataframe - with new redacted column

```

**Typical Use Cases**

- **Unstructured Data:**
Useful for processing columns extracted from unstructured sources such as .txt or .pdf files, where free-form text may contain PII.

- **Structured Data:**
Can be applied to columns like comments, feedback, or notes in structured tables, where text fields may also include PII.

**How It Works**
- **Detection**:
The method scans the specified column for PII entities using an NLP-based analyzer.

- **Redaction**:
Detected PII is replaced with a placeholder (e.g., "REDACTED" or "ANONYMIZED"), ensuring sensitive information is not exposed.


In [17]:
from pyspark.sql.functions import lit

def identify_and_redact_pii_column(df, column_name, replacement_txt="", output_column_name=None):
    """Add a column holding a PII-redacted copy of ``column_name``.

    Args:
        df: Spark DataFrame containing the column to process.
        column_name: Name of the column whose text should be redacted.
        replacement_txt: Text substituted for every detected entity. An empty
            string (the default) or ``None`` defers to the Presidio ``replace``
            operator's default, documented as the ``<ENTITY_TYPE>`` placeholder.
        output_column_name: Name of the redacted column. Defaults to
            ``f"{column_name}_redacted"``.

    Returns:
        The DataFrame returned by ``df.withColumn`` with the redacted column
        stored under ``output_column_name``.
    """
    if output_column_name is None:
        output_column_name = f"{column_name}_redacted"
    # lit(None) would create an untyped (NullType) literal column; normalise to
    # the empty string so the UDF always receives a string replacement value.
    if replacement_txt is None:
        replacement_txt = ""
    return df.withColumn(
        output_column_name,
        identify_and_redact_pii_pandas_udf(df[column_name], lit(replacement_txt)),
    )

StatementMeta(, bf2b6c99-4ef9-4c57-bed9-28aaab77d989, 60, Finished, Available, Finished)

# PySpark Pandas UDF (identify_and_redact_pii_pandas_udf)

This code defines a **PySpark Pandas UDF (identify_and_redact_pii_pandas_udf)** to identify and redact PII in a Pandas Series of texts using Presidio.

In [18]:
@pandas_udf(StringType())
def identify_and_redact_pii_pandas_udf(texts: pd.Series, replacements: pd.Series) -> pd.Series:
    """Redact PII from each text in a batch using the broadcast Presidio engines.

    Args:
        texts: Series of input texts; entries that are ``None`` are passed
            through as ``None``.
        replacements: Series, paired element-wise with ``texts``, giving the
            value handed to the ``replace`` operator as ``new_value``.

    Returns:
        A Series of anonymized texts in the same order as ``texts``.
    """
    pii_analyzer = broadcasted_analyzer.value
    pii_anonymizer = broadcasted_anonymizer.value

    redacted_values = []
    for raw_text, placeholder in zip(texts, replacements):
        if raw_text is None:
            # Nothing to analyse; keep the null as-is.
            redacted_values.append(None)
            continue

        # An empty entity list is passed to the analyzer
        # (presumably "no entity filter" - confirm against the Presidio docs).
        findings = pii_analyzer.analyze(text=raw_text, entities=[], language='en')
        replace_operator = OperatorConfig("replace", {"new_value": placeholder})
        outcome = pii_anonymizer.anonymize(
            text=raw_text,
            analyzer_results=findings,
            operators={"DEFAULT": replace_operator},
        )
        redacted_values.append(outcome.text)

    return pd.Series(redacted_values)

StatementMeta(, bf2b6c99-4ef9-4c57-bed9-28aaab77d989, 61, Finished, Available, Finished)

# Usage:
```
df = identify_and_redact_pii_column(df, "EmailAddress", replacement_txt="MASKED")
```
Since `replacement_txt` is passed as "MASKED", all identified PII entities will be replaced with "MASKED".
A DataFrame is returned with an additional redacted column named `EmailAddress_redacted`.

```
df = identify_and_redact_pii_column(df, "EmailAddress")

```
Since `replacement_txt` is not passed, each identified PII entity will be replaced with its entity-type placeholder, e.g. `<EMAIL_ADDRESS>`.
A DataFrame is returned with an additional redacted column named `EmailAddress_redacted`.

In [19]:
# Load the sample customer CSV, treating the first row as the header.
df = (
    spark.read.format("csv")
    .option("header", "true")
    .load("Files/data/customer-profile-sample-data/customers-sampledata.csv")
)

# Redact the "Email" column; the result is stored in a new "Email_redacted" column,
# so the original "Email" column is still present in the output.
df_redacted = identify_and_redact_pii_column(df, "Email", replacement_txt="REDACTED")
display(df_redacted)

StatementMeta(, bf2b6c99-4ef9-4c57-bed9-28aaab77d989, 62, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, fe422dd5-300a-48f8-a9bd-f022774b1106)