# Run Presidio on structured / semi-structured data

This sample shows how Presidio can be extended to handle the anonymization of a table or data frame.
It introduces methods for the analysis and anonymization of both lists and dicts. 

In this example we leverage the batch mode in presidio-analyzer and implement a `BatchAnonymizerEngine(AnonymizerEngine)` class for the presidio-anonymizer side.

Note: the sample inputs here are a Pandas DataFrame and a JSON-like nested dictionary, but the same approach can be used in other scenarios such as querying SQL data or using Spark DataFrames.

path to notebook: https://github.com/microsoft/presidio/blob/main/docs/samples/python/batch_processing.ipynb

### Set up imports

In [1]:
import collections
import pprint
from dataclasses import dataclass
from typing import Any, Dict, Iterable, Iterator, List, Optional, Union

import pandas as pd

from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine, RecognizerResult, DictAnalyzerResult
from presidio_anonymizer import AnonymizerEngine
from presidio_anonymizer.entities import EngineResult


### Handling batch data on Presidio Anonymizer

In [2]:
class BatchAnonymizerEngine(AnonymizerEngine):
    """
    Class inheriting from the AnonymizerEngine and adding additional functionality
    for anonymizing lists or dictionaries.

    Both helpers delegate the actual text replacement to
    `AnonymizerEngine.anonymize`, one value at a time.
    """

    def anonymize_list(
        self,
        texts: List[Union[str, bool, int, float]],
        recognizer_results_list: List[List[RecognizerResult]],
        **kwargs
    ) -> List[Any]:
        """
        Anonymize a list of values.

        :param texts: List containing the texts to be anonymized (original texts).
        Values of type str, bool, int or float are converted with `str()` and
        anonymized; values of any other type are passed through unchanged.
        :param recognizer_results_list: A list of lists of RecognizerResult,
        the output of the AnalyzerEngine on each text in the list.
        If empty or None, every text is treated as having no findings.
        :param kwargs: Additional kwargs for the `AnonymizerEngine.anonymize` method
        :return: A list with one entry per (text, results) pair: the anonymized
        string for supported types, otherwise the original value.
        """
        if not recognizer_results_list:
            # Materialize first so un-sized iterables (e.g. generators) also work.
            texts = list(texts)
            recognizer_results_list = [[] for _ in texts]

        anonymized = []
        # NOTE: zip stops at the shorter input, so callers must pass one
        # result list per text to get one output per text.
        for text, recognizer_results in zip(texts, recognizer_results_list):
            # isinstance (rather than an exact type match) so that subclasses
            # of str are anonymized as well instead of silently passing through.
            if isinstance(text, (str, bool, int, float)):
                response = self.anonymize(
                    text=str(text),
                    analyzer_results=recognizer_results,
                    **kwargs
                )
                anonymized.append(response.text)
            else:
                anonymized.append(text)

        return anonymized

    def anonymize_dict(
        self,
        analyzer_results: Iterable[DictAnalyzerResult],
        **kwargs
    ) -> Dict[str, Any]:
        """
        Anonymize values in a dictionary.

        :param analyzer_results: Iterable of `DictAnalyzerResult`
        containing the output of `BatchAnalyzerEngine.analyze_dict` on the input dict.
        :param kwargs: Additional kwargs for the `AnonymizerEngine.anonymize` method
        :return: A dict keyed by each result's key. Dict values are anonymized
        recursively, str values are replaced by their anonymized text, other
        iterables become a list produced by `anonymize_list`, and any other
        value is copied as is.
        """
        anonymized = {}
        for result in analyzer_results:
            value = result.value

            if isinstance(value, dict):
                # For nested dicts, recognizer_results holds the nested
                # per-key results, so recurse on it.
                anonymized[result.key] = self.anonymize_dict(
                    analyzer_results=result.recognizer_results, **kwargs
                )

            elif isinstance(value, str):
                # Must be checked before the Iterable branch: str is iterable too.
                response = self.anonymize(
                    text=value,
                    analyzer_results=result.recognizer_results,
                    **kwargs
                )
                anonymized[result.key] = response.text

            elif isinstance(value, collections.abc.Iterable):
                anonymized[result.key] = self.anonymize_list(
                    texts=value,
                    recognizer_results_list=result.recognizer_results,
                    **kwargs
                )

            else:
                # Non-iterable scalars (e.g. a bare int) are kept untouched.
                anonymized[result.key] = value

        return anonymized

## Example using sample tabular data

In [3]:
# Column headers, in the same order as the fields of each row tuple below
columns = ["name phrase", "phone number phrase", "integer", "boolean"]

# One tuple per table row: (text with a name, text with a phone number, int, bool)
sample_data = [
    ("Morris likes this", "Please call 212-555-1234 after 2pm", 1, True),
    ("You should talk to Mike", "his number is 978-428-7111", 2, False),
    ("Mary had a little startup", "Phone number: 202-342-1234", 3, False),
]

In [4]:
# Create a Pandas DataFrame from the sample rows, using `columns` as the column names
df  = pd.DataFrame(sample_data,columns=columns)

# Last expression of the cell: displays the DataFrame
df

Unnamed: 0,name phrase,phone number phrase,integer,boolean
0,Morris likes this,Please call 212-555-1234 after 2pm,1,True
1,You should talk to Mike,his number is 978-428-7111,2,False
2,Mary had a little startup,Phone number: 202-342-1234,3,False


In [5]:
# DataFrame to dict: orient="list" yields {column name: [values of that column]},
# as shown by the pprint output in the following cell
df_dict = df.to_dict(orient="list")

In [6]:
# Pretty-print the column name -> values mapping
pprint.pprint(df_dict)

{'boolean': [True, False, False],
 'integer': [1, 2, 3],
 'name phrase': ['Morris likes this',
                 'You should talk to Mike',
                 'Mary had a little startup'],
 'phone number phrase': ['Please call 212-555-1234 after 2pm',
                         'his number is 978-428-7111',
                         'Phone number: 202-342-1234']}


In [7]:
# Analyzer side: wrap a regular AnalyzerEngine in presidio's BatchAnalyzerEngine
analyzer = AnalyzerEngine()
batch_analyzer = BatchAnalyzerEngine(analyzer_engine=analyzer)
# Anonymizer side: the BatchAnonymizerEngine subclass defined in this notebook
batch_anonymizer = BatchAnonymizerEngine()

In [8]:
# Analyze every column of the dict (one DictAnalyzerResult per key, see output below)
analyzer_results = batch_analyzer.analyze_dict(df_dict, language="en")
# Materialize into a list so the results can be displayed here and still be
# passed to the anonymizer afterwards (presumably analyze_dict returns a lazy
# iterator -- confirm against the presidio-analyzer docs)
analyzer_results = list(analyzer_results)
analyzer_results

[DictAnalyzerResult(key='name phrase', value=['Morris likes this', 'You should talk to Mike', 'Mary had a little startup'], recognizer_results=[[type: PERSON, start: 0, end: 6, score: 0.85], [type: PERSON, start: 19, end: 23, score: 0.85], []]),
 DictAnalyzerResult(key='phone number phrase', value=['Please call 212-555-1234 after 2pm', 'his number is 978-428-7111', 'Phone number: 202-342-1234'], recognizer_results=[[type: DATE_TIME, start: 31, end: 34, score: 0.85, type: PHONE_NUMBER, start: 12, end: 24, score: 0.75], [type: PHONE_NUMBER, start: 14, end: 26, score: 0.75], [type: PHONE_NUMBER, start: 14, end: 26, score: 0.75]]),
 DictAnalyzerResult(key='integer', value=[1, 2, 3], recognizer_results=[[], [], []]),
 DictAnalyzerResult(key='boolean', value=[True, False, False], recognizer_results=[[], [], []])]

In [9]:
# Anonymize each column; the result maps column name -> list of anonymized values
anonymizer_results = batch_anonymizer.anonymize_dict(analyzer_results)

In [10]:
# Rebuild a DataFrame from the anonymized {column name: values} dict
scrubbed_df = pd.DataFrame(anonymizer_results)

In [11]:
# Display the anonymized DataFrame
scrubbed_df

Unnamed: 0,name phrase,phone number phrase,integer,boolean
0,<PERSON> likes this,Please call <PHONE_NUMBER> after <DATE_TIME>,1,True
1,You should talk to <PERSON>,his number is <PHONE_NUMBER>,2,False
2,Mary had a little startup,Phone number: <PHONE_NUMBER>,3,False


## Example using JSON

In [12]:
# Sample nested input mixing the value kinds handled by `anonymize_dict`:
# a nested dict, a set, a scalar and a list of strings.
nested_dict = {
    "key_a": {
        "key_a1": "My phone number is 212-121-1424",
    },
    "key_b": {"www.abc.com"},  # a set literal (single element), not a dict
    "key_c": 3,
    "names": [
        "James Bond",
        "Clark Kent",
        "Hakeem Olajuwon",
        "No name here!",
    ],
}

pprint.pprint(nested_dict)

{'key_a': {'key_a1': 'My phone number is 212-121-1424'},
 'key_b': {'www.abc.com'},
 'key_c': 3,
 'names': ['James Bond', 'Clark Kent', 'Hakeem Olajuwon', 'No name here!']}


In [13]:
# Analyze the nested dict; the results are passed on as-is (not wrapped in list()),
# so they are consumed once by the anonymizer below
analyzer_results = batch_analyzer.analyze_dict(input_dict = nested_dict, language="en")

# Anonymize the dict from the analyzer results and print the anonymized structure
anonymizer_results = batch_anonymizer.anonymize_dict(analyzer_results = analyzer_results)
pprint.pprint(anonymizer_results)

{'key_a': {'key_a1': 'My phone number is <PHONE_NUMBER>'},
 'key_b': ['<URL>'],
 'key_c': 3,
 'names': ['<PERSON>', '<PERSON>', '<PERSON>', 'No name here!']}


### Ignoring specific keys

In [14]:
# Keys listed in `keys_to_skip` are not analyzed, so their values come back unchanged.
# Here both a nested key ("key_a1") and a top-level key ("names") are skipped
# (compare with the output below).
keys_to_skip=["key_a1", "names"]
analyzer_results = batch_analyzer.analyze_dict(input_dict = nested_dict, language="en", keys_to_skip=keys_to_skip)

# Anonymize dict
anonymizer_results = batch_anonymizer.anonymize_dict(analyzer_results = analyzer_results)
pprint.pprint(anonymizer_results)

{'key_a': {'key_a1': 'My phone number is 212-121-1424'},
 'key_b': ['<URL>'],
 'key_c': 3,
 'names': ['James Bond', 'Clark Kent', 'Hakeem Olajuwon', 'No name here!']}


### Ignoring nested keys

In [15]:
# A nested key can be addressed with a dotted path ("<parent key>.<child key>");
# only "key_a1" inside "key_a" is skipped, "names" is anonymized again
# (compare with the output below).
keys_to_skip = ["key_a.key_a1"]

analyzer_results = batch_analyzer.analyze_dict(input_dict = nested_dict, language="en", keys_to_skip=keys_to_skip)

# Anonymize dict
anonymizer_results = batch_anonymizer.anonymize_dict(analyzer_results = analyzer_results)
pprint.pprint(anonymizer_results)

{'key_a': {'key_a1': 'My phone number is 212-121-1424'},
 'key_b': ['<URL>'],
 'key_c': 3,
 'names': ['<PERSON>', '<PERSON>', '<PERSON>', 'No name here!']}


#### **Note!**

JSON files with objects within lists are not yet supported, e.g.:
```
{
  "key": [
    {
      "key2": "Peter Parker"
    },
    {
      "key3": "555-1234"
    }
  ]
}
```

Consider breaking the JSON into parts if needed.