# Run Presidio on structured / semi-structured data

This sample shows how Presidio can be extended to handle the anonymization of a table or data frame.
It introduces methods for the analysis and anonymization of both lists and dicts.

In this example we create two classes which extend the base Presidio classes:
1. `BatchAnalyzerEngine(AnalyzerEngine)`: for the presidio-analyzer side
2. `BatchAnonymizerEngine(AnonymizerEngine)`: for the presidio-anonymizer side

In addition, we create a `dataclass` (`DictAnalyzerResult`) to serve as the data transfer object between the two.

Note: the sample input here is a Pandas DataFrame, but the same approach can be used in other scenarios, such as querying SQL data or using Spark DataFrames.

### Set up imports

In [1]:
import collections
import collections.abc
import pprint
from dataclasses import dataclass
from typing import Dict, Iterable, Iterator, List, Optional, Union

import pandas as pd

from presidio_analyzer import AnalyzerEngine, RecognizerResult
from presidio_anonymizer import AnonymizerEngine
from presidio_anonymizer.entities.engine.result import EngineResult


### Set up classes for batch processing

1. Presidio Analyzer: Batch mode

In [2]:
@dataclass
class DictAnalyzerResult:
    """
    Hold the analyzer results for a single dictionary entry.

    Serves as the data transfer object between the analyzer side
    (`BatchAnalyzerEngine.analyze_dict` yields one instance per key)
    and the anonymizer side (`BatchAnonymizerEngine.anonymize_dict` consumes them).
    """
    # Key of the analyzed dictionary entry (e.g. a column name).
    key: str
    # Original value of the entry, stored as is.
    # NOTE: `analyze_dict` also stores non-string values (or iterables holding
    # non-string items) here unchanged, so the value is not guaranteed to match the hint.
    value: Union[str, List[str]]
    # A flat list of results when `value` is a single string, or one list of
    # results per item when `value` is an iterable. Empty when nothing was analyzed.
    recognizer_results: Union[List[RecognizerResult], List[List[RecognizerResult]]]


class BatchAnalyzerEngine(AnalyzerEngine):
    """
    Class inheriting from AnalyzerEngine and adding the functionality to analyze lists or dictionaries.

    Only strings are passed to `AnalyzerEngine.analyze`. Values which are not strings
    are never analyzed and receive an empty list of results.
    """

    def analyze_list(self, list_of_texts: Iterable[str], **kwargs) -> List[List[RecognizerResult]]:
        """
        Analyze an iterable of strings.

        :param list_of_texts: An iterable containing strings to be analyzed.
        :param kwargs: Additional parameters for the `AnalyzerEngine.analyze` method.
        :return: One list of `RecognizerResult` per item, in the order of the input.
        Items which are not strings are not analyzed and get an empty list,
        so the output stays aligned with the input.
        """
        return [
            self.analyze(text=text, **kwargs) if isinstance(text, str) else []
            for text in list_of_texts
        ]

    def analyze_dict(
        self, input_dict: Dict[str, Union[object, Iterable[object]]], **kwargs
    ) -> Iterator[DictAnalyzerResult]:
        """
        Analyze a dictionary of keys (strings) and values (either object or Iterable[object]).
        Non-string values are returned as is.

        This is a generator: entries are analyzed lazily, one per iteration,
        and the returned iterator can only be consumed once.

        :param input_dict: The input dictionary for analysis
        :param kwargs: Additional keyword arguments for the `AnalyzerEngine.analyze` method
        :return: An iterator yielding one `DictAnalyzerResult` per dictionary entry,
        holding the key, the original value and the analysis results for that value.
        """
        for key, value in input_dict.items():
            results: Union[List[RecognizerResult], List[List[RecognizerResult]]]
            if not value:
                # Falsy values (None, "", 0, False, empty containers) hold nothing to analyze.
                results = []
            elif isinstance(value, str):
                # Checked before Iterable: a string is iterable too, but is analyzed as one text.
                results = self.analyze(text=value, **kwargs)
            elif isinstance(value, collections.abc.Iterable):
                # `collections.Iterable` was removed in Python 3.10; the ABC lives in `collections.abc`.
                results = self.analyze_list(list_of_texts=value, **kwargs)
            else:
                # Non-iterable, non-string objects (numbers, booleans, ...) are not analyzed.
                results = []
            yield DictAnalyzerResult(key=key, value=value, recognizer_results=results)


2. Presidio Anonymizer: Batch mode

In [3]:
class BatchAnonymizerEngine(AnonymizerEngine):
    """
    Class inheriting from the AnonymizerEngine and adding additional functionality
    for anonymizing lists or dictionaries.

    Only strings are passed to `AnonymizerEngine.anonymize`.
    Values which are not strings are returned unchanged.
    """

    def anonymize_list(
        self,
        texts: List[str],
        recognizer_results_list: List[List[RecognizerResult]],
        **kwargs
    ) -> List[object]:
        """
        Anonymize a list of strings.

        :param texts: List containing the texts to be anonymized (original texts)
        :param recognizer_results_list: A list of lists of RecognizerResult,
        the output of the AnalyzerEngine on each text in the list.
        :param kwargs: Additional kwargs for the `AnonymizerEngine.anonymize` method
        :return: A list holding the anonymized text (a string, not an `EngineResult`)
        for every string item, and the original item for every non-string item.
        """
        return_list = []
        # The two inputs are paired by position; zip stops at the shorter of the two.
        for text, recognizer_results in zip(texts, recognizer_results_list):
            if isinstance(text, str):
                res = self.anonymize(text=text, analyzer_results=recognizer_results, **kwargs)
                return_list.append(res.text)
            else:
                # Non-string items (numbers, booleans, None, ...) pass through untouched.
                return_list.append(text)

        return return_list

    def anonymize_dict(self, analyzer_results: Iterator[DictAnalyzerResult], **kwargs) -> Dict[str, object]:
        """
        Anonymize values in a dictionary.

        :param analyzer_results: Iterator of `DictAnalyzerResult`
        containing the output of the AnalyzerEngine.analyze_dict on the input text.
        :param kwargs: Additional kwargs for the `AnonymizerEngine.anonymize` method
        :return: A dictionary with the same keys as the analyzed one. String values are
        replaced by their anonymized text, iterable values by a list of anonymized items
        (see `anonymize_list`), and any other value is kept as is.
        """
        return_dict = {}
        for result in analyzer_results:
            if isinstance(result.value, str):
                # Checked before Iterable: a string is iterable too, but is anonymized as one text.
                resp = self.anonymize(
                    text=result.value, analyzer_results=result.recognizer_results, **kwargs
                )
                return_dict[result.key] = resp.text
            elif isinstance(result.value, collections.abc.Iterable):
                # `collections.Iterable` was removed in Python 3.10; the ABC lives in `collections.abc`.
                anonymize_responses = self.anonymize_list(
                    texts=result.value,
                    recognizer_results_list=result.recognizer_results,
                    **kwargs
                )
                return_dict[result.key] = anonymize_responses
            else:
                return_dict[result.key] = result.value

        return return_dict

## Example using sample data

In [4]:
# Column headers of the sample table: two free-text columns followed by two non-text columns.
columns = [
    "name phrase",
    "phone number phrase",
    "integer",
    "boolean",
]

# One tuple per row, with the fields ordered like `columns`.
sample_data = [
    ("Morris likes this", "Please call 212-555-1234 after 2pm", 1, True),
    ("You should talk to Mike", "his number is 978-428-7111", 2, False),
    ("Mary had a little startup", "Phone number: 202-342-1234", 3, False),
]

In [5]:
# Load the sample rows into a Pandas DataFrame, labelling the columns with `columns`
df = pd.DataFrame(data=sample_data, columns=columns)

df

Unnamed: 0,name phrase,phone number phrase,integer,boolean
0,Morris likes this,Please call 212-555-1234 after 2pm,1,True
1,You should talk to Mike,his number is 978-428-7111,2,False
2,Mary had a little startup,Phone number: 202-342-1234,3,False


In [6]:
# DataFrame to dict: orient="list" gives {column name: list of that column's values},
# i.e. the dict-of-iterables shape that BatchAnalyzerEngine.analyze_dict iterates over
df_dict = df.to_dict(orient="list")

In [7]:
# Pretty-print the dict to inspect the per-column lists (pprint shows the keys sorted)
pprint.pprint(df_dict)

{'boolean': [True, False, False],
 'integer': [1, 2, 3],
 'name phrase': ['Morris likes this',
                 'You should talk to Mike',
                 'Mary had a little startup'],
 'phone number phrase': ['Please call 212-555-1234 after 2pm',
                         'his number is 978-428-7111',
                         'Phone number: 202-342-1234']}


In [8]:
# Instantiate the two batch engines, using their default constructor arguments
batch_analyzer = BatchAnalyzerEngine()
batch_anonymizer = BatchAnonymizerEngine()

In [9]:
# analyze_dict is a generator function: nothing is analyzed on this line. The analysis runs
# lazily when the returned iterator is consumed, and the iterator can only be consumed once.
# `language="en"` is forwarded to AnalyzerEngine.analyze through **kwargs.
analyzer_results = batch_analyzer.analyze_dict(df_dict, language="en")

In [None]:
# Consume the analyzer results and anonymize every entry. No extra kwargs are given,
# so AnonymizerEngine.anonymize is called with only the text and its analyzer results.
anonymizer_results = batch_anonymizer.anonymize_dict(analyzer_results)

In [11]:
# Rebuild a DataFrame from the anonymized {column name: list of values} dict
scrubbed_df = pd.DataFrame(anonymizer_results)

In [12]:
# Display the anonymized DataFrame
scrubbed_df

Unnamed: 0,name phrase,phone number phrase,integer,boolean
0,<PERSON> likes this,Please call <PHONE_NUMBER> after <DATE_TIME>,1,True
1,You should talk to <PERSON>,his number is <PHONE_NUMBER>,2,False
2,<PERSON> had a little startup,Phone number: <PHONE_NUMBER>,3,False
