In [1]:
import collections
import collections.abc
from dataclasses import dataclass
from typing import Iterator, List, Optional, Union, Dict, Iterable

from presidio_analyzer import AnalyzerEngine, RecognizerResult
from presidio_anonymizer import AnonymizerEngine
from presidio_anonymizer.entities.engine.result import EngineResult

In [2]:
import pandas as pd


In [3]:
@dataclass
class DictAnalyzerResult:
    """Analysis outcome for one key/value pair of an input dictionary.

    ``DictAnalyzerEngine.analyze_dict`` yields one instance per key, and
    ``DictAnonymizerEngine.anonymize_dict`` consumes them to rebuild the
    dictionary with anonymized text.
    """

    # Dictionary key the value was stored under.
    key: str
    # The original, un-anonymized value. The engines also store values that
    # are not text (e.g. an int or None) here unchanged.
    value: Union[str, List[str]]
    # Findings for `value`: a flat list when `value` is a single string, or
    # one inner list per item when `value` is a list of strings. Values that
    # are not analyzed get an empty list.
    recognizer_results: Union[List[RecognizerResult], List[List[RecognizerResult]]]

In [4]:
class DictAnalyzerEngine(AnalyzerEngine):
    """Analyzer that scans the string values of a flat dictionary."""

    def analyze_dict(self, input_dict: Dict[str, str], **kwargs) -> Iterator[DictAnalyzerResult]:
        """Lazily analyze every entry of ``input_dict``.

        String values are handed to ``self.analyze`` together with ``kwargs``;
        any other value is paired with an empty list of findings. One
        ``DictAnalyzerResult`` is yielded per key, in the dictionary's order.
        """
        for field_name, field_value in input_dict.items():
            if isinstance(field_value, str):
                findings = self.analyze(text=field_value, **kwargs)
            else:
                # Non-text values are not analyzed.
                findings = []
            yield DictAnalyzerResult(
                key=field_name, value=field_value, recognizer_results=findings
            )


analyzer = DictAnalyzerEngine()

In [5]:
class DictAnonymizerEngine(AnonymizerEngine):
    """Anonymizer that rebuilds a dictionary from ``DictAnalyzerResult`` items."""

    def anonymize_dict(self, analyzer_results: Iterator[DictAnalyzerResult], **kwargs) -> Dict[str, str]:
        """Anonymize each analyzed entry and return the rebuilt dictionary.

        String values are replaced by the text returned from ``self.anonymize``
        (called with the entry's recognizer results and ``kwargs``); every
        other value is copied to the output unchanged under the same key.
        """

        def _scrub(entry):
            # Only text can be anonymized; pass everything else through.
            if not isinstance(entry.value, str):
                return entry.value
            outcome = self.anonymize(
                text=entry.value, analyzer_results=entry.recognizer_results, **kwargs
            )
            return outcome.text

        return {entry.key: _scrub(entry) for entry in analyzer_results}


anonymizer = DictAnonymizerEngine()

Analyze

In [6]:
# Flat dictionary mixing text values with non-text values (an int and None).
simple_dict = {"a":"My name is Robert", "b":"I live in Maine","c":5,"d":None}
# analyze_dict is a generator function, so nothing is analyzed until it is iterated.
simple_dict_results = analyzer.analyze_dict(simple_dict,language="en")
# Materialize the results so they can be printed here and reused by the next cell.
simple_dict_results = list(simple_dict_results)
print(simple_dict_results)



[DictAnalyzerResult(key='a', value='My name is Robert', recognizer_results=[type: PERSON, start: 11, end: 17, score: 0.85]), DictAnalyzerResult(key='b', value='I live in Maine', recognizer_results=[type: LOCATION, start: 10, end: 15, score: 0.85]), DictAnalyzerResult(key='c', value=5, recognizer_results=[]), DictAnalyzerResult(key='d', value=None, recognizer_results=[])]


Anonymize

In [7]:
# One result: anonymize only the first entry using its own recognizer results
one_resp = anonymizer.anonymize(text=simple_dict_results[0].value, 
                                analyzer_results= simple_dict_results[0].recognizer_results)
print(one_resp.text)


# All results: anonymize every entry; non-string values are returned unchanged

resp = anonymizer.anonymize_dict(simple_dict_results)
print(resp)

My name is <PERSON>
{'a': 'My name is <PERSON>', 'b': 'I live in <LOCATION>', 'c': 5, 'd': None}


## Option 2: dictionary values may be strings or lists of strings

In [27]:
class DictAnalyzerEngine(AnalyzerEngine):
    """Analyzer for dictionaries whose values are strings or iterables of strings."""

    def analyze_list(
        self, list_of_texts: Iterable[str], key: str = None, **kwargs
    ) -> List[List[RecognizerResult]]:
        """Analyze every item of an iterable of texts.

        :param list_of_texts: Items to analyze. Items that are not strings are
            not analyzed and get an empty result list, so output positions
            stay aligned with input positions.
        :param key: Dictionary key the items belong to. Accepted for interface
            compatibility; this method does not use it.
        :param kwargs: Forwarded unchanged to ``self.analyze``
            (for example ``language``).
        :return: One list of recognizer results per input item, in input order.
        """
        return [
            self.analyze(text=text, **kwargs) if isinstance(text, str) else []
            for text in list_of_texts
        ]

    def analyze_dict(
        self, input_dict: Dict[str, Union[str, Iterable[str]]], **kwargs
    ) -> Iterator[DictAnalyzerResult]:
        """Lazily analyze every entry of ``input_dict``.

        :param input_dict: Mapping of keys to a string, an iterable of strings,
            or any other value (which is passed through without analysis).
        :param kwargs: Forwarded unchanged to ``self.analyze``.
        :return: Yields one ``DictAnalyzerResult`` per key, in the
            dictionary's order. ``recognizer_results`` is a flat list for a
            string value, a list of lists for an iterable value, and an empty
            list otherwise.
        """
        for key, value in input_dict.items():
            # A one-shot iterator would be exhausted by the analysis below and
            # then stored empty in the result, leaving nothing to anonymize.
            # Materialize it so the same items can be read again later.
            if isinstance(value, collections.abc.Iterator):
                value = list(value)

            # Dispatch on type rather than truthiness: `not value` raises for
            # array-like values whose truth value is ambiguous.
            if isinstance(value, str):
                # Empty strings are skipped rather than sent to the analyzer.
                results = self.analyze(text=value, **kwargs) if value else []
            elif isinstance(value, collections.abc.Iterable):
                # `collections.Iterable` no longer exists on Python 3.10+;
                # the ABC lives in `collections.abc`.
                results = self.analyze_list(list_of_texts=value, key=key, **kwargs)
            else:
                results = []

            yield DictAnalyzerResult(key=key, value=value, recognizer_results=results)


analyzer = DictAnalyzerEngine()

In [28]:
class DictAnonymizerEngine(AnonymizerEngine):
    """Anonymizer that rebuilds a dictionary from ``DictAnalyzerResult`` items."""

    def anonymize_list(
        self,
        texts: List[str],
        recognizer_results_list: List[List[RecognizerResult]],
        **kwargs
    ) -> List[object]:
        """Anonymize a sequence of texts using their per-item findings.

        :param texts: Items to anonymize. Items that are not strings are
            copied to the output unchanged.
        :param recognizer_results_list: Recognizer results for each item, in
            the same order as ``texts``. The two inputs are paired with
            ``zip``, so iteration stops at the shorter of the two.
        :param kwargs: Forwarded unchanged to ``self.anonymize``.
        :return: A list holding the anonymized text (``str``) for each string
            item and the original object for every other item.
        """
        return [
            self.anonymize(text=text, analyzer_results=recognizer_results, **kwargs).text
            if isinstance(text, str)
            else text
            for text, recognizer_results in zip(texts, recognizer_results_list)
        ]

    def anonymize_dict(
        self, analyzer_results: Iterable[DictAnalyzerResult], **kwargs
    ) -> Dict[str, object]:
        """Anonymize each analyzed entry and return the rebuilt dictionary.

        :param analyzer_results: ``DictAnalyzerResult`` items, either a
            generator or an already materialized list. Iterated once.
        :param kwargs: Forwarded unchanged to ``self.anonymize``.
        :return: A dictionary with the same keys as the analyzed input.
            String values are replaced by their anonymized text, iterable
            values by a list produced by ``anonymize_list``, and all other
            values are copied unchanged.
        """
        return_dict = {}
        for result in analyzer_results:
            if isinstance(result.value, str):
                resp = self.anonymize(
                    text=result.value,
                    analyzer_results=result.recognizer_results,
                    **kwargs
                )
                return_dict[result.key] = resp.text
            elif isinstance(result.value, collections.abc.Iterable):
                # `collections.Iterable` no longer exists on Python 3.10+;
                # the ABC lives in `collections.abc`.
                return_dict[result.key] = self.anonymize_list(
                    texts=result.value,
                    recognizer_results_list=result.recognizer_results,
                    **kwargs
                )
            else:
                return_dict[result.key] = result.value
        return return_dict


anonymizer = DictAnonymizerEngine()

In [29]:
simple_dict = {"a":"My name is Robert", "b":"I live in Maine"}
# The generator returned by analyze_dict is passed straight to anonymize_dict,
# which iterates over it once.
simple_dict_results = analyzer.analyze_dict(simple_dict,language="en")
anonymizer_results = anonymizer.anonymize_dict(simple_dict_results)
print(anonymizer_results)



{'a': 'My name is <PERSON>', 'b': 'I live in <LOCATION>'}


JSON with lists


In [30]:
# Mixed input: a plain string, a list of strings, and a non-text value.
json_w_lists = {"a": "My name is Robert",
                "b": ["My phone is 054-3332111", "5"],
                "c": 5
}
json_w_lists_results = analyzer.analyze_dict(json_w_lists,language="en")
# Materialized because the results are printed and then passed to anonymize_dict.
json_w_lists_results = list(json_w_lists_results)
print(json_w_lists_results)
anonymizer.anonymize_dict(json_w_lists_results)



[DictAnalyzerResult(key='a', value='My name is Robert', recognizer_results=[type: PERSON, start: 11, end: 17, score: 0.85]), DictAnalyzerResult(key='b', value=['My phone is 054-3332111', '5'], recognizer_results=[[type: PHONE_NUMBER, start: 12, end: 23, score: 0.85, type: US_DRIVER_LICENSE, start: 16, end: 23, score: 0.01], []]), DictAnalyzerResult(key='c', value=5, recognizer_results=[])]


{'a': 'My name is <PERSON>', 'b': ['My phone is <PHONE_NUMBER>', '5'], 'c': 5}

Test on pandas data frames

In [31]:
# Read the iris sample CSV from GitHub (requires network access).
iris = pd.read_csv('https://raw.githubusercontent.com/mwaskom/seaborn-data/master/iris.csv')
dataset_len = len(iris)

# Add a text column with the same sentence in every row, so the frame
# contains something for the analyzer to detect.
new_col = ["My name is Morris" for _ in range(dataset_len)]
iris["new_col"] = new_col

iris.head()

    

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,new_col
0,5.1,3.5,1.4,0.2,setosa,My name is Morris
1,4.9,3.0,1.4,0.2,setosa,My name is Morris
2,4.7,3.2,1.3,0.2,setosa,My name is Morris
3,4.6,3.1,1.5,0.2,setosa,My name is Morris
4,5.0,3.6,1.4,0.2,setosa,My name is Morris


In [32]:
#Option 1: pd.to_dict with orient="list" -> {column name: list of that column's values}
iris_dict = iris.to_dict(orient="list")

# Each column reaches the engines as one list value; the anonymized dict is
# then turned back into a DataFrame.
analyzer_results = analyzer.analyze_dict(iris_dict,language="en")
anonymized_results = anonymizer.anonymize_dict(analyzer_results)
new_iris = pd.DataFrame(anonymized_results)
new_iris















































Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,new_col
0,5.1,3.5,1.4,0.2,setosa,My name is <PERSON>
1,4.9,3.0,1.4,0.2,setosa,My name is <PERSON>
2,4.7,3.2,1.3,0.2,setosa,My name is <PERSON>
3,4.6,3.1,1.5,0.2,setosa,My name is <PERSON>
4,5.0,3.6,1.4,0.2,setosa,My name is <PERSON>
...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica,My name is <PERSON>
146,6.3,2.5,5.0,1.9,virginica,My name is <PERSON>
147,6.5,3.0,5.2,2.0,virginica,My name is <PERSON>
148,6.2,3.4,5.4,2.3,virginica,My name is <PERSON>


In [34]:
# Option 2: pd.to_dict with orient="records" -> one {column name: value} dict per row.
# Note that iris_dict is a list of row dicts here, not a dict of columns.
iris_dict = iris.to_dict(orient="records")

# Analyze and anonymize each row separately, then rebuild the frame from the
# collected row dicts in a single DataFrame construction.
anonymized = []
for record in iris_dict:
    analyzer_results = analyzer.analyze_dict(record, language="en")
    anonymized_results = anonymizer.anonymize_dict(analyzer_results)
    anonymized.append(anonymized_results)
pd.DataFrame(anonymized)
















































Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,new_col
0,5.1,3.5,1.4,0.2,setosa,My name is <PERSON>
1,4.9,3.0,1.4,0.2,setosa,My name is <PERSON>
2,4.7,3.2,1.3,0.2,setosa,My name is <PERSON>
3,4.6,3.1,1.5,0.2,setosa,My name is <PERSON>
4,5.0,3.6,1.4,0.2,setosa,My name is <PERSON>
...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica,My name is <PERSON>
146,6.3,2.5,5.0,1.9,virginica,My name is <PERSON>
147,6.5,3.0,5.2,2.0,virginica,My name is <PERSON>
148,6.2,3.4,5.4,2.3,virginica,My name is <PERSON>
