In [None]:
%%capture
!pip install presidio_analyzer presidio_anonymizer

In [None]:
from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine, RecognizerResult
# from presidio_anonymizer import AnonymizerEngine

# Simple example

In [3]:
text="My phone number is 212-555-5555"

In [4]:
analyzer = AnalyzerEngine()

In [5]:
results = analyzer.analyze(text=text,
                           entities=["PHONE_NUMBER"],
                           language='en')
print(results)

[type: PHONE_NUMBER, start: 19, end: 31, score: 0.75]


# Recognize column types in csv

In [6]:
data_file_path = "../data/orders.csv"
import pandas as pd
df = pd.read_csv(data_file_path, nrows=100)
df.head()

Unnamed: 0,order_number,user_id,email,street_address,city,state,num_items,total_price,timestamp
0,754,roy50,margaretwalker@example.com,8222 Jared Plains Suite 489,Millerfurt,Delaware,4,162,2022-07-31T13:53:42.176704
1,371,handerson,james63@example.org,39972 Isabella Haven Suite 670,Farrellport,Oklahoma,3,42,2022-08-03T17:54:12.176704
2,376,william55,sotopatricia@example.net,4418 Raymond Club,West Andreafort,Illinois,8,198,2022-08-05T18:54:45.176704
3,431,uking,cponce@example.com,369 Angela Green Suite 649,Whitemouth,Tennessee,6,90,2022-08-06T20:55:01.176704
4,713,tiffany94,kingelizabeth@example.net,3030 Kerr Streets Apt. 695,Hamptonhaven,Indiana,5,62,2022-08-09T22:55:50.176704


## Experiment 1: Per-column recognition

In [7]:
import csv

def csv_to_dict(file_path, rows=100):
    data_dict = {}
    with open(file_path, 'r') as file:
        reader = csv.DictReader(file)
        count = 0
        for row in reader:
            if count >= rows:
                break
            for key, value in row.items():
                if key not in data_dict:
                    data_dict[key] = []
                data_dict[key].append(value)
            count += 1
    return data_dict

file_path = data_file_path
result = csv_to_dict(file_path)

In [8]:
analyzer = AnalyzerEngine()
batch_analyzer = BatchAnalyzerEngine(analyzer_engine=analyzer)

In [9]:
analyzer_results = batch_analyzer.analyze_dict(result, language="en")
analyzer_results = list(analyzer_results)

In [11]:
result_data = {}

for entry in analyzer_results:
    result_data[entry.key]={}
    type_counts = {}
    total_entries = len(entry.recognizer_results)

    for element in entry.recognizer_results:
        if len(element)!=0:
            type_value = element[0].entity_type
            type_counts[type_value] = type_counts.get(type_value, 0) + 1
        else:
            type_counts['NONE'] = type_counts.get("NONE", 0) + 1

    percentage_per_type = {type_value: (count / total_entries) * 100 for type_value, count in type_counts.items()}

    for type_value, percentage in percentage_per_type.items():
        result_data[entry.key][type_value]=percentage

In [18]:
from tabulate import tabulate

def generate_table(data):
    # Extract unique categories
    categories = set(category for values in data.values() for category in values.keys())
    
    # Initialize table with headers
    table = [[''] + list(categories)]
    
    # Populate table rows
    for key, value in data.items():
        row = [key]
        for category in categories:
            percentage = value.get(category, 0.0)
            row.append(f"{percentage:.0f}%")
        table.append(row)
    
    return table

table = generate_table(result_data)
print(tabulate(table, headers="firstrow",tablefmt="heavy_outline"))


┏━━━━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━┳━━━━━━━━━━┓
┃                ┃ NONE   ┃ EMAIL_ADDRESS   ┃ DATE_TIME   ┃ LOCATION   ┃ NRP   ┃ PERSON   ┃
┣━━━━━━━━━━━━━━━━╋━━━━━━━━╋━━━━━━━━━━━━━━━━━╋━━━━━━━━━━━━━╋━━━━━━━━━━━━╋━━━━━━━╋━━━━━━━━━━┫
┃ order_number   ┃ 100%   ┃ 0%              ┃ 0%          ┃ 0%         ┃ 0%    ┃ 0%       ┃
┃ user_id        ┃ 63%    ┃ 0%              ┃ 1%          ┃ 11%        ┃ 0%    ┃ 25%      ┃
┃ email          ┃ 0%     ┃ 100%            ┃ 0%          ┃ 0%         ┃ 0%    ┃ 0%       ┃
┃ street_address ┃ 48%    ┃ 0%              ┃ 3%          ┃ 6%         ┃ 1%    ┃ 42%      ┃
┃ city           ┃ 47%    ┃ 0%              ┃ 0%          ┃ 26%        ┃ 9%    ┃ 18%      ┃
┃ state          ┃ 0%     ┃ 0%              ┃ 0%          ┃ 100%       ┃ 0%    ┃ 0%       ┃
┃ num_items      ┃ 100%   ┃ 0%              ┃ 0%          ┃ 0%         ┃ 0%    ┃ 0%       ┃
┃ total_price    ┃ 100%   ┃ 0%              ┃ 0%          ┃ 0%         ┃ 0%    ┃