### Fake PII data: Exploratory data analysis

This notebook is used to verify the different fake entities before and after the creation of a synthetic (augmented) dataset. The first part looks at the generation details and statistics; the second part evaluates the synthetic dataset after it has been generated.

In [None]:
# Move one directory up so that the relative paths used in later cells
# ("presidio_evaluator/...", "data/...") are resolved from there.
%cd ..
# List the directory contents as a quick sanity check of the new location.
!ls

In [None]:
!pip install -q matplotlib
!pip install -q wordcloud

In [None]:
import pandas as pd

from presidio_evaluator import InputSample
from presidio_evaluator.data_generator import PresidioDataGenerator

from collections import Counter

import matplotlib.pyplot as plt

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

1. Evaluate generation logic and the fake PII bank used during generation

In [None]:
# Load the fake PII bank (FakeNameGenerator.com export) into a DataFrame.
# The path is relative to the current working directory.
pii_df = pd.read_csv(
    filepath_or_buffer=(
        "presidio_evaluator/data_generator/raw_data/FakeNameGenerator.com_3000.csv"
    ),
    encoding="utf-8",
)

In [None]:
# Pass the loaded frame through PresidioDataGenerator.update_fake_name_generator_df.
# NOTE(review): the return value is discarded, so this presumably updates
# pii_df in place — confirm against the PresidioDataGenerator implementation.
PresidioDataGenerator.update_fake_name_generator_df(pii_df)
# Preview the first rows of the frame.
pii_df.head()

In [None]:
# Per-column overview of the fake PII bank: the number of distinct values and
# how often each value occurs.
# DataFrame.iteritems() was deprecated in pandas 1.5 and removed in pandas 2.0;
# DataFrame.items() yields the same (column name, Series) pairs.
for name, series in pii_df.items():
    print(name)
    print("Unique values: {}".format(len(series.unique())))
    print(series.value_counts())
    print("\n**************\n")

In [None]:
from wordcloud import WordCloud


def series_to_wordcloud(series):
    """Draw a word cloud of the value frequencies in a pandas Series.

    Every distinct value in ``series`` becomes a word whose size reflects how
    often it occurs (as counted by ``series.value_counts()``). The cloud is
    drawn on a new 16x8 inch matplotlib figure titled
    ``"<series.name> word cloud"``.

    :param series: pandas Series of values to count; its ``name`` is used in
        the figure title.
    :return: None. The figure is left open so the notebook displays it.
    """
    # generate_from_frequencies is documented to take a word -> frequency
    # mapping, so hand it a plain dict rather than the Series itself.
    freqs = series.value_counts().to_dict()
    wordcloud = WordCloud(
        background_color="white", width=800, height=400
    ).generate_from_frequencies(freqs)

    # Use explicit figure/axes objects instead of the implicit pyplot state;
    # this also puts the previously unused figure handle to work.
    fig, ax = plt.subplots(figsize=(16, 8))
    fig.suptitle("{} word cloud".format(series.name))
    ax.imshow(wordcloud, interpolation="bilinear")
    ax.axis("off")

In [None]:
# Word cloud of the first names in the fake PII bank.
series_to_wordcloud(pii_df["first_name"])

In [None]:
# Word cloud of the last names in the fake PII bank.
series_to_wordcloud(pii_df["last_name"])

In [None]:
# Word cloud of the full country names in the fake PII bank.
# NOTE(review): assumes a "country_full" column is still present after
# update_fake_name_generator_df has been applied to pii_df — confirm.
series_to_wordcloud(pii_df["country_full"])

In [None]:
# Word cloud of the company names in the fake PII bank.
series_to_wordcloud(pii_df["company"])

In [None]:
# Word cloud of the city names in the fake PII bank.
series_to_wordcloud(pii_df["city"])

2. Evaluate different entities in the synthetic dataset after creation

In [None]:
# Load the previously generated synthetic dataset from JSON (path relative to
# the current working directory). The cells below iterate over `synth` and
# read each sample's `spans`.
synth = InputSample.read_dataset_json("data/synth_dataset_v2.json")

In [None]:
def get_entity_values_from_sample(sample, entity_types):
    """Return the values of the sample's spans that have one of the given types.

    :param sample: object exposing a ``spans`` iterable; each span provides
        ``entity_type`` and ``entity_value`` attributes.
    :param entity_types: container of entity type names to keep.
    :return: list of ``entity_value`` for the matching spans, in span order.
    """
    matching_values = []
    for span in sample.spans:
        if span.entity_type in entity_types:
            matching_values.append(span.entity_value)
    return matching_values


# Gather every person-related entity value across the synthetic dataset into
# one flat list, then plot the frequencies as a word cloud.
names = [
    value
    for sample in synth
    for value in get_entity_values_from_sample(
        sample, ["PERSON", "FIRST_NAME", "LAST_NAME"]
    )
]
series_to_wordcloud(pd.Series(names, name="PERSON, FIRST_NAME, LAST_NAME"))

In [None]:
# Gather every LOCATION entity value across the synthetic dataset into one
# flat list, then plot the frequencies as a word cloud.
countries = [
    value
    for sample in synth
    for value in get_entity_values_from_sample(sample, ["LOCATION"])
]
series_to_wordcloud(pd.Series(countries, name="LOCATION"))

In [None]:
# Gather every ORGANIZATION entity value across the synthetic dataset into one
# flat list, then plot the frequencies as a word cloud.
orgs = [
    value
    for sample in synth
    for value in get_entity_values_from_sample(sample, ["ORGANIZATION"])
]
series_to_wordcloud(pd.Series(orgs, name="ORGANIZATION"))