### Fake PII data: Exploratory data analysis

This notebook is used to inspect the different fake entities before and after the creation of a synthetic (augmented) dataset. The first part looks at the generation details and statistics of the fake PII bank; the second part evaluates the synthetic dataset after it has been generated.

In [None]:
import pandas as pd

from presidio_evaluator.data_generator.extensions import generate_iban, generate_ip_addresses, generate_SSNs, \
    generate_company_names, generate_url, generate_roles, generate_titles, generate_nationality, generate_nation_man, \
    generate_nation_woman, generate_nation_plural, generate_title

from presidio_evaluator.data_generator import FakeDataGenerator, read_synth_dataset

from collections import Counter

import matplotlib.pyplot as plt
%matplotlib inline

1. Evaluate generation logic and the fake PII bank used during generation

In [None]:
# Load the FakeNameGenerator.com CSV that serves as the raw fake PII bank.
df = pd.read_csv(
    "../presidio_evaluator/data_generator/raw_data/FakeNameGenerator.com_3000.csv",
    encoding="utf-8",
)

In [None]:
# Build the generator around the raw PII frame only: no templates and no
# dictionary are passed, and four entity types are listed in ignore_types.
generator = FakeDataGenerator(
    fake_pii_df=df,
    templates=None,
    dictionary_path=None,
    ignore_types={"IP_ADDRESS", "US_SSN", "URL", "ADDRESS"},
)

In [None]:
# Run the generator's PII preparation step on the raw frame. The result is
# treated as a DataFrame by the cells that follow (iterated per column and
# accessed by column name).
pii_df = generator.prep_fake_pii(df)

In [None]:
# Per-column summary of the prepared fake PII: the column name, the number of
# distinct values, and how often each value occurs.
# DataFrame.iteritems() was deprecated in pandas 1.5 and removed in pandas 2.0;
# DataFrame.items() yields the same (column name, Series) pairs and is also
# available in older pandas versions.
for name, series in pii_df.items():
    print(name)
    print("Unique values: {}".format(len(series.unique())))
    print(series.value_counts())
    print("\n**************\n")

In [None]:
#!pip install wordcloud
from wordcloud import WordCloud

def series_to_wordcloud(series):
    """Plot a word cloud built from the value counts of ``series``.

    :param series: pandas Series of values to visualise; ``series.name``
        is used in the figure title.
    :return: None. The figure is created on the current matplotlib backend.
    """
    value_counts = series.value_counts()
    cloud_builder = WordCloud(background_color='white', width=800, height=400)
    cloud = cloud_builder.generate_from_frequencies(value_counts)

    # One figure with a single axes; the cloud is rendered as an image on it.
    fig, ax = plt.subplots(figsize=(16, 8))
    fig.suptitle("{} word cloud".format(series.name))
    ax.imshow(cloud, interpolation='bilinear')
    ax.axis("off")

In [None]:
# Word cloud of the first names in the prepared PII frame.
series_to_wordcloud(pii_df["FIRST_NAME"])

In [None]:
# Word cloud of the last names in the prepared PII frame.
series_to_wordcloud(pii_df["LAST_NAME"])

In [None]:
# Word cloud of the COUNTRY column in the prepared PII frame.
series_to_wordcloud(pii_df["COUNTRY"])

In [None]:
# Word cloud of the ORGANIZATION column in the prepared PII frame.
series_to_wordcloud(pii_df["ORGANIZATION"])

In [None]:
# Word cloud of the CITY column in the prepared PII frame.
series_to_wordcloud(pii_df["CITY"])

2. Evaluate different entities in the synthetic dataset after creation

In [None]:
# Load a previously generated synthetic dataset from disk. The cells below
# iterate over it and read full_text, metadata and spans from every sample.
# NOTE(review): the file name is tied to one specific generation run -- point
# it at the file you want to inspect.
synth = read_synth_dataset("../data/generated_train_November 12 2019.json")

In [None]:
# Keep only the text and the metadata of each sample, as (full_text, metadata) pairs.
sentences_only = [
    (record.full_text, record.metadata)
    for record in synth
]

In [None]:
# Peek at a single (full_text, metadata) pair to see what a sample looks like.
sentences_only[2]

In [None]:
# Tally the 'Gender' metadata value across all samples.
print("Proportions of female vs. male based samples:")
Counter(metadata['Gender'] for _text, metadata in sentences_only)

In [None]:
# Tally the 'Lowercase' metadata value across all samples.
print("Proportion of lower case samples:")
Counter(metadata['Lowercase'] for _text, metadata in sentences_only)

In [None]:
# Tally the 'NameSet' metadata value across all samples.
print("Proportion of nameset across samples:")
Counter(metadata['NameSet'] for _text, metadata in sentences_only)

In [None]:
def get_entity_values_from_sample(sample,entity_types):
    """Collect the values of a sample's spans that have one of the given entity types.

    :param sample: object exposing a ``spans`` iterable; each span must have
        ``entity_type`` and ``entity_value`` attributes.
    :param entity_types: container of entity type names to keep.
    :return: list of ``entity_value`` for the matching spans, in span order.
    """
    matching_values = []
    for span in sample.spans:
        if span.entity_type in entity_types:
            matching_values.append(span.entity_value)
    return matching_values
    
# Flatten the person-related entity values of every sample into one list
# and visualise them.
names = [
    value
    for sample in synth
    for value in get_entity_values_from_sample(sample, ['PERSON', 'FIRST_NAME', 'LAST_NAME'])
]
series_to_wordcloud(pd.Series(names, name='PERSON, FIRST_NAME, LAST_NAME'))

In [None]:
# Flatten the LOCATION entity values of every sample into one list
# and visualise them.
countries = [
    value
    for sample in synth
    for value in get_entity_values_from_sample(sample, ['LOCATION'])
]
series_to_wordcloud(pd.Series(countries, name='LOCATION'))

In [None]:
# Flatten the ORGANIZATION entity values of every sample into one list
# and visualise them.
orgs = [
    value
    for sample in synth
    for value in get_entity_values_from_sample(sample, ['ORGANIZATION'])
]
series_to_wordcloud(pd.Series(orgs, name='ORGANIZATION'))