In [1]:
from revised_data_generator.main import generate

import datetime
import pandas as pd
import pickle

# Expanding Enron with data Presidio data generator

In [3]:
def extract_address_templates(row) -> str:
    """
    get a row and extract the template for of the label for it
    i.e. 
    a third right onto Vernon Blvd
    turns to:
    a third right onto [ADDRESS]
    """
    text = row.data
    #remove any square brackets since they interfere with the process
    text = text.replace("[","").replace("]","")

    template = ""
    labels = row.label
    index = 0
    for label in labels:
        start = label[0]
        end = label[1]
        template += text[index:start] + "{ADDRESS}" 
        index = end
    template += text[index:]
    return template

f = open("../../enron.pickle", 'rb')
df = pickle.load(f)

# 1. get the rows that have at least one address in them
df_with_address = df[df.label.apply(len) != 0]
df_with_address

# 2. make a list of all the Addresses
addresses = df_with_address['values'].sum()
print("Addresses len:", len(addresses))
address_df = pd.DataFrame(addresses, columns=['ADDRESS'])

# 3. create template sentneces
df_with_address["template"] = df_with_address.apply(extract_address_templates, axis = 1)
templates = df_with_address["template"].to_list()
print("templates len:", len(templates))

# 4. let's use presidio data generator
EXAMPLES = 1000
SPAN_TO_TAG = True 
KEEP_ONLY_TAGGED = False
LOWER_CASE_RATIO = 0.1

cur_time = datetime.date.today().strftime("%B_%d_%Y")
OUTPUT = "../../presidio-research/data/generated_address_size_{}_date_{}.json".format(EXAMPLES, cur_time)


dictionary_path = None

examples = generate(fake_pii_csv = address_df,
                        utterances_file=templates,
                        dictionary_path=dictionary_path,
                        output_file=OUTPUT,
                        lower_case_ratio=LOWER_CASE_RATIO,
                        num_of_examples=EXAMPLES,
                        # ignore_types=IGNORE_TYPES,
                        include_metadata = True,
                        if_prep_templates = False, 
                        keep_only_tagged=KEEP_ONLY_TAGGED,
                        span_to_tag=SPAN_TO_TAG)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_with_address["template"] = df_with_address.apply(extract_address_templates, axis = 1)
  1%|          | 11/1000 [00:00<00:09, 103.23it/s]

Addresses len: 397
templates len: 309
DATE is taken from the BIRTHDAY column which is missing
Generating address parts
Generating roles
Generating titles
Cannot generate title without a GENDER column. Generating FEMALE_TITLE and MALE_TITLE
Generating nationalities
Generating IBANs
Generating IP addresses
Generating SSN numbers
Generating US driver license numbers
Generating URLs
Cannot generate url without a domain name
Generating company names
Finished preparing fake PII data


100%|██████████| 1000/1000 [00:10<00:00, 92.53it/s]


generated 1000 examples
Finished creating generated dataset. File location:../../presidio-research/data/generated_address_size_1000_date_November_07_2021.json
