In this example we load the dataset https://huggingface.co/datasets/ai4privacy/pii-masking-200k?row=0 and then modify the data to match the Presidio InputSample structure through the following steps:

1 - keeping only the English-language data

2 - renaming the columns that correspond to the "full text", "masked" and "spans" fields in Presidio

3 - removing all other columns

4 - splitting the data into train and test sets. *Note that Hugging Face datasets are generally already split into train and test sets (and in some cases a validation set), but in this specific dataset everything is in a single train split, so we need to split it ourselves.

5 - changing the key names of the spans dicts to be compatible with some of Presidio's functions

6 - saving the training and testing data as new JSON files



In [1]:
#!pip install datasets
from datasets import load_dataset

# Load every JSONL shard of the ai4privacy PII-masking dataset.
dataset = load_dataset("ai4privacy/pii-masking-200k", data_files=["*.jsonl"])

# Keep only the rows whose 'language' column is English.
dataset = dataset.filter(lambda row: row["language"] == "en")

# Rename the text / masked-text / span columns to the names used for the
# Presidio InputSample structure.
for old_name, new_name in (
    ("source_text", "full_text"),
    ("target_text", "masked"),
    ("privacy_mask", "spans"),
):
    dataset = dataset.rename_column(old_name, new_name)


In [2]:
# Number of rows in the 'train' split after the English-only filter.
train_length = len(dataset["train"])
print(f"data length :  {train_length}")

data length :  43501


In [3]:
import ast
from collections import Counter

# 'span_labels' holds a list serialised as a string; literal_eval parses it
# back into Python objects without executing arbitrary code.
entities = [ast.literal_eval(row["span_labels"]) for row in dataset["train"]]

# Tally the label stored at index 2 of every span, skipping the "O" label.
count_per_entity = Counter(
    span[2]
    for row_spans in entities
    for span in row_spans
    if span[2] != "O"
)
print(f"entities in the original data : {count_per_entity}")
print(f"Number of different entites : {len(count_per_entity)}")

entities in the original data : Counter({'FIRSTNAME': 13323, 'LASTNAME': 4817, 'DATE': 4555, 'EMAIL': 4092, 'PREFIX': 3446, 'AMOUNT': 3103, 'USERNAME': 3056, 'JOBTITLE': 2954, 'URL': 2910, 'TIME': 2883, 'JOBAREA': 2882, 'MIDDLENAME': 2881, 'ACCOUNTNUMBER': 2831, 'STREET': 2799, 'ACCOUNTNAME': 2793, 'CITY': 2787, 'COUNTY': 2745, 'STATE': 2735, 'IPV4': 2720, 'BUILDINGNUMBER': 2714, 'JOBTYPE': 2710, 'CURRENCYSYMBOL': 2678, 'PASSWORD': 2622, 'SEX': 2606, 'CREDITCARDNUMBER': 2574, 'COMPANYNAME': 2502, 'PHONENUMBER': 2425, 'BITCOINADDRESS': 2403, 'IPV6': 2401, 'GENDER': 2385, 'AGE': 2370, 'DOB': 2354, 'ZIPCODE': 2322, 'SECONDARYADDRESS': 2307, 'MASKEDNUMBER': 2131, 'USERAGENT': 2084, 'SSN': 2045, 'IP': 1985, 'IBAN': 1973, 'CURRENCY': 1908, 'PHONEIMEI': 1875, 'NEARBYGPSCOORDINATE': 1651, 'ETHEREUMADDRESS': 1624, 'CREDITCARDISSUER': 1510, 'MAC': 1114, 'ORDINALDIRECTION': 1109, 'VEHICLEVRM': 950, 'CREDITCARDCVV': 869, 'EYECOLOR': 868, 'LITECOINADDRESS': 850, 'VEHICLEVIN': 849, 'HEIGHT': 837, 'C

In [4]:
# Drop the columns that are not needed for the Presidio-style samples.
dataset = dataset.remove_columns(['span_labels', 'mbert_text_tokens', 'mbert_bio_labels', 'id', 'language', 'set'])

# The whole dataset lives in the 'train' split, so hold out 20% of it as a test set.
# A fixed seed makes the shuffle deterministic, so re-running the notebook
# produces the same train/test partition (and the same saved JSON files).
dataset_split = dataset['train'].train_test_split(test_size=0.2, seed=42)
train_data = dataset_split['train']
test_data = dataset_split['test']


In [5]:
import json

# Turn both splits into plain lists of dicts so they can be edited in place
# and serialised with the json module.
train_data = [dict(row) for row in train_data]
test_data = [dict(row) for row in test_data]

# (old key, new key) pairs for every span dict. The order is kept so the
# renamed keys are re-inserted in the same sequence for every span.
span_key_renames = (
    ("label", "entity_type"),
    ("value", "entity_value"),
    ("start", "start_position"),
    ("end", "end_position"),
)

# Rename the span keys in the training rows first, then in the test rows.
for split_rows in (train_data, test_data):
    for item in split_rows:
        for span in item["spans"]:
            for old_key, new_key in span_key_renames:
                span[new_key] = span.pop(old_key)

# Save the training and testing rows as JSON files.
for file_name, split_rows in (
    ("train_data.json", train_data),
    ("test_data.json", test_data),
):
    with open(file_name, "w", encoding="utf-8") as f:
        json.dump(split_rows, f, ensure_ascii=False, indent=4)

In [6]:
# Report how many rows ended up in each split.
for split_name, split_rows in (("train", train_data), ("test", test_data)):
    print(f"{split_name} data length: {len(split_rows)}")


train data length: 34800
test data length: 8701
