In [1]:
import json
import os

import random



In [2]:
# Input directory with the per-source training splits, and the directory name
# that matches the "finetune_data/" prefix used when the merged sets are saved.
train_path, finetune_path = "train_data_all", "finetune_data"

## Load data

In [3]:
# Load the pile-mistral training split. The file is decoded as UTF-8 explicitly
# so the result does not depend on the platform's default text encoding.
with open(os.path.join(train_path, "pile_mistral_v0.1_train.json"), "r", encoding="utf-8") as f:
    data_pile = json.load(f)

In [4]:
# Load the synthetic PII NER training split. The file is decoded as UTF-8
# explicitly so the result does not depend on the platform's default encoding.
with open(os.path.join(train_path, "urchadesynthetic-pii-ner-mistral-v1_train.json"), "r", encoding="utf-8") as f:
    data_pii = json.load(f)

In [21]:
# Load the "empty" anonymisation training samples. The file is decoded as UTF-8
# explicitly so the result does not depend on the platform's default encoding.
with open(os.path.join(train_path, "anon_train_empty.json"), "r", encoding="utf-8") as f:
    anon_data_train_empty = json.load(f)

In [19]:
# Load the "non-empty" anonymisation training samples. The file is decoded as
# UTF-8 explicitly so the result does not depend on the platform's default encoding.
with open(os.path.join(train_path, "anon_train_non_empty.json"), "r", encoding="utf-8") as f:
    anon_data_train_non_empty = json.load(f)

In [20]:
# Load the citation training samples. The file is decoded as UTF-8 explicitly
# so the result does not depend on the platform's default text encoding.
with open(os.path.join(train_path, "citation_train.json"), "r", encoding="utf-8") as f:
    citation_data = json.load(f)

In [6]:
# Load the pilener training samples. The file is decoded as UTF-8 explicitly
# so the result does not depend on the platform's default text encoding.
with open(os.path.join(train_path, "pilener_train.json"), "r", encoding="utf-8") as f:
    pilener = json.load(f)

In [None]:
# Report the number of loaded anonymisation samples, one line per subset.
print(
    f"size anon train empty :     {len(anon_data_train_empty)}",
    f"size anon train n-empty :   {len(anon_data_train_non_empty)}",
    sep="\n",
)

In [8]:
# Report the number of samples in each of the remaining loaded datasets.
print(
    f"size pile-mistral :         {len(data_pile)}",
    f"size pii :                  {len(data_pii)}",
    f"size citation train data :  {len(citation_data)}",
    f"size pilener train data :   {len(pilener)}",
    sep="\n",
)



size pile-mistral :         15778
size pii :                  15707
size citation train data :  87760
size pilener train data :   36711


# Dataset mix 1

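- Data_pile-mistral: full dataset ≈ 17.5K samples; the 90% training split (15,778 samples) is used here, the remaining 10% is held out for testing.
- Data_pii: full dataset ≈ 17.5K samples; the 90% training split (15,707 samples) is used here, the remaining 10% is held out for testing.
- Total pretrain data: ≈ 35K samples overall, i.e. ≈ 31.5K in the training splits that go into this mix.
- anon_data_train_non_empty: randomly sample 10K
- anon_data_train_empty: randomly sample 10K
- citation_data: randomly sample 20K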


In [9]:
random.seed(42)

# Mix 1: both pretrain sets in full, followed by seeded random draws.
# The draws are listed in the order they must be evaluated (non-empty anon,
# empty anon, citation) so the seeded random stream is consumed identically.
merge_v1 = [
    *data_pile,
    *data_pii,
    *random.sample(anon_data_train_non_empty, 10_000),
    *random.sample(anon_data_train_empty, 10_000),
    *random.sample(citation_data, 20_000),
]

In [10]:
# Save the merge_v1 dataset for fine-tuning.
# The output directory is taken from `finetune_path` rather than a second
# hard-coded literal, and is created if missing so the cell also works on a
# fresh checkout.
os.makedirs(finetune_path, exist_ok=True)
with open(os.path.join(finetune_path, "merged_v1.json"), "w", encoding="utf-8") as f:
    json.dump(merge_v1, f)

# Dataset mix 2

Same as mix 1, but without the `anon_data_train_empty` samples, in order to increase recall. Note that the citation sample is 10K here instead of 20K.

In [11]:
random.seed(42)

# Mix 2: both pretrain sets in full plus 10K non-empty anon samples and
# 10K citation samples (no empty anon samples).
merge_v2 = data_pile + data_pii + random.sample(anon_data_train_non_empty,10**4) + random.sample(citation_data,10**4)

# Save the merge_v2 dataset for fine-tuning.
# The output directory is taken from `finetune_path` and created if missing.
os.makedirs(finetune_path, exist_ok=True)
with open(os.path.join(finetune_path, "merged_v2.json"), "w", encoding="utf-8") as f:
    json.dump(merge_v2, f)

## anon_cit_v1

25K anon (non-empty) + 25K citation samples

In [12]:
# Seed locally, like the other sampling cells, so the draw does not depend on
# how many random numbers earlier cells happened to consume.
random.seed(42)
anon_cit_v1 = random.sample(anon_data_train_non_empty,25000) + random.sample(citation_data,25000)

# Save the anon_cit_v1 dataset; the output directory is taken from
# `finetune_path` and created if missing.
os.makedirs(finetune_path, exist_ok=True)
with open(os.path.join(finetune_path, "anon_cit_v1.json"), "w", encoding="utf-8") as f:
    json.dump(anon_cit_v1, f)

## create negatives tag

In [19]:
def add_negatives(sample: dict, all_labels = ['person','organization','location','citation','law']) -> dict:
    """Return a copy of ``sample`` extended with a shuffled ``'negatives'`` list.

    The negatives are the entries of ``all_labels`` that do not occur as a
    label (third element of each ``sample['ner']`` entry) in the sample.

    Args:
        sample: Sample dict with an ``'ner'`` list whose entries carry the
            label at index 2.
        all_labels: Candidate label names; duplicates are collapsed.

    Returns:
        A shallow copy of ``sample`` with an added ``'negatives'`` key. The
        input dict is left untouched, so the source datasets can still be used
        unchanged for other mixes afterwards.
    """
    present_labels = {entry[2] for entry in sample['ner']}

    # Build the candidates in the order given by `all_labels` (not from set
    # iteration, whose order varies between interpreter runs) so that a fixed
    # random seed yields the same shuffle every time.
    negatives = [label for label in dict.fromkeys(all_labels) if label not in present_labels]

    random.shuffle(negatives)

    return {**sample, 'negatives': negatives}





In [42]:
# Attach the negatives list to every citation sample.
citation_negatives = list(map(add_negatives, citation_data))

In [45]:
# Attach the negatives list to every non-empty anonymisation sample.
anon_ne_negatives = list(map(add_negatives, anon_data_train_non_empty))

In [46]:
# Attach the negatives list to every empty anonymisation sample.
anon_empty_negatives = list(map(add_negatives, anon_data_train_empty))

## anon_cit_v2

In [50]:
# Size check: citation_negatives holds one entry per element of citation_data.
len(citation_negatives)

87760

In [51]:
# Size check: anon_ne_negatives holds one entry per element of
# anon_data_train_non_empty.
len(anon_ne_negatives)

303458

In [52]:
random.seed(42)
# All citation samples (with negatives) followed by a seeded draw of 80,000
# non-empty anonymisation samples (with negatives).
anon_cit_v2 = [*citation_negatives, *random.sample(anon_ne_negatives, 80_000)]

In [None]:
# Save the anon_cit_v2 dataset for fine-tuning.
# Fixes the stray ")" after the filename that made this cell a SyntaxError.
# The output directory is taken from `finetune_path` and created if missing.
os.makedirs(finetune_path, exist_ok=True)
with open(os.path.join(finetune_path, "anon_cit_v2.json"), "w", encoding="utf-8") as f:
    json.dump(anon_cit_v2, f)

## merged_v3

In [9]:
from utils import get_label_dict

In [10]:
def exclude_samples_by_labels(sample,labels_to_exclude : list) -> bool:
    """Tell whether ``sample`` is free of every label in ``labels_to_exclude``.

    Returns True when none of the labels (third element of each
    ``sample['ner']`` entry) is in ``labels_to_exclude``, i.e. the sample
    should be kept; returns False as soon as at least one label is shared.
    """
    found_labels = {entry[2] for entry in sample['ner']}
    banned_labels = set(labels_to_exclude)

    return banned_labels.isdisjoint(found_labels)

In [11]:
# Inspect the label distribution of pilener. Judging by the output, the second
# argument limits how many of the most frequent labels are listed -- TODO
# confirm against utils.get_label_dict.
get_label_dict(pilener,100)

[('concept', 34960),
 ('Person', 32292),
 ('person', 31014),
 ('Organization', 30791),
 ('Location', 25839),
 ('organization', 25327),
 ('product', 22886),
 ('location', 22713),
 ('variable', 16950),
 ('Concept', 12294),
 ('object', 12227),
 ('Product', 10055),
 ('technology', 9111),
 ('Date', 8568),
 ('Medical Condition', 7960),
 ('chemical', 7742),
 ('number', 7715),
 ('software', 7595),
 ('medical condition', 7528),
 ('disease', 6323),
 ('date', 5983),
 ('attribute', 5977),
 ('Other', 5405),
 ('entity type', 5285),
 ('group', 5174),
 ('Technology', 5096),
 ('measurement', 5048),
 ('function', 4906),
 ('class', 4772),
 ('event', 4612),
 ('material', 4448),
 ('protein', 4411),
 ('Event', 3883),
 ('process', 3711),
 ('Nationality', 3209),
 ('condition', 3178),
 ('Country', 3038),
 ('substance', 3031),
 ('type', 3027),
 ('animal', 2958),
 ('method', 2755),
 ('activity', 2677),
 ('property', 2648),
 ('component', 2607),
 ('food', 2570),
 ('company', 2560),
 ('title', 2556),
 ('drug', 240

In [12]:
# Labels whose presence removes a sample from the filtered datasets:
# spelling/case variants of person, organization and location, plus the
# catch-all labels.
labels_to_exclude = [
    "person", "Person",
    "organization", "Organization",
    "location", "Location",
    "Other", "other",
    "misc", "Miscellaneous", "miscellaneous",
    "entity type",
    "PERSON", "LOCATION", "ORGANIZATION",
    "PER", "ORG", "LOC",
]

In [13]:
# Keep only the pilener samples that carry none of the excluded labels, then
# show the size before and after filtering.
pilener_filtered = list(
    filter(lambda item: exclude_samples_by_labels(item, labels_to_exclude), pilener)
)
print(len(pilener), len(pilener_filtered), sep="\n")

36711
13803


In [14]:
# Inspect the label distribution that remains after filtering (the second
# argument presumably limits the listing to the 10 most frequent labels --
# TODO confirm against utils.get_label_dict).
get_label_dict(pilener_filtered,10)

[('concept', 15732),
 ('variable', 15213),
 ('product', 8777),
 ('number', 7070),
 ('object', 6583),
 ('chemical', 6405),
 ('technology', 5534),
 ('software', 5440),
 ('function', 4078),
 ('medical condition', 3913)]

In [15]:
# Inspect the full label distribution of the PII dataset (called without a
# limit, the helper displays a Counter here).
get_label_dict(data_pii)

Counter({'person': 26203,
         'organization': 9046,
         'phone number': 6950,
         'address': 6643,
         'passport number': 6553,
         'email': 5201,
         'credit card number': 4340,
         'location': 4098,
         'social security number': 4093,
         'date': 3604,
         'birth date': 3260,
         'health insurance ID number': 2691,
         'date of birth': 2460,
         'city': 1884,
         'mobile phone number': 1526,
         'medication': 1396,
         'bank account number': 1382,
         'CPF': 1014,
         'birthdate': 907,
         'company registration number': 850,
         "driver's license number": 841,
         'tax identification number': 747,
         'health insurance number': 737,
         'health insurance id number': 662,
         'currency': 591,
         'medical condition': 570,
         'identity card number': 506,
         'age': 470,
         'personal identification number': 450,
         'country': 445,
         '

In [16]:
# Apply the same label filter to the PII dataset and show the size before and
# after filtering.
data_pii_filtered = list(
    filter(lambda item: exclude_samples_by_labels(item, labels_to_exclude), data_pii)
)
print(len(data_pii), len(data_pii_filtered), sep="\n")  # not enough to proceed

15707
113


In [17]:
# Apply the same label filter to the pile-mistral dataset and show the size
# before and after filtering.
data_pile_filtered = list(
    filter(lambda item: exclude_samples_by_labels(item, labels_to_exclude), data_pile)
)
print(len(data_pile), len(data_pile_filtered), sep="\n")

15778
7822


In [22]:
random.seed(42)
# Mix 3: all citation samples, a seeded draw of 40,000 non-empty anon samples,
# the unfiltered PII set (its filtered variant is too small to use), and the
# label-filtered pilener and pile-mistral sets.
merged_v3 = [
    *citation_data,
    *random.sample(anon_data_train_non_empty, 40_000),
    *data_pii,
    *pilener_filtered,
    *data_pile_filtered,
]
print(len(merged_v3))

165092


In [23]:
# Save the merged_v3 dataset for fine-tuning.
# The original single-argument os.path.join was a no-op around a hard-coded
# path; the directory now comes from `finetune_path` and is created if missing.
os.makedirs(finetune_path, exist_ok=True)
with open(os.path.join(finetune_path, "merged_v3.json"), "w", encoding="utf-8") as f:
    json.dump(merged_v3, f)