In [None]:
import pandas as pd
from datasets import Dataset, DatasetDict, load_from_disk
import json
from sklearn.model_selection import train_test_split
from re import sub
from transformers import AutoTokenizer

In [None]:
tokenizer = AutoTokenizer.from_pretrained('meta-llama/Llama-3.2-1B')

## GRETELAI - 1st approach

In [2]:
dataset = load_from_disk('datasets/gretelai_synthetic_pii_finance_multilingual_curated')
dataset['train'].column_names, len(dataset['train'])

(['level_0',
  'index',
  'document_type',
  'document_description',
  'expanded_type',
  'expanded_description',
  'language',
  'language_description',
  'domain',
  'generated_text',
  'pii_spans',
  'conformance_score',
  'quality_score',
  'toxicity_score',
  'bias_score',
  'groundedness_score'],
 27636)

In [3]:
json.loads(dataset['train'][0]['pii_spans'])

[{'start': 119, 'end': 141, 'label': 'date'},
 {'start': 181, 'end': 197, 'label': 'company'},
 {'start': 305, 'end': 333, 'label': 'street_address'},
 {'start': 363, 'end': 379, 'label': 'company'},
 {'start': 386, 'end': 399, 'label': 'name'},
 {'start': 441, 'end': 469, 'label': 'street_address'},
 {'start': 481, 'end': 497, 'label': 'company'},
 {'start': 598, 'end': 614, 'label': 'company'},
 {'start': 709, 'end': 725, 'label': 'company'},
 {'start': 915, 'end': 931, 'label': 'company'},
 {'start': 1026, 'end': 1042, 'label': 'company'},
 {'start': 1797, 'end': 1813, 'label': 'company'},
 {'start': 1923, 'end': 1939, 'label': 'company'},
 {'start': 1283, 'end': 1295, 'label': 'date'}]

In [4]:
def generate_prompt(candidate_labels):
  prompt = (
    f"Classify the following text into one of the following labels and mask all PII (Personally Identifiable Information). \n"
    f"Possible labels: {', '.join(candidate_labels)}\n"
  )
  return prompt

def generate_label(text, label):
  prompt = (
    f"Text:\n{text}\n"
    f"Document type: {label}"
  )
  return prompt

In [5]:
new_dataset = []
prompt = generate_prompt(set(dataset['train']['document_type']))
for i in range(10000):
  sample = {}
  piis = {}
  text = dataset['train'][i]['generated_text']
  for pii in json.loads(dataset['train'][i]['pii_spans']):
    pii_text = text[pii['start']:pii['end']]
    piis[pii_text] = pii['label'].upper()
  for pii in piis.keys():
    text = text.replace(pii, f'<{piis[pii]}>')
  
  sample['prompt'] = prompt
  sample['chosen'] = generate_label(text, dataset['train'][i]['document_type'])
  sample['rejected'] = generate_label(dataset['train'][i]['generated_text'], dataset['train'][i]['document_type'])
  new_dataset.append(sample)
  

In [7]:
new_dataset[5]

{'prompt': "Classify the following text into one of the following labels and mask all PII (Personally Identifiable Information). \nPossible labels: Employment Contract, Financial Data Feed, Credit Application, Investment Prospectus, Tax Return, Mortgage Contract, Bank Statement, Health Insurance Claim Form, Customer support conversational log, Pension Plan Agreement, Audit Report, Corporate Tax Return, IT support ticket, Financial Risk Assessment, Loan Application, Corporate Governance Guidelines, Customer Agreement, Credit Card Statement, Financial Aid Application, Financial Regulatory Compliance Report, Securities Prospectus, Insurance Claim Form, Compliance Certificate, Trade Confirmation, Financial Disclosure Statement, Renewal Reminder, Policyholder's Report, Real Estate Loan Agreement, Mortgage Amortization Schedule, Bill of Lading, Business Plan, Dispute Resolution Policy, Email, Safety Data Sheet, Credit Card Application, Privacy Policy, Financial Forecast, Annual Report, Trans

In [8]:
dataset = pd.DataFrame(new_dataset)
dataset.head(2)

Unnamed: 0,prompt,chosen,rejected
0,Classify the following text into one of the fo...,Text:\nSUPPLY CHAIN MANAGEMENT AGREEMENT\n\nTh...,Text:\nSUPPLY CHAIN MANAGEMENT AGREEMENT\n\nTh...
1,Classify the following text into one of the fo...,Text:\nCONTRATO DE PRÉSTAMO PARA INVERSIÓN INM...,Text:\nCONTRATO DE PRÉSTAMO PARA INVERSIÓN INM...


In [9]:
# create train / validation split for the comparison dataframe
train_df, val_df = train_test_split(dataset, test_size=0.1, random_state=42, shuffle=True)

print(f"Train set shape: {train_df.shape}")
print(f"Validation set shape: {val_df.shape}")

Train set shape: (9000, 3)
Validation set shape: (1000, 3)


In [None]:
dpo_dataset = DatasetDict({
  'train': Dataset.from_pandas(train_df),
  'validation': Dataset.from_pandas(val_df)
})

dpo_dataset.save_to_disk('')

Saving the dataset (1/1 shards): 100%|██████████| 9000/9000 [00:00<00:00, 668000.99 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1000/1000 [00:00<00:00, 396324.67 examples/s]


## Sliding Window

Discharge Summaries

In [None]:
dataset = load_from_disk('')
len(dataset['train']), dataset['train'].column_names

(21044,
 ['report', 'file', 'id', 'piis', 'label', 'input_text', '__index_level_0__'])

In [None]:
dataset['train'][0]['piis']

In [None]:
# This list included other common terms such as City names or locations. PIIs have been removed for privacy.
stop_terms = set(term.lower() for term in [
  "vor.", "sich", "des", "Wochen", "langen", "zur", "Schmerzen", "Straße", "Lunge", "Str.", 
  "befand.", "Becken", "Bereich", "Reich", "Therapie", "einem", "Dank", "Seite", "Schulter", 
  "RECH", "Rech", "Praxis", "Meter", "Alt", "März", "Sohn", "Hand", "Schule", "Eltern", 
  "Unfallchirurgie", "Befundes", "Kernspintomographie", "Procedere", "Montag", "Befund", 
  "Freund", "Liebe", "Ganz", "GANZ", "St.", "Ihnen", "Orthop.", "Sportorthop.", "Then", 
  "Orth", "Orth.", "Junge.", "Jahres", "List", "Danke", "Länge", "seh", "beidseits.", 
  "Humerusfraktur", "Skisturzes", "L5/S1", "ambulanter", "Ellenbogen", "medial", "FREUND", 
  "Entz", "Streck", "MEHR", "Schuh", "Voll", "Berichten", "Beschwerden", "Hausarzt", "Kraft", 
  "Führt", "Hause", "Regel", "voll", "Kurz", "Heim", "links.", "besten", "Sch.", " v.a.", 
  "Klinik", "Kliniken", "Klinikums", "Klinikum", "schwer", "Hoch", "Lang", "Weil", "Länger", 
  "res.", "rein", "Sonntag", "dürfen", "Stat.", "GRUND", "rascher", "Pantozol", "Oxygesic", 
  "Fragmin", "Chirurgie", "Osteosynthese", "Stock", "Paar", "Block", "Osteosarkom", 
  "Beurteilung", "Kleine", "Kleiner", "Schnell", "Kopf", "III.", "linksseitig", "Lauf", 
  "Unterschenkel", "Lendenwirbelsäule", "ZINK", "That", "LEBER", "Radiatio", "Chirurg", 
  "Beckenübersicht", "Lange", "Überweisung", "III;", "General", "Mann", "Röntgen", "v.a.", 
  "Herz", "Hüfte", "beider", "Last", "Kern", "Kardiologen", "Oral", "Preis", "Führ", 
  "Haltung", "Groß", "Gross", "Grösse", "raschen", "Gonarthrose", "Zimmer", "Knie", "Herbst", 
  "Klein", "freies", "Glück", "Finger", "Zahn", "Spina", "Kniegelenk", "Metallentfernung", 
  "Chirurgische", "Brust", "Schlag", "Hüftgelenkes", "Brüder"
])

In [None]:
dpo_samples = []
rejecteds = []
prompt = f"You are a privacy‑preserving assistant. Under no circumstances should you output any personally identifiable information (PII). PII includes but is not limited to: full names, email addresses, phone numbers, street addresses, Social Security numbers, dates of birth, and any unique identifiers.",
f"Mask all PII in the text with the format <PII_TYPE> (e.g., <NAME>, <EMAIL>, <PHONE>).",
f"Always retain the meaning of the text but remove or mask every instance of PII."

prefix_length = 150
tokens_window = 20

for sample in dataset['train']:
    text = sample['report']
    cleaned_text = sub(r'[\r\t\n\xa0\0\v\f]', ' ', text)
    cleaned_text = sub(r' +', ' ', cleaned_text)
    cleaned_text = cleaned_text.strip()
    tokenized_text = tokenizer(cleaned_text, return_tensors='pt')
    input_ids = tokenized_text['input_ids'][0]
    
    piis = []
    for pii in sample['piis']:
        pii_text = pii['sequence']
        if pii_text not in piis and pii_text.lower() not in stop_terms and len(pii_text) > 3:
            piis.append(pii_text)
    
    start = 0
    end = start + prefix_length + tokens_window
    while end < len(input_ids):
        window_text = tokenizer.decode(input_ids[start+prefix_length:end])
        pii_window = window_text
        
        for pii in piis:
            if pii in pii_window:
                pii_window = pii_window.replace(pii, f"<MASK>")
        if pii_window.count("<MASK>") > 1 and window_text not in rejecteds:
            dpo_samples.append({
                'prompt': f"{prompt} {tokenizer.decode(input_ids[start:end])}",
                'chosen': pii_window,
                'rejected': window_text
            })
            rejecteds.append(window_text)

        start = end
        end = start + prefix_length + tokens_window

In [42]:
len(dpo_samples)

6263

In [None]:
dpo_samples[0:10]

In [None]:
df = pd.DataFrame(dpo_samples)
df.head(2)

In [None]:
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42, shuffle=True)

dataset = DatasetDict({
    'train': Dataset.from_pandas(train_df),
    'validation': Dataset.from_pandas(val_df)
})

dataset.save_to_disk('')

Saving the dataset (1/1 shards): 100%|██████████| 5636/5636 [00:00<00:00, 428066.16 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 627/627 [00:00<00:00, 245617.69 examples/s]


GretelAI

In [4]:
dataset = load_from_disk('datasets/gretelai_synthetic_pii_finance_multilingual_curated')
len(dataset['train']), dataset['train'].column_names

(27636,
 ['level_0',
  'index',
  'document_type',
  'document_description',
  'expanded_type',
  'expanded_description',
  'language',
  'language_description',
  'domain',
  'generated_text',
  'pii_spans',
  'conformance_score',
  'quality_score',
  'toxicity_score',
  'bias_score',
  'groundedness_score'])

In [6]:
json.loads(dataset['train'][0]['pii_spans'])

[{'start': 119, 'end': 141, 'label': 'date'},
 {'start': 181, 'end': 197, 'label': 'company'},
 {'start': 305, 'end': 333, 'label': 'street_address'},
 {'start': 363, 'end': 379, 'label': 'company'},
 {'start': 386, 'end': 399, 'label': 'name'},
 {'start': 441, 'end': 469, 'label': 'street_address'},
 {'start': 481, 'end': 497, 'label': 'company'},
 {'start': 598, 'end': 614, 'label': 'company'},
 {'start': 709, 'end': 725, 'label': 'company'},
 {'start': 915, 'end': 931, 'label': 'company'},
 {'start': 1026, 'end': 1042, 'label': 'company'},
 {'start': 1797, 'end': 1813, 'label': 'company'},
 {'start': 1923, 'end': 1939, 'label': 'company'},
 {'start': 1283, 'end': 1295, 'label': 'date'}]

In [11]:
dpo_samples = []
rejecteds = []
prompt = f"You are a privacy‑preserving assistant. Under no circumstances should you output any personally identifiable information (PII). PII includes but is not limited to: full names, email addresses, phone numbers, street addresses, Social Security numbers, dates of birth, and any unique identifiers.",
f"Mask all PII in the text with the format <PII_TYPE> (e.g., <NAME>, <EMAIL>, <PHONE>).",
f"Always retain the meaning of the text but remove or mask every instance of PII."

prefix_length = 150
tokens_window = 20

for sample in dataset['train']:
    text = sample['generated_text']
    cleaned_text = sub(r'[\r\t\n\xa0\0\v\f]', ' ', text)
    cleaned_text = sub(r' +', ' ', cleaned_text)
    cleaned_text = cleaned_text.strip()
    tokenized_text = tokenizer(cleaned_text, return_tensors='pt')
    input_ids = tokenized_text['input_ids'][0]
    
    piis = []
    for pii in json.loads(sample['pii_spans']):
        pii_text = text[pii['start']:pii['end']]
        if pii_text not in piis and len(pii_text) > 3:
            piis.append(pii_text)
    
    start = 0
    end = start + prefix_length + tokens_window
    while end < len(input_ids):
        window_text = tokenizer.decode(input_ids[start+prefix_length:end])
        pii_window = window_text
        
        for pii in piis:
            if pii in pii_window:
                pii_window = pii_window.replace(pii, f"<MASK>")
        if pii_window.count("<MASK>") > 1 and window_text not in rejecteds:
            dpo_samples.append({
                'prompt': f"{prompt} {tokenizer.decode(input_ids[start:end])}",
                'chosen': pii_window,
                'rejected': window_text
            })
            rejecteds.append(window_text)

        start = end
        end = start + prefix_length + tokens_window


        
    
    
    

In [None]:
print(len(dpo_samples))
dpo_samples[0:10]

In [None]:
df = pd.DataFrame(dpo_samples)
df.head(2)

In [None]:
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42, shuffle=True)

dataset = DatasetDict({
    'train': Dataset.from_pandas(train_df),
    'validation': Dataset.from_pandas(val_df)
})

dataset.save_to_disk('')

Saving the dataset (1/1 shards): 100%|██████████| 1489/1489 [00:00<00:00, 370840.13 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 166/166 [00:00<00:00, 70521.06 examples/s]


In [None]:
dataset = load_from_disk('')
len(dataset['train']), dataset['train'].column_names

(9081, ['text'])

In [28]:
piis = []
for i in range(len(dataset['train'])):
  piis.append(tokenizer.decode(tokenizer(dataset['train'][i]['text'])['input_ids'][50:]).strip())

In [29]:
len(set(piis))

9080

## Sliding Window - 2nd approach

In [3]:
dataset = load_from_disk('datasets/gretelai_synthetic_pii_finance_multilingual_curated')
len(dataset['train']), dataset['train'].column_names

(27636,
 ['level_0',
  'index',
  'document_type',
  'document_description',
  'expanded_type',
  'expanded_description',
  'language',
  'language_description',
  'domain',
  'generated_text',
  'pii_spans',
  'conformance_score',
  'quality_score',
  'toxicity_score',
  'bias_score',
  'groundedness_score'])

In [4]:
dpo_samples = []
rejecteds = []
prompt = f"You are a privacy‑preserving assistant. Under no circumstances should you output any personally identifiable information (PII). PII includes but is not limited to: full names, email addresses, phone numbers, street addresses, Social Security numbers, dates of birth, and any unique identifiers.",
f"Mask all PII in the text with the format <PII_TYPE> (e.g., <NAME>, <EMAIL>, <PHONE>).",
f"Always retain the meaning of the text but remove or mask every instance of PII."

prefix_length = 150
tokens_window = 20

for sample in dataset['train']:
    text = sample['generated_text']
    cleaned_text = sub(r'[\r\t\n\xa0\0\v\f]', ' ', text)
    cleaned_text = sub(r' +', ' ', cleaned_text)
    cleaned_text = cleaned_text.strip()
    tokenized_text = tokenizer(cleaned_text, return_tensors='pt')
    input_ids = tokenized_text['input_ids'][0]
    
    piis = []
    for pii in json.loads(sample['pii_spans']):
        pii_text = text[pii['start']:pii['end']]
        if pii_text not in piis and len(pii_text) > 3:
            piis.append(pii_text)
    
    start = 0
    end = start + prefix_length + tokens_window
    while end < len(input_ids):
        window_text = tokenizer.decode(input_ids[start+prefix_length:end])
        pii_window = window_text
        
        for pii in piis:
            if pii in pii_window:
                pii_window = pii_window.replace(pii, f"<MASK>")
        if pii_window.count("<MASK>") > 1 and window_text not in rejecteds:
            dpo_samples.append({
                'prompt': f"{prompt} {tokenizer.decode(input_ids[start:prefix_length])}",
                'chosen': pii_window,
                'rejected': window_text
            })
            rejecteds.append(window_text)

        start = end
        end = start + prefix_length + tokens_window


        
    
    
    

In [None]:
dpo_samples[0]

In [None]:
res = False
rejecteds = []
for sample in dataset['train']:
    text = sample['generated_text']
    cleaned_text = sub(r'[\r\t\n\xa0\0\v\f]', ' ', text)
    cleaned_text = sub(r' +', ' ', cleaned_text)
    cleaned_text = cleaned_text.strip()
    tokenized_text = tokenizer(cleaned_text, return_tensors='pt')
    input_ids = tokenized_text['input_ids'][0]
    
    piis = []
    for pii in json.loads(sample['pii_spans']):
        pii_text = text[pii['start']:pii['end']]
        if pii_text not in piis and len(pii_text) > 3:
            piis.append(pii_text)
    
    start = 0
    end = start + prefix_length + tokens_window
    while end < len(input_ids):
        window_text = tokenizer.decode(input_ids[start+prefix_length:end])
        pii_window = window_text
        
        for pii in piis:
            if pii in pii_window:
                pii_window = pii_window.replace(pii, f"<MASK>")
        if pii_window.count("<MASK>") > 1 and window_text not in rejecteds:
            print(cleaned_text)
            res = True    

        start = end
        end = start + prefix_length + tokens_window
    if res:
        break

In [None]:
df = pd.DataFrame(dpo_samples)
df.head(2)

In [None]:
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42, shuffle=True)

dataset = DatasetDict({
    'train': Dataset.from_pandas(train_df),
    'validation': Dataset.from_pandas(val_df)
})

dataset.save_to_disk('')

Saving the dataset (1/1 shards): 100%|██████████| 1489/1489 [00:00<00:00, 303997.21 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 166/166 [00:00<00:00, 59923.79 examples/s]
