In [2]:
!pip install transformers[torch]
!pip install -U accelerator
!pip install torch



In [3]:
!pip install datasets



In [4]:
!pip install evaluate rouge_score



In [1]:
from datasets import load_dataset


masking_dataset = load_dataset("ai4privacy/pii-masking-300k")

masking_dataset

DatasetDict({
    train: Dataset({
        features: ['source_text', 'target_text', 'privacy_mask', 'span_labels', 'mbert_text_tokens', 'mbert_bio_labels', 'id', 'language', 'set'],
        num_rows: 177677
    })
    validation: Dataset({
        features: ['source_text', 'target_text', 'privacy_mask', 'span_labels', 'mbert_text_tokens', 'mbert_bio_labels', 'id', 'language', 'set'],
        num_rows: 47728
    })
})

In [2]:
new_training_dataset = masking_dataset["train"].remove_columns(['privacy_mask', 'span_labels', 'mbert_text_tokens', 'mbert_bio_labels', 'id', 'set']).filter(lambda x: x["language"] == "English")
new_evaluation_dataset = masking_dataset["validation"].remove_columns(['privacy_mask', 'span_labels', 'mbert_text_tokens', 'mbert_bio_labels', 'id','set']).filter(lambda x: x["language"] == "English")

In [3]:
import datasets
overall_dataset = datasets.DatasetDict({"train": new_training_dataset,"test":new_evaluation_dataset})

In [4]:
overall_dataset

DatasetDict({
    train: Dataset({
        features: ['source_text', 'target_text', 'language'],
        num_rows: 29908
    })
    test: Dataset({
        features: ['source_text', 'target_text', 'language'],
        num_rows: 7946
    })
})

In [5]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [6]:
overall_dataset.push_to_hub()

TypeError: push_to_hub() missing 1 required positional argument: 'repo_id'

In [3]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("t5-small")

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

## Dataset
convert to Train[{text, target}] for training t5

In [2]:
prefix = "mask PII: "
def preprocessing_function_for_dataset(dataset):
    changed_input = [prefix + doc for doc in dataset["source_text"]]
    model_inputs = tokenizer(changed_input, max_length = 1024, truncation = True)

    labels = tokenizer(dataset["target_text"], max_length = 1024, truncation = True)
    model_inputs["labels"] = labels["input_ids"]
    
    return model_inputs


In [4]:
masking_dataset["train"]

Dataset({
    features: ['source_text', 'target_text', 'privacy_mask', 'span_labels', 'mbert_text_tokens', 'mbert_bio_labels', 'id', 'language', 'set'],
    num_rows: 177677
})

In [5]:
new_training_dataset = masking_dataset["train"].remove_columns(['privacy_mask', 'span_labels', 'mbert_text_tokens', 'mbert_bio_labels', 'id', 'language', 'set'])
new_evaluation_dataset = masking_dataset["validation"].remove_columns(['privacy_mask', 'span_labels', 'mbert_text_tokens', 'mbert_bio_labels', 'id', 'language', 'set'])

In [6]:
tokenized_training_dataset = new_training_dataset.map(preprocessing_function_for_dataset, batched = True)
tokenized_validation_dataset = new_evaluation_dataset.map(preprocessing_function_for_dataset, batched = True)

Map:   0%|          | 0/47728 [00:00<?, ? examples/s]

In [7]:
tokenized_training_dataset[0]

{'source_text': 'Subject: Group Messaging for Admissions Process\n\nGood morning, everyone,\n\nI hope this message finds you well. As we continue our admissions processes, I would like to update you on the latest developments and key information. Please find below the timeline for our upcoming meetings:\n\n- wynqvrh053 - Meeting at 10:20am\n- luka.burg - Meeting at 21\n- qahil.wittauer - Meeting at quarter past 13\n- gholamhossein.ruschke - Meeting at 9:47 PM\n- pdmjrsyoz1460 ',
 'target_text': 'Subject: Group Messaging for Admissions Process\n\nGood morning, everyone,\n\nI hope this message finds you well. As we continue our admissions processes, I would like to update you on the latest developments and key information. Please find below the timeline for our upcoming meetings:\n\n- [USERNAME] - Meeting at [TIME]\n- [USERNAME] - Meeting at [TIME]\n- [USERNAME] - Meeting at [TIME]\n- [USERNAME] - Meeting at [TIME]\n- [USERNAME] ',
 'input_ids': [8181,
  3,
  4111,
  196,
  10,
  19237,


In [8]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer, model="t5-small", return_tensors = "pt")

In [9]:
import numpy as np
import evaluate

In [10]:
rouge = evaluate.load("rouge")

In [10]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

In [12]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    decoded_logits = tokenizer.batch_decode(logits, skip_special_tokens = True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens = True)

    result = rouge.compute(predictions = decoded_logits, references = decoded_labels, use_stemmer = True)
    return {key:round(value, 4) for key, value in result.items()}
    

In [15]:
training_args = Seq2SeqTrainingArguments(
    output_dir = "new_output",
    evaluation_strategy = "epoch",
    learning_rate = 2e-5,
    per_device_train_batch_size = 10,
    per_device_eval_batch_size = 10,
    weight_decay = 0.01,
    num_train_epochs = 2,
    predict_with_generate = True,
    save_total_limit = 3,
    fp16 = True,
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [16]:
trainer = Seq2SeqTrainer(
    model = model,
    args = training_args,
    train_dataset = tokenized_training_dataset,
    eval_dataset = tokenized_validation_dataset,
    tokenizer = tokenizer,
    data_collator = data_collator,
    compute_metrics = compute_metrics,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
Detected kernel version 4.14.336, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,0.0628,0.033757,0.2237,0.1848,0.2232,0.2232
2,0.0494,0.028046,0.2245,0.1862,0.2241,0.2241




TrainOutput(global_step=35536, training_loss=0.0971784194757787, metrics={'train_runtime': 8860.1574, 'train_samples_per_second': 40.107, 'train_steps_per_second': 4.011, 'total_flos': 2.176859660088115e+16, 'train_loss': 0.0971784194757787, 'epoch': 2.0})

In [None]:
from huggingface_hub import notebook_login
notebook_login()

In [19]:
trainer.push_to_hub()

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.05k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/aprab/new_output/commit/cec9736c463807db12dd0210a6895cc92c310749', commit_message='End of training', commit_description='', oid='cec9736c463807db12dd0210a6895cc92c310749', pr_url=None, pr_revision=None, pr_num=None)

In [20]:
from transformers import pipeline

In [33]:
summarizer_sec = pipeline("fillmask", model = "aprab/new_output")

KeyError: "Unknown task fillmask, available tasks are ['audio-classification', 'automatic-speech-recognition', 'conversational', 'depth-estimation', 'document-question-answering', 'feature-extraction', 'fill-mask', 'image-classification', 'image-feature-extraction', 'image-segmentation', 'image-to-image', 'image-to-text', 'mask-generation', 'ner', 'object-detection', 'question-answering', 'sentiment-analysis', 'summarization', 'table-question-answering', 'text-classification', 'text-generation', 'text-to-audio', 'text-to-speech', 'text2text-generation', 'token-classification', 'translation', 'video-classification', 'visual-question-answering', 'vqa', 'zero-shot-audio-classification', 'zero-shot-classification', 'zero-shot-image-classification', 'zero-shot-object-detection', 'translation_XX_to_YY']"

In [26]:
output_string = summarizer_sec("Subject: Group Messaging for Admissions Process\n\nGood morning, everyone,\n\nI hope this message finds you well. As we continue our admissions processes, I would like to update you on the latest developments and key information. Please find below the timeline for our upcoming meetings:\n\n- wynqvrh053 - Meeting at 10:20am\n- luka.burg - Meeting at 21\n- qahil.wittauer - Meeting at quarter past 13\n- gholamhossein.ruschke - Meeting at 9:47 PM\n- pdmjrsyoz1460")
print(output_string)

Your max_length is set to 200, but your input_length is only 142. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=71)


[{'summary_text': 'Subject: Group Messaging for Admissions Process Good morning, everyone, I hope this message finds you well. As we continue our admissions processes, I would like to update you on the latest developments and key information. Please find below the timeline for our upcoming meetings: - wynqvrh053 - Meeting at [TIME] - [USERNAME]'}]


In [27]:
summarizer_sec("You will be given your meeting ID at 5:00PM at google.com")

Your max_length is set to 200, but your input_length is only 18. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=9)


[{'summary_text': 'you will be given your meeting ID at 5:00PM at google.com . if you have a meeting ID, please contact us for more information.'}]

In [4]:
def mask_prediction(text):
    text = "mask PII" + text
    tokenized = tokenizer([text], truncation = True, padding="longest", return_tensors = 'pt')
    #tokenized = {k: v.to('cuda') for k, v in tokenized.items()}
    tokenized_result = model.generate(**tokenized, max_length = 128)
    tokenized_result = tokenized_result.to('cpu')
    predicted_summary = tokenizer.decode(tokenized_result[0])
    return predicted_summary


'<pad> Group Messaging for Admissions Process Good morning, everyone, I hope this message finds you well. As we continue our admissions processes, I would like to update you on the latest developments and key information. Please find below the timeline for our upcoming meetings: - [USERNAME] - Meeting at [TIME] - [USERNAME] - Meeting at [TIME] - [USERNAME] - Meeting at [TIME] - [USERNAME]</s>'

In [39]:
mask_prediction(masking_dataset["validation"][0]["source_text"])

"<pad> On the video sharing platform for educational content, a lively discussion unfolded among users from different locales within the UK. The comment thread began with [USERNAME] expressing admiration for the video's insightful content, followed by [USERNAME] adding a clarification on a complex topic. [USERNAME] chimed in with a question for clarification, an</s>"

In [40]:
masking_dataset["validation"][0]["target_text"]

"On the video sharing platform for educational content, a lively discussion unfolded among users from different locales within the UK.\n\nThe comment thread began with [USERNAME] expressing admiration for the video's insightful content, followed by [USERNAME] adding a clarification on a complex topic. [USERNAME] chimed in with a question for clarification, an"

In [41]:
mask_prediction("Mark Davis works at XYZ Corporation, located at 789 Maple Avenue, Boston, MA 02115.")

'<pad> Mark Davis works at [COUNTRY] [STREET], located at [BUILDING], [STREET], [CITY], [STATE].</s>'

In [42]:
mask_prediction("Sarah Williams' date of birth is June 15, 1990.")

"<pad> Sarah Williams' date of birth is [BOD].</s>"

In [63]:
mask_prediction("Emily Johnson lives at 456 Elm Street, Springfield, IL 62701")

'<pad> [LASTNAME1] [LASTNAME2] lives at [BUILDING], [STREET], [CITY], [STATE] 62701</s>'

In [8]:
mask_prediction("Evaluation Report: Candidate Suitability for Admission Date: 29/06/2013 Time: 7:59 PM \
Location: CM21 Candidate A: - Sex: M - Date of Birth: October/97 - Email: MVC@tutanota.com  \
- ID Card Number: RF69601MW - Driver's License: MASCU910077MV815 - \
IP Address: 7836:3dcf:9edf:692:fd5f:4de5:a9d6:da24 - Password: Be~o}.zq8^1")

"<pad> Candidate Suitability for Admission Date: [DATE] Time: [TIME] Location: [POSTCODE] Candidate A: - Sex: [SEX] - Date of Birth: [BOD] - Email: [EMAIL] - ID Card Number: [IDCARD] - Driver's License: [DRIVERLICENSE] - IP Address: [IP] - Password: [PASS]</s>"

In [61]:
mask_prediction("</instructor> <instructor> <name> <title>Mayoress</title> <social_number>3341955554</social_number> <id_card>TY41985ST</id_card> </name> </instructor> <instructor> <name> <title>Father</title> <social_number>299.463.5913</social_number> <id_card>EA38031TP</id_card> </name> </instructor> <instructor>")

'<pad> <unk>/instructor> <unk>instructor> <unk>name> <unk>title>[TITLE]<unk>/title> <unk>social_number>[SOCIALNUMBER]<unk>/social_number> <unk>id_card>[IDCARD]<unk>/id_card> <unk>/name> <unk>/instructor> <unk>instructor> <unk>name> <unk>title>[TITLE]<unk>/title> <unk>social_number>[SOCIALNUMBER]<unk>/social_number'

In [5]:
mask_prediction("Hello david, your social security number is 123123")

'<pad> [GIVENNAME1], your social security number is [SOCIALNUMBER]</s>'

In [6]:
mask_prediction("Hello liza, your social security number is 123123")

'<pad> liza, your social security number is 123123</s>'

In [1]:
from transformers import AutoModelForSeq2SeqLM
model = AutoModelForSeq2SeqLM.from_pretrained("new_output/checkpoint-35500", return_dict = False)

In [7]:
mask_prediction("John Carter's phone Number is 9840304234")

"<pad> [LASTNAME1]'s phone Number is [TEL]</s>"

In [14]:
!conda install -y -c conda-forge zip
zip -r -X  

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting package metadata (current_repodata.json): done
Solving environment: done


  current version: 4.10.3
  latest version: 24.3.0

Please update conda by running

    $ conda update -n base conda



## Package Plan ##

  environment location: /home/studio-lab-user/.conda/envs/default

  added / updated specs:
    - zip


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    ca-certificates-2024.2.2   |       hbcca054_0         152 KB  conda-forge
    certifi-2024.2.2           |     pyhd8ed1ab_0         157 KB  conda-forge
    openssl-3.2.1              |       hd590300_1         2.7 MB  conda-forge
    zip-3.0                    |       hd590300_3         173 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         3.2 MB

The following NEW packages will be INSTALLED:

  zip                conda-forge/lin

NameError: name 'r' is not defined

In [15]:
!zip -r -X output.zip new_output

  adding: new_output/ (stored 0%)
  adding: new_output/checkpoint-34500/ (stored 0%)
  adding: new_output/checkpoint-34500/config.json (deflated 62%)
  adding: new_output/checkpoint-34500/generation_config.json (deflated 29%)
  adding: new_output/checkpoint-34500/model.safetensors

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


 (deflated 9%)
  adding: new_output/checkpoint-34500/tokenizer_config.json (deflated 95%)
  adding: new_output/checkpoint-34500/special_tokens_map.json (deflated 85%)
  adding: new_output/checkpoint-34500/tokenizer.json (deflated 74%)
  adding: new_output/checkpoint-34500/training_args.bin (deflated 51%)
  adding: new_output/checkpoint-34500/optimizer.pt (deflated 7%)
  adding: new_output/checkpoint-34500/scheduler.pt (deflated 55%)
  adding: new_output/checkpoint-34500/rng_state.pth (deflated 25%)
  adding: new_output/checkpoint-34500/trainer_state.json (deflated 78%)
  adding: new_output/checkpoint-35000/ (stored 0%)
  adding: new_output/checkpoint-35000/config.json (deflated 62%)
  adding: new_output/checkpoint-35000/generation_config.json (deflated 29%)
  adding: new_output/checkpoint-35000/model.safetensors (deflated 9%)
  adding: new_output/checkpoint-35000/tokenizer_config.json (deflated 95%)
  adding: new_output/checkpoint-35000/special_tokens_map.json (deflated 85%)
  adding: 

In [9]:
mask_prediction("I used to go to Kathmandu when I was a kid, my home is there.")

'<pad> [TITLE] [PASS] used to go to Kathmandu when I was a kid, my home is there.</s>'