In [1]:
!pip install datasets



In [2]:
import torch
import json
import numpy as np
import pandas as pd
import nltk
import random
import itertools
import collections
from datasets import load_dataset
from tqdm.notebook import tqdm
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification


In [3]:
device=torch.backends.cpu

In [4]:
model_name = "ab-ai/pii_model"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.model_max_length = 512

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [5]:
data = load_dataset("ai4privacy/pii-masking-200k",split='train')
data

Dataset({
    features: ['source_text', 'target_text', 'privacy_mask', 'span_labels', 'mbert_text_tokens', 'mbert_bio_labels', 'id', 'language', 'set'],
    num_rows: 209261
})

In [6]:
train=pd.DataFrame(data)
train=train.drop(['id','set'],axis=1)
train.head()

Unnamed: 0,source_text,target_text,privacy_mask,span_labels,mbert_text_tokens,mbert_bio_labels,language
0,A student's assessment was found on device bea...,A student's assessment was found on device bea...,"[{'value': '06-184755-866851-3', 'start': 57, ...","[[0, 57, ""O""], [57, 75, ""PHONEIMEI""], [75, 138...","[A, student, ', s, assessment, was, found, on,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, B-P...",en
1,"Dear Omer, as per our records, your license 78...","Dear [FIRSTNAME], as per our records, your lic...","[{'value': 'Omer', 'start': 5, 'end': 9, 'labe...","[[0, 5, ""O""], [5, 9, ""FIRSTNAME""], [9, 44, ""O""...","[Dear, Omer, ,, as, per, our, records, ,, your...","[O, B-FIRSTNAME, O, O, O, O, O, O, O, O, B-VEH...",en
2,Kattie could you please share your recomndatio...,[FIRSTNAME] could you please share your recomn...,"[{'value': 'Kattie', 'start': 0, 'end': 6, 'la...","[[0, 6, ""FIRSTNAME""], [6, 75, ""O""], [75, 77, ""...","[Kat, ##tie, could, you, pl, ##eas, ##e, share...","[B-FIRSTNAME, I-FIRSTNAME, O, O, O, O, O, O, O...",en
3,Emergency supplies in 16356 need a refill. Use...,Emergency supplies in [BUILDINGNUMBER] need a ...,"[{'value': '16356', 'start': 22, 'end': 27, 'l...","[[0, 22, ""O""], [22, 27, ""BUILDINGNUMBER""], [27...","[Emergency, supplies, in, 1635, ##6, need, a, ...","[O, O, O, B-BUILDINGNUMBER, I-BUILDINGNUMBER, ...",en
4,"The 88 old child at 5862, has showcased an unu...","The [AGE] old child at [BUILDINGNUMBER], has s...","[{'value': '88', 'start': 4, 'end': 6, 'label'...","[[0, 4, ""O""], [4, 6, ""AGE""], [6, 20, ""O""], [20...","[The, 88, old, child, at, 586, ##2, ,, has, sh...","[O, B-AGE, O, O, O, B-BUILDINGNUMBER, I-BUILDI...",en


In [7]:
train.columns=['unmasked_text','masked_text','privacy_mask','span_labels','tokenised_text','bio_labels','language']

In [8]:
train.head()

Unnamed: 0,unmasked_text,masked_text,privacy_mask,span_labels,tokenised_text,bio_labels,language
0,A student's assessment was found on device bea...,A student's assessment was found on device bea...,"[{'value': '06-184755-866851-3', 'start': 57, ...","[[0, 57, ""O""], [57, 75, ""PHONEIMEI""], [75, 138...","[A, student, ', s, assessment, was, found, on,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, B-P...",en
1,"Dear Omer, as per our records, your license 78...","Dear [FIRSTNAME], as per our records, your lic...","[{'value': 'Omer', 'start': 5, 'end': 9, 'labe...","[[0, 5, ""O""], [5, 9, ""FIRSTNAME""], [9, 44, ""O""...","[Dear, Omer, ,, as, per, our, records, ,, your...","[O, B-FIRSTNAME, O, O, O, O, O, O, O, O, B-VEH...",en
2,Kattie could you please share your recomndatio...,[FIRSTNAME] could you please share your recomn...,"[{'value': 'Kattie', 'start': 0, 'end': 6, 'la...","[[0, 6, ""FIRSTNAME""], [6, 75, ""O""], [75, 77, ""...","[Kat, ##tie, could, you, pl, ##eas, ##e, share...","[B-FIRSTNAME, I-FIRSTNAME, O, O, O, O, O, O, O...",en
3,Emergency supplies in 16356 need a refill. Use...,Emergency supplies in [BUILDINGNUMBER] need a ...,"[{'value': '16356', 'start': 22, 'end': 27, 'l...","[[0, 22, ""O""], [22, 27, ""BUILDINGNUMBER""], [27...","[Emergency, supplies, in, 1635, ##6, need, a, ...","[O, O, O, B-BUILDINGNUMBER, I-BUILDINGNUMBER, ...",en
4,"The 88 old child at 5862, has showcased an unu...","The [AGE] old child at [BUILDINGNUMBER], has s...","[{'value': '88', 'start': 4, 'end': 6, 'label'...","[[0, 4, ""O""], [4, 6, ""AGE""], [6, 20, ""O""], [20...","[The, 88, old, child, at, 586, ##2, ,, has, sh...","[O, B-AGE, O, O, O, B-BUILDINGNUMBER, I-BUILDI...",en


In [9]:
train_en=train[train['language'] == 'en']

In [10]:
len(train_en)

43501

In [11]:
# Santizing bio labels
all_bio_labels = train_en.bio_labels.tolist()
# function that takes a list of bio tags, checks if they are valid, and sanitizes them if not.
# There are two types of invalid tags:
# 1. if a I tag that is not preceded by a B tag
# 2. if the last tag is a I tag, check if its preceded by a B tag. If not, change it to a O tag.

def _sanitize_bio_labels(bio_tags):
    # Check for invalid tags that start with "I" but are not preceded by "B"
    for i in range(len(bio_tags)):
        tag = bio_tags[i]
        if tag != "O":
            tag = tag.split("-")[0]
            label = bio_tags[i].split("-")[1]
        if bio_tags[i].startswith("I") and bio_tags[i - 1].startswith("O"):
        # Remove the invalid I tag
            bio_tags[i - 1] = "B" + "-" + label

    # Check for invalid tags if the last tag is an I tag and not preceded by a B tag
    # if bio_tags[-1].startswith("I") and not bio_tags[-2].startswith("B"):
    #     # Replace the invalid I tag with an O tag
    #     bio_tags[-1] = "O" + "-" + label

    return bio_tags

In [12]:
from tqdm.notebook import tqdm
new_bio_labels = []
for i in tqdm(range(len(all_bio_labels))):
     new_bio_labels.append(_sanitize_bio_labels(all_bio_labels[i]))

  0%|          | 0/43501 [00:00<?, ?it/s]

In [13]:
train_en.bio_labels = new_bio_labels

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_en.bio_labels = new_bio_labels


In [14]:
train_en.head()

Unnamed: 0,unmasked_text,masked_text,privacy_mask,span_labels,tokenised_text,bio_labels,language
0,A student's assessment was found on device bea...,A student's assessment was found on device bea...,"[{'value': '06-184755-866851-3', 'start': 57, ...","[[0, 57, ""O""], [57, 75, ""PHONEIMEI""], [75, 138...","[A, student, ', s, assessment, was, found, on,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, B-P...",en
1,"Dear Omer, as per our records, your license 78...","Dear [FIRSTNAME], as per our records, your lic...","[{'value': 'Omer', 'start': 5, 'end': 9, 'labe...","[[0, 5, ""O""], [5, 9, ""FIRSTNAME""], [9, 44, ""O""...","[Dear, Omer, ,, as, per, our, records, ,, your...","[O, B-FIRSTNAME, O, O, O, O, O, O, O, O, B-VEH...",en
2,Kattie could you please share your recomndatio...,[FIRSTNAME] could you please share your recomn...,"[{'value': 'Kattie', 'start': 0, 'end': 6, 'la...","[[0, 6, ""FIRSTNAME""], [6, 75, ""O""], [75, 77, ""...","[Kat, ##tie, could, you, pl, ##eas, ##e, share...","[B-FIRSTNAME, I-FIRSTNAME, O, O, O, O, O, O, O...",en
3,Emergency supplies in 16356 need a refill. Use...,Emergency supplies in [BUILDINGNUMBER] need a ...,"[{'value': '16356', 'start': 22, 'end': 27, 'l...","[[0, 22, ""O""], [22, 27, ""BUILDINGNUMBER""], [27...","[Emergency, supplies, in, 1635, ##6, need, a, ...","[O, O, O, B-BUILDINGNUMBER, I-BUILDINGNUMBER, ...",en
4,"The 88 old child at 5862, has showcased an unu...","The [AGE] old child at [BUILDINGNUMBER], has s...","[{'value': '88', 'start': 4, 'end': 6, 'label'...","[[0, 4, ""O""], [4, 6, ""AGE""], [6, 20, ""O""], [20...","[The, 88, old, child, at, 586, ##2, ,, has, sh...","[O, B-AGE, O, O, O, B-BUILDINGNUMBER, I-BUILDI...",en


In [15]:
df=train_en.copy()

In [16]:
# remove more than 512
tt = df.tokenised_text.tolist()
tt_lens = [len(tokenizer(list(t), is_split_into_words=True, truncation=True, max_length=512)['input_ids']) for t in tqdm(tt)]
df[['tt_lens']] = 0
df.tt_lens = tt_lens
df = df.loc[df.tt_lens <= 510].reset_index(drop=True)

  0%|          | 0/43501 [00:00<?, ?it/s]

In [17]:
nt = df.bio_labels.tolist()
nt = list(itertools.chain.from_iterable(nt)) # merge the list of lists into one list
nt = collections.Counter(nt) # Get count of each tag
all_labels = list(nt.keys()) # get all unique tags(labels)

In [18]:
source_texts = df.unmasked_text.tolist()
target_texts = df.masked_text.tolist()
tokenized_texts = df.tokenised_text.tolist()
ner_tags = df.bio_labels.tolist()

In [19]:
len(all_labels)

113

In [20]:
source_texts[0]

"A student's assessment was found on device bearing IMEI: 06-184755-866851-3. The document falls under the various topics discussed in our Optimization curriculum. Can you please collect it?"

In [21]:
target_texts[0]

"A student's assessment was found on device bearing IMEI: [PHONEIMEI]. The document falls under the various topics discussed in our [JOBAREA] curriculum. Can you please collect it?"

In [22]:
# Create label dict
label2id = dict([(value,key) for key, value in enumerate(all_labels)])
id2label = dict(map(reversed, label2id.items()))

label2id, id2label

({'O': 0,
  'B-PHONEIMEI': 1,
  'I-PHONEIMEI': 2,
  'B-JOBAREA': 3,
  'I-JOBAREA': 4,
  'B-FIRSTNAME': 5,
  'B-VEHICLEVIN': 6,
  'I-VEHICLEVIN': 7,
  'I-FIRSTNAME': 8,
  'B-AGE': 9,
  'B-GENDER': 10,
  'I-GENDER': 11,
  'B-HEIGHT': 12,
  'I-HEIGHT': 13,
  'B-BUILDINGNUMBER': 14,
  'I-BUILDINGNUMBER': 15,
  'B-MASKEDNUMBER': 16,
  'I-MASKEDNUMBER': 17,
  'B-PASSWORD': 18,
  'I-PASSWORD': 19,
  'B-DOB': 20,
  'I-DOB': 21,
  'B-IPV6': 22,
  'I-IPV6': 23,
  'B-NEARBYGPSCOORDINATE': 24,
  'I-NEARBYGPSCOORDINATE': 25,
  'B-USERAGENT': 26,
  'I-USERAGENT': 27,
  'B-TIME': 28,
  'I-TIME': 29,
  'B-JOBTITLE': 30,
  'I-JOBTITLE': 31,
  'B-COUNTY': 32,
  'B-EMAIL': 33,
  'I-EMAIL': 34,
  'B-ACCOUNTNUMBER': 35,
  'I-ACCOUNTNUMBER': 36,
  'B-PIN': 37,
  'I-PIN': 38,
  'B-EYECOLOR': 39,
  'I-EYECOLOR': 40,
  'B-LASTNAME': 41,
  'I-LASTNAME': 42,
  'B-IPV4': 43,
  'I-IPV4': 44,
  'B-DATE': 45,
  'I-DATE': 46,
  'B-STREET': 47,
  'I-STREET': 48,
  'B-CITY': 49,
  'I-CITY': 50,
  'B-PREFIX': 51,
  'I-P

In [23]:
for j in tqdm(range(len(ner_tags))):
    tags = ner_tags[j]
    for i in range(len(tags)):
        for k,v in label2id.items():
            if tags[i] == k:
                tags[i] = v
    ner_tags[j] = tags
df.ner_tags = ner_tags

  0%|          | 0/43489 [00:00<?, ?it/s]

  df.ner_tags = ner_tags


In [24]:
ner_tags = [list(ner) for ner in ner_tags]
ner_tags[0]

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 3,
 4,
 4,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

In [25]:
df[['source_words']] = "source_words"
source_words = [text.split(" ") for text in source_texts]
df.source_words = source_words

In [26]:
# removing rows where the len(tokenized_texts[i]) does not match len(ner_tags[i])
idx = [i for i in range(len(ner_tags)) if len(tokenized_texts[i]) != len(ner_tags[i])]
df = df.drop(index=idx).reset_index(drop=True)

In [27]:
import datasets
dataset = datasets.Dataset.from_pandas(df)
dataset

Dataset({
    features: ['unmasked_text', 'masked_text', 'privacy_mask', 'span_labels', 'tokenised_text', 'bio_labels', 'language', 'tt_lens', 'source_words'],
    num_rows: 43489
})

In [28]:
dataset[0]

{'unmasked_text': "A student's assessment was found on device bearing IMEI: 06-184755-866851-3. The document falls under the various topics discussed in our Optimization curriculum. Can you please collect it?",
 'masked_text': "A student's assessment was found on device bearing IMEI: [PHONEIMEI]. The document falls under the various topics discussed in our [JOBAREA] curriculum. Can you please collect it?",
 'privacy_mask': [{'end': 75,
   'label': 'PHONEIMEI',
   'start': 57,
   'value': '06-184755-866851-3'},
  {'end': 150, 'label': 'JOBAREA', 'start': 138, 'value': 'Optimization'}],
 'span_labels': '[[0, 57, "O"], [57, 75, "PHONEIMEI"], [75, 138, "O"], [138, 150, "JOBAREA"], [150, 189, "O"]]',
 'tokenised_text': ['A',
  'student',
  "'",
  's',
  'assessment',
  'was',
  'found',
  'on',
  'device',
  'bearing',
  'IM',
  '##E',
  '##I',
  ':',
  '06',
  '-',
  '1847',
  '##55',
  '-',
  '866',
  '##85',
  '##1',
  '-',
  '3',
  '.',
  'The',
  'document',
  'falls',
  'under',
  'th

In [29]:
def align_labels(example):
    tokenized_input = tokenizer(example["tokenised_text"], is_split_into_words=True)
    tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
    word_ids = tokenized_input.word_ids()
    aligned_labels = [-100 if i is None else example["bio_labels"][i] for i in word_ids]
    tokenized_input['labels'] = aligned_labels
    return tokenized_input

In [30]:
al = align_labels(dataset[0])
print(len(al['input_ids']), len(al['attention_mask']), len(al['labels']))

72 72 72


In [31]:
tokenizer("hello")

{'input_ids': [101, 7592, 102], 'token_type_ids': [0, 0, 0], 'attention_mask': [1, 1, 1]}

In [32]:
label_all_tokens = True
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer.encode_plus(examples["tokenised_text"], is_split_into_words=True, truncation=True, max_length=512)

    labels = []
    for i, label in enumerate(examples["bio_labels"]):
        word_ids = tokenized_inputs.word_ids(i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [33]:
x = dataset.map(align_labels, num_proc=8, remove_columns=dataset.column_names)

  self.pid = os.fork()


Map (num_proc=8):   0%|          | 0/43489 [00:00<?, ? examples/s]

In [34]:
tokenized_dataset = x.train_test_split(test_size=0.2)

In [35]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 34791
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 8698
    })
})

In [36]:
data_collator = DataCollatorForTokenClassification(tokenizer)

In [37]:
!pip install seqeval



In [38]:
metric = datasets.load_metric("seqeval")

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [all_labels[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [all_labels[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    flattened_results = {
        "overall_precision": results["overall_precision"],
        "overall_recall": results["overall_recall"],
        "overall_f1": results["overall_f1"],
        "overall_accuracy": results["overall_accuracy"],
    }

    for k in results.keys():
        if (k not in flattened_results.keys()):
            flattened_results[f"{k}_f1"] = results[k]["f1"]

    return flattened_results

  metric = datasets.load_metric("seqeval")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

In [39]:
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=len(all_labels), label2id=label2id, id2label=id2label,ignore_mismatched_sizes=True)
print(model.config)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at ab-ai/pii_model and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([63]) in the checkpoint and torch.Size([113]) in the model instantiated
- classifier.weight: found shape torch.Size([63, 768]) in the checkpoint and torch.Size([113, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertConfig {
  "_name_or_path": "ab-ai/pii_model",
  "architectures": [
    "BertForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "O",
    "1": "B-PHONEIMEI",
    "2": "I-PHONEIMEI",
    "3": "B-JOBAREA",
    "4": "I-JOBAREA",
    "5": "B-FIRSTNAME",
    "6": "B-VEHICLEVIN",
    "7": "I-VEHICLEVIN",
    "8": "I-FIRSTNAME",
    "9": "B-AGE",
    "10": "B-GENDER",
    "11": "I-GENDER",
    "12": "B-HEIGHT",
    "13": "I-HEIGHT",
    "14": "B-BUILDINGNUMBER",
    "15": "I-BUILDINGNUMBER",
    "16": "B-MASKEDNUMBER",
    "17": "I-MASKEDNUMBER",
    "18": "B-PASSWORD",
    "19": "I-PASSWORD",
    "20": "B-DOB",
    "21": "I-DOB",
    "22": "B-IPV6",
    "23": "I-IPV6",
    "24": "B-NEARBYGPSCOORDINATE",
    "25": "I-NEARBYGPSCOORDINATE",
    "26": "B-USERAGENT",
    "27": "I-USERAGENT",
    "28": "B-TIME",
  

In [40]:
# # Install Pytorch
# %pip install "torch==2.2.2" tensorboard

# # Install Hugging Face libraries
# %pip install  --upgrade "transformers==4.40.0" "datasets==2.18.0" "accelerate==0.29.3" "evaluate==0.4.1" "bitsandbytes==0.43.1" "huggingface_hub==0.22.2" "trl==0.8.6" "peft==0.10.0"

In [42]:
!pip install wandb

Collecting wandb
  Downloading wandb-0.17.0-py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.7/6.7 MB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
Collecting docker-pycreds>=0.4.0 (from wandb)
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting gitpython!=3.1.29,>=1.0.0 (from wandb)
  Downloading GitPython-3.1.43-py3-none-any.whl (207 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.3/207.3 kB[0m [31m25.3 MB/s[0m eta [36m0:00:00[0m
Collecting sentry-sdk>=1.0.0 (from wandb)
  Downloading sentry_sdk-2.2.0-py2.py3-none-any.whl (281 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m281.1/281.1 kB[0m [31m27.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting setproctitle (from wandb)
  Downloading setproctitle-1.3.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86

In [46]:
args = TrainingArguments(
    output_dir="Gagan_mishra_finetuned_ai4privacy",
    num_train_epochs=1,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    overwrite_output_dir=True,
    warmup_ratio=0.2,
    weight_decay=0.01,
    save_strategy='epoch',
    evaluation_strategy="epoch",
    load_best_model_at_end=True,
    save_total_limit=1,
    lr_scheduler_type='cosine_with_restarts',
    report_to='wandb',
    push_to_hub=False,
)

trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_dataset["test"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [48]:
train_result = trainer.train()
test_result = trainer.evaluate(tokenized_dataset['test'])


Epoch,Training Loss,Validation Loss,Overall Precision,Overall Recall,Overall F1,Overall Accuracy,Accountname F1,Accountnumber F1,Age F1,Amount F1,Bic F1,Bitcoinaddress F1,Buildingnumber F1,City F1,Companyname F1,County F1,Creditcardcvv F1,Creditcardissuer F1,Creditcardnumber F1,Currency F1,Currencycode F1,Currencyname F1,Currencysymbol F1,Date F1,Dob F1,Email F1,Ethereumaddress F1,Eyecolor F1,Firstname F1,Gender F1,Height F1,Iban F1,Ip F1,Ipv4 F1,Ipv6 F1,Jobarea F1,Jobtitle F1,Jobtype F1,Lastname F1,Litecoinaddress F1,Mac F1,Maskednumber F1,Middlename F1,Nearbygpscoordinate F1,Ordinaldirection F1,Password F1,Phoneimei F1,Phonenumber F1,Pin F1,Prefix F1,Secondaryaddress F1,Sex F1,Ssn F1,State F1,Street F1,Time F1,Url F1,Useragent F1,Username F1,Vehiclevin F1,Vehiclevrm F1,Zipcode F1
1,0.1407,0.104639,0.937475,0.94874,0.943074,0.968606,0.985581,0.966921,0.944099,0.92623,0.91906,0.914234,0.968575,0.966272,0.983165,0.990291,0.987567,0.98725,0.964581,0.758798,0.781538,0.022222,0.858854,0.959131,0.928189,0.990955,0.956012,0.944444,0.974047,0.972035,0.991736,0.965675,0.0,0.800306,0.83046,0.934505,0.986183,0.942907,0.950442,0.790576,0.989429,0.939636,0.938959,0.995683,0.972516,0.95709,0.987552,0.976271,0.954167,0.969565,0.989831,0.986175,0.993498,0.957725,0.960136,0.969492,0.986376,0.988286,0.981738,0.955381,0.953704,0.973973


In [54]:
train_metrics = train_result.metrics
test_metrics = {"eval_loss": test_result["eval_loss"]}



In [55]:
max_train_samples = len(tokenized_dataset['train'])
max_eval_samples = len(tokenized_dataset['test'])



In [56]:
train_metrics["train_samples"] = min(max_train_samples, len(tokenized_dataset['train']))
trainer.log_metrics("train", train_metrics)



***** train metrics *****
  epoch                    =        1.0
  total_flos               =   437080GF
  train_loss               =     0.1597
  train_runtime            = 0:09:05.59
  train_samples            =      34791
  train_samples_per_second =     15.942
  train_steps_per_second   =      7.971


In [57]:
test_metrics["eval_samples"] = min(max_eval_samples, len(tokenized_dataset['test']))
trainer.log_metrics("eval", test_metrics)



***** eval metrics *****
  eval_loss    = 0.1046
  eval_samples =   8698


In [58]:
trainer.save_metrics("train", train_metrics)
trainer.save_metrics("eval", test_metrics)



In [59]:
trainer.save_state()
trainer.save_model(args.output_dir)

In [None]:
# using the model
