<a href="https://colab.research.google.com/github/mit1280/fined-tuning/blob/main/Fine_Tune_GLiNER_Token_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Install required libraries

In [None]:
# Install GLiNER and Hugging Face `datasets`, then upgrade `accelerate` (-q keeps pip output quiet).
# NOTE(review): no versions are pinned, so a later run may resolve to different library releases.
! pip install -q gliner datasets
! pip install -qU accelerate

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/46.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.6/46.6 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/527.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.3/527.3 kB[0m [31m33.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m39.9/39.9 MB[0m [31m20.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.2/13.2 MB[0m [31m105.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

## Import libraries

In [None]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "true"
import re
import torch
from gliner import GLiNERConfig, GLiNER
from gliner.training import Trainer, TrainingArguments
from gliner.data_processing.collator import DataCollatorWithPadding, DataCollator
from gliner.utils import load_config_as_namespace
from gliner.data_processing import WordsSplitter, GLiNERDataset
import json
import ast
import random

## Load dataset

In [None]:
from datasets import load_dataset

# Fetch the dataset by its Hub identifier; later cells index the result by split name
# (ds['train'], ds['test']).
ds = load_dataset("gretelai/synthetic_pii_finance_multilingual")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/14.7k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/48.4M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/5.42M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/50346 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5594 [00:00<?, ? examples/s]

## Functions for data preprocessing

In [None]:
def tokenize_text(text):
    """Split ``text`` into word and punctuation tokens.

    A token is either a run of word characters, optionally chained with
    ``-`` or ``_`` (so ``Cameron-Mcknight`` stays in one piece), or a single
    non-whitespace character. Whitespace itself never appears in the result.

    Returns:
        list[str]: the tokens in order of appearance.
    """
    token_pattern = re.compile(r'\w+(?:[-_]\w+)*|\S')
    return token_pattern.findall(text)

def tokenize_with_positions(text):
    """Tokenize ``text`` and record where each token sits in the original string.

    Returns:
        tuple: ``(tokens, tokens_with_positions)`` where ``tokens`` is the list
        produced by ``tokenize_text`` and ``tokens_with_positions`` is a
        parallel list of dicts with keys ``"token"``, ``"start"`` and ``"end"``
        (``end`` is ``start + len(token)``, i.e. exclusive).
    """
    tokens = tokenize_text(text)
    located = []
    # The search always resumes right after the previous token, so a token that
    # occurs several times is matched to successive occurrences, not the first one.
    cursor = 0

    for tok in tokens:
        begin = text.find(tok, cursor)
        cursor = begin + len(tok)
        located.append({"token": tok, "start": begin, "end": cursor})

    return tokens, located

def map_entities_to_tokens(example):
    """Convert the character-level PII spans of one record into token-level spans.

    Args:
        example: Mapping with 'generated_text' (the raw text) and 'pii_spans'
            (a string holding a Python-literal list of dicts with 'start',
            'end' and 'label' keys, or that list already parsed).

    Returns:
        The same ``example`` object with two keys added:
        'tokenized_text' - the list of token strings;
        'ner' - ``str()`` of a list of ``(start_token_idx, end_token_idx, label)``
        tuples, where the end index is inclusive.

    Entities whose start or end offset does not coincide with a token boundary
    are dropped silently, so 'ner' can hold fewer items than 'pii_spans'.
    """
    text = example['generated_text']
    raw_spans = example['pii_spans']
    # The dataset stores the spans as a string; also accept an already-parsed list.
    entities = ast.literal_eval(raw_spans) if isinstance(raw_spans, str) else raw_spans
    tokens, tokens_with_positions = tokenize_with_positions(text)

    # Index the token boundaries once so each entity is resolved with two dict
    # lookups instead of a scan over every token.
    start_to_idx = {info['start']: idx for idx, info in enumerate(tokens_with_positions)}
    end_to_idx = {info['end']: idx for idx, info in enumerate(tokens_with_positions)}

    spans = []
    for entity in entities:
        entity_start, entity_end, label = entity['start'], entity['end'], entity['label']
        token_start_idx = start_to_idx.get(entity_start)
        token_end_idx = end_to_idx.get(entity_end)

        # Keep the entity only when both of its edges land exactly on token edges.
        if token_start_idx is not None and token_end_idx is not None:
            spans.append((token_start_idx, token_end_idx, label))

    example['tokenized_text'] = tokens
    # Stored as a string; callers parse it back with ast.literal_eval.
    example['ner'] = str(spans)

    return example

## See training data

In [None]:
# Take the first training record as a worked example for the checks in the next cells.
example = ds['train'][0]

In [None]:
text = example['generated_text']
tokens = tokenize_text(text)
# 'pii_spans' is a string, so parse it into a list of dicts with 'start', 'end' and 'label'.
entities = ast.literal_eval(example['pii_spans'])
# Adds 'tokenized_text' and 'ner' to the record; 'ner' is a stringified list, hence the parse below.
example = map_entities_to_tokens(example)
spans = ast.literal_eval(example['ner'])

In [None]:
# Show each annotated entity as it appears in the raw text (character offsets).
# zip stops at the shorter list, so at most len(spans) entities are printed.
for entity, _span in zip(entities, spans):
    char_start, char_end = entity['start'], entity['end']
    print(text[char_start:char_end])

1st day of March, 2021
Cameron-Mcknight
81685 Lopez Lodge, Apt. 6502
Cameron-Mcknight
Jann N. Butte
81685 Lopez Lodge, Apt. 6502
Cameron-Mcknight
Cameron-Mcknight
Cameron-Mcknight
Cameron-Mcknight
Cameron-Mcknight
Cameron-Mcknight
Cameron-Mcknight
one (1) year


In [None]:
# Show the same entities rebuilt from tokens; each span's end index is inclusive.
for first_idx, last_idx, _label in spans:
    print(" ".join(tokens[first_idx:last_idx + 1]))

1st day of March , 2021
Cameron-Mcknight
81685 Lopez Lodge , Apt . 6502
Cameron-Mcknight
Jann N . Butte
81685 Lopez Lodge , Apt . 6502
Cameron-Mcknight
Cameron-Mcknight
Cameron-Mcknight
Cameron-Mcknight
Cameron-Mcknight
Cameron-Mcknight
Cameron-Mcknight
one ( 1 ) year


In [None]:
# Sanity check: ignoring spaces, the characters of every entity should equal its
# tokens joined together. Anything printed here is a mismatch.
# NOTE(review): map_entities_to_tokens drops entities that do not align with token
# boundaries, so the zip pairing can be shifted whenever one was dropped - confirm
# that both lists have the same length before trusting an empty output.
for entity, span in zip(ast.literal_eval(example['pii_spans']), spans):
    char_text = text[entity['start']:entity['end']]
    token_slice = tokens[span[0]:span[1] + 1]
    if char_text.replace(" ", "") != "".join(token_slice):
        print(char_text, " ".join(token_slice))

## Clean and transform data

In [None]:
# 'pii_spans' is a string here (other cells parse it with ast.literal_eval), so len()
# counts characters: records are kept when that string is longer than 3 characters,
# presumably to skip the empty list "[]". The columns removed afterwards are metadata
# that the remaining cells do not read from filtered_ds.
filtered_ds = ds.filter(lambda x: len(x['pii_spans']) > 3).remove_columns([
        'level_0', 'document_type', 'document_description', 'expanded_type',
        'expanded_description', 'language', 'language_description', 'domain',
        'conformance_score', 'quality_score', 'toxicity_score', 'bias_score',
        'groundedness_score'
    ])

Filter:   0%|          | 0/50346 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5594 [00:00<?, ? examples/s]

In [None]:
# Run map_entities_to_tokens over every record of each split; this adds the
# 'tokenized_text' and 'ner' columns.
filtered_ds = filtered_ds.map(map_entities_to_tokens)
filtered_ds

Map:   0%|          | 0/46890 [00:00<?, ? examples/s]

Map:   0%|          | 0/5240 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['index', 'generated_text', 'pii_spans', 'tokenized_text', 'ner'],
        num_rows: 46890
    })
    test: Dataset({
        features: ['index', 'generated_text', 'pii_spans', 'tokenized_text', 'ner'],
        num_rows: 5240
    })
})

In [None]:
# 'ner' holds str(spans), so len() counts characters; "> 3" presumably skips records
# whose span list came out empty ("[]"). Only 'tokenized_text' and 'ner' are kept.
final_df = filtered_ds.filter(lambda x: len(x['ner']) > 3).remove_columns(['index', 'generated_text', 'pii_spans'])

Filter:   0%|          | 0/46890 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5240 [00:00<?, ? examples/s]

In [None]:
# Turn the 'train' split into a plain Python list of records
train_dataset = []

# Each record keeps the tokens and gets 'ner' parsed back from its string form
for example in final_df['train']:
    train_dataset.append({'tokenized_text': example['tokenized_text'], 'ner': ast.literal_eval(example['ner'])})

In [None]:
# Turn the 'test' split into a plain Python list of records
test_dataset = []

# Each record keeps the tokens and gets 'ner' parsed back from its string form
for example in final_df['test']:
    test_dataset.append({'tokenized_text': example['tokenized_text'], 'ner': ast.literal_eval(example['ner'])})

In [None]:
# Print every field of the first test record, one "name value" pair per line.
for field_name, field_value in test_dataset[0].items():
    print(field_name, field_value)

tokenized_text ['UNB', '+', 'UNOC', ':', '300', '+', 'IA1234567890', ':', '1234567890', '+', 'IA9876543210', ':', '9876543210', '+', '161223', ':', '1230', '+', '0001', '+', 'EA', '+', 'UNH', '+', 'SHP', '+', '20121212121212', ':', '1234567890', '+', 'SHP', ':', 'D', ':', '9B', ':', 'UN', ':', 'EAN008', ':', '2', ':', '200', '+', 'IA1234567890', '+', 'IA9876543456', 'BGM', '+', '220', '+', '345678901234567890', '+', '161223', '+', '1230', '+', '1234567890X', '+', '233', 'NAD', '+', 'BY', '+', '9876543210', '+', '123', 'Main', 'Street', '+', 'London', '+', 'EC3A', '8DS', '+', 'UK', '+', 'GB', '+', '8888888888888', 'NAD', '+', 'SU', '+', '1234567890', '+', '030', 'Campbell', 'Motorway', '+', 'London', '+', 'E14', '5JP', '+', 'UK', '+', 'GB', '+', '8888888888888', 'NAD', '+', 'DP', '+', '1234567890', '+', '3512', 'West', '12th', 'Street', '+', 'New', 'York', '+', 'NY', '+', '10011', '+', 'US', '+', '1', '+', '8888888888888', 'LIN', '+', '1', '+', '1234567890', '+', '20', '+', '1', '+', 'E

## Train

In [None]:
# Use the first CUDA GPU when one is available; otherwise fall back to the CPU.
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')

# Load the pretrained model that the training cell below fine-tunes.
model = GLiNER.from_pretrained("urchade/gliner_small")

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.84k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/611M [00:00<?, ?B/s]

gliner_config.json:   0%|          | 0.00/477 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/578 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

  state_dict = torch.load(model_file, map_location=torch.device(map_location))


In [None]:
# Move the model to the selected device. The print keeps model.to(...) from being the
# cell's last expression, so its return value is not echoed as the cell output.
model.to(device)
print("done")

done


In [None]:
# Build the collator from the model's own config and data processor; it is handed to the
# Trainer below. prepare_labels=True presumably makes it emit training labels - confirm
# against the GLiNER collator documentation.
data_collator = DataCollator(model.config, data_processor=model.data_processor, prepare_labels=True)

In [None]:
# Derive the number of epochs from a target optimisation-step budget.
num_steps = 500
batch_size = 8
data_size = len(train_dataset)
# Clamp to 1 so a dataset smaller than one batch does not make the division
# below raise ZeroDivisionError.
num_batches = max(1, data_size // batch_size)
# At least one full epoch is always run, so when num_batches exceeds num_steps
# the actual number of steps is larger than num_steps.
num_epochs = max(1, num_steps // num_batches)

training_args = TrainingArguments(
    output_dir="models",  # checkpoints are written under this directory
    learning_rate=5e-6,
    weight_decay=0.01,
    others_lr=1e-5,
    others_weight_decay=0.01,
    lr_scheduler_type="linear", #cosine
    warmup_ratio=0.1,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    focal_loss_alpha=0.75,
    focal_loss_gamma=2,
    num_train_epochs=num_epochs,
    # NOTE(review): no eval_steps is given, so the evaluation interval is whatever
    # the library defaults to - confirm it is the intended cadence.
    evaluation_strategy="steps",
    save_steps = 100,  # a checkpoint every 100 steps...
    save_total_limit=10,  # ...keeping at most 10 of them
    dataloader_num_workers = 0,
    use_cpu = False,
    report_to="none",
    )

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=model.data_processor.transformer_tokenizer,
    data_collator=data_collator,
)

trainer.train()

In [None]:
# Reload a checkpoint written under the training output directory ("models").
# NOTE(review): the step number 2300 is hard-coded; it has to match a checkpoint
# directory that the training run actually produced and that is still on disk.
trained_model = GLiNER.from_pretrained("models/checkpoint-2300", load_tokenizer=True)

config.json not found in /content/models/checkpoint-2300
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
  state_dict = torch.load(model_file, map_location=torch.device(map_location))


In [None]:
# NOTE(review): AutoModel, AutoTokenizer, create_repo and Repository are imported but
# not used in this cell; only login is called.
from transformers import AutoModel, AutoTokenizer
from huggingface_hub import create_repo, Repository, login

# Log in to Hugging Face
login()

# Upload the reloaded checkpoint to the Hub under this repository name.
trained_model.push_to_hub("gliner-fine-tuned-pii-finance-multilingual")

## Model Test

In [None]:
# Pick one record from the unfiltered test split (all original columns are still present)
# and display it.
example = ds['test'][5]
example

{'level_0': 31725,
 'index': 31725,
 'document_type': 'Loan Application',
 'document_description': 'A detailed form completed by an individual or business applying for a loan, including personal and financial information.',
 'expanded_type': 'Student',
 'expanded_description': 'A student applying for an educational loan. Gather information about the educational institution, program of study, and future career plans.',
 'language': 'English',
 'language_description': 'English language as spoken in the United States, the UK, or Canada',
 'domain': 'finance',
 'generated_text': "Loan Application\n\nFull Legal Name: Luigi Clelia Togliatti\nDate of Birth: 11/27/1967\n\nMailing Address:\n4893 Justin Terrace\n[City, State, Zip Code]\n\nPhone Number: [(123) 456-7890]\nEmail Address: [luigi.togliatti@email.com]\n\nEducational Institution: University of Toronto\nExpected Graduation Date: [Graduation Year]\n\nProgram of Study: Bachelor of Science in Computer Science\n\nFuture Career Plans: After 

### Test with fine tuned model

In [None]:
# Load the fine-tuned model by its Hub repository id.
fine_tuned_model = GLiNER.from_pretrained("Mit1208/gliner-fine-tuned-pii-finance-multilingual")

Fetching 9 files:   0%|          | 0/9 [00:00<?, ?it/s]

.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/78.0 [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/86.0 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

gliner_config.json:   0%|          | 0.00/3.23k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/8.65M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/970 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/611M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.91k [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
  state_dict = torch.load(model_file, map_location=torch.device(map_location))


In [None]:
text = example['generated_text']

# Entity types the model is asked to look for
labels = ["street_address", "company", "date_of_birth", "email", "date", "name"] # for v2.1 use capital case for better performance

# Run the fine-tuned model on the text with a 0.85 threshold
entities = fine_tuned_model.predict_entities(text, labels, threshold=0.85)

# One line per prediction: matched text, label, then character offsets
for entity in entities:
    print(
        "(", entity["text"], "=>", entity["label"],
        ") (start & end ==>", entity["start"], "&", entity["end"], ")",
    )

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


( Luigi Clelia Togliatti => name ) (start & end ==> 35 & 57 )
( 11/27/1967 => date_of_birth ) (start & end ==> 73 & 83 )
( 4893 Justin Terrace => street_address ) (start & end ==> 102 & 121 )
( luigi.togliatti@email.com => email ) (start & end ==> 194 & 219 )
( Luigi Clelia Togliatti => name ) (start & end ==> 842 & 864 )


### Test with base model

In [None]:
# Load the original pretrained model again, under a separate name, as the baseline
# for the comparison below.
base_model = GLiNER.from_pretrained("urchade/gliner_small")

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

README.md:   0%|          | 0.00/4.84k [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

gliner_config.json:   0%|          | 0.00/477 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/611M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/578 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



In [None]:
text = example['generated_text']

# Entity types the model is asked to look for
labels = ["street_address", "company", "date_of_birth", "email", "date", "name"] # for v2.1 use capital case for better performance

# Run the base model on the text with a 0.85 threshold
entities = base_model.predict_entities(text, labels, threshold=0.85)

# One line per prediction: matched text, label, then character offsets
for entity in entities:
    print(
        "(", entity["text"], "=>", entity["label"],
        ") (start & end ==>", entity["start"], "&", entity["end"], ")",
    )

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


( Luigi Clelia Togliatti => name ) (start & end ==> 35 & 57 )
( Luigi Clelia Togliatti => name ) (start & end ==> 842 & 864 )


## References
1. https://github.com/urchade/GLiNER/blob/main/examples/finetune.ipynb
2. https://github.com/urchade/GLiNER/blob/main/examples/synthetic_data_generation.ipynb

## Test dataset creation process

In [None]:
# Scan the filtered test split for the record whose 'index' is 36927 and keep its
# text and parsed spans. If no record matches, text and entities are left untouched.
for row in filtered_ds['test']:
    if row['index'] != 36927:
        continue
    text = row['generated_text']
    entities = ast.literal_eval(row['pii_spans'])
    break

In [None]:
# Hand-written toy input: "MIT" occurs twice (characters 5-8 and 16-19), so it shows
# whether each span is mapped to the right occurrence. This overwrites text, tokens
# and entities set by earlier cells.
text = "I am MIT PATEL. MIT is one of the best University"
tokens = tokenize_text(text)
entities = ast.literal_eval("""[{'start': 5, 'end': 8, 'label': 'first_name'},
 {'start': 9, 'end': 14, 'label': 'last_name'},
 {'start': 16, 'end': 19, 'label': 'univeristy'}]""")

In [None]:
# Display whatever list is currently bound to `entities`.
# NOTE(review): the saved output of this cell lists 'name' / 'api_key' / 'ipv4' spans,
# which is not the toy list defined in the cell just above - the cells were apparently
# executed out of order; re-run from a fresh kernel to confirm.
entities

[{'start': 445, 'end': 453, 'label': 'name'},
 {'start': 898, 'end': 934, 'label': 'api_key'},
 {'start': 1179, 'end': 1190, 'label': 'ipv4'},
 {'start': 1627, 'end': 1635, 'label': 'name'}]

In [None]:
# map_entities_to_tokens takes one record dict (with 'pii_spans' as a string) and
# returns that record, so wrap the inputs and unpack the two result fields instead
# of passing (text, entities) and unpacking a tuple.
mapped_example = map_entities_to_tokens({'generated_text': text, 'pii_spans': str(entities)})
tokens, spans = mapped_example['tokenized_text'], ast.literal_eval(mapped_example['ner'])
spans