In [6]:
import json
import os
from gliner import GLiNER
import torch
# @title Fast Metrics

# Load the test set (test.json) and its baseline variant (test_data_baseline.json).
# The encoding is passed explicitly so the read does not depend on the
# platform's default locale encoding (JSON text is expected to be UTF-8).
with open('anon_data/test.json', 'r', encoding='utf-8') as file:
    test_data = json.load(file)

with open('anon_data/test_data_baseline.json', 'r', encoding='utf-8') as file:
    test_data_baseline = json.load(file)


# Extract all labels from each example

def get_all_labels(data):
    """Return the distinct entity labels found in ``data``, in first-seen order.

    Args:
        data: Iterable of example dicts. An example may carry an ``"ner"``
            entry holding entity sequences whose third element (index 2) is
            the label. Examples with a missing or ``None`` ``"ner"`` entry
            contribute nothing.

    Returns:
        list: Each label once, ordered by first occurrence. Labels must be
        hashable (they are used as dict keys here).
    """
    # A dict acts as an insertion-ordered set, so each membership check is a
    # hash lookup instead of a scan over the labels collected so far.
    seen = {}
    for example in data:
        # `or []` also covers an explicit ``"ner": None``; the default of
        # ``.get`` alone only applies when the key is absent.
        for entity in example.get("ner") or []:
            seen.setdefault(entity[2], None)
    return list(seen)

# Label vocabularies of the two test sets.
all_test_labels = get_all_labels(test_data)
all_test_baseline_labels = get_all_labels(test_data_baseline)

# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device, all_test_labels, all_test_baseline_labels

(device(type='cuda', index=0),
 ['a_name', 'a_organisation', 'a_place'],
 ['person', 'organization', 'place'])

In [7]:
# Location the anonymizer checkpoint is loaded from (no hub download is
# allowed because of local_files_only=True).
model_anonv0_path = 'models_anonymizer/AnonymizerV0'

# NOTE(review): a previous run of this cell failed with LocalEntryNotFoundError;
# presumably the path did not resolve from the kernel's working directory —
# confirm before relying on the cells below.
model_anonv0 = GLiNER.from_pretrained(
    model_anonv0_path,
    load_tokenizer=True,
    local_files_only=True,
).to(device)

LocalEntryNotFoundError: Cannot find an appropriate cached snapshot folder for the specified revision on the local disk and outgoing traffic has been disabled. To enable repo look-ups and downloads online, pass 'local_files_only=False' as input.

In [7]:
# Evaluate model_anonv0 on the whole test set using the test set's own labels.
results, f1 = model_anonv0.evaluate(
    test_data,
    flat_ner=True,
    threshold=0.5,
    batch_size=1,
    entity_types=all_test_labels,
)
# First line: the returned F1 value; remaining text: the report returned by evaluate().
output_info = "\n".join([f"F1 Score: {f1:.2f}", results])
print(output_info)

F1 Score: 0.99
P: 99.55%	R: 99.24%	F1: 99.40%



In [11]:
# First baseline: load the published checkpoint and move it to the selected device.
model_baseline1 = GLiNER.from_pretrained(
    'knowledgator/gliner-multitask-large-v0.5'
).to(device)

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

In [15]:
# Evaluate the first baseline on the baseline test set with the baseline label names.
results, f1 = model_baseline1.evaluate(
    test_data_baseline,
    flat_ner=True,
    threshold=0.5,
    batch_size=1,
    entity_types=all_test_baseline_labels,
)
output_info = "\n".join([f"F1 Score: {f1:.2f}", results])
print(output_info)

F1 Score: 0.43
P: 32.30%	R: 66.46%	F1: 43.48%



In [8]:
# Second baseline: urchade/gliner_multi_pii-v1, moved to the selected device.
model_baseline2 = GLiNER.from_pretrained(
    'urchade/gliner_multi_pii-v1'
).to(device)

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

In [9]:
# Evaluate the second baseline on the baseline test set with the baseline label names.
results, f1 = model_baseline2.evaluate(
    test_data_baseline,
    flat_ner=True,
    threshold=0.5,
    batch_size=1,
    entity_types=all_test_baseline_labels,
)
output_info = "\n".join([f"F1 Score: {f1:.2f}", results])
print(output_info)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


F1 Score: 0.43
P: 31.75%	R: 66.91%	F1: 43.07%



# Do Inference on one sample

In [26]:
def join_tokens(tokens):
    """Rebuild a text string from a token sequence.

    Tokens are separated by a single space, except that the punctuation
    tokens ``, . ! ? : ; ...`` are glued onto the text before them.
    Leading and trailing whitespace of the result is stripped.

    Adapted from Gliner_Studio:
    https://colab.research.google.com/drive/1Kl3TrpiGBpMw569ek_AL6Ee3uqBK-Gfw?usp=sharing

    Args:
        tokens: Iterable of string tokens.

    Returns:
        str: The joined text.
    """
    attach_left = {",", ".", "!", "?", ":", ";", "..."}
    joined = ""
    for tok in tokens:
        if tok not in attach_left:
            # Ordinary token: always preceded by one space.
            joined = " ".join((joined, tok))
        else:
            # Punctuation: drop any trailing whitespace, then append directly.
            joined = joined.rstrip() + tok
    return joined.strip()

# Take the first test example and rebuild a readable string from its tokens.
example_index = 0
text = join_tokens(test_data[example_index]["tokenized_text"])
# Display the joined text together with the example's "ner" annotations.
text, test_data[example_index]["ner"]

('11 ) unter anderem strafbar, wer sich die Marke eines anderen anmasst oder diese nachmacht oder nachahmt ( lit. a ) oder wer unter der angemassten, nachgemachten oder nachgeahmten Marke Waren in Verkehr setzt, solche Waren anbietet, ein -, aus - oder durchführt, sie zum Zweck des Inverkehrbringens lagert oder für sie wirbt ( lit. b ). Vor diesem Hintergrund ist nicht ohne Weiteres klar, worin das Interesse der Beschwerdeführerin an der Herausgabe dieser nur sehr begrenzt verkehrsfähigen Gegenstände liegen soll. Auch wenn sie als beschuldigte Person prinzipiell zur Beschwerde in Strafsachen legitimiert ist, hätte sie sich daher dazu äussern müssen, woraus sie ihr rechtlich geschütztes Interesse an der Anfechtung der Einziehungen ableitet. Wo sie nicht offensichtlich ist, bezieht sich die Begründungsobliegenheit von Art. 42 Abs. 2 BGG auch auf die Legitimation ( vgl. BGE 141 IV 1 E. 1. 1 mit Hinweisen; Urteil 7B_183 / 2023 vom 26. Juli 2023 E. 1. 3 ). Da die Beschwerdeführerin dieser O

In [24]:
def view(tokenized_text, ners):
    """Print every annotated span as ``label: [tokens]``.

    Args:
        tokenized_text: Sequence of tokens that the span indices refer to.
        ners: Iterable of ``(start, end, label)`` triples; ``end`` is
            inclusive.
    """
    for start, end, label in ners:
        span_tokens = tokenized_text[start:end + 1]
        print(f"{label}: {span_tokens}")

# Print the annotated spans of the currently selected example.
view(test_data[example_index]["tokenized_text"], test_data[example_index]["ner"])

a_name: ['Michal', 'Lutz']
a_name: ['Michal', 'Lutz']
a_name: ['Belal', 'Fernandez']


In [30]:
example_index = 15

text = join_tokens(test_data[example_index]["tokenized_text"])
expected_ner = test_data[example_index]["ner"]

# Prompt the model with every label seen in the test set
labels = all_test_labels
print(labels)

# Run span prediction on the joined text
entities = model_anonv0.predict_entities(text, labels, threshold=0.5)

# Annotated spans first, then the model's predictions
print("Expected Entities:")
view(test_data[example_index]["tokenized_text"], expected_ner)

print("\nPredicted Entities:")
for entity in entities:
    print(f'{entity["text"]} => {entity["label"]}')

['a_name', 'a_organisation', 'a_place']
Expected Entities:
a_name: ['Chloé', 'Fendt-Newlin']
a_name: ['Chloé', 'Fendt-Newlin']
a_name: ['Chloé', 'Fendt-Newlin']
a_name: ['Chloé', 'Fendt-Newlin']

Predicted Entities:
Chloé Fendt-Newlin => a_name
Chloé Fendt-Newlin => a_name
Chloé Fendt-Newlin => a_name
Chloé Fendt-Newlin => a_name


In [31]:
# count all labels in the test dataset
def count_labels(data):
    """Count how many entities carry each label across ``data``.

    Args:
        data: Iterable of example dicts. An example may carry an ``"ner"``
            entry holding entity sequences whose third element (index 2) is
            the label. Examples with a missing or ``None`` ``"ner"`` entry
            contribute nothing.

    Returns:
        dict: Maps label to its number of occurrences, with keys in
        first-seen order.
    """
    label_count = {}
    for example in data:
        # `or []` also covers an explicit ``"ner": None``; the default of
        # ``.get`` alone only applies when the key is absent.
        for entity in example.get("ner") or []:
            label = entity[2]
            label_count[label] = label_count.get(label, 0) + 1
    return label_count

# Entity counts per label over the full test set; displayed as the cell output.
label_count = count_labels(test_data)
label_count

{'a_name': 49722, 'a_organisation': 6945, 'a_place': 1897}

In [48]:


# Create test data for one specific label (keep only that label's entities; drop examples that do not contain the label)
def create_test_data_for_label(data, label):
    """Build a copy of ``data`` restricted to a single entity label.

    Each kept example contains only the entities whose label (index 2)
    equals ``label``. Examples without any such entity are dropped, so the
    result contains no examples that lack the label.

    Args:
        data: Iterable of example dicts with a ``"tokenized_text"`` entry and
            an optional ``"ner"`` entry (missing or ``None`` means no
            entities).
        label: The label to keep.

    Returns:
        list: New dicts with keys ``"tokenized_text"`` and ``"ner"``. The
        token list and the entity objects are the originals, not copies.
    """
    test_data_label = []
    for example in data:
        # `or []` also covers an explicit ``"ner": None``; the default of
        # ``.get`` alone only applies when the key is absent.
        matching = [entity for entity in example.get("ner") or [] if entity[2] == label]
        if matching:
            test_data_label.append({"tokenized_text": example["tokenized_text"], "ner": matching})
    return test_data_label

# Build one filtered test set per label, keyed by label name.
labelled_test_data = dict() 
for label in all_test_labels:
    labelled_test_data[label] = create_test_data_for_label(test_data, label)

# Report how many examples remain in each per-label test set.
for k,v in labelled_test_data.items():
    print(k, len(v))

a_name 13902
a_organisation 2817
a_place 1121


In [49]:
# Inspect the first example of the 'a_place' subset (prints the full token list).
labelled_test_data['a_place'][0]

{'tokenized_text': ['Participants',
  'à',
  'la',
  'procédure',
  'Stefan',
  'Michaud',
  ',',
  'représenté',
  'par',
  'Me',
  'Manuela',
  'Ryter',
  'Godel',
  ',',
  'avocate',
  ',',
  'recourant',
  ',',
  'contre',
  'Ministère',
  'public',
  'central',
  'du',
  'canton',
  'de',
  'Vaud',
  ',',
  'avenue',
  'de',
  'Longemalle',
  '1',
  ',',
  '1020',
  'Renens',
  'VD',
  ',',
  'intimé',
  '.',
  'Objet',
  'Ordonnance',
  'de',
  'classement',
  '(',
  'mise',
  'en',
  'danger',
  'de',
  'la',
  'vie',
  'd',
  "'",
  'autrui',
  ',',
  'abus',
  'd',
  "'",
  'autorité',
  ')',
  ',',
  'recours',
  'contre',
  'l',
  "'",
  'arrêt',
  'du',
  'Tribunal',
  'cantonal',
  'du',
  'canton',
  'de',
  'Vaud',
  ',',
  'Chambre',
  'des',
  'recours',
  'pénale',
  ',',
  'du',
  '17',
  'février',
  '2020',
  '(',
  '.',
  '.',
  '.',
  '[',
  'PE1',
  ']',
  ')',
  '.',
  'Faits',
  ':',
  'A',
  '.',
  'Le',
  '7',
  'mars',
  '2019',
  ',',
  'à',
  'Bottmingen'

In [50]:
# Evaluate model_anonv0 separately on each label's filtered test set, asking
# the model for that single label only.
# `results` maps label -> the report returned by evaluate(); `f1` is
# overwritten on every iteration, so only the last label's value survives.
results = {}

for label, data in labelled_test_data.items():
    results[label], f1 = model_anonv0.evaluate(data, flat_ner=True, threshold=0.5, batch_size=1, entity_types=[label])
    print(f"Label: {label}, Results: {results[label]}")


Label: a_name, Results: P: 99.78%	R: 99.43%	F1: 99.60%

Label: a_organisation, Results: P: 96.70%	R: 99.16%	F1: 97.92%

Label: a_place, Results: P: 83.95%	R: 98.15%	F1: 90.50%



In [53]:
# Re-run the per-label evaluation with a stricter threshold (0.95) for every
# label except "a_name", which is skipped entirely.
results = {}

for label, data in labelled_test_data.items():
    # This default is never used: "a_name" hits `continue` below and every
    # other label overwrites it with 0.95.
    # NOTE(review): if "a_name" was meant to be evaluated at 0.5, the
    # `continue` prevents that — confirm the intent.
    threshold = 0.5
    print(f"Label: {label}")
    if label == "a_name":
        continue
    else:
        threshold = 0.95
    results[label], f1 = model_anonv0.evaluate(data, flat_ner=True, threshold=threshold, batch_size=1, entity_types=[label])
    print(f"Results: {results[label]}")


Label: a_name
Label: a_organisation
Results: P: 97.15%	R: 99.09%	F1: 98.11%

Label: a_place
Results: P: 88.28%	R: 98.10%	F1: 92.93%



: 

In [40]:
import pandas as pd
import ast

# Load the per-language `*_ood.csv` files into a dict of DataFrames.
ood_data = {}
keys = ['french', 'german', 'italian']

for key in keys:
    ood_data[key] = pd.read_csv(f'data/{key}_ood.csv')
    # The 'entities' and 'labels' cells are read as text; parse each one
    # back into a Python literal with ast.literal_eval.
    ood_data[key]['entities'] = ood_data[key]['entities'].apply(ast.literal_eval)
    ood_data[key]['labels'] = ood_data[key]['labels'].apply(ast.literal_eval)

ood_data['french'].head()

Unnamed: 0,text,entities,labels
0,La société Énergie Verte SA et l'association S...,"[Énergie Verte SA, Solidarité Environnement, L...","[a_organisation, a_organisation, a_place]"
1,Madame Claire Dupont et Monsieur Jean-Louis Ma...,"[Claire Dupont, Jean-Louis Martin]","[a_name, a_name]"
2,Un contrat a été signé entre l'organisation Mé...,"[Médecins Sans Frontières, Conseil Municipal d...","[a_organisation, a_place]"
3,"Monsieur Paul Durand, représentant de l'associ...","[Paul Durand, Culture et Patrimoine]","[a_name, a_organisation]"
4,La Fondation pour la Recherche Médicale et l'u...,"[Fondation pour la Recherche Médicale, univers...","[a_organisation, a_organisation, a_place]"


In [42]:
# Take up to 5 random samples from each ood dataset and test the model.
# A fixed random_state keeps the drawn rows (and so this cell's output) the
# same across re-runs; change or remove it to look at other rows.

for key, data in ood_data.items():
    print(f"Key: {key}")
    # Cap the sample size so a dataset with fewer than 5 rows does not raise.
    data = data.sample(min(5, len(data)), random_state=42)
    for index, row in data.iterrows():
        text = row['text']
        # NOTE(review): the model is prompted with this row's own annotated
        # labels (duplicates included), so it is only asked for entity types
        # the row is annotated with — confirm this is intended.
        labels = row['labels']
        entities = model_anonv0.predict_entities(text, labels, threshold=0.5)
        print(f"Text: {text}")
        print("Predicted Entities:")
        for entity in entities:
            print(entity["text"], "=>", entity["label"])
        print("True Entities:")
        for entry in row['entities']:
            print(entry)
        print("\n")

Key: french
Text: Madame Anne Leroy et Monsieur Jacques Rolland ont été choisis pour présider le comité de l'événement.
Predicted Entities:
Anne Leroy => a_name
Jacques Rolland => a_name
True Entities:
Anne Leroy
Jacques Rolland


Text: La collaboration entre le Parc National des Pyrénées et le Ministère de la Culture a été officialisée.
Predicted Entities:
True Entities:
Parc National des Pyrénées
Ministère de la Culture


Text: Le contrat entre l'entreprise SolarFrance et la Région PACA a été finalisé.
Predicted Entities:
SolarFrance => a_organisation
True Entities:
SolarFrance
Région PACA


Text: Le groupe Développement Durable et la ville de Rouen ont validé leur partenariat pour l'année à venir.
Predicted Entities:
Développement Durable => a_organisation
Rouen => a_place
True Entities:
Développement Durable
ville de Rouen
Rouen


Text: Le contrat entre l'association Humanité et Progrès et l'Université de Strasbourg a été ratifié sans amendement.
Predicted Entities:
Strasbourg => a