In [41]:
from datasets import load_dataset
import os
import json
from collections import Counter
import numpy as np

from utils import join_tokens

import random

In [None]:
pretrain_dir = f"data_pretrain/"

In [25]:
def explore_dataset(data : list[dict]):
    print(f"length : {len(data)}")
    print(f"avg tokenized text length : {np.mean([len(sample['tokenized_text']) for sample in data])}")
    print(f"max tokenized text length : {max([len(sample['tokenized_text']) for sample in data])}")
    print(f"min tokenized text length : {min([len(sample['tokenized_text']) for sample in data])}")
    labels = set([ner[2] for sample in data for ner in sample['ner']])
    print(f"labels : {labels}")

In [37]:
def get_label_dict(data : list[dict],most_common = 0):
    output =  Counter([ner[2] for sample in data for ner in sample['ner']])

    if most_common:
        return output.most_common(most_common)
    
    else:
        return output



In [None]:
def print_sample_text(data,num_samples = 10,seed = 42,joined = True):
    random.seed(seed)

    samples = random.sample(data,num_samples)

    for sample in samples:
        if joined:
            print(join_tokens(sample['tokenized_text']),'\n')
        else:
            print(sample['tokenized_text'],'\n')

In [94]:
def print_entities(data,num_samples = 10, seed = 42,neighboring = 1,label = None):
    random.seed(seed)

    if label:
        data = [sample for sample in data if label in [ner[2] for ner in sample['ner']]]
        
    samples = random.sample(data,num_samples)

    for i,sample in enumerate(samples):
        print(f"sample {i +1}")
        ners = sample['ner']
        if label:
            ners = [ner for ner in ners if ner[2]==label]
        tok_text = sample['tokenized_text']

        if neighboring:
            tokens = [str(tok_text[ner[0]-int(neighboring):ner[1]+1 + int(neighboring)]) for ner in ners]
            print(", ".join([f"label={ner[2]}, neighboring tokens={token}" for ner, token in zip(ners, tokens)]))

        if not neighboring:
            tokens = [str(tok_text[ner[0]:ner[1]]) for ner in ners]
            print(", ".join([f"label={ner[2]}, neighboring tokens={token}" for ner, token in zip(ners, tokens)]))
   


In [78]:
def explore_label(data,label : 'str', most_common = 20):
    entities_per_label = []

    for sample in data:
        ranges_per_label = [[ner_tag[0],ner_tag[1]] for ner_tag in sample['ner'] if ner_tag[2] == label]
        entities_per_label.extend([join_tokens(sample['tokenized_text'][index[0]:index[1]+1]) for index in ranges_per_label])

    tot_ent = len(entities_per_label)
    unique_ent = np.unique(entities_per_label)

    print(f"total number of entities for label {label}: {tot_ent}")
    print(f"total number of unique entities for label {label} : {len(unique_ent)}")
    print(f"Ratio of unique/total entities : {len(unique_ent)/tot_ent}")
    return Counter(entities_per_label).most_common(most_common)


# urchade/pile_mistral_v0.1
Used to train urchade/gliner_multi-v2.1

In [8]:
with open(os.path.join(pretrain_dir,"pile_mistral_v0.1.json"),'r') as f:
    pile_mistral = json.load(f)

In [26]:
explore_dataset(pile_mistral)

length : 19724
avg tokenized text length : 210.39429121881972
max tokenized text length : 711
min tokenized text length : 1


In [None]:
get_label_dict(pile_mistral,most_common = 15)

Counter({'Organization': 22122,
         'Person': 21283,
         'Location': 16524,
         'Event': 5233,
         'Quantity': 4725,
         'Country': 4353,
         'Time': 3120,
         'Group': 3042,
         'City': 2992,
         'Product': 2726,
         'Technology': 2623,
         'Date': 2546,
         'Politician': 2142,
         'Software': 1967,
         'Disease': 1758,
         'Company': 1474,
         'Substance': 1442,
         'Number': 1431,
         'Year': 1430,
         'Concept': 1427,
         'Variable': 1305,
         'Compound': 1264,
         'Protein': 1204,
         'State': 1182,
         'Process': 1166,
         'Object': 1124,
         'Action': 1123,
         'Material': 1071,
         'Condition': 1046,
         'Device': 1021,
         'Author': 875,
         'Entity': 813,
         'Artifact': 812,
         'University': 687,
         'Topic': 679,
         'Determiner': 668,
         'Animal': 653,
         'Sports Team': 645,
         'Pro

In [60]:
print_sample_text(pile_mistral,seed=25,joined = False)

['#', '!', '/', 'usr', '/', 'bin', '/', 'env', 'python', 'PACKAGE', '=', "'", 'orbit_pantilt', "'", 'import', 'roslib', ';', 'roslib', '.', 'load_manifest', '(', 'PACKAGE', ')', 'from', 'math', 'import', 'pi', 'from', 'driver_base', '.', 'msg', 'import', 'SensorLevels', 'from', 'dynamic_reconfigure', '.', 'parameter_generator', 'import', '*', 'gen', '=', 'ParameterGenerator', '(', ')', '#', 'gen', '.', 'add', '(', '"', 'max_range', '"', ',', 'double_t', ',', 'SensorLevels', '.', 'RECONFIGURE_RUNNING', ',', '#', '"', 'max_range', 'corresponding', 'to', 'max', 'sensor', 'output', '"', ',', '5', '.', '0', ',', '0', '.', '0', ',', '20', '.', '0', ')', 'gen', '.', 'add', '(', '"', 'pan_reset', '"', ',', 'bool_t', ',', 'SensorLevels', '.', 'RECONFIGURE_RUNNING', ',', '"', 'reset', 'pan', 'position', '"', ',', 'False', ')', 'gen', '.', 'add', '(', '"', 'tilt_reset', '"', ',', 'bool_t', ',', 'SensorLevels', '.', 'RECONFIGURE_RUNNING', ',', '"', 'reset', 'tilt', 'position', '"', ',', 'False', '

## mutliconer_v2

In [45]:
german_data = load_dataset('MultiCoNER/multiconer_v2', 'German (DE)')

Downloading data:   0%|          | 0.00/1.27M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/78.2k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.59M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/9785 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/512 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/20145 [00:00<?, ? examples/s]

In [56]:
german_data['train'].features['ner_tags_index'].feature

ClassLabel(names=['O', 'B-Facility', 'I-Facility', 'B-OtherLOC', 'I-OtherLOC', 'B-HumanSettlement', 'I-HumanSettlement', 'B-Station', 'I-Station', 'B-VisualWork', 'I-VisualWork', 'B-MusicalWork', 'I-MusicalWork', 'B-WrittenWork', 'I-WrittenWork', 'B-ArtWork', 'I-ArtWork', 'B-Software', 'I-Software', 'B-OtherCW', 'I-OtherCW', 'B-MusicalGRP', 'I-MusicalGRP', 'B-PublicCorp', 'I-PublicCorp', 'B-PrivateCorp', 'I-PrivateCorp', 'B-OtherCorp', 'I-OtherCorp', 'B-AerospaceManufacturer', 'I-AerospaceManufacturer', 'B-SportsGRP', 'I-SportsGRP', 'B-CarManufacturer', 'I-CarManufacturer', 'B-TechCORP', 'I-TechCORP', 'B-ORG', 'I-ORG', 'B-Scientist', 'I-Scientist', 'B-Artist', 'I-Artist', 'B-Athlete', 'I-Athlete', 'B-Politician', 'I-Politician', 'B-Cleric', 'I-Cleric', 'B-SportsManager', 'I-SportsManager', 'B-OtherPER', 'I-OtherPER', 'B-Clothing', 'I-Clothing', 'B-Vehicle', 'I-Vehicle', 'B-Food', 'I-Food', 'B-Drink', 'I-Drink', 'B-OtherPROD', 'I-OtherPROD', 'B-Medication/Vaccine', 'I-Medication/Vaccine

## our Anon-dataset

In [61]:
anon_datapath = f"anon_data/"

In [62]:
with open(os.path.join(anon_datapath,"complete_ner_data.json"),'r') as f:
    data = json.load(f)

In [63]:
explore_dataset(data)

length : 124089
avg tokenized text length : 1767.450749059143
max tokenized text length : 93834
min tokenized text length : 0
labels : {'location', 'person', 'organization'}


In [64]:
get_label_dict(data)

Counter({'person': 986117, 'organization': 135304, 'location': 38122})

In [68]:
print_sample_text(data,joined = False)

['Participants', 'à', 'la', 'procédure', 'Maria', 'Berisha', ',', 'représenté', 'par', 'Me', 'Jillian', 'Fauguel', ',', 'avocate', ',', 'recourant', ',', 'contre', 'Elena', 'Stettler', ',', 'représentée', 'par', 'Me', 'Jean-Christophe', 'Oberson', ',', 'avocat', ',', 'intimée', '.', 'Objet', 'divorce', '(', 'contribution', 'd', "'", 'entretien', 'en', 'faveur', 'des', 'enfants', 'mineurs', ')', ',', 'recours', 'contre', 'l', "'", 'arrêt', 'de', 'la', 'Ie', 'Cour', 'd', "'", 'appel', 'civil', 'du', 'Tribunal', 'cantonal', 'de', 'l', "'", 'Etat', 'de', 'Fribourg', 'du', '8', 'juin', '2018', '(', '101', '2017', '317', ')', '.', 'Faits', ':', 'A', '.', 'Le', '24', 'juillet', '2017', ',', 'le', 'Tribunal', 'civil', 'de', 'l', "'", 'arrondissement', 'de', 'la', 'Glâne', 'a', 'prononcé', 'le', 'divorce', 'de', 'Maria', 'Berisha', '(', '1979', ')', 'et', 'Elena', 'Stettler', '(', '1982', ')', '(', '1', ')', '.', 'Il', 'a', 'notamment', 'maintenu', 'l', "'", 'autorité', 'parentale', 'conjointe'

In [97]:
print_entities(data,neighboring=5,label='location',num_samples=20)

sample 1
label=location, neighboring tokens=['procès-verbal', 'authentique', 'dressé', 'par', 'Me', 'Lotzwil', ',', 'notaire', 'à', '.', '.'], label=location, neighboring tokens=['pour', '15', '%', 'et', 'à', 'Escholzmatt-Marbach', 'SA', 'de', 'siège', 'social', 'à'], label=location, neighboring tokens=['pour', '15', '%', 'et', 'à', 'Escholzmatt-Marbach', 'S', '.', 'A', '.', ',']
sample 2
label=location, neighboring tokens=[], label=location, neighboring tokens=['Konkursverfahren', 'mangels', 'Aktiven', 'eingestellt', '.', 'Gibloux', 'amtete', 'ab', 'November', '2002', 'bis'], label=location, neighboring tokens=['die', 'Ausgleichskasse', 'des', 'Kantons', 'Zug', 'Gibloux', 'als', 'ehemaligen', 'Verwaltungsrat', 'der', 'konkursiten'], label=location, neighboring tokens=['der', 'Firma', '.', 'Die', 'von', 'Gibloux', 'und', 'Luana', 'Mutzner', 'hiegegen', 'erhobenen'], label=location, neighboring tokens=['.', 'B', '.', 'Die', 'von', 'Gibloux', 'gegen', 'den', 'Entscheid', 'eingereichte', 

In [93]:
print_entities(data,neighboring=True,label='organization')

sample 1
label=organization, neighboring tokens=['société', 'Jobs', 'Grischun', 'Sàrl'], label=organization, neighboring tokens=['société', 'Baksi', 'Group', 'Sàrl'], label=organization, neighboring tokens=['société', 'Baksi', 'Group', 'Sàrl'], label=organization, neighboring tokens=['société', 'Baksi', 'Group', 'Sàrl'], label=organization, neighboring tokens=['société', 'Baksi', 'Group', 'Sàrl'], label=organization, neighboring tokens=['société', 'Baksi', 'Group', 'Sàrl']
sample 2
label=organization, neighboring tokens=['Firma', 'Business', 'Operation', 'Systems', 'AG']
sample 3
label=organization, neighboring tokens=['ditta', 'Sdplus', 'Holding', 'SA'], label=organization, neighboring tokens=['la', 'Sdplus', 'Holding', 'SA'], label=organization, neighboring tokens=['ditta', 'Sdplus', 'Holding', 'SA'], label=organization, neighboring tokens=['Ditta', 'Sdplus', 'Holding', 'SA']
sample 4
label=organization, neighboring tokens=['société', 'Buchdruckerei', 'Davos', 'SA'], label=organizati

In [79]:
explore_label(data,'person',most_common=20)

total number of entities for label person: 986117
total number of unique entities for label person : 231048
Ratio of unique/total entities : 0.2343007979783332


[('Isabelle Henseler', 283),
 ('Claudia Habenicht', 251),
 ('Marc Planta', 243),
 ('Enna Bühler', 242),
 ('Yves Loosli', 224),
 ('Adhanom Schweizer', 218),
 ('Sara Imsand', 214),
 ('Chiara Feuz', 203),
 ('Rainer Mabilama', 194),
 ('Nan Saraiva', 188),
 ('Florian Ridoux', 175),
 ('Michael Gremaud', 169),
 ('Dean Jakupovic', 168),
 ('Noel Alili', 165),
 ('Gioia Skenderi', 162),
 ('Leona Koudelka', 160),
 ('Stéphanie Kellenberger', 158),
 ('Deogratias Aeberhard', 157),
 ('Emma Forestier', 154),
 ('Elsbeth Wüscher', 152)]

In [80]:
explore_label(data,'location')

total number of entities for label location: 38122
total number of unique entities for label location : 2130
Ratio of unique/total entities : 0.05587324904254761


[('Stansstad', 220),
 ('Chavannes-De-Bogis', 159),
 ('Hüttikon', 156),
 ('Wittenbach', 142),
 ('Gilly', 134),
 ('Herbetswil', 124),
 ('Movelier', 117),
 ('Signy-Avenex', 100),
 ('Mühleberg', 90),
 ('Orvin', 86),
 ('Neuheim', 84),
 ('Gerzensee', 84),
 ('Pfäfers', 83),
 ('Gossau', 79),
 ('Wynau', 77),
 ('Düdingen', 77),
 ('Riehen', 76),
 ('Degersheim', 76),
 ('Nunningen', 73),
 ('Oberrieden', 72)]

In [81]:
explore_label(data,'organization')

total number of entities for label organization: 135304
total number of unique entities for label organization : 12585
Ratio of unique/total entities : 0.09301277124105717


[('Die Werke Versorgung Wallisellen', 700),
 ('Lemo', 637),
 ('Adnovum Informatik', 221),
 ('Ssm Architekten', 214),
 ('Jobconsult', 192),
 ('Netsafe', 155),
 ('Kaulquappe', 145),
 ('Promodal. Ch', 141),
 ('Kaiser Engineering', 139),
 ('Tri Dental Implants Int', 139),
 ('Gebr. Estermann', 127),
 ('Standout - Online Marketing Agentur', 123),
 ('Swissparts', 122),
 ('Catmar', 118),
 ('Innflow', 113),
 ('First Choice Consulting', 113),
 ('Elvefin', 112),
 ('Tca Thomann Distribution', 108),
 ('Beltronic It', 108),
 ('Edelway', 108)]