In [41]:
from datasets import load_dataset
import os
import json
from collections import Counter
import numpy as np

from utils import join_tokens

import random

In [None]:
# Directory (relative path) that holds the pretraining dataset files read below.
pretrain_dir = "data_pretrain/"

In [25]:
def explore_dataset(data : list[dict]):
    """Print summary statistics for a tokenized NER dataset.

    Prints the number of samples, the mean / max / min length of each
    sample's ``'tokenized_text'`` and the set of distinct labels, where a
    label is read from index 2 of every entry in a sample's ``'ner'`` list.

    Args:
        data: Samples, each a dict providing ``'tokenized_text'`` (anything
            supporting ``len``) and ``'ner'`` (an iterable of entries that
            can be indexed at position 2).

    Returns:
        None. Everything is written to stdout.
    """
    print(f"length : {len(data)}")

    # An empty dataset has no lengths to aggregate: max()/min() would raise
    # ValueError and np.mean would warn and yield nan, so stop here instead.
    if len(data) == 0:
        print("dataset is empty : no token statistics or labels to report")
        return

    # Measure every sample once and reuse the result for all three statistics.
    text_lengths = [len(sample['tokenized_text']) for sample in data]
    print(f"avg tokenized text length : {np.mean(text_lengths)}")
    print(f"max tokenized text length : {max(text_lengths)}")
    print(f"min tokenized text length : {min(text_lengths)}")

    labels = {ner[2] for sample in data for ner in sample['ner']}
    print(f"labels : {labels}")

In [37]:
def get_label_dict(data : list[dict],most_common = 0):
    """Count how often each label occurs across a dataset.

    A label is the value at index 2 of every entry in a sample's ``'ner'``
    list.

    Args:
        data: Samples, each a dict with a ``'ner'`` iterable of entries.
        most_common: When truthy, only that many of the most frequent labels
            are returned.

    Returns:
        A ``Counter`` mapping label to count when ``most_common`` is falsy,
        otherwise the list of ``(label, count)`` pairs produced by
        ``Counter.most_common(most_common)``.
    """
    label_counts = Counter()
    for sample in data:
        # Feed the labels sample by sample, in their original order.
        label_counts.update(annotation[2] for annotation in sample['ner'])

    return label_counts.most_common(most_common) if most_common else label_counts



In [59]:
def print_sample_text(data,num_samples = 10,seed = 42,joined = True):
    """Print a reproducible random selection of sample texts.

    Args:
        data: Sequence of samples, each a dict with a ``'tokenized_text'``
            entry.
        num_samples: How many samples to print; capped at ``len(data)``.
        seed: Seed for the selection. The same seed and data select the same
            samples in the same order.
        joined: If True, each text is passed through ``join_tokens`` before
            printing; otherwise the raw ``'tokenized_text'`` value is printed.

    Returns:
        None. Each text is printed followed by a blank line.
    """
    # Use a private generator so the module-level ``random`` state is left
    # untouched; seeding it the same way selects the same samples as
    # ``random.seed(seed)`` followed by ``random.sample`` would.
    rng = random.Random(seed)

    # random.sample raises ValueError when asked for more items than exist,
    # so small datasets simply print everything they have.
    num_samples = min(num_samples, len(data))

    for sample in rng.sample(data, num_samples):
        text = sample['tokenized_text']
        if joined:
            text = join_tokens(text)
        print(text,'\n')

# urchade/pile_mistral_v0.1
Used to train urchade/gliner_multi-v2.1

In [8]:
# Load the pile_mistral_v0.1 data from its JSON file inside pretrain_dir.
# The encoding is given explicitly so the read does not depend on the
# platform's default text encoding.
with open(os.path.join(pretrain_dir,"pile_mistral_v0.1.json"),'r',encoding='utf-8') as f:
    pile_mistral = json.load(f)

In [26]:
# Print the sample count, token-length statistics and label set of pile_mistral.
explore_dataset(pile_mistral)

length : 19724
avg tokenized text length : 210.39429121881972
max tokenized text length : 711
min tokenized text length : 1


In [None]:
# Show the 15 most frequent labels in pile_mistral with their counts.
# NOTE(review): the saved output below is a full Counter with more than 15
# entries, so it appears to predate most_common = 15 — re-run to refresh it.
get_label_dict(pile_mistral,most_common = 15)

Counter({'Organization': 22122,
         'Person': 21283,
         'Location': 16524,
         'Event': 5233,
         'Quantity': 4725,
         'Country': 4353,
         'Time': 3120,
         'Group': 3042,
         'City': 2992,
         'Product': 2726,
         'Technology': 2623,
         'Date': 2546,
         'Politician': 2142,
         'Software': 1967,
         'Disease': 1758,
         'Company': 1474,
         'Substance': 1442,
         'Number': 1431,
         'Year': 1430,
         'Concept': 1427,
         'Variable': 1305,
         'Compound': 1264,
         'Protein': 1204,
         'State': 1182,
         'Process': 1166,
         'Object': 1124,
         'Action': 1123,
         'Material': 1071,
         'Condition': 1046,
         'Device': 1021,
         'Author': 875,
         'Entity': 813,
         'Artifact': 812,
         'University': 687,
         'Topic': 679,
         'Determiner': 668,
         'Animal': 653,
         'Sports Team': 645,
         'Pro

In [60]:
# Print 10 randomly chosen samples (seed 25) as raw token lists, without joining them.
print_sample_text(pile_mistral,seed=25,joined = False)

['#', '!', '/', 'usr', '/', 'bin', '/', 'env', 'python', 'PACKAGE', '=', "'", 'orbit_pantilt', "'", 'import', 'roslib', ';', 'roslib', '.', 'load_manifest', '(', 'PACKAGE', ')', 'from', 'math', 'import', 'pi', 'from', 'driver_base', '.', 'msg', 'import', 'SensorLevels', 'from', 'dynamic_reconfigure', '.', 'parameter_generator', 'import', '*', 'gen', '=', 'ParameterGenerator', '(', ')', '#', 'gen', '.', 'add', '(', '"', 'max_range', '"', ',', 'double_t', ',', 'SensorLevels', '.', 'RECONFIGURE_RUNNING', ',', '#', '"', 'max_range', 'corresponding', 'to', 'max', 'sensor', 'output', '"', ',', '5', '.', '0', ',', '0', '.', '0', ',', '20', '.', '0', ')', 'gen', '.', 'add', '(', '"', 'pan_reset', '"', ',', 'bool_t', ',', 'SensorLevels', '.', 'RECONFIGURE_RUNNING', ',', '"', 'reset', 'pan', 'position', '"', ',', 'False', ')', 'gen', '.', 'add', '(', '"', 'tilt_reset', '"', ',', 'bool_t', ',', 'SensorLevels', '.', 'RECONFIGURE_RUNNING', ',', '"', 'reset', 'tilt', 'position', '"', ',', 'False', '

## multiconer_v2

In [45]:
# Load the 'German (DE)' configuration of the MultiCoNER/multiconer_v2 dataset.
german_data = load_dataset('MultiCoNER/multiconer_v2', 'German (DE)')

Downloading data:   0%|          | 0.00/1.27M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/78.2k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.59M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/9785 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/512 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/20145 [00:00<?, ? examples/s]

In [56]:
# Inspect the element feature of the train split's 'ner_tags_index' column
# (the saved output shows a ClassLabel listing the tag names).
german_data['train'].features['ner_tags_index'].feature

ClassLabel(names=['O', 'B-Facility', 'I-Facility', 'B-OtherLOC', 'I-OtherLOC', 'B-HumanSettlement', 'I-HumanSettlement', 'B-Station', 'I-Station', 'B-VisualWork', 'I-VisualWork', 'B-MusicalWork', 'I-MusicalWork', 'B-WrittenWork', 'I-WrittenWork', 'B-ArtWork', 'I-ArtWork', 'B-Software', 'I-Software', 'B-OtherCW', 'I-OtherCW', 'B-MusicalGRP', 'I-MusicalGRP', 'B-PublicCorp', 'I-PublicCorp', 'B-PrivateCorp', 'I-PrivateCorp', 'B-OtherCorp', 'I-OtherCorp', 'B-AerospaceManufacturer', 'I-AerospaceManufacturer', 'B-SportsGRP', 'I-SportsGRP', 'B-CarManufacturer', 'I-CarManufacturer', 'B-TechCORP', 'I-TechCORP', 'B-ORG', 'I-ORG', 'B-Scientist', 'I-Scientist', 'B-Artist', 'I-Artist', 'B-Athlete', 'I-Athlete', 'B-Politician', 'I-Politician', 'B-Cleric', 'I-Cleric', 'B-SportsManager', 'I-SportsManager', 'B-OtherPER', 'I-OtherPER', 'B-Clothing', 'I-Clothing', 'B-Vehicle', 'I-Vehicle', 'B-Food', 'I-Food', 'B-Drink', 'I-Drink', 'B-OtherPROD', 'I-OtherPROD', 'B-Medication/Vaccine', 'I-Medication/Vaccine