
Datasets:
- With LOAD_DATASETS=full, Argilla loads `gutenberg_spacy-ner-monitoring` for token classification with default spaCy predictions; it is a fork of https://huggingface.co/datasets/gutenberg_time
- The default NER dataset used in papers is CoNLL-2003: https://huggingface.co/datasets/conll2003
- https://huggingface.co/datasets/DFKI-SLT/few-nerd
- https://huggingface.co/datasets/tner/ontonotes5
- Look for argilla compatible NER datasets with this search: https://huggingface.co/datasets?task_categories=task_categories:token-classification&sort=trending&search=argilla


- ✨ Provide suggested spans with a confidence score, so your team doesn't need to start from scratch.


In [140]:
from typing import List, Tuple, Union, Dict
import types

In [108]:
import argilla as rg

# Point the Argilla client at the server listening on localhost:6900.
# NOTE(review): the API key is hard-coded here; acceptable for a local sandbox,
# but it should come from configuration/environment anywhere else.
rg.init(api_url="http://localhost:6900", api_key="admin.apikey")



# Load CoNLL2003 research dataset into Argilla

In [144]:
def template_for_token_classification(
    labels: Union[Dict[str, str], None] = None
) -> rg.FeedbackDataset:
    """Create a dataset with a span question for NER or POS tagging or information retrieval tasks.

    There is no pre-defined template in argilla yet, so we define a custom dataset instead.
    The high-level API of this method is TBD.
    ref: https://docs.argilla.io/en/latest/practical_guides/create_update_dataset/create_dataset.html#define-questions + click on Span

    Args:
        labels: mapping handed to the span question as its label set
            (label value -> text shown to annotators). When omitted, the four
            CoNLL-style labels PER / ORG / LOC / MISC are used.

    Returns:
        A local ``rg.FeedbackDataset`` with a single ``text`` field and a single
        required span question named ``entities`` attached to that field.
    """
    if labels is None:
        # Build the default inside the call so that no dict object is shared
        # between invocations (a dict literal in the signature would be).
        labels = {"PER": "Person", "ORG": "Organization", "LOC": "Location", "MISC": "Other"}

    span_question = rg.SpanQuestion(
        name="entities",
        title="Highlight the entities in the text:",
        labels=labels,
        field="text",  # the field where you want to do the span annotation
        required=True,
        allow_overlapping=True,
    )
    return rg.FeedbackDataset(
        fields=[rg.TextField(name="text")],
        questions=[span_question],
    )

In [251]:
# Local dataset using the template's default label set (PER / ORG / LOC / MISC).
dataset = template_for_token_classification()



In [111]:
from datasets import load_dataset, Features, Sequence, ClassLabel, Value, DatasetDict

def load_conll():
    """Load the `conll2003` dataset and add a `parsed_ner_tags` column.

    The new column holds the string form (e.g. ``B-PER``) of the integer ids
    found in each sample's ``ner_tags``, decoded through a ``ClassLabel`` built
    from the tag list below.
    """
    tag_names = ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']
    tag_feature = ClassLabel(names=tag_names)

    def _decode_tags(sample):
        # Position in `tag_names` is the integer id stored in the dataset.
        return {"parsed_ner_tags": tag_feature.int2str(sample["ner_tags"])}

    raw = load_dataset("conll2003")
    return raw.map(_decode_tags)

In [112]:
# CoNLL-2003 with the extra `parsed_ner_tags` column; indexed by split name further down.
conll2003 = load_conll()

In [135]:
from spacy.tokens import Doc
from spacy.vocab import Vocab
from spacy.training.iob_utils import iob_to_biluo, biluo_tags_to_offsets
from argilla.client.feedback.schemas import SpanValueSchema

def tags_to_entities(row: dict, tokens="tokens", parsed_ner_tags="parsed_ner_tags") -> List[SpanValueSchema]:
    """Convert the IOB tag strings of one row into Argilla span values.

    Args:
        row: mapping that holds the token list and the matching IOB tag strings.
        tokens: key under which the token list is stored in ``row``.
        parsed_ner_tags: key under which the IOB tag strings are stored in ``row``.

    Returns:
        One ``SpanValueSchema`` per entity, with character offsets computed by
        spaCy on a ``Doc`` built from the tokens, the entity label, and a score
        of 1.0.
    """
    # A throw-away Doc on an empty vocab is enough: it is only used to turn
    # token-level tags into character offsets.
    doc = Doc(Vocab(), words=row[tokens])
    # Read the tags from the caller-supplied column rather than a fixed key,
    # so the `parsed_ner_tags` argument actually takes effect.
    offsets = biluo_tags_to_offsets(doc, iob_to_biluo(row[parsed_ner_tags]))

    return [
        SpanValueSchema(
            start=start,  # position of the first character of the span
            end=stop,  # position of the character right after the end of the span
            label=entity,
            score=1.0,
        )
        for start, stop, entity in offsets
    ]

In [256]:
from tqdm import tqdm
from typing import Iterator

def dataset_to_records(dataset: DatasetDict, agent, tokens="tokens") -> Iterator[rg.FeedbackRecord]:
    """Yield one Argilla record per non-empty row, with the gold entities as a response.

    Args:
        dataset: iterable of rows; each row is indexed by column name.
            NOTE(review): annotated as ``DatasetDict``, but the calls in this
            notebook pass a single split (e.g. ``conll2003['validation']``).
        agent: currently unused; it is only referenced by the commented-out
            ``suggestions`` alternative kept below.
        tokens: name of the column holding the token list.

    Yields:
        ``rg.FeedbackRecord`` whose ``text`` field is the tokens joined by a
        single space and whose ``entities`` response holds the spans returned by
        ``tags_to_entities``.
    """
    for row in tqdm(dataset):
        text = " ".join(row[tokens])  # we assume the tokens are clean, and we disregard more tokenizer details

        # Seems like we have "empty" rows
        if not text.strip():
            continue

        yield rg.FeedbackRecord(
            fields={"text": text},
            responses=[
                {
                    "values": {
                        "entities": {
                            # Forward the token column so the span offsets are
                            # computed on the same tokens used to build `text`.
                            "value": tags_to_entities(row, tokens=tokens),
                        }
                    }
                }
            ]
            # Alternative kept for reference: attach the spans as suggestions
            # attributed to `agent` instead of as responses.
            # suggestions = [
            #     {
            #         "question_name": "entities",
            #         "value": tags_to_entities(row),
            #         "agent": agent,
            #     }
            # ]
        )

In [116]:
# Convert the validation split into records and add them to the local dataset
# (the push to Argilla happens in the next cell).
# NOTE: the "gold_labels" agent argument is not used by dataset_to_records as written.
dataset.add_records(list(dataset_to_records(conll2003['validation'], "gold_labels")))

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3250/3250 [00:00<00:00, 6520.85it/s]


In [117]:
# Push the local dataset to the "admin" workspace under the name "dev-ner-conll2003".
# NOTE(review): the captured output below reports name=my-first-dataset, so it
# appears to come from an earlier run that used a different name.
dataset.push_to_argilla(name="dev-ner-conll2003", workspace="admin")

RemoteFeedbackDataset(
   id=60d31698-9ad9-47db-9e5f-d1e4f7d8b71b
   name=my-first-dataset
   workspace=Workspace(id=84a8fb6f-3350-4e9b-97c0-043cfedef934, name=admin, inserted_at=2024-05-14 17:08:20.825501, updated_at=2024-05-14 17:08:20.825501)
   url=http://localhost:6900/dataset/60d31698-9ad9-47db-9e5f-d1e4f7d8b71b/annotation-mode
   fields=[RemoteTextField(id=UUID('6b46a2c5-72ec-42d8-8392-9173007a025e'), client=None, name='text', title='Text', required=True, type='text', use_markdown=False)]
   questions=[RemoteSpanQuestion(id=UUID('12bbb02d-c899-4d42-ba40-8dc95985cbaf'), client=None, name='entities', title='Highlight the entities in the text:', description=None, required=True, type='span', field='text', labels=[SpanLabelOption(value='PER', text='Person', description=None), SpanLabelOption(value='ORG', text='Organization', description=None), SpanLabelOption(value='LOC', text='Location', description=None), SpanLabelOption(value='MISC', text='Other', description=None)], visible_label

# Load OntoNotes research dataset into Argilla

In [252]:
import collections

def load_ontonotes():
    """Load `tner/ontonotes5` and add a `parsed_ner_tags` column.

    The dataset's ``tags`` column is renamed to ``ner_tags``; the new column
    holds the string form of those integer ids, decoded through a ``ClassLabel``
    whose names are listed in id order.
    """
    tag_to_id = {"O": 0, "B-CARDINAL": 1, "B-DATE": 2, "I-DATE": 3, "B-PERSON": 4, "I-PERSON": 5, "B-NORP": 6, "B-GPE": 7, "I-GPE": 8, "B-LAW": 9, "I-LAW": 10, "B-ORG": 11, "I-ORG": 12, "B-PERCENT": 13, "I-PERCENT": 14, "B-ORDINAL": 15, "B-MONEY": 16, "I-MONEY": 17, "B-WORK_OF_ART": 18, "I-WORK_OF_ART": 19, "B-FAC": 20, "B-TIME": 21, "I-CARDINAL": 22, "B-LOC": 23, "B-QUANTITY": 24, "I-QUANTITY": 25, "I-NORP": 26, "I-LOC": 27, "B-PRODUCT": 28, "I-TIME": 29, "B-EVENT": 30, "I-EVENT": 31, "I-FAC": 32, "B-LANGUAGE": 33, "I-PRODUCT": 34, "I-ORDINAL": 35, "I-LANGUAGE": 36}
    # Order the tag names by their integer id so that list position == id.
    tags_by_id = sorted(tag_to_id, key=tag_to_id.get)
    tag_feature = ClassLabel(names=tags_by_id)

    def _decode_tags(sample):
        return {"parsed_ner_tags": tag_feature.int2str(sample["ner_tags"])}

    raw = load_dataset("tner/ontonotes5")
    renamed = raw.rename_column("tags", "ner_tags")
    return renamed.map(_decode_tags)

In [253]:
# OntoNotes 5 with `ner_tags` / `parsed_ner_tags` columns; indexed by split name further down.
ontonotes = load_ontonotes()

In [259]:
# Longer, human-readable description of each OntoNotes entity type.
# NOTE: not referenced by any other code visible in this notebook export.
verbose_labels = {
    "CARDINAL": "Numerals that do not fall under another type", 
    "DATE": "Absolute or relative dates or periods", 
    "PERSON": "People, including fictional", 
    "NORP": "Nationalities or religious or political groups", 
    "GPE": "Countries, cities, states",
    "LAW": "Named documents made into laws", 
    "ORG": "Companies, agencies, institutions, etc.", 
    "PERCENT": "Percentage (including “%”)",
    "ORDINAL": "“first”, “second”",
    "MONEY": "Monetary values, including unit",
    "WORK_OF_ART": "Titles of books, songs, etc.",
    "FAC": "Facilities like Buildings, airports, highways, bridges, etc.",
    "TIME": "Times smaller than a day",
    "LOC": "Non-GPE locations, mountain ranges, bodies of water",
    "QUANTITY": "Measurements, as of weight or distance",
    "PRODUCT": "Vehicles, weapons, foods, etc. (Not services)",
    "EVENT": "Named hurricanes, battles, wars, sports events, etc.",
    "LANGUAGE": "Any named language"
}
# Label set actually given to the span question: entity type (the tag without
# its B-/I- prefix) -> short text shown for that label.
labels = {
    "CARDINAL": "CARDINAL", 
    "DATE": "DATE", 
    "PERSON": "PERSON", 
    "GPE": "GPE",
    "LAW": "LAW", 
    "ORG": "ORGANIZATION", 
    "PERCENT": "PERCENT",
    "ORDINAL": "ORDINAL",
    "MONEY": "MONEY",
    "WORK_OF_ART": "WORK OF ART",
    "FAC": "FACILITY",
    "TIME": "TIME",
    "LOC": "LOCATION",
    "QUANTITY": "QUANTITY",
    "NORP": "NORP",
    "PRODUCT": "PRODUCT",
    "EVENT": "EVENT",
    "LANGUAGE": "LANGUAGE"
}
# Rebinds `dataset`: from here on it is the OntoNotes-labelled dataset, not the CoNLL one.
dataset = template_for_token_classification(labels=labels)

In [260]:
# Only the first 100 rows of the validation split are converted and added.
dataset.add_records(list(dataset_to_records(ontonotes['validation'].select(range(100)), "gold_labels")))

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 2943.58it/s]


In [261]:
# Push the OntoNotes sample to the "admin" workspace as "dev-ner-ontonotes".
dataset.push_to_argilla(name="dev-ner-ontonotes", workspace="admin")

RemoteFeedbackDataset(
   id=d107868e-e9e3-4419-bcbe-95e511df4376
   name=dev-ner-ontonotes
   workspace=Workspace(id=84a8fb6f-3350-4e9b-97c0-043cfedef934, name=admin, inserted_at=2024-05-14 17:08:20.825501, updated_at=2024-05-14 17:08:20.825501)
   url=http://localhost:6900/dataset/d107868e-e9e3-4419-bcbe-95e511df4376/annotation-mode
   fields=[RemoteTextField(id=UUID('91c2b56d-6563-4dbe-91bc-647e32f9d3a6'), client=None, name='text', title='Text', required=True, type='text', use_markdown=False)]
   questions=[RemoteSpanQuestion(id=UUID('3422865a-a1fa-4ab5-8c65-f24cc49ccde8'), client=None, name='entities', title='Highlight the entities in the text:', description=None, required=True, type='span', field='text', labels=[SpanLabelOption(value='CARDINAL', text='CARDINAL', description=None), SpanLabelOption(value='DATE', text='DATE', description=None), SpanLabelOption(value='PERSON', text='PERSON', description=None), SpanLabelOption(value='GPE', text='GPE', description=None), SpanLabelOption

## Push to Hugging Face

In [155]:
# Publish the current `dataset` to the Hugging Face Hub repo below, as its "validation" split.
dataset.push_to_huggingface(
    repo_id="louisguitton/dev-ner-ontonotes",split="validation"
)

Uploading the dataset shards:   0%|                                                                                                                                    | 0/1 [00:00<?, ?it/s]
Creating parquet from Arrow format: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:00<00:00, 411.00ba/s][A
Uploading the dataset shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  2.73it/s]
README.md: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10.0k/10.0k [00:00<00:00, 8.44MB/s]


## Add suggestions from spaCy

In [262]:
# Fetch the dataset pushed above back from the Argilla server so that its
# records can be updated remotely.
# NOTE(review): with_vectors="all" presumably also retrieves every vector
# attached to the records — confirm it is needed, since none are added here.
remote_dataset = rg.FeedbackDataset.from_argilla(
    name="dev-ner-ontonotes",
    workspace="admin",
    with_vectors="all"
)

In [263]:
from typing import Type
from argilla.client.feedback.dataset.remote.dataset import RemoteFeedbackDataset
from argilla.client.feedback.schemas.remote.records import RemoteFeedbackRecord, RemoteSuggestionSchema
from argilla.client.feedback.schemas.suggestions import SuggestionSchema

def labeller(nlp: "spacy.language.Language", text: str) -> List[SpanValueSchema]:
    """Generate NER predictions from a spaCy model in the Argilla format.

    Args:
        nlp: a loaded spaCy pipeline (an instance, called directly on the text).
            The annotation is a string so that defining this function does not
            require ``spacy`` to be imported yet.
        text: the raw text to run the pipeline on.

    Returns:
        One ``SpanValueSchema`` per entity in ``doc.ents``, using spaCy's
        character offsets and entity label.
    """
    doc = nlp(text)
    return [
        SpanValueSchema(
            start=ent.start_char,
            end=ent.end_char,
            label=ent.label_,
            # NOTE(review): a constant 0 is stored as the score; no model
            # confidence is read from the entity here.
            score=0,
        )
        for ent in doc.ents
    ]
    
def add_suggestions_to_remote_dataset(remote_dataset: RemoteFeedbackDataset, nlp: "spacy.language.Language") -> None:
    """Add suggestions from a spaCy NER model to a remote instance of an existing Argilla dataset.

    Every record of ``remote_dataset`` gets its ``suggestions`` replaced by a
    single suggestion for the ``entities`` question, then all records are sent
    back in one ``update_records`` call.

    Args:
        remote_dataset: dataset previously fetched from the Argilla server.
        nlp: a loaded spaCy pipeline (an instance); its ``meta['name']`` is
            recorded as the suggestion's agent.

    ref: https://docs.argilla.io/en/latest/tutorials_and_integrations/tutorials/feedback/end2end_examples/add-suggestions-and-responses-005.html#For-the-RemoteFeedbackDataset
    """
    modified_records: List[RemoteFeedbackRecord] = list(remote_dataset.records)

    for record in modified_records:
        pred: List[SpanValueSchema] = labeller(nlp, record.fields["text"])
        # Exactly one suggestion per question: passing more than 1 suggestion fails with this error:
        # ValidationApiError: Argilla server returned an error with http status: 422. Error details: {'response': 'Record at
        # position 0 is not valid because found duplicate suggestions question IDs', 'params': None}
        record.suggestions = [
            {
                "question_name": "entities",
                "value": pred,
                "agent": nlp.meta['name'],
            }
        ]

    remote_dataset.update_records(modified_records)

In [171]:
import spacy

# Pipeline used below to produce the entity suggestions.
nlp = spacy.load("en_core_web_sm")

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [264]:
# Attach one spaCy suggestion to every record of the remote dataset.
add_suggestions_to_remote_dataset(remote_dataset, nlp)

In [270]:
def test_one_suggestion_and_no_response():
    """Update the record at index 2 so it has no responses and one spaCy suggestion.

    Relies on the notebook-level ``remote_dataset`` and ``nlp`` objects; the
    modified record is sent back through ``update_records``.
    """
    record = remote_dataset.records[2]
    spans: List[SpanValueSchema] = labeller(nlp, record.fields["text"])

    # Drop any existing responses, then attach the single suggestion.
    record.responses = []
    suggestion = {
        "question_name": "entities",
        "value": spans,
        "agent": nlp.meta['name'],
    }
    record.suggestions = [suggestion]

    remote_dataset.update_records([record])

In [271]:
# Manual check run against the live server; it modifies the record at index 2 remotely.
test_one_suggestion_and_no_response()