In [84]:
import os
from typing import Dict, Iterable, Iterator, List, Optional, Tuple, Union

In [108]:
import argilla as rg

# Connect the Argilla client to the server.
# The URL and API key are taken from the ARGILLA_API_URL / ARGILLA_API_KEY
# environment variables so real credentials never have to be committed with
# the notebook. The fallbacks are the values this notebook originally
# hard-coded for a local instance.
rg.init(
    api_url=os.environ.get("ARGILLA_API_URL", "http://localhost:6900"),
    api_key=os.environ.get("ARGILLA_API_KEY", "admin.apikey"),
)



In [109]:
def template_for_token_classification(
    labels: Optional[Dict[str, str]] = None,
) -> rg.FeedbackDataset:
    """Create a dataset with a span question for NER or POS tagging or information retrieval tasks.

    There is no pre-defined template in argilla yet, so we define a custom dataset instead.
    The high-level API of this method is TBD.
    ref: https://docs.argilla.io/en/latest/practical_guides/create_update_dataset/create_dataset.html#define-questions + click on Span

    Args:
        labels: Mapping from label value to the text displayed for it
            (e.g. ``{"PER": "Person"}``). When omitted, the four entity
            types PER / ORG / LOC / MISC are used.

    Returns:
        A ``rg.FeedbackDataset`` with one text field named ``"text"`` and one
        required span question named ``"entities"`` attached to that field.
        The dataset is only constructed here; it is not pushed to a server.
    """
    # Use a None sentinel rather than a dict default so that the default
    # mapping is never shared (and accidentally mutated) between calls.
    if labels is None:
        labels = {"PER": "Person", "ORG": "Organization", "LOC": "Location", "MISC": "Other"}

    dataset = rg.FeedbackDataset(
        fields=[rg.TextField(name="text")],
        questions=[
            rg.SpanQuestion(
                name="entities",
                title="Highlight the entities in the text:",
                labels=labels,
                field="text",  # the field where you want to do the span annotation
                required=True,
                allow_overlapping=True,
            )
        ],
    )
    return dataset

In [110]:
# Instantiate the dataset definition: one text field plus the "entities" span question.
dataset = template_for_token_classification()




Datasets:
- With `LOAD_DATASETS=full`, Argilla loads `gutenberg_spacy-ner-monitoring` for token classification with default spaCy predictions; it is a fork of https://huggingface.co/datasets/gutenberg_time
- The default NER dataset used in papers is CoNLL-2003: https://huggingface.co/datasets/conll2003
- https://huggingface.co/datasets/DFKI-SLT/few-nerd
- https://huggingface.co/datasets/tner/ontonotes5
- Look for Argilla-compatible NER datasets with this search: https://huggingface.co/datasets?task_categories=task_categories:token-classification&sort=trending&search=argilla


- ✨ Provide suggested spans with a confidence score, so your team doesn't need to start from scratch.


In [111]:
from datasets import load_dataset, Features, Sequence, ClassLabel, Value, DatasetDict

def load_conll():
    """Load CoNLL-2003 and attach the NER tags in string form.

    The loaded dataset is mapped so that every sample gains a
    ``parsed_ner_tags`` entry holding the tag names that correspond to the
    integer ids stored in ``ner_tags``.
    """
    tag_names = ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'])

    def add_parsed_tags(sample):
        # Translate the integer tag ids of one sample into their names.
        return {"parsed_ner_tags": tag_names.int2str(sample["ner_tags"])}

    raw_conll = load_dataset("conll2003")
    return raw_conll.map(add_parsed_tags)

In [112]:
# Load CoNLL-2003 with the extra string-tag column; splits are selected by name later on.
conll2003 = load_conll()

In [113]:
from spacy.tokens import Doc
from spacy.vocab import Vocab
from spacy.training.iob_utils import iob_to_biluo, biluo_tags_to_offsets
from argilla.client.feedback.schemas import SpanValueSchema

def conll2003_tags_to_entities(row: dict) -> List[SpanValueSchema]:
    """Turn the token-level NER tags of one CoNLL-2003 row into character spans.

    Args:
        row: A sample providing ``tokens`` (the words) and ``ner_tags``
            (integer tag ids, presumably one per token).

    Returns:
        One ``SpanValueSchema`` per entity found, each with a fixed score of 1.0.
    """
    # spaCy derives character offsets from a Doc, so build a bare one from the words.
    words_doc = Doc(Vocab(), words=row["tokens"])

    # Map the integer ids back to their IOB tag names, then convert to BILUO,
    # which is the scheme the offset helper consumes.
    tag_names = ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'])
    iob_tags = tag_names.int2str(row["ner_tags"])
    biluo_tags = iob_to_biluo(iob_tags)

    entities = []
    for span_start, span_end, span_label in biluo_tags_to_offsets(words_doc, biluo_tags):
        entities.append(
            SpanValueSchema(
                start=span_start,  # position of the first character of the span
                end=span_end,  # position of the character right after the end of the span
                label=span_label,
                score=1.0,
            )
        )
    return entities

In [115]:
from tqdm import tqdm

def dataset_to_records(dataset: Iterable[dict]) -> Iterator[rg.FeedbackRecord]:
    """Lazily convert the rows of one dataset split into Argilla feedback records.

    Args:
        dataset: An iterable of rows, e.g. a single split such as
            ``conll2003["validation"]``. Each row must provide ``tokens`` and
            ``ner_tags``. Passing the whole ``DatasetDict`` is rejected,
            because iterating it yields split names rather than rows.

    Yields:
        One ``rg.FeedbackRecord`` per non-empty row. Its ``text`` field is
        the tokens joined by single spaces, and the gold entities are attached
        as a suggestion for the ``entities`` question.

    Raises:
        TypeError: If ``dataset`` is a ``DatasetDict`` (raised when the
            generator is first advanced).
    """
    if isinstance(dataset, DatasetDict):
        raise TypeError(
            "dataset_to_records expects a single split "
            "(e.g. dataset['validation']), not a DatasetDict"
        )

    for row in tqdm(dataset):
        # The span offsets from conll2003_tags_to_entities are character
        # positions in a spaCy Doc built from the same tokens. spaCy separates
        # words with one space when no `spaces` argument is given, so the text
        # must be joined the same way for the offsets to line up.
        text = " ".join(row["tokens"])

        # Seems like we have "empty" rows
        if not text.strip():
            continue

        yield rg.FeedbackRecord(
            fields={"text": text},
            suggestions=[
                {
                    "question_name": "entities",
                    "value": conll2003_tags_to_entities(row),
                    "agent": "gold_labels",
                }
            ],
        )

In [116]:
# Materialise the records for the validation split and add them to the dataset.
# The upload itself is done by the push_to_argilla call in the next cell.
dataset.add_records(list(dataset_to_records(conll2003['validation'])))

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3250/3250 [00:00<00:00, 6520.85it/s]


In [117]:
# Push the dataset to Argilla as "my-first-dataset" in the "admin" workspace.
dataset.push_to_argilla(name="my-first-dataset", workspace="admin")

RemoteFeedbackDataset(
   id=60d31698-9ad9-47db-9e5f-d1e4f7d8b71b
   name=my-first-dataset
   workspace=Workspace(id=84a8fb6f-3350-4e9b-97c0-043cfedef934, name=admin, inserted_at=2024-05-14 17:08:20.825501, updated_at=2024-05-14 17:08:20.825501)
   url=http://localhost:6900/dataset/60d31698-9ad9-47db-9e5f-d1e4f7d8b71b/annotation-mode
   fields=[RemoteTextField(id=UUID('6b46a2c5-72ec-42d8-8392-9173007a025e'), client=None, name='text', title='Text', required=True, type='text', use_markdown=False)]
   questions=[RemoteSpanQuestion(id=UUID('12bbb02d-c899-4d42-ba40-8dc95985cbaf'), client=None, name='entities', title='Highlight the entities in the text:', description=None, required=True, type='span', field='text', labels=[SpanLabelOption(value='PER', text='Person', description=None), SpanLabelOption(value='ORG', text='Organization', description=None), SpanLabelOption(value='LOC', text='Location', description=None), SpanLabelOption(value='MISC', text='Other', description=None)], visible_label