
Datasets:
- With LOAD_DATASETS=full, Argilla loads `gutenberg_spacy-ner-monitoring` for token classification with default spaCy predictions; it is a fork of https://huggingface.co/datasets/gutenberg_time
- The default NER dataset in papers is CoNLL-2003: https://huggingface.co/datasets/conll2003
- https://huggingface.co/datasets/DFKI-SLT/few-nerd
- https://huggingface.co/datasets/tner/ontonotes5
- Look for argilla compatible NER datasets with this search: https://huggingface.co/datasets?task_categories=task_categories:token-classification&sort=trending&search=argilla


- ✨ Provide suggested spans with a confidence score, so your team doesn't need to start from scratch.


In [140]:
from typing import List, Tuple, Union, Dict
import types

In [108]:
import os

import argilla as rg

# Connection settings can be overridden through the environment so that real
# credentials never have to be saved in the notebook. The fallbacks are the
# local development values this notebook was written against.
rg.init(
    api_url=os.environ.get("ARGILLA_API_URL", "http://localhost:6900"),
    api_key=os.environ.get("ARGILLA_API_KEY", "admin.apikey"),
)



## Push to Huggingface

In [155]:
# Publish `dataset` to the Hugging Face Hub repo below, as the "validation" split.
# NOTE(review): in the cells visible here, `dataset` is only assigned in the
# final cell of the notebook — on a fresh kernel that cell has to run first.
dataset.push_to_huggingface(
    repo_id="louisguitton/dev-ner-ontonotes",
    split="validation",
)

Uploading the dataset shards:   0%|                                                                                                                                    | 0/1 [00:00<?, ?it/s]
Creating parquet from Arrow format: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:00<00:00, 411.00ba/s][A
Uploading the dataset shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  2.73it/s]
README.md: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10.0k/10.0k [00:00<00:00, 8.44MB/s]


## Add suggestions to a remote dataset

In [262]:
# Fetch the existing "dev-ner-ontonotes" dataset from the "admin" workspace,
# requesting all vectors together with the records.
remote_dataset = rg.FeedbackDataset.from_argilla(
    name="dev-ner-ontonotes", workspace="admin", with_vectors="all"
)

In [263]:
from typing import Type
from argilla.client.feedback.dataset.remote.dataset import RemoteFeedbackDataset
from argilla.client.feedback.schemas.remote.records import RemoteFeedbackRecord, RemoteSuggestionSchema
from argilla.client.feedback.schemas.suggestions import SuggestionSchema

def labeller(nlp: "spacy.language.Language", text: str) -> "List[SpanValueSchema]":
    """Run a spaCy pipeline on ``text`` and return its entities in the Argilla span format.

    The signature annotations are forward references (strings) so that defining
    this function does not require ``spacy`` or ``SpanValueSchema`` to be bound
    yet (in this notebook ``import spacy`` lives in a later cell). They only
    need to exist when the function is called.

    Args:
        nlp: A loaded spaCy pipeline *instance* (it is called as ``nlp(text)``),
            not the ``Language`` class itself.
        text: Raw text to annotate.

    Returns:
        One ``SpanValueSchema`` per entity in ``doc.ents``, built from the
        entity's character offsets and label. Empty when there are no entities.
    """
    doc = nlp(text)
    return [
        SpanValueSchema(
            start=ent.start_char,
            end=ent.end_char,
            label=ent.label_,
            # Constant placeholder: every span is reported with a score of 0.
            score=0,
        )
        for ent in doc.ents
    ]
    
def add_suggestions_to_remote_dataset(
    remote_dataset: "RemoteFeedbackDataset", nlp: "spacy.language.Language"
) -> None:
    """Add suggestions from a spaCy NER model to a remote instance of an existing Argilla dataset.

    Every record of ``remote_dataset`` gets its ``suggestions`` replaced by a
    single suggestion for the "entities" question, computed by ``labeller`` on
    the record's "text" field. All records are then sent back in one
    ``update_records`` call.

    The signature annotations are forward references (strings) so the function
    can be defined before ``spacy`` is imported (it is imported in a later cell).

    Args:
        remote_dataset: Dataset whose ``records`` are read and updated.
        nlp: A loaded spaCy pipeline *instance* (not the ``Language`` class).
            ``nlp.meta['name']`` is recorded as the suggestion's agent.

    ref: https://docs.argilla.io/en/latest/tutorials_and_integrations/tutorials/feedback/end2end_examples/add-suggestions-and-responses-005.html#For-the-RemoteFeedbackDataset
    """
    # Materialise the records once so the objects mutated below are exactly the
    # ones handed to `update_records`.
    modified_records: List[RemoteFeedbackRecord] = list(remote_dataset.records)

    for record in modified_records:
        pred: List[SpanValueSchema] = labeller(nlp, record.fields["text"])
        # `suggestions` accepts a list/tuple of (Remote)SuggestionSchema or dicts.
        # Exactly one suggestion is set for the question, because
        # passing more than 1 suggestion fails with this error:
        # ValidationApiError: Argilla server returned an error with http status: 422. Error details: {'response': 'Record at
        # position 0 is not valid because found duplicate suggestions question IDs', 'params': None}
        record.suggestions = [
            {
                "question_name": "entities",
                "value": pred,
                "agent": nlp.meta['name'],
            }
        ]

    remote_dataset.update_records(modified_records)

In [171]:
import spacy

# Load the `en_core_web_sm` pipeline; `nlp` is the model handed to the
# suggestion helpers in this notebook.
nlp = spacy.load("en_core_web_sm")

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [264]:
# Write spaCy suggestions back to every record of `remote_dataset`.
add_suggestions_to_remote_dataset(remote_dataset, nlp)

In [270]:
def test_one_suggestion_and_no_response():
    """Push one record carrying a single suggestion and an empty response list.

    Operates on the record at index 2 of the module-level ``remote_dataset``
    and uses the module-level ``nlp`` pipeline (via ``labeller``) to compute
    the suggested spans.
    """
    record = remote_dataset.records[2]
    spans: List[SpanValueSchema] = labeller(nlp, record.fields["text"])

    # Clear any responses first, then attach exactly one suggestion.
    record.responses = []
    suggestion = {
        "question_name": "entities",
        "value": spans,
        "agent": nlp.meta['name'],
    }
    record.suggestions = [suggestion]

    remote_dataset.update_records([record])

In [271]:
# Manual check against the live dataset: updates one record on the server.
test_one_suggestion_and_no_response()

## Compute metrics

In [289]:
from argilla.client.feedback.metrics.utils import get_responses_and_suggestions_per_user

In [291]:
# Load the "ner-lvl2" dataset from the "admin" workspace.
# NOTE(review): this rebinds `remote_dataset`, which an earlier cell bound to
# "dev-ner-ontonotes"; re-running earlier cells afterwards would act on this dataset.
remote_dataset = rg.FeedbackDataset.from_argilla(
    name="ner-lvl2",
    workspace="admin",
    with_vectors="all"
)

# Currently disabled. The saved progress-bar output below appears to come from
# a run in which this line was enabled.
# responses_and_suggestions_per_user = get_responses_and_suggestions_per_user(dataset=remote_dataset, question_name="entities")

Extracting responses and suggestions per user: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 861253.39it/s]


In [298]:
# Convert the Argilla dataset to the "datasets" format so rows can be read as plain dicts.
hf_dataset = remote_dataset.format_as("datasets")

In [300]:
hf_dataset[0]

{'text': 'A Russian diver has found the bodies of three of the 118 sailors who were killed when the nuclear submarine Kursk sank in the Barents Sea .',
 'entities': [{'user_id': 'fe7c1b6a-5d30-41d5-bb56-6675cfbad12f',
   'value': {'start': [2, 40, 53, 108, 122],
    'end': [9, 45, 56, 113, 137],
    'label': ['NORP', 'CARDINAL', 'CARDINAL', 'PRODUCT', 'LOC'],
    'text': ['Russian', 'three', '118', 'Kursk', 'the Barents Sea']},
   'status': 'submitted'}],
 'entities-suggestion': {'start': [2, 40, 53, 108, 122],
  'end': [9, 45, 56, 113, 137],
  'label': ['NORP', 'CARDINAL', 'CARDINAL', 'PRODUCT', 'LOC'],
  'text': ['Russian', 'three', '118', 'Kursk', 'the Barents Sea'],
  'score': [None, None, None, None, None]},
 'entities-suggestion-metadata': {'type': None, 'score': None, 'agent': None},
 'external_id': None,
 'metadata': '{}'}

In [None]:
from spacy.tokens import Doc
from spacy.training import Example
from spacy.util import filter_spans


def _doc_from_spans(nlp, text, spans) -> Doc:
    """Tokenize ``text`` and set its entities from Argilla-style span lists.

    Args:
        nlp: spaCy pipeline; only its tokenizer is used (``nlp.make_doc``).
        text: The record text that the offsets refer to.
        spans: Dict with parallel "start", "end" and "label" lists holding
            character offsets and labels, or ``None``/empty when there are no spans.

    Returns:
        A ``Doc`` whose ``ents`` are the given spans.
    """
    doc = nlp.make_doc(text)
    if not spans:
        return doc
    ents = []
    for start, end, label in zip(spans["start"], spans["end"], spans["label"]):
        # "expand" snaps offsets that fall inside a token outwards to token
        # boundaries instead of dropping the span.
        span = doc.char_span(start, end, label=label, alignment_mode="expand")
        if span is not None:
            ents.append(span)
    # Doc.set_ents rejects overlapping spans, so keep a non-overlapping subset.
    doc.set_ents(filter_spans(ents))
    return doc


examples: List[Example] = []
for row in hf_dataset:
    text = row["text"]
    # Each response looks like {"user_id", "value", "status"}; the first
    # submitted response is used as the gold annotation.
    gold = next(
        (
            response["value"]
            for response in (row["entities"] or [])
            if response["status"] == "submitted"
        ),
        None,
    )
    if gold is None:
        # No submitted response: nothing to compare the suggestion against.
        continue
    pred = row["entities-suggestion"]

    # generate Doc with Doc.set_ents from a list of spans for the predicted suggestions
    # ref: https://spacy.io/api/doc#set_ents
    predicted = _doc_from_spans(nlp, text, pred)

    # generate Doc with Doc.set_ents from a list of spans for the gold responses
    reference = _doc_from_spans(nlp, text, gold)

    example = Example(predicted, reference)
    examples.append(example)

In [296]:
from spacy.scorer import Scorer
# Scorer created with its default arguments (no pipeline is passed in).
scorer = Scorer()

In [None]:
# Score the (predicted, reference) pairs collected in `examples`.
scores = scorer.score(examples)

## Load NuNER from Huggingface

In [None]:
text = """
Cristiano Ronaldo dos Santos Aveiro (Portuguese pronunciation: [kɾiʃˈtjɐnu ʁɔˈnaldu]; born 5 February 1985) is a Portuguese professional footballer who plays as a forward for and captains both Saudi Pro League club Al Nassr and the Portugal national team. Widely regarded as one of the greatest players of all time, Ronaldo has won five Ballon d'Or awards,[note 3] a record three UEFA Men's Player of the Year Awards, and four European Golden Shoes, the most by a European player. He has won 33 trophies in his career, including seven league titles, five UEFA Champions Leagues, the UEFA European Championship and the UEFA Nations League. Ronaldo holds the records for most appearances (183), goals (140) and assists (42) in the Champions League, goals in the European Championship (14), international goals (128) and international appearances (205). He is one of the few players to have made over 1,200 professional career appearances, the most by an outfield player, and has scored over 850 official senior career goals for club and country, making him the top goalscorer of all time.
"""
# Entity types passed to the model alongside `text`.
# NOTE(review): `model` is not defined in any cell visible here — confirm the
# loading cell exists and runs before this one.
labels = ["person", "award", "date", "competitions", "teams"]
entities = model.predict_entities(text, labels)
for entity in entities:
    print(f'{entity["text"]} => {entity["label"]}')

In [307]:
text = "A Russian diver has found the bodies of three of the 118 sailors who were killed when the nuclear submarine Kursk sank in the Barents Sea ."

In [312]:
# Run the token-classification pipeline on the sample sentence.
# NOTE(review): the saved output below only shows generic LABEL_0 / LABEL_1 tags
# with scores close to 0.5 — looks like the classification head has no real
# label mapping (possibly untrained); confirm before using these predictions.
ents = token_classifier(text)

In [313]:
ents

[{'entity': 'LABEL_1',
  'score': 0.52182305,
  'index': 1,
  'word': 'A',
  'start': 0,
  'end': 1},
 {'entity': 'LABEL_0',
  'score': 0.59524924,
  'index': 2,
  'word': 'ĠRussian',
  'start': 2,
  'end': 9},
 {'entity': 'LABEL_0',
  'score': 0.6182698,
  'index': 3,
  'word': 'Ġdiver',
  'start': 10,
  'end': 15},
 {'entity': 'LABEL_0',
  'score': 0.51315767,
  'index': 4,
  'word': 'Ġhas',
  'start': 16,
  'end': 19},
 {'entity': 'LABEL_1',
  'score': 0.5029928,
  'index': 5,
  'word': 'Ġfound',
  'start': 20,
  'end': 25},
 {'entity': 'LABEL_1',
  'score': 0.5072447,
  'index': 6,
  'word': 'Ġthe',
  'start': 26,
  'end': 29},
 {'entity': 'LABEL_0',
  'score': 0.56687266,
  'index': 7,
  'word': 'Ġbodies',
  'start': 30,
  'end': 36},
 {'entity': 'LABEL_0',
  'score': 0.5374049,
  'index': 8,
  'word': 'Ġof',
  'start': 37,
  'end': 39},
 {'entity': 'LABEL_0',
  'score': 0.58160913,
  'index': 9,
  'word': 'Ġthree',
  'start': 40,
  'end': 45},
 {'entity': 'LABEL_0',
  'score': 0.

In [None]:
def spacy_to_argilla(
    row: dict,
    nlp: "Language",
    tokens_field: str = "tokens",
    score: "Optional[Callable[[Span], float]]" = None,
) -> "List[SpanValueSchema]":
    """Generate argilla-compatible annotations from a spaCy NER model.

    The tokens stored under ``row[tokens_field]`` are joined with single spaces
    and the resulting text is run through ``nlp``, so the returned character
    offsets refer to that space-joined text.

    Annotations are forward references (strings) so that defining the function
    does not depend on the order in which import cells were executed.

    Args:
        row: Mapping holding a sequence of token strings under ``tokens_field``.
        nlp: A loaded spaCy pipeline *instance* (it is called as ``nlp(text)``),
            not the ``Language`` class.
        tokens_field: Key of ``row`` that holds the tokens.
        score: Optional callable mapping an entity span to its score. When
            omitted, every span gets ``score=None``.

    Returns:
        One ``SpanValueSchema`` per entity in ``doc.ents``.
    """
    text = " ".join(row[tokens_field])
    doc = nlp(text)
    return [
        SpanValueSchema(
            start=ent.start_char,
            end=ent.end_char,
            label=ent.label_,
            score=score(ent) if score else None,
        )
        for ent in doc.ents
    ]


## Add suggestions from KB and LLM

In [None]:
# Build the "dev-ner-ontonotes" dataset from rows 0-99 of the "validation"
# split returned by `load_ontonotes()`, then upload it to the "admin" workspace.
ontonotes = load_ontonotes()
dataset = template_for_token_classification()
records = dataset_to_records(
    ontonotes["validation"].select(range(100)),
    "gold_labels",
)
dataset.add_records(list(records))
dataset.push_to_argilla(
    name="dev-ner-ontonotes",
    workspace="admin",
)