
Datasets:
- with LOAD_DATASETS=full, argilla loads `gutenberg_spacy-ner-monitoring` for Token Classification with default spaCy predictions ; which is a fork of https://huggingface.co/datasets/gutenberg_time
- default NER dataset in papers is CoNLL-2003 https://huggingface.co/datasets/conll2003
- https://huggingface.co/datasets/DFKI-SLT/few-nerd
- https://huggingface.co/datasets/tner/ontonotes5
- Look for argilla compatible NER datasets with this search: https://huggingface.co/datasets?task_categories=task_categories:token-classification&sort=trending&search=argilla


- ✨ Provide suggested spans with a confidence score, so your team doesn't need to start from scratch.


In [2]:
from typing import List, Tuple, Union, Dict
import types

In [3]:
import argilla as rg

rg.init(api_url="http://localhost:6900", api_key="admin.apikey")



## Push to Huggingface

In [155]:
dataset.push_to_huggingface(
    repo_id="louisguitton/dev-ner-ontonotes",split="validation"
)

Uploading the dataset shards:   0%|                                                                                                                                    | 0/1 [00:00<?, ?it/s]
Creating parquet from Arrow format: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:00<00:00, 411.00ba/s][A
Uploading the dataset shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  2.73it/s]
README.md: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10.0k/10.0k [00:00<00:00, 8.44MB/s]


## Add suggestions to a remote dataset

In [262]:
remote_dataset = rg.FeedbackDataset.from_argilla(
    name="dev-ner-ontonotes",
    workspace="admin",
    with_vectors="all"
)

In [263]:
from typing import Type
from argilla.client.feedback.dataset.remote.dataset import RemoteFeedbackDataset
from argilla.client.feedback.schemas.remote.records import RemoteFeedbackRecord, RemoteSuggestionSchema
from argilla.client.feedback.schemas.suggestions import SuggestionSchema

def labeller(nlp: Type[spacy.language.Language], text: str) -> List[SpanValueSchema]:
    """Generate NER preditions from a spaCy model in the Argilla format."""
    doc = nlp(text)
    return [
        SpanValueSchema(
            start=ent.start_char,
            end=ent.end_char,
            label=ent.label_,
            score=0
        )  for ent in doc.ents
    ]
    
def add_suggestions_to_remote_dataset(remote_dataset: RemoteFeedbackDataset, nlp: Type[spacy.language.Language]) -> None:
    """Add suggestions from a spaCy NER model to a remote instance of an existing Argilla dataset.
    
    ref: https://docs.argilla.io/en/latest/tutorials_and_integrations/tutorials/feedback/end2end_examples/add-suggestions-and-responses-005.html#For-the-RemoteFeedbackDataset"""
    modified_records: List[RemoteFeedbackRecord] = [record for record in remote_dataset.records]
    
    for record in modified_records:
        pred: List[SpanValueSchema] = labeller(nlp, record.fields["text"])
        # passing more than 1 suggestion fails with this error:
        # ValidationApiError: Argilla server returned an error with http status: 422. Error details: {'response': 'Record at 
        # position 0 is not valid because found duplicate suggestions question IDs', 'params': None}
        record.suggestions: Union[Tuple[Union[RemoteSuggestionSchema, SuggestionSchema]], List[Union[RemoteSuggestionSchema, SuggestionSchema]]] = [{
                "question_name": "entities",
                "value": pred,
                "agent": nlp.meta['name']
            }]
    
    remote_dataset.update_records(modified_records)

In [171]:
import spacy

nlp = spacy.load("en_core_web_sm")

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [264]:
add_suggestions_to_remote_dataset(remote_dataset, nlp)

In [270]:
def test_one_suggestion_and_no_response():
    r = remote_dataset.records[2]
    pred: List[SpanValueSchema] = labeller(nlp, r.fields["text"])
    r.responses = []
    r.suggestions = [{
                "question_name": "entities",
                "value": pred,
                "agent": nlp.meta['name']
            }]
    remote_dataset.update_records([r])

In [271]:
test_one_suggestion_and_no_response()

## Compute metrics

In [289]:
from argilla.client.feedback.metrics.utils import get_responses_and_suggestions_per_user

In [291]:
remote_dataset = rg.FeedbackDataset.from_argilla(
    name="ner-lvl2",
    workspace="admin",
    with_vectors="all"
)

# responses_and_suggestions_per_user = get_responses_and_suggestions_per_user(dataset=remote_dataset, question_name="entities")

Extracting responses and suggestions per user: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 861253.39it/s]


In [298]:
hf_dataset = remote_dataset.format_as("datasets")

In [300]:
hf_dataset[0]

{'text': 'A Russian diver has found the bodies of three of the 118 sailors who were killed when the nuclear submarine Kursk sank in the Barents Sea .',
 'entities': [{'user_id': 'fe7c1b6a-5d30-41d5-bb56-6675cfbad12f',
   'value': {'start': [2, 40, 53, 108, 122],
    'end': [9, 45, 56, 113, 137],
    'label': ['NORP', 'CARDINAL', 'CARDINAL', 'PRODUCT', 'LOC'],
    'text': ['Russian', 'three', '118', 'Kursk', 'the Barents Sea']},
   'status': 'submitted'}],
 'entities-suggestion': {'start': [2, 40, 53, 108, 122],
  'end': [9, 45, 56, 113, 137],
  'label': ['NORP', 'CARDINAL', 'CARDINAL', 'PRODUCT', 'LOC'],
  'text': ['Russian', 'three', '118', 'Kursk', 'the Barents Sea'],
  'score': [None, None, None, None, None]},
 'entities-suggestion-metadata': {'type': None, 'score': None, 'agent': None},
 'external_id': None,
 'metadata': '{}'}

In [None]:
from spacy.tokens import Doc
from spacy.training import Example

examples: List[Example] = []
for row in hf_dataset:
    text = row["text"]
    gold = row["entities"]
    pred = row["entities-suggestion"]

    # generate Doc with Doc.set_ents from a list of spans for the predicted suggestions
    # ref: https://spacy.io/api/doc#set_ents

    # generate Doc with Doc.set_ents from a list of spans for the gold responses
    
    example = Example(predicted, reference)
    examples.append(example)

In [296]:
from spacy.scorer import Scorer
scorer = Scorer()

In [None]:
scores = scorer.score(examples)

## Football Articles
### Create (and Delete) empty dataset in Argilla for our task

In [246]:
remote_dataset = rg.FeedbackDataset.from_argilla("football-news", workspace="admin")
remote_dataset.delete()

In [249]:
dataset = rg.FeedbackDataset(
    fields=[
        rg.TextField(name="text")
    ],
    questions=[
        rg.LabelQuestion(
            name="category",
            title="What is the category of the article?",
            labels=[
                "Coach Commentary", "Transfer News", "Match Report",
                "Player Profile", "League Updates", "Injury Updates",
                "Tactical Analysis", "Social Media Reaction",
                "Historical Milestone", "Match Incident"
            ],
            required=False,
            visible_labels=None
        ),
        rg.SpanQuestion(
            name="entities",
            title="Highlight the entities in the content:",
            labels=["Competition", "Team", "Player", "Match", "Transfer"],
            field="text",
            required=True,
            allow_overlapping=True
        )
    ],
    metadata_properties = [
        rg.TermsMetadataProperty(name="link"),
        rg.TermsMetadataProperty(name="source"),
    ],
    vectors_settings=[], # we will add sentence embeddings a posteriori
    guidelines="Please, read the question carefully and try to answer it as accurately as possible."
)

In [250]:
remote_dataset = dataset.push_to_argilla(name="football-news", workspace="admin")

### Add bare records to the remote Argilla dataset

In [251]:
from typing import Iterator
from tqdm import tqdm

def records_generator(filepath: str = '../../data/football-news-articles/final-articles.csv') -> Iterator[rg.FeedbackRecord]:
    """Create Argilla records from the Football News kaggle dataset.
    
    Notes:
    - articles with source="all-football-app" have encoding issues
    """
    dataset: pd.DataFrame = (
        pd.read_csv(filepath)
        .loc[lambda d: d.source.isin(["skysports", "the-analyst"])]
    )

    for index, row in tqdm(dataset.iterrows()):
        record = rg.FeedbackRecord(
            fields={
                "text": "\n".join([row['title'], row['content']])
            },
            metadata={
                "link": row['link'],
                "source": row['source'],
            },
            vectors={},
            responses=[],
            suggestions=[],
            external_id=index,
        )

        yield record

In [252]:
LIMIT = 50

In [253]:
remote_dataset = rg.FeedbackDataset.from_argilla("football-news", workspace="admin")
remote_dataset.add_records(list(records_generator())[:LIMIT])

2157it [00:00, 31841.35it/s]




### Add vectors to records to enable Similarity Search in Argilla

In [254]:
from argilla.client.feedback.integrations.sentencetransformers import SentenceTransformersExtractor

In [255]:
FAST_AND_SMALL = "sentence-transformers/all-MiniLM-L6-v2"

ste = SentenceTransformersExtractor(
    model=FAST_AND_SMALL,
    show_progress=True,
)



In [256]:
remote_dataset = rg.FeedbackDataset.from_argilla("football-news", workspace="admin")

# Update the dataset
remote_dataset = ste.update_dataset(
    dataset=remote_dataset,
    fields=None, # None means using all fields
    update_records=True, # Also, update the records in the dataset
    overwrite=True, # Whether to overwrite existing vectors
)

Batches: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00,  5.43it/s]


## Add suggestions for the text classification question using a zero-shot classification model from HuggingFace
If we had labels, we could train a model that uses sentence embeddings (for more details, see [this argilla tutorial](https://docs.argilla.io/en/v1.28.0/tutorials_and_integrations/tutorials/feedback/labelling-feedback-setfit.html))

```python
from setfit import SetFitModel, SetFitTrainer

model = SetFitModel.from_pretrained("all-MiniLM-L6-v2")
trainer = SetFitTrainer(model=model, train_dataset=train_dataset)
```

But for the cold start, we look for a small model on [HuggingFace](https://huggingface.co/models?pipeline_tag=zero-shot-classification)

In [236]:
from transformers import pipeline

# model_name = "sileod/deberta-v3-small-tasksource-nli"
# model_name = "cointegrated/rubert-tiny-bilingual-nli"
# model_name = "typeform/distilbert-base-uncased-mnli"
model_name = "valhalla/distilbart-mnli-12-3"
classifier = pipeline("zero-shot-classification", model=model_name)

In [237]:
text = "one day I will see the world"
candidate_labels = ['travel', 'cooking', 'dancing']
classifier(text, candidate_labels)

{'sequence': 'one day I will see the world',
 'labels': ['travel', 'dancing', 'cooking'],
 'scores': [0.985403299331665, 0.007384597323834896, 0.007212089374661446]}

In [257]:
remote_dataset = rg.FeedbackDataset.from_argilla("football-news", workspace="admin")

modified_records = [record for record in remote_dataset.records]
for record in tqdm(modified_records):
    x = record.fields["text"]
    # if x: str, then y = {"sequence": 'one day I will see the world', "labels": ['travel', 'dancing', 'cooking'], 'scores': [0.8434, 0.0814, 0.0750]}
    y = classifier(
        sequences=x,
        candidate_labels=remote_dataset.question_by_name("category").labels
    )
    # we overwrite the suggestions
    record.suggestions = [
        {
            "question_name": "category",
            "value": y["labels"][0],
            "agent": model_name,
            "score": y["scores"][0]
        }
    ]

remote_dataset.update_records(modified_records)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [04:53<00:00,  5.88s/it]


Problems:
- accuracy is not great in practice
- inference time is high

Approach: 
- label a few (10s) articles and train a classifier with SetFit 

## Add suggestions for the token classification question using a zero-shot NER model from HuggingFace

In [10]:
remote_dataset = rg.FeedbackDataset.from_argilla("football-news", workspace="admin")

In [46]:
import spacy
from gliner_spacy.pipeline import (  # noqa: F401 because we need to register the factory with spacy
    GlinerSpacy,
)

candidate_labels =[label.value.lower() for label in remote_dataset.question_by_name("entities").labels] # NuZero requires labels to be lower-cased

model_name = "numind/NuZero_token"

nlp = spacy.load("en_core_web_sm", disable=["ner"])
nlp.add_pipe("gliner_spacy", config={
    "gliner_model": model_name,
    "chunk_size": 250,
    "labels": candidate_labels,  
    "style": "ent",
    "threshold": 0.3,
})



<gliner_spacy.pipeline.GlinerSpacy at 0x2d2417550>

In [262]:
from typing import Tuple, Union
from argilla.client.feedback.schemas import SpanValueSchema, SuggestionSchema
from argilla.client.feedback.schemas.remote.records import RemoteSuggestionSchema


modified_records = [record for record in remote_dataset.records]
for record in tqdm(modified_records):
    doc = nlp(record.fields["text"]) 
    # we append the suggestions to the existing suggestions
    record.suggestions: Tuple[Union[dict, SuggestionSchema, RemoteSuggestionSchema], ...]
    record.suggestions += (
        {
            "question_name": "entities",
            "value": [
                SpanValueSchema(
                    start=ent.start_char,
                    end=ent.end_char,
                    label=ent.label_.capitalize(),
                )
                for ent in doc.ents
            ],
            "agent": model_name,
            # "score": ...
        }
        ,)

remote_dataset.update_records(modified_records)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [01:52<00:00,  2.25s/it]




## Craft a better NER model by combining multiple weak supervision annotators using skweak

In [34]:
import re
from typing import Iterator

from skweak import heuristics, gazetteers, generative, utils, base
from spacy.tokens import Doc, Span

### Annotate with Regex

In [30]:
def _match_pattern(doc: Doc, pattern: str, group_idx: int, label: str) -> Iterable[Tuple[int, int, str]]:
    for match in re.finditer(pattern, doc.text):
        start, end = match.span(group_idx)
        span: Span = doc.char_span(start, end, label=label)
        if span:
            yield span.start, span.end, span.label

def football_game_scores_detector(doc: Doc) -> Iterable[Tuple[int, int, str]]:
    _SCORE_RGX = r"\D{2}((\d)\s*[-|to]\s*(\d))\D{2}"  # first group
    return _match_pattern(doc, _SCORE_RGX, 1, "SCORE")

def football_team_formations_detector(doc: Doc) -> Iterable[Tuple[int, int, str]]:
    _FORMATION_RGX = r"(\d)([\s-](\d)){2,3}"  # full match
    return _match_pattern(doc, _FORMATION_RGX, 0, "FORMATION")

In [31]:
# we define labelling functions LFs
lf1 = heuristics.FunctionAnnotator("scores", football_game_scores_detector)
lf2 = heuristics.FunctionAnnotator("formations", football_team_formations_detector)

In [27]:
text = "Burnley 2-2 Fulham: David Datro Fofana's late double earns Clarets dramatic draw"
doc = nlp(text)
utils.display_entities(lf1(doc), "scores")

In [33]:
text = "They played in 4-4-2."
doc = nlp(text)
utils.display_entities(lf2(doc), "formations")

### heuristics.VicinityAnnotator

In [77]:
from typing import Iterable
from spacy.tokens import Doc, Span, Token

def _is_match_token(tok: Token, doc: Doc) -> bool:
    MATCH_CUE_WORDS = ["win", "loss", "score", "against", "tie", "clash", "playoff", "fixture", "game", "draw", "knockout", "defeat", "performance", "derby", "victory", "match"]
    is_fixture_keyword = (
        tok.lemma_ in MATCH_CUE_WORDS
    )
    is_score = any(tok in span for span in doc.spans['scores'])
    return is_fixture_keyword or is_score

def match_cue_detector(doc: Doc) -> Iterable[Tuple[int, int, str]]:
    # high recall approach
    mentions: List[Span] = []
    for tok in filter(lambda tk: _is_match_token(tk, doc), doc):
        # Here we use the token's substree as the extracted span
        # we add the label for visualisation purposes
        word = tok.head
        mention: Span = Span(
            doc, start=word.left_edge.i, end=word.right_edge.i + 1, label="MATCH"
        )
        if mention:
            mentions.append(mention)

    # deduplicate mentions e.g. a score and 'against' created 2 times the same mention
    # some subtrees contain other subtrees => they are duplicate mentions
    # https://spacy.io/api/top-level#util.filter_spans
    # When spans overlap, the (first) longest span is preferred over shorter spans.
    mentions = sorted(set(mentions), key=lambda m: m.start)
    mentions = spacy.util.filter_spans(mentions)

    for mention in mentions:
        yield mention.start, mention.end, "MATCH"

In [78]:
lf3 = heuristics.FunctionAnnotator("match_cues", match_cue_detector)

In [79]:
text = "Scotland's 2-0 shock win over Spain back last March was the last game that Rodri lost."
doc = nlp(text)
utils.display_entities(lf3(lf1(doc)), "match_cues")

### gazetteers.GazetteerAnnotator

### Applying labelling functions

In [None]:
combined = base.CombinedAnnotator()
combined.add_annotator(lf1)
combined.add_annotator(lf3)

In [None]:
docs = list(combined.pipe(docs))