# 3. Extracting Football club transfers from news


In [1]:
import argilla as rg

rg.init(api_url="http://localhost:6900", api_key="admin.apikey")



## 3.2. Start NER project and create ontology


In [241]:
remote_dataset = rg.FeedbackDataset.from_argilla("football-news", workspace="admin")
remote_dataset.delete()

In [242]:
dataset = rg.FeedbackDataset(
    fields=[rg.TextField(name="text")],
    questions=[
        rg.LabelQuestion(
            name="category",
            title="What is the category of the article?",
            labels=[
                "Coach Commentary",
                "Transfer News",
                "Match Report",
                "Player Profile",
                "League Updates",
                "Injury Updates",
                "Tactical Analysis",
                "Social Media Reaction",
                "Historical Milestone",
                "Match Incident",
            ],
            required=False,
            visible_labels=None,
        ),
        rg.SpanQuestion(
            name="entities",
            title="Highlight the entities in the content:",
            labels=["Competition", "Team", "Player", "Match", "Transfer"],
            field="text",
            required=True,
            allow_overlapping=True,
        ),
    ],
    metadata_properties=[
        rg.TermsMetadataProperty(name="link"),
        rg.TermsMetadataProperty(name="source"),
    ],
    vectors_settings=[],  # we will add sentence embeddings a posteriori
    guidelines="Please, read the question carefully and try to answer it as accurately as possible.",
)



In [243]:
remote_dataset = dataset.push_to_argilla(name="football-news", workspace="admin")

## 3.3. Import data


In [247]:
from typing import Iterator
from tqdm import tqdm
import pandas as pd


def records_generator(
    filepath: str = "../../data/football-news-articles/final-articles.csv",
) -> Iterator[rg.FeedbackRecord]:
    """Create Argilla records from the Football News kaggle dataset.

    Notes:
    - articles with source="all-football-app" have encoding issues
    """
    dataset: pd.DataFrame = pd.read_csv(filepath).loc[
        lambda d: d.source.isin(["skysports", "the-analyst"])
    ]

    for index, row in tqdm(dataset.iterrows()):
        record = rg.FeedbackRecord(
            fields={"text": "\n".join([row["title"], row["content"]])},
            metadata={
                "link": row["link"],
                "source": row["source"],
            },
            vectors={},
            responses=[],
            suggestions=[],
            external_id=index,
        )

        yield record

In [248]:
LIMIT = 50

In [249]:
remote_dataset = rg.FeedbackDataset.from_argilla("football-news", workspace="admin")
remote_dataset.add_records(list(records_generator())[:LIMIT])

2157it [00:00, 30247.38it/s]




## 3.4. Annotate data and filter by metadata

- Annotate some data by hand (see [here](http://localhost:6900/dataset/d56f31e3-e809-4b16-a86b-ca2414a77497/annotation-mode?page=1&status=submitted))
- Demo overlapping spans with a Transfer of a Player, and a Match of 2 Teams
- Search for records with a Score like `2-0`
- Argilla old dataset types allowed for Regex filtering but not anymore [ref](https://docs.argilla.io/en/latest/practical_guides/filter_dataset.html#regular-expressions)

```python
_SCORE_RGX = r"\D{2}((\d)\s*[-|to]\s*(\d))\D{2}"
```


## 3.5. Find similar records by vectors


In [250]:
from argilla.client.feedback.integrations.sentencetransformers import SentenceTransformersExtractor

In [251]:
FAST_AND_SMALL = "sentence-transformers/all-MiniLM-L6-v2"

ste = SentenceTransformersExtractor(
    model=FAST_AND_SMALL,
    show_progress=True,
)

In [252]:
remote_dataset = rg.FeedbackDataset.from_argilla("football-news", workspace="admin")

# Update the dataset
remote_dataset = ste.update_dataset(
    dataset=remote_dataset,
    fields=None,  # None means using all fields
    update_records=True,  # Also, update the records in the dataset
    overwrite=True,  # Whether to overwrite existing vectors
)

Batches: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00,  3.64it/s]


## 3.6. Adding suggestions with model

### 3.6.1. Add suggestions for the text classification question using a zero-shot classification model from HuggingFace

If we had labels, we could train a model that uses sentence embeddings (for more details, see [this argilla tutorial](https://docs.argilla.io/en/v1.28.0/tutorials_and_integrations/tutorials/feedback/labelling-feedback-setfit.html))

```python
from setfit import SetFitModel, SetFitTrainer

model = SetFitModel.from_pretrained("all-MiniLM-L6-v2")
trainer = SetFitTrainer(model=model, train_dataset=train_dataset)
```

But for the cold start, we look for a small model on [HuggingFace](https://huggingface.co/models?pipeline_tag=zero-shot-classification)


In [253]:
from transformers import pipeline

# model_name = "sileod/deberta-v3-small-tasksource-nli"
# model_name = "cointegrated/rubert-tiny-bilingual-nli"
# model_name = "typeform/distilbert-base-uncased-mnli"
model_name = "valhalla/distilbart-mnli-12-3"
classifier = pipeline("zero-shot-classification", model=model_name)

In [254]:
text = "one day I will see the world"
candidate_labels = ["travel", "cooking", "dancing"]
classifier(text, candidate_labels)

{'sequence': 'one day I will see the world',
 'labels': ['travel', 'dancing', 'cooking'],
 'scores': [0.985403299331665, 0.007384597323834896, 0.007212089374661446]}

In [255]:
remote_dataset = rg.FeedbackDataset.from_argilla("football-news", workspace="admin")

modified_records = [record for record in remote_dataset.records]
for record in tqdm(modified_records):
    x = record.fields["text"]
    # if x: str, then y = {"sequence": 'one day I will see the world', "labels": ['travel', 'dancing', 'cooking'], 'scores': [0.8434, 0.0814, 0.0750]}
    y = classifier(sequences=x, candidate_labels=remote_dataset.question_by_name("category").labels)
    # we overwrite the suggestions
    record.suggestions = [
        {
            "question_name": "category",
            "value": y["labels"][0],
            "agent": model_name,
            "score": y["scores"][0],
        }
    ]

remote_dataset.update_records(modified_records)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [04:54<00:00,  5.90s/it]


Problems:

- accuracy is not great in practice
- inference time is high

Approach:

- label a few (10s) articles and train a classifier with SetFit


### 3.6.2. Add suggestions for the token classification question using a zero-shot NER model from HuggingFace


In [256]:
remote_dataset = rg.FeedbackDataset.from_argilla("football-news", workspace="admin")

In [257]:
import spacy
from gliner_spacy.pipeline import (  # noqa: F401 because we need to register the factory with spacy
    GlinerSpacy,
)

candidate_labels = [
    label.value.lower() for label in remote_dataset.question_by_name("entities").labels
]  # NuZero requires labels to be lower-cased

model_name = "numind/NuZero_token"

nlp = spacy.load("en_core_web_sm", disable=["ner"])
nlp.add_pipe(
    "gliner_spacy",
    config={
        "gliner_model": model_name,
        "chunk_size": 250,
        "labels": candidate_labels,
        "style": "ent",
        "threshold": 0.3,
    },
)



<gliner_spacy.pipeline.GlinerSpacy at 0x3b7960f90>

In [258]:
from typing import Tuple, Union
from argilla.client.feedback.schemas import SpanValueSchema, SuggestionSchema
from argilla.client.feedback.schemas.remote.records import RemoteSuggestionSchema

modified_records = [record for record in remote_dataset.records]
for record in tqdm(modified_records):
    doc = nlp(record.fields["text"])
    # we append the suggestions to the existing suggestions
    record.suggestions: Tuple[Union[dict, SuggestionSchema, RemoteSuggestionSchema], ...]
    record.suggestions += (
        {
            "question_name": "entities",
            "value": [
                SpanValueSchema(
                    start=ent.start_char,
                    end=ent.end_char,
                    label=ent.label_.capitalize(),
                )
                for ent in doc.ents
            ],
            "agent": model_name,
            # "score": ...
        },
    )

remote_dataset.update_records(modified_records)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [01:44<00:00,  2.10s/it]


## 3.7. Craft a better NER model by combining multiple weak supervision annotators using skweak


In [34]:
import re
from typing import Iterator

from skweak import heuristics, gazetteers, generative, utils, base
from spacy.tokens import Doc, Span

### Annotate with Regex


In [157]:
def _match_pattern(
    doc: Doc, pattern: str, group_idx: int, label: str
) -> Iterable[Tuple[int, int, str]]:
    for match in re.finditer(pattern, doc.text):
        start, end = match.span(group_idx)
        span: Span = doc.char_span(start, end, label=label)
        if span:
            yield span.start, span.end, span.label


def football_game_scores_detector(doc: Doc) -> Iterable[Tuple[int, int, str]]:
    _SCORE_RGX = r"\D{2}((\d)\s*[-|to]\s*(\d))\D{2}"  # first group
    return _match_pattern(doc, _SCORE_RGX, 1, "Score")


def football_team_formations_detector(doc: Doc) -> Iterable[Tuple[int, int, str]]:
    _FORMATION_RGX = r"(\d)([\s-](\d)){2,3}"  # full match
    return _match_pattern(doc, _FORMATION_RGX, 0, "Formation")

In [158]:
# we define labelling functions LFs
lf1 = heuristics.FunctionAnnotator("scores", football_game_scores_detector)
lf2 = heuristics.FunctionAnnotator("formations", football_team_formations_detector)

In [159]:
text = "Burnley 2-2 Fulham: David Datro Fofana's late double earns Clarets dramatic draw"
doc = nlp(text)
utils.display_entities(lf1(doc), "scores")

In [160]:
text = "They played in 4-4-2."
doc = nlp(text)
utils.display_entities(lf2(doc), "formations")

### Annotate based on cue words and the syntactic tree


In [162]:
from typing import Iterable
from spacy.tokens import Doc, Span, Token


def _is_match_token(tok: Token, doc: Doc) -> bool:
    MATCH_CUE_WORDS = [
        "win",
        "loss",
        "score",
        "against",
        "tie",
        "clash",
        "playoff",
        "fixture",
        "game",
        "draw",
        "knockout",
        "defeat",
        "performance",
        "derby",
        "victory",
        "match",
    ]
    is_fixture_keyword = tok.lemma_ in MATCH_CUE_WORDS
    is_score = any(tok in span for span in doc.spans["scores"])
    return is_fixture_keyword or is_score


def match_cue_detector(doc: Doc) -> Iterable[Tuple[int, int, str]]:
    # high recall approach
    mentions: List[Span] = []
    for tok in filter(lambda tk: _is_match_token(tk, doc), doc):
        # Here we use the token's substree as the extracted span
        # we add the label for visualisation purposes
        word = tok.head
        mention: Span = Span(doc, start=word.left_edge.i, end=word.right_edge.i + 1, label="Match")
        if mention:
            mentions.append(mention)

    # deduplicate mentions e.g. a score and 'against' created 2 times the same mention
    # some subtrees contain other subtrees => they are duplicate mentions
    # https://spacy.io/api/top-level#util.filter_spans
    # When spans overlap, the (first) longest span is preferred over shorter spans.
    mentions = sorted(set(mentions), key=lambda m: m.start)
    mentions = spacy.util.filter_spans(mentions)

    for mention in mentions:
        yield mention.start, mention.end, "Match"

In [163]:
lf3 = heuristics.FunctionAnnotator("match_cues", match_cue_detector)

In [164]:
text = "Scotland's 2-0 shock win over Spain back last March was the last game that Rodri lost."
doc = nlp(text)
utils.display_entities(lf3(lf1(doc)), "match_cues")

### Annotate with keywords from WikiData with SPARQL

We will query Wikidata for competition names. This is just to demonstrate the principle.
We could then query for Players, Teams, Club Staff, Stadiums etc....


In [82]:
# https://rdflib.github.io/sparqlwrapper/
%pip install sparqlwrapper

Collecting sparqlwrapper
  Downloading SPARQLWrapper-2.0.0-py3-none-any.whl.metadata (2.0 kB)
Downloading SPARQLWrapper-2.0.0-py3-none-any.whl (28 kB)
Installing collected packages: sparqlwrapper
Successfully installed sparqlwrapper-2.0.0
Note: you may need to restart the kernel to use updated packages.


In [167]:
import sys
from SPARQLWrapper import SPARQLWrapper, JSON

endpoint_url = "https://query.wikidata.org/sparql"

query = """SELECT ?item ?itemLabel ?countryLabel
WHERE
{
  ?item wdt:P31/wdt:P279* wd:Q15991303 ;
        wdt:P279 wd:Q3270632 ;
        wdt:P17 ?country .
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
}
ORDER BY ?countryLabel"""


def get_results(endpoint_url, query):
    user_agent = "Argilla talk/%s.%s (https://guitton.co/)" % (
        sys.version_info[0],
        sys.version_info[1],
    )
    sparql = SPARQLWrapper(endpoint_url, agent=user_agent)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    return sparql.query().convert()


results = get_results(endpoint_url, query)

COMPETITIONS = [
    result["itemLabel"]["value"].split(" ") for result in results["results"]["bindings"]
]

In [168]:
trie = gazetteers.Trie(COMPETITIONS)
lf4 = gazetteers.GazetteerAnnotator("competitions", {"Competition": trie})

In [169]:
text = "Ligue 1 should be worth more than Premier League."
doc = nlp(text)
utils.display_entities(lf4(doc), "competitions")

In [170]:
query = """# Ref: https://guitton.co/posts/wikidata/
SELECT ?item ?itemLabel ?itemAltLabel
WHERE
{
  ?item wdt:P31/wdt:P279* wd:Q847017;
        wdt:P118 ?league.
  VALUES ?league {
    wd:Q324867 # LaLiga
    wd:Q13394  # Ligue 1
    wd:Q9448  # Premier League
    wd:Q19510 # Championship
    wd:Q182994  # Primeira Liga
    wd:Q82595  # Bundesliga
    wd:Q15804  # Serie A
    wd:Q167541  # Eredivisie
    wd:Q216022 # Jupiler
  }.
  SERVICE wikibase:label { bd:serviceParam wikibase:language "nl". }
}"""

results = get_results(endpoint_url, query)

TEAMS = [result["itemLabel"]["value"].split(" ") for result in results["results"]["bindings"]]

In [171]:
trie = gazetteers.Trie(TEAMS)
lf5 = gazetteers.GazetteerAnnotator("teams", {"Team": trie})

### Annotate with a custom spacy pipeline


In [132]:
from spacy.language import Language
from skweak.spacy import ModelAnnotator, LabelMapper


class SpacyAnnotator(ModelAnnotator):
    def __init__(
        self,
        name: str,
        nlp: Language,
        disabled: List[str] = ["parser", "tagger", "lemmatizer", "attribute_ruler"],
    ):
        super(ModelAnnotator, self).__init__(name)
        self.model = nlp
        self.model.disable_pipes(disabled)

In [172]:
import spacy
from gliner_spacy.pipeline import (  # noqa: F401 because we need to register the factory with spacy
    GlinerSpacy,
)

candidate_labels = ["competition", "team", "player", "match", "transfer"]

model_name = "numind/NuZero_token"

nlp = spacy.load("en_core_web_sm", disable=["ner"])
nlp.add_pipe(
    "gliner_spacy",
    config={
        "gliner_model": model_name,
        "chunk_size": 250,
        "labels": candidate_labels,
        "style": "ent",
        "threshold": 0.3,
    },
)

<gliner_spacy.pipeline.GlinerSpacy at 0x159cdf450>

In [173]:
lf6 = SpacyAnnotator("nuner", nlp, disabled=["tagger", "lemmatizer", "attribute_ruler"])

In [174]:
text = "Ligue 1 should be worth more than Premier League."
doc = nlp(text)
utils.display_entities(lf6(doc), "nuner")

In [177]:
nlp = spacy.load("en_core_web_sm", exclude=["ner"])
nlp.add_pipe(
    "dbpedia_spotlight",
    config={
        "dbpedia_rest_endpoint": "http://localhost:2222/rest",
        "overwrite_ents": True,
    },
)

<spacy_dbpedia_spotlight.entity_linker.EntityLinker at 0x431668f90>

In [178]:
lf7 = SpacyAnnotator("dbpedia", nlp, disabled=["tagger", "lemmatizer", "attribute_ruler"])

In [179]:
text = "Ligue 1 should be worth more than Premier League."
doc = nlp(text)
utils.display_entities(lf7(doc), "dbpedia")

In [191]:
lf8 = ModelAnnotator("spacy", "en_core_web_sm")

### Applying labelling functions


In [193]:
combined = base.CombinedAnnotator()
combined.add_annotator(lf1)  # score regex
combined.add_annotator(lf3)  # match cues
combined.add_annotator(lf4)  # gazetteer competitions
combined.add_annotator(lf5)  # gazetteer teams
combined.add_annotator(lf7)  # dbpedia
# combined.add_annotator(lf6)  # nuner
combined.add_annotator(lf8)  # spacy

<skweak.base.CombinedAnnotator at 0x3c23f3250>

In [195]:
remote_dataset = rg.FeedbackDataset.from_argilla("football-news", workspace="admin")

nlp = spacy.load("en_core_web_sm", disable=["ner"])
docs = nlp.pipe([record.fields["text"] for record in remote_dataset.records])

In [196]:
docs = list(combined.pipe(docs))

batch_size 128


### Aggregation

Once the labelling functions have been applied, we must then aggregate their results, to get a single annotation for each document. This is done in `skweak` by estimating a generative model. Aggregating the labels can be done in a few lines of code:


In [197]:
import skweak

# , "Other Person", "Other Organisation"
unified_model = skweak.aggregation.HMM(
    "hmm", ["Competition", "Team", "Player", "Match", "Transfer"]
)
unified_model.add_underspecified_label("DBPEDIA_ENT", ["Competition", "Team", "Player"])
unified_model.add_underspecified_label("PER", ["Player", "Other Person"])
unified_model.add_underspecified_label("ORG", ["Team", "Other Organisation"])

In [198]:
unified_model.fit(docs)

Starting iteration 1
Finished E-step with 50 documents
Starting iteration 2


         1  -51990.16316583             +nan


Finished E-step with 50 documents
Starting iteration 3


         2  -39517.19027457  +12472.97289125


Finished E-step with 50 documents
Starting iteration 4


         3  -39463.19821623     +53.99205835


Finished E-step with 50 documents


         4  -39459.83574987      +3.36246636


In [199]:
# Saving the model to a file
unified_model.save("../../data/hmm_football_small.pkl")

### Using the (hopefully) better NER model to add suggestions in Argilla


In [200]:
remote_dataset = rg.FeedbackDataset.from_argilla("football-news", workspace="admin")

In [205]:
from typing import Tuple, Union
from argilla.client.feedback.schemas import SpanValueSchema, SuggestionSchema
from argilla.client.feedback.schemas.remote.records import RemoteSuggestionSchema
from tqdm import tqdm

nlp = spacy.load("en_core_web_sm", disable=["ner"])

modified_records = [record for record in remote_dataset.records]
for record in tqdm(modified_records):
    doc = unified_model(nlp(record.fields["text"]))
    record.suggestions: Tuple[Union[dict, SuggestionSchema, RemoteSuggestionSchema], ...]
    record.suggestions = [
        {
            "question_name": "entities",
            "value": [
                SpanValueSchema(
                    start=ent.start_char,
                    end=ent.end_char,
                    label=ent.label_.capitalize(),
                )
                for ent in doc.ents
            ],
            "agent": "skweak_model",
            # "score": ...
        }
    ]

remote_dataset.update_records(modified_records)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:06<00:00,  8.23it/s]




In [222]:
[k for k in docs[0].spans.keys()]

['scores', 'match_cues', 'competitions', 'teams', 'dbpedia', 'spacy']

In [239]:
skweak.utils.display_entities(docs[1], "spacy")

In [207]:
record = modified_records[0]
doc = nlp(record.fields["text"])

In [211]:
doc2 = unified_model(combined(doc))

In [240]:
skweak.utils.display_entities(unified_model(docs[1]), "hmm")

## 3.8 Compute metrics


In [289]:
from argilla.client.feedback.metrics.utils import get_responses_and_suggestions_per_user

In [291]:
remote_dataset = rg.FeedbackDataset.from_argilla(
    name="ner-lvl2", workspace="admin", with_vectors="all"
)

# responses_and_suggestions_per_user = get_responses_and_suggestions_per_user(dataset=remote_dataset, question_name="entities")

Extracting responses and suggestions per user: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 861253.39it/s]


In [298]:
hf_dataset = remote_dataset.format_as("datasets")

In [300]:
hf_dataset[0]

{'text': 'A Russian diver has found the bodies of three of the 118 sailors who were killed when the nuclear submarine Kursk sank in the Barents Sea .',
 'entities': [{'user_id': 'fe7c1b6a-5d30-41d5-bb56-6675cfbad12f',
   'value': {'start': [2, 40, 53, 108, 122],
    'end': [9, 45, 56, 113, 137],
    'label': ['NORP', 'CARDINAL', 'CARDINAL', 'PRODUCT', 'LOC'],
    'text': ['Russian', 'three', '118', 'Kursk', 'the Barents Sea']},
   'status': 'submitted'}],
 'entities-suggestion': {'start': [2, 40, 53, 108, 122],
  'end': [9, 45, 56, 113, 137],
  'label': ['NORP', 'CARDINAL', 'CARDINAL', 'PRODUCT', 'LOC'],
  'text': ['Russian', 'three', '118', 'Kursk', 'the Barents Sea'],
  'score': [None, None, None, None, None]},
 'entities-suggestion-metadata': {'type': None, 'score': None, 'agent': None},
 'external_id': None,
 'metadata': '{}'}

In [None]:
from spacy.tokens import Doc
from spacy.training import Example

examples: List[Example] = []
for row in hf_dataset:
    text = row["text"]
    gold = row["entities"]
    pred = row["entities-suggestion"]

    # generate Doc with Doc.set_ents from a list of spans for the predicted suggestions
    # ref: https://spacy.io/api/doc#set_ents

    # generate Doc with Doc.set_ents from a list of spans for the gold responses

    example = Example(predicted, reference)
    examples.append(example)

In [296]:
from spacy.scorer import Scorer

scorer = Scorer()

In [None]:
scores = scorer.score(examples)

## 3.10 Push to Huggingface


In [155]:
dataset.push_to_huggingface(repo_id="louisguitton/dev-ner-ontonotes", split="validation")

Uploading the dataset shards:   0%|                                                                                                                                    | 0/1 [00:00<?, ?it/s]
Creating parquet from Arrow format: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:00<00:00, 411.00ba/s][A
Uploading the dataset shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  2.73it/s]
README.md: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10.0k/10.0k [00:00<00:00, 8.44MB/s]
