# Build HuggingFace dataset from TEI Publisher XMLs

This Jupyter notebook downloads and exports all XML documents in the `annotate` collection of the currently running TEI Publisher instance. It additionally generates a shuffled Hugging Face dataset, which is stored as a Parquet file. HF datasets can be loaded using the training notebook `../training/training.py`.

In [None]:
!python -m spacy download de_dep_news_trf

In [None]:
import warnings

from IPython.display import HTML, display

# Let elements with the `.container` CSS class span the full width of the page.
full_width_css = "<style>.container { width:100% !important; }</style>"
display(HTML(full_width_css))

# Suppress FutureWarning messages for the rest of the session.
warnings.simplefilter("ignore", FutureWarning)

In [None]:
import os
from itertools import chain
from typing import Collection, Dict, List, Sequence, Union

import numpy as np
import requests
import spacy
from datasets import ClassLabel, Dataset
from datasets import Sequence as Sequence_DS
from sklearn.utils import shuffle
from spacy.training import biluo_to_iob, offsets_to_biluo_tags
from span_to_dataset import spacy_to_hf
from tqdm.notebook import tqdm

# Make spaCy run on the GPU. NOTE(review): per the spaCy documentation, require_gpu()
# raises an error when no GPU is available, so this cell is expected to fail on
# CPU-only machines — confirm for the installed spaCy version.
spacy.require_gpu()

We define the mapping from IOB labels to indices, as well as a list of NEs to exclude from the annotations.

In [None]:
# From https://huggingface.co/stefan-it/span-marker-gelectra-large-germeval14/blob/main/config.json
# IOB labels in index order: the position in this list is the label's index.
_iob_labels = [
    "O",
    "B-LOC", "I-LOC", "B-LOCderiv", "I-LOCderiv", "B-LOCpart", "I-LOCpart",
    "B-ORG", "I-ORG", "B-ORGderiv", "I-ORGderiv", "B-ORGpart", "I-ORGpart",
    "B-OTH", "I-OTH", "B-OTHderiv", "I-OTHderiv", "B-OTHpart", "I-OTHpart",
    "B-PER", "I-PER", "B-PERderiv", "I-PERderiv", "B-PERpart", "I-PERpart",
]

# Index (as a string key, like in the referenced config) -> IOB label.
idx_to_label = {str(position): tag for position, tag in enumerate(_iob_labels)}

# Inverse mapping: IOB label -> integer index.
label_to_idx = {tag: int(position) for position, tag in idx_to_label.items()}

# Named entities that are annotated inconsistently in the dataset and therefore
# have to be excluded. Entries are listed from longest to shortest.
exclusion_list = [
    "Bezirksanwaltschaften", "Kantonsregierungen",
    "Verwaltungsräten", "Verwaltungsräte",
    "stadträtlichen", "Zweckverbandes", "Verwaltungsrat",
    "Gemeinderates",
    "Kommission", "Verwaltung",
    "Kantonen",
    "Rat",
]

We obtain both paragraphs and span annotations, as well as XMLs from the TEI Publisher API.

In [None]:
# Base URL of the TEI Publisher API and the local folder the XML documents are written to.
api_base = "http://localhost:8080/exist/apps/tei-publisher/api"
export_dir = "export_xml"

# Fetch the training records of the `annotate` collection and drop entries
# whose text is empty or whitespace only.
response = requests.get(f"{api_base}/nlp/data/annotate")
response.raise_for_status()  # fail early instead of parsing an error page as JSON
dataset = [t for t in response.json() if t["text"].strip()]

# Unique source documents referenced by the records; sorted so the download
# order is the same on every run.
files = sorted({p["source"].replace("annotate/", "") for p in dataset})
os.makedirs(export_dir, exist_ok=True)

# Header sent with every document request (identical for all files, so built once).
headers = {"Content-type": "application/xml"}

for file_name in tqdm(files):
    response = requests.get(
        f"{api_base}/document/annotate/{file_name}",
        headers=headers,
    )
    # Do not store an HTTP error body as if it were the XML document.
    response.raise_for_status()
    xml = response.content.decode("utf-8")

    # Write with an explicit UTF-8 encoding: the platform default encoding may differ
    # and would corrupt or reject non-ASCII characters in the decoded document.
    with open(os.path.join(export_dir, file_name), "w", encoding="utf-8") as xml_file:
        xml_file.write(xml)

We shuffle the dataset.

In [None]:
# Shuffle the records with a fixed seed so the order is reproducible.
# `dataset` is passed as a plain list and is not reordered by this statement;
# the shuffled result is bound to `dataset_arr`.
dataset_arr = shuffle(dataset, random_state=42)

We convert the span annotations to a Hugging Face dataset and export it to a Parquet file.

In [None]:
# Convert the shuffled records into a dataset using the label -> index mapping,
# then store the result as a Parquet file in the working directory.
parquet_path = 'dataset.parquet'
ds = spacy_to_hf(dataset_arr, label_to_idx)
ds.to_parquet(parquet_path)