Merge pull request #29 from mideind/fix/ifd_output_map_oddities

Modified IFD pos mapping to comply with standard
mideind · Apr 27, 2021 · eb0364a · eb0364a
2 parents e48c19f + 916d9ba
commit eb0364a
Show file tree

Hide file tree

Showing 12 changed files with 124 additions and 16 deletions.
diff --git a/.github/workflows/superlinter.yml b/.github/workflows/superlinter.yml
@@ -30,4 +30,5 @@ jobs:
           PYTHON_PYLINT_CONFIG_FILE: .python-lint
           VALIDATE_JSCPD: false
           VALIDATE_MARKDOWN: false
+          VALIDATE_PYTHON_MYPY: false
           LOG_LEVEL: NOTICE
diff --git a/Dockerfile b/Dockerfile
@@ -9,5 +9,4 @@ COPY ./requirements.txt /app/requirements.txt
 WORKDIR /app
 RUN apt-get update && apt-get install --no-install-recommends -y python-numpy==1.16.5 build-essential==12.9 && rm -rf /var/lib/apt/lists/*
 ENV CFLAGS="-I /usr/local/lib/python3.8/site-packages/numpy/core/include $CFLAGS"
-RUN pip install --no-cache-dir -r requirements.txt
-RUN cython nicenlp/utils/greynir/tree_dist.pyx
+RUN pip install --no-cache-dir -r requirements.txt && cython nicenlp/utils/greynir/tree_dist.pyx
diff --git a/README.md b/README.md
@@ -35,7 +35,7 @@ The `greynirseq` CLI interface can be used to run state-of-the-art POS and NER t
 ❯ pip install greynirseq
 ❯ echo "Systurnar Guðrún og Monique átu einar um jólin á McDonalds ." | greynirseq pos --input -
 
-nvfng nven-s c ns sfg3fþ lvfnsf aff nhfog aff ns pl
+nvfng nven-s c n----s sfg3fþ lvfnsf af nhfog af n----s pl
 ```
 
 #### NER

diff --git a/pyproject.toml b/pyproject.toml
@@ -13,7 +13,7 @@ testpaths = [
 
 [tool.poetry]
 name = "greynirseq"
-version = "0.1"
+version = "0.1.2"
 description = "Natural language processing for Icelandic"
 authors = ["Miðeind ehf <tauganet@mideind.is>"]
 classifiers=[

diff --git a/src/greynirseq/cli/greynirseq.py b/src/greynirseq/cli/greynirseq.py
@@ -74,7 +74,7 @@ def run(self, input: io.IOBase, output: io.IOBase) -> Union[None, io.IOBase]:
 
 class NER(GreynirSeqIO):
     def build_model(self) -> GeneratorHubInterface:
-        model = torch.hub.load("mideind/GreynirSeq:hub", "icebert.ner")
+        model = torch.hub.load("mideind/GreynirSeq:main", "icebert.ner")
         model.to(self.device)
         model.eval()
         return model
@@ -90,7 +90,7 @@ def infer(self, batch) -> List[str]:
 
 class POS(GreynirSeqIO):
     def build_model(self) -> GeneratorHubInterface:
-        model = torch.hub.load("mideind/GreynirSeq:hub", "icebert.pos")
+        model = torch.hub.load("mideind/GreynirSeq:main", "icebert.pos")
         model.to(self.device)
         model.eval()
         return model

diff --git a/src/greynirseq/nicenlp/examples/ner/README.md b/src/greynirseq/nicenlp/examples/ner/README.md
@@ -23,9 +23,9 @@ This will download the model from our servers and return an instance for inferen
 
 ```python
 import torch
-model = torch.hub.load("mideind/GreynirSeq", "icebert.ner")
+model = torch.hub.load("mideind/GreynirSeq:main", "icebert.ner")
 model.eval()
-labels = list(model.predict_labels(["Systurnar Guðrún og Monique átu einar um jólin á McDonalds."])
+labels = list(model.predict_labels(["Systurnar Guðrún og Monique átu einar um jólin á McDonalds ."])
 ```
 which returns the labels `['O', 'B-Person', 'O', 'B-Person', 'O', 'O', 'O', 'O', 'O', 'B-Organization', 'O']`.
 
@@ -41,6 +41,9 @@ model = MultiClassRobertaModel.from_pretrained(IceBERT_NER_PATH, **IceBERT_NER_C
 model.eval()
 ```
 
+Note that the model sets an upper limit on sentence length, so direct inference may crash on long sentences. To run the models on a GPU, simply call `model.to("cuda")`; see the PyTorch documentation for further details. When running on a GPU, we recommend passing the `batch_size` argument to `predict_labels` to speed up inference.
+
+
 ## Training
 
 ### Preprocessing

diff --git a/src/greynirseq/nicenlp/examples/pos/README.md b/src/greynirseq/nicenlp/examples/pos/README.md
@@ -1,9 +1,11 @@
 # POS tagging with IceBERT
 
-This example shows how to train an Icelandic POS tagger with ~98.2% accuracy on the [Tagged Icelandic Corpus](http://www.malfong.is/index.php?lang=en&pg=mim) (MIM) dataset . The output can be configure to use the MIM 2.0 label format which is the default for the CLI.
+This example shows how to train an Icelandic POS tagger with ~98.2% accuracy on the [Tagged Icelandic Corpus](http://www.malfong.is/index.php?lang=en&pg=mim) (MIM) dataset.
+
+The output can be configured to use the MIM 2.0 label format, which is the default for the CLI. Note that the tags **e** (foreign) and **x** (unknown) are excluded from training and labeling since the tagger has no particular problem with labeling those words.
 
 ## Preprocessing
-See `./prep_mim_pos.sh` which is setup to process all data from the MIM pos set and prepare for crossvalidation.
+See `./prep_mim_pos.sh`, which is set up to process all data from the MIM pos set and prepare it for cross-validation.
 
 ## Training
 See `./train.sh` which trains all ten sets for cross-validation.
@@ -18,7 +20,7 @@ Using the CLI is the easiest way of using the tagger, this downloads the necessa
 ❯ pip install greynirseq
 ❯ echo "Systurnar Guðrún og Monique átu einar um jólin á McDonalds ." | greynirseq pos --input -
 
-nvfng nven-s c ns sfg3fþ lvfnsf aff nhfog aff ns pl
+nvfng nven-s c n----s sfg3fþ lvfnsf af nhfog af n----s pl
 ```
 
 It takes a while to load the model so if you need to tag many lines you should provide them all at once.
@@ -29,7 +31,7 @@ This will download the model from our servers and return an instance for inferen
 
 ```python
 import torch
-model = torch.hub.load("mideind/GreynirSeq", "icebert.pos")
+model = torch.hub.load("mideind/GreynirSeq:main", "icebert.pos")
 model.eval()
 labels = model.predict_labels(["Systurnar Guðrún og Monique átu einar um jólin á McDonalds ."])
 ```
@@ -50,12 +52,38 @@ which returns
   ('pl', [])]]
 ```
 
+or
+
+``` python
+
+labels = model.predict_ifd_labels(["Systurnar Guðrún og Monique átu einar um jólin á McDonalds ."])
+```
+
+which returns
+
+``` python
+[['nvfng',
+  'nven-s',
+  'c',
+  'n----s',
+  'sfg3fþ',
+  'lvfnsf',
+  'af',
+  'nhfog',
+  'af',
+  'n----s',
+  'pl']]
+
+```
+
+Note that the model sets an upper limit on sentence length, so direct inference may crash on long sentences. To run the models on a GPU, simply call `model.to("cuda")`; see the PyTorch documentation for further details.
+
 ### Local inference
 
 Point the model class to the checkpoint (any of the splits or an averaged checkpoint) and auxiliary data as e.g.
 
 ```python
-from greynirseq.nicenlp.models.multilabel import MultiLabelRobertaMode
+from greynirseq.nicenlp.models.multilabel import MultiLabelRobertaModel
 from greynirseq.settings import IceBERT_POS_PATH, IceBERT_POS_CONFIG
 model = MultiLabelRobertaModel.from_pretrained(IceBERT_POS_PATH, **IceBERT_POS_CONFIG)
 

diff --git a/src/greynirseq/nicenlp/models/multiclass.py b/src/greynirseq/nicenlp/models/multiclass.py
@@ -22,6 +22,7 @@
 from torch.nn.utils.rnn import pad_sequence
 
 from greynirseq.nicenlp.data.encoding import get_word_beginnings
+from greynirseq.nicenlp.utils.ner_parser import BIOParser
 
 logger = logging.getLogger(__name__)
 
@@ -187,6 +188,7 @@ def predict_labels(self, sentences: List[str], batch_size=1) -> Iterable[List[st
         for ndx in range(0, length, batch_size):
             batch = sentences[ndx : min(ndx + batch_size, length)]
             labels, pred_idx = self._predict_labels(batch)
+            labels = BIOParser.parse(labels)
             yield from labels
 
     def _predict_labels(self, sentences: List[str]) -> Tuple[List[List[str]], torch.Tensor]:

diff --git a/src/greynirseq/nicenlp/models/multilabel.py b/src/greynirseq/nicenlp/models/multilabel.py
@@ -221,11 +221,23 @@ def predict_ifd_labels(self, sentences: List[str]) -> List[List[str]]:
             ifd_labels = []
             for labelset in sentence_labels:
                 cat, feats = labelset
-                idxs = [labdict.symbols.index(label) for label in [cat] + feats]
+                labels_to_map = [cat]
+                if len(feats) == 1 and feats[0] == "pos":
+                    # This label is used as a default for training but implied in mim format
+                    feats = []
+                elif cat == "sl" and "act" in feats:
+                    # Number and tense are not shown for sl act in mim format
+                    feats = [f for f in feats if f not in ["1", "sing", "pres"]]
+                labels_to_map += feats
+                idxs = [labdict.symbols.index(label) for label in labels_to_map]
                 oh = nn.functional.one_hot(torch.tensor(idxs), num_classes=len(labdict.symbols)).sum(dim=0)
                 # Add one since the sep token is not treated as a special token in the label dictionary
                 oh = oh[labdict.nspecial + 1 :]
-                ifd_labels.append(vec2ifd(oh.numpy()))
+                ifd_label = vec2ifd(oh.numpy())
+                if ifd_label == "ns":
+                    # This is to comply with the format
+                    ifd_label = "n----s"
+                ifd_labels.append(ifd_label)
             ifd_labels_batch.append(ifd_labels)
         return ifd_labels_batch
 

diff --git a/src/greynirseq/nicenlp/utils/ner_parser.py b/src/greynirseq/nicenlp/utils/ner_parser.py
@@ -0,0 +1,45 @@
+from typing import List
+
+
+class BIOParser:
+    def __init__(self, labels: List[str]):
+        self.labels = labels
+        self.idx = 0
+        self.last = None
+
+    def _over(self) -> str:
+        """Ensures legal BIO tags, i.e. I-tags have same
+        label as preceding B-tag and B-tags start new
+        spans.
+        """
+        self.idx += 1
+        cur_label = self.labels[self.idx - 1]
+
+        if self.last is None:
+            return cur_label
+
+        if cur_label == "O":
+            return cur_label
+
+        if self.last == "O":
+            # In case the label starts with I
+            return "B" + cur_label[1:]
+
+        _, last_cat = self.last.split("-")
+        cur_head, _ = cur_label.split("-")
+        if cur_head != "B":
+            return f"{cur_head}-{last_cat}"
+        return cur_label
+
+    def over(self) -> str:
+        label = self._over()
+        self.last = label
+        return label
+
+    @classmethod
+    def parse(cls, labels: List[str]) -> List[str]:
+        parser = cls(labels)
+        fixed_labels = []
+        while parser.idx != len(parser.labels):
+            fixed_labels.append(parser.over())
+        return fixed_labels
diff --git a/src/greynirseq/utils/ifd_utils.py b/src/greynirseq/utils/ifd_utils.py
@@ -198,7 +198,7 @@ def groups_to_label(groups):
 assert DIM == len(set(LABELS)), "tag collision"
 
 
-GENDER = {"k": "masc", "v": "fem", "h": "neut", "x": "gender_x", "-": "gender_x"}
+GENDER = {"k": "masc", "v": "fem", "h": "neut", "-": "gender_x"}
 NUMBER = {"e": "sing", "f": "plur"}
 PERSON = {"1": "1", "2": "2", "3": "3"}
 CASE = {"n": "nom", "o": "acc", "þ": "dat", "e": "gen"}

diff --git a/tests/ner/test_bioparser.py b/tests/ner/test_bioparser.py
@@ -0,0 +1,18 @@
+from greynirseq.nicenlp.utils.ner_parser import BIOParser
+
+# First value is the incorrect labels, the second the correct
+TEST_LABELS = [
+    ("B-X B-Y I-Y O", "B-X B-Y I-Y O"),
+    ("O I-a I-a B-u O", "O B-a I-a B-u O"),
+    ("O I-a O I-a B-u O", "O B-a O B-a B-u O"),
+    ("B-p I-r O I-a B-p", "B-p I-p O B-a B-p"),
+    (
+        "O O B-person I-money O O I-time I-time O O B-location I-location",
+        "O O B-person I-person O O B-time I-time O O B-location I-location",
+    ),
+]
+
+
+def test_bioparser():
+    for incorrect, correct in TEST_LABELS:
+        assert BIOParser.parse(incorrect.split()) == correct.split()