In [2]:
%load_ext autoreload
%autoreload 2

from metanno.recipes.ner import NERApp

# Pastel palette for the annotation labels, expressed as CSS "rgb(r,g,b)" strings.
colors = [
    f"rgb({red},{green},{blue})"
    for red, green, blue in (
        (255, 200, 206),
        (210, 236, 247),
        (211, 242, 206),
        (242, 242, 206),
        (231, 210, 247),
        (252, 215, 216),
        (251, 243, 219),
        (250, 231, 212),
        (250, 212, 229),
    )
]

ModuleNotFoundError: No module named 'astunparse'

In [2]:
import spacy
nlp = spacy.blank("eds")

# Pre-processing components first, then the entity extraction pipelines,
# registered in exactly this order.
for component_name in (
    "eds.normalizer",
    "eds.sentences",
    "eds.covid",
    "eds.dates",
    "eds.measures",
    "eds.charlson",
    "eds.SOFA",
    "eds.emergency.priority",
):
    nlp.add_pipe(component_name)

# Extra regex-based matcher producing "custom" entities, matched on the NORM attribute
nlp.add_pipe(
    "eds.matcher",
    config={
        "regex": {"custom": r"texte|asthmatique|difficult[ée]s?\srespiratoires?"},
        "attr": "NORM",
    },
)

# Kept as the last expression of the cell so the added component is displayed
nlp.add_pipe("eds.negation")

<edsnlp.pipelines.qualifiers.negation.negation.Negation at 0x7f8e68d9d610>

In [3]:
# Sample French documents fed to the pipeline: a multi-section clinical-style
# note repeated five times, followed by a single short negated sentence.
texts = [
    """Motif :
Le patient est admis le 29 août pour des difficultés respiratoires.

Antécédents familiaux :
Le père du patient n'est pas asthmatique.

HISTOIRE DE LA MALADIE
Le patient dit avoir de la toux depuis trois jours. Elle a empiré jusqu'à nécessiter un passage aux urgences.
A noter deux petits kystes bénins de 1 et 2cm biopsiés en 2005.

Priorité: 2 (établie par l'IAO à l'entrée)

Conclusion
Possible infection au coronavirus
""" * 5,
    """Ceci n'est pas mon texte.""",
]
# Run the pipeline over the sample texts and convert every spaCy document into
# a plain dict: {"id", "text", "entities"}.
docs = []
for doc_idx, doc in enumerate(nlp.pipe(texts)):
    entry = {"id": f"{doc_idx}", "text": str(doc), "entities": []}
    # A dedicated index name is used for entities: the original inner loop
    # rebound `i` and thereby shadowed the document index.
    for ent_idx, ent in enumerate(doc.ents):
        entry["entities"].append({
            "id": ent_idx,
            "begin": ent.start_char,
            "end": ent.end_char,
            "label": ent.label_,
            "negation": ent._.negation,
            "suggestions": {
                "concept": ["C9876"],
                "label": [""]
            }
        })
    docs.append(entry)

In [3]:
import os
import glob
import pathlib
import json

In [28]:
class EDSDataConnector:
    """
    File-system connector: every document is a raw ``.txt`` file under ``path``,
    optionally accompanied by a ``<filename>.json`` file holding its annotations.

    Document ids are the text file paths relative to ``path``.
    """
    # TODO, fix this with Proxy states
    def __init__(self, path):
        """Remember the root directory and create it if it does not exist yet."""
        self.path = pathlib.Path(path)
        os.makedirs(path, exist_ok=True)

    def load_one(self, filename):
        """
        Load a single document.

        Parameters
        ----------
        filename: str or pathlib.Path
            Path of the text file, relative to the connector root

        Returns
        -------
        dict or None
            The document (``id``, ``text`` and the content of the JSON file if
            there is one, else an empty ``entities`` list), or ``None`` when the
            text file cannot be read.
        """
        text_filepath = self.path.joinpath(filename)
        try:
            with open(text_filepath) as f:
                text = f.read()
        except (OSError, UnicodeDecodeError):
            # Best effort: an unreadable text file is skipped by `load`.
            # Only I/O and decoding failures are swallowed, so that
            # KeyboardInterrupt and programming errors still surface.
            return None

        # Open the annotations if any
        json_filepath = self.path.joinpath(f"{filename}.json")
        if os.path.exists(json_filepath):
            with open(json_filepath) as f:
                doc = json.load(f)
            # The id and text always come from the text file, not from the JSON copy
            doc["id"] = str(filename)
            doc["text"] = text
        else:
            doc = {"id": str(filename), "text": text, "entities": []}

        return doc

    def load(self):
        """
        Load every ``*.txt`` document found recursively under the root, in sorted
        path order, skipping anything located in a ``.ipynb_checkpoints`` folder.

        Returns
        -------
        list of dict
        """
        docs = []
        for filename in sorted(self.path.rglob("*.txt")):
            if ".ipynb_checkpoints" in str(filename):
                continue
            doc = self.load_one(filename.relative_to(self.path))
            if doc is not None:
                docs.append(doc)
        return docs

    def save_one(self, doc):
        """Write ``doc`` as JSON to ``<root>/<doc id>.json``."""
        json_filepath = self.path.joinpath(f"{doc['id']}.json")
        # Ids may contain sub-directories that do not exist yet
        json_filepath.parent.mkdir(parents=True, exist_ok=True)
        with open(json_filepath, "w") as f:
            json.dump(doc, f)

    def save(self, docs):
        """Save every document of ``docs`` with `save_one`."""
        for doc in docs:
            self.save_one(doc)
            
class SpacySuggester:
    """
    Pre-annotate documents with the entities found by a spaCy-like pipeline.

    For every entity of the pipeline output two spans are produced: the entity
    itself and a "scope" span starting at the entity and extending
    ``scope_width`` characters after its end.
    """

    def __init__(self, model, scope_width=50):
        """
        Parameters
        ----------
        model: callable
            Called with the document text; must return an object exposing `ents`
        scope_width: int
            Number of characters added after the entity end to build its scope span
        """
        self.model = model
        self.scope_width = scope_width

    def __call__(self, doc):
        """
        Return a copy of ``doc`` whose "entities" are the suggested spans.

        The input dict is not modified; its other keys are copied as is.
        """
        text = doc["text"]
        spacy_doc = self.model(text)
        entities = []
        for i, ent in enumerate(spacy_doc.ents):
            scope_id = f"{i}-scope"
            entities.append({
                "id": f"metanno-{doc['id']}-{i}",
                "begin": ent.start_char,
                "end": ent.end_char,
                "label": ent.label_,
                "negation": ent._.negation,
                "suggestions": {
                    "concept": ["C1234"],
                    "label": [],
                },
                "scope": scope_id,
            })
            entities.append({
                "id": scope_id,
                "begin": ent.start_char,
                # Clamp the scope so that it never runs past the end of the text
                "end": min(ent.end_char + self.scope_width, len(text)),
                "label": "scope",
            })
        return {**doc, "entities": entities}

import glob
import os
import re
from collections import defaultdict
from itertools import chain
from typing import Any, Dict, Iterator, List, Tuple, Union

import pandas as pd
from joblib import Parallel, delayed
from loguru import logger
from spacy import Language
from spacy.tokens import Doc, Span
from spacy.util import filter_spans
from tqdm import tqdm

# Patterns for the different kinds of lines of a brat standoff annotation file
REGEX_ENTITY = re.compile(r'^(T\d+)\t([^\s]+)([^\t]+)\t(.*)$')
REGEX_NOTE = re.compile(r'^(#\d+)\tAnnotatorNotes ([^\t]+)\t(.*)$')
REGEX_STATUS = re.compile(r'^(#\d+)\tStatus ([^\t]+)\t(.*)$')
REGEX_RELATION = re.compile(r'^(R\d+)\t([^\s]+) Arg1:([^\s]+) Arg2:([^\s]+)')
REGEX_ATTRIBUTE = re.compile(r'^([AM]\d+)\t(.+)$')
REGEX_EVENT = re.compile(r'^(E\d+)\t(.+)$')
REGEX_EVENT_PART = re.compile(r'([^\s]+):([TE]\d+)')


def load_from_brat(path: str, merge_spaced_fragments: bool = True) -> Dict:
    """
    Load one brat document: the text file and every matching annotation file.

    Adapted from https://github.com/percevalw/nlstruct/blob/master/nlstruct/datasets/brat.py

    Parameters
    ----------
    path: str
        Path of the brat text file (.txt, not .ann)
    merge_spaced_fragments: bool
        Merge the fragments of an entity that brat split because it spanned an end of line

    Returns
    -------
    Dict
        Always contains "id" (the given path), "text" and "entities". When at
        least one annotation file exists, "relations" and "events" are present
        too, as well as "seen" if a Status line was found.

    Raises
    ------
    Exception
        If a line of an annotation file cannot be parsed; the message gives the
        line index, the annotation file and the offending line.
    """
    # Annotation files share the text file stem and have an extension starting
    # with ".a" (e.g. ".ann"). Only the trailing ".txt" is stripped and the stem
    # is escaped, so directory names and glob characters are taken literally.
    # Sorting makes the parsing order deterministic.
    txt_path = str(path)
    stem = txt_path[:-len(".txt")] if txt_path.endswith(".txt") else txt_path
    ann_filenames = sorted(glob.glob(glob.escape(stem) + ".a*", recursive=True))

    entities = {}
    relations = []
    events = {}

    with open(path) as f:
        text = f.read()

    note_id = path

    doc = {
        "id": note_id,
        "text": text,
    }

    if not ann_filenames:
        doc["entities"] = []
        return doc

    for ann_file in ann_filenames:
        with open(ann_file) as f:
            for line_idx, line in enumerate(f):
                try:
                    if line.startswith('T'):
                        match = REGEX_ENTITY.match(line)
                        if match is None:
                            raise ValueError(f'File {ann_file}, unrecognized Brat line {line}')
                        ann_id = match.group(1)
                        span = match.group(3)
                        entity = entities[ann_id] = {
                            "id": ann_id,
                            "label": match.group(2),
                            "fragments": [],
                            "attributes": [],
                            "comments": [],
                        }
                        last_end = None
                        begins_ends = sorted([(int(s.split()[0]), int(s.split()[1])) for s in span.split(';')])

                        for begin, end in begins_ends:
                            # If merge_spaced_fragments, merge two fragments that are only separated by whitespace
                            # (brat automatically creates multiple fragments for an entity that spans over more
                            # than one line)
                            if merge_spaced_fragments and last_end is not None and len(text[last_end:begin].strip()) == 0:
                                entity["fragments"][-1]["end"] = end
                            else:
                                entity["begin"] = begin
                                entity["fragments"].append({
                                    "begin": begin,
                                    "end": end,
                                })
                            # Also executed after a merge, so that the entity end and the next comparison
                            # both account for the extended fragment
                            entity["end"] = end
                            last_end = end
                    elif line.startswith('A') or line.startswith('M'):
                        match = REGEX_ATTRIBUTE.match(line)
                        if match is None:
                            raise ValueError(f'File {ann_file}, unrecognized Brat line {line}')
                        parts = match.group(2).split(" ")
                        if len(parts) >= 3:
                            # Everything after the target id is the value, even if it contains spaces
                            entity, entity_id = parts[:2]
                            value = " ".join(parts[2:])
                        elif len(parts) == 2:
                            entity, entity_id = parts
                            value = None
                        else:
                            raise ValueError(f'File {ann_file}, unrecognized Brat line {line}')
                        (entities[entity_id] if entity_id.startswith('T') else events[entity_id])["attributes"].append({
                            "label": entity,
                            "value": value,
                        })
                    elif line.startswith('R'):
                        match = REGEX_RELATION.match(line)
                        if match is None:
                            raise ValueError(f'File {ann_file}, unrecognized Brat line {line}')
                        relations.append({
                            "id": match.group(1),
                            "relation_label": match.group(2),
                            "from_id": match.group(3),
                            "to_id": match.group(4),
                        })
                    elif line.startswith('E'):
                        match = REGEX_EVENT.match(line)
                        if match is None:
                            raise ValueError(f'File {ann_file}, unrecognized Brat line {line}')
                        ann_id = match.group(1)
                        arguments = [
                            {"entity_id": argument.group(2), "label": argument.group(1)}
                            for argument in REGEX_EVENT_PART.finditer(match.group(2))
                        ]
                        events[ann_id] = {
                            "id": ann_id,
                            "attributes": [],
                            "arguments": arguments,
                        }
                    elif line.startswith('#'):
                        # Status lines flag the document itself as seen or not
                        match = REGEX_STATUS.match(line)
                        if match:
                            doc["seen"] = match.group(3) == "CHECKED"
                            continue

                        # Any other "#" line must be an annotator note attached to an entity
                        match = REGEX_NOTE.match(line)
                        if match is None:
                            raise ValueError(f'File {ann_file}, unrecognized Brat line {line}')
                        entities[match.group(2)]["comments"].append({
                            "comment": match.group(3),
                        })
                except Exception as exc:
                    # Report the file actually being parsed and keep the original cause
                    raise Exception(
                        "Could not parse line {} from {}: {}".format(line_idx, ann_file, repr(line))
                    ) from exc

    doc.update({
        "entities": list(entities.values()),
        "relations": relations,
        "events": list(events.values()),
    })
    return doc


def export_to_brat(doc, txt_filename, overwrite_txt=False, overwrite_ann=False):
    """
    Write a document as a brat text file and its standoff ``.ann`` file.

    Parameters
    ----------
    doc: dict
        Document with a "text" and, optionally, "entities" and "relations".
        An entity has an "id", a "label", either "begin"/"end" offsets or a list
        of "fragments", and optional "attributes". A relation has "from_id",
        "to_id" and a "label" (or "relation_label", as produced by `load_from_brat`).
    txt_filename: str or path-like
        Path of the text file; the annotations go next to it, with the trailing
        ".txt" replaced by ".ann" (".ann" is appended if there is no ".txt" suffix)
    overwrite_txt: bool
        Rewrite the text file even if it already exists
    overwrite_ann: bool
        Rewrite the annotation file even if it already exists
    """
    txt_filename = str(txt_filename)
    # `dirname` is empty for a bare filename: nothing must be created then
    # (splitting on "/" used to create a directory named like the text file).
    parent_dir = os.path.dirname(txt_filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    if overwrite_txt or not os.path.exists(txt_filename):
        with open(txt_filename, "w") as f:
            f.write(doc["text"])

    # Only the extension is swapped, so the result can never be the text file itself
    if txt_filename.endswith(".txt"):
        ann_filename = txt_filename[:-len(".txt")] + ".ann"
    else:
        ann_filename = txt_filename + ".ann"

    if os.path.exists(ann_filename) and not overwrite_ann:
        return

    attribute_idx = 1
    # Maps document entity ids to brat ids (T1, T2, ...) allocated on first use
    entities_ids = defaultdict(lambda: "T" + str(len(entities_ids) + 1))
    with open(ann_filename, "w") as f:
        for entity in doc.get("entities", ()):
            brat_entity_id = entities_ids[entity["id"]]
            # Explicit begin/end offsets take precedence over fragments. The
            # entity dict itself is left untouched.
            if "begin" in entity and "end" in entity:
                fragments = [{"begin": entity["begin"], "end": entity["end"]}]
            else:
                fragments = entity["fragments"]

            spans = []
            fragment_texts = []
            for fragment in sorted(fragments, key=lambda frag: frag["begin"]):
                fragment_text = doc["text"][fragment["begin"]:fragment["end"]]
                fragment_texts.append(fragment_text)
                # One brat span per line covered by the fragment
                idx = fragment["begin"]
                for part in fragment_text.split("\n"):
                    begin = idx
                    end = idx + len(part)
                    idx = end + 1  # skip the newline
                    if begin != end:
                        spans.append((begin, end))
            print("{}\t{} {}\t{}".format(
                brat_entity_id,
                str(entity["label"]),
                ";".join(" ".join(map(str, span)) for span in spans),
                " ".join(fragment_texts).replace("\n", " ")), file=f)

            for attribute in entity.get("attributes", ()):
                value = attribute.get("value")
                if value is True:
                    # Binary attribute: written without a value
                    print("A{}\t{} {}".format(
                        attribute_idx,
                        str(attribute["label"]),
                        brat_entity_id), file=f)
                elif value is not None and value != "":
                    print("A{}\t{} {} {}".format(
                        attribute_idx,
                        str(attribute["label"]),
                        brat_entity_id,
                        value), file=f)
                attribute_idx += 1

        for i, relation in enumerate(doc.get("relations", ())):
            entity_from = entities_ids[relation["from_id"]]
            entity_to = entities_ids[relation["to_id"]]
            # Documents read by `load_from_brat` store the label under "relation_label"
            label = relation["label"] if "label" in relation else relation["relation_label"]
            print("R{}\t{} Arg1:{} Arg2:{}\t".format(
                i + 1,
                str(label),
                entity_from,
                entity_to), file=f)


class BratDataConnector:
    """
    Connector reading and writing documents stored as brat files
    (text file plus standoff annotations) under a root directory.
    """
    # TODO, fix this with Proxy states
    def __init__(self, path):
        """Store the root directory as a `pathlib.Path`, creating it when missing."""
        os.makedirs(path, exist_ok=True)
        self.path = pathlib.Path(path)

    def load(self):
        """
        Load every ``*.txt`` document found recursively under the root, in sorted
        path order, ignoring Jupyter checkpoint copies.

        Each document id is the text file path relative to the root.
        """
        relative_paths = (
            txt_path.relative_to(self.path)
            for txt_path in sorted(self.path.rglob("*.txt"))
            if ".ipynb_checkpoints" not in str(txt_path)
        )
        loaded = []
        for relative_path in relative_paths:
            doc = load_from_brat(str(self.path.joinpath(relative_path)))
            doc["id"] = str(relative_path)
            loaded.append(doc)
        return loaded

    def save_one(self, doc):
        """Export ``doc`` to ``<root>/<doc id>``, always rewriting its annotation file."""
        export_to_brat(doc, self.path / doc["id"], overwrite_ann=True)

    def save(self, docs):
        """Export every document of ``docs``, one after the other."""
        for doc in docs:
            self.save_one(doc)

# Pre-annotation helper wrapping the `nlp` pipeline
suggester = SpacySuggester(nlp)
# Documents are read from / written to brat files under ./dataset
# (the connector creates the directory if it does not exist)
data = BratDataConnector("dataset")

In [1]:
from collections import Counter

# Labels are discovered from the annotations already present in the dataset
labels = sorted({ent["label"] for doc in data.load() for ent in doc["entities"]})

# Give each label the first character of its (lower-cased) name that no
# previous label already uses as its key.
# NOTE(review): these keys may collide with the attribute keys "m", "x" and "t"
# below — confirm whether metanno shares one key namespace for both.
keys = {}
for label in labels:
    # Falls back to an empty key instead of raising StopIteration when every
    # character of the label is taken.
    # NOTE(review): assumes metanno accepts an empty key — TODO confirm.
    keys[label] = next((letter for letter in label.lower() if letter not in keys.values()), "")

app = NERApp(
    data=data,
    suggester=suggester,
    scheme={
        "labels": [
            # The palette is cycled so that more labels than colors do not raise an IndexError
            {"name": label, "color": colors[i % len(colors)], "key": keys[label]}
            for i, label in enumerate(labels)
        ],
        "attributes": [{
            "name": "modality",
            "kind": "text",
            "key": "m",
            "color": "lightgrey",
            "choices": ["factual", "negated", "conditional", "counterindication", "uncertain", "suggested"]
        }, {
            "name": "experiencer",
            "kind": "text",
            "key": "x",
            "color": "lightgrey",
            "choices": ["self", "family", "other"],
        }, {
            "name": "time",
            "kind": "text",
            "key": "t",
            "color": "lightgrey",
            "choices": ["present", "past", "future"],
        }, {
            "name": "concept",
            "kind": "text",
            "color": "lightgrey",
            # Placeholder concept codes C0000 ... C9999
            "choices": [f"C{n:04}" for n in range(10000)],
        }],
    },
)

NameError: name 'data' is not defined

In [50]:
# Application with a fixed, hand-written scheme: four labels, each with its own
# key and a color taken from the palette, and no attribute.
app = NERApp(
    data=data,
    suggester=suggester,
    scheme={
        "labels": [
            {"name": label_name, "key": label_key, "color": colors[palette_idx]}
            for label_name, label_key, palette_idx in (
                ("nom", "a", 3),
                ("covid", "c", 1),
                ("custom", "u", 2),
                ("eds.emergency.priority", "e", 0),
            )
            # Disabled for now, a full-height "scope" label:
            # {"name": "scope", "key": "", "color": "#ffedfa", "border": "#ffedfa",
            #  "alpha": 1.0, "shape": "fullHeight"}
        ],
        # The modality / experiencer / time / concept attributes are disabled here
        "attributes": [],
    },
)


Transcrypt (TM) Python to JavaScript Small Sane Subset Transpiler Version 3.9.0
Copyright (C) Geatec Engineering. License: Apache 2.0


Saving target code in: /Users/perceval/Development/metanno/examples/__target__/test.js



In [18]:
from metanno import manager

In [8]:
from metanno.manager import AppManager
# Debugging aid: peek at the private `_state` attribute of a manager instance
AppManager()._state

{}

In [None]:
# Dump the application state to test.json for inspection.
# `json.dump` needs the target file object as its second argument; without it
# the call raises a TypeError.
with open('test.json', 'w') as f:
    json.dump(app.state, f)

In [12]:
# Display a span editor view of the app
# NOTE(review): the "text" argument presumably names the view — confirm against metanno
app.span_editor("text")

<metanno.views.SpanEditor object at 0x7fe84aa1bd60>

In [20]:
# Keep a handle on the "docs" table editor so that it can be displayed in a later cell
res = app.table_editor("docs")

In [21]:
# Display the table editor stored in `res`
res

<metanno.views.TableEditor object at 0x7f9066189790>

In [20]:
# Display a table editor view for "entities"
app.table_editor("entities")

<metanno.views.TableEditor object at 0x7f90ad4806a0>

In [11]:
# Inspect the entities held in the app state for a single document
app.state["docs"]["doc1-Copy1.txt"]["entities"]

{0: {'id': 0,
  'begin': 49,
  'end': 74,
  'label': 'custom',
  'negation': False,
  'suggestions': {'concept': ['C1234'], 'label': []},
  'scope': '0-scope'},
 '0-scope': {'id': '0-scope', 'begin': 49, 'end': 124, 'label': 'scope'},
 1: {'id': 1,
  'begin': 130,
  'end': 141,
  'label': 'custom',
  'negation': True,
  'suggestions': {'concept': ['C1234'], 'label': []},
  'scope': '1-scope'},
 '1-scope': {'id': '1-scope', 'begin': 130, 'end': 191, 'label': 'scope'},
 2: {'id': 2,
  'begin': 342,
  'end': 350,
  'label': 'eds.emergency.priority',
  'negation': False,
  'suggestions': {'concept': ['C1234'], 'label': []},
  'scope': '2-scope'},
 '2-scope': {'id': '2-scope', 'begin': 342, 'end': 400, 'label': 'scope'},
 3: {'id': 3,
  'begin': 406,
  'end': 430,
  'label': 'covid',
  'negation': False,
  'suggestions': {'concept': ['C1234'], 'label': []},
  'scope': '3-scope'},
 '3-scope': {'id': '3-scope', 'begin': 406, 'end': 480, 'label': 'scope'}}