In [1]:
import re
import numpy as np
import string
import pandas as pd
from tqdm import tqdm
from nlstruct.core.text import transform_text, apply_deltas, encode_as_tag, split_into_spans
from nlstruct.core.pandas import merge_with_spans, make_id_from_merged
from nlstruct.core.cache import get_cache
from nlstruct.core.environment import env
from nlstruct.chunking.spacy_tokenization import spacy_tokenize, SPACY_ATTRIBUTES

# from nlstruct.dataloaders.ncbi_disease import load_ncbi_disease
# from nlstruct.dataloaders.bc5cdr import load_bc5cdr
from nlstruct.dataloaders.n2c2_2019_task3 import load_n2c2_2019_task3
from nlstruct.dataloaders.brat import load_from_brat

## Load the dataset

In [2]:
# dataset = docs, mentions, labels, fragments = load_ncbi_disease()[["docs", "mentions", "labels", "fragments"]]
# dataset = docs, mentions, labels, fragments = load_bc5cdr()[["docs", "mentions", "labels", "fragments"]]
dataset = docs, mentions, fragments = load_n2c2_2019_task3()[["docs", "mentions", "fragments"]]
# dataset = docs, mentions, fragments = load_from_brat(env.resource("brat/my_brat_dataset/"))[["docs", "mentions", "fragments"]]
dataset

Dataset(
  (docs):        50 * ('doc_id', 'text', 'split')
  (mentions):  6684 * ('doc_id', 'mention_id', 'label')
  (fragments): 6792 * ('doc_id', 'mention_id', 'fragment_id', 'begin', 'end')
)

## Transform docs
Apply substitutions to the documents and translate spans accordingly

In [3]:
# Define subs as ("pattern", "replacements") list
subs = [
    (re.escape("<????-??-??>"), "MASKEDDATE"),
    (r"(?<=[{}\\])(?![ ])".format(string.punctuation), r" "),
    (r"(?<![ ])(?=[{}\\])".format(string.punctuation), r" "),
    ("(?<=[a-zA-Z])(?=[0-9])", r" "),
    ("(?<=[0-9])(?=[A-Za-z])", r" "),
    ("MASKEDDATE", "<????-??-??>"),
]
# Clean the text / perform substitutions
docs, deltas = transform_text.nocache(docs, *zip(*subs), return_deltas=True)

# Apply transformations to the spans
fragments = apply_deltas(fragments, deltas, on='doc_id')
fragments = fragments.merge(mentions)

## Tokenize the documents, and define fragments as spans of tokens

In [4]:
# Tokenize
tokens = (
    spacy_tokenize.nocache(docs, lang="en_core_web_sm", spacy_attributes=["orth_"])#, spacy_attributes=list((set(SPACY_ATTRIBUTES) - {"norm_"}) | {"lemma_"}),)
    #spm_tokenize.nocache(docs, "/Users/perceval/Development/data/resources/camembert.v0/sentencepiece.bpe.model")
)

# Perform token substitution to match CoNLL guidelines
tokens["token_orth"] = tokens["token_orth"].apply(lambda word: {
    "$": "${dollar}",
    "_": "${underscore}",
    "\t": "${tab}",
    "\n": "${newline}",
    " ": "${space}",
    "#": "${hash}"}.get(word, word))

tokenized_fragments = split_into_spans(fragments, tokens, pos_col="token_idx")

## Deal with overlaps

In [5]:
# Extract overlapping spans
conflicts = merge_with_spans(tokenized_fragments, tokenized_fragments, on=["doc_id", ("begin", "end")], how="outer", suffixes=("", "_other"))

# Assign a cluster (overlapping fragments) to each fragment
fragment_cluster_ids = make_id_from_merged(
    conflicts[["doc_id", "mention_id", "fragment_id"]], 
    conflicts[["doc_id", "mention_id_other", "fragment_id_other"]], 
    apply_on=[(0, tokenized_fragments[["doc_id", "mention_id", "fragment_id"]])])

# Group by cluster and set the biggest fragment to depth 0, next to 1, ...
split_fragments = (tokenized_fragments
 .groupby(fragment_cluster_ids, as_index=False, group_keys=False)
 .apply(lambda group: group.assign(depth=np.argsort(group["begin"]-group["end"]))))

## Encode mentions as tags on tokens

In [27]:
# Encode labels into tag on tokens, with respect to the fragments indices
tagged_tokens = tokens.copy()
tag_scheme="bio" # / "bioul"
label_col_names = []
for depth_i in range(split_fragments["depth"].max()):
    label_col_names.append(f'label-{depth_i}')
    tagged_tokens[f'label-{depth_i}'] = encode_as_tag(tokens[["doc_id", "token_id", "token_idx"]], 
                                                      split_fragments[split_fragments["depth"] == depth_i], 
                                                      tag_scheme=tag_scheme, label_cols=["label"], use_token_idx=True, verbose=1)['label']
tagged_tokens.head()

1569it [00:01, 1348.64it/s]                          


Unnamed: 0,doc_id,token_id,begin,end,token_idx,token_orth,label-0
0,doc_-1040562575731457347_1,0,0,7,0,Hôpital,O
1,doc_-1040562575731457347_1,1,8,13,1,TENON,O
2,doc_-1040562575731457347_1,2,13,14,2,${newline},O
3,doc_-1040562575731457347_1,3,14,20,3,Avenue,O
4,doc_-1040562575731457347_1,4,21,29,4,Patricia,O


## Write the CoNLL files

In [8]:
cache = get_cache("n2c2_conll")
for doc_id, doc_tokens in tqdm(tagged_tokens.groupby(["doc_id"], sort="begin")):
    with open(cache / (doc_id + ".conll"), "w") as file:
        for (token_idx, token, *token_labels) in doc_tokens[["token_idx", "token_orth", *label_col_names]].itertuples(index=False): # iter(zip(*df)) is way faster than df.iterrows()
            print(token_idx, "\t", token, "\t", "\t".join(token_labels), file=file)
for doc_id, doc_text in docs[["doc_id", "text"]].itertuples(index=False):
    with open(cache / (doc_id + ".txt"), "w") as file:
        print(doc_text, file=file)

 26%|██▌       | 13/50 [00:00<00:00, 124.54it/s]

Using cache /Users/perceval/Development/data/cache/n2c2_conll/0eece35480718b75


100%|██████████| 50/50 [00:00<00:00, 130.49it/s]
