In [None]:
# Reload imported modules automatically before each cell executes, so edits to
# the library code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import re
import string
import pandas as pd
from tqdm import tqdm
from nlstruct.core.environment import env
from nlstruct.core.text import transform_text, apply_deltas, encode_as_tag
from nlstruct.chunking.spacy_tokenization import spacy_tokenize, SPACY_ATTRIBUTES
from nlstruct.core.pandas import normalize_vocabularies
from nlstruct.core.cache import get_cache
from nlstruct.dataloaders.brat import load_from_brat

In [None]:
# Load the BRAT annotations as separate frames
brat_dataset = load_from_brat(env.resource("brat/my_brat_dataset/"))
docs, mentions, labels, fragments = brat_dataset[["docs", "mentions", "labels", "fragments"]]

# Ordered (pattern, replacement) regex substitutions used to clean the text.
# The masked-date marker is first swapped for a plain word so that the punctuation
# rules cannot split it, and it is swapped back by the last rule.
masked_date = "<????-??-??>"
date_placeholder = "MASKEDDATE"
# Character class matching any ASCII punctuation character, backslash included:
# the "\]" coming from string.punctuation escapes the bracket, hence the extra "\\".
punct_class = r"[{}\\]".format(string.punctuation)
subs = [
    (re.escape(masked_date), date_placeholder),
    (r"(?<=" + punct_class + r")(?![ ])", r" "),  # space after punctuation not followed by one
    (r"(?<![ ])(?=" + punct_class + r")", r" "),  # space before punctuation not preceded by one
    ("(?<=[a-zA-Z])(?=[0-9])", r" "),             # space between a letter and a following digit
    ("(?<=[0-9])(?=[A-Za-z])", r" "),             # space between a digit and a following letter
    (date_placeholder, masked_date),
]
patterns, replacements = zip(*subs)
docs, deltas = transform_text.nocache(docs, patterns, replacements, return_deltas=True)

# Shift the annotated spans with the deltas returned by the text transformation,
# then attach the mention columns to each fragment
fragments = apply_deltas(fragments, deltas, on='doc_id').merge(mentions)

# Tokenize the transformed documents
tokens = spacy_tokenize.nocache(docs, lang="fr_core_news_sm", spacy_attributes=["orth_"])

# Replace tokens that consist of exactly one special character with a readable
# placeholder; every other token is kept unchanged.
token_escapes = {
    "$": "${dollar}",
    "_": "${underscore}",
    "\t": "${tab}",
    "\n": "${newline}",
    " ": "${space}",
    "#": "${hash}",
}
tokens["token_orth"] = tokens["token_orth"].apply(lambda word: token_escapes.get(word, word))

In [None]:
# Build the vocabularies for both frames first: encode_as_tag needs every
# "tagified" label to already have its category.
normalized_frames, vocabularies = normalize_vocabularies([tokens, fragments])
tokens, fragments = normalized_frames

# Turn the fragment annotations into BIO tags on the tokens, with respect to the
# fragments indices (skipped when there are no fragments).
if fragments is not None:
    tokens = encode_as_tag(
        tokens,
        fragments,
        tag_scheme="bio",
        use_token_idx=False,
        verbose=1,
    )

In [None]:
# Export every document to the "brat_conll" cache directory:
#   <doc_id>.conll  one token per line: token index, token text, tag
#   <doc_id>.txt    the transformed document text
cache = get_cache("brat_conll")

# Group on the bare column name rather than a one-element list: with a list grouper,
# pandas >= 2.0 yields 1-tuples as group keys, which breaks `doc_id + ".conll"`.
# `sort` is a boolean that orders the group keys; the previous sort="begin" was only
# truthy and never sorted rows by a "begin" column.
# NOTE(review): rows inside a group keep the order they have in `tokens` - confirm
# that the tokens are already in document order.
for doc_id, doc_tokens in tqdm(tokens.groupby("doc_id", sort=True)):
    # Explicit UTF-8 so the output does not depend on the platform default encoding.
    with open(cache / (doc_id + ".conll"), "w", encoding="utf-8") as file:
        # itertuples(index=False) yields plain (token_idx, token_orth, label) rows and
        # is much faster than df.iterrows()
        for token_idx, token, label in doc_tokens[["token_idx", "token_orth", "label"]].itertuples(index=False):
            # NOTE(review): print's default separator puts a space on each side of
            # the tabs ("idx \t token \t label"); the format is kept as-is because
            # downstream readers may rely on it - confirm before switching to sep="\t".
            print(token_idx, "\t", token, "\t", label, file=file)

# Write the transformed text of each document next to its .conll file
# (print appends one trailing newline to the text).
for doc_id, doc_text in docs[["doc_id", "text"]].itertuples(index=False):
    with open(cache / (doc_id + ".txt"), "w", encoding="utf-8") as file:
        print(doc_text, file=file)