In [1]:
# Enable IPython's autoreload extension in mode 2, so that imported modules are
# reloaded before each cell runs and edits to project code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
import re
import string
import pandas as pd
from tqdm import tqdm
from nlstruct.core.text import transform_text, apply_deltas, encode_as_tag
from nlstruct.chunking.spacy_tokenization import spacy_tokenize, SPACY_ATTRIBUTES
from nlstruct.core.pandas import normalize_vocabularies
from nlstruct.core.cache import get_cache
from nlstruct.dataloaders.ncbi_disease import load_ncbi_disease

# Load the dataset
docs, mentions, labels, fragments = load_ncbi_disease()

# Clean the text / perform substitutions.
# Each entry is a (regex pattern, replacement) pair; the list is unzipped below
# into one sequence of patterns and one sequence of replacements.
# NOTE(review): the mask / unmask pair only makes sense if transform_text applies
# the substitutions in list order -- confirm against its implementation.
subs = [
    # Swap the "<????-??-??>" placeholder for a punctuation-free token.
    (re.escape("<????-??-??>"), "MASKEDDATE"),
    # Insert a space after a punctuation character that is not followed by one.
    # Inside the character class, the "\" of string.punctuation escapes the "]"
    # that follows it, so an explicit "\\" is appended to match backslashes too.
    (r"(?<=[{}\\])(?![ ])".format(string.punctuation), r" "),
    # Insert a space before a punctuation character that is not preceded by one.
    (r"(?<![ ])(?=[{}\\])".format(string.punctuation), r" "),
    # Insert a space at every letter -> digit boundary...
    ("(?<=[a-zA-Z])(?=[0-9])", r" "),
    # ...and at every digit -> letter boundary.
    ("(?<=[0-9])(?=[A-Za-z])", r" "),
    # Put the original placeholder back in place of the temporary token.
    ("MASKEDDATE", "<????-??-??>"),
]
docs, deltas = transform_text.nocache(docs, *zip(*subs), return_deltas=True)

# Apply transformations to the spans
fragments = apply_deltas(fragments, deltas, on='doc_id')
# NOTE(review): no `on=` is given, so pandas joins on every column the two frames
# share -- confirm that this is the intended key set.
fragments = fragments.merge(mentions)

# TOKENIZE
tokens = spacy_tokenize.nocache(docs, lang="en_core_web_sm", spacy_attributes=["orth_"])

# Replace tokens that consist solely of one of these characters by a "${name}"
# escape; every other token is kept unchanged.
# The table is built once here instead of once per token inside the lambda.
token_escapes = {
    "$": "${dollar}",
    "_": "${underscore}",
    "\t": "${tab}",
    "\n": "${newline}",
    " ": "${space}",
    "#": "${hash}",
}
tokens["token_orth"] = tokens["token_orth"].apply(lambda word: token_escapes.get(word, word))

Using cache /Users/perceval/Development/data/cache/nlstruct/dataloaders/ncbi_disease/load_ncbi_disease/0eece35480718b75
Loading /Users/perceval/Development/data/cache/nlstruct/dataloaders/ncbi_disease/load_ncbi_disease/0eece35480718b75/output.pkl... Done


In [4]:
# Show the first lines of the raw NCBI training corpus file stored in the dataloader cache.
# NOTE(review): absolute, machine-specific path -- this cell only runs where that cache directory exists.
!head /Users/perceval/Development/data/cache/nlstruct/dataloaders/ncbi_disease/load_ncbi_disease/0eece35480718b75/raw/NCBItrainset_corpus.txt


10192393|t|A common human skin tumour is caused by activating mutations in beta-catenin.
10192393|a|WNT signalling orchestrates a number of developmental programs. In response to this stimulus, cytoplasmic beta-catenin (encoded by CTNNB1) is stabilized, enabling downstream transcriptional activation by members of the LEF/TCF family. One of the target genes for beta-catenin/TCF encodes c-MYC, explaining why constitutive activation of the WNT pathway can lead to cancer, particularly in the colon. Most colon cancers arise from mutations in the gene encoding adenomatous polyposis coli (APC), a protein required for ubiquitin-mediated degradation of beta-catenin, but a small percentage of colon and some other cancers harbour beta-catenin-stabilizing mutations. Recently, we discovered that transgenic mice expressing an activated beta-catenin are predisposed to developing skin tumours resembling pilomatricomas. Given that the skin of these adult mice also exhibits signs of de novo hair-follic

In [5]:
# Vocabularies have to exist before calling encode_as_tag, because every
# "tagified" label must come with its category.
(tokens, fragments), vocabularies = normalize_vocabularies([tokens, fragments])

# Turn the fragment labels into BIO tags on the tokens, aligned through the
# fragments indices.
if fragments is not None:
    tokens = encode_as_tag(
        tokens,
        fragments,
        tag_scheme="bio",
        use_token_idx=False,
        verbose=1,
    )

6882it [00:08, 796.29it/s]                           


In [46]:
# Export the corpus: one "<doc_id>.conll" file with the tagged tokens and one
# "<doc_id>.txt" file with the transformed text for every document.
cache = get_cache("ncbi_conll")

# Group on the bare column name: with a one-element list (["doc_id"]) pandas >= 2.0
# yields 1-tuple keys when iterating, which breaks `doc_id + ".conll"`.
# `sort` is a boolean flag that orders the group keys; the former sort="begin" was
# merely truthy and never sorted by a column.
# NOTE(review): rows keep their existing order inside each group -- if ordering by
# token position was intended, sort `tokens` explicitly before grouping.
for doc_id, doc_tokens in tqdm(tokens.groupby("doc_id", sort=True)):
    # Explicit encoding so the export does not depend on the platform's locale.
    with open(cache / (doc_id + ".conll"), "w", encoding="utf-8") as file:
        # itertuples is much faster than df.iterrows()
        rows = doc_tokens[["token_idx", "token_orth", "category"]].itertuples(index=False)
        for token_idx, token, label in rows:
            # NOTE(review): print's default sep=" " puts a space on each side of
            # every tab ("idx \t token \t label"). Kept as-is because the readers of
            # these files are not visible here; pass sep="\t" and drop the "\t"
            # arguments if strict tab-separated columns are expected.
            print(token_idx, "\t", token, "\t", label, file=file)

for doc_id, doc_text in docs[["doc_id", "text"]].itertuples(index=False):
    with open(cache / (doc_id + ".txt"), "w", encoding="utf-8") as file:
        print(doc_text, file=file)

  2%|▏         | 19/792 [00:00<00:04, 179.58it/s]

Using cache /Users/perceval/Development/data/cache/ncbi_conll/0eece35480718b75


100%|██████████| 792/792 [00:02<00:00, 300.99it/s]
