In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules (e.g. custom_parsers, ner_ehr) are picked up before each
# cell runs, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np

In [3]:
from custom_parsers import CustomAnnotationParser, CustomTokenParser, read_record
from ner_ehr.tokenizers import (
    ScispacyTokenizer, 
    SplitTokenizer, 
    NLTKTokenizer,
    _validate_token_idxs)

In [4]:
# Record under inspection. Both paths are derived from a single id so the
# text file and its annotation file cannot drift apart when switching records.
record_id = "100039"
text_fp = f"data/train/{record_id}.txt"
ann_fp = f"data/train/{record_id}.ann"

# Alternative inputs kept for quick experiments; uncomment to override:
# text_fp = "dummy.txt"
# ann_fp = "dummy.ann"

In [5]:
# Load the record at `text_fp` with the project helper `read_record`.
# The result is passed later as `text=` to `_validate_token_idxs`.
# presumably the raw record contents as a single string; confirm in custom_parsers
text = read_record(text_fp)

In [6]:
# Pick the tokenizer shared by the annotation parser and the token parser.
# NLTKTokenizer is the active choice; the two commented lines are the other
# tokenizers imported from ner_ehr.tokenizers; swap one in to compare.
# NOTE(review): validate_token_idxs=True presumably makes the tokenizer check
# token offsets against the source text; confirm in ner_ehr.tokenizers.
# tokenizer = SplitTokenizer(validate_token_idxs=True, splitlines=True)
# tokenizer = ScispacyTokenizer(validate_token_idxs=True) 
tokenizer = NLTKTokenizer(validate_token_idxs=True) 
# Bare expression so the notebook displays the tokenizer's repr.
tokenizer

<ner_ehr.tokenizers.NLTKTokenizer at 0x2b9d88c04898>

In [7]:
# Build the annotation parser around the chosen tokenizer and invoke it
# (the parser instance is called directly) on the annotation file together
# with its record text file.
ann_parser = CustomAnnotationParser(tokenizer=tokenizer)
annotations = ann_parser(annotations_fp=ann_fp, record_fp=text_fp)
# Preview the first five results; the recorded output shows Annotation tuples
# with token, start_idx, end_idx and a B-/I- prefixed tag.
annotations[:5]

[Annotation(token='Prochlorperazine', start_idx=166, end_idx=182, tag='B-Drug'),
 Annotation(token='Heparin', start_idx=185, end_idx=192, tag='B-Drug'),
 Annotation(token='Agents', start_idx=193, end_idx=199, tag='I-Drug'),
 Annotation(token='anthracycline', start_idx=506, end_idx=519, tag='B-Drug'),
 Annotation(token='cardiomyopathy', start_idx=528, end_idx=542, tag='B-ADE')]

In [9]:
# Tokenize the record with the same tokenizer used for the annotations.
# The previously parsed `annotations` are handed to `parse` as well.
# NOTE(review): how `parse` uses the annotations is not visible here; confirm
# in custom_parsers.CustomTokenParser.
token_parser = CustomTokenParser(tokenizer=tokenizer,)
tokens = token_parser.parse(record_fp=text_fp, annotations=annotations)
# Preview the first five results; the recorded output shows Token tuples
# with token, start_idx and end_idx.
tokens[:5]

[Token(token='Admission', start_idx=0, end_idx=9),
 Token(token='Date', start_idx=10, end_idx=14),
 Token(token=':', start_idx=14, end_idx=15),
 Token(token='[', start_idx=17, end_idx=18),
 Token(token='*', start_idx=18, end_idx=19)]

In [12]:
from ner_ehr.data.ehr import EHR
from ner_ehr.data.utils import df_to_namedtuples
# Path to a previously written tokens-with-annotations CSV.
# NOTE(review): this is record 100035, whereas the cells above work on
# record 100039; confirm the different record is intentional.
fp = "out/train/100035-tokens-with-annotations.csv"

# Read the CSV through the project's EHR helper. The result is passed as
# `df=` to df_to_namedtuples below, so it is presumably a pandas DataFrame.
ehr = EHR()
ann_tokens_df = ehr.read_csv_tokens_with_annotations(fp)

In [17]:
# Convert the loaded frame into a sequence of namedtuples named "Annotation".
ann_tokens = df_to_namedtuples(name="Annotation", df=ann_tokens_df)
# Preview the first four; the recorded output shows token, start_idx,
# end_idx and tag fields.
ann_tokens[:4]

[Annotation(token='Admission', start_idx=0, end_idx=9, tag='O'),
 Annotation(token='Date', start_idx=10, end_idx=14, tag='O'),
 Annotation(token=':', start_idx=14, end_idx=15, tag='O'),
 Annotation(token='[', start_idx=17, end_idx=18, tag='O')]

In [48]:
from ner_ehr.data.utils import split_annotated_tokens_in_batches
# Split the annotated tokens into sequences with a requested length of 256,
# then display the length of every resulting sequence.
# NOTE(review): the recorded output contains a 254-long sequence mid-stream,
# so the splitter presumably avoids cutting inside something (an entity?);
# confirm in ner_ehr.data.utils.
seqs = split_annotated_tokens_in_batches(ann_tokens, seq_length=256)
list(map(len, seqs))

[256,
 256,
 256,
 256,
 256,
 256,
 256,
 256,
 256,
 256,
 256,
 254,
 256,
 256,
 256,
 146]

In [56]:
# Scratch check: a Generator created with a fixed seed shuffles a list
# in place, and does so reproducibly from run to run.
rng = np.random.default_rng(seed=42)
l = [8, 1, 2, 3, 4]
rng.shuffle(l)  # mutates `l`; returns None
l

[4, 2, 3, 1, 8]

[4, 2, 3, 1, 8]

In [9]:
# Re-run the tokenizer module's private index validator on the parsed tokens
# against the record text. No output is recorded for this cell.
# NOTE(review): this cell's execution count is lower than those of the cells
# above it, so it was run out of order; it needs `tokens` and `text` from the
# earlier cells. Presumably it raises on an offset mismatch; confirm.
_validate_token_idxs(tokens, text=text)

In [10]:
# import spacy
# from spacy import displacy

# for i, (start, end) in enumerate(para_start_indexes3):
#     print(f"{'='*50}{i}{'='*50}")
#     window = 10
#     idx = (end - start + 2*window)//2
    
#     string = text[start-window:end+window]
    
#     ex = [{"text": string, 
#        "ents": [{"start": idx-1, "end": idx+1, "label": "O"}],
#        "title": None}]
#     html = displacy.render(ex, style="ent", manual=True)