In [1]:
# Enable IPython's autoreload extension; mode 2 lets edits to imported project
# modules be picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from functools import partial
import pandas as pd

In [3]:
# Paired inputs for sample 100035: the annotation file and the raw text it refers to.
ann_fp = "../data/train/100035.ann"
text_fp = "../data/train/100035.txt"

In [4]:
# Read the entire raw text file into memory as one string.
text = None
with open(text_fp, mode="r") as text_file:
    text = text_file.read()

In [5]:
# Read the entire annotation file into memory as one string.
ann = None
with open(ann_fp) as ann_file:
    ann = ann_file.read()

In [6]:
from annotation import parser
from ner_ehr.data.builders import AnnotationBuilder, TokenBuilder
from ner_ehr.tokenizers import ScispacyTokenizer, SplitTokenizer
from ner_ehr.callbacks import AnnotationParser
from ner_ehr.tokenizers import _validate_token_idxs

In [7]:
tokenizer = SplitTokenizer(validate_token_idxs=True)

# Bind the per-document arguments to the imported `parser` function under NEW
# names. Rebinding `parser` itself makes this cell non-idempotent: a second run
# would wrap the already-wrapped object instead of the original function.
parse_annotations = partial(parser, raw_text=text, tokenizer=tokenizer)
annotation_parser = AnnotationParser(parser=parse_annotations)

# Builder that turns an annotation file into a list of Annotation records.
ann_builder = AnnotationBuilder(parser=annotation_parser)

In [8]:
# Build the annotation records from the .ann file, then run the project's index
# validator on them against the raw text (presumably checks start/end offsets
# -- confirm in ner_ehr.tokenizers).
annotations = ann_builder.build(fp=ann_fp)
_validate_token_idxs(text=text, tokens=annotations)

In [9]:
# Preview the first ten annotation records.
annotations[:10]

[Annotation(token='recurrent', start_idx=10179, end_idx=10188, tag='B-Reason'),
 Annotation(token='seizures', start_idx=10189, end_idx=10197, tag='I-Reason'),
 Annotation(token='ativan', start_idx=10227, end_idx=10233, tag='B-Drug'),
 Annotation(token='IM', start_idx=10240, end_idx=10242, tag='B-Route'),
 Annotation(token='Topiramate', start_idx=10455, end_idx=10465, tag='B-Drug'),
 Annotation(token='25mg', start_idx=10466, end_idx=10470, tag='B-Strength'),
 Annotation(token='PO', start_idx=10471, end_idx=10473, tag='B-Route'),
 Annotation(token='BID', start_idx=10474, end_idx=10477, tag='B-Frequency'),
 Annotation(token='PM', start_idx=10495, end_idx=10497, tag='B-Duration'),
 Annotation(token='50mg', start_idx=10515, end_idx=10519, tag='B-Strength')]

In [10]:
# Token builder sharing the same tokenizer instance used for the annotations.
token_builder = TokenBuilder(tokenizer=tokenizer)

In [11]:
# Tokenize the raw text file and preview the first ten tokens.
tokens = token_builder.build(fp=text_fp)
tokens[:10]

[Token(token='Admission', start_idx=0, end_idx=9),
 Token(token='Date:', start_idx=10, end_idx=15),
 Token(token='[**2115-2-22**]', start_idx=17, end_idx=32),
 Token(token='Discharge', start_idx=46, end_idx=55),
 Token(token='Date:', start_idx=56, end_idx=61),
 Token(token='[**2115-3-19**]\n\nDate', start_idx=64, end_idx=85),
 Token(token='of', start_idx=86, end_idx=88),
 Token(token='Birth:', start_idx=89, end_idx=95),
 Token(token='[**2078-8-9**]', start_idx=97, end_idx=111),
 Token(token='Sex:', start_idx=124, end_idx=128)]

In [12]:
# Tabular view of the first ten annotations (token, start_idx, end_idx, tag).
pd.DataFrame(annotations[:10])

Unnamed: 0,token,start_idx,end_idx,tag
0,recurrent,10179,10188,B-Reason
1,seizures,10189,10197,I-Reason
2,ativan,10227,10233,B-Drug
3,IM,10240,10242,B-Route
4,Topiramate,10455,10465,B-Drug
5,25mg,10466,10470,B-Strength
6,PO,10471,10473,B-Route
7,BID,10474,10477,B-Frequency
8,PM,10495,10497,B-Duration
9,50mg,10515,10519,B-Strength


In [13]:
# Tabular view of the first ten tokens (token, start_idx, end_idx).
pd.DataFrame(tokens).head(10)

Unnamed: 0,token,start_idx,end_idx
0,Admission,0,9
1,Date:,10,15
2,[**2115-2-22**],17,32
3,Discharge,46,55
4,Date:,56,61
5,[**2115-3-19**]\n\nDate,64,85
6,of,86,88
7,Birth:,89,95
8,[**2078-8-9**],97,111
9,Sex:,124,128


In [16]:
import re

In [72]:
# Three candidate boundary detectors; each collects (start, end) match spans in `text`.

# Variant 1: ". " + capital letter, rejected whenever a digit occurs anywhere
# later on the same line (the lookahead's `.*` does not cross newlines).
para_start_indexes1 = [m.span() for m in re.finditer(r'(?!.*[\d]+)\. [A-Z]', text)]

# Variant 2: ". " + capital letter. The lookahead is evaluated at the period
# itself, which is never a digit, so it never rejects a match.
para_start_indexes2 = [m.span() for m in re.finditer(r'(?![\d]+)\. [A-Z]', text)]

# Variant 3: runs of two or more consecutive newlines.
para_start_indexes3 = [m.span() for m in re.finditer(r'(\n[\n]+)', text)]

# Match counts per variant, for a quick side-by-side comparison.
len(para_start_indexes1), len(para_start_indexes2), len(para_start_indexes3)

(72, 83, 32)

In [73]:
# Spans found by variant 2 but not by variant 1, kept in variant-2 order.
# Membership is tested against a set so each lookup is O(1) rather than a
# linear scan of the variant-1 list.
_variant1_spans = set(para_start_indexes1)
mismatch_idxs = [idxs for idxs in para_start_indexes2 if idxs not in _variant1_spans]

In [74]:
import spacy
from spacy import displacy

In [80]:
# Number of context characters shown on each side of a match (loop-invariant).
window = 10

# Render every newline-run match with surrounding context, highlighting the match.
for i, (start, end) in enumerate(para_start_indexes3):
    print(f"{'='*50}{i}{'='*50}")

    # Clamp the left edge: a negative slice start would wrap around to the end
    # of `text` and produce the wrong excerpt for matches near the beginning.
    left = max(start - window, 0)
    string = text[left:end + window]

    # Highlight the whole match using offsets relative to the excerpt. A fixed
    # two-character window at the excerpt midpoint misses part of any match
    # longer than two characters and is misaligned when the excerpt is clipped
    # at either end of `text`.
    ex = [{"text": string,
           "ents": [{"start": start - left, "end": end - left, "label": "O"}],
           "title": None}]
    html = displacy.render(ex, style="ent", manual=True)

































































In [81]:
# Show every (start, end) span matched by the newline-run pattern.
para_start_indexes3

[(79, 81),
 (132, 134),
 (151, 153),
 (171, 173),
 (268, 270),
 (387, 390),
 (1974, 1976),
 (2124, 2126),
 (2149, 2151),
 (2174, 2176),
 (2607, 2610),
 (6467, 6469),
 (6497, 6499),
 (6572, 6574),
 (6583, 6585),
 (8104, 8107),
 (8390, 8392),
 (13147, 13149),
 (14508, 14510),
 (14577, 14579),
 (14755, 14757),
 (15399, 15401),
 (15537, 15539),
 (16905, 16908),
 (16944, 16946),
 (17001, 17003),
 (17203, 17206),
 (17419, 17422),
 (17979, 17981),
 (18253, 18255),
 (18561, 18563),
 (18934, 18938)]

In [83]:
# End character offset of the token at position 511.
# NOTE(review): 511 is a hand-picked index; the reason for it is not recorded here.
tokens[511].end_idx

4463

In [69]:
# Show every (start, end) span matched by the second ". " + capital-letter pattern.
para_start_indexes2

[(628, 631),
 (707, 710),
 (788, 791),
 (890, 893),
 (948, 951),
 (1159, 1162),
 (1318, 1321),
 (1473, 1476),
 (1658, 1661),
 (1695, 1698),
 (1854, 1857),
 (2355, 2358),
 (2577, 2580),
 (3750, 3753),
 (3817, 3820),
 (3889, 3892),
 (3993, 3996),
 (4034, 4037),
 (4267, 4270),
 (4362, 4365),
 (4417, 4420),
 (4453, 4456),
 (4531, 4534),
 (4596, 4599),
 (4735, 4738),
 (4785, 4788),
 (4831, 4834),
 (4880, 4883),
 (4977, 4980),
 (5110, 5113),
 (5163, 5166),
 (5243, 5246),
 (5297, 5300),
 (5336, 5339),
 (5469, 5472),
 (5520, 5523),
 (5555, 5558),
 (5595, 5598),
 (5659, 5662),
 (5742, 5745),
 (5747, 5750),
 (5764, 5767),
 (5825, 5828),
 (5850, 5853),
 (5964, 5967),
 (6168, 6171),
 (6217, 6220),
 (6410, 6413),
 (6678, 6681),
 (6782, 6785),
 (6826, 6829),
 (6948, 6951),
 (7065, 7068),
 (7152, 7155),
 (7200, 7203),
 (7226, 7229),
 (7364, 7367),
 (7421, 7424),
 (7493, 7496),
 (7760, 7763),
 (7819, 7822),
 (7860, 7863),
 (8644, 8647),
 (8688, 8691),
 (8782, 8785),
 (8865, 8868),
 (9578, 9581),
 (972

In [None]:
# NOTE(review): `para_start_indexes` is not assigned in any cell visible here
# (only the 1/2/3 variants are) -- confirm which list is meant before running.
para_start_indexes

In [None]:
# Inspect the token at position 256 (hand-picked index).
tokens[256]

In [None]:
# Drains the list from the front: once the loop ends the list is empty and
# `idxs` holds the last element popped. Nothing else is done with each element,
# so this looks like unfinished scratch work.
# NOTE(review): `para_start_indexes` is not assigned in any cell visible here.
while para_start_indexes:
    idxs = para_start_indexes.pop(0)
    

In [None]:
# Re-inspect the list after the draining loop in the preceding cell.
# NOTE(review): `para_start_indexes` is not assigned in any cell visible here.
para_start_indexes

In [None]:
def split(tokens, text, seq_length):
    """Unfinished stub for splitting `tokens` into windows of `seq_length`.

    As written, the function only looks up the element of `tokens` at index
    `seq_length`, discards it, and implicitly returns None.

    Args:
        tokens: indexable sequence; only `tokens[seq_length]` is accessed.
        text: currently unused.
        seq_length: offset, counted from index 0, of the element to look up.

    Raises:
        Whatever `tokens` raises for an out-of-range index (IndexError for
        lists and strings).
    """
    window_start = 0
    window_end = seq_length + window_start

    # Bare lookup: the value is not stored or returned.
    tokens[window_end]
    

In [None]:
# Raw characters 64-85: the span of the token in the preview that contains an
# embedded blank line ('[**2115-3-19**]\n\nDate').
text[64:85]

In [None]:
import re

In [None]:
# Scratch check of the boundary pattern on a toy sentence; prints each match object.
# NOTE(review): the leading `.` is unescaped here, so it matches any character,
# not only a literal period -- confirm whether `\.` was intended.
for i in re.finditer(r'. [A-Z]', "This is a sentence. Where? exactly. Hello"):
    print(i)

In [None]:
# Scratch check: newline followed by a space and a capital letter. The sample
# string ends right after its newline, so this returns an empty list.
re.findall(r'\n [A-Z]', "This is a text. \n")