In [2]:
import sys
sys.path.append("../../")

import json
import pickle
from pathlib import Path
from collections import Counter

import pandas as pd

from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
from transformers import AutoTokenizer

from utilities.utils import set_seeds, render_exp_name, load_args, load_pickle, load_json, save_json, get_logger
from utilities.preprocess import augment_extracted_emrs_with_partials, preprocess_extracted_emrs, select_labels_subset, build_label2id_mapping, augment_full_emrs_with_partials
from utilities.data import MedicalDxDataset
from utilities.model import BertNERModel, BiEncoder, encoder_names_mapping
from utilities.trainer import ICDATrainer
from utilities.evaluation import evaluate_dx_model

from finding_extractor import Recognizer, Normalizer, FindingExtractor
from emr_preprocessor import EMRPreprocessor

## Configuration

In [None]:
# Input artifacts (relative paths).
full_emrs_path = "../../datasets/full_emrs.csv"
ner_ckpt_path = "../../models/ner/best_model.pth"
nen_ckpt_path = "../../models/nen/best_model.pth"
ner_tokenizer_path = "../../models/ner/tokenizer/"
nen_tokenizer_path = "../../models/nen/tokenizer/"

# Resources handed to the normalizer.
entity_embed_path = "../../models/nen/entity_embeddings_5454.pt"
cui2name_path = "../../models/nen/smcui2name.json"

# Keys looked up in `encoder_names_mapping` when the models are built.
ner_model_name = "BioLinkBERT"
nen_model_name = "BERT"

# `ner_num_tags` is passed to BertNERModel as `num_tags`; `batch_size` is
# shared by the recognizer and the EMR preprocessor.
ner_num_tags = 5
batch_size = 16

# Prefer the GPU, but fall back to the CPU so that the checkpoints (loaded with
# `map_location=device`) can still be restored on a machine without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"

## Data

In [None]:
# Read the EMR table and keep its `text` column as a plain Python list.
df = pd.read_csv(full_emrs_path)
emrs = df["text"].tolist()

# Lookup resources that are later handed to the Normalizer.
cui2name = load_json(cui2name_path)
entity_embeddings = torch.load(entity_embed_path, map_location=device)

## Model

In [None]:
# Each model has its own tokenizer, restored from the directory saved next to
# its checkpoint (see the *_tokenizer_path settings in the Configuration cell).
ner_tokenizer = AutoTokenizer.from_pretrained(ner_tokenizer_path)
nen_tokenizer = AutoTokenizer.from_pretrained(nen_tokenizer_path)

In [None]:
# Rebuild each architecture, then restore its trained weights from disk.
# The models are only used for inference in this notebook, so they are put in
# eval mode (disables dropout etc.). Recognizer/Normalizer may already do this
# internally; calling eval() here is idempotent and makes it explicit.
# eval() is called *before* load_state_dict so the cell's displayed result is
# still the key-matching report rather than the full module repr.
ner_model = BertNERModel(encoder=encoder_names_mapping[ner_model_name], num_tags=ner_num_tags)
ner_model.eval()
ner_model.load_state_dict(torch.load(ner_ckpt_path, map_location=device))

nen_model = BiEncoder(encoder_name=encoder_names_mapping[nen_model_name])
nen_model.eval()
nen_model.load_state_dict(torch.load(nen_ckpt_path, map_location=device))

## Preprocessing

### Load Preprocessing Models

In [None]:
# Stage 1: the recognizer wraps the NER model and its tokenizer.
recognizer = Recognizer(
    tokenizer=ner_tokenizer,
    model=ner_model,
    device=device,
    batch_size=batch_size,
    verbose=True,
)

# Stage 2: the normalizer wraps the NEN model, its tokenizer, the entity
# embeddings and the CUI-to-name table.
normalizer = Normalizer(
    tokenizer=nen_tokenizer,
    model=nen_model,
    cui2name=cui2name,
    entity_embeddings=entity_embeddings,
    device=device,
    verbose=True,
)

# The extractor composes both stages; the preprocessor drives the extractor.
extractor = FindingExtractor(normalizer=normalizer, recognizer=recognizer)

preprocessor = EMRPreprocessor(batch_size=batch_size, finding_extractor=extractor)

### Extract Spans Containing Clinical Findings

In [None]:
# Run the recognizer over every EMR text.
# NOTE(review): the variable name suggests one span -> polarity mapping per EMR;
# confirm against Recognizer.extract_labeled_spans.
spans2pols_l = recognizer.extract_labeled_spans(emrs)

In [None]:
# Split the recognizer output into two results: the spans and their polarities
# (presumably index-aligned per EMR — TODO confirm). `spans_l` feeds the
# normalizer; `pols_l` is used when building patient states.
spans_l, pols_l = recognizer.extract_spans_and_pols(spans2pols_l)

### Extract Medical Terms (Unnormalized) based on Spans

In [None]:
# mode="lower" produces the *unnormalized* terms for the extracted spans
# (presumably just the lower-cased span text — TODO confirm in Normalizer).
unnorm_terms_l = normalizer.normalize_term_spans(emrs, spans_l, mode="lower")

In [None]:
# Combine the unnormalized terms with their polarities into patient states.
# return_type="tuple" selects the tuple-based output format (exact structure is
# defined by FindingExtractor.build_patient_states — not visible here).
unnorm_patient_states = extractor.build_patient_states(terms_l=unnorm_terms_l, pols_l=pols_l, return_type="tuple")

In [None]:
# Persist the unnormalized patient states as JSON next to the source dataset.
# NOTE(review): the "_t" suffix presumably marks the tuple return type — confirm.
save_json(unnorm_patient_states, "../../datasets/unnorm_patient_states_t.json")

### Extract Medical Terms (Normalized) based on Spans

In [None]:
# mode="umls" produces the *normalized* terms for the same spans
# (presumably mapped to UMLS concepts via the entity embeddings and cui2name
# table given to the Normalizer — TODO confirm).
norm_terms_l = normalizer.normalize_term_spans(emrs, spans_l, mode="umls")

In [None]:
# Combine the normalized terms with the same polarities into patient states,
# using the same tuple-based output format as the unnormalized variant.
norm_patient_states = extractor.build_patient_states(terms_l=norm_terms_l, pols_l=pols_l, return_type="tuple")

In [None]:
# Persist the normalized patient states as JSON next to the source dataset.
# NOTE(review): the "_t" suffix presumably marks the tuple return type — confirm.
save_json(norm_patient_states, "../../datasets/norm_patient_states_t.json")

## Data Statistics

In [3]:
import numpy as np
import pandas as pd
from typing import List

def describe_emrs(emrs: List[str], tokenizer: "AutoTokenizer") -> None:
    """Print summary statistics of EMR lengths in words, tokens and characters.

    Args:
        emrs: The EMR texts to describe.
        tokenizer: Callable applied once to the whole list of texts; its result
            is indexed with ``"input_ids"`` to obtain one token-id sequence per
            text (e.g. a Hugging Face tokenizer). Token counts therefore
            include whatever the tokenizer puts in ``input_ids``.

    Returns:
        None. For each of the three length measures, the output of
        ``pandas.Series.describe()`` is printed.
    """
    # Work on the texts that were passed in (not on notebook-level globals) and
    # materialise them once so any iterable of strings is accepted.
    emrs = list(emrs)

    # words (tokenized by whitespace)
    nwords = np.array([len(emr.split()) for emr in emrs])

    # tokens (as produced by the tokenizer that was passed in)
    tokenized_emrs = tokenizer(emrs)["input_ids"]
    ntokens = np.array([len(token_ids) for token_ids in tokenized_emrs])

    # characters
    nchars = np.array([len(emr) for emr in emrs])

    for item, name in zip([nwords, ntokens, nchars], ["words", "tokens", "chars"]):
        print(f"Number of {name}: {pd.Series(item).describe()}")

### Full EMRs

In [4]:
# Load data (reuse the path from the Configuration cell so that this load and
# the one in the Data section can never point at different files)
df = pd.read_csv(full_emrs_path)

In [None]:
# Dataset size: number of rows and number of distinct values in `labels`.
nsamples, nlabels = df.shape[0], df["labels"].nunique()
print(f"Sample size: {nsamples} / Label size: {nlabels}")

In [None]:
# Length statistics of the full EMR texts, tokenized with the NER tokenizer.
emrs = df["text"].tolist()
describe_emrs(emrs=emrs, tokenizer=ner_tokenizer)