# Notebook for Calculating Data Statistics

## Import Packages and Utility Functions

In [None]:
import sys
sys.path.append("../")

import numpy as np
import pandas as pd
from typing import List

from transformers import AutoTokenizer

from utilities.utils import load_json
from utilities.preprocess import preprocess_patient_state_tuples

def describe_emrs(emrs: List[str], tokenizer: AutoTokenizer) -> None:
    """Print summary statistics of text lengths in words, tokens, and characters.

    For each of the three length measures, the result of
    ``pandas.Series.describe`` is printed to stdout, prefixed with
    ``"Number of <name>: "``.

    Args:
        emrs: Texts to measure.
        tokenizer: Callable invoked once on the whole list of texts; its
            result must expose the per-text token ids under the
            ``"input_ids"`` key. A text's token count is the length of its
            id list, so it includes any extra ids the tokenizer inserts
            (presumably special tokens for Hugging Face tokenizers --
            confirm if that matters for the reported numbers).

    Returns:
        None. The statistics are only printed.
    """
    # Words: split on runs of whitespace.
    nwords = np.array([len(emr.split()) for emr in emrs])

    # Tokens: skip the tokenizer call for an empty corpus, because batch
    # tokenizers may reject an empty batch. The count array is then empty,
    # exactly like the word and character arrays.
    tokenized_emrs = tokenizer(emrs)["input_ids"] if emrs else []
    ntokens = np.array([len(token_ids) for token_ids in tokenized_emrs])

    # Characters.
    nchars = np.array([len(emr) for emr in emrs])

    for item, name in zip([nwords, ntokens, nchars], ["words", "tokens", "chars"]):
        print(f"Number of {name}: {pd.Series(item).describe()}")

## Full EMRs

In [None]:
# Tokenizer used for the token counts, then the full EMR texts
tokenizer = AutoTokenizer.from_pretrained(
    "../models/ner/tokenizer/",
    use_fast=True,
)
full_emrs = load_json("../datasets/full_piphs.json")

# Report word / token / character statistics for the full EMRs
describe_emrs(emrs=full_emrs, tokenizer=tokenizer)

## Extracted Findings (Unnormalized)

In [None]:
# Tokenizer used for the token counts, then the unnormalized findings
tokenizer = AutoTokenizer.from_pretrained(
    "../models/ner/tokenizer/",
    use_fast=True,
)
unnorm_emrs = load_json("../datasets/unnorm_patient_states_t.json")

# Preprocess with the integer label -> word mapping (0 -> positive, 1 -> negative)
label2token = dict(enumerate(("positive", "negative")))
ppsd_unnorm_emrs = preprocess_patient_state_tuples(
    state_tuples_l=unnorm_emrs,
    label2token=label2token,
)

# Report word / token / character statistics for the preprocessed findings
describe_emrs(emrs=ppsd_unnorm_emrs, tokenizer=tokenizer)

## Extracted Findings (Normalized)

In [None]:
# Tokenizer used for the token counts, then the normalized findings
tokenizer = AutoTokenizer.from_pretrained(
    "../models/ner/tokenizer/",
    use_fast=True,
)
norm_emrs = load_json("../datasets/norm_patient_states_t.json")

# Preprocess with the integer label -> word mapping (0 -> positive, 1 -> negative)
label2token = dict(enumerate(("positive", "negative")))
ppsd_norm_emrs = preprocess_patient_state_tuples(
    state_tuples_l=norm_emrs,
    label2token=label2token,
)

# Report word / token / character statistics for the preprocessed findings
describe_emrs(emrs=ppsd_norm_emrs, tokenizer=tokenizer)