In [None]:
import sys
sys.path.append("../../")

from collections import Counter

import pandas as pd
from scipy.stats import fisher_exact
from sklearn.model_selection import train_test_split

from utilities.utils import load_json, save_json
from utilities.term import build_cooccurrence_matrix, build_fisher_matrix, build_term_ids_lists

## Configuration

In [None]:
# Notebook-wide settings; edit here rather than inside the cells below.
seed = 7  # random_state for the train/eval split
train_size = 0.8  # fraction of documents assigned to the training split
min_ndoc = 3  # value intended for build_fisher_matrix's `min_ndoc` argument; presumably a minimum document count — TODO confirm
threshold = 0.05  # cutoff given to build_term_ids_lists (mode="lesser"); presumably a significance level — TODO confirm

## Load Data

In [None]:
# Inputs, all read with the project's load_json helper.
# norm_states: one entry per document; each entry is iterated below as (entity, pol) pairs.
# out_icds: labels aligned with norm_states; used as the split target and stratification key.
# cui2name: mapping whose items are (cui, name) pairs; the names become the vocabulary.
norm_states = load_json("../../datasets/norm_patient_states_t.json")
out_icds = load_json("../../datasets/out_icds.json")
cui2name = load_json("../../models/nen/smcui2name.json")

## Preprocessing

In [None]:
# Split documents into training and evaluation sets, stratified on the labels.
# Only the training portion is used to build the term lists below.
#
# test_size is deliberately left unset: scikit-learn then uses the complement
# of train_size. Passing an explicit `1.0 - train_size` is redundant and,
# because the train count is floored and the test count is ceiled separately,
# a float-subtracted complement can make the two counts add up to more than
# the number of samples for some train_size values.
train_X, eval_X, train_y, eval_y = train_test_split(
    norm_states,
    out_icds,
    train_size=train_size,
    random_state=seed,
    stratify=out_icds,
)

In [None]:
# Inputs for the co-occurrence matrix.
# Each training document is reduced to the set of distinct entities it
# mentions; the second element of every (entity, pol) pair is ignored.
entities_l = [
    {entity for entity, _pol in norm_state}
    for norm_state in train_X
]

# Labels stay aligned with entities_l (same order as the training split).
labels = train_y

# Term -> integer id, numbered by the position of each name in cui2name.
vocab = {name: idx for idx, name in enumerate(cui2name.values())}
save_json(obj=vocab, f="./term2id.json")

## Build Co-occurrence Matrix

In [None]:
# Build the term/label co-occurrence matrix from the per-document entity sets,
# their labels, and the term -> id vocabulary.
# NOTE(review): the matrix layout is defined by utilities.term.build_cooccurrence_matrix — confirm there.
co_matrix = build_cooccurrence_matrix(entities_l, labels, vocab)

## Determine Term-Label Relevance

In [None]:
# Per-label and overall training document counts, passed to the Fisher step
# together with the co-occurrence matrix.
label2ndoc = Counter(labels)
total_ndoc = len(labels)

# Use the configured min_ndoc rather than a hard-coded literal, so changing
# the configuration cell actually affects this step.
f_matrix = build_fisher_matrix(co_matrix, label2ndoc, total_ndoc, min_ndoc=min_ndoc)

## Save Results

In [None]:
# Write the Fisher matrix to CSV; the index is written as a column named "term_id".
f_matrix.to_csv("./fisher_matrix.csv", index_label="term_id")

## Load Results

In [None]:
# Read the saved Fisher matrix back, restoring "term_id" as the index.
# The steps below then work from the file on disk.
f_matrix = pd.read_csv("./fisher_matrix.csv", index_col="term_id")

## Build Term Lists

In [None]:
# Select, for each label, the term ids whose score passes the threshold
# (mode="lesser").
term_ids_lists = build_term_ids_lists(score_matrix=f_matrix, mode="lesser", threshold=threshold)

# Invert the vocabulary so the selected ids can be mapped back to term names.
id2term = {term_id: term for term, term_id in vocab.items()}

# label -> list of term names, in the order the ids were returned.
terms_lists = {
    label: [id2term[term_id] for term_id in term_ids]
    for label, term_ids in term_ids_lists.items()
}