In [12]:
import sys
sys.path.append("../../")

import json
import pickle
from pathlib import Path
from collections import Counter

import pandas as pd

from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
from transformers import AutoTokenizer

from utilities.utils import set_seeds, render_exp_name, load_args, load_pickle, load_jsonl, load_json, save_json, get_logger
from utilities.preprocess import augment_extracted_emrs_with_partials, preprocess_extracted_emrs, select_labels_subset, build_label2id_mapping, augment_full_emrs_with_partials
from utilities.data import MedicalDxDataset
from utilities.model import BertDxModel, encoder_names_mapping
from utilities.trainer import ICDATrainer
from utilities.evaluation import evaluate_dx_model

## Configuration

In [14]:
# Load the run configuration from disk and pass its seed to set_seeds.
args = load_args("./config.json")
set_seeds(args.seed)

# Logger for this notebook session.
logger = get_logger(name=str(__name__))

# Derive the experiment name from the configured hyper-parameters and make
# sure the matching output directory exists (re-running the cell is safe
# because of exist_ok=True).
args.exp_name = render_exp_name(args, hparams=args.exp_hparams, sep='__')
args.exp_dir = f"{args.save_dir}/{args.exp_name}"
Path(args.exp_dir).mkdir(parents=True, exist_ok=True)

# Persist the resolved arguments next to the experiment outputs, once as
# human-readable JSON and once as a pickle of the args object itself.
# Both are written after exp_name / exp_dir are set so the snapshots include them.
args_as_json = json.dumps(vars(args), indent=4)
Path(args.exp_dir, "config.json").write_text(args_as_json)
Path(args.exp_dir, "args.pickle").write_bytes(pickle.dumps(args))

1084

## Data

### Load Data

In [None]:
# Read the inputs (emrs) and their labels (icds) from the JSON files named in
# the configuration; the input file is loaded first, then the label file.
emrs, icds = (load_json(path) for path in (args.input_file, args.label_file))

### Preprocessing

In [None]:
# Input
# Hook for EMR preprocessing when the input type is "unnorm" or "norm".
# Currently a placeholder: nothing is done to the EMRs in either case.
if args.input_type in ("unnorm", "norm"):
    pass  # TODO: preprocess EMRs

In [23]:
# Label
# Build the ICD -> label-id lookup from the loaded labels, then encode every
# entry of `icds` with it (one id per entry, same order as `icds`).
icd2id = build_label2id_mapping(labels=icds)
labels = [icd2id[code] for code in icds]

# Invert the lookup (label-id -> ICD) and store it with the experiment outputs
# so predictions can be mapped back to ICD codes later.
# NOTE(review): if the ids are ints and save_json writes standard JSON, the
# keys will come back as strings when reloaded — confirm the reader handles it.
id2icd = dict((label_id, code) for code, label_id in icd2id.items())
save_json(id2icd, f"{args.exp_dir}/id2icd.json")

### Split