## Use a Python 3.11 kernel for cehrbert

In [1]:
!pip install meds_reader==0.1.9
!pip install setuptools
!pip install cehrbert==1.3.1

Collecting meds_reader==0.1.9
  Downloading meds_reader-0.1.9-cp312-cp312-macosx_12_0_x86_64.whl.metadata (3.0 kB)
Collecting numpy<2,>=1.16 (from meds_reader==0.1.9)
  Using cached numpy-1.26.4-cp312-cp312-macosx_10_9_x86_64.whl.metadata (61 kB)
Downloading meds_reader-0.1.9-cp312-cp312-macosx_12_0_x86_64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0mta [36m0:00:01[0m
[?25hUsing cached numpy-1.26.4-cp312-cp312-macosx_10_9_x86_64.whl (20.3 MB)
Installing collected packages: numpy, meds_reader
  Attempting uninstall: numpy
    Found existing installation: numpy 2.2.0
    Uninstalling numpy-2.2.0:
      Successfully uninstalled numpy-2.2.0
Successfully installed meds_reader-0.1.9 numpy-1.26.4
Collecting cehrbert==1.3.1
  Downloading cehrbert-1.3.1-py3-none-any.whl.metadata (9.9 kB)
Collecting dask==2024.1.1 (from cehrbert==1.3.1)
  Downloading dask-2024.1.1-py3-none-any.whl.metadata (3.7 kB)
Collecting d

In [3]:
#@title Set up the working directory (MIMIC-IV demo)
import os
from pathlib import Path

# Everything this notebook produces is placed below the current working directory.
notebook_dir = os.getcwd()

# Root folder of the demo dataset; swap the two assignments to use the eICU demo instead.
ROOT_DIR = f"{notebook_dir}/work_dir/mimiciv_demo/"
# ROOT_DIR = f"{notebook_dir}/work_dir/eicu_demo/"

# Create the folder together with any missing parents; a pre-existing folder is fine.
Path(ROOT_DIR).mkdir(exist_ok=True, parents=True)

In [None]:
# Input locations, all derived from ROOT_DIR.
MEDS_DIR = ROOT_DIR + "/meds/"
MEDS_READER_DIR = ROOT_DIR + "/meds_reader/"
TASK_DIR = MEDS_DIR + "/task_labels"

# Downstream task to fine-tune on; the name is a path relative to TASK_DIR.
TASK_NAME = "mortality/in_icu/first_24h"
# TASK_NAME = "los_in_hospital_first_48h"

# Separate output folders for the pretrained and the fine-tuned model.
# (Previously both values were assigned to OUTPUT_PRETRAIN_MODEL_DIR, so the
# pretraining folder was silently overwritten by the fine-tuning one.)
OUTPUT_PRETRAIN_MODEL_DIR = ROOT_DIR + "/output/cehrbert/"
OUTPUT_FINETUNE_MODEL_DIR = ROOT_DIR + "/output/cehrbert_finetuned/"

Run meds_reader on the MEDS data

In [None]:
!meds_reader_convert $MEDS_DIR $MEDS_READER_DIR

In [None]:
!mkdir -p ./content/output/cehrbert/
!mkdir -p ./content/output/cehrbert_dataset_prepared/
!mkdir -p ./content/output/cehrbert_finetuned/

In [None]:
!mkdir ./content/github_repo;cd ./content/github_repo;git clone https://github.com/cumc-dbmi/cehrbert.git;cd cehrbert;git checkout fix/meds_evaluation;pip install .;

Create the cehrbert pretraining configuration yaml file

In [None]:
# YAML configuration for CEHR-BERT pretraining.
# Model artifacts are kept under ./content/output/, while the input data is read from
# MEDS_READER_DIR, i.e. the meds_reader database written by `meds_reader_convert`
# (the previous "./content/meds_reader/" folder is not created by this notebook).
cehrbert_pretrain_config = f"""
#Model arguments
model_name_or_path: "./content/output/cehrbert/"
tokenizer_name_or_path: "./content/output/cehrbert/"
num_hidden_layers: 6
max_position_embeddings: 1024
hidden_size: 768
vocab_size: 100000
min_frequency: 50
include_value_prediction: false # additional CEHR-BERT learning objective

#Data arguments
data_folder: "{MEDS_READER_DIR}"
dataset_prepared_path: "./content/output/cehrbert_dataset_prepared/"

# Below is a list of Med-to-CehrBert related arguments
preprocessing_num_workers: 2
preprocessing_batch_size: 128
# if is_data_in_med is false, it assumes the data is in the cehrbert format
is_data_in_meds: true
att_function_type: "cehr_bert"
inpatient_att_function_type: "mix"
include_auxiliary_token: true
include_demographic_prompt: false
# if the data is in the meds format, the validation split will be omitted
# as the meds already provide train/tuning/held_out splits
validation_split_percentage: 0.05

# Huggingface Arguments
dataloader_num_workers: 2
dataloader_prefetch_factor: 2

overwrite_output_dir: false
resume_from_checkpoint: # automatically infer the latest checkpoint from the output folder
seed: 42

output_dir: "./content/output/cehrbert/"
evaluation_strategy: "epoch"
save_strategy: "epoch"
eval_accumulation_steps: 10

learning_rate: 0.00005
per_device_train_batch_size: 8
per_device_eval_batch_size: 8
gradient_accumulation_steps: 2

num_train_epochs: 50 # for large datasets, 5-10 epochs should suffice
warmup_steps: 10
weight_decay: 0.01
logging_dir: "./logs"
logging_steps: 10

save_total_limit:
load_best_model_at_end: true
metric_for_best_model: "eval_loss"
greater_is_better: false

report_to: "none"
"""
PRETRAIN_CONFIG_FP = ROOT_DIR + "/output/cehrbert/cehrbert_pretrain_config.yaml"
# open(..., 'w') does not create missing parent folders, so make sure the
# destination exists before writing the configuration file.
os.makedirs(os.path.dirname(PRETRAIN_CONFIG_FP), exist_ok=True)
with open(PRETRAIN_CONFIG_FP, 'w') as f:
    f.write(cehrbert_pretrain_config)

## Pretrain cehrbert using MLM
!python3.11 -m cehrbert.runners.hf_cehrbert_pretrain_runner ./content/output/cehrbert/cehrbert_pretrain_config.yaml

## Create the cehrbert finetuning configuration yaml file
# Section headers and explanatory lines inside the YAML must start with `#`;
# without it they are parsed as content and the file is not valid YAML.
# Inputs are taken from the notebook's path variables:
#   * data_folder   -> MEDS_READER_DIR (output of `meds_reader_convert`)
#   * cohort_folder -> TASK_DIR/TASK_NAME (task labels of the selected task)
cehrbert_finetune_config = f"""
#Model arguments
model_name_or_path: "./content/output/cehrbert/"
tokenizer_name_or_path: "./content/output/cehrbert/"
num_hidden_layers: 6
max_position_embeddings: 1024
hidden_size: 768
vocab_size: 100000
min_frequency: 50
include_value_prediction: false # additional CEHR-BERT learning objective

#Data arguments
cohort_folder: "{TASK_DIR}/{TASK_NAME}/"
data_folder: "{MEDS_READER_DIR}"
dataset_prepared_path: "./content/output/cehrbert_dataset_prepared/"

#LORA
use_lora: True
lora_rank: 64
lora_alpha: 16
target_modules: [ "query", "value" ]
lora_dropout: 0.1

# Below is a list of Med-to-CehrBert related arguments
preprocessing_num_workers: 2
preprocessing_batch_size: 128
# if is_data_in_med is false, it assumes the data is in the cehrbert format
is_data_in_meds: true
att_function_type: "cehr_bert"
inpatient_att_function_type: "mix"
include_auxiliary_token: true
include_demographic_prompt: false
# if the data is in the meds format, the validation split will be omitted
# as the meds already provide train/tuning/held_out splits
validation_split_percentage: 0.05

# Huggingface Arguments
dataloader_num_workers: 2
dataloader_prefetch_factor: 2

overwrite_output_dir: false
resume_from_checkpoint: # automatically infer the latest checkpoint from the output folder
seed: 42

output_dir: "./content/output/cehrbert_finetuned"
evaluation_strategy: "epoch"
save_strategy: "epoch"
eval_accumulation_steps: 10

do_train: True
do_predict: True

learning_rate: 0.00005
per_device_train_batch_size: 8
per_device_eval_batch_size: 8
gradient_accumulation_steps: 2

num_train_epochs: 10
warmup_steps: 10
weight_decay: 0.01
logging_dir: "./logs"
logging_steps: 10

save_total_limit:
load_best_model_at_end: true
metric_for_best_model: "eval_loss"
greater_is_better: false

report_to: "none"
"""
FINETUNE_CONFIG_FP = "./content/output/cehrbert/cehrbert_finetune_config.yaml"
# open(..., 'w') does not create missing parent folders, so make sure the
# destination exists before writing the configuration file.
os.makedirs(os.path.dirname(FINETUNE_CONFIG_FP), exist_ok=True)
with open(FINETUNE_CONFIG_FP, 'w') as f:
    f.write(cehrbert_finetune_config)

In [None]:
# ## Finetune cehrbert for the downstream task
!python3.11 -m cehrbert.runners.hf_cehrbert_finetune_runner {ROOT_DIR}/output/cehrbert/cehrbert_finetune_config.yaml

python3.11 -m cehrbert.runners.hf_cehrbert_finetune_runner <ROOT_DIR>/output/cehrbert/cehrbert_finetune_config.yaml


In [None]:
import pandas as pd

# Load the predictions stored under the finetuning output folder; as the last
# expression of the cell, the resulting DataFrame is rendered by the notebook.
pd.read_parquet(path="./content/output/cehrbert_finetuned/test_predictions")

In [None]:
!cat {ROOT_DIR}/output/cehrbert_finetuned/test_results.json