In [1]:
from datasets import load_dataset, load_from_disk, concatenate_datasets
import numpy as np
import pandas as pd

#### MedNLI Dataset

In [2]:
# local path
mednli_data_dir = "/mnt/sdd/niallt/mednli/hf_dataset/"

In [3]:
mednli_dataset = load_from_disk(mednli_data_dir)

In [4]:
mednli_dataset

DatasetDict({
    train: Dataset({
        features: ['pairID', 'gold_label', 'sentence1', 'sentence2', 'sentence1_parse', 'sentence2_parse', 'sentence1_binary_parse', 'sentence2_binary_parse'],
        num_rows: 11232
    })
    test: Dataset({
        features: ['pairID', 'gold_label', 'sentence1', 'sentence2', 'sentence1_parse', 'sentence2_parse', 'sentence1_binary_parse', 'sentence2_binary_parse'],
        num_rows: 1422
    })
    validation: Dataset({
        features: ['pairID', 'gold_label', 'sentence1', 'sentence2', 'sentence1_parse', 'sentence2_parse', 'sentence1_binary_parse', 'sentence2_binary_parse'],
        num_rows: 1395
    })
})

In [5]:
mednli_dataset["train"][0]

{'pairID': '23eb94b8-66c7-11e7-a8dc-f45c89b91419',
 'gold_label': 'entailment',
 'sentence1': 'Labs were notable for Cr 1.7 (baseline 0.5 per old records) and lactate 2.4.',
 'sentence2': ' Patient has elevated Cr',
 'sentence1_parse': '(ROOT (S (NP (NNPS Labs)) (VP (VBD were) (ADJP (JJ notable) (PP (IN for) (NP (NP (NP (NN Cr) (CD 1.7)) (PRN (-LRB- -LRB-) (NP (NP (NN baseline) (CD 0.5)) (PP (IN per) (NP (JJ old) (NNS records)))) (-RRB- -RRB-))) (CC and) (NP (NN lactate) (CD 2.4)))))) (. .)))',
 'sentence2_parse': '(ROOT (S (NP (NN Patient)) (VP (VBZ has) (NP (JJ elevated) (NN Cr)))))',
 'sentence1_binary_parse': '( Labs ( ( were ( notable ( for ( ( ( ( Cr 1.7 ) ( -LRB- ( ( ( baseline 0.5 ) ( per ( old records ) ) ) -RRB- ) ) ) and ) ( lactate 2.4 ) ) ) ) ) . ) )',
 'sentence2_binary_parse': '( Patient ( has ( elevated Cr ) ) )'}

In [None]:
# Alternative copy of MedNLI, as prepared by the nlpie group

In [5]:
nlpie_mednli_data_dir = "/mnt/sdd/niallt/mednli/nlpie_hf_dataset/MedNLI/"

In [6]:
nlpie_mednli_dataset = load_from_disk(nlpie_mednli_data_dir)

In [7]:
nlpie_mednli_dataset

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'labels'],
        num_rows: 11232
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'labels'],
        num_rows: 1395
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'labels'],
        num_rows: 1422
    })
})

In [11]:
nlpie_mednli_dataset["train"][0]

{'sentence1': 'Labs were notable for Cr 1.7 (baseline 0.5 per old records) and lactate 2.4.',
 'sentence2': ' Patient has elevated Cr',
 'labels': 2}

In [21]:
# Label distribution of the nlpie MedNLI test split: one count per label id.
nlpie_mednli_dataset["test"].to_pandas()["labels"].value_counts()

labels
2    474
0    474
1    474
Name: count, dtype: int64

### i2b2 2010


In [10]:
i2b2_2010_data_dir = "/mnt/sdd/niallt/bio-lm/data/tasks/I2B22010NER_hf_dataset/"
i2b2_2010_dataset = load_from_disk(i2b2_2010_data_dir)

In [11]:
i2b2_2010_dataset["train"][50]

{'tokens': ['The',
  'patient',
  '&apos;s',
  'sister',
  'has',
  'a',
  'history',
  'of',
  'cervical',
  'cancer',
  '.'],
 'ner_tags_str': ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-problem',
  'I-problem',
  'O'],
 'ner_tags': [6, 6, 6, 6, 6, 6, 6, 6, 1, 5, 6]}

## i2b2 2010 Relation Extraction


In [8]:
i2b2_2010_RE_data_dir = "/mnt/sdd/niallt/i2b2_2010_RE/hf_dataset/i2b2-2010-RE/"

In [9]:
i2b2_2010_RE_dataset = load_from_disk(i2b2_2010_RE_data_dir)

In [10]:
i2b2_2010_RE_dataset

DatasetDict({
    train: Dataset({
        features: ['sentence', 'labels'],
        num_rows: 22256
    })
    validation: Dataset({
        features: ['sentence', 'labels'],
        num_rows: 43000
    })
})

In [15]:
i2b2_2010_RE_dataset["train"][0]

{'sentence': 'carduac enzymes mildly elevated , felt 02-08 @problem$ in setting of @problem$ , CK and MB trended Mcewen at time of admission to medical floor .',
 'labels': 0}

# i2b2 2012

In [16]:
i2b2_2012_data_dir = "/mnt/sdd/niallt/bio-lm/data/tasks/i2b2-2012_hf_dataset/"

In [21]:
i2b2_2012_dataset = load_from_disk(i2b2_2012_data_dir)

In [23]:
i2b2_2012_dataset['train'][0]

{'tokens': ['Admission', 'Date', ':'],
 'ner_tags_str': ['B-OCCURRENCE', 'O', 'O'],
 'ner_tags': [11, 0, 0]}

# i2b2 2014

In [19]:
i2b2_2014_data_dir = "/mnt/sdd/niallt/bio-lm/data/tasks/i2b2-2014_hf_dataset/"

In [53]:
i2b2_2014_dataset = load_from_disk(i2b2_2014_data_dir)

In [55]:
i2b2_2014_dataset['train'][0]

{'tokens': ['Record', 'date:', '2081-10-18'],
 'ner_tags_str': ['O', 'O', 'B-DATE'],
 'ner_tags': [39, 39, 17]}

# clinical outcomes datasets

In [2]:
clinical_outcomes_dir = "/mnt/sdd/efficient_ml_data/datasets/mimic3-clinical-outcomes/"

## Length of stay (LoS)

In [3]:
los_data_dir = f"{clinical_outcomes_dir}/los/"

In [5]:
# There appear to be several train/valid files in this directory; read the
# LOS_WEEKS_adm_* training file first to see what it contains.
# NOTE(review): the head and shape recorded for this file match those recorded
# for train.csv further down, so the two may hold the same data — confirm
# before deciding which one to use.
los_df = pd.read_csv(f"{los_data_dir}/LOS_WEEKS_adm_train.csv")

In [10]:
los_df.head(), los_df.shape

(       id                                               text  los_label
 0  155297  CHIEF COMPLAINT: Decreased responsiveness  Maj...          0
 1  168150  CHIEF COMPLAINT: Fatigue, wide complex tachyca...          1
 2  154015  CHIEF COMPLAINT: preop CABG\n\nPRESENT ILLNESS...          2
 3  145268  CHIEF COMPLAINT: \n\nPRESENT ILLNESS: This is ...          1
 4  170521  CHIEF COMPLAINT: worsening shortness of breath...          2,
 (30421, 3))

In [7]:
los_train_df = pd.read_csv(f"{los_data_dir}/train.csv")

In [9]:
los_train_df.head(), los_train_df.shape

(       id                                               text  los_label
 0  155297  CHIEF COMPLAINT: Decreased responsiveness  Maj...          0
 1  168150  CHIEF COMPLAINT: Fatigue, wide complex tachyca...          1
 2  154015  CHIEF COMPLAINT: preop CABG\n\nPRESENT ILLNESS...          2
 3  145268  CHIEF COMPLAINT: \n\nPRESENT ILLNESS: This is ...          1
 4  170521  CHIEF COMPLAINT: worsening shortness of breath...          2,
 (30421, 3))

In [6]:
# Read the three LoS CSV splits from disk into a single DatasetDict.
los_dataset = load_dataset(
    "csv",
    data_files=dict(
        train=f"{los_data_dir}/train.csv",
        validation=f"{los_data_dir}/valid.csv",
        test=f"{los_data_dir}/test.csv",
    ),
)


Downloading and preparing dataset csv/default to /home/niallt/.cache/huggingface/datasets/csv/default-85b93545fbaf13bc/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /home/niallt/.cache/huggingface/datasets/csv/default-85b93545fbaf13bc/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [7]:
los_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'los_label'],
        num_rows: 30421
    })
    validation: Dataset({
        features: ['id', 'text', 'los_label'],
        num_rows: 4391
    })
    test: Dataset({
        features: ['id', 'text', 'los_label'],
        num_rows: 8797
    })
})

In [8]:
los_dataset["train"][0]

{'id': 155297,
 'text': 'CHIEF COMPLAINT: Decreased responsiveness  Major [**First Name3 (LF) 2947**] or Invasive Procedure: Femoral central line\n\nPRESENT ILLNESS: 65M with PMHx of CVA (nonverbal and does not move his arms or legs at baseline), Afib on coumadin, multiple pneumonias (s/p trach/PEG [**3-/2200**]), multiple UTI/urosepsis with Proteus sensitive to Cefepime/ceftriaxone/meropenem, ESBL Klebsiella sensitive to cipro/meropenem/zosyn, C diff s/p colectomy, type 2 diabetes mellitus, peripheral vascular disease.  Patient presents from [**Hospital1 1501**] found today with sats 80s and not responding to commands, not nodding. Baseline non-verbal, but will nod to questions.  In ED, BPs dipped to high 80s, low 90s. Patient with a trach, seems to have a cuff [**Last Name (LF) 3564**], [**First Name3 (LF) **] need to be changed out. UA positive. Given cefepime and vanco. Trop may be demand. Given 2L NS.\n\nMEDICAL HISTORY: * Hypertension * Hypothyroidism * H/o CVA (bilateral embolic

## Mortality Prediction

In [9]:
mp_data_dir = f"{clinical_outcomes_dir}/mp/"

In [10]:
# Read the mortality-prediction train/valid/test CSVs into a single DatasetDict.
# The split is named "validation" while the file on disk is called valid.csv.
mp_dataset = load_dataset(
    "csv",
    data_files={
        split: f"{mp_data_dir}/{stem}.csv"
        for split, stem in zip(
            ("train", "validation", "test"),
            ("train", "valid", "test"),
        )
    },
)

Downloading and preparing dataset csv/default to /home/niallt/.cache/huggingface/datasets/csv/default-cc166c590192d179/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /home/niallt/.cache/huggingface/datasets/csv/default-cc166c590192d179/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [11]:
mp_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'hospital_expire_flag'],
        num_rows: 33954
    })
    validation: Dataset({
        features: ['id', 'text', 'hospital_expire_flag'],
        num_rows: 4908
    })
    test: Dataset({
        features: ['id', 'text', 'hospital_expire_flag'],
        num_rows: 9822
    })
})

In [12]:
mp_dataset["train"][0]

{'id': 107384,
 'text': 'CHIEF COMPLAINT: AMS, concern for toxic alcohol ingestion\n\nPRESENT ILLNESS: Mr. [**Known lastname 27389**] is a 39 year old man with h/o EtOH and polysubstance abuse, seizure disorder, who was found to be unresponsive while visiting his partner in the ICU earlier today. . The patient was visiting his partner in the ICU earlier today. He was awake and conversant in the morning with no acute complaints. He was noted to be sleeping on the floor, but walked to the chair by himself when he was awakened. Later in the afternoon, the patient was noted to still be asleep in the chair. He was unarousable with verbal stimuli or sternal rub, so he was taken down to the ED. . In the ED, the patient was initially altered, but was otherwise hemodynamically stable. No urine incontinence or e/o toxidromes. Labs notable for EtOH 86, Osms 366, anion gap 16, lactate 3.8. Utox positive for barbs, but Stox and Utox otherwise negative. Given high serum osmolar gap (60), toxicology 

## proc

In [7]:
mimic_proc_dir = f"{clinical_outcomes_dir}/proc/"

In [9]:
# Read the PRO_GROUPS_3_DIGITS train/val/test CSVs into a single DatasetDict.
proc_dataset = load_dataset(
    "csv",
    data_files={
        split: f"{mimic_proc_dir}/PRO_GROUPS_3_DIGITS_{suffix}.csv"
        for split, suffix in {"train": "train", "validation": "val", "test": "test"}.items()
    },
)

Downloading and preparing dataset csv/default to /home/niallt/.cache/huggingface/datasets/csv/default-21bd43a7138cb822/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /home/niallt/.cache/huggingface/datasets/csv/default-21bd43a7138cb822/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [10]:
proc_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'short_codes'],
        num_rows: 30073
    })
    validation: Dataset({
        features: ['id', 'text', 'short_codes'],
        num_rows: 4368
    })
    test: Dataset({
        features: ['id', 'text', 'short_codes'],
        num_rows: 8693
    })
})

In [13]:
proc_dataset["train"][5]

{'id': 137014,
 'text': 'Unit No:  [**Numeric Identifier 60728**]\nAdmission Date: [**2118-4-18**]\nDischarge Date: [**2118-4-25**]\nDate of Birth:  [**2066-7-21**]\nSex:  F\nService:  CSU\n\n\nHISTORY OF PRESENT ILLNESS:  This 51-year-old female had an\naortic valvuloplasty performed via a sternotomy in [**2093**] at\n[**Hospital3 1810**] with a known diagnostic of bicuspid\naortic valve and complaints of increasing fatigue and\ndecreasing exercise tolerance over the past year. She had\nbeen followed by cardiologists, Dr. [**First Name8 (NamePattern2) **] [**Last Name (NamePattern1) 1924**] and Dr. [**Last Name (STitle) 60729**]\n[**Name (STitle) 60730**] at [**Hospital3 1810**]. Recently, echocardiogram and\ncath were performed which revealed severe aortic stenosis.\nPreoperatively, a cardiac cath was performed at [**Hospital **]\nHospital. It showed normal coronaries and aortic valve area\nof 0.6 cm to 0.8 cm2, and a gradient of 60 mmHg. Cardiac echo\nperformed on [**2118-3-24**] sh

## ICD-9 diagnoses

In [5]:
mimic_dia_dir = f"{clinical_outcomes_dir}/dia/"

In [7]:
mimic_dia_dir

'/mnt/sdd/efficient_ml_data/datasets/mimic3-clinical-outcomes//dia/'

In [8]:
# Read the DIA_GROUPS_3_DIGITS_adm train/val/test CSVs into a single DatasetDict.
dia_dataset = load_dataset(
    "csv",
    data_files={
        split: f"{mimic_dia_dir}/DIA_GROUPS_3_DIGITS_adm_{suffix}.csv"
        for split, suffix in (("train", "train"), ("validation", "val"), ("test", "test"))
    },
)

Downloading and preparing dataset csv/default to /home/niallt/.cache/huggingface/datasets/csv/default-6f196285a2ec5a34/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /home/niallt/.cache/huggingface/datasets/csv/default-6f196285a2ec5a34/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [10]:
dia_dataset["train"][0]

{'id': 159643,
 'text': 'CHIEF COMPLAINT: \n\nPRESENT ILLNESS: This 60 year old white male has a known murmur since childhood.  He is status post inferior myocardial infarction and anteroseptal myocardial infarction in [**2130**] and status post angioplasty at that time.  Since [**2140-9-23**] he has had increased dyspnea on exertion and an echocardiogram in [**2140-12-24**], revealed an aortic stenosis with an 80 mm gradient and ejection fraction of 40% with apical akinesis.  He had a cardiac catheterization in [**2140-12-24**] which revealed an ejection fraction of 40%, 1+ mitral regurgitation with moderate MAC, left anterior descending is 90% mid 90% lesion, diagonal 1 70% lesion and the right coronary artery had a mid occlusion.  He is now admitted for aortic valve replacement and coronary artery bypass graft.\n\nMEDICAL HISTORY: Significant for history of skin cancer of the left shoulder, history of hypothyroidism, history of hypercholesterolemia and history of hypertension and hi

# Test fewshot sampling

In [18]:
# Rename the LoS target column to "labels", the column name the sampling
# cell further down filters on.
# Guarded so the cell can be re-run safely: rename_column raises once
# "los_label" has already been renamed.
if "los_label" in los_dataset["train"].column_names:
    los_dataset = los_dataset.rename_column("los_label", "labels")

In [19]:
# Sorted list of the distinct label values present in the training split.
labels = np.unique(np.asarray(los_dataset["train"]["labels"])).tolist()

In [20]:
labels

[0, 1, 2, 3]

In [55]:
# Display the splits after the rename; the target column should now be "labels".
los_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'labels'],
        num_rows: 30421
    })
    validation: Dataset({
        features: ['id', 'text', 'labels'],
        num_rows: 4391
    })
    test: Dataset({
        features: ['id', 'text', 'labels'],
        num_rows: 8797
    })
})

In [56]:

num_samples_per_class = 50  # Maximum number of training examples kept per class
sampling_seed = 42  # Fixed seed so the same subset is drawn on every run

# Build a class-balanced few-shot training split: for every label value,
# shuffle that label's examples and keep at most `num_samples_per_class`.
train_datasets = []
# Iterate over the actual label values. range(len(labels)) is only correct
# when the labels happen to be exactly 0..K-1.
for label in labels:
    label_dataset = (
        los_dataset["train"]
        .filter(lambda x: x["labels"] == label)
        .shuffle(seed=sampling_seed)
    )
    num_samples = len(label_dataset)
    # Classes with no more examples than the cap are kept whole.
    if num_samples > num_samples_per_class:
        label_dataset = label_dataset.select(range(num_samples_per_class))
    train_datasets.append(label_dataset)

# NOTE: this overwrites the full training split in place. Reload `los_dataset`
# to get the original training data back.
los_dataset["train"] = concatenate_datasets(train_datasets)


Loading cached processed dataset at /home/niallt/.cache/huggingface/datasets/csv/default-85b93545fbaf13bc/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-0b4b4197dd27e828.arrow
Loading cached shuffled indices for dataset at /home/niallt/.cache/huggingface/datasets/csv/default-85b93545fbaf13bc/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-62f01273f0f8d64c.arrow
Loading cached processed dataset at /home/niallt/.cache/huggingface/datasets/csv/default-85b93545fbaf13bc/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-469efe14ec2bc115.arrow
Loading cached shuffled indices for dataset at /home/niallt/.cache/huggingface/datasets/csv/default-85b93545fbaf13bc/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-eaa1ff6f42759f84.arrow
Loading cached processed dataset at /home/niallt/.cache/huggingface/datasets/csv/default-85b93545fbaf13bc/0.0.0/6954658bab30a358235fa864b05cf819af0e179325

In [57]:
los_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'labels'],
        num_rows: 200
    })
    validation: Dataset({
        features: ['id', 'text', 'labels'],
        num_rows: 4391
    })
    test: Dataset({
        features: ['id', 'text', 'labels'],
        num_rows: 8797
    })
})

In [51]:
range(50)

range(0, 50)

In [59]:
from datasets import DatasetDict

In [61]:
# Wrap the current training split in a fresh DatasetDict that holds only "train".
d = DatasetDict({"train": los_dataset["train"]})

In [62]:
d

DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'labels'],
        num_rows: 200
    })
})

In [58]:
fewshot_los_train_dataset

Dataset({
    features: ['id', 'text', 'labels'],
    num_rows: 30421
})

In [48]:
# Quick check that the dataset exposes a "labels" feature.
# NOTE(review): `fewshot_los_train_dataset` is not assigned in any cell visible
# here — it presumably came from a cell that has since been removed; confirm
# before relying on this check.
if "labels" in fewshot_los_train_dataset.features:
    print("year")

year


# Random

In [3]:
import nltk
import pandas as pd

In [2]:
# Two toy sentences used to try out tokenisation and frequency counting.
text = [
    "I have a headache and a fever",
    "random sentence",
]

In [5]:
text_df = pd.DataFrame({"id": [0, 1], "text": text})

In [6]:
text_df

Unnamed: 0,id,text
0,0,I have a headache and a fever
1,1,random sentence


In [12]:
# Lower-case every row of the "text" column and join the rows into one
# space-separated string.
# NOTE: this rebinds `text` from the list of sentences to a single string.
text = text_df["text"].str.lower().str.cat(sep=" ")

In [13]:
text

'i have a headache and a fever random sentence'

In [19]:
# Split the combined string into word tokens.
# NOTE(review): the output recorded below this cell (capitalised 'I', no
# "random sentence" tokens) does not match the lower-cased `text` displayed
# above, so it looks stale — re-run the cell to refresh it.
words = nltk.word_tokenize(text)

In [20]:
words

['I', 'have', 'a', 'headache', 'and', 'a', 'fever']

In [21]:
nltk.FreqDist(words)

FreqDist({'a': 2, 'I': 1, 'have': 1, 'headache': 1, 'and': 1, 'fever': 1})

In [39]:
import torch

In [41]:
# Random tensor with shape (2, 6, 64), used to try out reshaping.
x = torch.randn((2, 6, 64))

In [42]:
x.shape

torch.Size([2, 6, 64])

In [44]:
# Collapse all leading dimensions into one and keep the last axis as is.
# The width is read from the tensor rather than hard-coded as 64, so the cell
# stays valid if the shape of `x` is changed.
x_resized = x.view(-1, x.size(-1))

In [45]:
x_resized.shape

torch.Size([12, 64])

In [46]:
from transformers import AutoTokenizer, AutoModel

In [None]:
# generate ra