In [1]:
import csv
import os
import pickle
import sys
import time

import sklearn.model_selection as ms
import torch
from torch.utils.data import TensorDataset
from tqdm import tqdm

In [2]:
import pyhealth
from pyhealth.data import Event, Visit, Patient

import numpy as np

np.random.seed(1234)

In [3]:
from pyhealth.datasets import eICUDataset

dataset_const = eICUDataset(
    root='../../eicu_csv',
    tables=["diagnosis", "treatment", "admissionDx"],
    refresh_cache=False,
    dev=True
)

dataset = eICUDataset(
    root='../../eicu_csv',
    tables=["diagnosis", "treatment", "admissionDx"],
    refresh_cache=False,
    dev=True
)

In [4]:
dataset.stat()
dataset.info()


Statistics of base dataset (dev=True):
	- Dataset: eICUDataset
	- Number of patients: 3671
	- Number of visits: 5000
	- Number of visits per patient: 1.3620
	- Number of events per visit in diagnosis: 7.3732
	- Number of events per visit in treatment: 0.0000
	- Number of events per visit in admissionDx: 2.7186


dataset.patients: patient_id -> <Patient>

<Patient>
    - visits: visit_id -> <Visit> 
    - other patient-level info
    
    <Visit>
        - event_list_dict: table_name -> List[Event]
        - other visit-level info
    
        <Event>
            - code: str
            - other event-level info



In [5]:
dataset.patients.values()

dict_values([Patient 002-10034+141169 with 1 visits, Patient 002-10052+137239 with 1 visits, Patient 002-10079+136669 with 1 visits, Patient 002-10086+153868 with 1 visits, Patient 002-10122+140376 with 1 visits, Patient 002-10145+142615 with 2 visits, Patient 002-10157+145878 with 2 visits, Patient 002-10157+152760 with 1 visits, Patient 002-10160+153075 with 1 visits, Patient 002-10169+150223 with 1 visits, Patient 002-10187+150828 with 1 visits, Patient 002-10241+133684 with 2 visits, Patient 002-1025+136438 with 2 visits, Patient 002-10266+151627 with 1 visits, Patient 002-10270+130077 with 1 visits, Patient 002-10287+151162 with 2 visits, Patient 002-10300+132587 with 1 visits, Patient 002-10300+147538 with 1 visits, Patient 002-10323+137216 with 1 visits, Patient 002-10328+146107 with 1 visits, Patient 002-10351+152389 with 1 visits, Patient 002-10357+135570 with 2 visits, Patient 002-1039+129391 with 2 visits, Patient 002-10393+138053 with 1 visits, Patient 002-10424+134042 with

In [6]:
patient_id = '002-9990+146474'
patient = dataset.patients[patient_id]
visits = dataset.patients[patient_id].visits
visits

OrderedDict([('163891',
              Visit 163891 from patient 002-9990+146474 with 4 events from tables ['diagnosis', 'admissionDx'])])

In [7]:
visit_id = '163891'
dir(visits[visit_id])

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 'add_event',
 'attr_dict',
 'available_tables',
 'discharge_status',
 'discharge_time',
 'encounter_time',
 'event_list_dict',
 'get_code_list',
 'get_event_list',
 'num_events',
 'patient_id',
 'set_event_list',
 'visit_id']

In [8]:
visit = visits[visit_id]
print("### Accessing the diagnosis events ###")
print(visit.get_event_list('diagnosis'))
visit.get_code_list('diagnosis')

print("### Accessing the admissionDx events ###")
print(visit.get_event_list('admissionDx'))

print("### Accessing the treatment events ###")
print(visit.get_event_list('treatment'))

### Accessing the diagnosis events ###
[Event with ICD9CM code 518.82 from table diagnosis]
### Accessing the admissionDx events ###
[Event with eICU_ADMITDXPATH code admission diagnosis|Was the patient admitted from the O.R. or went to the O.R. within 4 hours of admission?|No from table admissionDx, Event with eICU_ADMITDXPATH code admission diagnosis|Non-operative Organ Systems|Organ System|Cardiovascular from table admissionDx, Event with eICU_ADMITDXPATH code admission diagnosis|All Diagnosis|Non-operative|Diagnosis|Cardiovascular|CHF, congestive heart failure from table admissionDx]
### Accessing the treatment events ###
[]


In [9]:
# Dropping patient with less than 24 hours duration minute
# should be stated in the data entry 'unitdischargeoffset'
# aka visit.discharge_time - visit.encounter_time
def process_patient(ds, hour_threshold=24):
    dataset_processed = ds
    encounter_processed_count = 0
    encounter_deleted_count = 0

    for patient_id, patient in ds.patients.items():
        visits = patient.visits.copy()
        for visit_id, visit in visits.items():
            encounter_processed_count += 1
            if (visit.discharge_time - visit.encounter_time) < np.timedelta64(hour_threshold, 'h'):
                # print("Dropping patient {} visit {} due to less than {} hours duration".format(patient_id, visit_id, hour_threshold))
                encounter_deleted_count += 1
                del dataset_processed.patients[patient_id].visits[visit_id]

    print("Processed {} encounters, deleted {} encounters".format(encounter_processed_count, encounter_deleted_count))
    return dataset_processed


# Processed 200859 encounters, deleted 67959 encounters
dataset_processed = process_patient(dataset)


Processed 5000 encounters, deleted 2106 encounters


In [10]:
# readmission prediction
from pyhealth.tasks import readmission_prediction_eicu_fn
eicu_base = eICUDataset(
    root='../../eicu_csv',
    tables=["diagnosis", "treatment", "admissionDx", "physicalExam", "medication", "lab"],
    dev=True,
    refresh_cache=False,
)
sample_dataset = eicu_base.set_task(task_fn=readmission_prediction_eicu_fn)

Generating samples for readmission_prediction_eicu_fn: 100%|██████████| 3671/3671 [00:00<00:00, 172547.65it/s]


In [11]:
sample_dataset.stat()
print(sample_dataset.available_keys)
print(sample_dataset.samples[0])

Statistics of sample dataset:
	- Dataset: eICUDataset
	- Task: readmission_prediction_eicu_fn
	- Number of samples: 320
	- Number of patients: 278
	- Number of visits: 320
	- Number of visits per patient: 1.1511
	- conditions:
		- Number of conditions per sample: 9.5813
		- Number of unique conditions: 467
		- Distribution of conditions (Top-10): [('518.81', 84), ('J96.00', 83), ('038.9', 72), ('584.9', 70), ('N17.9', 70), ('A41.9', 68), ('491.20', 64), ('J44.9', 64), ('458.9', 61), ('I95.9', 61)]
	- procedures:
		- Number of procedures per sample: 19.8344
		- Number of unique procedures: 55
		- Distribution of procedures (Top-10): [('notes/Progress Notes/Physical Exam/Physical Exam Obtain Options/Performed - Structured', 319), ('notes/Progress Notes/Physical Exam/Physical Exam/Neurologic/GCS/Score/scored', 316), ('notes/Progress Notes/Physical Exam/Physical Exam/Constitutional/Weight and I&O/Weight (kg)/Current', 280), ('notes/Progress Notes/Physical Exam/Physical Exam/Neurologic/GCS/

In [12]:
from pyhealth.tasks import mortality_prediction_eicu_fn
sample_dataset = eicu_base.set_task(mortality_prediction_eicu_fn)
sample_dataset.stat()
print(sample_dataset.available_keys)
print(sample_dataset.samples[0])

Generating samples for mortality_prediction_eicu_fn: 100%|██████████| 3671/3671 [00:00<00:00, 175218.09it/s]

Statistics of sample dataset:
	- Dataset: eICUDataset
	- Task: mortality_prediction_eicu_fn
	- Number of samples: 320
	- Number of patients: 278
	- Number of visits: 320
	- Number of visits per patient: 1.1511
	- conditions:
		- Number of conditions per sample: 9.5813
		- Number of unique conditions: 467
		- Distribution of conditions (Top-10): [('518.81', 84), ('J96.00', 83), ('038.9', 72), ('584.9', 70), ('N17.9', 70), ('A41.9', 68), ('491.20', 64), ('J44.9', 64), ('458.9', 61), ('I95.9', 61)]
	- procedures:
		- Number of procedures per sample: 19.8344
		- Number of unique procedures: 55
		- Distribution of procedures (Top-10): [('notes/Progress Notes/Physical Exam/Physical Exam Obtain Options/Performed - Structured', 319), ('notes/Progress Notes/Physical Exam/Physical Exam/Neurologic/GCS/Score/scored', 316), ('notes/Progress Notes/Physical Exam/Physical Exam/Constitutional/Weight and I&O/Weight (kg)/Current', 280), ('notes/Progress Notes/Physical Exam/Physical Exam/Neurologic/GCS/Mo




In [13]:
from load_eicu import readmission_prediction_eicu_fn_customized
samples_list = []
readmission_count = 0
for patient in eicu_base.patients:
    samples = readmission_prediction_eicu_fn_customized(eicu_base.patients[patient], 20)
    if len(samples) != 0:
        for sample in samples:
            if sample['label'] == 1:
                readmission_count += 1
                break
        samples_list.append(samples)

print("Total number of patients: {}".format(len(eicu_base.patients.keys())))
print("Total number of samples: {}".format(len(samples_list)))
print("Total number of readmission: {}".format(readmission_count))


ModuleNotFoundError: No module named 'load_eicu'

In [None]:
admissionDx = samples_list[42][1]['admissionDx'][0]
conditions = samples_list[42][1]['conditions'][0]
treatment = samples_list[42][1]['treatment'][0]
# samples_list[42]
[ax.lower() for ax in admissionDx]

In [None]:
visit = eicu_base.patients[samples_list[42][1]['patient_id']].visits[samples_list[42][1]['visit_id']]

In [None]:
visit.get_event_list('admissionDx')

In [None]:
visit.get_code_list('admissionDx')

In [None]:
# print the encounter id and its label in samples_list
for sample in samples_list[42]:
    print(sample['visit_id'], sample['label'])