In [15]:
import pandas as pd
import matplotlib.pyplot as plt
from datetime import date
import datetime
import numpy as np
import seaborn as sns
import re
import json
import pandas as pd
import math
import ast
import os
import sys
sys.path.append('..')
from utilities import logger
from utilities import configuration
from utilities import health_data


### Accessing config file and log

In [16]:
# Fetch the project configuration and list every key with the type of its value.
config = configuration.get_config()
for key in config:
    value_type = type(config[key])
    print(f'{key:40} (type: {value_type})')

# Initialise the logger from the 'system_log' entry and record that this notebook started.
logging = logger.init_logger(config['system_log'])
logging.debug('Logger has started ont notebook 09 Random sample of instances.ipynb ...')

system_log                               (type: <class 'str'>)
json_file                                (type: <class 'str'>)
train_val_json                           (type: <class 'str'>)
heldout_json                             (type: <class 'str'>)
unused_after_heldout_json                (type: <class 'str'>)
unified_merged_file_cz                   (type: <class 'str'>)
unified_merged_file_noncz                (type: <class 'str'>)
unified_merged_file                      (type: <class 'str'>)
data_path                                (type: <class 'str'>)
cz_files                                 (type: <class 'list'>)
noncz_files                              (type: <class 'list'>)
2023-11-08 10:34:55,112 - root - DEBUG - Logger has started ont notebook 09 Random sample of instances.ipynb ...


### Reading data from JSON file

In [3]:
# Load the full admissions JSON into memory.
# The context manager closes the file handle as soon as parsing finishes;
# a bare open() would keep it open for the lifetime of the kernel.
with open(config['json_file']) as f:
    data = json.load(f)
print(len(data))

618697


### Parsing data to Python Data Classes

In [4]:
# Turn every raw JSON entry into an Admission data class. The JSON key
# (a string after json.load) is converted to int and used as the admission id.
all_admissions = [
    health_data.Admission.from_dict_data(admit_id=int(admit_key), admission=raw_admission)
    for admit_key, raw_admission in data.items()
]
len(all_admissions)

618697

### Grouping admissions by patient and ordering them by discharge date to find readmissions

In [5]:
from collections import defaultdict

# Group every admission under the code of the patient it belongs to.
patient2admissions = defaultdict(list)
for record in all_admissions:
    patient2admissions[record.code].append(record)

# Sort each patient's admissions by discharge date, earliest first, and
# verify the resulting order pair by pair.
for admissions_list in patient2admissions.values():
    admissions_list.sort(key=lambda admission: admission.discharge_date)
    assert all(
        earlier.discharge_date <= later.discharge_date
        for earlier, later in zip(admissions_list, admissions_list[1:])
    )
print(len(patient2admissions))

298372


### Reviewing the ordered admissions of each patient: when a valid readmission is found, it is attached to the preceding admission

In [6]:
# Walk each patient's chronologically ordered admissions and attach every
# valid readmission to the admission that immediately precedes it.
patient_count = 0
valid_readmission_count = 0
for patient_admissions in patient2admissions.values():
    for ix, current in enumerate(patient_admissions):
        if not health_data.ReadmissionCode.is_readmit(current.readmission_code):
            continue
        # A readmission flagged on a patient's first record has no earlier
        # admission in the data to attach to (e.g. it is close to the
        # beginning of the dataset), so it is skipped.
        if ix == 0:
            continue
        previous = patient_admissions[ix - 1]
        if previous.is_valid_readmission(current):
            previous.add_readmission(current)
            valid_readmission_count += 1
    patient_count += 1
valid_readmission_count

60041

### Computing Held-out boundaries

In [7]:
# Length of stay (LOS) in whole days for every admission with a known admit date.
length_of_stays = [
    (record.discharge_date - record.admit_date).days
    for record in all_admissions
    if record.admit_date is not None
]
print(f'mean LOS: {np.average(length_of_stays):.3f} days')
print(f'std LOS:  {np.std(length_of_stays):.3f} days')

# Rationale for the 60-day bound used in the next cell:
# if LOS were normally distributed, mean +/- one std would cover 68 % of the
# instances, so everything up to (mean + one std) would cover 84 %
# (50 % below the mean plus 68 % / 2 between the mean and mean + one std).
# mean + one std is rounded up to a length of stay of 60 days.
# NOTE(review): the std is several times the mean, so the distribution is
# probably strongly skewed and the normal assumption is only a rough guide.


mean LOS: 10.338 days
std LOS:  48.062 days


In [8]:

held_out_size = 365                 # length of the held-out window, in days
readmission_timeframe = 30          # days after a discharge in which the readmission is expected
time_for_discharge_to_happen = 60   # days allowed for that readmission to be discharged

# Why the held-out window stops before the end of the data (Dec 31st, 2022):
# a readmission is only recorded in full once it has been discharged. For a
# discharge on Oct 2nd, 2022 the readmission can start up to 30 days later
# (Nov 1st, 2022) and can take up to 60 more days to be discharged
# (Dec 31st, 2022), which is still inside the dataset.
observation_margin = readmission_timeframe + time_for_discharge_to_happen

discharge_dates = [admission.discharge_date for admission in all_admissions]
latest_date = max(discharge_dates)
begining_dataset = min(discharge_dates)

start_heldout = latest_date - datetime.timedelta(days=held_out_size + observation_margin)
end_heldout = latest_date - datetime.timedelta(days=observation_margin)

print(f'Begining:                              {begining_dataset}')
print(f'Start held-out:                        {start_heldout}')
print(f'End held-out:                          {end_heldout}')
print(f'End data (usable 30 days prior):       {latest_date}')


str(latest_date)

Begining:                              2015-01-01 00:00:00
Start held-out:                        2021-10-02 00:00:00
End held-out:                          2022-10-02 00:00:00
End data (usable 30 days prior):       2022-12-31 00:00:00


'2022-12-31 00:00:00'

### Creating three JSON files (train/validation, held-out and unused) from the original JSON file

In [11]:


# Partition the raw JSON entries by discharge date:
#   [begining_dataset, start_heldout)  -> train / validation
#   [start_heldout, end_heldout]       -> held-out
#   anything else                      -> unused
train_val_data = {}
held_out_data = {}
unused_after_heldout = {}
for admit_key, raw_admission in data.items():
    discharged = datetime.datetime.fromisoformat(raw_admission['Discharge Date'])
    if begining_dataset <= discharged < start_heldout:
        target = train_val_data
    elif start_heldout <= discharged <= end_heldout:
        target = held_out_data
    else:
        target = unused_after_heldout
    target[admit_key] = raw_admission


print(f'held out:      {len(held_out_data):,}')
print(f'train and dev: {len(train_val_data):,}')
print(f'unused:        {len(unused_after_heldout):,}')

held out:      74,350
train and dev: 524,986
unused:        19,361


In [12]:
# Sanity check: the three partitions together should hold every entry of `data`.
sum(map(len, (held_out_data, train_val_data, unused_after_heldout)))

618697

In [17]:
# Write each partition to the JSON file named by its configuration key
# (training/validation first, then held-out, then unused).
for config_key, partition in (
    ('train_val_json', train_val_data),
    ('heldout_json', held_out_data),
    ('unused_after_heldout_json', unused_after_heldout),
):
    with open(config[config_key], 'w') as f:
        json.dump(partition, f)

In [None]:
# Disabled cell: splits the parsed Admission objects (rather than the raw JSON
# entries) into train/validation and held-out lists by discharge date.
# NOTE(review): if re-enabled, the held-out branch has no upper bound
# (end_heldout is not checked), and the two prints interpolate the lists
# themselves although their labels say len(...).
# train_and_validation_admissions = []
# heldout_admissions = []

# for admission in all_admissions:
#     if begining_dataset <= admission.discharge_date and admission.discharge_date < start_heldout:
#         train_and_validation_admissions.append(admission)
#     elif start_heldout <= admission.discharge_date:
#         heldout_admissions.append(admission)
# print(f'len(train_and_validation_admissions)= {train_and_validation_admissions}')
# print(f'len(heldout_admissions)=              {heldout_admissions}')

In [None]:
# Disabled cell: prints the percentage of admissions for which
# has_readmission() is true.
# count=0
# for admission in all_admissions:
#     if admission.has_readmission():
#         count+=1
# print(f'{100*(count/len(all_admissions))} %')