In [1]:
#general instructions

#2 or more patient encounters are required as the authors remove the last encounter while making predictions
# patient mortality is the outcome label and aquired from mimic. 
#2825 out of 7537 patients died (37.9%)
#Data set was divided into  train (5275/7537), validation (753/7537) and test (1509/7537)
# Once the optimal parameters were selected, the model was retrained by combining train and validation (6028/7537)
#
#
#********************** stars mean the data is not correct but the implemention functions.
#

In [2]:
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import random
import time

In [3]:
# set seed - only really necessary to compare results with each other, but can remove from the submission
seed = 24
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
os.environ["PYTHONHASHSEED"] = str(seed)

# define data path
DATA_PATH = "~/Documents/DLH 598/mimic-iii-clinical-database-1.4"


# define data - separated to show which modules load (some take much longer than others)
#The commented out sections take a long time to load, and weren't immidiately useful at this stage
#Can uncommment when relevant

#TODO
# - reorganize into a definition() after  

In [4]:
Patients = pd.read_csv(DATA_PATH + '/PATIENTS.csv')

In [187]:
Diagnosis = pd.read_csv(DATA_PATH + '/DIAGNOSES_ICD.csv')

In [6]:
Procedures = pd.read_csv(DATA_PATH + '/D_ICD_PROCEDURES.csv')

In [7]:
#Prescriptions = pd.read_csv(DATA_PATH + '/PRESCRIPTIONS.csv')

In [8]:
#LabEvents = pd.read_csv(DATA_PATH + '/LABEVENTS.csv')

In [9]:
LabLabels = pd.read_csv(DATA_PATH + '/D_LABITEMS.csv')

In [10]:
Admissions = pd.read_csv(DATA_PATH + '/ADMISSIONS.csv')

In [11]:
ProcedureLabels = pd.read_csv(DATA_PATH + '/D_ITEMS.csv')

In [97]:
IcuStays = pd.read_csv(DATA_PATH + '/ICUSTAYS.csv')

In [98]:
#ChartEvents = pd.read_csv(DATA_PATH + '/CHARTEVENTS.csv')

In [145]:
# patient inclusion - Only ICU patients with 2 or more visits are included. Patient IDs gathered
#May be incorrect as the paper specifies only 7,537 patients in their list.


#********************************************
#Generate an ignore list for later ease of use.
def Patient_Inclusion(IcuStays):
    IcuList = IcuStays.to_numpy()

    unique, counts = np.unique(IcuList[1:, 1], return_counts=True)
    
    AllPatients = np.array(list(zip(np.transpose(unique), np.transpose(counts))))

    PatientList = []
    IgnoreList = []

    for i in AllPatients:
        if i[1] >= 2:
            PatientList.append(i)
        else:
            IgnoreList.append(i[0])

    print(pd.DataFrame(PatientList)[0])
    print(len(IgnoreList))
    return PatientList, IgnoreList

In [173]:
#To be added to main at the end


#separate patient ids and count information
PatientList, IgnoreList = Patient_Inclusion(IcuStays)
PatientList = pd.DataFrame(PatientList, columns = ['ID', 'Count'])
Patient_ID = PatientList['ID']
Visits = PatientList['Count']

0           7
1          17
2          21
3          23
4          34
        ...  
8750    99865
8751    99883
8752    99897
8753    99923
8754    99982
Name: 0, Length: 8755, dtype: int64
37720


In [174]:
# data statistics - Ethnicity and visit statistics

def Ethnicity_Statistics(Admissions):
    white = 0
    black = 0
    hispanic = 0
    asian = 0
    other = 0
    
    #separate out ethnicity information
    EthStatistics = Admissions[['SUBJECT_ID', 'ETHNICITY']]
    IcuPatientEth = EthStatistics[EthStatistics.SUBJECT_ID.isin(Patient_ID)].drop_duplicates('SUBJECT_ID').reset_index(drop = True)

    #Not yet working.... General idea.
    for i in IcuPatientEth.ETHNICITY:
        if "WHITE" in i:
            white += 1
        if "BLACK" in i:
            black += 1
        if "HISPANIC" in i:
            hispanic += 1
        if "ASIAN"  in i:
            asian += 1
    
        #I tried combining it, but the count was always more wrong...
        if "OTHER" in i:
            other += 1
        if "DECLINED" in i:
            other += 1
        if "NATIVE" in i:
            other += 1
        if "MIDDLE EASTERN" in i:
            other += 1
        if "MULTI" in i:
            other += 1
        if "UNABLE" in i:
            other += 1
        if "UNKNOWN" in i:
            other += 1
        

    print(white / len(IcuPatientEth))
    print(black / len(IcuPatientEth))
    print(hispanic / len(IcuPatientEth))
    print(asian / len(IcuPatientEth))
    print(other / len(IcuPatientEth))

    print(((white + black + hispanic + asian + other) / len(IcuPatientEth)))
    
    #*****************************************************
    #6 too many values. Need to sort that out....
    print(len(IcuPatientEth))
    print(white + black + asian + hispanic + other)
    return white, black, hispanic, asian, other

In [175]:
#To be added to main at the end
Ethnicity_Statistics(Admissions)

0.7356938892061679
0.10428326670474015
0.03438035408338092
0.024785836664762993
0.10154197601370646
1.0006853226727583
8755
8761


(6441, 913, 301, 217, 889)

In [132]:
#Age statistics - unable to find age so far
#def Age_statistics():

In [184]:
#Sex statistics
def Gender_Statistics(Patients):
    male = 0
    female = 0
    
    SexStatistics = Patients[['SUBJECT_ID', 'GENDER']]
    IcuPatientSex = SexStatistics[SexStatistics.SUBJECT_ID.isin(Patient_ID)].drop_duplicates('SUBJECT_ID').reset_index(drop = True)
    
    for i in IcuPatientSex.GENDER:
        if i == "M":
            male += 1
        if i == "F":
            female += 1
            
    print(male)
    print(female)
    
    print(male / len(IcuPatientSex))
    print(female / len(IcuPatientSex))
    
    return male, female

In [185]:
Gender_Statistics(Patients)

4943
3812
0.5645916619074814
0.43540833809251855


(4943, 3812)

In [189]:
#ICD Code statistics
def ICD_Codes(Diagnosis):
    
    ICDStatistics = Diagnosis[['SUBJECT_ID', 'ICD9_CODE']]
    IcuPatientICD = ICDStatistics[ICDStatistics.SUBJECT_ID.isin(Patient_ID)].reset_index(drop = True)
    print(IcuPatientICD)
        
ICD_Codes(Diagnosis)

        SUBJECT_ID ICD9_CODE
0              109     40301
1              109       486
2              109     58281
3              109      5855
4              109      4254
...            ...       ...
278613       97488      0414
278614       97488     30391
278615       97488     E8798
278616       97488     78791
278617       97488     V4986

[278618 rows x 2 columns]
SUBJECT_ID
ICD9_CODE


In [None]:
#Medication statistics


In [None]:
#Lab component statistics
#The paper mentioned significant removal of duplicate patient information at this step (20% of data removed)
#a total of 5,609,021 lab tests were removed as they were not used.


In [None]:
#Outcome label = patient mortatility. Flag found in Admissions?


In [None]:
#Data loader


In [None]:
#Retain


In [None]:
#Retain training


In [None]:
#Clout model (The most important model to finish)


In [None]:
#Clout training


In [None]:
#Linear regression model


In [None]:
#LR training


In [None]:
#Evaluation (do all together? or can do separate?)


In [None]:
#Summary conclusions


In [None]:
#Summary Graphs etc...
