### Import Libraries 

In [9]:
import pandas as pd
import numpy as np
from pandas.io import gbq

# Switch off pandas' chained-assignment check (SettingWithCopyWarning) for the whole session.
# Presumably needed because the gap-filling cell writes new rows into the per-stay
# frames handed out by groupby - TODO confirm.
# NOTE(review): this is a global setting, so it also hides genuine chained-assignment
# mistakes in every later cell.
pd.options.mode.chained_assignment = None  # default='warn'

In [13]:
# Load the data: every cohort-1 heparin interval whose rel_starttime lies below 1440
# (the bound the gap-filling cell refers to as t=24).
#
# ORDER BY makes the row order deterministic. BigQuery gives no ordering guarantee
# without it, and the gap-filling cell compares each row with the row before it
# inside a stay, so the rows of a stay must arrive in chronological order.
query_hep = """
            SELECT *
            FROM `bachelorarbeit-heparin.mimic_data.cohort1_hep_data`
            WHERE rel_starttime < 1440
            ORDER BY stay_id, rel_starttime
            """
df_hep = gbq.read_gbq(query_hep, project_id = "bachelorarbeit-heparin")

Downloading: 100%|███████████████████████████████████████████████████████████| 15118/15118 [00:04<00:00, 3083.67rows/s]


### The aim of this code is to take the heparin-related data from cohort 1 and to fill the missing intervals, in which no heparin was administered, with a rate of zero. This facilitates subsequent work with the data.

In [18]:
# Column layout of the heparin table. The output frame keeps exactly this order.
# NOTE(review): the original cell addressed columns by position (0..19); the names
# below assume df_hep delivers its columns in this order - confirm against the table.
HEP_COLUMNS = ['subject_id', 'hadm_id', 'stay_id', 'starttime', 'endtime', 'init_hep_starttime',
               'rel_starttime', 'rel_endtime', 'treatment_length', 'kum_amount_start', 'kum_amount_end',
               'kum_amount_start_by_weight', 'kum_amount_end_by_weight', 'amount', 'amountuom', 'rate',
               'rateuom', 'patientweight', 'rate_by_weight', 'ordercategorydescription']

# End of the observation window, in the unit of rel_starttime / rel_endtime.
# The original notes call this "t=24"; presumably 24 h expressed in minutes - TODO confirm.
WINDOW_END = 1440


def _zero_rate_row(prev, meta, endtime, rel_endtime):
    """Build one rate-0 row that starts where `prev` ends.

    Parameters
    ----------
    prev : dict
        The interval preceding the gap. Supplies the start of the new row
        (its `endtime` / `rel_endtime`) and the cumulative amounts, which stay
        constant across the gap because nothing is administered.
    meta : dict
        The row the ids, units, patient weight and order category are copied from.
    endtime : object
        Value for the `endtime` column of the new row.
    rel_endtime : number
        Value for the `rel_endtime` column of the new row.

    Returns
    -------
    dict
        A record with one entry per column in HEP_COLUMNS.
    """
    return {
        'subject_id': meta['subject_id'],
        'hadm_id': meta['hadm_id'],
        'stay_id': meta['stay_id'],
        'starttime': prev['endtime'],
        'endtime': endtime,
        'init_hep_starttime': meta['init_hep_starttime'],
        'rel_starttime': prev['rel_endtime'],
        'rel_endtime': rel_endtime,
        'treatment_length': rel_endtime - prev['rel_endtime'],
        'kum_amount_start': prev['kum_amount_end'],
        'kum_amount_end': prev['kum_amount_end'],
        'kum_amount_start_by_weight': prev['kum_amount_end_by_weight'],
        'kum_amount_end_by_weight': prev['kum_amount_end_by_weight'],
        'amount': 0,
        'amountuom': meta['amountuom'],
        'rate': 0,
        'rateuom': meta['rateuom'],
        'patientweight': meta['patientweight'],
        'rate_by_weight': 0,
        'ordercategorydescription': meta['ordercategorydescription'],
    }


def fill_heparin_gaps(hep_rows, window_end=WINDOW_END):
    """Insert rate-0 rows wherever a stay has no heparin interval.

    For every stay_id two kinds of rows are added:

    1. a row between two consecutive intervals when the later one starts after
       the earlier one has ended;
    2. a closing row from the end of the last interval up to `window_end` when
       the last interval ends before `window_end`. Its `endtime` is the
       placeholder string 'tbd', as in the original cell.

    Parameters
    ----------
    hep_rows : pandas.DataFrame
        Heparin intervals containing all columns listed in HEP_COLUMNS.
    window_end : number, default WINDOW_END
        Upper bound of the observation window on the rel_* time axis.

    Returns
    -------
    pandas.DataFrame
        Original and inserted rows, sorted by stay_id and rel_starttime and
        indexed by stay_id.

    Raises
    ------
    KeyError
        If `hep_rows` lacks one of the columns in HEP_COLUMNS.
    """
    missing = [col for col in HEP_COLUMNS if col not in hep_rows.columns]
    if missing:
        raise KeyError(f"df_hep is missing expected columns: {missing}")

    # Collect plain records and build the frame once at the end. Appending row by
    # row to a DataFrame is quadratic, and DataFrame.append no longer exists in pandas 2.
    records = []
    for _, stay in hep_rows.groupby('stay_id'):
        # Gap detection compares neighbours, so the rows have to be in chronological
        # order; mergesort is stable and keeps the incoming order of ties.
        rows = stay.sort_values('rel_starttime', kind='mergesort').to_dict('records')

        prev = None
        for cur in rows:
            # The first row of a stay has no predecessor (the original looked at
            # index -1, i.e. the last row of the stay). A gap exists only when the
            # current interval starts strictly after the previous one ended; the
            # original `!=` test also produced negative-length rows for overlaps.
            # NOTE(review): overlapping intervals are not merged - only the
            # immediately preceding row is considered.
            if (prev is not None
                    and cur['rel_starttime'] != 0
                    and cur['rel_starttime'] > prev['rel_endtime']):
                records.append(_zero_rate_row(prev, cur, cur['starttime'], cur['rel_starttime']))
            records.append(cur)
            prev = cur

        # Treatment ended before the end of the window: pad with a rate-0 row.
        # groupby never yields an empty group, so `prev` is always set here.
        if prev['rel_endtime'] < window_end:
            records.append(_zero_rate_row(prev, prev, 'tbd', window_end))

    filled = pd.DataFrame(records, columns=HEP_COLUMNS)
    # sort_values returns a new frame; the original cell discarded that result.
    filled = filled.sort_values(by=['stay_id', 'rel_starttime'])
    return filled.set_index('stay_id')


new_df = fill_heparin_gaps(df_hep)

In [19]:
# Write the gap-filled heparin table to a CSV file in the working directory
new_df.to_csv(path_or_buf='cohort1_hep_data_preprocessed.csv')