In [1]:
import pandas as pd
from datetime import datetime, timedelta
import numpy as np
import warnings
warnings.filterwarnings("ignore")

import pathlib

In [27]:
def datCnv(src):
    return pd.to_datetime(src)

def IOBP2_cleaning(filepath,clean_data_path):
    #load patient roster
    filename = filepath + 'IOBP2PtRoster.txt'
    roster = pd.read_csv(filename, sep="|")
    #build clean roster
    PatientInfo = pd.DataFrame(columns=['PtID','StartDate','EndDate','TrtGroup','Age'])
    PatientInfo['PtID'] = roster['PtID']
    PatientInfo['StartDate'] = roster['RandDt']
    PatientInfo['EndDate'] = roster['TransRandDt']
    PatientInfo['TrtGroup'] = roster['TrtGroup']
    PatientInfo['Age'] = roster['AgeAsofEnrollDt']
    
    #load manual injections data
    filename = filepath + 'IOBP2ManualInsulinInj.txt'
    data_man_inj = pd.read_csv(filename, sep="|")
    #create datetime objects for easy inclusion
    data_man_inj['DateTime'] = np.nan
    data_man_inj.InsInjDt = [datetime.strptime(data_man_inj['InsInjDt'][x],'%m/%d/%Y').date() for x in data_man_inj.index.values]
    for i in data_man_inj.index.values:
        if (data_man_inj.InsInjAMPM[i] == 'PM') & (data_man_inj.InsInjHr[i]!= 12):
            data_man_inj.InsInjHr[i] = data_man_inj.InsInjHr[i] + 12

        data_man_inj['DateTime'][i] = datetime(data_man_inj.InsInjDt[i].year,
                                           data_man_inj.InsInjDt[i].month,
                                           data_man_inj.InsInjDt[i].day,
                                           data_man_inj.InsInjHr[i],
                                           data_man_inj.InsInjMin[i],
                                          )
    #load insulin pump data
    filename = filepath + 'IOBP2DeviceiLet.txt'
    data = pd.read_csv(filename, sep="|")
    #create new dateframe for clean data
    cleaned_data = pd.DataFrame()
    patient_data = pd.DataFrame()
    for id in PatientInfo.PtID.values:
        subj_data = data[data.PtID == id].reset_index(drop=True)
        

        subj_info = PatientInfo[PatientInfo.PtID == id].reset_index(drop=True)
        if len(subj_data) > 0:
            subj_info['DaysOfData'] = np.nan
            subj_info['Weight'] = np.nan
            subj_info['AVG_CGM'] = np.nan
            subj_info['STD_CGM'] = np.nan
            subj_info['CGM_Availability'] = np.nan
            subj_info['eA1C'] = np.nan
            subj_info['TIR'] = np.nan
            subj_info['TDD'] = np.nan

            subj_inj = data_man_inj[data_man_inj.PtID == id].reset_index(drop=True)

            data_preclean = subj_data.filter(items=['DeviceDtTm','PtID','CGMVal','BGMVal','InsDelivAvail','InsDelivPrev'])
            data_preclean['InsulinDelivered'] = data_preclean.InsDelivPrev.shift(-1)
            data_preclean['DateTime'] = data_preclean.DeviceDtTm.apply(datCnv)

#             if id in [222,463,112]: #each day at midnight the datetime stamp is only a date
#                 for x in data_preclean.index:
#                     try:
#                         data_preclean['DeviceDtTm'][x] = datetime.strptime(data_preclean['DeviceDtTm'][x],'%m/%d/%Y %H:%M:%S %p').isoformat()
#                     except:
#                         data_preclean['DeviceDtTm'][x] = datetime.strptime(data_preclean['DeviceDtTm'][x],'%m/%d/%Y').isoformat()        
#             else:
#                 data_preclean['DeviceDtTm'] = [datetime.strptime(data_preclean['DeviceDtTm'][x],'%m/%d/%Y %H:%M:%S %p').isoformat() for x in data_preclean.index]
#             data_preclean['DateTime'] = [datetime.fromisoformat(data_preclean['DeviceDtTm'][x]) for x in data_preclean.index]
            data_preclean = data_preclean.sort_values(by='DateTime').reset_index(drop=True)
            try:
                subj_info['StartDate'] = subj_info.StartDate[0].apply(datCnv)
                data_preclean = data_preclean[data_preclean.DateTime >= subj_info.StartDate.iloc[0]]
                
            except:
                pass
            #not everyone has an end data
            try:
                subj_info['EndDate'] = subj_info.EndDate[0].apply(datCnv)
                data_preclean = data_preclean[data_preclean.DateTime <= subj_info.EndDate.iloc[0]]
            except:
                pass

            data_preclean['TimeBetween'] = data_preclean.DateTime.diff()
            data_preclean['TimeBetween'] = [data_preclean['TimeBetween'][x].total_seconds()/60 for x in data_preclean.index]
            #add manual injections
            data_preclean['ManualIns'] = 0
            if len(subj_inj)>0:
                #find closest CGM time to injection
                for i in subj_inj.index.values:
                    data_preclean['TimeFromInj'] = [(data_preclean['DateTime'][x] - subj_inj.DateTime[i]).total_seconds() for x in data_preclean.index]
                    data_preclean['TimeFromInj'] = data_preclean['TimeFromInj'].abs()
                    injection_index = data_preclean[data_preclean.TimeFromInj == data_preclean.TimeFromInj.min()].index.values[0]
                    data_preclean['ManualIns'][injection_index] = subj_inj.InsInjAmt[i]

            clean_subj = data_preclean.filter(items=['DateTime','PtID','CGMVal','BGMVal','InsDelivAvail','InsulinDelivered','ManualIns'])
            clean_subj['DateTime'] = [clean_subj['DateTime'][x].isoformat() for x in clean_subj.index]
            clean_subj = clean_subj.rename(columns={"DeviceDtTm": "LocalTime", 
                                       "InsDelivAvail": "InsulinAvailable",
                                       "ManualIns": "ManualDelivery"
                                      })

            cleaned_data = pd.concat([cleaned_data,clean_subj])
            if len(clean_subj)>0:
                subj_info['DaysOfData'][0] = np.round(len(subj_data)/288,2)
                subj_info['Weight'][0] = subj_data.PtWeight.iloc[-1]
                subj_info['AVG_CGM'][0] = np.round(clean_subj.CGMVal.mean(),2)
                subj_info['STD_CGM'][0] = np.round(clean_subj.CGMVal.std(),2)
                subj_info['CGM_Availability'][0] = np.round(100 * len(clean_subj[clean_subj.CGMVal>0])/len(clean_subj),2)
                subj_info['eA1C'][0] = np.round((46.7 + clean_subj.CGMVal.mean())/28.7,2)
                subj_info['TIR'][0] = np.round(100 * len(clean_subj[(clean_subj.CGMVal>=70) & (clean_subj.CGMVal<=180)])/len(clean_subj[clean_subj.CGMVal>0]),2)
                subj_info['TDD'][0] = np.round(clean_subj.InsulinDelivered.sum()/subj_info['DaysOfData'][0],2)

                pt_data = subj_info.filter(items=['PtID','StartDate','EndDate','TrtGroup','Age','DaysOfData','Weight','AVG_CGM','STD_CGM','CGM_Availability',
                                                  'eA1C','TIR','TDD'])
                patient_data = pd.concat([patient_data,pt_data])
        #creates a new folder (if it doesnt exist) for cleaned data to be saved
        pathlib.Path(clean_data_path + "CleanedData").mkdir(parents=True, exist_ok=True)
        cleaned_data.to_csv(clean_data_path + "CleanedData/IOBP2_cleaned_egvinsulin.csv",index=False)
        patient_data.to_csv(clean_data_path + "CleanedData/IOBP2_patient_data.csv",index=False)
    
    return cleaned_data,patient_data

In [13]:
cleaned_data_path = '/Users/rachelbrandt/egvinsulin/' #location where you want cleaned data to be stored
pathname = '/Users/rachelbrandt/Downloads/IOBP2 RCT Public Dataset/Data Tables/' #location of data to be cleaned

cleaned_data,patient_data = IOBP2_cleaning(pathname,cleaned_data_path)
cleaned_data

Unnamed: 0,DateTime,PtID,CGMVal,BGMVal,InsulinAvailable,InsulinDelivered,ManualDelivery
0,2020-07-26T00:02:43,235,204.0,,True,0.160,0
1,2020-07-26T00:07:43,235,206.0,,True,0.069,0
2,2020-07-26T00:12:43,235,204.0,,True,0.000,0
3,2020-07-26T00:17:43,235,200.0,,True,0.076,0
4,2020-07-26T00:22:43,235,199.0,,True,0.000,0
...,...,...,...,...,...,...,...
28195,2021-04-19T11:02:02,346,,,False,0.000,0
28196,2021-04-19T11:07:02,346,,,False,0.000,0
28197,2021-04-19T11:12:02,346,,,False,0.000,0
28198,2021-04-19T11:17:02,346,,,False,0.000,0


In [38]:
def datCnv(src):
    return pd.to_datetime(src)

def DCLP5_cleaning(filepath_old_data,filepath_new_data,clean_data_path):
    
    filename = filepath_new_data + 'DCLP5TandemBolus_Completed_Combined_b.txt'
    bolus_data = pd.read_csv(filename, sep="|",low_memory = False)

    filename = filepath_old_data + 'DexcomClarityCGM.csv'
    data_cgm = pd.read_csv(filename, sep="|",low_memory = False)

    filename = filepath_old_data + 'TandemAACGMDATA.csv'
    data_deliv = pd.read_csv(filename, sep="|",low_memory = False)

    filename = filepath_new_data + 'PtRoster.txt'
    roster = pd.read_csv(filename, sep="|")
#     if study == 'DCLP3':
#         filename = filepath_new_data + 'DCLP5TandemBolus_Completed_Combined_b.txt'
#         bolus_data = pd.read_csv(filename, sep="|",low_memory = False)
        
#         filename = filepath_old_data + 'DexcomClarityCGM_a.txt'
#         data_cgm = pd.read_csv(filename, sep="|",low_memory = False)

#         filename = filepath_old_data + 'TandemAACGMDATA.csv'
#         data_deliv = pd.read_csv(filename, sep="|",low_memory = False)

#         filename = filepath_new_data + 'PtRoster.txt'
#         roster = pd.read_csv(filename, sep="|")
    
    PatientInfo = pd.DataFrame(columns=['PtID','StartDate','Phase2Start','TrtGroup','Age'])
    PatientInfo['PtID'] = roster['PtID']
    PatientInfo['StartDate'] = roster['RandDt']
    PatientInfo['Phase2Start'] = roster['Phase2StartDt']
    PatientInfo['TrtGroup'] = roster['trtGroup']
    PatientInfo['Age'] = roster['AgeAtEnrollment']
    
    
        
    cleaned_data = pd.DataFrame()
    patient_data = pd.DataFrame()
    for id in PatientInfo.PtID.values:
        try:
            subj_info = PatientInfo[PatientInfo.PtID == id].reset_index(drop=True)
            patient_cgm = data_cgm[data_cgm.PtID == id].filter(items=['PtID','DataDtTm','CGM'])
            patient_deliv = data_deliv[data_deliv.PtID == id].filter(items=['DataDtTm','dU','Usugg'])
            patient_bolus = bolus_data[bolus_data.PtID == id].filter(items=['DataDtTm','BolusAmount','BolusType'])

            patient_deliv['DateTime'] = patient_deliv.DataDtTm.apply(datCnv)
            patient_cgm['DateTime'] = patient_cgm.DataDtTm.apply(datCnv)
            patient_bolus['DateTime'] = patient_bolus.DataDtTm.apply(datCnv)
            
            patient_deliv = patient_deliv.sort_values(by='DateTime').reset_index(drop=True)
            patient_cgm = patient_cgm.sort_values(by='DateTime').reset_index(drop=True)
            patient_bolus = patient_bolus.sort_values(by='DateTime').reset_index(drop=True)
            
            patient_cgm = patient_cgm[patient_cgm.DateTime >= subj_info.StartDate.iloc[0]]
            patient_bolus = patient_bolus[patient_bolus.DateTime >= subj_info.StartDate.iloc[0]]
            patient_deliv = patient_deliv[patient_deliv.DateTime >= subj_info.StartDate.iloc[0]]

            #round all time to nearest 5 minute value
            patient_cgm['DateTime'] = patient_cgm['DateTime'].dt.round("5min")
            patient_deliv['DateTime'] = patient_deliv['DateTime'].dt.round("5min")
            patient_bolus['DateTime'] = patient_bolus['DateTime'].dt.round("5min")
            #merge dataframes on rounded time - will keep true pump time
            data_merge = patient_deliv.merge(patient_cgm,on='DateTime',how='outer').merge(patient_bolus,on='DateTime',how='outer')
            #keep data for only when the pump was operational (has a time value) - some CGM data predates insulin
            data_merge_final = data_merge[(data_merge.DataDtTm_x.notna())]
            #convert extended boluses to 50% at time of bolus (t) and 50% extended for 2 hours (from t+1 to t+24)
            extended_index = data_merge_final[data_merge_final.BolusType=='Extended'].index.values
            data_merge_final.BolusAmount = data_merge_final.BolusAmount.fillna(0)
            for e in extended_index:
                data_merge_final.BolusAmount[e] = data_merge_final.BolusAmount[e]*0.5
                data_merge_final.BolusAmount.loc[e+1:e+24] = data_merge_final.BolusAmount.loc[e+1:e+24] + (data_merge_final.BolusAmount[e]*0.5)/24
            data_merge_final['BasalDelivery'] = data_merge_final.Usugg/1000
            data_merge_final['InsulinDelivered'] = data_merge_final.BasalDelivery + data_merge_final.BolusAmount
            clean_subj = data_merge_final.filter(items=['PtID','DataDtTm_x','CGM','BasalDelivery','BolusAmount','BolusType','InsulinDelivered'])
            clean_subj['PtID'] = id
            clean_subj = clean_subj.rename(columns={"DataDtTm_x": "DateTime", 
                                           "BolusAmount": "BolusDelivery",
                                           "CGM": "CGMVal"         
                                          })

            cleaned_data = pd.concat([cleaned_data,clean_subj])
            if len(clean_subj)>0:
                subj_info = PatientInfo[PatientInfo.PtID == id].reset_index(drop=True)
                subj_info['DaysOfData'] = np.nan
                subj_info['AVG_CGM'] = np.nan
                subj_info['STD_CGM'] = np.nan
                subj_info['CGM_Availability'] = np.nan
                subj_info['eA1C'] = np.nan
                subj_info['TIR'] = np.nan
                subj_info['TDD'] = np.nan

                subj_info['DaysOfData'][0] = np.round(len(data_merge_final)/288,2)
                subj_info['AVG_CGM'][0] = np.round(clean_subj.CGMVal.mean(),2)
                subj_info['STD_CGM'][0] = np.round(clean_subj.CGMVal.std(),2)
                subj_info['CGM_Availability'][0] = np.round(100 * len(clean_subj[clean_subj.CGMVal>0])/len(clean_subj),2)
                subj_info['eA1C'][0] = np.round((46.7 + clean_subj.CGMVal.mean())/28.7,2)
                subj_info['TIR'][0] = np.round(100 * len(clean_subj[(clean_subj.CGMVal>=70) & (clean_subj.CGMVal<=180)])/len(clean_subj[clean_subj.CGMVal>0]),2)
                subj_info['TDD'][0] = np.round(clean_subj.InsulinDelivered.sum()/subj_info['DaysOfData'][0],2)

                pt_data = subj_info.filter(items=['PtID','StartDate','Phase2Start','TrtGroup','Age','DaysOfData','Weight','AVG_CGM','STD_CGM','CGM_Availability',
                                                  'eA1C','TIR','TDD'])
                patient_data = pd.concat([patient_data,pt_data])
        except:
            pass
    pathlib.Path(clean_data_path + "CleanedData").mkdir(parents=True, exist_ok=True)
    cleaned_data.to_csv(clean_data_path + "CleanedData/DCLP5_cleaned_egvinsulin.csv",index=False)
    patient_data.to_csv(clean_data_path + "CleanedData/DCLP5_patient_data.csv",index=False)
    
    return cleaned_data,patient_data

In [39]:
filepath_old_data = '/Users/rachelbrandt/Downloads/Data Files DCLP5/'
filepath_new_data = '/Users/rachelbrandt/Downloads/DCLP5_Dataset_2022-01-20-5e0f3b16-c890-4ace-9e3b-531f3687cf53/'
cleaned_data_path = '/Users/rachelbrandt/egvinsulin/' #location where you want cleaned data to be stored

cleaned_data,patient_data =  DCLP_cleaning(filepath_old_data,filepath_new_data,cleaned_data_path)
patient_data

Unnamed: 0,PtID,StartDate,Phase2Start,TrtGroup,Age,DaysOfData,AVG_CGM,STD_CGM,CGM_Availability,eA1C,TIR,TDD
0,21,2019-04-12 00:00:00,2019-08-02 00:00:00,CLC,13,196.75,150.38,46.48,90.83,6.87,77.66,62.69
0,27,2018-10-27 00:00:00,2019-02-17 00:00:00,SC,8,86.23,170.57,77.74,95.22,7.57,61.75,33.05
0,26,2019-04-16 00:00:00,2019-08-07 00:00:00,CLC,10,197.89,187.43,65.60,58.87,8.16,52.83,64.65
0,80,2018-11-07 00:00:00,2019-02-21 00:00:00,CLC,11,202.56,135.39,45.78,1.89,6.34,82.81,38.28
0,87,2018-11-26 00:00:00,2019-03-19 00:00:00,SC,11,91.07,135.38,38.05,98.44,6.34,85.87,54.48
...,...,...,...,...,...,...,...,...,...,...,...,...
0,51,2019-03-02 00:00:00,2019-06-23 00:00:00,CLC,13,197.40,174.45,73.35,94.47,7.71,61.57,53.52
0,18,2019-01-11 00:00:00,2019-05-05 00:00:00,CLC,11,200.91,177.74,66.33,64.66,7.82,56.70,50.36
0,95,2019-04-29 00:00:00,2019-08-19 00:00:00,CLC,11,195.88,175.73,66.07,52.08,7.75,59.98,130.95
0,12,2018-11-17 00:00:00,2019-03-09 00:00:00,CLC,10,202.97,145.15,52.92,95.51,6.68,75.27,31.29


In [21]:
def datCnv(src):
    return pd.to_datetime(src)

def DCLP3_cleaning(filepath_data,clean_data_path):
    filename = filepath_data + 'Pump_BasalRateChange.txt'
    BasalRate = pd.read_csv(filename, sep="|", low_memory = False)

    filename = filepath_data + 'Pump_BolusDelivered.txt'
    Bolus = pd.read_csv(filename, sep="|" , low_memory = False)

    filename = filepath_data + 'Pump_CGMGlucoseValue.txt'
    CGM = pd.read_csv(filename, sep="|", low_memory = False)

    filename = filepath_data + 'PtRoster_b.txt'
    roster = pd.read_csv(filename, low_memory = False)
    
    PatientInfo = pd.DataFrame(columns=['PtID','StartDate','TrtGroup'])
    PatientInfo['PtID'] = roster['PtID']
    PatientInfo['StartDate'] = roster['RandDt']
    PatientInfo['TrtGroup'] = roster['trtGroup']
    
    cleaned_data = pd.DataFrame()
    patient_data = pd.DataFrame()
    j = 0
    for id in PatientInfo.PtID.values:
        print(id)
        try:
            subj_info = PatientInfo[PatientInfo.PtID == id].reset_index(drop=True)
            patient_deliv = BasalRate[BasalRate.PtID == id]
            patient_cgm = CGM[CGM.PtID == id]
            patient_bolus = Bolus[Bolus.PtID == id]

            patient_deliv['DateTime'] = patient_deliv.DataDtTm.apply(datCnv)
            patient_cgm['DateTime'] = patient_cgm.DataDtTm.apply(datCnv)
            patient_bolus['DateTime'] = patient_bolus.DataDtTm.apply(datCnv)

            patient_deliv = patient_deliv.sort_values(by='DateTime').reset_index(drop=True)
            patient_cgm = patient_cgm.sort_values(by='DateTime').reset_index(drop=True)
            patient_bolus = patient_bolus.sort_values(by='DateTime').reset_index(drop=True)

            patient_cgm['DateTime'] = patient_cgm['DateTime'].dt.round("5min")
            patient_deliv['DateTime'] = patient_deliv['DateTime'].dt.round("5min")
            patient_bolus['DateTime'] = patient_bolus['DateTime'].dt.round("5min")

            patient_deliv['DataDtTm'] = patient_deliv.DataDtTm.apply(datCnv)
            patient_deliv['TimeBetween'] = patient_deliv.DataDtTm.diff(-1)
            patient_deliv['TimeBetween'] = [patient_deliv['TimeBetween'][x].total_seconds()/60 for x in patient_deliv.index]
            patient_deliv = patient_deliv[patient_deliv.TimeBetween < -3.5]
            dup_index = patient_deliv[patient_deliv.DateTime.duplicated()].index.values
            patient_deliv.DateTime.loc[dup_index] = patient_deliv.DateTime.loc[dup_index] + timedelta(minutes=5)

            patient_deliv = patient_deliv.drop_duplicates(subset=['DateTime'],keep='first')
            patient_deliv.index = patient_deliv.DateTime
            patient_deliv = patient_deliv.resample('5T').asfreq()
            patient_deliv.DateTime = patient_deliv.index.values
            patient_deliv = patient_deliv.reset_index(drop=True)
            interp_deliv = patient_deliv.filter(items=['PtID','CommandedBasalRate','DateTime'])

            interp_deliv = interp_deliv.ffill()
            interp_deliv['BasalDelivery'] = interp_deliv.CommandedBasalRate/12

            bolus_filt = patient_bolus.filter(items=['DateTime','BolusType','BolusAmount'])
            cgm_filt = patient_cgm.filter(items=['DateTime','CGMValue'])
            clean_subj = interp_deliv.merge(cgm_filt,on='DateTime',how='outer').merge(bolus_filt,on='DateTime',how='outer')

            clean_subj = clean_subj.dropna(subset=['BasalDelivery'])
            clean_subj.BolusAmount = clean_subj.BolusAmount.fillna(0)

            extended_index = clean_subj[clean_subj.BolusType=='Extended'].index.values
            for e in extended_index:
                clean_subj.BolusAmount[e] = clean_subj.BolusAmount[e]*0.5
                clean_subj.BolusAmount.loc[e+1:e+24] = clean_subj.BolusAmount.loc[e+1:e+24] + (clean_subj.BolusAmount[e]*0.5)/24
            clean_subj = clean_subj.rename(columns={
                                           "CGMValue": "CGMVal",
                                           "BolusAmount": "BolusDelivery",
                                          }) 
            clean_subj['InsulinDelivered'] = clean_subj.BasalDelivery + clean_subj.BolusDelivery
            cleaned_data = pd.concat([cleaned_data,clean_subj])
            if len(clean_subj)>0:
                subj_info['DaysOfData'] = np.nan
                subj_info['AVG_CGM'] = np.nan
                subj_info['STD_CGM'] = np.nan
                subj_info['CGM_Availability'] = np.nan
                subj_info['eA1C'] = np.nan
                subj_info['TIR'] = np.nan
                subj_info['TDD'] = np.nan

                subj_info['DaysOfData'][0] = np.round(len(clean_subj)/288,2)
                subj_info['AVG_CGM'][0] = np.round(clean_subj.CGMVal.mean(),2)
                subj_info['STD_CGM'][0] = np.round(clean_subj.CGMVal.std(),2)
                subj_info['CGM_Availability'][0] = np.round(100 * len(clean_subj[clean_subj.CGMVal>0])/len(clean_subj),2)
                subj_info['eA1C'][0] = np.round((46.7 + clean_subj.CGMVal.mean())/28.7,2)
                subj_info['TIR'][0] = np.round(100 * len(clean_subj[(clean_subj.CGMVal>=70) & (clean_subj.CGMVal<=180)])/len(clean_subj[clean_subj.CGMVal>0]),2)
                subj_info['TDD'][0] = np.round(clean_subj.InsulinDelivered.sum()/subj_info['DaysOfData'][0],2)

                pt_data = subj_info.filter(items=['PtID','StartDate','TrtGroup','DaysOfData','AVG_CGM','STD_CGM','CGM_Availability',
                                                  'eA1C','TIR','TDD'])
                patient_data = pd.concat([patient_data,pt_data])
        except:
            pass
    pathlib.Path(clean_data_path + "CleanedData").mkdir(parents=True, exist_ok=True)
    cleaned_data.to_csv(clean_data_path + "CleanedData/DCLP3_cleaned_egvinsulin.csv",index=False)
    patient_data.to_csv(clean_data_path + "CleanedData/DCLP3_patient_data.csv",index=False)
    
    return cleaned_data,patient_data

In [None]:
filepath = '/Users/rachelbrandt/Downloads/DCLP3 Public Dataset - Release 3 - 2022-08-04/Data Files/'
cleaned_data_path = '/Users/rachelbrandt/egvinsulin/' #location where you want cleaned data to be stored

cleaned_data,patient_data =  DCLP3_cleaning(filepath,cleaned_data_path)
patient_data

In [23]:
cleaned_data

Unnamed: 0,PtID,CommandedBasalRate,DateTime,BasalDelivery,CGMVal,BolusType,BolusDelivery,InsulinDelivered
0,10.0,2.000,2018-04-04 12:55:00,0.166667,,,0.0,0.166667
1,10.0,0.000,2018-04-04 13:00:00,0.000000,,,0.0,0.000000
2,10.0,0.000,2018-04-04 13:05:00,0.000000,,,0.0,0.000000
3,10.0,0.000,2018-04-04 13:10:00,0.000000,,Standard,0.1,0.100000
4,10.0,0.000,2018-04-04 13:15:00,0.000000,,,0.0,0.000000
...,...,...,...,...,...,...,...,...
54025,171.0,0.950,2018-10-24 08:45:00,0.079167,168.0,,0.0,0.079167
54026,171.0,0.950,2018-10-24 08:50:00,0.079167,174.0,,0.0,0.079167
54027,171.0,1.119,2018-10-24 08:55:00,0.093250,192.0,,0.0,0.093250
54028,171.0,3.207,2018-10-24 09:00:00,0.267250,189.0,,0.0,0.267250


In [64]:
def datCnv(src):
    return pd.to_datetime(src)

def build_basal_profile(data):
    data['DataDtTm'] = data.DataDtTm.apply(datCnv)
    data['DateTime'] = data['DataDtTm'].dt.round("5min")
    data = data.sort_values(by='DateTime')
    data = data.drop_duplicates(subset=['DateTime'],keep='last')
    #get 5 minute data and convert rates to delivery
    data.index = data.DateTime
    data = data.resample('5T').asfreq()
    data.DateTime = data.index.values
    data = data.reset_index(drop=True)
    data.BasalRt = data.BasalRt.ffill()
    data.BasalRt = data.BasalRt/12 #convert to 5 minute delivery
    
    extended_boluses = data[data.ExtendBolusDuration.notna()]
    if len(extended_boluses) > 0:
        extended_boluses['Duration'] = [datetime.strptime(extended_boluses.ExtendBolusDuration[t],"%H:%M:%S") for t in extended_boluses.index.values]
        extended_boluses['Duration_minutes'] = [timedelta(hours=extended_boluses['Duration'][t].hour, minutes=extended_boluses['Duration'][t].minute, seconds=extended_boluses['Duration'][t].second).total_seconds()/60 for t in extended_boluses.index]
        extended_boluses['Duration_steps'] = extended_boluses['Duration_minutes']/5
        extended_boluses['Duration_steps'] = extended_boluses['Duration_steps'].round()
    
        for ext in extended_boluses.index:
            bolus_parts = extended_boluses.BolusDeliv[ext]/extended_boluses.Duration_steps[ext]
            data.BolusDeliv.loc[ext:ext+int(extended_boluses.Duration_steps[ext])] = bolus_parts

    return data

def FLAIR_cleaning(filepath_data,clean_data_path):
    filename = filepath_data + 'FLAIRDevicePump.txt'
    InsulinData = pd.read_csv(filename, sep="|", low_memory = False)

    filename = filepath_data + 'FLAIRDeviceCGM.txt'
    CGM = pd.read_csv(filename, sep="|" , low_memory = False)

    filename = filepath_data + 'PtRoster.txt'
    roster = pd.read_csv(filename, sep="|", low_memory = False)

     
    PatientInfo = pd.DataFrame(columns=['PtID','StartDate','TrtGroup'])
    PatientInfo['PtID'] = roster['PtID']
    PatientInfo['StartDate'] = roster['RandDt']
    PatientInfo['TrtGroup'] = roster['TrtGroup']
    PatientInfo['Age'] = roster['AgeAsofEnrollDt']

    cleaned_data = pd.DataFrame()
    patient_data = pd.DataFrame()
    
    for id in PatientInfo.PtID.values:
        try:
            subj_info = PatientInfo[PatientInfo.PtID == id].reset_index(drop=True)
            patient_deliv = InsulinData[InsulinData.PtID == id]
            if len(patient_deliv)>0:
                patient_cgm = CGM[CGM.PtID == id]

                patient_ins = build_basal_profile(patient_deliv)

                patient_cgm['DateTime'] = patient_cgm.DataDtTm.apply(datCnv)

                patient_cgm['DateTime'] = patient_cgm['DateTime'].dt.round("5min")
                patient_cgm = patient_cgm.sort_values(by='DateTime')
                patient_cgm = patient_cgm.drop_duplicates(subset=['DateTime'],keep='last')

                data_merge = patient_ins.merge(patient_cgm,on='DateTime',how='outer')
                clean_subj = data_merge[(data_merge.DataDtTm_x.notna())]
                clean_subj = clean_subj.filter(items=['PtID_y','DateTime','BolusType', 'BolusDeliv', 'BasalRt','CGM'])

                clean_subj = clean_subj.rename(columns={
                                               "CGM": "CGMVal","PtID_y": "PtID",
                                               "BolusDeliv": "BolusDelivery",
                                               "BasalRt": "BasalDelivery",
                                              }) 
                clean_subj.BolusDelivery = clean_subj.BolusDelivery.fillna(0)
                clean_subj['InsulinDelivered'] = clean_subj.BasalDelivery + clean_subj.BolusDelivery
                clean_subj['PtID'] = id
                cleaned_data = pd.concat([cleaned_data,clean_subj])
                if len(clean_subj)>0:
                    subj_info['DaysOfData'] = np.nan
                    subj_info['AVG_CGM'] = np.nan
                    subj_info['STD_CGM'] = np.nan
                    subj_info['CGM_Availability'] = np.nan
                    subj_info['eA1C'] = np.nan
                    subj_info['TIR'] = np.nan
                    subj_info['TDD'] = np.nan

                    subj_info['DaysOfData'][0] = np.round(len(clean_subj)/288,2)
                    subj_info['AVG_CGM'][0] = np.round(clean_subj.CGMVal.mean(),2)
                    subj_info['STD_CGM'][0] = np.round(clean_subj.CGMVal.std(),2)
                    subj_info['CGM_Availability'][0] = np.round(100 * len(clean_subj[clean_subj.CGMVal>0])/len(clean_subj),2)
                    subj_info['eA1C'][0] = np.round((46.7 + clean_subj.CGMVal.mean())/28.7,2)
                    subj_info['TIR'][0] = np.round(100 * len(clean_subj[(clean_subj.CGMVal>=70) & (clean_subj.CGMVal<=180)])/len(clean_subj[clean_subj.CGMVal>0]),2)
                    subj_info['TDD'][0] = np.round(clean_subj.InsulinDelivered.sum()/subj_info['DaysOfData'][0],2)

                    pt_data = subj_info.filter(items=['PtID','Age','StartDate','TrtGroup','DaysOfData','AVG_CGM','STD_CGM','CGM_Availability',
                                                      'eA1C','TIR','TDD'])
                    patient_data = pd.concat([patient_data,pt_data])
        except:
            pass
            
    pathlib.Path(clean_data_path + "CleanedData").mkdir(parents=True, exist_ok=True)
    cleaned_data.to_csv(clean_data_path + "CleanedData/FLAIR_cleaned_egvinsulin.csv",index=False)
    patient_data.to_csv(clean_data_path + "CleanedData/FLAIR_patient_data.csv",index=False)
    
    return cleaned_data,patient_data      

In [65]:
filepath = '/Users/rachelbrandt/Downloads/FLAIRPublicDataSet/Data Tables/'
cleaned_data_path = '/Users/rachelbrandt/egvinsulin/' #location where you want cleaned data to be stored

cleaned_data,patient_data =  FLAIR_cleaning(filepath,cleaned_data_path)
patient_data

Unnamed: 0,PtID,Age,StartDate,TrtGroup,DaysOfData,AVG_CGM,STD_CGM,CGM_Availability,eA1C,TIR,TDD
0,26,18,3/11/2019 12:19:01 PM,AHCL,94.90,182.65,65.62,80.46,7.99,52.44,100.70
0,40,20,4/8/2019 10:45:31 AM,AHCL,107.06,180.74,51.47,95.16,7.92,54.03,46.38
0,78,20,7/7/2019 11:37:42 AM,AHCL,79.90,181.34,65.72,83.88,7.95,52.62,115.76
0,46,14,8/13/2018 8:32:38 AM,AHCL,44.23,211.08,84.09,72.41,8.98,42.49,217.99
0,15,15,7/25/2019 10:39:52 AM,670G,115.27,167.04,62.98,88.46,7.45,61.89,62.64
...,...,...,...,...,...,...,...,...,...,...,...
0,104,15,6/10/2019 6:44:54 AM,AHCL,122.64,160.06,50.01,97.06,7.20,66.71,50.95
0,67,20,1/7/2019 3:19:25 AM,AHCL,141.70,167.00,47.72,95.39,7.45,68.66,63.01
0,43,21,4/4/2019 7:06:59 AM,AHCL,117.98,167.30,61.53,90.84,7.46,60.32,64.40
0,68,20,10/9/2018 3:24:09 AM,670G,119.47,156.05,51.78,93.19,7.06,70.86,93.18
