In [3]:
# misc tools
from typing import List, Union, Dict
import sys
import os
import yaml
import re
import warnings
from datetime import datetime
sys.path.insert(1, '..')
os.chdir('..')
# plotting
import seaborn as sns
sns.set_style('whitegrid')
import matplotlib.pyplot as plt
# analysis tools for time series
import statsmodels.api as sm
from statsforecast.models import AutoARIMA
from torch.utils.tensorboard import SummaryWriter
# darts
from darts import models
from darts import metrics
from darts import TimeSeries
from darts.dataprocessing.transformers import Scaler
# utils for darts
from data_formatter.base import *
from utils.darts_dataset import *
from utils.darts_processing import *
from utils.darts_training import *
from utils.darts_evaluation import *
# gluformer model
from lib.gluformer.model import Gluformer
from lib.gluformer.utils.evaluation import test


  from tqdm.autonotebook import tqdm


# Processing

In [3]:

# This is the relative path to the CGM files with original names
file_path = os.path.join("exploratory_analysis", "RT-CGM Randomized Clinical Trial", "DataTables")
# Depending on how you have unzipped the file, you may need to edit this slightly

# # This will list only the files of CGM data
all_files = os.listdir(file_path)

# Filter files matching the pattern "RTCGM"
files = [f for f in all_files if re.search("RTCGM", f)]

# One can then loop through the files
nfiles = len(files)

df_groups = []
for i in range(0,len(files)):
    filename = files[i] # Get the file name
    print(filename)
  # Read each csv in
    curr = pd.read_csv(os.path.join(file_path, filename))    
  
  # We don't need this column, so we'll delete it
    curr = curr.drop('RecID', axis = 1)
    
  # Rename columns to standard column names
    curr = curr.rename(columns = {"PtID":"id", "DeviceDtTm":"time", "Glucose": "gl"}) 
  
  # Convert time to datetime
    curr['time'] = pd.to_datetime(curr['time'], format='ISO8601')

  # Format the datetime without microseconds
    curr['time'] = curr['time'].dt.strftime('%Y-%m-%d %H:%M:%S')
  
  #Ensure glucose values are recorded as numeric
    curr['gl'] = pd.to_numeric(curr['gl'])

    df_groups.append(curr)

df_all = pd.concat(df_groups)

print((df_all))

    

tblADataRTCGM_Unblinded_RTCGMGroup_6.csv
tblADataRTCGM_Unblinded_RTCGMGroup_7.csv
tblADataRTCGM_Unblinded_RTCGMGroup_5.csv
tblADataRTCGM_Unblinded_RTCGMGroup_4.csv
tblADataRTCGM_Blind_ControlGroup.csv
tblADataRTCGM_Unblinded_RTCGMGroup_1.csv
tblADataRTCGM_Unblinded_RTCGMGroup_3.csv
tblADataRTCGM_Unblinded_RTCGMGroup_2.csv
tblADataRTCGM_Unblinded_ControlGroup_5.csv
tblADataRTCGM_Unblinded_ControlGroup_4.csv
tblADataRTCGM_Unblinded_RTCGMGroup_12.csv
tblADataRTCGM_Unblinded_RTCGMGroup_10.csv
tblADataRTCGM_Unblinded_RTCGMGroup_11.csv
tblADataRTCGM_Unblinded_ControlGroup_3.csv
tblADataRTCGM_Unblinded_ControlGroup_2.csv
tblADataRTCGM_Unblinded_ControlGroup_1.csv
tblADataRTCGM_Blind_Baseline.csv
tblADataRTCGM_Unblinded_RTCGMGroup_9.csv
tblADataRTCGM_Unblinded_RTCGMGroup_8.csv
         id                 time   gl
0       296  2000-09-12 12:43:00   78
1       296  2000-09-12 12:48:00   78
2       296  2000-09-12 14:53:00  100
3       296  2000-09-12 14:58:00  100
4       296  2000-09-12 15:03:

In [4]:
# Combining covariates

# Read in covariate data
df_demo = pd.read_csv(os.path.join(file_path, "tblAPtSummary.csv")) 
df_a1c = pd.read_csv(os.path.join(file_path, "tblALabHbA1c.csv")) 

# Convert time to datetime
df_a1c['LabHbA1cDt'] = pd.to_datetime(df_a1c['LabHbA1cDt'], format='ISO8601')

# Format the datetime without microseconds
df_a1c['LabHbA1cDt'] = df_a1c['LabHbA1cDt'].dt.strftime('%Y-%m-%d %H:%M:%S')

# Convert A1c result to numeric
df_a1c['LabA1cResult'] = pd.to_numeric(df_a1c['LabA1cResult'])

# Drop unnecessary variables
df_a1c = df_a1c.drop(['RecID', "Visit", "LabHbA1cNotDone", "QCA1cResult", "LabHbA1cShipDt"], axis = 1)

# Rename columns
df_a1c = df_a1c.rename(columns = {"PtID":"id", "LabA1cResult":"HbA1c", "LabHbA1cDt":"HbA1c_time"}) 

# Drop unnecessary variables
df_demo = df_demo.drop(['RecID', "HGMReadAvg", "RandDt"], axis = 1)

# convert to numeric based on the mapping:
# 'M' = 0, 
# 'F' = 1
df_demo['Gender'] = df_demo['Gender'].map({'M': 0, 'F': 1})

# convert to numeric based on the mapping:
# 'Unknown/not reported' = 0, 
# 'More than one race' = 1,
# 'White' = 2,
# 'Asian' = 3,  
# 'Black/African American' = 4, 
# 'American Indian/Alaskan Native' = 5,
# 'Native Hawaiian/Other Pacific Islander' = 6
df_demo['Race'] = df_demo['Race'].map({'Unknown/not reported': 0, 
                                               'More than one race': 1, 
                                               'White': 2, 
                                               'Asian': 3, 
                                               "Black/African American": 4, 
                                               "American Indian/Alaskan Native": 5,
                                               "Native Hawaiian/Other Pacific Islander": 6})

# convert to numeric based on the mapping:
# 'Unknown/not reported' = 0, 
# 'Not Hispanic or Latino' = 1,
# 'Hispanic or Latino' = 2
df_demo['Ethnicity'] = df_demo['Ethnicity'].map({'Unknown/not reported': 0, 
                                               'Not Hispanic or Latino': 1, 
                                               'Hispanic or Latino': 3})

# convert to numeric based on the mapping:
# 'Subject' = 0, 
# 'Mother' = 1,
# 'Father' = 2,
# 'Spouse' = 3
df_demo['EduCareGvrP'] = df_demo['EduCareGvrP'].map({'Subject': 0, 
                                               'Mother': 1, 
                                               'Father': 3, 
                                               'Spouse': 4})

# convert to numeric based on the mapping:
# '11' = 0, 
# '12' = 1,
# 'Associates' = 2,
# 'Professional' = 3,  
# 'Bachelors' = 4, 
# 'Masters' = 5
df_demo['EduCareGvrPEdu'] = df_demo['EduCareGvrPEdu'].map({'11': 0, 
                                               '12': 1, 
                                               'Associates': 2, 
                                               'Professional': 3, 
                                               "Bachelors": 4, 
                                               "Masters": 5})

# convert to numeric based on the mapping:
# 'Control' = 0, 
# 'RT-CGM' = 1
df_demo['TxGroup'] = df_demo['TxGroup'].map({'Control': 0, 'RT-CGM': 1})

# convert to numeric based on the mapping:
# 'Pump' = 0, 
# 'Injections' = 1
df_demo['InsulinModality'] = df_demo['InsulinModality'].map({'Pump': 0, 'Injections': 1})

# convert to numeric based on the mapping:
# '0' = 0, 
# '1' = 1,
# '2' = 2,
# '3' = 3,
# '>3' = 4
df_demo['NumSevHypo'] = df_demo['NumSevHypo'].map({'0': 0, 
                                               '1': 1, 
                                               '2': 3, 
                                               '3': 4,
                                               '>3': 5})

# Rename columns
df_demo = df_demo.rename(columns = {"PtID":"id"}) 


In [33]:
# Merge demographic data
df_new = df_all.merge(df_demo, on = 'id', how='left')

# Issues with left_join() so manually add proper A1c values in full df:

# Sort both dataframes by 'id' and relevant time columns for manual merging
df_a1c = df_a1c.sort_values(by=['id', 'HbA1c_time']).reset_index(drop=True)
df_new = df_new.sort_values(by=['id', 'time']).reset_index(drop=True)

# Initialize empty list to store A1c values for df_new
HbA1c_values = []

# Loop over each row in df_new
for i, row in df_new.iterrows():
    subject_id = row['id']
    cgm_time = row['time']
    
    # Find most recent A1c value for current subject where HbA1c_time <= cgm_time
    relevant_a1c_entries = df_a1c[(df_a1c['id'] == subject_id) & (df_a1c['HbA1c_time'] <= cgm_time)]
    
    if not relevant_a1c_entries.empty:
    # Take most recent A1c value if df_a1c is empty and most recent A1c exists
        most_recent_HbA1c = relevant_a1c_entries.iloc[-1]['HbA1c']
        
    else:
    # If no A1c exists before first cgm value, use first available A1c for subject
        first_a1c_entry = df_a1c[df_a1c['id'] == subject_id]
        
        if not first_a1c_entry.empty:
        # Use first A1c value
            most_recent_HbA1c = first_a1c_entry.iloc[0]['HbA1c']  
        else:
        # If no A1c data exists at all for this subject, set to NA
            most_recent_HbA1c = None  

    # Append most recent A1c value to list
    HbA1c_values.append(most_recent_HbA1c)


# Assign computed A1c values back to df_new
df_new['HbA1c'] = HbA1c_values


In [86]:
# Drop rows which have no A1c values or duplicate times per person
df_new.dropna(subset=['HbA1c'], inplace=True)
df_new = df_new.drop_duplicates(subset=['id', 'time'])


In [143]:
# Save data
df_new.to_csv('./raw_data/tamborlane.csv', index=False)