-
Notifications
You must be signed in to change notification settings - Fork 0
/
glooko.py
191 lines (149 loc) · 7.97 KB
/
glooko.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
"""Tools for loading Glooko data
"""
import sys
sys.path.append("../")
from os import path
import glob
import tools.stats as st
# import stats as st
import pandas as pd
import numpy as np
def read_all(data_folder, min_target=70, max_target=180, match_date_ranges=True):
"""
Read all of the data and return a series of pandas data frames
Example usage:
df_cgm, df_bolus, df_basal, df_insulin, df_cgm_daily = read_all(r"data")
Args:
data_folder (str): path to the root folder containing all the data files. The file names
and structure should be left exactly as they were in the initial download from Glooko.
Keyword Args:
min_target (number): lower bound of blood glucose range (mg/dL)
max_target (number): upper bound of blood glucose range (mg/dL)
match_date_ranges (bool): flag indicating whether to reduce the daily stats so that they all have the same length (default = True). This is because sometimes CGM and insulin/pump data are not the same length.
Returns:
A 5-element tuple
- **df_cgm** (pandas df): pandas dataframe containing CGM time series data
- **df_bolus** (pandas df): Bolus data
- **df_basal** (pandas df): Basal data
- **df_insulin** (pandas df): Insulin corrections data
- **df_cgm_daily** (pandas df): Various daily BG stats
"""
# Load and format CGM data
cgm_file = path.normpath(data_folder + r"/cgm_data.csv")
df_cgm_data = pd.read_csv(cgm_file, header=1,\
names=["time", "bg", "sn_cgm"])
df_cgm_data["time"] = pd.to_datetime(df_cgm_data["time"])
df_cgm_data["yearday"] = st.get_yeardays(df_cgm_data)
df_cgm_data.drop("sn_cgm", axis=1, inplace=True)
# COMPUTE STATS PER DAY
df_cgm_daily = df_cgm_data[["bg", "yearday"]].groupby("yearday").describe()["bg"]
df_cgm_daily["yearday"] = df_cgm_daily.index
df_cgm_daily["time"] = pd.to_datetime(df_cgm_daily['yearday'], format='%Y-%j')
# Load and format bolus data
bolus_file = path.normpath(data_folder + r"/Insulin data/bolus_data.csv")
df_bolus_data = pd.read_csv(bolus_file, header=1,
names=["time", "insulin_type", "bg_input", "carbs_input", "carb_ratio",
"insulin_delivered", "initial_delivery", "extended_delivery",
"sn_omni"])
df_bolus_data.drop("sn_omni", axis=1, inplace=True)
df_bolus_data["time"] = pd.to_datetime(df_bolus_data["time"])
# Extract carb correction and insulin correction from bolus dataframe; add as new columns
df_bolus_data["carb_correction"] = np.divide(df_bolus_data["carbs_input"], df_bolus_data["carb_ratio"])
df_bolus_data["insulin_correction"] = df_bolus_data["insulin_delivered"] - df_bolus_data["carb_correction"]
# Load and format basal data
basal_file = path.normpath(data_folder + r"/Insulin data/basal_data.csv")
df_basal_data = pd.read_csv(basal_file, header=1,\
names=["time", "insulin_type", "duration", "percentage", "rate", "insulin_delivered",\
"sn_omni"])
df_basal_data.drop("sn_omni", axis=1, inplace=True)
df_basal_data["time"] = pd.to_datetime(df_basal_data["time"])
# Load and format insulin data
insulin_file = path.normpath(data_folder + r"/Insulin data/insulin_data.csv")
df_insulin_data = pd.read_csv(insulin_file, header=1,\
names=["time", "total_bolus", "total_insulin", "total_basal", "sn_omni"])
df_insulin_data.drop("sn_omni", axis=1, inplace=True)
df_insulin_data["time"] = pd.to_datetime(df_insulin_data["time"])
df_insulin_data["yearday"] = st.get_yeardays(df_insulin_data)
# Compute additional daily stats
# COMPUTE DAILY TIME IN RANGE
min_target = 70
max_target = 180
unique_days = df_cgm_daily["yearday"].tolist()
# Initialize lists
pct_above = []
pct_below = []
pct_inrange = []
# loop through each day (I'm sure there's a fancy way to do this with Pandas but I
# can't figure it out...)
for day in unique_days:
df_sub = df_cgm_data[df_cgm_data["yearday"]==day]
this_total = len(df_sub)
if this_total > 0:
pct_above.append(sum(df_sub["bg"]>max_target)/this_total*100)
pct_below.append(sum(df_sub["bg"]<=min_target)/this_total*100)
pct_inrange.append(sum((df_sub["bg"] > min_target) & (df_sub["bg"] <= max_target))/this_total*100)
else:
pct_above.append(0)
pct_below.append(0)
pct_inrange.append(0)
df_cgm_daily["pct_above"] = pct_above
df_cgm_daily["pct_below"] = pct_below
df_cgm_daily["pct_inrange"] = pct_inrange
# make sure the date ranges match
cgm_in_insulin = df_cgm_daily["yearday"].isin(df_insulin_data["yearday"])
insulin_in_cgm = df_insulin_data["yearday"].isin(df_cgm_daily["yearday"])
if match_date_ranges:
df_cgm_daily = df_cgm_daily[cgm_in_insulin]
df_insulin_data = df_insulin_data[insulin_in_cgm]
return df_cgm_data, df_bolus_data, df_basal_data, df_insulin_data, df_cgm_daily
def merge_data(main_folder, min_target=70, max_target=180):
"""
Combine multiple downloaded glooko folders and save the non-overlapping time series.
Example usage:
df_cgm, df_bolus, df_basal, df_insulin, df_cgm_daily = merge_data("../data")
Args:
main_folder (string): Main data folder path and filename
Returns:
A 5-element tuple
- **df_cgm** (pandas df): pandas dataframe containing CGM time series data
- **df_bolus** (pandas df): Bolus data
- **df_basal** (pandas df): Basal data
- **df_insulin** (pandas df): Insulin corrections data
- **df_cgm_daily** (pandas df): Various daily BG stats
"""
# TODO: Check that sub-folders do contain properly-formatted Glooko data. If none of the folders contain Glooko data, or there ARE no sub-folders, throw an error.
folder_list = glob.glob(path.join(main_folder, "*/"))
# Create empty dataframes
df_cgm0 = pd.DataFrame(columns=['time', 'bg', 'dayofyear', 'year', 'yearday'])
df_bolus0 = pd.DataFrame(columns=['time', 'insulin_type', 'bg_input', 'carbs_input', 'carb_ratio',
'insulin_delivered', 'initial_delivery', 'extended_delivery', 'carb_correction', 'insulin_correction'])
df_basal0 = pd.DataFrame(columns=['time', 'insulin_type', 'duration', 'percentage', 'rate',
'insulin_delivered'])
df_insulin0 = pd.DataFrame(columns=['time', 'total_bolus', 'total_insulin', 'total_basal', 'dayofyear',
'year', 'yearday'])
df_cgm_daily0 = pd.DataFrame(columns=['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max', 'yearday',
'time', 'pct_above', 'pct_below', 'pct_inrange'])
for folder in folder_list:
# Read files and create dataframes
df_cgm, df_bolus, df_basal, df_insulin, df_cgm_daily = read_all(folder, min_target=70, max_target=180)
# Concatenate the new data
df_cgm0 = pd.concat((df_cgm0, df_cgm))
df_bolus0 = pd.concat((df_bolus0, df_bolus))
df_basal0 = pd.concat((df_basal0, df_basal))
df_insulin = pd.concat((df_insulin0, df_insulin))
df_cgm_daily0 = pd.concat((df_cgm_daily0, df_cgm_daily))
# Remove duplicate rows
df_cgm0.drop_duplicates(inplace=True)
df_bolus0.drop_duplicates(inplace=True)
df_basal0.drop_duplicates(inplace=True)
df_insulin0.drop_duplicates(inplace=True)
df_cgm_daily0.drop_duplicates(inplace=True)
return df_cgm0, df_bolus0, df_basal0, df_insulin0, df_cgm_daily0
if __name__ == "__main__":
# Demonstrate usage
DATA_FOLDER = r"data/glooko"
# df_cgm, df_bolus, df_basal, df_insulin, df_cgm_daily = read_all(DATA_FOLDER, min_target=70, max_target=180)
# folders = glob.glob("data/*/")
# outfolder = "data/"
# merge_data(folders, outfolder)
print('breakpoint here...')