In [1]:
import pandas as pd
from cgm_pp_helpers import (
    handle_empty_cgm_data, handle_missing_meal_times, read_cgm_data, 
    expand_df, calculate_and_aggregate_meal_features, CGMDataPipeline)
%load_ext autoreload
%autoreload 2


In [None]:
# Load the raw training inputs. `read_cgm_data` is a project helper from
# cgm_pp_helpers; judging by the `.info()` output further down, the frame it
# returns already has the meal-time columns as datetime64. The labels are read
# as a plain CSV.
cgm_train = read_cgm_data("../data/cgm_train.csv")
label_train = pd.read_csv("../data/label_train.csv")



In [3]:
# One row per (Subject ID, Day); `CGM Data` appears to hold that day's readings
# as a list of (timestamp, reading) tuples.
cgm_train.head()

Unnamed: 0,Subject ID,Day,Breakfast Time,Lunch Time,CGM Data
0,1,2,2021-09-19 08:41:00,2021-09-19 12:24:00,"[('2021-09-19 08:20:00', 98.26666666666667), (..."
1,1,3,2021-09-20 09:50:00,2021-09-20 15:20:00,"[('2021-09-20 09:10:00', 97.18333333333334), (..."
2,1,4,2021-09-21 09:34:00,2021-09-21 13:09:00,"[('2021-09-21 09:20:00', 107.36666666666666), ..."
3,1,5,2021-09-22 09:46:00,2021-09-22 13:50:00,"[('2021-09-22 09:25:00', 107.28333333333333), ..."
4,1,6,2021-09-23 09:07:00,2021-09-23 13:17:00,"[('2021-09-23 08:55:00', 103.0), ('2021-09-23 ..."


In [4]:
# Targets: per-meal calories and macronutrients, keyed by the same
# Subject ID / Day pair as the CGM frame.
label_train.head()

Unnamed: 0,Subject ID,Day,Breakfast Calories,Lunch Calories,Breakfast Carbs,Lunch Carbs,Breakfast Fat,Lunch Fat,Breakfast Protein,Lunch Protein
0,1,2,448.0,830,66.0,92,10.5,42.0,22.0,17
1,1,3,608.0,435,66.0,16,10.5,14.0,66.0,66
2,1,4,712.0,555,66.0,94,42.0,13.0,22.0,12
3,1,5,902.0,355,73.0,19,42.0,15.0,66.0,32
4,1,6,268.0,1180,24.0,81,10.5,54.5,22.0,88


In [5]:
# Check label completeness and dtypes. Note the Breakfast/Lunch columns are not
# typed consistently: some are float64, others int64.
label_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 324 entries, 0 to 323
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Subject ID          324 non-null    int64  
 1   Day                 324 non-null    int64  
 2   Breakfast Calories  324 non-null    float64
 3   Lunch Calories      324 non-null    int64  
 4   Breakfast Carbs     324 non-null    float64
 5   Lunch Carbs         324 non-null    int64  
 6   Breakfast Fat       324 non-null    float64
 7   Lunch Fat           324 non-null    float64
 8   Breakfast Protein   324 non-null    float64
 9   Lunch Protein       324 non-null    int64  
dtypes: float64(5), int64(5)
memory usage: 25.4 KB


In [6]:
# Check for gaps in the CGM frame: the non-null counts reveal rows with a
# missing Breakfast Time or Lunch Time.
cgm_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 324 entries, 0 to 323
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   Subject ID      324 non-null    int64         
 1   Day             324 non-null    int64         
 2   Breakfast Time  298 non-null    datetime64[ns]
 3   Lunch Time      300 non-null    datetime64[ns]
 4   CGM Data        324 non-null    object        
dtypes: datetime64[ns](2), int64(2), object(1)
memory usage: 12.8+ KB


## Handling Missing Values

### Missing CGM Data

In [7]:
# Drop subject-days that have no CGM readings at all; the helper prints which
# days were removed for each subject. The result is bound to a new name, so the
# raw `cgm_train` remains available for the pipeline run later in the notebook.
cgm_train_cleaned = handle_empty_cgm_data(cgm_train)

Subject ID: 26, Dropped Days: [6, 7] due to missing CGM data (empty list)
Subject ID: 32, Dropped Days: [3] due to missing CGM data (empty list)
Subject ID: 33, Dropped Days: [2] due to missing CGM data (empty list)
Subject ID: 42, Dropped Days: [8] due to missing CGM data (empty list)


### Missing Meal Times

In [8]:
# Impute the missing Breakfast/Lunch times. Besides the filled frame, the helper
# returns `averages`, which is inspected a few cells further down.
# NOTE(review): `method=2` selects one of the helper's imputation strategies;
# its exact meaning is defined in cgm_pp_helpers — confirm there.
cgm_train_missing_values_handled, averages = handle_missing_meal_times(
    cgm_train_cleaned,
    method=2,
)

In [9]:
# Verify the imputation: every remaining row should now have both meal times.
cgm_train_missing_values_handled.info()

<class 'pandas.core.frame.DataFrame'>
Index: 319 entries, 0 to 323
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   Subject ID      319 non-null    int64         
 1   Day             319 non-null    int64         
 2   Breakfast Time  319 non-null    datetime64[ns]
 3   Lunch Time      319 non-null    datetime64[ns]
 4   CGM Data        319 non-null    object        
dtypes: datetime64[ns](2), int64(2), object(1)
memory usage: 15.0+ KB


In [10]:
# Spot-check the first rows after imputation.
cgm_train_missing_values_handled.head()

Unnamed: 0,Subject ID,Day,Breakfast Time,Lunch Time,CGM Data
0,1,2,2021-09-19 08:41:00,2021-09-19 12:24:00,"[('2021-09-19 08:20:00', 98.26666666666667), (..."
1,1,3,2021-09-20 09:50:00,2021-09-20 15:20:00,"[('2021-09-20 09:10:00', 97.18333333333334), (..."
2,1,4,2021-09-21 09:34:00,2021-09-21 13:09:00,"[('2021-09-21 09:20:00', 107.36666666666666), ..."
3,1,5,2021-09-22 09:46:00,2021-09-22 13:50:00,"[('2021-09-22 09:25:00', 107.28333333333333), ..."
4,1,6,2021-09-23 09:07:00,2021-09-23 13:17:00,"[('2021-09-23 08:55:00', 103.0), ('2021-09-23 ..."


In [11]:
# Per-subject average breakfast time of day, as returned alongside the imputed frame.
averages["avg_breakfast"]

Subject ID
1            09:41:40
2     09:40:13.333333
3            09:27:00
5     09:30:13.333333
6     08:42:33.333333
7     08:46:06.666667
8            09:00:00
9     08:28:46.666667
10           06:09:00
11    08:12:53.333333
12    09:53:22.500000
13    08:03:52.500000
14           08:30:00
15    09:37:53.333333
16           08:20:00
17    10:31:53.333333
19    09:46:07.500000
20           06:06:20
22    09:12:46.666667
23    09:01:53.333333
26           09:49:10
28    09:28:39.111111
29    06:14:06.875000
30    08:15:30.714286
31    07:47:14.888889
32    06:47:43.333333
33    08:19:29.750000
35    07:44:13.875000
36    07:56:52.857143
38    06:52:33.250000
41    08:24:13.888889
42    09:21:37.166667
44    08:34:21.857143
45    09:21:03.250000
48           09:14:49
49    07:39:42.428571
Name: avg_breakfast, dtype: object

## Feature Extraction

In [12]:
# Flatten the per-day `CGM Data` lists into long format: one row per reading,
# with `Timestamp` and `CGM Reading` columns.
cgm_train_expanded = expand_df(cgm_train_missing_values_handled)

In [13]:
# Preview the long-format frame; the day's meal times are repeated on every reading.
cgm_train_expanded.head()

Unnamed: 0,Subject ID,Day,Breakfast Time,Lunch Time,Timestamp,CGM Reading
0,1,2,2021-09-19 08:41:00,2021-09-19 12:24:00,2021-09-19 08:20:00,98.266667
1,1,2,2021-09-19 08:41:00,2021-09-19 12:24:00,2021-09-19 08:25:00,95.183333
2,1,2,2021-09-19 08:41:00,2021-09-19 12:24:00,2021-09-19 08:30:00,97.283333
3,1,2,2021-09-19 08:41:00,2021-09-19 12:24:00,2021-09-19 08:35:00,106.116667
4,1,2,2021-09-19 08:41:00,2021-09-19 12:24:00,2021-09-19 08:40:00,121.65


In [14]:
# Confirm the expanded frame's size and that no column contains nulls.
cgm_train_expanded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29166 entries, 0 to 29165
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   Subject ID      29166 non-null  int64         
 1   Day             29166 non-null  int64         
 2   Breakfast Time  29166 non-null  datetime64[ns]
 3   Lunch Time      29166 non-null  datetime64[ns]
 4   Timestamp       29166 non-null  datetime64[ns]
 5   CGM Reading     29166 non-null  float64       
dtypes: datetime64[ns](3), float64(1), int64(2)
memory usage: 1.3 MB


In [15]:
# Compute per-(Subject ID, Day) summary features (mean, std, min, max, auc,
# rate_of_change) for the CGM readings around each meal. The helper prints the
# subject-days that have no readings in a meal window.
# NOTE: despite the variable name, the result holds both Breakfast_* and
# Lunch_* columns.
breakfast_features = calculate_and_aggregate_meal_features(cgm_train_expanded)

Subjects with no data around breakfast windows of 2 hours:
Subject ID: 13, Missing Breakfast Days: [9]
Subject ID: 19, Missing Breakfast Days: [6]
Subject ID: 26, Missing Breakfast Days: [3]
Subject ID: 30, Missing Breakfast Days: [2, 5]
Subject ID: 35, Missing Breakfast Days: [2]
Subject ID: 38, Missing Breakfast Days: [2]
Subject ID: 42, Missing Breakfast Days: [9]
Subject ID: 49, Missing Breakfast Days: [7]

Subjects with no data around lunch windows of 2 hours:
Subject ID: 29, Missing Lunch Days: [2]
Subject ID: 32, Missing Lunch Days: [9]
Subject ID: 35, Missing Lunch Days: [9]
Subject ID: 44, Missing Lunch Days: [7]


In [16]:
# Inspect the combined feature table. Subject-days reported as missing a meal
# window show NaN for that meal's features (e.g. subject 49, day 7 for breakfast).
breakfast_features

Unnamed: 0,Subject ID,Day,Breakfast_mean,Breakfast_std,Breakfast_min,Breakfast_max,Breakfast_auc,Breakfast_rate_of_change,Lunch_mean,Lunch_std,Lunch_min,Lunch_max,Lunch_auc,Lunch_rate_of_change
0,1,2,99.084001,26.936086,45.183333,141.816667,2784.877696,-0.693452,85.813542,22.279849,40.733333,133.166667,4057.091667,0.655674
1,1,3,97.482426,9.015236,87.183333,118.083333,3123.236709,-0.218750,97.615556,3.300128,91.183333,102.633333,2833.375000,0.225862
2,1,4,114.275309,13.533432,95.450000,139.900000,2984.025000,-0.458333,108.017361,12.706781,90.000000,134.450000,5078.241667,0.195390
3,1,5,109.570115,10.484334,89.366667,126.000000,3076.933333,-0.477381,99.329932,6.383248,86.183333,110.633333,4774.483333,-0.229167
4,1,6,107.497972,7.850691,94.190000,124.633333,2798.445238,0.076923,106.885278,4.551785,98.126667,117.810000,5024.651667,-0.089149
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
314,49,6,176.800000,29.469277,137.736667,227.736667,4280.290000,0.164444,147.382973,48.864177,88.000000,209.420000,5309.091667,2.393241
315,49,7,,,,,,,144.961605,38.219384,84.526667,187.736667,3782.726667,3.556667
316,49,8,250.374744,46.895978,154.263333,295.000000,6331.218333,1.940933,144.025347,26.739147,110.263333,187.683333,6754.191667,0.762624
317,49,9,275.055128,54.007854,165.316667,331.000000,6946.250000,3.189333,154.330000,32.892468,96.263333,186.210000,4026.541667,3.392692


## Putting It All Together

In [17]:
# Run the same four steps (drop empty CGM days, impute meal times, expand,
# aggregate) through the pipeline wrapper, starting again from the raw
# `cgm_train` frame.
# The pipeline object is kept in `pp` — presumably so its fitted state can be
# applied to other data later; that use is not shown in this section.
pp = CGMDataPipeline()

cgm_train_features = pp.fit_transform(cgm_train)

Step 1: Handling empty CGM data...
Subject ID: 26, Dropped Days: [6, 7] due to missing CGM data (empty list)
Subject ID: 32, Dropped Days: [3] due to missing CGM data (empty list)
Subject ID: 33, Dropped Days: [2] due to missing CGM data (empty list)
Subject ID: 42, Dropped Days: [8] due to missing CGM data (empty list)

Step 2: Handling missing meal times (fit)...

Step 3: Expanding CGM data...

Step 4: Calculating and aggregating meal features (breakfast and lunch)...
Subjects with no data around breakfast windows of 2 hours:
Subject ID: 13, Missing Breakfast Days: [9]
Subject ID: 19, Missing Breakfast Days: [6]
Subject ID: 26, Missing Breakfast Days: [3]
Subject ID: 30, Missing Breakfast Days: [2, 5]
Subject ID: 35, Missing Breakfast Days: [2]
Subject ID: 38, Missing Breakfast Days: [2]
Subject ID: 42, Missing Breakfast Days: [9]
Subject ID: 49, Missing Breakfast Days: [7]

Subjects with no data around lunch windows of 2 hours:
Subject ID: 29, Missing Lunch Days: [2]
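Subject ID: 32, Missing Lunch Days: [9]
Subject ID: 35, Missing Lunch Days: [9]
Subject ID: 44, Missing Lunch Days: [7]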

In [18]:
# Preview the pipeline output; the first rows match the step-by-step result above.
cgm_train_features.head()

Unnamed: 0,Subject ID,Day,Breakfast_mean,Breakfast_std,Breakfast_min,Breakfast_max,Breakfast_auc,Breakfast_rate_of_change,Lunch_mean,Lunch_std,Lunch_min,Lunch_max,Lunch_auc,Lunch_rate_of_change
0,1,2,99.084001,26.936086,45.183333,141.816667,2784.877696,-0.693452,85.813542,22.279849,40.733333,133.166667,4057.091667,0.655674
1,1,3,97.482426,9.015236,87.183333,118.083333,3123.236709,-0.21875,97.615556,3.300128,91.183333,102.633333,2833.375,0.225862
2,1,4,114.275309,13.533432,95.45,139.9,2984.025,-0.458333,108.017361,12.706781,90.0,134.45,5078.241667,0.19539
3,1,5,109.570115,10.484334,89.366667,126.0,3076.933333,-0.477381,99.329932,6.383248,86.183333,110.633333,4774.483333,-0.229167
4,1,6,107.497972,7.850691,94.19,124.633333,2798.445238,0.076923,106.885278,4.551785,98.126667,117.81,5024.651667,-0.089149


In [19]:
# Check the shape and completeness of the pipeline output.
cgm_train_features.info()

<class 'pandas.core.frame.DataFrame'>
Index: 306 entries, 0 to 318
Data columns (total 14 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Subject ID                306 non-null    int64  
 1   Day                       306 non-null    int64  
 2   Breakfast_mean            306 non-null    float64
 3   Breakfast_std             306 non-null    float64
 4   Breakfast_min             306 non-null    float64
 5   Breakfast_max             306 non-null    float64
 6   Breakfast_auc             306 non-null    float64
 7   Breakfast_rate_of_change  306 non-null    float64
 8   Lunch_mean                306 non-null    float64
 9   Lunch_std                 306 non-null    float64
 10  Lunch_min                 306 non-null    float64
 11  Lunch_max                 306 non-null    float64
 12  Lunch_auc                 306 non-null    float64
 13  Lunch_rate_of_change      306 non-null    float64
dtypes: float64(12), int64(2)

In [20]:
# Final sanity check: count the remaining NaNs per column of the pipeline output.
cgm_train_features.isna().sum()

Subject ID                  0
Day                         0
Breakfast_mean              0
Breakfast_std               0
Breakfast_min               0
Breakfast_max               0
Breakfast_auc               0
Breakfast_rate_of_change    0
Lunch_mean                  0
Lunch_std                   0
Lunch_min                   0
Lunch_max                   0
Lunch_auc                   0
Lunch_rate_of_change        0
dtype: int64