In [1]:
from cgm_pp_helpers import read_cgm_data, CGMDataPipeline
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
%load_ext autoreload
%autoreload 2

## Data Preparation

### Importing

In [2]:
cgm_train = read_cgm_data("../data/cgm_train.csv")
cgm_test = read_cgm_data("../data/cgm_test.csv")
label_train = pd.read_csv("../data/label_train.csv")
label_test = pd.read_csv("../data/label_test_breakfast_only.csv")

In [3]:
label_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 324 entries, 0 to 323
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Subject ID          324 non-null    int64  
 1   Day                 324 non-null    int64  
 2   Breakfast Calories  324 non-null    float64
 3   Lunch Calories      324 non-null    int64  
 4   Breakfast Carbs     324 non-null    float64
 5   Lunch Carbs         324 non-null    int64  
 6   Breakfast Fat       324 non-null    float64
 7   Lunch Fat           324 non-null    float64
 8   Breakfast Protein   324 non-null    float64
 9   Lunch Protein       324 non-null    int64  
dtypes: float64(5), int64(5)
memory usage: 25.4 KB


### Processing pipeline

In [4]:
pp = CGMDataPipeline()
cgm_train_features = pp.fit_transform(cgm_train)

Step 1: Handling empty CGM data...
Subject ID: 26, Dropped Days: [6, 7] due to missing CGM data (empty list)
Subject ID: 32, Dropped Days: [3] due to missing CGM data (empty list)
Subject ID: 33, Dropped Days: [2] due to missing CGM data (empty list)
Subject ID: 42, Dropped Days: [8] due to missing CGM data (empty list)

Step 2: Handling missing meal times (fit)...

Step 3: Expanding CGM data...

Step 4: Calculating and aggregating meal features (breakfast and lunch)...
Subjects with no data around breakfast windows of 2 hours:
Subject ID: 13, Missing Breakfast Days: [9]
Subject ID: 19, Missing Breakfast Days: [6]
Subject ID: 26, Missing Breakfast Days: [3]
Subject ID: 30, Missing Breakfast Days: [2, 5]
Subject ID: 35, Missing Breakfast Days: [2]
Subject ID: 38, Missing Breakfast Days: [2]
Subject ID: 42, Missing Breakfast Days: [9]
Subject ID: 49, Missing Breakfast Days: [7]

Subjects with no data around lunch windows of 2 hours:
Subject ID: 29, Missing Lunch Days: [2]
Subject ID: 32,

In [20]:
cgm_test_features = pp.fit_transform(cgm_test, dropna = False, method = 3)

Step 1: Handling empty CGM data...

Step 2: Handling missing meal times (fit)...

Step 3: Expanding CGM data...

Step 4: Calculating and aggregating meal features (breakfast and lunch)...


In [6]:
cgm_train_features.info()

<class 'pandas.core.frame.DataFrame'>
Index: 306 entries, 0 to 318
Data columns (total 14 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Subject ID                306 non-null    int64  
 1   Day                       306 non-null    int64  
 2   Breakfast_mean            306 non-null    float64
 3   Breakfast_std             306 non-null    float64
 4   Breakfast_min             306 non-null    float64
 5   Breakfast_max             306 non-null    float64
 6   Breakfast_auc             306 non-null    float64
 7   Breakfast_rate_of_change  306 non-null    float64
 8   Lunch_mean                306 non-null    float64
 9   Lunch_std                 306 non-null    float64
 10  Lunch_min                 306 non-null    float64
 11  Lunch_max                 306 non-null    float64
 12  Lunch_auc                 306 non-null    float64
 13  Lunch_rate_of_change      306 non-null    float64
dtypes: float64(12),

In [7]:
cgm_test_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73 entries, 0 to 72
Data columns (total 14 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Subject ID                73 non-null     int64  
 1   Day                       73 non-null     int64  
 2   Breakfast_mean            65 non-null     float64
 3   Breakfast_std             65 non-null     float64
 4   Breakfast_min             65 non-null     float64
 5   Breakfast_max             65 non-null     float64
 6   Breakfast_auc             65 non-null     float64
 7   Breakfast_rate_of_change  65 non-null     float64
 8   Lunch_mean                73 non-null     float64
 9   Lunch_std                 73 non-null     float64
 10  Lunch_min                 73 non-null     float64
 11  Lunch_max                 73 non-null     float64
 12  Lunch_auc                 73 non-null     float64
 13  Lunch_rate_of_change      73 non-null     float64
dtypes: float64(1

In [8]:
training_data = pd.merge(cgm_train_features, label_train, on = ["Subject ID", "Day"])

In [9]:
test_data = pd.merge(cgm_test_features, label_test, on = ["Subject ID", "Day"])

In [10]:
training_data.head()

Unnamed: 0,Subject ID,Day,Breakfast_mean,Breakfast_std,Breakfast_min,Breakfast_max,Breakfast_auc,Breakfast_rate_of_change,Lunch_mean,Lunch_std,...,Lunch_auc,Lunch_rate_of_change,Breakfast Calories,Lunch Calories,Breakfast Carbs,Lunch Carbs,Breakfast Fat,Lunch Fat,Breakfast Protein,Lunch Protein
0,1,2,99.084001,26.936086,45.183333,141.816667,2784.877696,-0.693452,85.813542,22.279849,...,4057.091667,0.655674,448.0,830,66.0,92,10.5,42.0,22.0,17
1,1,3,97.482426,9.015236,87.183333,118.083333,3123.236709,-0.21875,97.615556,3.300128,...,2833.375,0.225862,608.0,435,66.0,16,10.5,14.0,66.0,66
2,1,4,114.275309,13.533432,95.45,139.9,2984.025,-0.458333,108.017361,12.706781,...,5078.241667,0.19539,712.0,555,66.0,94,42.0,13.0,22.0,12
3,1,5,109.570115,10.484334,89.366667,126.0,3076.933333,-0.477381,99.329932,6.383248,...,4774.483333,-0.229167,902.0,355,73.0,19,42.0,15.0,66.0,32
4,1,6,107.497972,7.850691,94.19,124.633333,2798.445238,0.076923,106.885278,4.551785,...,5024.651667,-0.089149,268.0,1180,24.0,81,10.5,54.5,22.0,88


In [11]:
training_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 306 entries, 0 to 305
Data columns (total 22 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Subject ID                306 non-null    int64  
 1   Day                       306 non-null    int64  
 2   Breakfast_mean            306 non-null    float64
 3   Breakfast_std             306 non-null    float64
 4   Breakfast_min             306 non-null    float64
 5   Breakfast_max             306 non-null    float64
 6   Breakfast_auc             306 non-null    float64
 7   Breakfast_rate_of_change  306 non-null    float64
 8   Lunch_mean                306 non-null    float64
 9   Lunch_std                 306 non-null    float64
 10  Lunch_min                 306 non-null    float64
 11  Lunch_max                 306 non-null    float64
 12  Lunch_auc                 306 non-null    float64
 13  Lunch_rate_of_change      306 non-null    float64
 14  Breakfast 

In [12]:
test_data.head()

Unnamed: 0,Subject ID,Day,Breakfast_mean,Breakfast_std,Breakfast_min,Breakfast_max,Breakfast_auc,Breakfast_rate_of_change,Lunch_mean,Lunch_std,Lunch_min,Lunch_max,Lunch_auc,Lunch_rate_of_change,Breakfast Calories,Breakfast Carbs,Breakfast Fat,Breakfast Protein
0,4,2,170.747333,26.603114,114.683333,208.136667,4146.23,0.6475,112.462847,20.510459,88.273333,153.0,5271.968333,-0.759078,448.0,66.0,10.5,22.0
1,4,3,134.760267,18.795275,103.0,163.0,3252.416667,1.1325,99.090694,8.685806,88.0,130.18,4642.763333,-0.705957,608.0,66.0,10.5,66.0
2,4,6,106.4308,19.939555,87.0,141.453333,2569.065,-0.392083,106.486944,20.453546,85.136667,134.863333,5006.941667,0.69922,268.0,24.0,10.5,22.0
3,4,7,140.510769,23.328257,95.0,165.0,3543.258333,1.201733,112.351181,23.985536,80.0,153.043333,5249.54,-0.413901,448.0,66.0,10.5,22.0
4,4,8,120.426923,12.026732,94.136667,139.863333,3022.031667,1.194533,98.394792,13.873721,81.136667,124.0,4611.313333,-0.355887,608.0,66.0,10.5,66.0


In [13]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73 entries, 0 to 72
Data columns (total 18 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Subject ID                73 non-null     int64  
 1   Day                       73 non-null     int64  
 2   Breakfast_mean            65 non-null     float64
 3   Breakfast_std             65 non-null     float64
 4   Breakfast_min             65 non-null     float64
 5   Breakfast_max             65 non-null     float64
 6   Breakfast_auc             65 non-null     float64
 7   Breakfast_rate_of_change  65 non-null     float64
 8   Lunch_mean                73 non-null     float64
 9   Lunch_std                 73 non-null     float64
 10  Lunch_min                 73 non-null     float64
 11  Lunch_max                 73 non-null     float64
 12  Lunch_auc                 73 non-null     float64
 13  Lunch_rate_of_change      73 non-null     float64
 14  Breakfast Ca

## Modeling

### Train Test Split

In [14]:
features = [
    'Breakfast_mean', 'Breakfast_std', 'Breakfast_min', 'Breakfast_max',
    'Breakfast_auc', 'Breakfast_rate_of_change', 'Breakfast Calories',
    'Breakfast Carbs', 'Breakfast Fat', 'Breakfast Protein'
]

lunch_targets = ['Lunch Calories']
X = training_data[features]
y = training_data[lunch_targets]
X_test = test_data[features]
# Split into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)



### Training

In [15]:
model = LinearRegression()
model.fit(X_train, y_train)

# Predict on validation set
y_pred = model.predict(X_val)

### Evaluation

In [16]:
# Evaluate the model
mse = mean_squared_error(y_val, y_pred)
mae = mean_absolute_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)

# Display results
evaluation_results = pd.DataFrame({
    'Target': lunch_targets,
    'MSE': mse,
    'MAE': mae,
    'R2': r2
})

In [17]:
evaluation_results

Unnamed: 0,Target,MSE,MAE,R2
0,Lunch Calories,35317.95532,113.643334,0.447912


### Predict on test set

In [18]:
y_test_pred = model.predict(X_test)

ValueError: Input X contains NaN.
LinearRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [None]:
y_test_pred_df = pd.DataFrame(y_test_pred, columns = ['label'])

In [None]:
y_test_pred_df

Unnamed: 0,label
0,784.735830
1,474.528670
2,740.025771
3,803.815691
4,479.366005
...,...
60,608.380700
61,819.111877
62,489.472449
63,600.045330


In [None]:
submissions = y_test_pred_df.to_csv("dirty_first_model.csv")