The groupby apply routine should not operate on the grouping column; this is deprecated behavior. Instead, the resulting dataframe will have a multi-index, and using reset_index (e.g. dropping the second level) will get back the original patient id.

Example using a simple dataset:

In [1]:
import pandas as pd
import numpy as np

# Toy dataset: two patients with three bolus events each
data = dict(
    patient_id=[1, 1, 1, 2, 2, 2],
    datetime=[
        '2023-10-01 00:00:00', '2023-10-01 00:03:00', '2023-10-01 00:07:00',
        '2023-10-01 00:01:00', '2023-10-01 00:04:00', '2023-10-01 00:08:00',
    ],
    bolus=[1.0, 2.0, 3.0, 10, 20, 30],
)

# Build the frame and parse the timestamp strings in one chained step
df = pd.DataFrame(data).assign(
    datetime=lambda frame: pd.to_datetime(frame['datetime'])
)

# Function to resample data to 5-minute intervals
def resample_to_5min(group):
    """Sum a group's rows into 5-minute bins.

    Parameters:
    - group (DataFrame): rows containing a 'datetime' column; every other
      column is summed within each bin.

    Returns:
    - DataFrame: one row per 5-minute bin, with 'datetime' restored as a
      regular column holding the start of the bin.
    """
    group = group.set_index('datetime')
    # '5min' replaces the deprecated 'T' alias, which emits a FutureWarning
    # on recent pandas releases.
    resampled = group.resample('5min').sum().reset_index()
    return resampled

# Resample each patient's rows separately; include_groups=False keeps the
# grouping column out of the frames handed to resample_to_5min
result = (
    df.groupby('patient_id')
    .apply(resample_to_5min, include_groups=False)
)

# Show the multi-indexed result as returned by apply
display(result)

# Show it again with patient_id moved from the index back into a column
display(result.reset_index(level='patient_id'))


  resampled = group.resample('5T').sum().reset_index()
  resampled = group.resample('5T').sum().reset_index()


Unnamed: 0_level_0,Unnamed: 1_level_0,datetime,bolus
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0,2023-10-01 00:00:00,3.0
1,1,2023-10-01 00:05:00,3.0
2,0,2023-10-01 00:00:00,30.0
2,1,2023-10-01 00:05:00,30.0


Unnamed: 0,patient_id,datetime,bolus
0,1,2023-10-01 00:00:00,3.0
1,1,2023-10-01 00:05:00,3.0
0,2,2023-10-01 00:00:00,30.0
1,2,2023-10-01 00:05:00,30.0


Example using the post-processing routine to resample boluses (copied from postprocessing.py and modified to remove patient_id operations).

In [2]:
from datetime import timedelta
import pandas as pd
import time 
import numpy as np

import os, sys
sys.path.append(os.path.join(os.getcwd(),'..'))

#functions for time alignment and transformation of basal, bolus, and cgm event data. These functions can be used for any study dataset.
def _wall_clock_seconds(stamps):
    """Return whole seconds since 1970-01-01 for each timestamp's wall-clock value.

    The conversion never consults the machine's local timezone, so the result
    is identical on every host and has no DST gaps or duplicates.
    """
    if stamps.dt.tz is not None:
        # keep the wall-clock reading and drop the zone so the subtraction
        # against a naive epoch is well defined
        stamps = stamps.dt.tz_localize(None)
    return (stamps - pd.Timestamp("1970-01-01")) // pd.Timedelta(seconds=1)


def bolus_transform(bolus_data):
    """
    Transform the bolus data by aligning timestamps, handling duplicates, and extending boluses based on durations.

    Parameters:
    - bolus_data (DataFrame): The input is a bolus data dataframe containing columns 'datetime', 'bolus', and
      'delivery_duration' (a timedelta column). Other columns (e.g. 'patient_id') are ignored and dropped.
      The input frame is not modified.

    Returns:
    - bolus_data (DataFrame): A dataframe with columns 'datetime', 'bolus' and 'delivery_duration' holding one row
      per 5 minute step, from midnight of the first (rounded) timestamp's day to midnight after the last one's day.
      Boluses falling in the same step are summed, extended boluses are spread evenly over their duration,
      and steps without insulin are 0. An empty input returns an empty dataframe with the same three columns.
    """
    if bolus_data.empty:
        # nothing to align; avoids an IndexError when looking up the first/last timestamp
        return bolus_data.reindex(columns=['datetime', 'bolus', 'delivery_duration'])

    #start data from midnight (sort_values returns a new frame, so the caller's data stays untouched)
    bolus_data = bolus_data.sort_values(by='datetime').reset_index(drop=True)
    #round to the nearest 5 minute value so timestamps that are close become duplicates (2:32:35 and 2:36:05 would both become 2:35:00)
    #this allows us to handle duplicates before needing to align data
    bolus_data['datetime'] = bolus_data['datetime'].dt.round("5min")
    #data aligns on integer seconds, computed without the local timezone so DST days keep all of their steps
    bolus_data['UnixTime'] = _wall_clock_seconds(bolus_data['datetime'])

    #create a new dataset of 5 minute time series data starting at midnight based on the data available
    start_date = bolus_data['datetime'].iloc[0].date()
    end_date = bolus_data['datetime'].iloc[-1].date() + timedelta(days=1)
    bolus_from_mid = pd.DataFrame(
        {'datetime_adj': pd.date_range(start=start_date, end=end_date, freq="5min")}
    )
    bolus_from_mid['UnixTime'] = _wall_clock_seconds(bolus_from_mid['datetime_adj'])

    #sum boluses if there is a duplicate time (happens when two or more boluses are announced <5 minutes apart)
    #keep maximum duration of the bolus - in the rare case a standard and extended are announced in the same 5 minute window, it will be treated as an extended bolus
    bolus_data = (
        bolus_data.groupby('UnixTime')
        .agg({'bolus': 'sum', 'delivery_duration': 'max'})
        .reset_index()
    )

    #merge new midnight aligned times with bolus data
    bolus_merged = pd.merge_asof(bolus_from_mid, bolus_data, on="UnixTime", direction="nearest", tolerance=149)
    bolus_data = bolus_merged.filter(items=['datetime_adj', 'bolus', 'delivery_duration'])
    bolus_data = bolus_data.rename(columns={"datetime_adj": "datetime"})

    #fill nans with 0 before spreading extended boluses, so that adding a share to an empty step works
    #(float dtype so fractional shares can always be stored)
    bolus_data['bolus'] = bolus_data['bolus'].fillna(0).astype(float)

    #extended bolus handling: duration must be a timedelta for this to work
    is_extended = bolus_data['delivery_duration'] > timedelta(minutes=5)
    if is_extended.any():
        #work on a copy so we never write into a slice of bolus_data
        extended_boluses = bolus_data.loc[is_extended, ['bolus', 'delivery_duration']].copy()
        #determine how many 5 minute steps the bolus is extended for and round to the nearest whole number step
        #(always >= 1 because only durations above 5 minutes are selected)
        duration_minutes = extended_boluses['delivery_duration'].dt.total_seconds() / 60
        duration_steps = (duration_minutes / 5).round().astype(int)

        #take the lump amounts out first, then add each share back step by step; adding (instead of
        #overwriting) keeps other boluses that fall inside the extension window
        bolus_data.loc[extended_boluses.index, 'bolus'] = 0.0
        for ext, amount, steps in zip(extended_boluses.index, extended_boluses['bolus'], duration_steps):
            #extend the bolus out assuming an equal amount of delivery for each time step
            #(label slicing is inclusive and simply stops at the end of the grid)
            bolus_data.loc[ext:ext + steps - 1, 'bolus'] += amount / steps

    return bolus_data



# Toy dataset: two patients with three bolus events each
data = dict(
    patient_id=[1, 1, 1, 2, 2, 2],
    datetime=[
        '2023-10-01 00:00:00', '2023-10-01 00:03:00', '2023-10-01 00:07:00',
        '2023-10-01 00:01:00', '2023-10-01 00:04:00', '2023-10-01 00:08:00',
    ],
    bolus=[1.0, 2.0, 3.0, 10, 20, 30],
)

# Build the frame, give every row a 5 minute delivery duration and parse the
# timestamp strings, all in one chained step
df = pd.DataFrame(data).assign(
    delivery_duration=timedelta(minutes=5),
    datetime=lambda frame: pd.to_datetime(frame['datetime']),
)

# Function to resample data to 5-minute intervals
def resample_to_5min(group):
    """Sum a group's rows into 5-minute bins.

    Parameters:
    - group (DataFrame): rows containing a 'datetime' column; every other
      column is summed within each bin.

    Returns:
    - DataFrame: one row per 5-minute bin, with 'datetime' restored as a
      regular column holding the start of the bin.
    """
    group = group.set_index('datetime')
    # '5min' replaces the deprecated 'T' alias, which emits a FutureWarning
    # on recent pandas releases.
    resampled = group.resample('5min').sum().reset_index()
    return resampled

# Group by patient_id and apply the bolus transformation to each patient.
# include_groups=False keeps the grouping column out of the frames passed to
# bolus_transform (operating on it is deprecated and triggers a warning);
# patient_id is still available as the first index level of the result.
result = df.groupby('patient_id').apply(bolus_transform, include_groups=False)

# Display the result, first with the multi-index and then with patient_id
# moved back into a regular column
display(result.head())
display(result.reset_index(level=0).head())

  result = df.groupby('patient_id').apply(bolus_transform)


Unnamed: 0_level_0,Unnamed: 1_level_0,datetime,bolus,delivery_duration
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0,2023-10-01 00:00:00,1.0,0 days 00:05:00
1,1,2023-10-01 00:05:00,5.0,0 days 00:05:00
1,2,2023-10-01 00:10:00,0.0,NaT
1,3,2023-10-01 00:15:00,0.0,NaT
1,4,2023-10-01 00:20:00,0.0,NaT


Unnamed: 0,patient_id,datetime,bolus,delivery_duration
0,1,2023-10-01 00:00:00,1.0,0 days 00:05:00
1,1,2023-10-01 00:05:00,5.0,0 days 00:05:00
2,1,2023-10-01 00:10:00,0.0,NaT
3,1,2023-10-01 00:15:00,0.0,NaT
4,1,2023-10-01 00:20:00,0.0,NaT
