# Dataset Overview

## Imports

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Helper Functions

In [2]:
def plot_feature_samples_per_subject(df, column, save_fig=False):
    # Group by 'id' and count non-null 'CGM' values
    counts = df.groupby('id')[f'{column}'].count()
    print(f'{column} counts', counts)
    print(counts.max())
    print(counts.min())
    
    # Plot histogram
    plt.figure(figsize=(8, 6))
    plt.hist(counts, bins=30, edgecolor='black', alpha=0.7)
    plt.xlabel(f'Number of {column} Values')
    plt.ylabel('Frequency')
    plt.title(f'Histogram of {column} samples per ID')
    
    plt.show()
    
    if save_fig:
        plt.savefig(f'{column}_feature_samples_per_subject.png')

In [3]:
def plot_daily_average_feature_sum_per_subject(df, column, save_fig=False):
    # Define a function to resample and calculate daily sum within each group
    def resample_and_sum(group):
        return group[f'{column}'].resample('D').sum().mean()
    
    # Apply the function to each group
    avg_daily_sum = df.groupby('id').apply(resample_and_sum)
    
    # Plot histogram of average daily sum of insulin
    plt.figure(figsize=(8, 6))
    plt.hist(avg_daily_sum, bins=30, edgecolor='black', alpha=0.7)
    plt.xlabel(f'Average Daily Sum of {column}')
    plt.ylabel('Frequency')
    plt.title(f'Histogram of Average Daily Sum of {column} per Subject')
    plt.show()
    
    if save_fig:
        plt.savefig(f'{column}_daily_average_feature_sum_per_subject.png')

In [4]:
def plot_histogram_for_feature(df, column, save_fig=False):

    plt.figure()
    if column == 'bolus':
        plt.hist(df.replace(0, np.nan)[df['bolus'] < 60][column], bins=100, edgecolor='black', alpha=0.7)
    elif column == 'basal':
        plt.hist(df.replace(0, np.nan)[df['basal'] < 60][column], bins=100, edgecolor='black', alpha=0.7)
    else:
        plt.hist(df.replace(0, np.nan)[column], bins=100, edgecolor='black', alpha=0.7)
    plt.xlabel(f'{column} value')
    plt.ylabel('Frequency')
    plt.title(f'Histogram of {column} values')
    plt.tight_layout()
    plt.show()

    if save_fig:
        plt.savefig(f'{column}_daily_average_feature_sum_per_subject.png')


In [5]:
def print_samples_per_feature(df):
    data = df.copy()
    data.loc[data['carbs'] == 0, 'carbs'] = np.nan
    data.loc[data['bolus'] == 0, 'bolus'] = np.nan
    data.loc[data['workout_intensity'] == 0, 'workout_intensity'] = np.nan
    
    for col in data.columns:
        print(f'{col}: {data[data[col].notna()][col].shape[0]}')

## Load Data

In [15]:
folder_path = os.path.join('../', 'processed_data')

datasets = {}
for dataset_folder in os.listdir(folder_path):
    print(dataset_folder)
    dataset_folder_path_train = os.path.join(folder_path, dataset_folder, "train")
    dataset_folder_path_test = os.path.join(folder_path, dataset_folder, "test")

    df_train_list = []
    df_test_list = []
    for subject_data in os.listdir(dataset_folder_path_train):
        if subject_data.endswith('.csv'):
            file_path = os.path.join(dataset_folder_path_train, subject_data)
            df = pd.read_csv(file_path, parse_dates=['date'], index_col='date', low_memory=False)

            df['id'] = subject_data.split(".")[0]
            #df.set_index('date', inplace=True)
            df_train_list.append(df)

    for subject_data in os.listdir(dataset_folder_path_test):
        if subject_data.endswith('.csv'):       
            file_path = os.path.join(dataset_folder_path_test, subject_data)
            df = pd.read_csv(file_path, parse_dates=['date'], index_col='date', low_memory=False)

            df['id'] = subject_data.split(".")[0]
            #df.set_index('date', inplace=True)
            df_test_list.append(df)
            
    combined_df_train = pd.concat(df_train_list)
    combined_df_test = pd.concat(df_test_list)
    datasets[dataset_folder] = {
        "train": combined_df_train,
        "test": combined_df_test,
    }

T1DEXI
T1DEXIP
tidepool_dataset
OhioT1DM


## Dataset Overview

#### Counting Subjects, Study Lengths, and Samples

In [35]:
def print_numbers_for_datasets(df):
    n = len(df['id'].unique())
    total_samples_in_years = df['CGM'].notna().shape[0] / 12 / 24 / 365 # in years!
    
    # TODO: Average time span each subject when CGM not na
    if n == 0:
        average_per_subject = 0
    else:
        average_per_subject = total_samples_in_years * 365 / 30 / n
    
    # For the standard deviation, calculate the time span for each subject and then find the std
    time_span_per_subject = df[df['CGM'].notna()].groupby('id').size() / (12 * 24 * 30)
    std_time_span = np.std(time_span_per_subject)

    print(f"n={n}, total samples={total_samples_in_years:.1f} years, average time span={average_per_subject:.1f} months, std={std_time_span:.1f}")

for key in datasets:
    if not key in ['T1DEXI', 'T1DEXIP']:
        print("DATASET: ", key)
        train_df = datasets[key]['train']
        test_df = datasets[key]['test']
        combined_df = pd.concat([train_df, test_df])
        print_numbers_for_datasets(combined_df)
        print(" ")

t1dexi_dfs = []
for key in datasets:
    if key in ['T1DEXI', 'T1DEXIP']:
        train_df = datasets[key]['train']
        test_df = datasets[key]['test']
        # Creating unique ids
        train_df['id'] = train_df['id'] + f"_{key}"
        test_df['id'] = test_df['id'] + f"_{key}"
        
        t1dexi_dfs.append(train_df)
        t1dexi_dfs.append(test_df)

print("T1DEXI")
combined_df = pd.concat(t1dexi_dfs)
print_numbers_for_datasets(combined_df)
print(" ")

DATASET:  tidepool_dataset
n=150, total samples=70.9 years, average time span=5.8 months, std=3.3
 
DATASET:  OhioT1DM
n=12, total samples=1.8 years, average time span=1.8 months, std=0.1
 
T1DEXI
n=639, total samples=115.6 years, average time span=2.2 months, std=0.3
 


In [38]:
def print_samples_for_each_feature(df):

    # TODO: Let insulin carbs be nan if 0?
    
    for col in df.columns:
        if not col in ['id']:
            print(f"{col}: {df[col][df[col].notna()].shape[0]}")

for key in datasets:
    if not key in ['T1DEXI', 'T1DEXIP']:
        print("DATASET: ", key)
        train_df = datasets[key]['train']
        test_df = datasets[key]['test']
        combined_df = pd.concat([train_df, test_df])
        print_samples_for_each_feature(combined_df)
        print(" ")

t1dexi_dfs = []
for key in datasets:
    if key in ['T1DEXI', 'T1DEXIP']:
        train_df = datasets[key]['train']
        test_df = datasets[key]['test']
        # Creating unique ids
        train_df['id'] = train_df['id'] + f"_{key}"
        test_df['id'] = test_df['id'] + f"_{key}"
        
        t1dexi_dfs.append(train_df)
        t1dexi_dfs.append(test_df)

print("T1DEXI")
combined_df = pd.concat(t1dexi_dfs)
print_samples_for_each_feature(combined_df)
print(" ")

DATASET:  tidepool_dataset
CGM: 7174707
carbs: 7395763
bolus: 7442564
basal: 7453514
insulin: 7442109
 
DATASET:  OhioT1DM
CGM: 166532
carbs: 189739
bolus: 190060
basal: 189730
gsr: 147444
skin_temp: 147590
exercise: 155383
heartrate: 86811
 
T1DEXI
CGM: 3683581
meal_grams: 27505
meal_name: 27512
bolus: 3731775
basal: 10201164
workout: 110005
workout_intensity: 139172
workout_duration: 4397381
heartrate: 3374920
 


In [None]:
# TODO: Combine all dfs, both training and test samples!

In [None]:
plot_feature_samples_per_subject(combined_df, 'CGM')

In [None]:
plot_daily_average_feature_sum_per_subject(combined_df, 'bolus')

In [None]:
plot_histogram_for_feature(combined_df, 'carbs')

In [None]:
# TODO: Separate this into test and train
print_samples_per_feature(combined_df)