# Data Diagnostics

This notebook is to check the data quality for each subject, and to find out if some subjects have data that looks strange and we should look more into. 

What we are checking here:
- Min / Max / Mean values of relevant features (CGM and heartrate), compared to predefined expectations and thresholds
- Ratios between basal / bolus / carbs to check for unreasonable data
- Verify that time intervals are correct
- Feature sparsity in train / test data for each subject after imputation

Results storage and visualization:
- Saving a dataframe with results
- Plot results in color coded tables 

### Imports

In [1]:
import pandas as pd
import numpy as np
import os

### Load Data

In [2]:
folder = 'processed_data'
file_name = 'tidepool_dataset.csv'
df = pd.read_csv(os.path.join('..', folder, file_name), index_col='date', parse_dates=['date'], low_memory=False)

In [8]:
# TODO: Trim data in the beginning and end for each subject
# TODO: Create a dataframe where each row is an id, and each column is a feature sparsity
train_data = []
test_data = []

for subject_id, subset_df in df.groupby('id'): 
    def get_data_from_trimmed_data(df_subset, is_test=True):        
        # Validating time intervals
        time_diffs = df_subset.index.to_series().diff()
        expected_interval = pd.Timedelta(minutes=5)
        valid_intervals = (time_diffs[1:] == expected_interval).all()
        if not valid_intervals:
            invalid_intervals = time_diffs[time_diffs != expected_interval]
            print(f"Subject {subject_id} has invalid intervals found:", invalid_intervals)
        
        cgm_col = 'CGM_smoothed' if 'CGM_smoothed' in df_subset.columns else 'CGM'
        
        # Trim data from nan values in the beginning and end
        first_valid_index = df_subset[cgm_col].first_valid_index()
        last_valid_index = df_subset[cgm_col].last_valid_index()
        if first_valid_index is not None and last_valid_index is not None:
            trimmed_subject_data = df_subset.loc[first_valid_index:last_valid_index]
        else:
            print(f'Subject {subject_id} for is_test {is_test} does not have valid data! DF is probably empty.')
            return        
        def safe_round(val):
            if pd.isna(val):  # Check if the value is NaN
                return val  # Return NaN as is
            else:
                return round(val)  # Round the value if it's not NaN
        daily_bolus = safe_round(trimmed_subject_data['bolus'].mean() * 12*24)
        daily_basal = safe_round(trimmed_subject_data['basal'].mean()/12 * 12*24)
        daily_carbs = safe_round(trimmed_subject_data['carbs'].mean() * 12*24)
        basal_ratio = safe_round(daily_basal / (daily_bolus + daily_basal) * 100)
        
        subject_summary = {
            'subject_id': subject_id,
            'mean_CGM': safe_round(trimmed_subject_data[cgm_col].mean()),
            'min_CGM': safe_round(trimmed_subject_data[cgm_col].min()),
            'max_CGM': safe_round(trimmed_subject_data[cgm_col].max()),
            'daily_basal_ratio': basal_ratio,
            'daily_bolus_ratio': 100 - basal_ratio,
            'daily_carbs_ratio': np.nan if daily_carbs == 0 else safe_round(daily_bolus / daily_carbs * 100),
            #'is_test': is_test,
        }
        if 'heartrate' in trimmed_subject_data.columns:
            subject_summary['mean_heartrate']: safe_round(trimmed_subject_data['heartrate'].mean())
            subject_summary['min_heartrate']: safe_round(trimmed_subject_data['heartrate'].min())
            subject_summary['max_heartrate']: safe_round(trimmed_subject_data['heartrate'].max())
        
        # Add sparsity of each feature
        for col in trimmed_subject_data.columns:
            if not col in ['id', 'is_test']:
                nan_percentage = round(trimmed_subject_data[col].isna().mean() * 100, 1)
                subject_summary[col] = nan_percentage
        return subject_summary
    
    train_data.append(get_data_from_trimmed_data(subset_df[subset_df['is_test'] == False], False))
    test_data.append(get_data_from_trimmed_data(subset_df[subset_df['is_test'] == True], True))    
    
train_summary_df = pd.DataFrame(train_data)
test_summary_df = pd.DataFrame(test_data)


Subject HCL150-094f81301c6b8e8936d557200006d4430de32971d7ebb8ea41d72a243640c84b has invalid intervals found: date
2018-03-26 01:10:00+00:00                   NaT
2018-03-26 00:10:00+00:00   -567 days +00:10:00
2019-10-14 00:05:00+00:00     566 days 23:00:00
Name: date, dtype: timedelta64[ns]
Subject HCL150-1be3cb72496c6373f5df107af201c13773e46dc9db7cf93161ab81d7ee62e7a4 has invalid intervals found: date
2019-02-12 02:15:00+00:00                   NaT
2019-02-12 01:05:00+00:00   -244 days +01:05:00
2019-02-12 00:05:00+00:00     -1 days +21:55:00
Name: date, dtype: timedelta64[ns]
Subject HCL150-1eb65ac8c57a13526a40cc735d7f02c618d36f8d3a8af7b801d6c28ddaf28b36 has invalid intervals found: date
2019-10-14 00:40:00+00:00                  NaT
2019-10-14 00:05:00+00:00   -30 days +00:05:00
Name: date, dtype: timedelta64[ns]
Subject HCL150-40b9676cffac53ccb404b57b8009590380e4479360773f84151a1289e45d5825 has invalid intervals found: date
2019-06-26 03:20:00+00:00                   NaT
2019-06-2

In [None]:
train_summary_df

### Create a Styled CSV for Feature Sparsity

In [9]:
def style_feature_sparsity(df_features):
    exclude_substrings = ['id', 'is_test', 'daily', 'mean', 'min', 'max']
    features = [col for col in train_summary_df.columns if not any(substring in col for substring in exclude_substrings)]
    
    # Function to convert RGB to Hex
    def rgb_to_hex(r, g, b):
        """Convert RGB to hex color."""
        return f'#{int(r):02x}{int(g):02x}{int(b):02x}'
    
    # Define styling function for color scale
    def highlight_severity(val):
        if pd.isna(val):  # Check if the value is NaN
            return 'background-color: white'  # White for NaN
        if val < 30:
            red, green, blue = 0, 255, 0  # Green for values less than 30
        elif val < 70:
            red, green, blue = 255, 255, 0  # Yellow for values less than 70
        else:
            red, green, blue = 255, 0, 0  # Red for values 70 and above
        hex_color = rgb_to_hex(red, green, blue)
        return f'background-color: {hex_color}'
    
    df_features = df_features.style.applymap(highlight_severity, subset=features)
    return df_features

## Check if CGM and Heartrate Values are Reasonable

In [13]:
def rgb_to_hex(r, g, b):
    """Convert RGB to hex color."""
    return f'#{int(r):02x}{int(g):02x}{int(b):02x}'

def highlight_range_severity(val, range_min, range_max):
    if pd.isna(val):
        return 'background-color: white'
    if range_min <= val <= range_max:
        red = 0  # No red component
        green = 255  # Full green
        blue = 0  # No blue component
    else:
        red = 255  # Full red
        green = 0  # No green component
        blue = 0  # No blue component
    hex_color = rgb_to_hex(red, green, blue)
    return f'background-color: {hex_color}'

def style_cgm_and_heartrate(df_features):
    df_features = df_features.applymap(lambda val: highlight_range_severity(val, 70, 220), subset=['mean_CGM'])
    df_features = df_features.applymap(lambda val: highlight_range_severity(val, 10, 100), subset=['min_CGM'])
    df_features = df_features.applymap(lambda val: highlight_range_severity(val, 200, 750), subset=['max_CGM'])
    
    if 'mean_heartrate' in df_features.columns:
        df_features = df_features.applymap(lambda val: highlight_range_severity(val, 30, 100), subset=['mean_heartrate'])
        df_features = df_features.applymap(lambda val: highlight_range_severity(val, 30, 100), subset=['min_heartrate'])
        df_features = df_features.applymap(lambda val: highlight_range_severity(val, 80, 250), subset=['max_heartrate'])
    
    return df_features

## Check if Insulin Carb Ratios are Reasonable

The reason for this check is to see if there are any weird values in bolus, basal or carbs. We use the ratio between those values to determine that. 

We consider that the ratio between bolus and basal should be from 40-60 to 70-30. 

Reasoning carb ratio: Imagine that 1U of insulin covers around 10-15 g of carbohydrates. If it is 10, there is a bolus-carbs relationship of 1:10. 

So if we let the gap be from 5 to 50% of "normal" ratios, we say that it is normal that one bolus dose covers everything from 5 to 50g of carbohydrates.

In [11]:
def style_insulin_carb_ratios(df_features):
    # We expect the basal-bolus ratio to be between 30-60% of total insulin, bolus to be around 40-70%
    # And bolus should be around 5-50% of carbs
    df_features = df_features.applymap(lambda val: highlight_range_severity(val, 30, 60), subset=['daily_basal_ratio'])
    df_features = df_features.applymap(lambda val: highlight_range_severity(val, 40, 70), subset=['daily_bolus_ratio'])
    df_features = df_features.applymap(lambda val: highlight_range_severity(val, 5, 50), subset=['daily_carbs_ratio'])
    return df_features

## Save the Color Coded Data 

In [14]:
def style_all_columns_and_save(df_features, is_test):
    df_features = style_feature_sparsity(df_features)
    df_features = style_cgm_and_heartrate(df_features)
    df_features = style_insulin_carb_ratios(df_features)
    
    if is_test:
        save_file_name = f'{file_name.split(".")[0]}_test.xlsx'
    else:
        save_file_name = f'{file_name.split(".")[0]}_train.xlsx'
    save_path = os.path.join('..', 'data_diagnostics', save_file_name)
    df_features.to_excel(save_path, engine='openpyxl', index=False)

style_all_columns_and_save(train_summary_df, is_test=False)
style_all_columns_and_save(test_summary_df, is_test=True)

In [None]:
# TODO: Would be nice to have the files sorted with the "best" ids on top / bottom