# Data Quality

## Imports

In [4]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Helper Functions

In [18]:
def get_datasets():
    folder_path = os.path.join('../', 'processed_data')
    datasets = {}
    for dataset_folder in os.listdir(folder_path):
        print(dataset_folder)
        dataset_folder_path_train = os.path.join(folder_path, dataset_folder, "train")
        dataset_folder_path_test = os.path.join(folder_path, dataset_folder, "test")
    
        df_train_list = []
        df_test_list = []
        for subject_data in os.listdir(dataset_folder_path_train):
            if subject_data.endswith('.csv'):
                file_path = os.path.join(dataset_folder_path_train, subject_data)
                df = pd.read_csv(file_path, parse_dates=['date'], index_col='date', low_memory=False)
    
                df['id'] = subject_data.split(".")[0]
                #df.set_index('date', inplace=True)
                df_train_list.append(df)
    
        for subject_data in os.listdir(dataset_folder_path_test):
            if subject_data.endswith('.csv'):       
                file_path = os.path.join(dataset_folder_path_test, subject_data)
                df = pd.read_csv(file_path, parse_dates=['date'], index_col='date', low_memory=False)
    
                df['id'] = subject_data.split(".")[0]
                #df.set_index('date', inplace=True)
                df_test_list.append(df)
                
        combined_df_train = pd.concat(df_train_list)
        combined_df_test = pd.concat(df_test_list)
        datasets[dataset_folder] = {
            "train": combined_df_train,
            "test": combined_df_test,
        }
    return datasets 

def trim_cgm_data(df):
    # Group by 'id' to apply trimming for each unique id
    trimmed_dfs = []
    for unique_id in df['id'].unique():
        # Select rows for the current id
        id_data = df[df['id'] == unique_id]

        # Find the first and last non-NaN index in the 'cgm' column
        first_valid_index = id_data['CGM'].first_valid_index()
        last_valid_index = id_data['CGM'].last_valid_index()

        # If valid indices are found, trim the DataFrame
        if first_valid_index is not None and last_valid_index is not None:
            trimmed_id_data = id_data.loc[first_valid_index:last_valid_index]
            trimmed_dfs.append(trimmed_id_data)

    # Concatenate all trimmed DataFrames
    return pd.concat(trimmed_dfs, ignore_index=True)


## Load Data

In [6]:
datasets = get_datasets()

T1DEXI
T1DEXIP
tidepool_dataset
OhioT1DM


## Data Validation
Checking whether features are within a realistic range.

In [11]:
ranges = {
    'CGM': (10, 540),
    'carbs': (0, 300),
    'bolus': (0, 100),
    'basal': (0, 10),
    'heartrate': (20, 300)
}

for key in datasets:
    print("KEY: ", key)
    train_df = datasets[key]['train']
    test_df = datasets[key]['test']
    combined_df = pd.concat([train_df, test_df])

    for feature, (min_val, max_val) in ranges.items():
        if feature in combined_df.columns:
            # Count values outside the specified range
            outside_range_count = combined_df[(combined_df[feature] < min_val) | (combined_df[feature] > max_val)].shape[0]
            print(f"{feature}: {outside_range_count} samples outside the range {min_val}-{max_val}")


KEY:  T1DEXI
CGM: 0 samples outside the range 10-540
bolus: 0 samples outside the range 0-100
basal: 6 samples outside the range 0-10
heartrate: 0 samples outside the range 20-300
KEY:  T1DEXIP
CGM: 1 samples outside the range 10-540
bolus: 0 samples outside the range 0-100
basal: 12 samples outside the range 0-10
heartrate: 0 samples outside the range 20-300
KEY:  tidepool_dataset
CGM: 0 samples outside the range 10-540
carbs: 2 samples outside the range 0-300
bolus: 0 samples outside the range 0-100
basal: 22570 samples outside the range 0-10
KEY:  OhioT1DM
CGM: 0 samples outside the range 10-540
carbs: 1 samples outside the range 0-300
bolus: 0 samples outside the range 0-100
basal: 0 samples outside the range 0-10
heartrate: 0 samples outside the range 20-300


## Missing Data

In [21]:
# Get a new dataset where there are no nan CGMs in beginning or end
for key in datasets:
    print("KEY: ", key)
    
    # Trim the 'train' DataFrame
    train_df = datasets[key]['train']
    trimmed_train_df = trim_cgm_data(train_df)

    # Trim the 'test' DataFrame
    test_df = datasets[key]['test']
    trimmed_test_df = trim_cgm_data(test_df)
    
    combined_df = pd.concat([train_df, test_df])
    
    total_count = len(combined_df['CGM'])
    non_nan_count = combined_df['CGM'].notna().sum()
    percentage = (non_nan_count / total_count) * 100
    print(f"Percentage of non-NaN values after trimming: {percentage:.2f}%")



KEY:  T1DEXI
Percentage of non-NaN values after trimming: 35.13%
KEY:  T1DEXIP
Percentage of non-NaN values after trimming: 17.55%
KEY:  tidepool_dataset


ValueError: No objects to concatenate