In [None]:
import pandas as pd
import numpy as np

# Show floats with six fixed decimals instead of scientific notation
pd.set_option('display.float_format', '{:.6f}'.format)
# Turn off the chained-assignment warning (pandas default is 'warn')
pd.set_option('mode.chained_assignment', None)

In [None]:
from google.colab import drive

# Mount Google Drive under /content/gdrive/ so the project files are reachable
# from this Colab runtime (force_remount=True asks for a fresh mount).
drive.mount('/content/gdrive/', force_remount=True)

Mounted at /content/gdrive/


In [None]:
# Work from the project folder on the mounted drive; later cells read and write
# files using names relative to this directory.
%cd /content/gdrive/MyDrive/TFG

/content/gdrive/MyDrive/TFG


In [None]:
def get_patient_ids(df):
    """
    List the distinct patients present in a dataframe.

    :param df: Dataframe with a 'Patient_ID' column
    :return: Array holding each patient id once
    """
    patient_column = df['Patient_ID']
    return patient_column.unique()

In [None]:
%cd /content/gdrive/MyDrive/TFG

df_glucose = pd.read_csv("Glucose_measurements.csv", dtype={'Glucose_measurements' : int})
df_glucose["Timestamp"] = pd.to_datetime(df_glucose["Measurement_date"] + ' '
                                + df_glucose["Measurement_time"])

/content/gdrive/MyDrive/TFG


In [None]:
# Preview the loaded measurements together with the derived Timestamp column
df_glucose

Unnamed: 0,Patient_ID,Measurement_date,Measurement_time,Measurement,Timestamp
0,LIB193263,2020-06-09,19:08:00,99,2020-06-09 19:08:00
1,LIB193263,2020-06-09,19:23:00,92,2020-06-09 19:23:00
2,LIB193263,2020-06-09,19:38:00,86,2020-06-09 19:38:00
3,LIB193263,2020-06-09,19:53:00,85,2020-06-09 19:53:00
4,LIB193263,2020-06-09,20:08:00,85,2020-06-09 20:08:00
...,...,...,...,...,...
22671703,LIB194166,2022-03-17,15:08:00,169,2022-03-17 15:08:00
22671704,LIB194166,2022-03-17,15:23:00,206,2022-03-17 15:23:00
22671705,LIB194166,2022-03-17,15:38:00,215,2022-03-17 15:38:00
22671706,LIB194166,2022-03-17,15:53:00,204,2022-03-17 15:53:00


# Resampling

In [None]:
import datetime

%cd /content/gdrive/MyDrive/TFG

def resample(df, patients = None, checkpoint = False):
  """
  Resample every patient's glucose readings onto a regular 15-minute grid.

  The grid of each patient starts at the patient's first timestamp. A grid
  point without a reading at that exact minute takes the closest reading found
  within 7 minutes (the earlier reading wins a tie); otherwise it stays NaN.
  Readings that are not picked for any grid point are dropped.

    :param df: Dataframe of BG measurements with 'Patient_ID', 'Timestamp'
               (datetime) and 'Measurement' columns. The first and last row of
               each patient delimit the grid, so rows are expected to be in
               chronological order.
    :param patients: Subset of patients. If None (or empty), all patients in df
    :param checkpoint: If true, the series resampled so far are saved to
                       'Preprocessed_data_50.npy' every 50 patients
    :return: List with one resampled Series (indexed by grid timestamp) per patient
  """

  allowed_delay = 7  # minutes searched on each side of an expected timestamp
  data_backup = 50   # number of patients between two checkpoint saves

  def get_closest(all_dates_series, expected_date):
    """

    :return: The closest measurement to the expected date, NaN if there is no
    measurements in the range
    """
    for delay in range(1, allowed_delay + 1):
      offset = pd.Timedelta(minutes=delay)
      # The earlier minute is checked first, so it wins when both sides are
      # equally close. .get() treats minutes outside the index as missing.
      for candidate in (expected_date - offset, expected_date + offset):
        value = all_dates_series.get(candidate, np.nan)
        if not np.isnan(value):
          return value

    return np.nan

  # Always initialised: the counter is printed and incremented even when no
  # checkpoint is requested.
  iteration = 0

  resampled_data_per_patient = []

  # Explicit None/length check: `not patients` is ambiguous for NumPy arrays
  if patients is None or len(patients) == 0:
    patients = get_patient_ids(df)
  for patient in patients:
    print(f'Patient {patient}... Iteration: {iteration}')

    df_glucose_patient = df[df['Patient_ID'] == patient]

    start_date = df_glucose_patient['Timestamp'].iloc[0]
    end_date   = df_glucose_patient['Timestamp'].iloc[-1]

    # Minute-resolution series holding every raw reading at its own minute
    all_dates = pd.date_range(start=start_date, end=end_date, freq='min')
    all_dates_series = pd.Series(dtype='float64', index=all_dates)
    all_dates_series[df_glucose_patient['Timestamp']] = df_glucose_patient['Measurement'].values

    expected_dates = pd.date_range(start=start_date, end=end_date, freq='15min')

    # Only grid points without an exact reading need the neighbour search
    for expected_date in expected_dates:
      if np.isnan(all_dates_series[expected_date]):
        all_dates_series[expected_date] = get_closest(all_dates_series, expected_date)

    resampled_data_per_patient.append(all_dates_series[expected_dates])

    iteration = iteration + 1
    if checkpoint and iteration == data_backup:
      print("Saving...")
      # dtype=object keeps one Series per patient although their lengths differ
      np.save(f'Preprocessed_data_{iteration}.npy', np.asarray(resampled_data_per_patient, dtype=object))
      iteration = 0

  return resampled_data_per_patient

/content/gdrive/MyDrive/TFG


In [None]:
# Resample every patient in the dataset, saving intermediate checkpoints on the way
complete_resampled_data = resample(df=df_glucose, patients=None, checkpoint=True)

Patient LIB194166... Iteration: 0


In [None]:
# The per-patient series have different lengths, so they are stored in an
# explicit 1-D object array (one Series per element). Handing the ragged list
# straight to np.save makes NumPy attempt an implicit conversion, which recent
# NumPy versions reject.
resampled_array = np.empty(len(complete_resampled_data), dtype=object)
for position, patient_series in enumerate(complete_resampled_data):
    resampled_array[position] = patient_series

np.save('Resampled_data.npy', resampled_array)

In [None]:
# Reload the saved resampling result. allow_pickle=True lets NumPy unpickle
# stored Python objects, so it should only be used on files produced by this notebook.
resampled_data = np.load('Resampled_data.npy', allow_pickle=True)
# There must be exactly one resampled series per patient
assert get_patient_ids(df_glucose).shape[0] == len(resampled_data)

## Removing patients below a threshold (30 days of measurements)

In [None]:
# Creating a long-format dataframe: one row per (patient_ID, resampled measurement)

patient_IDs = get_patient_ids(df_glucose)

# Ids and series are paired by position, so both collections must have the same length
if len(patient_IDs) != len(resampled_data):
    raise ValueError(
        f"Expected one resampled series per patient, got {len(resampled_data)} series "
        f"for {len(patient_IDs)} patients"
    )

# Repeat each patient id once per sample and concatenate the values in the same
# order. This avoids building one Python tuple per measurement.
samples_per_patient = np.asarray([len(series) for series in resampled_data], dtype=int)
if len(resampled_data) > 0:
    all_measurements = np.concatenate([np.asarray(series, dtype='float64') for series in resampled_data])
else:
    all_measurements = np.array([], dtype='float64')

df_glucose_resampled = pd.DataFrame({
    'Patient_ID': np.repeat(patient_IDs, samples_per_patient),
    'Measurement': all_measurements,
})
df_glucose_resampled

Unnamed: 0,Patient_ID,Measurement
0,LIB193263,99.000000
1,LIB193263,92.000000
2,LIB193263,86.000000
3,LIB193263,85.000000
4,LIB193263,85.000000
...,...,...
31341295,LIB194166,143.000000
31341296,LIB194166,169.000000
31341297,LIB194166,206.000000
31341298,LIB194166,215.000000


In [None]:
# Sanity check on one patient: the long-format frame must hold as many rows for
# 'LIB193263' as the first resampled series has samples
# (assumes 'LIB193263' is the first patient, as in the preview above — verify).
first_patient_rows = df_glucose_resampled['Patient_ID'] == 'LIB193263'
assert len(resampled_data[0]) == first_patient_rows.sum()

In [None]:
# Per-patient summary: size of the resampled grid and how many of its samples are not missing

total_counts = [len(series) for series in resampled_data]
valid_counts = [series.count() for series in resampled_data]

amount_of_data_per_patient = pd.DataFrame({
    'Patient_ID': patient_IDs,
    'Measurement_count': total_counts,
    'Measurement_no_missing_values_count': valid_counts,
})
amount_of_data_per_patient

Unnamed: 0,Patient_ID,Measurement_count,Measurement_no_missing_values_count
0,LIB193263,62162,60023
1,LIB193264,62157,26785
2,LIB193265,110157,46542
3,LIB193266,50761,44373
4,LIB193267,62021,54578
...,...,...,...
731,LIB194162,34608,33397
732,LIB194163,18326,17741
733,LIB194164,60594,58241
734,LIB194165,30061,23849


In [None]:
minutes_in_hour = 60
hours_in_day = 24
samples_separation = 15  # minutes between two consecutive resampled samples

# Convert the number of valid samples into days of valid data:
# samples * minutes per sample / minutes per day
amount_of_data_per_patient['Measurement_no_missing_values_count_days'] = (
    amount_of_data_per_patient['Measurement_no_missing_values_count']
    .mul(samples_separation)
    .div(minutes_in_hour * hours_in_day)
)

In [None]:
# Patients with fewer than 30 days of non-missing data are discarded
patients_below_30_days = (
    amount_of_data_per_patient
    .loc[amount_of_data_per_patient['Measurement_no_missing_values_count_days'].lt(30), 'Patient_ID']
    .tolist()
)

print(f"Number of deleted patients: {len(patients_below_30_days)}")

# Keep only the rows of the remaining patients
patients_mask = ~df_glucose_resampled['Patient_ID'].isin(patients_below_30_days)
preprocessed_data = df_glucose_resampled[patients_mask]

Number of deleted patients: 45


In [None]:
# Every patient that was not deleted must still be present after filtering
assert len(preprocessed_data['Patient_ID'].unique()) == len(patient_IDs) - len(patients_below_30_days)

In [None]:
%cd /content/gdrive/MyDrive/TFG
preprocessed_data.to_csv('Preprocessed_data.csv')



/content/gdrive/MyDrive/TFG


# Resampling Process Unit Tests

In [None]:
# Asserts

def assert_array_is_equal(first, second, error_msg):
    """Return True when both arrays are equal; otherwise print ``error_msg`` and return False."""
    if not np.array_equal(first, second):
        print(error_msg)
        return False
    return True

def assert_value_is_equal(first, second, error_msg):
    """Return True when ``first == second``; otherwise print ``error_msg`` and return False."""
    if not (first == second):
        print(error_msg)
        return False
    return True

def assert_value_is_nan(value, error_msg):
    """Return True when ``value`` is NaN; otherwise print ``error_msg`` and return False."""
    if not np.isnan(value):
        print(error_msg)
        return False
    return True

def assert_number_of_measurements(df, expected, error_msg):
    """Return True when ``df`` has ``expected`` rows; otherwise print ``error_msg`` and return False."""
    row_count = df.shape[0]
    if row_count != expected:
        print(error_msg)
        return False
    return True

#############################################################################
#############################################################################

# Helper functions

# Example of the column layout expected by df_glucose_creator
template = dict(
    Patient_ID=['LIB193263', 'LIB193263', 'LIB193264', 'LIB193265'],
    Measurement=[86, 85, 90, 80],
    Timestamp=[
        '2020-06-09 19:38:00',
        '2020-06-09 19:53:00',
        '2020-06-09 20:08:00',
        '2020-06-09 20:23:00',
    ],
)

def df_glucose_creator(data):
    """
    Build a glucose dataframe from column data.

    :param data: Mapping of column name to values; must include a 'Timestamp' column
    :return: Dataframe whose 'Timestamp' column is converted with ``pd.to_datetime``
    """
    return pd.DataFrame(data).assign(
        Timestamp=lambda frame: pd.to_datetime(frame['Timestamp'])
    )

def df_glucose_two_patients_both_with_delay():
    """Fixture: two patients, each with one reading one minute after the 15-minute mark."""
    # (patient id, measurement, timestamp)
    readings = [
        ('LIB193262', 86, '2020-06-09 19:00:00'),
        ('LIB193262', 85, '2020-06-09 19:16:00'),
        ('LIB193262', 90, '2020-06-09 19:30:00'),
        ('LIB193263', 60, '2021-01-01 10:00:00'),
        ('LIB193263', 50, '2021-01-01 10:16:00'),
    ]
    patient_ids, measurements, timestamps = (list(column) for column in zip(*readings))

    return df_glucose_creator({
        'Patient_ID': patient_ids,
        'Measurement': measurements,
        'Timestamp': timestamps,
    })


def df_glucose_two_patients_one_with_missing_value():
    """Fixture: two patients; the first one has no reading between 19:00 and 19:30."""
    # (patient id, measurement, timestamp)
    readings = [
        ('LIB193262', 1, '2020-06-09 19:00:00'),
        ('LIB193262', 2, '2020-06-09 19:30:00'),
        ('LIB193262', 3, '2020-06-09 19:45:00'),
        ('LIB193263', 4, '2021-01-01 10:00:00'),
        ('LIB193263', 5, '2021-01-01 10:15:00'),
    ]
    patient_ids, measurements, timestamps = (list(column) for column in zip(*readings))

    return df_glucose_creator({
        'Patient_ID': patient_ids,
        'Measurement': measurements,
        'Timestamp': timestamps,
    })

def df_glucose_two_patients_both_with_manual_readings():
    """Fixture: two patients, each with one extra reading between two 15-minute marks."""
    # (patient id, measurement, timestamp)
    readings = [
        ('LIB193262', 1, '2020-06-09 19:00:00'),
        ('LIB193262', 2, '2020-06-09 19:15:00'),
        ('LIB193262', 3, '2020-06-09 19:22:00'),
        ('LIB193262', 4, '2020-06-09 19:30:00'),
        ('LIB193263', 5, '2021-01-01 10:00:00'),
        ('LIB193263', 6, '2021-01-01 10:05:00'),
        ('LIB193263', 7, '2021-01-01 10:15:00'),
    ]
    patient_ids, measurements, timestamps = (list(column) for column in zip(*readings))

    return df_glucose_creator({
        'Patient_ID': patient_ids,
        'Measurement': measurements,
        'Timestamp': timestamps,
    })

def df_glucose_two_patients_both_with_delay_and_manual_readings():
    """Fixture: two patients combining off-grid (delayed or early) readings with extra in-between readings."""
    # (patient id, measurement, timestamp)
    readings = [
        ('LIB193262', 1, '2020-06-09 19:00:00'),
        ('LIB193262', 2, '2020-06-09 19:15:00'),
        ('LIB193262', 3, '2020-06-09 19:22:00'),
        ('LIB193262', 4, '2020-06-09 19:31:00'),
        ('LIB193263', 5, '2021-01-01 10:00:00'),
        ('LIB193263', 6, '2021-01-01 10:05:00'),
        ('LIB193263', 7, '2021-01-01 10:14:00'),
        ('LIB193263', 8, '2021-01-01 10:29:00'),
        ('LIB193263', 9, '2021-01-01 10:35:00'),
    ]
    patient_ids, measurements, timestamps = (list(column) for column in zip(*readings))

    return df_glucose_creator({
        'Patient_ID': patient_ids,
        'Measurement': measurements,
        'Timestamp': timestamps,
    })


def get_time(df, row):
    """Return the index label at position ``row`` formatted as 'YYYY-MM-DD HH:MM:SS'."""
    timestamp = df.index[row]
    return timestamp.strftime("%Y-%m-%d %H:%M:%S")

def get_measurement(df, row):
    """
    Return the value stored at position ``row``.

    :param df: Resampled pandas Series (any other indexable object is indexed directly)
    :param row: Zero-based position of the wanted value
    :return: The value at that position
    """
    # A resampled Series is indexed by timestamps, so an integer passed to
    # series[row] only works through a positional fallback that pandas has
    # deprecated; .iloc states the positional intent explicitly.
    if isinstance(df, pd.Series):
        return df.iloc[row]
    return df[row]

#############################################################################
#############################################################################

# Tests

# Patients present in every test fixture
patients = ['LIB193262', 'LIB193263']

def run_test(test_name):
  """Call the test function ``test_name`` between two separator lines."""
  separator = "#############################################################################"
  print(separator)
  test_name()
  print(separator)


def test_dfGlucoseWithTwoPatients_ResamplingMustBeIndependentPerPatient():
    """Each patient must get a 15-minute grid that starts at that patient's own first reading."""
    failure_msg = 'Resample failed handling multiple patients'
    result = resample(df_glucose_two_patients_both_with_delay(), patients)

    # (patient position, row, expected grid timestamp)
    expected_grid = [
        (0, 0, '2020-06-09 19:00:00'),
        (0, 1, '2020-06-09 19:15:00'),
        (0, 2, '2020-06-09 19:30:00'),
        (1, 0, '2021-01-01 10:00:00'),
        (1, 1, '2021-01-01 10:15:00'),
    ]
    for patient_pos, row, expected_time in expected_grid:
        assert_value_is_equal(get_time(result[patient_pos], row), expected_time, failure_msg)

def test_dfGlucoseWithDelay_ResamplingMustFixDelay():
    """A reading taken at 19:16 must show up on the 19:15 grid point."""
    result = resample(df_glucose_two_patients_both_with_delay(), patients)

    second_grid_time = get_time(result[0], 1)
    assert_value_is_equal(second_grid_time, '2020-06-09 19:15:00', 'Resample failed handling delays')


def test_dfGlucoseWithMissingValue_ResamplingMustAddMissingValue():
    """A grid point with no reading nearby must be present in the result and hold NaN."""
    result = resample(df_glucose_two_patients_one_with_missing_value(), patients)
    first_patient = result[0]

    expected_times = ['2020-06-09 19:00:00', '2020-06-09 19:15:00', '2020-06-09 19:30:00']
    for row, expected_time in enumerate(expected_times):
        assert_value_is_equal(get_time(first_patient, row), expected_time,
                              f'Resample failed handling missing values t{row + 1}')

    assert_value_is_equal(get_measurement(first_patient, 0), 1, 'Resample failed handling missing values m1')

    # 19:15 has no reading close enough, so it must stay empty
    assert_value_is_nan(get_measurement(first_patient, 1), 'Resample failed handling missing values m2')

    assert_value_is_equal(get_measurement(first_patient, 2), 2, 'Resample failed handling missing values m3')


def test_dfGlucoseWithManualReadings_ResamplingMustRemoveManualReadings():
    """Readings taken between two grid points must not appear in the resampled series."""
    result = resample(df_glucose_two_patients_both_with_manual_readings(), patients)

    # (patient position, row, expected grid timestamp, expected measurement)
    expected = [
        (0, 0, '2020-06-09 19:00:00', 1),
        (0, 1, '2020-06-09 19:15:00', 2),
        (0, 2, '2020-06-09 19:30:00', 4),
        (1, 0, '2021-01-01 10:00:00', 5),
        (1, 1, '2021-01-01 10:15:00', 7),
    ]

    # All timestamps are checked first, then all measurements
    for number, (patient_pos, row, expected_time, _) in enumerate(expected, start=1):
        assert_value_is_equal(get_time(result[patient_pos], row), expected_time,
                              f'Resample failed handling missing values t{number}')

    for number, (patient_pos, row, _, expected_value) in enumerate(expected, start=1):
        assert_value_is_equal(get_measurement(result[patient_pos], row), expected_value,
                              f'Resample failed handling missing values m{number}')

def test_dfGlucoseWithManualReadings_ResamplingMustFixDelayAndRemoveManualReadings():
    """Off-grid readings must be moved onto the grid while in-between readings are dropped."""
    result = resample(df_glucose_two_patients_both_with_delay_and_manual_readings(), patients)

    print(result)

    # (patient position, row, expected grid timestamp, expected measurement)
    expected = [
        (0, 0, '2020-06-09 19:00:00', 1),
        (0, 1, '2020-06-09 19:15:00', 2),
        (0, 2, '2020-06-09 19:30:00', 4),
        (1, 0, '2021-01-01 10:00:00', 5),
        (1, 1, '2021-01-01 10:15:00', 7),
        (1, 2, '2021-01-01 10:30:00', 8),
    ]

    # All timestamps are checked first, then all measurements
    for number, (patient_pos, row, expected_time, _) in enumerate(expected, start=1):
        assert_value_is_equal(get_time(result[patient_pos], row), expected_time,
                              f'Resample failed handling missing values t{number}')

    for number, (patient_pos, row, _, expected_value) in enumerate(expected, start=1):
        assert_value_is_equal(get_measurement(result[patient_pos], row), expected_value,
                              f'Resample failed handling missing values m{number}')


print("TESTS:")
# Run every resampling test, each framed by separator lines
for test_case in (
    test_dfGlucoseWithTwoPatients_ResamplingMustBeIndependentPerPatient,
    test_dfGlucoseWithDelay_ResamplingMustFixDelay,
    test_dfGlucoseWithMissingValue_ResamplingMustAddMissingValue,
    test_dfGlucoseWithManualReadings_ResamplingMustRemoveManualReadings,
    test_dfGlucoseWithManualReadings_ResamplingMustFixDelayAndRemoveManualReadings,
):
    run_test(test_case)