In [4]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error


pd.set_option('display.float_format', lambda x: '%.6f' % x)
pd.options.mode.chained_assignment = None  # default='warn'

In [13]:
def get_patient_ids(df=None):
    """

    :param df_glucose:
    :return: List of patient ids
    """
    if df is None:
        df = read_dataset()
    return df['Patient_ID'].unique()

def patients_size(df):
    return df.groupby('Patient_ID').size()

In [5]:
# Reading complete dataset and creating cols lagged for time and measurements
df_glucose = pd.read_csv('train.csv',
                      dtype={'Glucose_measurements': int})
complete_df_glucose = df_glucose.copy()


In [8]:
df_glucose

Unnamed: 0.1,Unnamed: 0,Patient_ID,t,Measurement,index
0,0,LIB193263,2020-06-09 19:00:00,99.000000,0
1,1,LIB193263,2020-06-09 19:15:00,92.000000,1
2,2,LIB193263,2020-06-09 19:30:00,86.000000,2
3,3,LIB193263,2020-06-09 19:45:00,85.000000,3
4,4,LIB193263,2020-06-09 20:00:00,85.000000,4
...,...,...,...,...,...
3681763,3681763,LIB193424,2021-12-26 04:15:00,170.000000,4090247
3681764,3681764,LIB193424,2021-12-26 04:30:00,164.000000,4090248
3681765,3681765,LIB193424,2021-12-26 04:45:00,158.000000,4090249
3681766,3681766,LIB193424,2021-12-26 05:00:00,153.000000,4090250


In [9]:
df_glucose[df_glucose['Patient_ID'] == 'LIB193263'].shape[0]

55946

In [10]:
def missing_values_count(subset):
    return subset['Measurement'].isna().sum()

def last_missing_value_pos(set):
    """

    :return: Return the index + 1 of the last missing value of the subset,
                    -1 if no missing values
    """
    missing_indexes = np.where(np.isnan(set))[0]

    return missing_indexes[-1] + 1 if len(missing_indexes) > 0 else -1

def interpolate(subset):
     subset_interpolate = subset.apply(lambda sample: sample.interpolate(method="polynomial", order=2))
     # Fixing values out of range
     subset_interpolate[(subset_interpolate["Measurement"] < 40)] = 40
     subset_interpolate[(subset_interpolate["Measurement"] > 500)] = 500

     return subset_interpolate

def linear_model(train, test, perc):
    train_split = round(train.shape[0] * perc)
    test_split = round(test.shape[0] * perc)
    x_train, y_train = train.iloc[0:train_split], train.iloc[train_split, train.shape[0]]
    x_test, y_test = test.iloc[0:test_split], test.iloc[test_split, test.shape[0]]

    model = LinearRegression().fit(x_train, y_train)
    y_pred = model.predict(x_test)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)

    return rmse

def non_anchored_walk_forward_optimization(data, lookback_samples, validation_samples, na_tolerance, anchored = False):

    patients = get_patient_ids(data)    # Getting ids of patients
    start = 0
    wasted_data = 0
    windows_patients = []
    error_patients = []

    for patient in patients:
        print(f"Patient {patient}...")
        windows_patient = 0
        data_patient = data[data['Patient_ID'] == patient]
        error_patient = []
        for s in range(lookback_samples, data_patient.shape[0] - validation_samples, lookback_samples):
            if not anchored:
                train_subset = data_patient.iloc[s-lookback_samples : s]
            else:
                train_subset = data_patient.iloc[start: s]
            validation_subset = data_patient.iloc[s : s + validation_samples]
            train_subset_missing_values = missing_values_count(train_subset)
            validation_subset_missing_values = missing_values_count(validation_subset)

            if train_subset_missing_values > na_tolerance or validation_subset_missing_values > 0:      #No missing values allowed in validation subset
                wasted_data = wasted_data + lookback_samples
                if anchored:
                    start = s
                continue

            if 0 < train_subset_missing_values <= na_tolerance:
                train_subset = interpolate(train_subset)

            #error_patient.append(linear_model(train_subset['Measurement'],validation_subset['Measurement'],0.8))
            windows_patient = windows_patient + 1



        error_patients.append(np.average(error_patient))
        windows_patients.append(windows_patient)
        return [error_patients, windows_patients, wasted_data]


def one_step_walk_forward(data, lookback_samples, test_samples):
    """
    First strategy tested. Walk forward with a window rolling 1 sample each time.
    :param data:
    :param lookback_samples: Number of samples used to predict
    :param test_samples: Prediction window (normally 15 or 30 minutes)
    :return:
    """

    patients = get_patient_ids(data)    # Getting ids of patients

    wasted_windows_patients = []
    windows_patients        = []
    error_patients          = []

    for patient in patients:
        print(f"Patient {patient}...")
        windows_patient         = 0
        wasted_windows_patient  = 0
        error_patient           = []

        data_patient = data[data['Patient_ID'] == patient]['Measurement'].values

        s = lookback_samples
        while s < data_patient.shape[0] - test_samples:
            train_subset = data_patient[s-lookback_samples : s]
            validation_subset = data_patient[s : s + test_samples]

            window = np.concatenate((train_subset, validation_subset))
            missing_value_pos = last_missing_value_pos(window)
            if missing_value_pos != -1:
                wasted_windows_patient = wasted_windows_patient + missing_value_pos
                s = s + missing_value_pos
                continue

            #error_patient.append(linear_model(train_subset['Measurement'],validation_subset['Measurement'],0.8))
            windows_patient = windows_patient + 1

            s = s + 1

        #error_patients.append(np.average(error_patient))
        windows_patients.append(windows_patient)
        wasted_windows_patients.append(wasted_windows_patient)

    return [error_patients, windows_patients, wasted_windows_patients]

#errors, windows, wasted_windows = one_step_walk_forward(df_glucose, lookback_samples=14, test_samples=2)

In [19]:
def get_stats(df, generated_windows, _wasted_windows):
    return pd.DataFrame({
        'Patient': get_patient_ids(df),
        'Generated Windows': generated_windows,
        'Wasted windows': _wasted_windows,
        "Lost Data": _wasted_windows / patients_size(df) * 100,
        "Patients Size": patients_size(df)
    })

In [23]:
get_stats(df_glucose, windows, wasted_windows)

Unnamed: 0_level_0,Patient,Generated Windows,Wasted windows,Lost Data
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
LIB193263,LIB193263,49602,12545,20.180815
LIB193264,LIB193264,22121,40020,64.385347
LIB193265,LIB193265,37298,72844,66.126836
LIB193266,LIB193266,38829,11916,23.474715
LIB193267,LIB193267,43684,18322,29.541131
...,...,...,...,...
LIB193418,LIB193418,11488,2326,16.818510
LIB193419,LIB193419,19731,6582,24.999050
LIB193420,LIB193420,23116,3554,13.317845
LIB193423,LIB193423,9507,4306,31.137465


# Note: The number of windows + wasted windows in the one step approach is not the the same as the number of samples
#bc of the needed samples before the training and for the testing

In [20]:
# Experiment: Different input lenghts based on "Deep Residual Time-Series Forecasting:
#Application to Blood Glucose Prediction" criteria
lookbacks = [8, 10, 12, 14]   # 2h, 2.5h, 3h, 3.5h
results = []
for lookback in lookbacks:
    _, windows, wasted_windows = one_step_walk_forward(df_glucose, lookback_samples=lookback, test_samples=2)
    results.append(get_stats(df_glucose, windows, wasted_windows))
    print()

Patient LIB193263...
Patient LIB193264...
Patient LIB193265...
Patient LIB193266...
Patient LIB193267...
Patient LIB193268...
Patient LIB193269...
Patient LIB193272...
Patient LIB193273...
Patient LIB193274...
Patient LIB193276...
Patient LIB193277...
Patient LIB193278...
Patient LIB193279...
Patient LIB193280...
Patient LIB193281...
Patient LIB193282...
Patient LIB193283...
Patient LIB193284...
Patient LIB193302...
Patient LIB193303...
Patient LIB193304...
Patient LIB193307...
Patient LIB193308...
Patient LIB193309...
Patient LIB193310...
Patient LIB193311...
Patient LIB193312...
Patient LIB193313...
Patient LIB193314...
Patient LIB193315...
Patient LIB193317...
Patient LIB193318...
Patient LIB193319...
Patient LIB193320...
Patient LIB193324...
Patient LIB193325...
Patient LIB193326...
Patient LIB193328...
Patient LIB193330...
Patient LIB193332...
Patient LIB193333...
Patient LIB193334...
Patient LIB193335...
Patient LIB193337...
Patient LIB193338...
Patient LIB193340...
Patient LIB19

In [21]:
results[0]

Unnamed: 0_level_0,Patient,Generated Windows,Wasted windows,Lost Data,Patients Size
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
LIB193263,LIB193263,48491,7445,13.307475,55946
LIB193264,LIB193264,20782,35149,62.832270,55941
LIB193265,LIB193265,31957,67177,67.758367,99142
LIB193266,LIB193266,37737,7937,17.373698,45684
LIB193267,LIB193267,42918,12891,23.094287,55819
...,...,...,...,...,...
LIB193418,LIB193418,10946,1491,11.978790,12447
LIB193419,LIB193419,19091,4595,19.391458,23696
LIB193420,LIB193420,21657,2350,9.784736,24017
LIB193423,LIB193423,8686,3750,30.130162,12446


In [22]:
results[1]

Unnamed: 0_level_0,Patient,Generated Windows,Wasted windows,Lost Data,Patients Size
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
LIB193263,LIB193263,47250,8684,15.522111,55946
LIB193264,LIB193264,20282,35647,63.722493,55941
LIB193265,LIB193265,30946,68188,68.778116,99142
LIB193266,LIB193266,37101,8571,18.761492,45684
LIB193267,LIB193267,41612,14195,25.430409,55819
...,...,...,...,...,...
LIB193418,LIB193418,10726,1709,13.730216,12447
LIB193419,LIB193419,18561,5123,21.619683,23696
LIB193420,LIB193420,21354,2651,11.038015,24017
LIB193423,LIB193423,8564,3870,31.094327,12446


In [23]:
results[2]

Unnamed: 0_level_0,Patient,Generated Windows,Wasted windows,Lost Data,Patients Size
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
LIB193263,LIB193263,46018,9914,17.720659,55946
LIB193264,LIB193264,19787,36140,64.603779,55941
LIB193265,LIB193265,29948,69186,69.784753,99142
LIB193266,LIB193266,36474,9197,20.131775,45684
LIB193267,LIB193267,40313,15492,27.753991,55819
...,...,...,...,...,...
LIB193418,LIB193418,10513,1920,15.425404,12447
LIB193419,LIB193419,18037,5645,23.822586,23696
LIB193420,LIB193420,21054,2949,12.278803,24017
LIB193423,LIB193423,8442,3990,32.058493,12446


In [24]:
results[3]

Unnamed: 0_level_0,Patient,Generated Windows,Wasted windows,Lost Data,Patients Size
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
LIB193263,LIB193263,44832,11098,19.836986,55946
LIB193264,LIB193264,19302,36623,65.467189,55941
LIB193265,LIB193265,28959,70175,70.782312,99142
LIB193266,LIB193266,35853,9818,21.491113,45684
LIB193267,LIB193267,39020,16783,30.066823,55819
...,...,...,...,...,...
LIB193418,LIB193418,10307,2124,17.064353,12447
LIB193419,LIB193419,17518,6162,26.004389,23696
LIB193420,LIB193420,20758,3243,13.502935,24017
LIB193423,LIB193423,8320,4110,33.022658,12446


# Training