In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error


pd.set_option('display.float_format', lambda x: '%.6f' % x)
pd.options.mode.chained_assignment = None  # default='warn'

In [2]:
def get_patient_ids(df=None):
    """

    :param df_glucose:
    :return: List of patient ids
    """
    if df is None:
        df = read_dataset()
    return df['Patient_ID'].unique()

def patients_size(df):
    return df.groupby('Patient_ID').size()

In [3]:
# Reading complete dataset and creating cols lagged for time and measurements
df_glucose = pd.read_csv('Data_preprocessed.csv',
                      dtype={'Glucose_measurements': int})
complete_df_glucose = df_glucose.copy()


In [4]:
df_glucose

Unnamed: 0.1,Unnamed: 0,t,Patient_ID,Measurement,Minutes_diff,Measurement-1,Measurement_diff
0,0,2020-06-09 19:00:00,LIB193263,99.000000,,,
1,1,2020-06-09 19:15:00,LIB193263,92.000000,15.000000,99.000000,7.000000
2,2,2020-06-09 19:30:00,LIB193263,86.000000,15.000000,92.000000,6.000000
3,3,2020-06-09 19:45:00,LIB193263,85.000000,15.000000,86.000000,1.000000
4,4,2020-06-09 20:00:00,LIB193263,85.000000,15.000000,85.000000,0.000000
...,...,...,...,...,...,...,...
4090906,4090906,2022-01-02 01:00:00,LIB193424,207.000000,15.000000,186.000000,21.000000
4090907,4090907,2022-01-02 01:15:00,LIB193424,215.000000,15.000000,207.000000,8.000000
4090908,4090908,2022-01-02 01:30:00,LIB193424,218.000000,15.000000,215.000000,3.000000
4090909,4090909,2022-01-02 01:45:00,LIB193424,222.000000,15.000000,218.000000,4.000000


In [5]:
# Dropping unnecessary cols:
df_glucose = df_glucose.drop("Minutes_diff", axis=1)
df_glucose = df_glucose.drop("Measurement-1", axis=1)
df_glucose = df_glucose.drop("Measurement_diff", axis=1)
df_glucose = df_glucose.drop("Unnamed: 0", axis=1)
df_glucose

Unnamed: 0,t,Patient_ID,Measurement
0,2020-06-09 19:00:00,LIB193263,99.000000
1,2020-06-09 19:15:00,LIB193263,92.000000
2,2020-06-09 19:30:00,LIB193263,86.000000
3,2020-06-09 19:45:00,LIB193263,85.000000
4,2020-06-09 20:00:00,LIB193263,85.000000
...,...,...,...
4090906,2022-01-02 01:00:00,LIB193424,207.000000
4090907,2022-01-02 01:15:00,LIB193424,215.000000
4090908,2022-01-02 01:30:00,LIB193424,218.000000
4090909,2022-01-02 01:45:00,LIB193424,222.000000


In [6]:
df_glucose[df_glucose['Patient_ID'] == 'LIB193263'].shape[0]

62163

In [21]:
def missing_values_count(subset):
    return subset['Measurement'].isna().sum()

def last_missing_value_pos(set):
    """

    :return: Return the index + 1 of the last missing value of the subset,
                    -1 if no missing values
    """
    missing_indexes = np.where(np.isnan(set))[0]

    return missing_indexes[-1] + 1 if len(missing_indexes) > 0 else -1

def interpolate(subset):
     subset_interpolate = subset.apply(lambda sample: sample.interpolate(method="polynomial", order=2))
     # Fixing values out of range
     subset_interpolate[(subset_interpolate["Measurement"] < 40)] = 40
     subset_interpolate[(subset_interpolate["Measurement"] > 500)] = 500

     return subset_interpolate

def linear_model(train, test, perc):
    train_split = round(train.shape[0] * perc)
    test_split = round(test.shape[0] * perc)
    x_train, y_train = train.iloc[0:train_split], train.iloc[train_split, train.shape[0]]
    x_test, y_test = test.iloc[0:test_split], test.iloc[test_split, test.shape[0]]

    model = LinearRegression().fit(x_train, y_train)
    y_pred = model.predict(x_test)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)

    return rmse

def non_anchored_walk_forward_optimization(data, lookback_samples, validation_samples, na_tolerance, anchored = False):

    patients = get_patient_ids(data)    # Getting ids of patients
    start = 0
    wasted_data = 0
    windows_patients = []
    error_patients = []

    for patient in patients:
        print(f"Patient {patient}...")
        windows_patient = 0
        data_patient = data[data['Patient_ID'] == patient]
        error_patient = []
        for s in range(lookback_samples, data_patient.shape[0] - validation_samples, lookback_samples):
            if not anchored:
                train_subset = data_patient.iloc[s-lookback_samples : s]
            else:
                train_subset = data_patient.iloc[start: s]
            validation_subset = data_patient.iloc[s : s + validation_samples]
            train_subset_missing_values = missing_values_count(train_subset)
            validation_subset_missing_values = missing_values_count(validation_subset)

            if train_subset_missing_values > na_tolerance or validation_subset_missing_values > 0:      #No missing values allowed in validation subset
                wasted_data = wasted_data + lookback_samples
                if anchored:
                    start = s
                continue

            if 0 < train_subset_missing_values <= na_tolerance:
                train_subset = interpolate(train_subset)

            #error_patient.append(linear_model(train_subset['Measurement'],validation_subset['Measurement'],0.8))
            windows_patient = windows_patient + 1



        error_patients.append(np.average(error_patient))
        windows_patients.append(windows_patient)
        return [error_patients, windows_patients, wasted_data]


def one_step_walk_forward(data, lookback_samples, test_samples):
    """
    First strategy tested. Walk forward with a window rolling 1 sample each time.
    :param data:
    :param lookback_samples: Number of samples used to predict
    :param test_samples: Prediction window (normally 15 or 30 minutes)
    :return:
    """

    patients = get_patient_ids(data)    # Getting ids of patients

    wasted_windows_patients = []
    windows_patients        = []
    error_patients          = []

    for patient in patients:
        print(f"Patient {patient}...")
        windows_patient         = 0
        wasted_windows_patient  = 0
        error_patient           = []

        data_patient = data[data['Patient_ID'] == patient]['Measurement'].values

        s = lookback_samples
        while s < data_patient.shape[0] - test_samples:
            train_subset = data_patient[s-lookback_samples : s]
            validation_subset = data_patient[s : s + test_samples]

            window = np.concatenate((train_subset, validation_subset))
            missing_value_pos = last_missing_value_pos(window)
            if missing_value_pos != -1:
                wasted_windows_patient = wasted_windows_patient + missing_value_pos
                s = s + missing_value_pos
                continue

            #error_patient.append(linear_model(train_subset['Measurement'],validation_subset['Measurement'],0.8))
            windows_patient = windows_patient + 1

            s = s + 1

        #error_patients.append(np.average(error_patient))
        windows_patients.append(windows_patient)
        wasted_windows_patients.append(wasted_windows_patient)

    return [error_patients, windows_patients, wasted_windows_patients]

errors, windows, wasted_windows = one_step_walk_forward(df_glucose, lookback_samples=14, test_samples=2)

Patient LIB193263...
Patient LIB193264...
Patient LIB193265...
Patient LIB193266...
Patient LIB193267...
Patient LIB193268...
Patient LIB193269...
Patient LIB193272...
Patient LIB193273...
Patient LIB193274...
Patient LIB193276...
Patient LIB193277...
Patient LIB193278...
Patient LIB193279...
Patient LIB193280...
Patient LIB193281...
Patient LIB193282...
Patient LIB193283...
Patient LIB193284...
Patient LIB193302...
Patient LIB193303...
Patient LIB193304...
Patient LIB193307...
Patient LIB193308...
Patient LIB193309...
Patient LIB193310...
Patient LIB193311...
Patient LIB193312...
Patient LIB193313...
Patient LIB193314...
Patient LIB193315...
Patient LIB193317...
Patient LIB193318...
Patient LIB193319...
Patient LIB193320...
Patient LIB193324...
Patient LIB193325...
Patient LIB193326...
Patient LIB193328...
Patient LIB193330...
Patient LIB193332...
Patient LIB193333...
Patient LIB193334...
Patient LIB193335...
Patient LIB193337...
Patient LIB193338...
Patient LIB193340...
Patient LIB19

In [22]:
def get_stats(df, generated_windows, _wasted_windows):
    return pd.DataFrame({
        'Patient': get_patient_ids(df),
        'Generated Windows': generated_windows,
        'Wasted windows': _wasted_windows,
        "Lost Data": _wasted_windows / patients_size(df) * 100
    })

In [23]:
get_stats(df_glucose, windows, wasted_windows)

Unnamed: 0_level_0,Patient,Generated Windows,Wasted windows,Lost Data
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
LIB193263,LIB193263,49602,12545,20.180815
LIB193264,LIB193264,22121,40020,64.385347
LIB193265,LIB193265,37298,72844,66.126836
LIB193266,LIB193266,38829,11916,23.474715
LIB193267,LIB193267,43684,18322,29.541131
...,...,...,...,...
LIB193418,LIB193418,11488,2326,16.818510
LIB193419,LIB193419,19731,6582,24.999050
LIB193420,LIB193420,23116,3554,13.317845
LIB193423,LIB193423,9507,4306,31.137465


# Note: The number of windows + wasted windows in the one step approach is not the the same as the number of samples
#bc of the needed samples before the training and for the testing

In [24]:
# Experiment: Different input lenghts based on "Deep Residual Time-Series Forecasting:
#Application to Blood Glucose Prediction" criteria
lookbacks = [8, 10, 12, 14]   # 2h, 2.5h, 3h, 3.5h
results = []
for lookback in lookbacks:
    _, windows, wasted_windows = one_step_walk_forward(df_glucose, lookback_samples=lookback, test_samples=2)
    results.append(get_stats(df_glucose, windows, wasted_windows))
    print()

Patient LIB193263...
Patient LIB193264...
Patient LIB193265...
Patient LIB193266...
Patient LIB193267...
Patient LIB193268...
Patient LIB193269...
Patient LIB193272...
Patient LIB193273...
Patient LIB193274...
Patient LIB193276...
Patient LIB193277...
Patient LIB193278...
Patient LIB193279...
Patient LIB193280...
Patient LIB193281...
Patient LIB193282...
Patient LIB193283...
Patient LIB193284...
Patient LIB193302...
Patient LIB193303...
Patient LIB193304...
Patient LIB193307...
Patient LIB193308...
Patient LIB193309...
Patient LIB193310...
Patient LIB193311...
Patient LIB193312...
Patient LIB193313...
Patient LIB193314...
Patient LIB193315...
Patient LIB193317...
Patient LIB193318...
Patient LIB193319...
Patient LIB193320...
Patient LIB193324...
Patient LIB193325...
Patient LIB193326...
Patient LIB193328...
Patient LIB193330...
Patient LIB193332...
Patient LIB193333...
Patient LIB193334...
Patient LIB193335...
Patient LIB193337...
Patient LIB193338...
Patient LIB193340...
Patient LIB19

In [25]:
results[0]

Unnamed: 0_level_0,Patient,Generated Windows,Wasted windows,Lost Data
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
LIB193263,LIB193263,53696,8457,13.604556
LIB193264,LIB193264,23951,38196,61.450842
LIB193265,LIB193265,40920,69228,62.844278
LIB193266,LIB193266,40944,9807,19.319950
LIB193267,LIB193267,47972,14040,22.637129
...,...,...,...,...
LIB193418,LIB193418,12187,1633,11.807664
LIB193419,LIB193419,21422,4897,18.599263
LIB193420,LIB193420,24108,2568,9.623023
LIB193423,LIB193423,9927,3892,28.143756


In [26]:
results[1]

Unnamed: 0_level_0,Patient,Generated Windows,Wasted windows,Lost Data
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
LIB193263,LIB193263,52307,9844,15.835787
LIB193264,LIB193264,23333,38812,62.441881
LIB193265,LIB193265,39701,70445,63.949055
LIB193266,LIB193266,40232,10517,20.718662
LIB193267,LIB193267,46536,15474,24.949212
...,...,...,...,...
LIB193418,LIB193418,11947,1871,13.528561
LIB193419,LIB193419,20852,5465,20.756580
LIB193420,LIB193420,23773,2901,10.870869
LIB193423,LIB193423,9787,4030,29.141659


In [27]:
results[2]

Unnamed: 0_level_0,Patient,Generated Windows,Wasted windows,Lost Data
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
LIB193263,LIB193263,50930,11219,18.047713
LIB193264,LIB193264,22722,39421,63.421658
LIB193265,LIB193265,38495,71649,65.042031
LIB193266,LIB193266,39528,11219,22.101613
LIB193267,LIB193267,45107,16901,27.250008
...,...,...,...,...
LIB193418,LIB193418,11714,2102,15.198843
LIB193419,LIB193419,20288,6027,22.891109
LIB193420,LIB193420,23441,3231,12.107472
LIB193423,LIB193423,9647,4168,30.139562


In [28]:
results[3]

Unnamed: 0_level_0,Patient,Generated Windows,Wasted windows,Lost Data
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
LIB193263,LIB193263,49602,12545,20.180815
LIB193264,LIB193264,22121,40020,64.385347
LIB193265,LIB193265,37298,72844,66.126836
LIB193266,LIB193266,38829,11916,23.474715
LIB193267,LIB193267,43684,18322,29.541131
...,...,...,...,...
LIB193418,LIB193418,11488,2326,16.818510
LIB193419,LIB193419,19731,6582,24.999050
LIB193420,LIB193420,23116,3554,13.317845
LIB193423,LIB193423,9507,4306,31.137465


# Training

In [20]:
from keras import *
import os

def get_model(past_steps):

    out_dim = int(past_steps[0] / 3)
    this_index = 0
    index = [this_index]
    for past_step in past_steps:
        this_index += past_step
        index.append(this_index)

    input_all = layers.Input(name='input_all', shape=(sum(past_steps), 2))

    # Only one feature: measurement
    feature_all = layers.Lambda(lambda x: x[:, index[1] - out_dim:index[1], 1])(input_all)
    assert feature_all.shape[1] == out_dim
    feature_all = layers.Reshape((-1, 1))(feature_all)

    locals()["feature0scale2"] = layers.Lambda(
        lambda x: x[:, index[1] - 2 * out_dim + 1:index[1]:2, 1])(input_all)
    assert locals()["feature0scale2"].shape[1] == out_dim
    locals()["feature0scale2"] = layers.Reshape((-1, 1))(locals()["feature0scale2"])
    locals()["feature0scale2"] = layers.CuDNNLSTM(units=out_dim)(locals()["feature0scale2"])
    locals()["feature0scale2"] = layers.Reshape((-1, 1))(locals()["feature0scale2"])

    locals()["feature0scale3"] = layers.Lambda(lambda x: x[:, index[1] - 3 * out_dim + 2:index[1]:3, 1])(input_all)
    assert locals()["feature0scale3"].shape[1] == out_dim
    locals()["feature0scale3"] = layers.Reshape((-1, 1))(locals()["feature0scale3"])
    locals()["feature0scale3"] = layers.CuDNNLSTM(units=out_dim)(locals()["feature0scale3"])
    locals()["feature0scale3"] = layers.Reshape((-1, 1))(locals()["feature0scale3"])

    locals()["feature0scaleall"] = layers.Concatenate(axis=2)([locals()["feature0scale2"], locals()["feature0scale3"]])
    locals()["feature0scaleall"] = layers.CuDNNLSTM(units=out_dim)(locals()["feature0scaleall"])
    locals()["feature0scaleall"] = layers.Reshape((-1, 1))(locals()["feature0scaleall"])

    feature_all = layers.Concatenate(axis=2)([feature_all, locals()["feature0scaleall"]])

    feature_all = layers.CuDNNLSTM(units=256)(feature_all)
    feature_all = layers.Dense(units=256, activation='relu')(feature_all)
    feature_all = layers.Dropout(rate=0.2)(feature_all)
    output_all = layers.Dense(units=1, activation='linear')(feature_all)

    model = keras.models.Model(inputs=[input_all], outputs=[output_all])
    return model




def train(model, x_train, y_train, x_valid, y_valid, batch_size, epochs,
        patience, shuffle, artifacts_path, learning_rate, decay):

    def optimizer(learning_rate, decay):
        return keras.optimizers.Adam(learning_rate=learning_rate, decay=decay)

    def root_mean_squared_error(y_true, y_pred):
        return K.sqrt(K.mean(K.square(y_pred - y_true)))

    def loss_function():
        return root_mean_squared_error

    history = model.compile(optimizer=optimizer(learning_rate,decay), loss=loss_function())
    history = model.fit(
        x_train,
        y_train,
        #validation_data = (x_valid, y_valid),
        epochs          = epochs,
        batch_size      = batch_size,
        shuffle         = shuffle,
        verbose         = 0,
        callbacks       = [
            keras.callbacks.EarlyStopping(
                monitor  = 'val_loss',
                patience = patience,
                mode     = 'min'
            ),
            keras.callbacks.TensorBoard(
                log_dir=artifacts_path
            ),
            keras.callbacks.ModelCheckpoint(
                filepath       = os.path.join(artifacts_path, "test.hdf5".format()),
                monitor        = 'val_loss',
                mode           = 'min',
                save_best_only = True,
                save_freq         = 1
            )
        ]
    )
    print("training successful")
    weights_path = os.path.join(artifacts_path, "weights_path-model.hdf5".format())
    print("saving weights: {}".format(weights_path))
    return model

def LSTM(model, x_train, y_train, x_valid, y_valid):
    # Model
    batch_size = 1024
    epochs = 5000
    patience = 300
    shuffle = True
    seed = 0

    # Optimizer
    learning_rate = 1e-4
    decay = 0.0

    # Training
    artifacts_path = "../artifacts/test"

    train(model, x_train, y_train, x_valid, y_valid, batch_size, epochs,
        patience, shuffle, artifacts_path, learning_rate, decay)



def one_step_walk_forward_training(data, lookback_samples, test_samples):
    """
    First strategy tested. Walk forward with a window rolling 1 sample each time.
    :param data:
    :param lookback_samples: Number of samples used to predict
    :param test_samples: Prediction window (normally 15 or 30 minutes)
    :return:
    """

    model = get_model([lookback_samples])  #Getting the prediction model
    patients = get_patient_ids(data)    # Getting ids of patients

    wasted_windows_patients = []
    windows_patients        = []
    evaluation_patients     = []

    for patient in patients:
        print(f"Patient {patient}...")
        windows_patient         = 0
        wasted_windows_patient  = 0
        evaluation_patient      = []

        data_patient = data[data['Patient_ID'] == patient]['Measurement'].values

        s = lookback_samples
        while s < data_patient.shape[0] - test_samples:
            x_train_subset = data_patient[s-lookback_samples : s]
            y_train_subset = data_patient[s : s + test_samples]

            window = np.concatenate((x_train_subset, y_train_subset))
            missing_value_pos = last_missing_value_pos(window)
            if missing_value_pos != -1:
                wasted_windows_patient = wasted_windows_patient + missing_value_pos
                s = s + missing_value_pos
                continue

            evaluation_patient.append(LSTM(model, x_train_subset, y_train_subset, [], [],
                                           ))
            windows_patient = windows_patient + 1

            s = s + 1

        evaluation_patients.append(np.average(evaluation_patient))
        windows_patients.append(windows_patient)
        wasted_windows_patients.append(wasted_windows_patient)

    return [evaluation_patients, windows_patients, wasted_windows_patients]

errors, windows, wasted_windows = one_step_walk_forward_training(df_glucose, lookback_samples=14, test_samples=2)

ImportError: cannot import name 'layers' from partially initialized module 'keras' (most likely due to a circular import) (C:\Users\Mario\anaconda3\lib\site-packages\keras\__init__.py)