In [8]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error


pd.set_option('display.float_format', lambda x: '%.6f' % x)
pd.options.mode.chained_assignment = None  # default='warn'

In [5]:
def get_patient_ids(df=None):
    """

    :param df_glucose:
    :return: List of patient ids
    """
    if df is None:
        df = read_dataset()
    return df['Patient_ID'].unique()

In [3]:
# Reading complete dataset and creating cols lagged for time and measurements
df_glucose = pd.read_csv('Data_preprocessed.csv',
                      dtype={'Glucose_measurements': int})
complete_df_glucose = df_glucose.copy()


In [4]:
df_glucose

Unnamed: 0,Patient_ID,t,Measurement,Minutes_diff,Measurement-1,Measurement_diff,Interpolated
0,LIB193263,2020-06-09 19:15:00,92.000000,,,,0
1,LIB193263,2020-06-09 19:30:00,86.000000,15.000000,92.000000,6.000000,0
2,LIB193263,2020-06-09 19:45:00,85.000000,15.000000,86.000000,1.000000,0
3,LIB193263,2020-06-09 20:00:00,85.000000,15.000000,85.000000,0.000000,0
4,LIB193263,2020-06-09 20:15:00,87.000000,15.000000,85.000000,2.000000,0
...,...,...,...,...,...,...,...
4090741,LIB193424,2022-01-02 01:00:00,207.000000,15.000000,186.000000,21.000000,0
4090742,LIB193424,2022-01-02 01:15:00,215.000000,15.000000,207.000000,8.000000,0
4090743,LIB193424,2022-01-02 01:30:00,218.000000,15.000000,215.000000,3.000000,0
4090744,LIB193424,2022-01-02 01:45:00,222.000000,15.000000,218.000000,4.000000,0


In [31]:
def missing_values(subset):
    return subset['Measurement'].isna().sum()

def interpolate(subset):
     subset_interpolate = subset.apply(lambda sample: sample.interpolate(method="polynomial", order=2))
     # Fixing values out of range
     subset_interpolate[(subset_interpolate["Measurement"] < 40)] = 40
     subset_interpolate[(subset_interpolate["Measurement"] > 500)] = 500

     return subset_interpolate

def linear_model(train, test, perc):
    train_split = round(train.shape[0] * perc)
    test_split = round(test.shape[0] * perc)
    x_train, y_train = train.iloc[0:train_split], train.iloc[train_split, train.shape[0]]
    x_test, y_test = test.iloc[0:test_split], test.iloc[test_split, test.shape[0]]

    model = LinearRegression().fit(x_train, y_train)
    y_pred = model.predict(x_test)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)

    return rmse

def non_anchored_walk_forward_optimization(data, lookback_samples, validation_samples, na_tolerance, anchored = False):

    patients = get_patient_ids(data)    # Getting ids of patients
    start = 0
    wasted_data = 0
    windows_patients = []
    error_patients = []

    for patient in patients:
        print(f"Patient {patient}...")
        windows_patient = 0
        data_patient = data[data['Patient_ID'] == patient]
        error_patient = []
        for s in range(lookback_samples, data_patient.shape[0] - validation_samples, lookback_samples):
            if not anchored:
                train_subset = data_patient.iloc[s-lookback_samples : s]
            else:
                train_subset = data_patient.iloc[start: s]
            validation_subset = data_patient.iloc[s : s + validation_samples]
            train_subset_missing_values = missing_values(train_subset)
            validation_subset_missing_values = missing_values(validation_subset)

            if train_subset_missing_values > na_tolerance or validation_subset_missing_values > 0:      #No missing values allowed in validation subset
                wasted_data = wasted_data + lookback_samples
                if anchored:
                    start = s
                continue

            if 0 < train_subset_missing_values <= na_tolerance:
                train_subset = interpolate(train_subset)

            #error_patient.append(linear_model(train_subset['Measurement'],validation_subset['Measurement'],0.8))
            windows_patient = windows_patient + 1



        error_patients.append(np.average(error_patient))
        windows_patients.append(windows_patient)
        return [error_patients, windows_patients, wasted_data]

In [32]:
results = non_anchored_walk_forward_optimization(df_glucose, lookback_samples=4, validation_samples=2, na_tolerance=1, anchored=False)

Patient LIB193263...


  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)


In [33]:
results

[[nan], [14471], 4272]

In [34]:
results = non_anchored_walk_forward_optimization(df_glucose, lookback_samples=4, validation_samples=2, na_tolerance=1, anchored=True)

Patient LIB193263...


  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)


In [35]:
results

[[nan], [14420], 4476]

In [28]:
df_glucose[df_glucose["Patient_ID"] == "LIB193263"]

Unnamed: 0,Patient_ID,t,Measurement,Minutes_diff,Measurement-1,Measurement_diff,Interpolated
0,LIB193263,2020-06-09 19:15:00,92.000000,,,,0
1,LIB193263,2020-06-09 19:30:00,86.000000,15.000000,92.000000,6.000000,0
2,LIB193263,2020-06-09 19:45:00,85.000000,15.000000,86.000000,1.000000,0
3,LIB193263,2020-06-09 20:00:00,85.000000,15.000000,85.000000,0.000000,0
4,LIB193263,2020-06-09 20:15:00,87.000000,15.000000,85.000000,2.000000,0
...,...,...,...,...,...,...,...
62157,LIB193263,2022-03-19 06:30:00,124.000000,15.000000,123.000000,1.000000,0
62158,LIB193263,2022-03-19 06:45:00,124.000000,15.000000,124.000000,0.000000,0
62159,LIB193263,2022-03-19 07:00:00,118.000000,15.000000,124.000000,6.000000,0
62160,LIB193263,2022-03-19 07:15:00,110.000000,15.000000,118.000000,8.000000,0
