## Imports

In [162]:
import pyswarms as ps
import joblib
import pandas as pd
import numpy as np
import lightgbm as lgb
import os
import pyswarms as ps
from sklearn.metrics import mean_squared_error
import warnings

# Suppress FutureWarnings
warnings.filterwarnings("ignore", category=FutureWarning)

## Parameters

In [163]:
callbacks = [lgb.early_stopping(stopping_rounds=10, first_metric_only=True, verbose=False)]
options = {'c1': 0.5, 'c2': 0.3, 'w': 0.9}
bounds = (
    np.array([0.01, 0.01, 0, 0]), 
    np.array([0.1, 0.1, 1, 1])
)

num_feature_transformation_parameters = len(bounds[0])
iters = 10
n_particles = 30
patients = ['001', '002', '004', '006', '007', '008']
feature_names = ['simple_sugars', 'complex_sugars', 'proteins', 'fats', 'dietary_fibers', 'weight', 'fast_insulin', 'slow_insulin']

val_test_split = 0.7
verbose = 0

model_param_bounds = (
    np.array([0, 0]), 
    np.array([10, 10])
)

## Functions

In [164]:
def preprocess_data(patient, food_data, prediction_horizon):
    glucose_data = pd.read_csv(f"diabetes_subset_pictures-glucose-food-insulin/{patient}/glucose.csv")
    insulin_data = pd.read_csv(f"diabetes_subset_pictures-glucose-food-insulin/{patient}/insulin.csv")
    food_data = pd.read_csv(f"{food_data}_food_data/food_data_{patient}.csv")

    glucose_data["datetime"] = pd.to_datetime(glucose_data["date"] + ' ' + glucose_data["time"])
    glucose_data.drop(['type', 'comments', 'date', 'time'], axis=1, inplace=True)
    glucose_data['glucose'] *= 18.0182
    insulin_data["datetime"] = pd.to_datetime(insulin_data["date"] + ' ' + insulin_data["time"])
    insulin_data.drop(['comment', 'date', 'time'], axis=1, inplace=True)
    food_data['datetime'] = pd.to_datetime(food_data['datetime'], format='%Y:%m:%d %H:%M:%S')
    food_data = food_data[['datetime', 'simple_sugars', 'complex_sugars', 'proteins', 'fats', 'dietary_fibers', 'weight']]

    combined_data = pd.concat([food_data, insulin_data]).sort_values('datetime').reset_index(drop=True)
    combined_data.fillna(0, inplace=True)
    glucose_data['glucose_next'] = glucose_data['glucose'].shift(-prediction_horizon)
    glucose_data['glucose_change_5'] = glucose_data['glucose'] - glucose_data['glucose'].shift(1)

    glucose_data.dropna(subset=['glucose_next'], inplace=True)
    glucose_times = glucose_data['datetime'].values.astype('datetime64[s]').astype(np.int64)
    combined_times = combined_data['datetime'].values.astype('datetime64[s]').astype(np.int64)
    return glucose_data, combined_data, glucose_times, combined_times

def add_features_and_create_patient_data(params, preprocessed_data):
    patients_glucose_data = []
    for patient in patients:
        glucose_data, combined_data, glucose_times, combined_times = preprocessed_data[patient]
        
        for feature in range(len(feature_names)):
            time_diff_hours = (glucose_times[:, None] - combined_times[None, :]) / 3600
            decay_neg, decay_pos, delay, multiplier = params[feature*num_feature_transformation_parameters:(feature+1)*num_feature_transformation_parameters]
            weights = np.where(time_diff_hours < 0,
                   np.exp(-decay_neg * (np.abs(time_diff_hours) + delay) ** 2),
                   np.exp(-decay_pos * (np.abs(time_diff_hours) + delay) ** 2))
            glucose_data[feature_names[feature]] = np.dot(weights, combined_data.iloc[:, feature + 1].values) * multiplier
        glucose_data['patient'] = patient
        patients_glucose_data.append(glucose_data)
    patients_glucose_data = pd.concat(patients_glucose_data)
    patients_glucose_data.dropna(inplace=True)
    return patients_glucose_data

def get_lgb_params(params):
    return {
        'subsample': 0.2,
        'max_depth': 3,
        'reg_alpha': params[-2],
        'reg_lambda': params[-1],
        'objective': 'regression',
        'metric': 'rmse',
        'verbosity': -1
    }

def compute_rmse(params, test_patient, preprocessed_data):
    patients_glucose_data = add_features_and_create_patient_data(params[:-len(model_param_bounds[0])], preprocessed_data)
    train = patients_glucose_data[patients_glucose_data['patient'] != test_patient]
    test_patient_data = patients_glucose_data[patients_glucose_data['patient'] == test_patient]
    val = test_patient_data.iloc[:int(len(test_patient_data) * val_test_split)]

    lgb_params = get_lgb_params(params)
    model = lgb.LGBMRegressor(**lgb_params)
    
    X_train, y_train = train.drop(['glucose_next', 'datetime', 'patient'], axis=1), train['glucose_next']
    X_val, y_val = val.drop(['glucose_next', 'datetime', 'patient'], axis=1), val['glucose_next']
    model.fit(X_train, y_train, eval_set=[(X_val, y_val)], eval_metric='rmse', callbacks=callbacks)
    
    y_preds = model.predict(X_val)
    
    return np.sqrt(mean_squared_error(y_val, y_preds))

def objective(x, test_patient, preprocessed_data):
    return np.apply_along_axis(compute_rmse, 1, x, test_patient, preprocessed_data)

## GPT4o

In [175]:
approaches = ['gpt4o', 'gpt4', 'llava', 'nollm']
prediction_horizons = [3, 6, 9, 12]

rmse_df = pd.DataFrame(columns=['Approach', 'Prediction Horizon', 'Patient', 'RMSE'])


for approach in approaches:
    print(approach)
    for prediction_horizon in prediction_horizons:
        if approach == 'nollm':
            feature_names = ['fast_insulin', 'slow_insulin']
            preprocessed_data = {patient: preprocess_data(patient, 'gpt4o', prediction_horizon) for patient in patients}

        else:
            feature_names = ['simple_sugars', 'complex_sugars', 'proteins', 'fats', 'dietary_fibers', 'weight', 'fast_insulin', 'slow_insulin']
            preprocessed_data = {patient: preprocess_data(patient, approach, prediction_horizon) for patient in patients}
        print(prediction_horizon)
        
        for test_patient in patients:
            optimizer = ps.single.GlobalBestPSO(
                n_particles=n_particles,
                dimensions=len(bounds[0]) * len(feature_names) + len(model_param_bounds[0]),  
                options=options,
                bounds=(np.concatenate([np.tile(bounds[0], len(feature_names)), model_param_bounds[0]]), 
                        np.concatenate([np.tile(bounds[1], len(feature_names)), model_param_bounds[1]]))
            )
            cost, params = optimizer.optimize(objective, iters=iters, verbose=verbose, test_patient=test_patient, preprocessed_data=preprocessed_data)
            patients_glucose_data = add_features_and_create_patient_data(params[:-len(model_param_bounds[0])], preprocessed_data)
            train = patients_glucose_data[patients_glucose_data['patient'] != test_patient]
            test_patient_data = patients_glucose_data[patients_glucose_data['patient'] == test_patient]
            val = test_patient_data.iloc[:int(len(test_patient_data) * val_test_split)]
            test = test_patient_data.iloc[int(len(test_patient_data) * val_test_split):]

            lgb_params = get_lgb_params(params)
            model = lgb.LGBMRegressor(**lgb_params)

            X_train, y_train = train.drop(['glucose_next', 'datetime', 'patient'], axis=1), train['glucose_next']
            X_val, y_val = val.drop(['glucose_next', 'datetime', 'patient'], axis=1), val['glucose_next']
            X_test, y_test = test.drop(['glucose_next', 'datetime', 'patient'], axis=1), test['glucose_next']

            model.fit(X_train, y_train, eval_set=[(X_val, y_val)], eval_metric='rmse', callbacks=callbacks)

            # Ensure the model directory exists
            model_dir = f"models/{approach}"
            os.makedirs(model_dir, exist_ok=True)
            joblib.dump(model, f"{model_dir}/lightgbm_model_{test_patient}.joblib")

            y_preds = model.predict(X_test)
            rmse = np.sqrt(mean_squared_error(y_test, y_preds))

            # Create a new DataFrame for the current RMSE entry
            new_entry = pd.DataFrame({
                'Approach': [approach],
                'Prediction Horizon': [prediction_horizon],
                'Patient': [test_patient],
                'RMSE': [rmse]
            })

            # Concatenate the new entry with the existing RMSE DataFrame
            rmse_df = pd.concat([rmse_df, new_entry], ignore_index=True)

            # Ensure the data directory exists
            data_dir = f"data/{approach}/{prediction_horizon}"
            os.makedirs(data_dir, exist_ok=True)
            patients_glucose_data.to_csv(f"{data_dir}/patients_glucose_data_{test_patient}.csv", index=False)
            print(f"Test patient: {test_patient}, Best RMSE: {rmse}")

rmse_df.to_csv('results.csv', index=False)

nollm
3
Test patient: 001, Best RMSE: 11.00577893138643
Test patient: 002, Best RMSE: 12.258801828243694
Test patient: 004, Best RMSE: 18.28055980776433
Test patient: 006, Best RMSE: 6.445553634118173
Test patient: 007, Best RMSE: 7.737032232580883
Test patient: 008, Best RMSE: 11.346298860360365
6
Test patient: 001, Best RMSE: 22.40592805078651
Test patient: 002, Best RMSE: 27.759279350668894
Test patient: 004, Best RMSE: 35.739213390933195
Test patient: 006, Best RMSE: 14.39703762435974
Test patient: 007, Best RMSE: 14.95088813639324
Test patient: 008, Best RMSE: 22.954353871557096
9
Test patient: 001, Best RMSE: 30.46107143281547
Test patient: 002, Best RMSE: 44.68972567696052
Test patient: 004, Best RMSE: 53.03488842336938
Test patient: 006, Best RMSE: 22.27193014272193
Test patient: 007, Best RMSE: 19.551810583608685
Test patient: 008, Best RMSE: 33.82821174442868
12
Test patient: 001, Best RMSE: 37.741874517472624
Test patient: 002, Best RMSE: 60.51668404541516
Test patient: 004,