# Preprocessing

## Check which patients have 5-minute and 15-minute blood glucose (bg) measurements

In [1]:
import os
import pandas as pd

from src.features.tuners import XGBHyperparameterTuner

In [2]:
# Raw training set, located four directory levels above the working directory.
train_data_file = os.path.join(
    *(['..'] * 4),
    'data', 'raw', 'train.csv',
)
# The first CSV column becomes the row index; low_memory=False turns off
# chunked parsing of the file.
train_data = pd.read_csv(
    filepath_or_buffer=train_data_file,
    index_col=0,
    low_memory=False,
)

In [3]:
# Distinct patient identifiers found in the training set, in order of first appearance.
patient_ids_train_data = pd.unique(train_data['p_num'])
patient_ids_train_data

array(['p01', 'p02', 'p03', 'p04', 'p05', 'p06', 'p10', 'p11', 'p12'],
      dtype=object)

In [4]:
# Interim test set, located four directory levels above the working directory.
test_data_file = os.path.join(
    *(['..'] * 4),
    'data', 'interim', 'all_test_2h.csv',
)
# No index column is specified here, so pandas assigns a default integer index.
test_data = pd.read_csv(filepath_or_buffer=test_data_file, low_memory=False)
test_data

Unnamed: 0,p_num,time,bg-5:55,bg-5:50,bg-5:45,bg-5:40,bg-5:35,bg-5:30,bg-5:25,bg-5:20,...,activity-0:40,activity-0:35,activity-0:30,activity-0:25,activity-0:20,activity-0:15,activity-0:10,activity-0:05,activity-0:00,bg+1:00
0,p01,02:50:00,,,,,,,,,...,,,,,,,,,,11.57
1,p01,02:55:00,,,,,,,,,...,,,,,,,,,,11.40
2,p01,03:00:00,,,,,,,,,...,,,,,,,,,,11.57
3,p01,03:05:00,,,,,,,,,...,,,,,,,,,,11.73
4,p01,03:10:00,,,,,,,,,...,,,,,,,,,,11.90
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133704,p24,01:50:00,,,,,,,,,...,,,,,,,,,,6.40
133705,p24,01:55:00,,,,,,,,,...,,,,,,,,,,6.40
133706,p24,02:00:00,,,,,,,,,...,,,,,,,,,,6.40
133707,p24,02:05:00,,,,,,,,,...,,,,,,,,,,6.50


In [5]:
# Distinct patient identifiers found in the test set, in order of first appearance.
patient_ids_test_data = pd.unique(test_data['p_num'])
patient_ids_test_data

array(['p01', 'p02', 'p04', 'p05', 'p06', 'p10', 'p11', 'p12', 'p15',
       'p16', 'p18', 'p19', 'p21', 'p22', 'p24'], dtype=object)

In [6]:
# Stack the training rows on top of the test rows (row-wise concatenation);
# each frame keeps its own index labels.
all_train_data = pd.concat(objs=(train_data, test_data), axis='index')
all_train_data

Unnamed: 0,p_num,time,bg-5:55,bg-5:50,bg-5:45,bg-5:40,bg-5:35,bg-5:30,bg-5:25,bg-5:20,...,activity-0:40,activity-0:35,activity-0:30,activity-0:25,activity-0:20,activity-0:15,activity-0:10,activity-0:05,activity-0:00,bg+1:00
p01_0,p01,06:10:00,,,9.6,,,9.7,,,...,,,,,,,,,,13.4
p01_1,p01,06:25:00,,,9.7,,,9.2,,,...,,,,,,,,,,12.8
p01_2,p01,06:40:00,,,9.2,,,8.7,,,...,,,,,,,,,,15.5
p01_3,p01,06:55:00,,,8.7,,,8.4,,,...,,,,,,,,,,14.8
p01_4,p01,07:10:00,,,8.4,,,8.1,,,...,,,,,,,,,,12.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133704,p24,01:50:00,,,,,,,,,...,,,,,,,,,,6.4
133705,p24,01:55:00,,,,,,,,,...,,,,,,,,,,6.4
133706,p24,02:00:00,,,,,,,,,...,,,,,,,,,,6.4
133707,p24,02:05:00,,,,,,,,,...,,,,,,,,,,6.5


In [7]:
# Every patient identifier present in the combined frame, in order of first appearance.
patient_ids = pd.unique(all_train_data['p_num'])
patient_ids

array(['p01', 'p02', 'p03', 'p04', 'p05', 'p06', 'p10', 'p11', 'p12',
       'p15', 'p16', 'p18', 'p19', 'p21', 'p22', 'p24'], dtype=object)

## Train a patient-specific model for each patient

In [None]:
import joblib
from pipelines import pipeline
from src.features.tuners import XGBHyperparameterTuner, LassoCVHyperparameterTuner

# Fit one XGBoost regressor per patient and persist each tuned model to disk.
# For every patient id the loop:
#   1. selects that patient's rows from the combined frame,
#   2. runs them through the preprocessing pipeline,
#   3. tunes hyperparameters on the 'deep' search space,
#   4. reports the RMSE, shows the tuner's chart, and dumps the best model.
for patient_id in patient_ids:
    print(f'-----------{patient_id}-----------')

    # 'p_num' is constant within a single patient's subset, so it is dropped
    # before the rows reach the pipeline.
    data = all_train_data[all_train_data['p_num'] == patient_id].drop(columns=['p_num'])
    train_data_patient = pipeline.fit_transform(data)

    # 'bg+1:00' is the prediction target; every remaining column is a feature.
    X = train_data_patient.drop(columns=['bg+1:00'])
    y = train_data_patient['bg+1:00']

    tuner = XGBHyperparameterTuner(search_space='deep')
    tuner.fit(X=X, y=y)
    print('Best hyperparameters found.')
    print(f'RMSE: {tuner.get_rmse()}')
    tuner.show_chart()

    # One model file per patient, written to the current working directory.
    joblib.dump(tuner.get_best_model(), f'XGBRegressor.{patient_id}.model.pkl')

    # NOTE(review): only the 'deep' search space is tuned here. A 'wide' tuner
    # was previously constructed at this point but never fitted or used, so it
    # has been removed; add an explicit fit/dump step if a 'wide' search is wanted.

-----------p01-----------


In [None]:
# Persist the combined frame (row index included) to the current working directory.
all_train_data.to_csv(path_or_buf='train_data.csv')