# Predict for each patient

In [1]:
# Mute warnings
import warnings

warnings.filterwarnings("ignore")

In [2]:
import pandas as pd

from data import load_data

# Load everything the notebook needs in one call. `load_data` is a project
# helper; the role of each returned object is inferred from its name and from
# how later cells use it (frames filtered by 'p_num', an iterable of patient
# ids) — TODO confirm against data.py.
(
    train_data,
    additional_train_data,
    test_data,
    unique_patients,
) = load_data()

In [3]:
from pipelines import pipeline

# Build one bundle of train/test frames per patient.
# For every patient id the shared `pipeline` is fitted on that patient's own
# training rows (main + additional training data) and then applied to that
# patient's test rows.
data = {}
for p_num in unique_patients:
    train_data_patient = pd.concat([
        train_data[train_data['p_num'] == p_num],
        additional_train_data[additional_train_data['p_num'] == p_num],
    ])
    test_data_patient = test_data[test_data['p_num'] == p_num]

    train_data_patient_transformed = pipeline.fit_transform(train_data_patient)
    test_data_patient_transformed = pipeline.transform(test_data_patient)

    data[p_num] = {
        # Features: everything except the target column.
        'X_train': train_data_patient_transformed.drop(columns=['bg+1:00']),
        # Target column.
        'y_train': train_data_patient_transformed['bg+1:00'],
        'X_test': test_data_patient_transformed,
        # Column-less frame that only carries the test index; predictions are
        # written into it later. The explicit copy makes it an independent
        # object, so that later `.loc` assignment is not a write to a slice of
        # the transformed test frame (SettingWithCopyWarning on pandas without
        # copy-on-write, which the global warning filter would hide).
        'y_pred': test_data_patient_transformed[[]].copy()
    }

In [4]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.ensemble import StackingRegressor
import datetime
from regressors_v2 import get_hgb_regressor, get_xgb_regressor, get_knn_regressor, get_keras_regressor, get_lasso_lars_ic_regressor


def get_date_time_now():
    """Return the current local time as an ``HH:MM:SS`` string (used as a log prefix)."""
    return f'{datetime.datetime.now():%H:%M:%S}'


# Per-patient model registry keyed by patient id (starts empty).
models = dict()

# Start from a column-less copy of the test frame (index only) and add the
# target column pre-filled with the sentinel -1.0. Any row that never receives
# a prediction keeps the negative sentinel and can be detected afterwards.
results = test_data[[]].copy()
results['bg+1:00'] = -1.0
results = results.astype({'bg+1:00': float})
# Fit one stacked ensemble per patient and write its predictions for that
# patient's test rows into `results` (and into the patient's own 'y_pred' frame).
for p_num in data.keys():
    print(f'{get_date_time_now()} - {p_num} - Predicting for patient {p_num}')
    print(f'{get_date_time_now()} - {p_num} - Transforming the data -')

    X_train = data[p_num]['X_train']
    y_train = data[p_num]['y_train']
    X_test = data[p_num]['X_test']

    print(f'{get_date_time_now()} - {p_num} - Fitting the model -')

    # Only the 20% "validation" part of the split is kept; the training part is
    # discarded and the full X_train / y_train are used everywhere below.
    # NOTE(review): X_val / y_val are therefore a subset of the rows the model
    # is fitted on, so any validation metric computed from them is optimistic.
    # How get_keras_regressor uses these arguments is not visible here — confirm
    # before relying on val_loss for early stopping or model selection.
    _, X_val, _, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42, shuffle=True)

    estimators = [
        ('hgb', get_hgb_regressor()),
        ('xgb', get_xgb_regressor()),
        ('llic', get_lasso_lars_ic_regressor()),
        ('knn', get_knn_regressor()),
        ('dnn', get_keras_regressor(
            p_num=p_num,
            X_train=X_train,
            X_val=X_val,
            y_train=y_train,
            y_val=y_val))
    ]

    # Base learners are combined by a Ridge meta-learner.
    model = StackingRegressor(estimators=estimators, final_estimator=Ridge(alpha=0.1), n_jobs=1, verbose=2)
    model.fit(X=X_train, y=y_train)

    # Keep the fitted ensemble so it can be inspected or reused after the loop;
    # previously `models` was declared but never filled.
    models[p_num] = model

    print(f'{get_date_time_now()} - {p_num} - Predicting -')
    y_pred = model.predict(X=X_test)

    # count and replace negative values
    # The mask is computed once, before y_pred is overwritten.
    negative_mask = y_pred < 0
    negative_count = np.sum(negative_mask)
    if negative_count > 0:
        print(f'{get_date_time_now()} - {p_num} - Number of negative values: {negative_count}')
        bg_min_train = np.min(y_train)
        print(f'{get_date_time_now()} - {p_num} - Min value: {np.min(y_pred)}')
        # Negative predictions are replaced by the smallest target value seen
        # in this patient's training data.
        y_pred = np.where(negative_mask, bg_min_train, y_pred)

    # Store the predictions both per patient and in the global results frame,
    # aligned on the index of the patient's test rows.
    data[p_num]['y_pred'].loc[X_test.index, 'bg+1:00'] = y_pred
    results.loc[X_test.index, 'bg+1:00'] = y_pred
    print(f'{get_date_time_now()} - {p_num} - Done -')

results.head()

07:54:02 - p01 - Predicting for patient p01
07:54:02 - p01 - Transforming the data -
07:54:02 - p01 - Fitting the model -
Epoch 1/100
601/601 - 1s - 1ms/step - loss: 26.2364 - rmse: 4.8417 - val_loss: 10.9407 - val_rmse: 3.1213
Epoch 2/100
601/601 - 0s - 428us/step - loss: 17.9715 - rmse: 3.9921 - val_loss: 7.9639 - val_rmse: 2.7736
Epoch 3/100
601/601 - 0s - 432us/step - loss: 13.7194 - rmse: 3.6382 - val_loss: 7.5957 - val_rmse: 2.7237
Epoch 4/100
601/601 - 0s - 430us/step - loss: 12.5062 - rmse: 3.4844 - val_loss: 7.4998 - val_rmse: 2.7071
Epoch 5/100
601/601 - 0s - 429us/step - loss: 11.9685 - rmse: 3.4090 - val_loss: 7.3388 - val_rmse: 2.6808
Epoch 6/100
601/601 - 0s - 427us/step - loss: 11.2498 - rmse: 3.3116 - val_loss: 7.2025 - val_rmse: 2.6564
Epoch 7/100
601/601 - 0s - 431us/step - loss: 10.6484 - rmse: 3.2167 - val_loss: 6.7102 - val_rmse: 2.5631
Epoch 8/100
601/601 - 0s - 432us/step - loss: 10.2480 - rmse: 3.1618 - val_loss: 6.6994 - val_rmse: 2.5605
Epoch 9/100
601/601 - 0

Unnamed: 0_level_0,bg+1:00
id,Unnamed: 1_level_1
p01_8459,9.430662
p01_8460,6.31624
p01_8461,10.907337
p01_8462,11.3217
p01_8463,8.268017


In [5]:
# Show any rows whose prediction is still negative (for example rows left at
# the -1.0 sentinel); an empty frame means every row received a valid value.
results.loc[results['bg+1:00'].lt(0)]

Unnamed: 0_level_0,bg+1:00
id,Unnamed: 1_level_1


In [6]:
# Sanity check: every prediction must be non-negative (>= 0). The comparison
# is False for NaN, so missing predictions fail this check as well.
assert results['bg+1:00'].ge(0).all()

## Prepare the submission file

In [7]:
# `submission` is a second name for the same DataFrame object as `results`
# (no copy is made), so changing one changes the other.
submission = results
# Display the frame as the cell output for a final visual check.
submission

Unnamed: 0_level_0,bg+1:00
id,Unnamed: 1_level_1
p01_8459,9.430662
p01_8460,6.316240
p01_8461,10.907337
p01_8462,11.321700
p01_8463,8.268017
...,...
p24_256,6.677586
p24_257,11.246602
p24_258,6.514036
p24_259,6.941969


### Save the submission file

In [8]:
# Write the predictions to disk. With the to_csv defaults the frame's index is
# written as the first column, next to 'bg+1:00'.
submission.to_csv('submission-3.32-v2.0.csv')

In [9]:
# Summary statistics of the target column over the whole training frame, for
# comparison with the range of the predictions.
train_data.loc[:, 'bg+1:00'].describe()

count    150996.000000
mean          8.224292
std           2.967680
min           2.200000
25%           6.100000
50%           7.600000
75%           9.800000
max          27.800000
Name: bg+1:00, dtype: float64

In [10]:
# Target statistics restricted to patient 'p16'. A count of 0 means the main
# training frame holds no non-null target value for this patient.
train_data.loc[train_data['p_num'] == 'p16', 'bg+1:00'].describe()

count    0.0
mean     NaN
std      NaN
min      NaN
25%      NaN
50%      NaN
75%      NaN
max      NaN
Name: bg+1:00, dtype: float64