# Predict for each patient

In [1]:
from data import load_data

# Load the train and test splits.
train_data, test_data = load_data()

# Restrict both frames to the patients that occur in the test split.
unique_patients = test_data['p_num'].unique()
train_data = train_data.loc[lambda frame: frame['p_num'].isin(unique_patients)]
test_data = test_data.loc[lambda frame: frame['p_num'].isin(unique_patients)]

train_data.head()

Unnamed: 0_level_0,p_num,time,bg-5:55,bg-5:50,bg-5:45,bg-5:40,bg-5:35,bg-5:30,bg-5:25,bg-5:20,...,activity-0:40,activity-0:35,activity-0:30,activity-0:25,activity-0:20,activity-0:15,activity-0:10,activity-0:05,activity-0:00,bg+1:00
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
p01_0,p01,06:10:00,,,9.6,,,9.7,,,...,,,,,,,,,,13.4
p01_1,p01,06:25:00,,,9.7,,,9.2,,,...,,,,,,,,,,12.8
p01_2,p01,06:40:00,,,9.2,,,8.7,,,...,,,,,,,,,,15.5
p01_3,p01,06:55:00,,,8.7,,,8.4,,,...,,,,,,,,,,14.8
p01_4,p01,07:10:00,,,8.4,,,8.1,,,...,,,,,,,,,,12.7


In [2]:
from sklearn.svm import SVR
import numpy as np
from sklearn.linear_model import ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import HistGradientBoostingRegressor, StackingRegressor, RandomForestRegressor
from pipelines import pipeline
import os
import datetime


def get_date_time_now():
    """Return the current time of day as an ``HH:MM:SS`` string (no date part)."""
    current = datetime.datetime.now()
    return format(current, '%H:%M:%S')


# Per-patient training and prediction.
#
# For every patient in `unique_patients` the shared `pipeline` is re-fitted on
# that patient's training rows, a stacking ensemble is trained on the
# transformed data, and the predictions for the patient's test rows are
# written into `results`.
models = {}  # NOTE(review): never populated in this cell - confirm whether fitted models should be stored.

# Start from the test index with no columns and fill the target with a -1.0
# sentinel, so rows that never receive a prediction remain visibly negative.
results = test_data[[]].copy()
results.loc[:, 'bg+1:00'] = -1.0
results['bg+1:00'] = results['bg+1:00'].astype(float)
for p_num in unique_patients:
    print(f'{get_date_time_now()} - {p_num} - Predicting for patient {p_num}')
    print(f'{get_date_time_now()} - {p_num} - Transforming the data -')
    train_data_patient = train_data[train_data['p_num'] == p_num]
    # The pipeline is fitted on this patient's rows only; the same fitted
    # state is reused below to transform the patient's test rows.
    train_data_patient = pipeline.fit_transform(train_data_patient)

    X_train = train_data_patient.drop(columns=['bg+1:00'])
    y_train = train_data_patient['bg+1:00']

    print(f'{get_date_time_now()} - {p_num} - Fitting the model -')
    base_models = [
        ('hgb', HistGradientBoostingRegressor(max_iter=1000, max_depth=5, learning_rate=0.01)),
        ('xgb', XGBRegressor(objective='reg:squarederror', random_state=42, n_estimators=5000, max_depth=5, learning_rate=0.01)),
        ('rf', RandomForestRegressor(n_estimators=500, max_depth=10, random_state=42)),
        ('svr', SVR(kernel='rbf', C=1.0, gamma='scale')),
        ('knn', KNeighborsRegressor(n_neighbors=10, weights='distance')),
        ('elastic', ElasticNet(alpha=0.1, l1_ratio=0.5, max_iter=10000))
    ]

    # The base models' predictions are combined by an ElasticNet meta-model.
    meta_model = ElasticNet(alpha=0.1, l1_ratio=0.5, max_iter=10000)
    model = StackingRegressor(estimators=base_models, final_estimator=meta_model)
    model.fit(X=X_train, y=y_train)

    print(f'{get_date_time_now()} - {p_num} - Predicting -')
    test_data_patient = test_data[test_data['p_num'] == p_num]
    X_test = pipeline.transform(test_data_patient)
    y_pred = model.predict(X=X_test)

    # Count and replace negative values.
    # `model.predict` returns a NumPy array, which has no `.apply` method, so
    # the replacement is done with a vectorised `np.where` instead.
    negative_mask = y_pred < 0
    n_negative = int(np.sum(negative_mask))
    if n_negative > 0:
        print(f'{get_date_time_now()} - {p_num} - Number of negative values: {n_negative}')
        bg_min_train = np.min(y_train)
        print(f'{get_date_time_now()} - {p_num} - Min value: {np.min(y_pred)}')
        # Negative predictions fall back to the smallest target value seen in
        # this patient's training data.
        y_pred = np.where(negative_mask, bg_min_train, y_pred)

    results.loc[test_data_patient.index, 'bg+1:00'] = y_pred
    print(f'{get_date_time_now()} - {p_num} - Done -')

results.head()

00:19:22 - p01 - Predicting for patient p01
00:19:22 - p01 - Transforming the data -
00:19:25 - p01 - Fitting the model -
00:46:12 - p01 - Predicting -
00:46:13 - p01 - Done -
00:46:13 - p02 - Predicting for patient p02
00:46:13 - p02 - Transforming the data -
00:46:17 - p02 - Fitting the model -
01:29:59 - p02 - Predicting -
01:30:00 - p02 - Done -
01:30:00 - p04 - Predicting for patient p04
01:30:00 - p04 - Transforming the data -
01:30:03 - p04 - Fitting the model -
02:16:32 - p04 - Predicting -
02:16:33 - p04 - Done -
02:16:33 - p05 - Predicting for patient p05
02:16:33 - p05 - Transforming the data -
02:16:35 - p05 - Fitting the model -
02:42:19 - p05 - Predicting -
02:42:20 - p05 - Done -
02:42:20 - p06 - Predicting for patient p06
02:42:20 - p06 - Transforming the data -
02:42:22 - p06 - Fitting the model -
03:05:19 - p06 - Predicting -
03:05:19 - p06 - Done -
03:05:19 - p10 - Predicting for patient p10
03:05:19 - p10 - Transforming the data -
03:05:22 - p10 - Fitting the model 

Unnamed: 0_level_0,bg+1:00
id,Unnamed: 1_level_1
p01_8459,8.578611
p01_8460,6.248291
p01_8461,8.578479
p01_8462,10.606884
p01_8463,6.587078


In [4]:
# Show any rows whose prediction is still negative (expected to be empty).
results[results['bg+1:00'].lt(0)]

Unnamed: 0_level_0,bg+1:00
id,Unnamed: 1_level_1


In [6]:
# Sanity check: every prediction must be non-negative (>= 0).
assert results['bg+1:00'].ge(0).all()

## Prepare the submission file

In [7]:
# `submission` is a second name for the `results` frame; no copy is made here.
submission = results
submission

Unnamed: 0_level_0,bg+1:00
id,Unnamed: 1_level_1
p01_8459,8.578611
p01_8460,6.248291
p01_8461,8.578479
p01_8462,10.606884
p01_8463,6.587078
...,...
p24_256,6.677037
p24_257,9.663669
p24_258,6.543265
p24_259,8.301728


### Save the submission file

In [8]:
# Write the submission as CSV; the file name embeds the base name of the
# current working directory.
submission.to_csv(f'submission-{os.path.basename(os.getcwd())}.csv')

In [9]:
# Summary statistics of the target column over the filtered training data.
train_data.loc[:, 'bg+1:00'].describe()

count    277524.000000
mean          8.446084
std           3.108568
min           2.200000
25%           6.200000
50%           7.800000
75%          10.100000
max          27.800000
Name: bg+1:00, dtype: float64

In [10]:
# Summary statistics of the target column for patient 'p16' only.
train_data.loc[train_data['p_num'] == 'p16', 'bg+1:00'].describe()

count    8905.000000
mean        8.331803
std         1.915485
min         3.400000
25%         6.900000
50%         8.000000
75%         9.700000
max        15.400000
Name: bg+1:00, dtype: float64