# Predict for each patient

In [1]:
from data import load_data

# Load both splits, then keep only the rows that belong to patients who appear
# in the test split -- patients without test rows need no predictions.
train_data, test_data = load_data()

unique_patients = test_data['p_num'].unique()

train_data = train_data.loc[train_data['p_num'].isin(unique_patients)]
test_data = test_data.loc[test_data['p_num'].isin(unique_patients)]

# Preview the first training rows.
train_data.head()

Unnamed: 0_level_0,p_num,time,bg-5:55,bg-5:50,bg-5:45,bg-5:40,bg-5:35,bg-5:30,bg-5:25,bg-5:20,...,activity-0:40,activity-0:35,activity-0:30,activity-0:25,activity-0:20,activity-0:15,activity-0:10,activity-0:05,activity-0:00,bg+1:00
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
p01_0,p01,06:10:00,,,9.6,,,9.7,,,...,,,,,,,,,,13.4
p01_1,p01,06:25:00,,,9.7,,,9.2,,,...,,,,,,,,,,12.8
p01_2,p01,06:40:00,,,9.2,,,8.7,,,...,,,,,,,,,,15.5
p01_3,p01,06:55:00,,,8.7,,,8.4,,,...,,,,,,,,,,14.8
p01_4,p01,07:10:00,,,8.4,,,8.1,,,...,,,,,,,,,,12.7


In [2]:
from sklearn.ensemble import VotingRegressor
from pipelines import pipeline
import joblib
import re
import os
import datetime


def get_date_time_now():
    """Return the current local time formatted as ``HH:MM:SS``.

    Despite the name, only the time of day is returned (no date part); the
    value is used as a prefix for progress messages.
    """
    current = datetime.datetime.now()
    return format(current, '%H:%M:%S')


# Per-patient ensemble prediction.
#
# For every patient in the test set:
#   1. collect the pickled estimators stored for that patient in the current
#      working directory (files named `<p_num>.<ModelName>.model.pkl`),
#   2. combine them in a VotingRegressor and fit it on that patient's training rows,
#   3. predict `bg+1:00` for that patient's test rows and store it in `results`.
MODEL_SUFFIX = '.model.pkl'
# Estimator families that are deliberately kept out of the ensemble.
EXCLUDED_MODELS = re.compile(r'ExtraTrees|LGBM')

# List the directory once (loop-invariant) and sort it: os.listdir() returns
# entries in arbitrary order, which made the estimator order differ between
# patients and between runs.
model_files = sorted(file for file in os.listdir() if file.endswith(MODEL_SUFFIX))

# NOTE(review): `models` is never populated in this cell; kept in case another
# cell references it.
models = {}

# One row per test id. -1.0 is a placeholder that marks "not predicted yet";
# every row is expected to be overwritten by the loop below.
results = test_data[[]].copy()
results['bg+1:00'] = -1.0
results['bg+1:00'] = results['bg+1:00'].astype(float)

for p_num in unique_patients:
    print(f'{get_date_time_now()} - {p_num} - Predicting for patient {p_num}')
    # Match on `<p_num>.` (including the separator) so a patient id can never pick
    # up the files of a different id that merely starts with the same characters.
    stored_models = [
        file for file in model_files
        if file.startswith(f'{p_num}.') and not EXCLUDED_MODELS.search(file)
    ]
    print(f'{get_date_time_now()} - {p_num} - Found {len(stored_models)} models')
    if not stored_models:
        # Fail early with a clear message instead of letting VotingRegressor
        # reject an empty estimator list after the data has been transformed.
        raise FileNotFoundError(
            f"No '*{MODEL_SUFFIX}' files found for patient {p_num} in {os.getcwd()}"
        )

    estimators = []
    for model_file in stored_models:
        print(f'{get_date_time_now()} - {p_num} - Loading {model_file}')
        # SECURITY: joblib.load unpickles the file, which can execute arbitrary
        # code -- only load model files that were produced by a trusted source.
        estimators.append((
            model_file[:-len(MODEL_SUFFIX)],  # estimator name = file name without suffix
            joblib.load(model_file)
        ))

    print(f'{get_date_time_now()} - {p_num} - Transforming the data -')
    train_data_patient = train_data[train_data['p_num'] == p_num]
    # The shared pipeline is refit on this patient's training rows; the same
    # fitted state is reused further down to transform this patient's test rows.
    train_data_patient = pipeline.fit_transform(train_data_patient)

    X_train = train_data_patient.drop(columns=['bg+1:00'])
    y_train = train_data_patient['bg+1:00']

    print(f'{get_date_time_now()} - {p_num} - Fitting the model -')
    # VotingRegressor.fit() fits clones of the supplied estimators, so the loaded
    # models are refit here on this patient's data and their predictions averaged.
    model = VotingRegressor(estimators=estimators, verbose=True)
    model.fit(X=X_train, y=y_train)

    print(f'{get_date_time_now()} - {p_num} - Predicting -')
    test_data_patient = test_data[test_data['p_num'] == p_num]
    X_test = pipeline.transform(test_data_patient)
    results.loc[test_data_patient.index, 'bg+1:00'] = model.predict(X=X_test)
    print(f'{get_date_time_now()} - {p_num} - Done -')

results.head()

15:32:26 - p01 - Predicting for patient p01
15:32:26 - p01 - Found 3 models
15:32:26 - p01 - Loading p01.LassoLarsIC.model.pkl
15:32:26 - p01 - Loading p01.XGBRegressor.model.pkl
15:32:26 - p01 - Loading p01.HistGradientBoostingRegressor.model.pkl
15:32:26 - p01 - Transforming the data -
15:32:28 - p01 - Fitting the model -
[Voting] .......... (1 of 3) Processing p01.LassoLarsIC, total=   0.0s
[Voting] ......... (2 of 3) Processing p01.XGBRegressor, total=  19.0s
[Voting]  (3 of 3) Processing p01.HistGradientBoostingRegressor, total=   5.4s
15:32:52 - p01 - Predicting -
15:32:52 - p01 - Done -
15:32:52 - p02 - Predicting for patient p02
15:32:52 - p02 - Found 3 models
15:32:52 - p02 - Loading p02.XGBRegressor.model.pkl
15:32:53 - p02 - Loading p02.LassoLarsIC.model.pkl
15:32:53 - p02 - Loading p02.HistGradientBoostingRegressor.model.pkl
15:32:53 - p02 - Transforming the data -
15:32:56 - p02 - Fitting the model -
[Voting] ......... (1 of 3) Processing p02.XGBRegressor, total=   9.5s
[V

Unnamed: 0_level_0,bg+1:00
id,Unnamed: 1_level_1
p01_8459,8.999951
p01_8460,6.07264
p01_8461,8.654432
p01_8462,10.230786
p01_8463,6.663246


In [3]:
# Sanity check: every prediction must be non-negative (>= 0). This also catches
# rows still holding the -1.0 placeholder (never predicted) and NaN predictions,
# because both fail the `>= 0` comparison.
valid_predictions = results['bg+1:00'] >= 0
assert valid_predictions.all(), (
    f'{(~valid_predictions).sum()} of {len(results)} predictions are missing, NaN or negative'
)

## Prepare the submission file

In [4]:
# The predictions already have the submission layout (index `id`, one `bg+1:00`
# column). Take a copy so that later edits to `submission` cannot silently
# change `results` (and vice versa).
submission = results.copy()
submission

Unnamed: 0_level_0,bg+1:00
id,Unnamed: 1_level_1
p01_8459,8.999951
p01_8460,6.072640
p01_8461,8.654432
p01_8462,10.230786
p01_8463,6.663246
...,...
p24_256,6.550876
p24_257,10.138702
p24_258,6.363535
p24_259,7.973009


### Save the submission file

In [5]:
# Tag the output file with the name of the current working directory, so
# submissions produced from different directories get distinct file names.
working_dir_name = os.path.basename(os.getcwd())
submission.to_csv(f'submission-{working_dir_name}.csv')