# Predict the bg+1:00 values

In [1]:
import os

import joblib
import pandas as pd
from sklearn.ensemble import VotingRegressor

from pipelines import pipeline

In [2]:
# Both raw CSV files sit in the same data/raw directory, four levels above the working directory.
raw_data_dir = os.path.join('..', '..', '..', '..', 'data', 'raw')
train_data_file = os.path.join(raw_data_dir, 'train.csv')
test_data_file = os.path.join(raw_data_dir, 'test.csv')

# The first CSV column is used as the frame index for both splits.
train_data = pd.read_csv(train_data_file, index_col=0, low_memory=False)
test_data = pd.read_csv(test_data_file, index_col=0, low_memory=False)

In [3]:
# Preview the first rows of the training frame; bg+1:00 is the column used as the target further down.
train_data.head()

Unnamed: 0_level_0,p_num,time,bg-5:55,bg-5:50,bg-5:45,bg-5:40,bg-5:35,bg-5:30,bg-5:25,bg-5:20,...,activity-0:40,activity-0:35,activity-0:30,activity-0:25,activity-0:20,activity-0:15,activity-0:10,activity-0:05,activity-0:00,bg+1:00
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
p01_0,p01,06:10:00,,,9.6,,,9.7,,,...,,,,,,,,,,13.4
p01_1,p01,06:25:00,,,9.7,,,9.2,,,...,,,,,,,,,,12.8
p01_2,p01,06:40:00,,,9.2,,,8.7,,,...,,,,,,,,,,15.5
p01_3,p01,06:55:00,,,8.7,,,8.4,,,...,,,,,,,,,,14.8
p01_4,p01,07:10:00,,,8.4,,,8.1,,,...,,,,,,,,,,12.7


In [4]:
# Preview the first rows of the test frame for comparison with the training frame.
test_data.head()

Unnamed: 0_level_0,p_num,time,bg-5:55,bg-5:50,bg-5:45,bg-5:40,bg-5:35,bg-5:30,bg-5:25,bg-5:20,...,activity-0:45,activity-0:40,activity-0:35,activity-0:30,activity-0:25,activity-0:20,activity-0:15,activity-0:10,activity-0:05,activity-0:00
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
p01_8459,p01,06:45:00,,9.2,,,10.2,,,10.3,...,,,,,,,,,,
p01_8460,p01,11:25:00,,,9.9,,,9.4,,,...,,,,,,,,Walk,Walk,Walk
p01_8461,p01,14:45:00,,5.5,,,5.5,,,5.2,...,,,,,,,,,,
p01_8462,p01,04:30:00,,3.4,,,3.9,,,4.7,...,,,,,,,,,,
p01_8463,p01,04:20:00,,,8.3,,,10.0,,,...,,,,,,,,,,


In [5]:
# Patients that occur in both splits have training rows of their own,
# so each of them is served by a dedicated model.
train_patients = set(train_data['p_num'].unique())
test_patients = set(test_data['p_num'].unique())
patients_with_own_model = sorted(train_patients & test_patients)
patients_with_own_model

['p01', 'p02', 'p04', 'p05', 'p06', 'p10', 'p11', 'p12']

In [6]:
# Patients that occur only in the test split have no training rows,
# so they are served by the generic model.
patients_with_generic_model = sorted(
    set(test_data['p_num'].unique()).difference(train_data['p_num'].unique())
)
patients_with_generic_model

['p15', 'p16', 'p18', 'p19', 'p21', 'p22', 'p24']

## Load and fit the generic model

In [7]:
# Build the generic ensemble from every pickled '*generic.model.pkl' regressor in the
# working directory and fit it on the complete training set.
# NOTE(review): joblib.load unpickles arbitrary objects - only load model files that
# were produced by this project.

# os.listdir() returns entries in arbitrary order; sort so the estimator order (and
# therefore the fitted ensemble) is reproducible from run to run.
generic_model_files = sorted(
    file for file in os.listdir() if file.endswith('generic.model.pkl')
)
if not generic_model_files:
    # Fail early with a clear message instead of an opaque error from VotingRegressor.
    raise FileNotFoundError(
        f"No '*generic.model.pkl' files found in {os.getcwd()}"
    )

estimators = []
for generic_model_file in generic_model_files:
    print(f'Loading {generic_model_file}')
    model = joblib.load(generic_model_file)
    # Each estimator is named after its file without the '.model.pkl' suffix.
    estimators.append((
        generic_model_file.replace('.model.pkl', ''),
        model
    ))

generic_model = VotingRegressor(estimators=estimators, verbose=True)

# Features are every column except the target; the shared pipeline is fitted on them here.
X = train_data.drop(columns=['bg+1:00'])
y = train_data['bg+1:00']
X = pipeline.fit_transform(X=X)
generic_model.fit(X=X, y=y)

Loading XGBRegressor.generic.model.pkl
Loading ExtraTreesRegressor.generic.model.pkl
Loading BaggingRegressor.generic.model.pkl
[Voting] ..... (1 of 3) Processing XGBRegressor.generic, total=   1.1s
[Voting]  (2 of 3) Processing ExtraTreesRegressor.generic, total= 3.4min
[Voting] . (3 of 3) Processing BaggingRegressor.generic, total= 7.7min


## Load and fit the patient-specific models

In [8]:
# Load one pickled XGBRegressor per patient that has training data and refit it on
# that patient's rows. The fitted models are kept in `specific_models`, keyed by p_num.
# NOTE(review): joblib.load unpickles arbitrary objects - only load model files that
# were produced by this project.
specific_models = {}
for p_num in patients_with_own_model:
    specific_model_file = f'XGBRegressor.{p_num}.model.pkl'
    print(f'Loading {specific_model_file}')
    model = joblib.load(specific_model_file)

    print(f'Preparing data for {p_num}')
    patient_data = train_data[train_data['p_num'] == p_num]
    X = patient_data.drop(columns=['bg+1:00'])
    y = patient_data['bg+1:00']
    # The shared pipeline object is refitted on this patient's features only, so its
    # fitted state after the loop belongs to the last patient processed.
    X = pipeline.fit_transform(X=X)

    print(f'Fitting {specific_model_file}')
    model.fit(X=X, y=y)
    specific_models[p_num] = model

Loading XGBRegressor.p01.model.pkl
Preparing data for p01
Fitting XGBRegressor.p01.model.pkl
Loading XGBRegressor.p02.model.pkl
Preparing data for p02
Fitting XGBRegressor.p02.model.pkl
Loading XGBRegressor.p04.model.pkl
Preparing data for p04
Fitting XGBRegressor.p04.model.pkl
Loading XGBRegressor.p05.model.pkl
Preparing data for p05
Fitting XGBRegressor.p05.model.pkl
Loading XGBRegressor.p06.model.pkl
Preparing data for p06
Fitting XGBRegressor.p06.model.pkl
Loading XGBRegressor.p10.model.pkl
Preparing data for p10
Fitting XGBRegressor.p10.model.pkl
Loading XGBRegressor.p11.model.pkl
Preparing data for p11
Fitting XGBRegressor.p11.model.pkl
Loading XGBRegressor.p12.model.pkl
Preparing data for p12
Fitting XGBRegressor.p12.model.pkl


# Prepare test results

In [9]:
# One row per test id; -1.0 is a placeholder that the prediction cells overwrite.
submission = pd.DataFrame({'bg+1:00': -1.0}, index=test_data.index.copy())

In [10]:
# Predict the bg+1:00 values for every patient that has a dedicated model
for p_num in patients_with_own_model:
    print('----------------------------------------')
    print(f'Load model for {p_num}')
    model = specific_models[p_num]

    print(f'Prepare data for {p_num}')
    raw_train_data = train_data[train_data['p_num'] == p_num]
    raw_test_data = test_data[test_data['p_num'] == p_num]
    # Refit the shared pipeline on the same feature-only frame the model was trained
    # on (target column removed), so the test features are prepared consistently.
    pipeline.fit(raw_train_data.drop(columns=['bg+1:00']))
    # Keep the raw rows and the transformed features under separate names.
    test_features = pipeline.transform(raw_test_data)

    print(f'Predict for {p_num}')
    prediction = model.predict(X=test_features)
    # Assign by index label: each prediction lands on the id of the row it was
    # computed from, independent of the row order in `submission`.
    submission.loc[test_features.index, 'bg+1:00'] = prediction

----------------------------------------
Load model for p01
Prepare data for p01
Predict for p01
----------------------------------------
Load model for p02
Prepare data for p02
Predict for p02
----------------------------------------
Load model for p04
Prepare data for p04
Predict for p04
----------------------------------------
Load model for p05
Prepare data for p05
Predict for p05
----------------------------------------
Load model for p06
Prepare data for p06
Predict for p06
----------------------------------------
Load model for p10
Prepare data for p10
Predict for p10
----------------------------------------
Load model for p11
Prepare data for p11
Predict for p11
----------------------------------------
Load model for p12
Prepare data for p12
Predict for p12


In [11]:
# Predict with the generic model for patients that have no training data of their own.
# Refit the shared pipeline on all training features (target column removed), matching
# the frame the generic model was fitted on. drop() already returns a new frame, so no
# extra copy of the training data is needed.
pipeline.fit(train_data.drop(columns=['bg+1:00']))

for p_num in patients_with_generic_model:
    print('----------------------------------------')
    print(f'Predict with generic model for {p_num}')
    raw_test_data = test_data[test_data['p_num'] == p_num]
    # Keep the raw rows and the transformed features under separate names.
    test_features = pipeline.transform(raw_test_data)

    prediction = generic_model.predict(X=test_features)
    # Assign by index label: each prediction lands on the id of the row it was
    # computed from, independent of the row order in `submission`.
    submission.loc[test_features.index, 'bg+1:00'] = prediction

----------------------------------------
Predict with generic model for p15
----------------------------------------
Predict with generic model for p16
----------------------------------------
Predict with generic model for p18
----------------------------------------
Predict with generic model for p19
----------------------------------------
Predict with generic model for p21
----------------------------------------
Predict with generic model for p22
----------------------------------------
Predict with generic model for p24


## Prepare the submission file

### Save the submission file

In [12]:
# Refuse to write an incomplete submission: every row must hold a real prediction,
# i.e. neither NaN nor the -1.0 placeholder the frame was initialised with.
unfilled = submission['bg+1:00'].isna() | (submission['bg+1:00'] == -1.0)
if unfilled.any():
    raise ValueError(f'{int(unfilled.sum())} test rows have no prediction')

# The file name carries the name of the current working directory.
submission.to_csv(f'submission-{os.path.basename(os.getcwd())}.csv')
submission

Unnamed: 0_level_0,bg+1:00
id,Unnamed: 1_level_1
p01_8459,9.588513
p01_8460,5.770772
p01_8461,7.892138
p01_8462,9.541103
p01_8463,6.812788
...,...
p24_256,6.539559
p24_257,9.418495
p24_258,6.788259
p24_259,8.844975
