# Univariate Modelling

In [1]:
import os
import joblib
import pandas as pd

stored_models = [file for file in os.listdir() if file.endswith('.model.pkl')]
stored_models

['ridge.model.pkl', 'xgb.model.pkl', 'hgb.model.pkl']

In [2]:
from sklearn.ensemble import VotingRegressor

estimators = []
for model_file in stored_models:
    print(f'Loading {model_file}')
    model = joblib.load(model_file)
    estimators.append((
        model_file.replace('.model.pkl', ''),
        model
    ))

estimators

Loading ridge.model.pkl
Loading xgb.model.pkl
Loading hgb.model.pkl


[('ridge', RidgeCV(alpha_per_target=True, alphas=[0.1, 1.0, 10.0])),
 ('xgb',
  XGBRegressor(base_score=None, booster=None, callbacks=None,
               colsample_bylevel=None, colsample_bynode=None,
               colsample_bytree=1.0, device=None, early_stopping_rounds=None,
               enable_categorical=False, eval_metric=None, feature_types=None,
               gamma=0.5, grow_policy=None, importance_type=None,
               interaction_constraints=None, learning_rate=0.1, max_bin=None,
               max_cat_threshold=None, max_cat_to_onehot=None,
               max_delta_step=None, max_depth=7, max_leaves=None,
               min_child_weight=None, missing=nan, monotone_constraints=None,
               multi_strategy=None, n_estimators=200, n_jobs=None,
               num_parallel_tree=None, random_state=None, ...)),
 ('hgb',
  HistGradientBoostingRegressor(l2_regularization=0.5, max_depth=7, max_iter=300,
                                max_leaf_nodes=63))]

In [3]:
train_data = pd.read_csv('train_data.csv')
X_train = train_data.drop(columns=['bg+1:00'])
y_train = train_data['bg+1:00']

ensemble = VotingRegressor(estimators=estimators, verbose=True)
ensemble.fit(X=X_train, y=y_train)
ensemble.score(X=X_train, y=y_train)

[Voting] .................... (1 of 3) Processing ridge, total=   0.0s
[Voting] ...................... (2 of 3) Processing xgb, total=   0.8s
[Voting] ...................... (3 of 3) Processing hgb, total=   4.8s


0.5817991078188063

In [4]:
X_train

Unnamed: 0,bg-0:00,insulin-0:00,hr-0:00,cals-0:00,day_phase_evening,day_phase_morning,day_phase_night,day_phase_noon
0,2.275154,-0.152482,0.929993,-0.458394,False,True,False,False
1,2.041687,-0.152482,0.929993,-0.458394,False,True,False,False
2,1.874925,-0.152482,0.929993,-0.458394,False,True,False,False
3,1.841572,-0.152482,0.929993,-0.458394,False,True,False,False
4,1.708162,-0.152482,0.929993,-0.458394,False,True,False,False
...,...,...,...,...,...,...,...,...
177019,0.507474,0.082239,1.175840,-0.202605,False,False,True,False
177020,0.674236,0.872021,1.175840,-0.202605,False,False,True,False
177021,0.807646,0.368976,1.175840,-0.202605,False,False,True,False
177022,0.907704,-0.144131,1.175840,-0.202605,False,False,True,False


# Prepare test results

In [5]:
test_file = os.path.join('..', '..', '..', '..', 'data', 'raw', 'test.csv')

In [6]:
## Load the test data
test_data = pd.read_csv(test_file, index_col=0)
test_data.head()

Unnamed: 0_level_0,p_num,time,bg-5:55,bg-5:50,bg-5:45,bg-5:40,bg-5:35,bg-5:30,bg-5:25,bg-5:20,...,activity-0:45,activity-0:40,activity-0:35,activity-0:30,activity-0:25,activity-0:20,activity-0:15,activity-0:10,activity-0:05,activity-0:00
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
p01_8459,p01,06:45:00,,9.2,,,10.2,,,10.3,...,,,,,,,,,,
p01_8460,p01,11:25:00,,,9.9,,,9.4,,,...,,,,,,,,Walk,Walk,Walk
p01_8461,p01,14:45:00,,5.5,,,5.5,,,5.2,...,,,,,,,,,,
p01_8462,p01,04:30:00,,3.4,,,3.9,,,4.7,...,,,,,,,,,,
p01_8463,p01,04:20:00,,,8.3,,,10.0,,,...,,,,,,,,,,


In [7]:
from pipelines import pipeline

test_data = pipeline.fit_transform(test_data)
test_data.head()

Unnamed: 0_level_0,bg-0:00,insulin-0:00,hr-0:00,cals-0:00,day_phase_evening,day_phase_morning,day_phase_night,day_phase_noon
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
p01_8459,0.297295,-0.164395,-1.42932,-0.166561,False,True,False,False
p01_8460,-1.256018,-0.147,1.989071,3.490763,False,False,False,True
p01_8461,-0.199765,-0.155698,-0.264931,0.402079,False,False,False,False
p01_8462,0.390494,-0.147,-1.151576,-0.341314,False,False,True,False
p01_8463,-1.038554,-0.208093,-1.092822,-0.341314,False,False,True,False


In [8]:
# Predict the bg+1:00 values
test_data['bg+1:00'] = ensemble.predict(test_data)
test_data.head()

Unnamed: 0_level_0,bg-0:00,insulin-0:00,hr-0:00,cals-0:00,day_phase_evening,day_phase_morning,day_phase_night,day_phase_noon,bg+1:00
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
p01_8459,0.297295,-0.164395,-1.42932,-0.166561,False,True,False,False,8.467961
p01_8460,-1.256018,-0.147,1.989071,3.490763,False,False,False,True,5.847002
p01_8461,-0.199765,-0.155698,-0.264931,0.402079,False,False,False,False,7.511862
p01_8462,0.390494,-0.147,-1.151576,-0.341314,False,False,True,False,8.149937
p01_8463,-1.038554,-0.208093,-1.092822,-0.341314,False,False,True,False,5.836534


## Prepare the submission file

In [9]:
submission = pd.DataFrame(test_data['bg+1:00'])
submission

Unnamed: 0_level_0,bg+1:00
id,Unnamed: 1_level_1
p01_8459,8.467961
p01_8460,5.847002
p01_8461,7.511862
p01_8462,8.149937
p01_8463,5.836534
...,...
p24_256,6.314424
p24_257,8.943484
p24_258,6.724515
p24_259,7.856204


### Save the submission file

In [10]:
submission.to_csv(f'submission-{os.path.basename(os.getcwd())}.csv')