# Univariate Modelling

In [1]:
import os
import joblib
import pandas as pd

stored_models = [file for file in os.listdir() if file.endswith('.model.pkl')]
stored_models

['xgb.model.pkl', 'lgbm.model.pkl', 'lasso.model.pkl']

In [2]:
from sklearn.ensemble import VotingRegressor

estimators = []
for model_file in stored_models:
    print(f'Loading {model_file}')
    model = joblib.load(model_file)
    estimators.append((
        model_file.replace('.model.pkl', ''),
        model
    ))
    
estimators

Loading xgb.model.pkl
Loading lgbm.model.pkl
Loading lasso.model.pkl


[('xgb',
  XGBRegressor(base_score=None, booster=None, callbacks=None,
               colsample_bylevel=None, colsample_bynode=None,
               colsample_bytree=0.8, device=None, early_stopping_rounds=None,
               enable_categorical=False, eval_metric=None, feature_types=None,
               gamma=None, grow_policy=None, importance_type=None,
               interaction_constraints=None, learning_rate=0.1, max_bin=None,
               max_cat_threshold=None, max_cat_to_onehot=None,
               max_delta_step=None, max_depth=3, max_leaves=None,
               min_child_weight=None, missing=nan, monotone_constraints=None,
               multi_strategy=None, n_estimators=100, n_jobs=None,
               num_parallel_tree=None, random_state=None, ...)),
 ('lgbm', LGBMRegressor(colsample_bytree=0.8, max_depth=3, subsample=0.8)),
 ('lasso', LassoCV(alphas=[0.001, 0.01, 0.1, 1, 10], max_iter=300))]

In [3]:
df = pd.read_csv('train_data.csv')
X = df[['bg']]
y = df['bg+1:00']

ensemble = VotingRegressor(estimators=estimators, verbose=True)
ensemble.fit(X=X, y=y)
ensemble.score(X=X, y=y)

[Voting] ...................... (1 of 3) Processing xgb, total=   0.9s
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000814 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 226
[LightGBM] [Info] Number of data points in the train set: 177024, number of used features: 1
[LightGBM] [Info] Start training from score 8.277045
[Voting] ..................... (2 of 3) Processing lgbm, total=   0.9s
[Voting] .................... (3 of 3) Processing lasso, total=   0.1s


0.5027838843153354

# Prepare test results

In [4]:
test_file = os.path.join('..', '..', '..', '..', 'data', 'raw', 'test.csv')

In [5]:
## Load the test data
test_data = pd.read_csv(test_file, index_col=0)
test_data.head()

Unnamed: 0_level_0,p_num,time,bg-5:55,bg-5:50,bg-5:45,bg-5:40,bg-5:35,bg-5:30,bg-5:25,bg-5:20,...,activity-0:45,activity-0:40,activity-0:35,activity-0:30,activity-0:25,activity-0:20,activity-0:15,activity-0:10,activity-0:05,activity-0:00
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
p01_8459,p01,06:45:00,,9.2,,,10.2,,,10.3,...,,,,,,,,,,
p01_8460,p01,11:25:00,,,9.9,,,9.4,,,...,,,,,,,,Walk,Walk,Walk
p01_8461,p01,14:45:00,,5.5,,,5.5,,,5.2,...,,,,,,,,,,
p01_8462,p01,04:30:00,,3.4,,,3.9,,,4.7,...,,,,,,,,,,
p01_8463,p01,04:20:00,,,8.3,,,10.0,,,...,,,,,,,,,,


In [6]:
test_data = test_data[['bg-0:00']]
test_data.isna().sum()

bg-0:00    132
dtype: int64

## Interpolate missing values in bg column and fill with mean

In [7]:
test_data['bg-0:00'] = test_data['bg-0:00'].fillna(test_data['bg-0:00'].mean())

In [8]:
# Predict the bg+1:00 values
test_data.rename(columns={'bg-0:00': 'bg'}, inplace=True)
test_data['bg+1:00'] = ensemble.predict(test_data[['bg']])
test_data.head()

Unnamed: 0_level_0,bg,bg+1:00
id,Unnamed: 1_level_1,Unnamed: 2_level_1
p01_8459,9.6,9.12024
p01_8460,4.6,6.009172
p01_8461,8.0,7.921656
p01_8462,9.9,9.370769
p01_8463,5.3,6.293297


## Prepare the submission file

In [9]:
submission = pd.DataFrame(test_data['bg+1:00'])
submission

Unnamed: 0_level_0,bg+1:00
id,Unnamed: 1_level_1
p01_8459,9.120240
p01_8460,6.009172
p01_8461,7.921656
p01_8462,9.370769
p01_8463,6.293297
...,...
p24_256,6.764071
p24_257,9.731243
p24_258,7.176356
p24_259,8.429322


In [10]:
folder_name = os.path.basename(os.getcwd())
submission.to_csv(f'submission-{folder_name}.csv')