# Univariate Modelling

In [1]:
import os
import pandas as pd

train_data_file = os.path.join('..', '..', '..', '..', 'data', 'raw', 'train.csv')
train_data = pd.read_csv(train_data_file, index_col=0, low_memory=False)

validation_data_file = os.path.join('..', '..', '..', '..', 'data', 'interim', 'all_test_4_55h.csv')
validation_data = pd.read_csv(validation_data_file, index_col=0, low_memory=False)

additional_train_data_file = os.path.join('..', '..', '..', '..', 'data', 'interim', 'all_test_3h.csv')
additional_train_data = pd.read_csv(additional_train_data_file, index_col=0, low_memory=False)
# remove from additional data patients that are not in validation data ids
additional_train_data = additional_train_data[~additional_train_data.index.isin(validation_data.index.unique())]

# merge train and additional data
train_data = pd.concat([train_data, additional_train_data], axis=0)

# do not train with patients that are not have to be predicted
test_data_file = os.path.join('..', '..', '..', '..', 'data', 'raw', 'test.csv')
test_data = pd.read_csv(test_data_file, index_col=0, low_memory=False)

unique_patients = test_data['p_num'].unique()
train_data = train_data[train_data['p_num'].isin(unique_patients)]
validation_data = validation_data[validation_data['p_num'].isin(unique_patients)]
test_data = test_data[test_data['p_num'].isin(unique_patients)]

train_data.head()

Unnamed: 0_level_0,p_num,time,bg-5:55,bg-5:50,bg-5:45,bg-5:40,bg-5:35,bg-5:30,bg-5:25,bg-5:20,...,activity-0:40,activity-0:35,activity-0:30,activity-0:25,activity-0:20,activity-0:15,activity-0:10,activity-0:05,activity-0:00,bg+1:00
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
p01_0,p01,06:10:00,,,9.6,,,9.7,,,...,,,,,,,,,,13.4
p01_1,p01,06:25:00,,,9.7,,,9.2,,,...,,,,,,,,,,12.8
p01_2,p01,06:40:00,,,9.2,,,8.7,,,...,,,,,,,,,,15.5
p01_3,p01,06:55:00,,,8.7,,,8.4,,,...,,,,,,,,,,14.8
p01_4,p01,07:10:00,,,8.4,,,8.1,,,...,,,,,,,,,,12.7


In [2]:
from sklearn.ensemble import VotingRegressor
from pipelines import pipeline
import joblib

models = {}
results = test_data[[]]
results['bg+1:00'] = -1
for p_num in unique_patients:
    stored_models = [file for file in os.listdir() if file.endswith('.model.pkl') and file.startswith(f'{p_num}')]
    estimators = []
    for model_file in stored_models:
        print(f'Loading {model_file}')
        model = joblib.load(model_file)
        estimators.append((
            model_file.replace('.model.pkl', ''),
            model
        ))
    
    model = VotingRegressor(estimators=estimators, verbose=True)
    
    train_data_patient = train_data[train_data['p_num'] == p_num]
    validation_data_patient = validation_data[validation_data['p_num'] == p_num]
    train_and_validation_data_patient = pipeline.fit_transform(pd.concat([train_data_patient, validation_data_patient], axis=0))
    
    X_train = train_and_validation_data_patient.drop(columns=['bg+1:00'])
    y_train = train_and_validation_data_patient['bg+1:00']
    
    print(f'Training model for patient {p_num}')
    model.fit(X=X_train, y=y_train)

    print(f'Predicting for patient {p_num}')    
    test_data_patient = test_data[test_data['p_num'] == p_num]
    test_data_patient = pipeline.transform(test_data_patient)
    
    results.loc[test_data_patient.index, 'bg+1:00'] = model.predict(test_data_patient)


results.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results['bg+1:00'] = -1


Loading p01.LassoLarsIC.model.pkl
Loading p01.XGBRegressor.model.pkl
Loading p01.HistGradientBoostingRegressor.model.pkl
Training model for patient p01
[Voting] .......... (1 of 3) Processing p01.LassoLarsIC, total=   0.0s
[Voting] ......... (2 of 3) Processing p01.XGBRegressor, total=   3.3s
[Voting]  (3 of 3) Processing p01.HistGradientBoostingRegressor, total=   1.8s
Predicting for patient p01
Loading p02.XGBRegressor.model.pkl
Loading p02.LassoLarsIC.model.pkl
Loading p02.HistGradientBoostingRegressor.model.pkl


  6.804224    9.47201838  9.7684981   5.96400098  6.93366853  6.93711185
  9.86604075  5.78035968  6.26182715  9.96870013  4.59820856 13.90160918
 15.0864996   7.39428996  9.07082301  6.73507504  9.37408746 10.22285685
 11.41480091  5.99554706  7.22360601  7.9667077   7.84167217  9.0977995
  8.28063922  7.94735856  7.53772573  7.92852406 10.47132552  6.99960163
 11.27959374  7.82215014  9.23497675  8.3789502   7.5765245   7.67811069
  5.25294508  8.5164906   9.64428037  6.39057401 10.85236129  8.47575452
  7.61200955  6.07037908 15.5939031   8.34889954  8.81801734  9.16220966
  7.60787802 15.6275585   5.20412651  7.5384146   8.81353656  8.55134841
  8.91414634  6.16832173 11.26170151 11.73107008  8.80309861  9.84308849
  6.53529658  7.43909429  6.57744221  6.85503485  5.033267    6.44375015
  6.97114701  8.76874027  8.45847015  6.69290469 14.59196774  6.85266892
  8.91632984  8.8673066  12.45980886 11.01406563  8.05612471  6.75310221
  8.84066528  6.97393248  7.53831548 11.5371492  11.

Training model for patient p02
[Voting] ......... (1 of 3) Processing p02.XGBRegressor, total=  11.9s
[Voting] .......... (2 of 3) Processing p02.LassoLarsIC, total=   0.1s
[Voting]  (3 of 3) Processing p02.HistGradientBoostingRegressor, total=  12.6s
Predicting for patient p02


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results.loc[test_data_patient.index, 'bg+1:00'] = model.predict(test_data_patient)


Loading p04.LassoLarsIC.model.pkl
Loading p04.HistGradientBoostingRegressor.model.pkl
Loading p04.XGBRegressor.model.pkl
Training model for patient p04
[Voting] .......... (1 of 3) Processing p04.LassoLarsIC, total=   0.1s
[Voting]  (2 of 3) Processing p04.HistGradientBoostingRegressor, total=   6.8s
[Voting] ......... (3 of 3) Processing p04.XGBRegressor, total=  24.9s
Predicting for patient p04
Loading p05.LassoLarsIC.model.pkl
Loading p05.XGBRegressor.model.pkl
Loading p05.HistGradientBoostingRegressor.model.pkl


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results.loc[test_data_patient.index, 'bg+1:00'] = model.predict(test_data_patient)


Training model for patient p05
[Voting] .......... (1 of 3) Processing p05.LassoLarsIC, total=   0.1s
[Voting] ......... (2 of 3) Processing p05.XGBRegressor, total=  37.5s
[Voting]  (3 of 3) Processing p05.HistGradientBoostingRegressor, total=   6.6s
Predicting for patient p05
Loading p06.HistGradientBoostingRegressor.model.pkl
Loading p06.LassoLarsIC.model.pkl
Loading p06.XGBRegressor.model.pkl


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results.loc[test_data_patient.index, 'bg+1:00'] = model.predict(test_data_patient)


Training model for patient p06
[Voting]  (1 of 3) Processing p06.HistGradientBoostingRegressor, total=   4.0s
[Voting] .......... (2 of 3) Processing p06.LassoLarsIC, total=   0.1s
[Voting] ......... (3 of 3) Processing p06.XGBRegressor, total=   1.4s
Predicting for patient p06
Loading p10.HistGradientBoostingRegressor.model.pkl
Loading p10.XGBRegressor.model.pkl
Loading p10.LassoLarsIC.model.pkl


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results.loc[test_data_patient.index, 'bg+1:00'] = model.predict(test_data_patient)


Training model for patient p10
[Voting]  (1 of 3) Processing p10.HistGradientBoostingRegressor, total=  15.4s
[Voting] ......... (2 of 3) Processing p10.XGBRegressor, total=   3.9s
[Voting] .......... (3 of 3) Processing p10.LassoLarsIC, total=   0.1s
Predicting for patient p10


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results.loc[test_data_patient.index, 'bg+1:00'] = model.predict(test_data_patient)


Loading p11.XGBRegressor.model.pkl
Loading p11.LassoLarsIC.model.pkl
Loading p11.HistGradientBoostingRegressor.model.pkl
Training model for patient p11
[Voting] ......... (1 of 3) Processing p11.XGBRegressor, total=  13.3s
[Voting] .......... (2 of 3) Processing p11.LassoLarsIC, total=   0.1s
[Voting]  (3 of 3) Processing p11.HistGradientBoostingRegressor, total=  19.8s
Predicting for patient p11
Loading p12.XGBRegressor.model.pkl
Loading p12.HistGradientBoostingRegressor.model.pkl


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results.loc[test_data_patient.index, 'bg+1:00'] = model.predict(test_data_patient)


Loading p12.LassoLarsIC.model.pkl
Training model for patient p12
[Voting] ......... (1 of 3) Processing p12.XGBRegressor, total=  43.7s
[Voting]  (2 of 3) Processing p12.HistGradientBoostingRegressor, total=   7.3s
[Voting] .......... (3 of 3) Processing p12.LassoLarsIC, total=   0.1s
Predicting for patient p12
Loading p15.HistGradientBoostingRegressor.model.pkl


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results.loc[test_data_patient.index, 'bg+1:00'] = model.predict(test_data_patient)


Loading p15.XGBRegressor.model.pkl
Loading p15.LassoLarsIC.model.pkl
Training model for patient p15
[Voting]  (1 of 3) Processing p15.HistGradientBoostingRegressor, total=   2.7s
[Voting] ......... (2 of 3) Processing p15.XGBRegressor, total=  24.1s
[Voting] .......... (3 of 3) Processing p15.LassoLarsIC, total=   0.0s
Predicting for patient p15
Loading p16.XGBRegressor.model.pkl
Loading p16.HistGradientBoostingRegressor.model.pkl
Loading p16.LassoLarsIC.model.pkl


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results.loc[test_data_patient.index, 'bg+1:00'] = model.predict(test_data_patient)


Training model for patient p16
[Voting] ......... (1 of 3) Processing p16.XGBRegressor, total=   2.4s
[Voting]  (2 of 3) Processing p16.HistGradientBoostingRegressor, total=   2.3s
[Voting] .......... (3 of 3) Processing p16.LassoLarsIC, total=   0.0s
Predicting for patient p16
Loading p18.HistGradientBoostingRegressor.model.pkl


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results.loc[test_data_patient.index, 'bg+1:00'] = model.predict(test_data_patient)


Loading p18.XGBRegressor.model.pkl
Loading p18.LassoLarsIC.model.pkl
Training model for patient p18
[Voting]  (1 of 3) Processing p18.HistGradientBoostingRegressor, total=   5.7s
[Voting] ......... (2 of 3) Processing p18.XGBRegressor, total=   1.4s
[Voting] .......... (3 of 3) Processing p18.LassoLarsIC, total=   0.0s
Predicting for patient p18
Loading p19.LassoLarsIC.model.pkl
Loading p19.HistGradientBoostingRegressor.model.pkl
Loading p19.XGBRegressor.model.pkl


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results.loc[test_data_patient.index, 'bg+1:00'] = model.predict(test_data_patient)


Training model for patient p19
[Voting] .......... (1 of 3) Processing p19.LassoLarsIC, total=   0.1s
[Voting]  (2 of 3) Processing p19.HistGradientBoostingRegressor, total=   2.2s
[Voting] ......... (3 of 3) Processing p19.XGBRegressor, total=   8.8s
Predicting for patient p19
Loading p21.HistGradientBoostingRegressor.model.pkl
Loading p21.LassoLarsIC.model.pkl
Loading p21.XGBRegressor.model.pkl


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results.loc[test_data_patient.index, 'bg+1:00'] = model.predict(test_data_patient)


Training model for patient p21
[Voting]  (1 of 3) Processing p21.HistGradientBoostingRegressor, total=   4.9s
[Voting] .......... (2 of 3) Processing p21.LassoLarsIC, total=   0.0s
[Voting] ......... (3 of 3) Processing p21.XGBRegressor, total=   7.6s
Predicting for patient p21
Loading p22.HistGradientBoostingRegressor.model.pkl
Loading p22.XGBRegressor.model.pkl
Loading p22.LassoLarsIC.model.pkl


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results.loc[test_data_patient.index, 'bg+1:00'] = model.predict(test_data_patient)


Training model for patient p22
[Voting]  (1 of 3) Processing p22.HistGradientBoostingRegressor, total=   1.9s
[Voting] ......... (2 of 3) Processing p22.XGBRegressor, total=  11.2s
[Voting] .......... (3 of 3) Processing p22.LassoLarsIC, total=   0.0s
Predicting for patient p22
Loading p24.XGBRegressor.model.pkl
Loading p24.HistGradientBoostingRegressor.model.pkl
Loading p24.LassoLarsIC.model.pkl


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results.loc[test_data_patient.index, 'bg+1:00'] = model.predict(test_data_patient)


Training model for patient p24
[Voting] ......... (1 of 3) Processing p24.XGBRegressor, total=  31.2s
[Voting]  (2 of 3) Processing p24.HistGradientBoostingRegressor, total=   5.3s
[Voting] .......... (3 of 3) Processing p24.LassoLarsIC, total=   0.0s
Predicting for patient p24


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results.loc[test_data_patient.index, 'bg+1:00'] = model.predict(test_data_patient)


Unnamed: 0_level_0,bg+1:00
id,Unnamed: 1_level_1
p01_8459,8.822844
p01_8460,6.601804
p01_8461,8.336941
p01_8462,10.435052
p01_8463,6.477518


In [3]:
# check if results are correct
results[results['bg+1:00'] == -1]

Unnamed: 0_level_0,bg+1:00
id,Unnamed: 1_level_1


# Prepare test results

## Prepare the submission file

In [4]:
submission = results
submission

Unnamed: 0_level_0,bg+1:00
id,Unnamed: 1_level_1
p01_8459,8.822844
p01_8460,6.601804
p01_8461,8.336941
p01_8462,10.435052
p01_8463,6.477518
...,...
p24_256,6.497994
p24_257,10.054035
p24_258,6.128431
p24_259,8.246970


### Save the submission file

In [5]:
submission.to_csv(f'submission-{os.path.basename(os.getcwd())}.csv')