# Create a baseline model

In [1]:
from data import load_data

# load_data() yields three objects. The first is not needed in this notebook and
# is discarded; the second is used below as the training frame and the third as
# the test frame.
_, additional_train_data, test_data = load_data()


## Train the stacking model and check cross-validation results

In [2]:
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import StackingRegressor, HistGradientBoostingRegressor
from sklearn.linear_model import LassoLarsIC, Ridge
from pipelines import pipeline

# Fit the preprocessing pipeline on the training frame and transform it in one step.
train_data = pipeline.fit_transform(additional_train_data)

# Separate the features from the target column.
X_train = train_data.drop(columns=['bg+1:00'])
y_train = train_data['bg+1:00']

# One seed shared by every base learner that accepts `random_state`, so that a
# Restart & Run All reproduces the same fitted ensemble.
SEED = 42

base_models = [
    # random_state pins the internal train/validation split (used when early
    # stopping is active) and the binning subsample; without it the fit is not
    # reproducible between runs.
    ('hgb', HistGradientBoostingRegressor(max_iter=1000, max_depth=5, learning_rate=0.01, random_state=SEED)),
    ('xgb', XGBRegressor(objective='reg:squarederror', random_state=SEED, n_estimators=5000, max_depth=5, learning_rate=0.01)),
    ('knn', KNeighborsRegressor(n_neighbors=5)),
    ('lasso', LassoLarsIC(criterion='bic', max_iter=5000))
]

# The Ridge meta-learner combines the base models' predictions into the final estimate.
meta_model = Ridge(alpha=0.1)

model = StackingRegressor(estimators=base_models, final_estimator=meta_model, n_jobs=-1, verbose=2)
model.fit(X=X_train, y=y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    9.7s remaining:   14.6s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    9.9s finished
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   37.6s remaining:   56.3s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   38.7s finished
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  2.0min remaining:  3.0min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  2.0min finished
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  2.7min remaining:  4.1min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  2.7min finished


In [4]:
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, mean_squared_error
import numpy as np

def _rmse(y_true, y_pred):
    """Return the root-mean-squared error between true and predicted targets."""
    return np.sqrt(mean_squared_error(y_true, y_pred))


rmse_scorer = make_scorer(_rmse)

# Metrics collected for every fold: the built-in R² scorer plus the custom RMSE scorer.
scoring = {'r2': 'r2', 'rmse': rmse_scorer}

# NOTE(review): X_train comes from a pipeline that was fitted on the full training
# frame, so any fitted preprocessing statistics are shared across folds - confirm
# whether that is acceptable for these scores.
cv_results = cross_validate(model, X_train, y_train, cv=5, scoring=scoring)

r2_per_fold = cv_results['test_r2']
rmse_per_fold = cv_results['test_rmse']

# Report the per-fold values followed by their mean for each metric.
print(f"Cross-validation R² scores: {r2_per_fold}")
print(f"Mean R² score: {np.mean(r2_per_fold)}")
print(f"Cross-validation RMSE scores: {rmse_per_fold}")
print(f"Mean RMSE score: {np.mean(rmse_per_fold)}")

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    6.1s remaining:    9.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.4s finished
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   23.8s remaining:   35.8s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   25.5s finished
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  1.6min remaining:  2.5min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  1.7min finished
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  2.2min remaining:  3.3min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  2.3min finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurr

Cross-validation R² scores: [0.50755317 0.63477346 0.56990727 0.51833276 0.58491951]
Mean R² score: 0.5630972334933411
Cross-validation RMSE scores: [2.34652238 2.07229714 1.93801378 2.1189596  2.23836215]
Mean RMSE score: 2.1428310109263258


# Prepare test results

In [5]:
import numpy as np

# Apply the already-fitted preprocessing pipeline to the test frame, then predict.
test_data = pipeline.transform(test_data)
y_pred = model.predict(test_data)

# Count negative predictions and replace them with the smallest target value
# seen in training.
# model.predict returns a NumPy array, which has no `.apply`; np.where performs
# the same element-wise replacement in a vectorised way.
negative_mask = y_pred < 0
if negative_mask.any():
    print(f'Number of negative values: {np.sum(negative_mask)}')
    bg_min_train = np.min(y_train)
    print(f'Min value: {np.min(y_pred)}')
    y_pred = np.where(negative_mask, bg_min_train, y_pred)

# Attach the predictions as the target column; the submission cell reads it from here.
test_data['bg+1:00'] = y_pred
test_data.head()

Unnamed: 0_level_0,hour_sin,hour_cos,bg-1:00,bg-0:55,bg-0:50,bg-0:45,bg-0:40,bg-0:35,bg-0:30,bg-0:25,...,p_num_p11,p_num_p12,p_num_p15,p_num_p16,p_num_p18,p_num_p19,p_num_p21,p_num_p22,p_num_p24,bg+1:00
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
p01_8459,1.384937,-0.269037,1.429,1.278222,1.096882,0.915361,0.703233,0.52132,0.430554,0.370231,...,0,0,0,0,0,0,0,0,0,9.018437
p01_8460,0.212445,-1.39036,-1.486497,-1.425876,-1.365118,-1.304324,-1.121868,-0.908825,-0.725967,-0.725569,...,0,0,0,0,0,0,0,0,0,5.700801
p01_8461,-0.935722,-1.056026,-1.1828,-1.000512,-0.848402,-0.665784,-0.513501,-0.36111,-0.208576,-0.177669,...,0,0,0,0,0,0,0,0,0,7.986597
p01_8462,1.30442,0.547671,-0.727253,-0.605532,-0.514057,-0.392125,-0.361409,-0.330681,-0.29988,-0.238547,...,0,0,0,0,0,0,0,0,0,10.704939
p01_8463,1.279557,0.604121,-1.668716,-1.668941,-1.638674,-1.638797,-1.669399,-1.669541,-1.69988,-1.638735,...,0,0,0,0,0,0,0,0,0,7.164982


## Prepare the submission file

In [6]:
import pandas as pd

# Keep only the predicted target as a single-column frame; the row index is carried over.
submission = test_data['bg+1:00'].to_frame()
submission

Unnamed: 0_level_0,bg+1:00
id,Unnamed: 1_level_1
p01_8459,9.018437
p01_8460,5.700801
p01_8461,7.986597
p01_8462,10.704939
p01_8463,7.164982
...,...
p24_256,6.163332
p24_257,10.649018
p24_258,6.629352
p24_259,8.241639


### Save the submission file

In [7]:
import os

# The output file is named after the basename of the current working directory.
run_name = os.path.basename(os.getcwd())
submission.to_csv(f'submission-{run_name}.csv')