# Create a baseline model

In [1]:
from data import load_data

# load_data() yields three objects; the first is discarded here and the other
# two are kept as the inputs that are later fed to pipeline.fit_transform
# (training) and pipeline.transform (test).
_, additional_train_data, test_data = load_data()


## Train the model and check cross-validation results

In [2]:
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import StackingRegressor, HistGradientBoostingRegressor
from sklearn.linear_model import LassoLarsIC, Ridge
from pipelines import pipeline

# Fit the preprocessing pipeline on the training frame and transform it in one step.
# NOTE(review): the pipeline is fitted on the whole training frame before any
# cross-validation split; if it learns statistics from the data, fold scores
# computed on `train_data` may be slightly optimistic - confirm.
train_data = pipeline.fit_transform(additional_train_data)

# Separate the features from the 'bg+1:00' target column.
X_train = train_data.drop(columns=['bg+1:00'])
y_train = train_data['bg+1:00']

# Level-0 learners whose out-of-fold predictions feed the final estimator.
base_models = [
    # random_state pins the early-stopping validation split and the binning
    # subsample so repeated runs give the same fit (matches the seeded XGB model).
    ('hgb', HistGradientBoostingRegressor(max_iter=1000, max_depth=5, learning_rate=0.01, random_state=42)),
    ('xgb', XGBRegressor(objective='reg:squarederror', random_state=42, n_estimators=5000, max_depth=5, learning_rate=0.01)),
    ('knn', KNeighborsRegressor(n_neighbors=5)),
    ('lasso', LassoLarsIC(criterion='bic', max_iter=5000))
]

# Level-1 learner that combines the base-model predictions.
meta_model = Ridge(alpha=0.1)

model = StackingRegressor(estimators=base_models, final_estimator=meta_model, n_jobs=-1, verbose=2)
model.fit(X=X_train, y=y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.3s remaining:    6.5s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    4.4s finished
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   37.1s remaining:   55.7s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   38.2s finished
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  1.7min remaining:  2.5min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  1.7min finished
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  2.2min remaining:  3.3min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  2.2min finished


In [3]:
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, mean_squared_error
import numpy as np

def _rmse(y_true, y_pred):
    """Return the square root of the mean squared error between targets and predictions."""
    return np.sqrt(mean_squared_error(y_true, y_pred))


# One predefined scorer (R²) and one custom scorer (RMSE) evaluated on every fold.
rmse_scorer = make_scorer(_rmse)
scoring = dict(r2='r2', rmse=rmse_scorer)

cv_results = cross_validate(model, X_train, y_train, scoring=scoring, cv=5)

# Per-fold test scores, keyed as 'test_<scorer name>' in the result dict.
r2_per_fold = cv_results['test_r2']
rmse_per_fold = cv_results['test_rmse']

# Print results
print(f"Cross-validation R² scores: {r2_per_fold}")
print(f"Mean R² score: {np.mean(r2_per_fold)}")
print(f"Cross-validation RMSE scores: {rmse_per_fold}")
print(f"Mean RMSE score: {np.mean(rmse_per_fold)}")

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    2.2s remaining:    3.3s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.7s finished
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   20.5s remaining:   30.8s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   21.0s finished
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  1.0min remaining:  1.5min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  1.0min finished
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  1.4min remaining:  2.0min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  1.4min finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurr

Cross-validation R² scores: [0.47278428 0.60635417 0.5382869  0.45576503 0.55389192]
Mean R² score: 0.5254164622561835
Cross-validation RMSE scores: [2.36631457 2.14357764 1.96595725 2.15604635 2.32159805]
Mean RMSE score: 2.190698770985323


# Prepare test results

In [4]:
import numpy as np

# Apply the already-fitted pipeline to the test frame and predict the target.
# NOTE: this rebinds `test_data` to the transformed frame, so the cell is not
# safe to run twice without re-running the data-loading cell first.
test_data = pipeline.transform(test_data)
y_pred = model.predict(test_data)

# Count and replace negative predictions with the smallest target value seen
# in training.
negative_mask = y_pred < 0
n_negative = np.sum(negative_mask)
if n_negative > 0:
    print(f'Number of negative values: {n_negative}')
    bg_min_train = np.min(y_train)
    print(f'Min value: {np.min(y_pred)}')
    # model.predict returns a NumPy array, which has no .apply(); np.where does
    # the element-wise replacement instead.
    y_pred = np.where(negative_mask, bg_min_train, y_pred)

# Attach the predictions as the 'bg+1:00' column for the submission step.
test_data['bg+1:00'] = y_pred
test_data.head()

Unnamed: 0_level_0,hour_sin,hour_cos,bg-1:00,bg-0:55,bg-0:50,bg-0:45,bg-0:40,bg-0:35,bg-0:30,bg-0:25,...,p_num_p11,p_num_p12,p_num_p15,p_num_p16,p_num_p18,p_num_p19,p_num_p21,p_num_p22,p_num_p24,bg+1:00
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
p01_8459,1.392127,-0.27099,1.467987,1.314231,1.129353,0.944336,0.728126,0.542818,0.450412,0.389085,...,0,0,0,0,0,0,0,0,0,8.924344
p01_8460,0.219131,-1.391824,-1.503779,-1.441986,-1.380058,-1.318073,-1.132092,-0.914954,-0.728557,-0.728108,...,0,0,0,0,0,0,0,0,0,6.190216
p01_8461,-0.929527,-1.057637,-1.19422,-1.008424,-0.853391,-0.667243,-0.512019,-0.356658,-0.201123,-0.169511,...,0,0,0,0,0,0,0,0,0,8.144193
p01_8462,1.311575,0.545362,-0.729882,-0.605831,-0.512607,-0.388316,-0.357001,-0.325642,-0.2942,-0.231577,...,0,0,0,0,0,0,0,0,0,10.459691
p01_8463,1.286702,0.601787,-1.689515,-1.689736,-1.658881,-1.658984,-1.690157,-1.690365,-1.721373,-1.659101,...,0,0,0,0,0,0,0,0,0,6.936171


## Prepare the submission file

In [5]:
import pandas as pd

# Keep only the predicted 'bg+1:00' column (indexed like test_data) as a one-column frame.
predicted_bg = test_data['bg+1:00']
submission = predicted_bg.to_frame()
submission

Unnamed: 0_level_0,bg+1:00
id,Unnamed: 1_level_1
p01_8459,8.924344
p01_8460,6.190216
p01_8461,8.144193
p01_8462,10.459691
p01_8463,6.936171
...,...
p24_256,6.174520
p24_257,10.503989
p24_258,6.567591
p24_259,8.255177


### Save the submission file

In [6]:
import os

# Name the CSV after the directory the notebook is currently running in.
run_dir_name = os.path.basename(os.getcwd())
submission.to_csv(f'submission-{run_dir_name}.csv')