In [4]:
import pandas as pd
import numpy as np
import openml
from sklearn.linear_model import LinearRegression 
import lightgbm as lgbm
import optuna
from scipy.spatial.distance import mahalanobis
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern
from engression import engression, engression_bagged
import torch

In [2]:
#openml.config.apikey = 'FILL_IN_OPENML_API_KEY'  # set the OpenML Api Key
SUITE_ID = 336 # Regression on numerical features
#SUITE_ID = 337 # Classification on numerical features
#SUITE_ID = 335 # Regression on numerical and categorical features
#SUITE_ID = 334 # Classification on numerical and categorical features
benchmark_suite = openml.study.get_suite(SUITE_ID)  # obtain the benchmark suite

task = openml.tasks.get_task(361072)  # download the OpenML task
dataset = task.get_dataset()

X, y, categorical_indicator, attribute_names = dataset.get_data(
        dataset_format="dataframe", target=dataset.default_target_attribute)



In [3]:
# calculate the mean and covariance matrix of the dataset
mean = np.mean(X, axis=0)
cov = np.cov(X.T)

# calculate the Mahalanobis distance for each data point
mahalanobis_dist = [mahalanobis(x, mean, np.linalg.inv(cov)) for x in X.values]

mahalanobis_dist=pd.Series(mahalanobis_dist,index=X.index)
far_index=mahalanobis_dist.index[np.where(mahalanobis_dist>=np.quantile(mahalanobis_dist,0.8))[0]]
close_index=mahalanobis_dist.index[np.where(mahalanobis_dist<np.quantile(mahalanobis_dist,0.8))[0]]

X_train = X.loc[close_index,:]
X_test = X.loc[far_index,:]
y_train = y.loc[close_index]
y_test = y.loc[far_index]

mean = np.mean(X_train, axis=0)
cov = np.cov(X_train.T)

# calculate the Mahalanobis distance for each data point
mahalanobis_dist_ = [mahalanobis(x, mean, np.linalg.inv(cov)) for x in X_train.values]

mahalanobis_dist_=pd.Series(mahalanobis_dist_,index=X_train.index)
far_index_=mahalanobis_dist_.index[np.where(mahalanobis_dist_>=np.quantile(mahalanobis_dist_,0.8))[0]]
close_index_=mahalanobis_dist_.index[np.where(mahalanobis_dist_<np.quantile(mahalanobis_dist_,0.8))[0]]

X_train_ = X_train.loc[close_index_,:]
X_val = X_train.loc[far_index_,:]
y_train_ = y_train.loc[close_index_]
y_val = y_train.loc[far_index_]

In [8]:
N_TRIALS=5

def gp(trial):

    params = {'lenghtscale': trial.suggest_float('lenghtscale', 0, 10)}
    
    gp=GaussianProcessRegressor(kernel=Matern(length_scale=params['lenghtscale'], nu=1.5))
    gp.fit(X_train_, y_train_)
    y_val_hat_gp=gp.predict(X_val)
    RMSE_gp=np.sqrt(np.mean((y_val-y_val_hat_gp)**2))
    return RMSE_gp

sampler_gp = optuna.samplers.TPESampler(seed=10)
study_gp = optuna.create_study(sampler=sampler_gp, direction='minimize')
study_gp.optimize(gp, n_trials=N_TRIALS)

gp_model=GaussianProcessRegressor(kernel=Matern(length_scale=study_gp.best_params['lenghtscale'], nu=1.5))


def boosted(trial):

    params = {'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.5),
              'n_estimators': trial.suggest_int('n_estimators', 100, 500),
              'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
              'max_depth': trial.suggest_int('max_depth', 1, 30),
              #'num_leaves': trial.suggest_int('num_leaves', 2, 100),
              'min_child_samples': trial.suggest_int('min_child_samples', 10, 100)}
    
    boosted_tree_model=lgbm.LGBMRegressor(**params)
    boosted_tree_model.fit(X_train_, y_train_)
    y_val_hat_boost=boosted_tree_model.predict(X_val)
    RMSE_boost=np.sqrt(np.mean((y_val-y_val_hat_boost)**2))

    return RMSE_boost

sampler_boost = optuna.samplers.TPESampler(seed=10)
study_boost = optuna.create_study(sampler=sampler_boost, direction='minimize')
study_boost.optimize(boosted, n_trials=N_TRIALS)

boosted_model=lgbm.LGBMRegressor(**study_boost.best_params)


def rf(trial):

    params = {'boosting_type':"rf",
              'n_estimators': trial.suggest_int('n_estimators', 100, 500),
              'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
              'max_depth': trial.suggest_int('max_depth', 1, 30),
              'subsample_freq': trial.suggest_int('subsample_freq', 1, 10),
              'subsample': trial.suggest_float('subsample', 0.5, 1),
              'feature_fraction_bynode': trial.suggest_float('feature_fraction_bynode', 0, 1),
              #'num_leaves': trial.suggest_int('num_leaves', 2, 100),
              'min_child_samples': trial.suggest_int('min_child_samples', 10, 100)}
    
    rf_model=lgbm.LGBMRegressor(**params)
    rf_model.fit(X_train_, y_train_)
    y_val_hat_rf=rf_model.predict(X_val)
    RMSE_rf=np.sqrt(np.mean((y_val-y_val_hat_rf)**2))

    return RMSE_rf

sampler_rf = optuna.samplers.TPESampler(seed=10)
study_rf = optuna.create_study(sampler=sampler_rf, direction='minimize')
study_rf.optimize(rf, n_trials=N_TRIALS)

rf_model=lgbm.LGBMRegressor(**study_rf.best_params)

def engressor_NN(trial):

    params = {'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.5),
              'num_epoches': trial.suggest_int('num_epoches', 100, 1000),
              'num_layer': trial.suggest_int('num_layer', 2, 5),
              'hidden_dim': trial.suggest_int('hidden_dim', 50, 100),
              'noise_dim': trial.suggest_int('noise_dim', 50, 100),}
    
    engressor_model=engression(torch.Tensor(np.array(X_train_)), torch.Tensor(np.array(y_train_).reshape(-1,1)), lr=params['learning_rate'], num_epoches=params['num_epoches'],num_layer=params['num_layer'], hidden_dim=params['hidden_dim'], noise_dim=params['noise_dim'], batch_size=1000)
    y_val_hat_engression=engressor_model.predict(torch.Tensor(np.array(X_val)), target="mean")
    RMSE_engression=np.sqrt((((torch.Tensor(np.array(y_val).reshape(-1,1)))-y_val_hat_engression)**2).mean(axis=0))

    return RMSE_engression

sampler_engression = optuna.samplers.TPESampler(seed=10)
study_engression = optuna.create_study(sampler=sampler_engression, direction='minimize')
study_engression.optimize(engressor_NN, n_trials=N_TRIALS)

[I 2023-12-04 16:52:36,341] A new study created in memory with name: no-name-76f857dd-846f-45c7-a01d-187f2ac6312b
[I 2023-12-04 16:54:07,410] Trial 0 finished with value: 36.3988536368184 and parameters: {'lenghtscale': 7.71320643266746}. Best is trial 0 with value: 36.3988536368184.
[I 2023-12-04 16:54:17,239] Trial 1 finished with value: 79.97454305109773 and parameters: {'lenghtscale': 0.207519493594015}. Best is trial 0 with value: 36.3988536368184.
[I 2023-12-04 16:55:48,661] Trial 2 finished with value: 36.398853635730084 and parameters: {'lenghtscale': 6.336482349262754}. Best is trial 2 with value: 36.398853635730084.
[I 2023-12-04 16:58:50,774] Trial 3 finished with value: 36.398854693719755 and parameters: {'lenghtscale': 7.488038825386118}. Best is trial 2 with value: 36.398853635730084.
[I 2023-12-04 17:02:23,089] Trial 4 finished with value: 36.39890062205961 and parameters: {'lenghtscale': 4.9850701230259045}. Best is trial 2 with value: 36.398853635730084.
[I 2023-12-04 

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001871 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3766
[LightGBM] [Info] Number of data points in the train set: 5242, number of used features: 21
[LightGBM] [Info] Start training from score 91.111980


[I 2023-12-04 17:02:23,549] Trial 1 finished with value: 22.810531505875442 and parameters: {'learning_rate': 0.11317352611989298, 'n_estimators': 179, 'reg_lambda': 0.0699481785242808, 'max_depth': 6, 'min_child_samples': 18}. Best is trial 1 with value: 22.810531505875442.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000399 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3766
[LightGBM] [Info] Number of data points in the train set: 5242, number of used features: 21
[LightGBM] [Info] Start training from score 91.111980
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000463 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3766
[LightGBM] [Info] Number of data points in the train set: 5242, number of used features: 21
[LightGBM] [Info] Start training from score 91.111980


[I 2023-12-04 17:02:24,143] Trial 2 finished with value: 22.85055837450557 and parameters: {'learning_rate': 0.3429945493655308, 'n_estimators': 482, 'reg_lambda': 1.08526150100961e-08, 'max_depth': 16, 'min_child_samples': 83}. Best is trial 1 with value: 22.810531505875442.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000630 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3766
[LightGBM] [Info] Number of data points in the train set: 5242, number of used features: 21
[LightGBM] [Info] Start training from score 91.111980


[I 2023-12-04 17:02:24,584] Trial 3 finished with value: 22.88449372285418 and parameters: {'learning_rate': 0.3066505073478647, 'n_estimators': 389, 'reg_lambda': 4.235304245072407e-06, 'max_depth': 28, 'min_child_samples': 75}. Best is trial 1 with value: 22.810531505875442.
[I 2023-12-04 17:02:24,768] Trial 4 finished with value: 22.964538631424723 and parameters: {'learning_rate': 0.27172963963761937, 'n_estimators': 157, 'reg_lambda': 2.2912202578440842e-05, 'max_depth': 21, 'min_child_samples': 50}. Best is trial 1 with value: 22.810531505875442.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000496 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3766
[LightGBM] [Info] Number of data points in the train set: 5242, number of used features: 21
[LightGBM] [Info] Start training from score 91.111980


[I 2023-12-04 17:02:24,776] A new study created in memory with name: no-name-c0168aba-99ef-4fef-b419-1802efd524b0


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000604 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3766
[LightGBM] [Info] Number of data points in the train set: 5242, number of used features: 21
[LightGBM] [Info] Start training from score 91.111980


[I 2023-12-04 17:02:25,369] Trial 0 finished with value: 23.17178639471135 and parameters: {'n_estimators': 409, 'reg_lambda': 1.537331564587801e-08, 'max_depth': 20, 'subsample_freq': 8, 'subsample': 0.7492535061512953, 'feature_fraction_bynode': 0.22479664553084766, 'min_child_samples': 28}. Best is trial 0 with value: 23.17178639471135.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000402 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3766
[LightGBM] [Info] Number of data points in the train set: 5242, number of used features: 21
[LightGBM] [Info] Start training from score 91.111980


[I 2023-12-04 17:02:25,571] Trial 1 finished with value: 23.717464823127113 and parameters: {'n_estimators': 404, 'reg_lambda': 3.32657660618516e-07, 'max_depth': 3, 'subsample_freq': 7, 'subsample': 0.9766966730974682, 'feature_fraction_bynode': 0.003948266327914451, 'min_child_samples': 56}. Best is trial 0 with value: 23.17178639471135.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000613 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3766
[LightGBM] [Info] Number of data points in the train set: 5242, number of used features: 21
[LightGBM] [Info] Start training from score 91.111980


[I 2023-12-04 17:02:26,191] Trial 2 finished with value: 23.160358983735637 and parameters: {'n_estimators': 425, 'reg_lambda': 0.003256376421394008, 'max_depth': 22, 'subsample_freq': 3, 'subsample': 0.9588870612564717, 'feature_fraction_bynode': 0.7145757833976906, 'min_child_samples': 59}. Best is trial 2 with value: 23.160358983735637.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000499 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3766
[LightGBM] [Info] Number of data points in the train set: 5242, number of used features: 21
[LightGBM] [Info] Start training from score 91.111980


[I 2023-12-04 17:02:26,405] Trial 3 finished with value: 23.17696691866533 and parameters: {'n_estimators': 157, 'reg_lambda': 2.2912202578440842e-05, 'max_depth': 21, 'subsample_freq': 5, 'subsample': 0.7170069966666468, 'feature_fraction_bynode': 0.6177669784693172, 'min_child_samples': 56}. Best is trial 2 with value: 23.160358983735637.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000597 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3766
[LightGBM] [Info] Number of data points in the train set: 5242, number of used features: 21
[LightGBM] [Info] Start training from score 91.111980


[I 2023-12-04 17:02:26,985] Trial 4 finished with value: 23.139511147941665 and parameters: {'n_estimators': 360, 'reg_lambda': 0.0025665550309028774, 'max_depth': 25, 'subsample_freq': 6, 'subsample': 0.9543244404043341, 'feature_fraction_bynode': 0.3192360889885453, 'min_child_samples': 18}. Best is trial 4 with value: 23.139511147941665.
[I 2023-12-04 17:02:26,985] A new study created in memory with name: no-name-5cfbfe91-4ddc-4892-811c-d43e05addf88


Running on CPU.

Data is standardized for training only; the printed training losses are on the standardized scale. 
However during evaluation, the predictions, evaluation metrics, and plots will be on the original scale.

Training based on mini-batch gradient descent with a batch size of 1000.
[Epoch 1 (0%), batch 6] energy-loss: 1.1156,  E(|Y-Yhat|): 2.6892,  E(|Yhat-Yhat'|): 3.1472
[Epoch 100 (84%), batch 6] energy-loss: 0.2423,  E(|Y-Yhat|): 0.4988,  E(|Yhat-Yhat'|): 0.5131

Training loss on the original (non-standardized) scale:
	Energy-loss: 1.6624,  E(|Y-Yhat|): 2.3485,  E(|Yhat-Yhat'|): 1.3722

Prediction-loss E(|Y-Yhat|) and variance-loss E(|Yhat-Yhat'|) should ideally be equally large
-- consider training for more epochs or adjusting hyperparameters if there is a mismatch 


[I 2023-12-04 17:02:43,009] Trial 0 finished with value: 23.97712516784668 and parameters: {'learning_rate': 0.38588900099010626, 'num_epoches': 118, 'num_layer': 4, 'hidden_dim': 88, 'noise_dim': 75}. Best is trial 0 with value: 23.97712516784668.


Running on CPU.

Data is standardized for training only; the printed training losses are on the standardized scale. 
However during evaluation, the predictions, evaluation metrics, and plots will be on the original scale.

Training based on mini-batch gradient descent with a batch size of 1000.
[Epoch 1 (0%), batch 6] energy-loss: 0.6543,  E(|Y-Yhat|): 1.2957,  E(|Yhat-Yhat'|): 1.2829
[Epoch 100 (36%), batch 6] energy-loss: 0.2030,  E(|Y-Yhat|): 0.4312,  E(|Yhat-Yhat'|): 0.4564
[Epoch 200 (72%), batch 6] energy-loss: 0.1917,  E(|Y-Yhat|): 0.4500,  E(|Yhat-Yhat'|): 0.5166

Training loss on the original (non-standardized) scale:
	Energy-loss: 1.2158,  E(|Y-Yhat|): 2.0559,  E(|Yhat-Yhat'|): 1.6801

Prediction-loss E(|Y-Yhat|) and variance-loss E(|Yhat-Yhat'|) should ideally be equally large
-- consider training for more epochs or adjusting hyperparameters if there is a mismatch 


[I 2023-12-04 17:03:17,051] Trial 1 finished with value: 13.67954158782959 and parameters: {'learning_rate': 0.11317352611989298, 'num_epoches': 278, 'num_layer': 5, 'hidden_dim': 58, 'noise_dim': 54}. Best is trial 1 with value: 13.67954158782959.


Running on CPU.

Data is standardized for training only; the printed training losses are on the standardized scale. 
However during evaluation, the predictions, evaluation metrics, and plots will be on the original scale.

Training based on mini-batch gradient descent with a batch size of 1000.
[Epoch 1 (0%), batch 6] energy-loss: 0.7065,  E(|Y-Yhat|): 1.9986,  E(|Yhat-Yhat'|): 2.5843
[Epoch 100 (10%), batch 6] energy-loss: 0.2301,  E(|Y-Yhat|): 0.4942,  E(|Yhat-Yhat'|): 0.5281
[Epoch 200 (21%), batch 6] energy-loss: 0.2387,  E(|Y-Yhat|): 0.4149,  E(|Yhat-Yhat'|): 0.3525
[Epoch 300 (31%), batch 6] energy-loss: 0.2588,  E(|Y-Yhat|): 0.4397,  E(|Yhat-Yhat'|): 0.3617
[Epoch 400 (42%), batch 6] energy-loss: 0.2555,  E(|Y-Yhat|): 0.4489,  E(|Yhat-Yhat'|): 0.3867
[Epoch 500 (52%), batch 6] energy-loss: 0.2133,  E(|Y-Yhat|): 0.4156,  E(|Yhat-Yhat'|): 0.4047
[Epoch 600 (62%), batch 6] energy-loss: 0.2685,  E(|Y-Yhat|): 0.4865,  E(|Yhat-Yhat'|): 0.4360
[Epoch 700 (73%), batch 6] energy-loss: 0.

[I 2023-12-04 17:04:35,226] Trial 2 finished with value: 127.4811782836914 and parameters: {'learning_rate': 0.3429945493655308, 'num_epoches': 959, 'num_layer': 2, 'hidden_dim': 76, 'noise_dim': 91}. Best is trial 1 with value: 13.67954158782959.



Training loss on the original (non-standardized) scale:
	Energy-loss: 1.2577,  E(|Y-Yhat|): 2.6051,  E(|Yhat-Yhat'|): 2.6949

Prediction-loss E(|Y-Yhat|) and variance-loss E(|Yhat-Yhat'|) should ideally be equally large
-- consider training for more epochs or adjusting hyperparameters if there is a mismatch 
Running on CPU.

Data is standardized for training only; the printed training losses are on the standardized scale. 
However during evaluation, the predictions, evaluation metrics, and plots will be on the original scale.

Training based on mini-batch gradient descent with a batch size of 1000.
[Epoch 1 (0%), batch 6] energy-loss: 0.7243,  E(|Y-Yhat|): 1.5722,  E(|Yhat-Yhat'|): 1.6958
[Epoch 100 (13%), batch 6] energy-loss: 0.2150,  E(|Y-Yhat|): 0.4371,  E(|Yhat-Yhat'|): 0.4441
[Epoch 200 (27%), batch 6] energy-loss: 0.2290,  E(|Y-Yhat|): 0.4668,  E(|Yhat-Yhat'|): 0.4756
[Epoch 300 (40%), batch 6] energy-loss: 0.2033,  E(|Y-Yhat|): 0.4286,  E(|Yhat-Yhat'|): 0.4506
[Epoch 400 (53%)

[I 2023-12-04 17:05:57,508] Trial 3 finished with value: 170.4100799560547 and parameters: {'learning_rate': 0.3066505073478647, 'num_epoches': 750, 'num_layer': 3, 'hidden_dim': 96, 'noise_dim': 86}. Best is trial 1 with value: 13.67954158782959.


Running on CPU.

Data is standardized for training only; the printed training losses are on the standardized scale. 
However during evaluation, the predictions, evaluation metrics, and plots will be on the original scale.

Training based on mini-batch gradient descent with a batch size of 1000.
[Epoch 1 (0%), batch 6] energy-loss: 0.8017,  E(|Y-Yhat|): 1.2023,  E(|Yhat-Yhat'|): 0.8012
[Epoch 100 (43%), batch 6] energy-loss: 0.2099,  E(|Y-Yhat|): 0.4407,  E(|Yhat-Yhat'|): 0.4616
[Epoch 200 (87%), batch 6] energy-loss: 0.2332,  E(|Y-Yhat|): 0.4366,  E(|Yhat-Yhat'|): 0.4068

Training loss on the original (non-standardized) scale:
	Energy-loss: 1.0694,  E(|Y-Yhat|): 2.2098,  E(|Yhat-Yhat'|): 2.2809

Prediction-loss E(|Y-Yhat|) and variance-loss E(|Yhat-Yhat'|) should ideally be equally large
-- consider training for more epochs or adjusting hyperparameters if there is a mismatch 


[I 2023-12-04 17:06:21,196] Trial 4 finished with value: 30.25997543334961 and parameters: {'learning_rate': 0.27172963963761937, 'num_epoches': 228, 'num_layer': 3, 'hidden_dim': 84, 'noise_dim': 72}. Best is trial 1 with value: 13.67954158782959.


In [9]:
gp_model.fit(X_train, y_train)
y_test_hat_gp=gp_model.predict(X_test)
RMSE_gp=np.sqrt(np.mean((y_test-y_test_hat_gp)**2))

boosted_model.fit(X_train, y_train)
y_test_hat_boosted=boosted_model.predict(X_test)
RMSE_boosted=np.sqrt(np.mean((y_test-y_test_hat_boosted)**2))

rf_model.fit(X_train, y_train)
y_test_hat_rf=rf_model.predict(X_test)
RMSE_rf=np.sqrt(np.mean((y_test-y_test_hat_rf)**2))

lin_reg=LinearRegression()
lin_reg.fit(X_train, y_train)
y_test_hat_linreg=lin_reg.predict(X_test)
RMSE_linreg=np.sqrt(np.mean((y_test-y_test_hat_linreg)**2))

params=study_engression.best_params
engressor_model=engression(torch.Tensor(np.array(X_train)), torch.Tensor(np.array(y_train).reshape(-1,1)), lr=params['learning_rate'], num_epoches=params['num_epoches'],num_layer=params['num_layer'], hidden_dim=params['hidden_dim'], noise_dim=params['noise_dim'], batch_size=1000)
y_test_hat_engression=engressor_model.predict(torch.Tensor(np.array(X_test)), target="mean")
RMSE_engression=np.sqrt((((torch.Tensor(np.array(y_test).reshape(-1,1)))-y_test_hat_engression)**2).mean(axis=0))

print("RMSE linear regression: ",RMSE_linreg)
print("RMSE boosted trees", RMSE_boosted)
print("RMSE random forest", RMSE_rf)
print("RMSE gaussian process", RMSE_gp)
print("RMSE engression", RMSE_engression)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002056 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4351
[LightGBM] [Info] Number of data points in the train set: 6553, number of used features: 21
[LightGBM] [Info] Start training from score 88.320464
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000958 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4351
[LightGBM] [Info] Number of data points in the train set: 6553, number of used features: 21
[LightGBM] [Info] Start training from score 88.320464
Running on CPU.

Data is standardized for training only; the printed training losses are on the standardized scale. 
However during evaluation, the predictions, evaluation metrics, and plots will be on the original scale.

Training based on mini-batch gradient descent with a batch size of 1000.
[Epoch 1 (0%), batch

RuntimeError: The size of tensor a (1311) must match the size of tensor b (1639) at non-singleton dimension 0

In [10]:
RMSE_engression=np.sqrt((((torch.Tensor(np.array(y_test).reshape(-1,1)))-y_test_hat_engression)**2).mean(axis=0))

print("RMSE linear regression: ",RMSE_linreg)
print("RMSE boosted trees", RMSE_boosted)
print("RMSE random forest", RMSE_rf)
print("RMSE gaussian process", RMSE_gp)
print("RMSE engression", RMSE_engression)

RMSE linear regression:  62.070783717577555
RMSE boosted trees 5.276375538454642
RMSE random forest 5.220881219934237
RMSE gaussian process 38.12484784299995
RMSE engression tensor([34.8711])
