# Notebook for the custom stacking model, using catboost an LGBM

In [202]:
from catboost import CatBoostRegressor
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.ensemble import StackingRegressor
import pandas as pd 
%store -r dm

dm = dm


## Our model 

We are training two models in parallel, then we are training a model on the weighted sum of the predictions of the two other models. 

catboost -> catboost_preds
adaBoost -> adaboost_preds

-> randomforest -> final_preds

In [205]:
# [I 2023-10-08 00:40:41,291] Trial 21 finished with value: 0.9862843295936936 and parameters: {'objective': 'MultiRMSE', 'learning_rate': 0.10931167365445349, 'colsample_bylevel': 0.09963123954233088, 'max_depth': 15, 'boosting_type': 'Plain', 'bootstrap_type': 'MVS'}. Best is trial 21 with value: 0.9862843295936936.

cat_A = CatBoostRegressor(objective="MultiRMSE", learning_rate=10931167365445349, colsample_bylevel=0.09963123954233088, max_depth=15, boosting_type="Plain", bootstrap_type="MVS")
ada_A = AdaBoostRegressor()
xgb_A = XGBRegressor()

cat_B = CatBoostRegressor(objective="MultiRMSE", learning_rate=10931167365445349, colsample_bylevel=0.09963123954233088, max_depth=15, boosting_type="Plain", bootstrap_type="MVS")
ada_B = AdaBoostRegressor()
xgb_B = XGBRegressor()

cat_C = CatBoostRegressor(objective="MultiRMSE", learning_rate=10931167365445349, colsample_bylevel=0.09963123954233088, max_depth=15, boosting_type="Plain", bootstrap_type="MVS")
ada_C = AdaBoostRegressor()
xgb_C = XGBRegressor()

forest_A = RandomForestRegressor()
forest_B = RandomForestRegressor()
forest_C = RandomForestRegressor()


estimators = [
    ("cat", CatBoostRegressor(objective="MultiRMSE", learning_rate=0.10931167365445349, colsample_bylevel=0.09963123954233088, max_depth=15, boosting_type="Plain", bootstrap_type="MVS")),
    ("xbg", XGBRegressor()),
    ("ada", AdaBoostRegressor())
]

reg_A = StackingRegressor(
    estimators=estimators,
    final_estimator=RandomForestRegressor()
)

reg_B = StackingRegressor(
    estimators=estimators,
    final_estimator=RandomForestRegressor()
)

reg_C = StackingRegressor(
    estimators=estimators,
    final_estimator=RandomForestRegressor()
)



In [206]:
#preparing data

X_A = dm.data_A.iloc[:,2:-1] #independent columns
y_A = dm.data_A.iloc[:,0]   #target column i.e pv measurement

X_B = dm.data_B.iloc[:,2:-1] #independent columns
y_B = dm.data_B.iloc[:,0]    #target column i.e pv measurement

X_C = dm.data_C.iloc[:,2:-1] #independent columns
y_C = dm.data_C.iloc[:,0]   #target column i.e pv measurement


X_A_train, X_A_test, y_A_train, y_A_test = train_test_split(X_A, y_A)
X_B_train, X_B_test, y_B_train, y_B_test = train_test_split(X_B, y_B)
X_C_train, X_C_test, y_C_train, y_C_test = train_test_split(X_C, y_C)

X_test_A = dm.X_test_estimated_a[dm.X_test_estimated_a.columns.intersection(X_A_train.columns)]
X_test_B = dm.X_test_estimated_b[X_B_train.columns.intersection(dm.X_test_estimated_b.columns)]
X_test_C = dm.X_test_estimated_c[X_C_train.columns.intersection(dm.X_test_estimated_c.columns)]

## Finding hyperparameter

In [201]:
import catboost
import optuna
import numpy as np

def accuracy_score(truth, predictions): 
    e = np.abs(truth - predictions)

    n = e.shape[0]

    return e.sum() / n



def objective(trial):


    #X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.2)

    param = {
        "objective": trial.suggest_categorical("objective", ["MultiRMSE", "MAE"]),
        'learning_rate' : trial.suggest_loguniform('learning_rate', 0.001, 0.3),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1),
        "max_depth": trial.suggest_int("max_depth", 1, 15),
        "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        "bootstrap_type": trial.suggest_categorical(
            "bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]),
    }
    

    if param["bootstrap_type"] == "Bayesian":
        param["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    elif param["bootstrap_type"] == "Bernoulli":
        param["subsample"] = trial.suggest_uniform("subsample", 0.1, 1)

    gbm = catboost.CatBoostRegressor(**param, iterations = 10000)

    gbm.fit(X_A_train, y_A_train, eval_set = [(X_A_test, y_A_test)], verbose = 0, early_stopping_rounds = 100)

    accuracy = gbm.score(X_A_test, y_A_test)
    
    return accuracy

study = optuna.create_study(direction = "maximize")
study.optimize(objective, n_trials = 200, show_progress_bar = True)

[I 2023-10-07 23:54:19,415] A new study created in memory with name: no-name-e4ff9a2a-e99c-4ab3-b100-19503ae4a445
Best trial: 0. Best value: 0.667628:   0%|          | 1/200 [01:06<3:42:01, 66.94s/it]

[I 2023-10-07 23:55:26,362] Trial 0 finished with value: 0.6676280346526495 and parameters: {'objective': 'MAE', 'learning_rate': 0.0035054867842904654, 'colsample_bylevel': 0.059479504906839134, 'max_depth': 1, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 9.93124545991672}. Best is trial 0 with value: 0.6676280346526495.


Best trial: 1. Best value: 0.853601:   1%|          | 2/200 [01:59<3:12:28, 58.32s/it]

[I 2023-10-07 23:56:18,655] Trial 1 finished with value: 0.853601414723894 and parameters: {'objective': 'MultiRMSE', 'learning_rate': 0.003635882536866051, 'colsample_bylevel': 0.03979924292192242, 'max_depth': 11, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bernoulli', 'subsample': 0.6458755467814817}. Best is trial 1 with value: 0.853601414723894.


Best trial: 1. Best value: 0.853601:   2%|▏         | 3/200 [03:06<3:24:46, 62.37s/it]

[I 2023-10-07 23:57:25,830] Trial 2 finished with value: 0.7099762449813387 and parameters: {'objective': 'MAE', 'learning_rate': 0.003982000460450549, 'colsample_bylevel': 0.019308623043597293, 'max_depth': 11, 'boosting_type': 'Ordered', 'bootstrap_type': 'MVS'}. Best is trial 1 with value: 0.853601414723894.


Best trial: 1. Best value: 0.853601:   2%|▏         | 4/200 [04:29<3:49:51, 70.36s/it]

[I 2023-10-07 23:58:48,452] Trial 3 finished with value: 0.826399279407149 and parameters: {'objective': 'MAE', 'learning_rate': 0.002870761427327919, 'colsample_bylevel': 0.05512461826445632, 'max_depth': 7, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 6.609487137496304}. Best is trial 1 with value: 0.853601414723894.


Best trial: 1. Best value: 0.853601:   2%|▎         | 5/200 [05:14<3:19:20, 61.33s/it]

[I 2023-10-07 23:59:33,777] Trial 4 finished with value: 0.8340634783990233 and parameters: {'objective': 'MAE', 'learning_rate': 0.0024292929074243336, 'colsample_bylevel': 0.06904641638582049, 'max_depth': 6, 'boosting_type': 'Plain', 'bootstrap_type': 'Bernoulli', 'subsample': 0.19431704375599732}. Best is trial 1 with value: 0.853601414723894.


Best trial: 5. Best value: 0.864068:   3%|▎         | 6/200 [16:03<14:05:00, 261.34s/it]

[I 2023-10-08 00:10:23,374] Trial 5 finished with value: 0.8640680266080638 and parameters: {'objective': 'MAE', 'learning_rate': 0.0026840787307424864, 'colsample_bylevel': 0.07215057860246354, 'max_depth': 15, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 6.37689611592109}. Best is trial 5 with value: 0.8640680266080638.


Best trial: 6. Best value: 0.970462:   4%|▎         | 7/200 [16:40<10:03:47, 187.71s/it]

[I 2023-10-08 00:10:59,471] Trial 6 finished with value: 0.9704618465860364 and parameters: {'objective': 'MultiRMSE', 'learning_rate': 0.08624492006536619, 'colsample_bylevel': 0.09475985185895697, 'max_depth': 8, 'boosting_type': 'Plain', 'bootstrap_type': 'Bernoulli', 'subsample': 0.5099297822966387}. Best is trial 6 with value: 0.9704618465860364.


Best trial: 6. Best value: 0.970462:   4%|▍         | 8/200 [17:38<7:48:49, 146.51s/it] 

[I 2023-10-08 00:11:57,771] Trial 7 finished with value: 0.8922755298757569 and parameters: {'objective': 'MultiRMSE', 'learning_rate': 0.021377193950269376, 'colsample_bylevel': 0.09535746806501581, 'max_depth': 4, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bernoulli', 'subsample': 0.3506913256350602}. Best is trial 6 with value: 0.9704618465860364.


Best trial: 6. Best value: 0.970462:   4%|▍         | 9/200 [19:19<7:01:20, 132.36s/it]

[I 2023-10-08 00:13:39,021] Trial 8 finished with value: 0.9594073924058611 and parameters: {'objective': 'MultiRMSE', 'learning_rate': 0.025849443712435936, 'colsample_bylevel': 0.06640020109768738, 'max_depth': 15, 'boosting_type': 'Plain', 'bootstrap_type': 'MVS'}. Best is trial 6 with value: 0.9704618465860364.


Best trial: 6. Best value: 0.970462:   5%|▌         | 10/200 [20:26<5:55:13, 112.17s/it]

[I 2023-10-08 00:14:45,995] Trial 9 finished with value: 0.8269023481171818 and parameters: {'objective': 'MultiRMSE', 'learning_rate': 0.0010966548084566648, 'colsample_bylevel': 0.0944391889643105, 'max_depth': 4, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 8.335566291100328}. Best is trial 6 with value: 0.9704618465860364.


Best trial: 10. Best value: 0.980766:   6%|▌         | 11/200 [21:08<4:45:17, 90.57s/it] 

[I 2023-10-08 00:15:27,580] Trial 10 finished with value: 0.9807663708241661 and parameters: {'objective': 'MultiRMSE', 'learning_rate': 0.15465951326479635, 'colsample_bylevel': 0.0834413738682415, 'max_depth': 10, 'boosting_type': 'Plain', 'bootstrap_type': 'Bernoulli', 'subsample': 0.9602811486963978}. Best is trial 10 with value: 0.9807663708241661.


Best trial: 11. Best value: 0.982357:   6%|▌         | 12/200 [22:01<4:08:01, 79.16s/it]

[I 2023-10-08 00:16:20,628] Trial 11 finished with value: 0.9823567966071528 and parameters: {'objective': 'MultiRMSE', 'learning_rate': 0.17524389746586597, 'colsample_bylevel': 0.08594637881193697, 'max_depth': 11, 'boosting_type': 'Plain', 'bootstrap_type': 'Bernoulli', 'subsample': 0.9982233559478759}. Best is trial 11 with value: 0.9823567966071528.


Best trial: 11. Best value: 0.982357:   6%|▋         | 13/200 [22:49<3:37:56, 69.93s/it]

[I 2023-10-08 00:17:09,320] Trial 12 finished with value: 0.9811316186225559 and parameters: {'objective': 'MultiRMSE', 'learning_rate': 0.22567648831456744, 'colsample_bylevel': 0.0815715636207689, 'max_depth': 11, 'boosting_type': 'Plain', 'bootstrap_type': 'Bernoulli', 'subsample': 0.9798457634905741}. Best is trial 11 with value: 0.9823567966071528.


Best trial: 11. Best value: 0.982357:   7%|▋         | 14/200 [24:08<3:44:32, 72.43s/it]

[I 2023-10-08 00:18:27,539] Trial 13 finished with value: 0.9817328535250294 and parameters: {'objective': 'MultiRMSE', 'learning_rate': 0.2817791232752512, 'colsample_bylevel': 0.07800444007555962, 'max_depth': 13, 'boosting_type': 'Plain', 'bootstrap_type': 'Bernoulli', 'subsample': 0.9352398714725687}. Best is trial 11 with value: 0.9823567966071528.


Best trial: 11. Best value: 0.982357:   8%|▊         | 15/200 [25:30<3:52:57, 75.55s/it]

[I 2023-10-08 00:19:50,328] Trial 14 finished with value: 0.9821995997690764 and parameters: {'objective': 'MultiRMSE', 'learning_rate': 0.27897153166188177, 'colsample_bylevel': 0.08087398817193063, 'max_depth': 13, 'boosting_type': 'Plain', 'bootstrap_type': 'Bernoulli', 'subsample': 0.7997210624595146}. Best is trial 11 with value: 0.9823567966071528.


Best trial: 15. Best value: 0.986072:   8%|▊         | 16/200 [27:56<4:56:40, 96.74s/it]

[I 2023-10-08 00:22:16,272] Trial 15 finished with value: 0.9860722927874278 and parameters: {'objective': 'MultiRMSE', 'learning_rate': 0.08510017570275764, 'colsample_bylevel': 0.09997278260045005, 'max_depth': 13, 'boosting_type': 'Plain', 'bootstrap_type': 'Bernoulli', 'subsample': 0.7779095038163446}. Best is trial 15 with value: 0.9860722927874278.


Best trial: 15. Best value: 0.986072:   8%|▊         | 17/200 [30:25<5:42:26, 112.28s/it]

[I 2023-10-08 00:24:44,672] Trial 16 finished with value: 0.9856396074746052 and parameters: {'objective': 'MultiRMSE', 'learning_rate': 0.07134434766466158, 'colsample_bylevel': 0.09969576429420006, 'max_depth': 13, 'boosting_type': 'Plain', 'bootstrap_type': 'MVS'}. Best is trial 15 with value: 0.9860722927874278.


Best trial: 15. Best value: 0.986072:   9%|▉         | 18/200 [32:49<6:10:07, 122.02s/it]

[I 2023-10-08 00:27:09,370] Trial 17 finished with value: 0.9847666528842852 and parameters: {'objective': 'MultiRMSE', 'learning_rate': 0.05889189075680605, 'colsample_bylevel': 0.09859514422418575, 'max_depth': 13, 'boosting_type': 'Plain', 'bootstrap_type': 'MVS'}. Best is trial 15 with value: 0.9860722927874278.


Best trial: 15. Best value: 0.986072:  10%|▉         | 19/200 [33:12<4:38:12, 92.23s/it] 

[I 2023-10-08 00:27:32,193] Trial 18 finished with value: 0.9161162805341986 and parameters: {'objective': 'MultiRMSE', 'learning_rate': 0.052492271187599984, 'colsample_bylevel': 0.045494976307166736, 'max_depth': 9, 'boosting_type': 'Plain', 'bootstrap_type': 'MVS'}. Best is trial 15 with value: 0.9860722927874278.


Best trial: 15. Best value: 0.986072:  10%|█         | 20/200 [36:27<6:09:00, 123.00s/it]

[I 2023-10-08 00:30:46,930] Trial 19 finished with value: 0.9852957300994807 and parameters: {'objective': 'MultiRMSE', 'learning_rate': 0.10130999367497383, 'colsample_bylevel': 0.08916088643231418, 'max_depth': 14, 'boosting_type': 'Plain', 'bootstrap_type': 'MVS'}. Best is trial 15 with value: 0.9860722927874278.


Best trial: 15. Best value: 0.986072:  10%|█         | 21/200 [38:22<5:59:53, 120.64s/it]

[I 2023-10-08 00:32:42,044] Trial 20 finished with value: 0.9796028192705675 and parameters: {'objective': 'MultiRMSE', 'learning_rate': 0.03377418800104841, 'colsample_bylevel': 0.08962732530734048, 'max_depth': 13, 'boosting_type': 'Plain', 'bootstrap_type': 'MVS'}. Best is trial 15 with value: 0.9860722927874278.


Best trial: 21. Best value: 0.986284:  11%|█         | 22/200 [46:21<11:17:10, 228.26s/it]

[I 2023-10-08 00:40:41,291] Trial 21 finished with value: 0.9862843295936936 and parameters: {'objective': 'MultiRMSE', 'learning_rate': 0.10931167365445349, 'colsample_bylevel': 0.09963123954233088, 'max_depth': 15, 'boosting_type': 'Plain', 'bootstrap_type': 'MVS'}. Best is trial 21 with value: 0.9862843295936936.


Best trial: 21. Best value: 0.986284:  12%|█▏        | 23/200 [54:25<14:59:09, 304.80s/it]

[I 2023-10-08 00:48:44,612] Trial 22 finished with value: 0.985865981305033 and parameters: {'objective': 'MultiRMSE', 'learning_rate': 0.10975787042596942, 'colsample_bylevel': 0.09904111311137576, 'max_depth': 15, 'boosting_type': 'Plain', 'bootstrap_type': 'MVS'}. Best is trial 21 with value: 0.9862843295936936.


Best trial: 21. Best value: 0.986284:  12%|█▏        | 23/200 [1:01:06<7:50:18, 159.42s/it] 

[W 2023-10-08 00:55:26,187] Trial 23 failed with parameters: {'objective': 'MultiRMSE', 'learning_rate': 0.15597396081675063, 'colsample_bylevel': 0.09998914418344669, 'max_depth': 15, 'boosting_type': 'Plain', 'bootstrap_type': 'MVS'} because of the following error: KeyboardInterrupt('').
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/optuna/study/_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "/var/folders/r_/m51x_wn949x1s34wqwqf6hkr0000gn/T/ipykernel_19308/2526846710.py", line 37, in objective
    gbm.fit(X_A_train, y_A_train, eval_set = [(X_A_test, y_A_test)], verbose = 0, early_stopping_rounds = 100)
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/catboost/core.py", line 5703, in fit
    return self._fit(X, y, cat_features, text_features, embedding_features, None, sample_weight, None, None, None, None, baseline,
  File "/Library/Frame




KeyboardInterrupt: 

In [207]:
# [I 2023-10-08 00:22:16,272] Trial 15 finished with value: 0.9860722927874278 and parameters: {'objective': 'MultiRMSE', 'learning_rate': 0.08510017570275764, 'colsample_bylevel': 0.09997278260045005, 'max_depth': 13, 'boosting_type': 'Plain', 'bootstrap_type': 'Bernoulli', 'subsample': 0.7779095038163446}. Best is trial 15 with value: 0.9860722927874278.
# [I 2023-10-08 00:10:23,374] Trial 5 finished with value: 0.8640680266080638 and parameters: {'objective': 'MAE', 'learning_rate': 0.0026840787307424864, 'colsample_bylevel': 0.07215057860246354, 'max_depth': 15, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 6.37689611592109}. Best is trial 5 with value: 0.8640680266080638.
# [I 2023-10-08 00:40:41,291] Trial 21 finished with value: 0.9862843295936936 and parameters: {'objective': 'MultiRMSE', 'learning_rate': 0.10931167365445349, 'colsample_bylevel': 0.09963123954233088, 'max_depth': 15, 'boosting_type': 'Plain', 'bootstrap_type': 'MVS'}. Best is trial 21 with value: 0.9862843295936936.


# cat_A.fit(X_A_train, y_A_train)
# cat_B.fit(X_B_train, y_B_train)
# cat_C.fit(X_C_train, y_C_train)
# # cat.fit(X_B_train, y_B_train)
# # cat.fit(X_C_train, y_C_train)

# ada_A.fit(X_A_train, y_A_train)
# ada_B.fit(X_B_train, y_B_train)
# ada_C.fit(X_C_train, y_C_train)
# # ada.fit(X_B_train, y_B_train)
# # ada.fit(X_C_train, y_C_train)

# xgb_A.fit(X_A_train, y_A_train)
# xgb_B.fit(X_B_train, y_B_train)
# xgb_C.fit(X_C_train, y_C_train)

# ----------------------------------------------------------------------------------------------------

reg_A.fit(X_A_train, y_A_train)
reg_B.fit(X_B_train, y_B_train)
reg_C.fit(X_C_train, y_C_train)



CatBoostError: /Users/zomb-ml-platform-msk/go-agent-21.2.0/pipelines/BuildMaster/catboost.git/catboost/private/libs/options/json_helper.h:173: Can't parse parameter "learning_rate" with value: 10931167365445349

In [188]:

cat_preds_A = cat_A.predict(X_A_test)
ada_preds_A = ada_A.predict(X_A_test)
xgb_preds_A = xgb_A.predict(X_A_test)

cat_preds_B = cat_B.predict(X_B_test)
ada_preds_B = ada_B.predict(X_B_test)
xgb_preds_B = xgb_B.predict(X_B_test)

cat_preds_C = cat_C.predict(X_C_test)
ada_preds_C = ada_C.predict(X_C_test)
xgb_preds_C = xgb_C.predict(X_C_test)



cat_A_df = pd.DataFrame(data=cat_preds_A, columns=["pv_preds"])
ada_A_df = pd.DataFrame(data=ada_preds_A, columns=["pv_preds"])
xgb_A_df = pd.DataFrame(data=xgb_preds_A, columns=["pv_preds"])

cat_B_df = pd.DataFrame(data=cat_preds_B, columns=["pv_preds"])
ada_B_df = pd.DataFrame(data=ada_preds_B, columns=["pv_preds"])
xgb_B_df = pd.DataFrame(data=xgb_preds_B, columns=["pv_preds"])

cat_C_df = pd.DataFrame(data=cat_preds_C, columns=["pv_preds"])
ada_C_df = pd.DataFrame(data=ada_preds_C, columns=["pv_preds"])
xgb_C_df = pd.DataFrame(data=xgb_preds_C, columns=["pv_preds"])

df_A = (cat_A_df + ada_A_df + xgb_A_df) / 3
df_B = (cat_B_df + ada_B_df + xgb_B_df) / 3
df_C = (cat_C_df + ada_C_df + xgb_C_df) / 3



In [189]:

X_A_train_forest, X_A_test_forest, y_A_train_forest, y_A_test_forest = train_test_split(df_A, y_A_test)
X_B_train_forest, X_B_test_forest, y_B_train_forest, y_B_test_forest = train_test_split(df_B, y_B_test)
X_C_train_forest, X_C_test_forest, y_C_train_forest, y_C_test_forest = train_test_split(df_C, y_C_test)


In [190]:
forest_A.fit(X_A_train_forest, y_A_train_forest)
forest_B.fit(X_B_train_forest, y_B_train_forest)
forest_C.fit(X_C_train_forest, y_C_train_forest)

In [191]:
final_score_A = forest_A.score(X_A_test_forest, y_A_test_forest)
final_score_B = forest_B.score(X_B_test_forest, y_B_test_forest)
final_score_C = forest_C.score(X_C_test_forest, y_C_test_forest)

print(final_score_A)
print(final_score_B)
print(final_score_C)

0.8851480412089952
0.8470164858320357
0.9095269377802983


In [195]:
## Specify the model to be used

cat_preds_A = cat_A.predict(X_test_A)
ada_preds_A = ada_A.predict(X_test_A)
xgb_preds_A = xgb_A.predict(X_test_A)

cat_preds_B = cat_B.predict(X_test_B)
ada_preds_B = ada_B.predict(X_test_B)
xgb_preds_B = xgb_B.predict(X_test_B)

cat_preds_C = cat_C.predict(X_test_C)
ada_preds_C = ada_C.predict(X_test_C)
xgb_preds_C = xgb_C.predict(X_test_C)

cat_A_df = pd.DataFrame(data=cat_preds_A, columns=["pv_preds"])
ada_A_df = pd.DataFrame(data=ada_preds_A, columns=["pv_preds"])
xgb_A_df = pd.DataFrame(data=xgb_preds_A, columns=["pv_preds"])

cat_B_df = pd.DataFrame(data=cat_preds_B, columns=["pv_preds"])
ada_B_df = pd.DataFrame(data=ada_preds_B, columns=["pv_preds"])
xgb_B_df = pd.DataFrame(data=xgb_preds_B, columns=["pv_preds"])

cat_C_df = pd.DataFrame(data=cat_preds_C, columns=["pv_preds"])
ada_C_df = pd.DataFrame(data=ada_preds_C, columns=["pv_preds"])
xgb_C_df = pd.DataFrame(data=xgb_preds_C, columns=["pv_preds"])

df_A = (cat_A_df + ada_A_df + xgb_A_df) / 3
df_B = (cat_B_df + ada_B_df + xgb_B_df) / 3
df_C = (cat_C_df + ada_C_df + xgb_C_df) / 3

pred_A = forest_A.predict(df_A)
pred_B = forest_B.predict(df_B)
pred_C = forest_C.predict(df_C)

df_A = pd.DataFrame()

df_A["prediction"] = pred_A
df_A["location"] = "A"

df_B = pd.DataFrame()

df_B["prediction"] = pred_B
df_B["location"] = "B"

df_C = pd.DataFrame()

df_C["prediction"] = pred_C
df_C["location"] = "C"

df_mid = pd.concat([df_A, df_B], ignore_index=True)

df = pd.concat([df_mid, df_C], join="inner", ignore_index=True)



df = df.drop("location", axis=1)


# df["id"] = test["id"]

# df = df[["id", "prediction"]]

#df[df<0] = 0

# NAME THE FILE 
df.to_csv("sub20.csv")

df

Unnamed: 0,prediction
0,0.00000
1,0.00000
2,14.48095
3,163.75425
4,681.61445
...,...
2155,59.77510
2156,25.05370
2157,0.00000
2158,0.00000
