# Imports

In [11]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

import warnings
import data_func.read_data as read_data
warnings.filterwarnings("ignore", category=FutureWarning, module="catboost")



# Load datasets

In [12]:
dataframes = read_data.get_training_data()
X_frames_train = dataframes[0]
Y_frames_train = dataframes[1]
X_frames_test = read_data.get_test_data()


# Data clean up

In [13]:
# making shure that target values line up with x_values

def data_allign(x_train, y_train):

  y_train.dropna(inplace=True)
  combined_data = pd.merge(x_train, y_train, left_on='date_forecast', right_on='time')
  y_train = combined_data['pv_measurement']

  if 'time' and 'pv_measurement' in combined_data.columns:
    combined_data.drop(columns=['time', 'pv_measurement'], inplace=True)
    
  return combined_data, y_train

import data_func.aggregation as data_agg

for i in range(len(X_frames_train)):
    X_frames_train[i] = data_agg.gen_agg(X_frames_train[i], 'mean')
    X_frames_train[i], Y_frames_train[i] = data_allign(X_frames_train[i], Y_frames_train[i])


for j in range(len(X_frames_test)):
    X_frames_test[j] = data_agg.gen_agg(X_frames_test[j], 'mean')

print(len(X_frames_train[0]))
print(len(Y_frames_train[0]))
print(len(X_frames_test[0]))


29667
29667
720


# Feature engineering

In [14]:
import data_func.timeseasonality as DTS
import data_func.date_forecast as DTF
import data_func.one_hot_encoding as OHE

categorical_features = ['dew_or_rime:idx', 'is_day:idx', 'is_in_shadow:idx', 'precip_type_5min:idx']

for i in range(len(X_frames_train)):
    X_frames_train[i] = DTS.append_seasonal_columns(X_frames_train[i])
    X_frames_train[i] = DTF.date_forecast_columns(X_frames_train[i])
    X_frames_train[i] = OHE.one_hot_encode(X_frames_train[i], categorical_features)
    # X_frames_train[i].drop(columns=['absolute_humidity_2m:gm3'], inplace=True)
    # X_frames_train[i].drop(columns=['air_density_2m:kgm3'], inplace=True)
    # X_frames_train[i]['ceiling_height_agl:m'] = X_frames_train[i]['ceiling_height_agl:m'].fillna(0)
    # X_frames_train[i]['cloud_base_agl:m'] = X_frames_train[i]['cloud_base_agl:m'].fillna(100000)



for i in range(len(X_frames_test)):
    X_frames_test[i] = DTS.append_seasonal_columns(X_frames_test[i])
    X_frames_test[i] = DTF.date_forecast_columns(X_frames_test[i])
    X_frames_test[i] = OHE.one_hot_encode(X_frames_test[i], categorical_features)




# Hyperparameter optimization

In [15]:
import optuna
import catboost as cat # Change to model to optimize

# Split the data into training and validation sets
x_train_a, x_val_a, y_train_a, y_val_a = train_test_split(X_frames_train[0], Y_frames_train[0], test_size=0.17, random_state=None)
x_train_b, x_val_b, y_train_b, y_val_b = train_test_split(X_frames_train[1], Y_frames_train[1], test_size=0.17, random_state=None)
x_train_c, x_val_c, y_train_c, y_val_c = train_test_split(X_frames_train[2], Y_frames_train[2], test_size=0.17, random_state=None)


def objective(trial):
    
  params = {
      "objective": trial.suggest_categorical("objective", ["RMSE", "MAE"]),
      "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1),
      "depth": trial.suggest_int("depth", 1, 12),
      "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
      "bootstrap_type": trial.suggest_categorical(
          "bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]
      ),
      "used_ram_limit": "3gb",
  }

  if params["bootstrap_type"] == "Bayesian":
      params["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
  elif params["bootstrap_type"] == "Bernoulli":
      params["subsample"] = trial.suggest_float("subsample", 0.1, 1)


  model_a = cat.CatBoostRegressor(**params) # Change to model to optimize
  model_a.fit(x_train_a, y_train_a)

  # Make predictions on the validation set
  y_pred = model_a.predict(x_val_a)

  # Calculate the Mean Squared Error (MSE) as the metric to optimize
  mse = mean_squared_error(y_val_a, y_pred)

  return mse

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=10)

best = study.best_params
# Print the best hyperparameters found
print("Best hyperparameters:", best)

[I 2023-10-27 11:11:22,932] A new study created in memory with name: no-name-e05d2ff3-285b-43c0-a791-5fd7536ab492


Learning rate set to 0.067922
0:	learn: 1170.1908828	total: 6.85ms	remaining: 6.85s
1:	learn: 1129.8060019	total: 22.5ms	remaining: 11.2s
2:	learn: 1087.4119521	total: 39.8ms	remaining: 13.2s
3:	learn: 1053.6121429	total: 55.5ms	remaining: 13.8s
4:	learn: 1023.1027081	total: 71.2ms	remaining: 14.2s
5:	learn: 987.8987837	total: 86.6ms	remaining: 14.3s
6:	learn: 949.5640763	total: 101ms	remaining: 14.3s
7:	learn: 919.1332255	total: 116ms	remaining: 14.4s
8:	learn: 898.4666558	total: 131ms	remaining: 14.5s
9:	learn: 872.0489931	total: 148ms	remaining: 14.6s
10:	learn: 847.2826879	total: 163ms	remaining: 14.6s
11:	learn: 818.9561248	total: 178ms	remaining: 14.6s
12:	learn: 801.1864376	total: 183ms	remaining: 13.9s
13:	learn: 797.9235944	total: 194ms	remaining: 13.6s
14:	learn: 779.2783593	total: 207ms	remaining: 13.6s
15:	learn: 765.8084926	total: 222ms	remaining: 13.7s
16:	learn: 750.4365729	total: 238ms	remaining: 13.8s
17:	learn: 733.4570524	total: 254ms	remaining: 13.8s
18:	learn: 715.

[I 2023-10-27 11:11:35,910] Trial 0 finished with value: 200642.36784850113 and parameters: {'objective': 'RMSE', 'colsample_bylevel': 0.06619207765494703, 'depth': 3, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bernoulli', 'subsample': 0.12676809649857243}. Best is trial 0 with value: 200642.36784850113.


999:	learn: 456.4974728	total: 12.7s	remaining: 0us
Learning rate set to 0.067922
0:	learn: 1135.4027556	total: 28.5ms	remaining: 28.5s
1:	learn: 1082.4458568	total: 59.6ms	remaining: 29.8s
2:	learn: 1036.6077786	total: 97.1ms	remaining: 32.3s
3:	learn: 990.0594013	total: 131ms	remaining: 32.6s
4:	learn: 945.7932856	total: 160ms	remaining: 31.9s
5:	learn: 907.4080207	total: 191ms	remaining: 31.7s
6:	learn: 868.9587646	total: 222ms	remaining: 31.5s
7:	learn: 833.0326880	total: 255ms	remaining: 31.6s
8:	learn: 801.4611858	total: 288ms	remaining: 31.7s
9:	learn: 772.5070402	total: 320ms	remaining: 31.7s
10:	learn: 746.4469629	total: 350ms	remaining: 31.5s
11:	learn: 721.9276492	total: 381ms	remaining: 31.4s
12:	learn: 700.2086324	total: 412ms	remaining: 31.3s
13:	learn: 678.6859198	total: 445ms	remaining: 31.3s
14:	learn: 657.5095100	total: 471ms	remaining: 30.9s
15:	learn: 638.6962169	total: 502ms	remaining: 30.9s
16:	learn: 622.8622113	total: 534ms	remaining: 30.9s
17:	learn: 607.860248

[I 2023-10-27 11:12:08,764] Trial 1 finished with value: 133467.7774293086 and parameters: {'objective': 'RMSE', 'colsample_bylevel': 0.08290847891191566, 'depth': 11, 'boosting_type': 'Plain', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 4.317566404709167}. Best is trial 1 with value: 133467.7774293086.


Learning rate set to 0.067922
0:	learn: 1143.2891737	total: 30.1ms	remaining: 30.1s
1:	learn: 1100.1720617	total: 37.7ms	remaining: 18.8s
2:	learn: 1075.0017118	total: 59.8ms	remaining: 19.9s
3:	learn: 1037.7016077	total: 86.2ms	remaining: 21.5s
4:	learn: 1009.9139062	total: 108ms	remaining: 21.5s
5:	learn: 966.5260017	total: 125ms	remaining: 20.7s
6:	learn: 927.9740984	total: 143ms	remaining: 20.3s
7:	learn: 901.7526462	total: 170ms	remaining: 21s
8:	learn: 881.0669168	total: 198ms	remaining: 21.8s
9:	learn: 849.9412892	total: 228ms	remaining: 22.6s
10:	learn: 849.5295194	total: 230ms	remaining: 20.7s
11:	learn: 848.9035479	total: 234ms	remaining: 19.3s
12:	learn: 819.9730671	total: 240ms	remaining: 18.2s
13:	learn: 796.7882295	total: 265ms	remaining: 18.7s
14:	learn: 772.1278093	total: 289ms	remaining: 19s
15:	learn: 746.1180216	total: 313ms	remaining: 19.2s
16:	learn: 723.5312609	total: 339ms	remaining: 19.6s
17:	learn: 717.1681669	total: 342ms	remaining: 18.6s
18:	learn: 697.292023

[I 2023-10-27 11:12:28,227] Trial 2 finished with value: 159430.05035760277 and parameters: {'objective': 'RMSE', 'colsample_bylevel': 0.03503326210349412, 'depth': 8, 'boosting_type': 'Ordered', 'bootstrap_type': 'MVS'}. Best is trial 1 with value: 133467.7774293086.


992:	learn: 358.5616495	total: 19s	remaining: 134ms
993:	learn: 358.5065464	total: 19.1s	remaining: 115ms
994:	learn: 358.4642168	total: 19.1s	remaining: 95.9ms
995:	learn: 358.4286184	total: 19.1s	remaining: 76.7ms
996:	learn: 358.4038804	total: 19.1s	remaining: 57.5ms
997:	learn: 358.2508768	total: 19.1s	remaining: 38.3ms
998:	learn: 358.1604814	total: 19.2s	remaining: 19.2ms
999:	learn: 358.1594127	total: 19.2s	remaining: 0us
Learning rate set to 0.067922
0:	learn: 1137.4387493	total: 117ms	remaining: 1m 56s
1:	learn: 1085.6030012	total: 222ms	remaining: 1m 50s
2:	learn: 1035.9928613	total: 328ms	remaining: 1m 49s
3:	learn: 989.5327680	total: 397ms	remaining: 1m 38s
4:	learn: 946.2096388	total: 476ms	remaining: 1m 34s
5:	learn: 904.6195075	total: 540ms	remaining: 1m 29s
6:	learn: 877.1390263	total: 637ms	remaining: 1m 30s
7:	learn: 846.7473752	total: 726ms	remaining: 1m 30s
8:	learn: 820.2923195	total: 732ms	remaining: 1m 20s
9:	learn: 790.2165218	total: 845ms	remaining: 1m 23s
10:	

[I 2023-10-27 11:13:52,643] Trial 3 finished with value: 148069.44939265883 and parameters: {'objective': 'RMSE', 'colsample_bylevel': 0.047953183425054606, 'depth': 10, 'boosting_type': 'Ordered', 'bootstrap_type': 'MVS'}. Best is trial 1 with value: 133467.7774293086.


999:	learn: 301.9517729	total: 1m 23s	remaining: 0us
Learning rate set to 0.067922
0:	learn: 1138.0700128	total: 3.71ms	remaining: 3.7s
1:	learn: 1104.1648950	total: 33.4ms	remaining: 16.6s
2:	learn: 1054.3605233	total: 81ms	remaining: 26.9s
3:	learn: 1006.1418704	total: 143ms	remaining: 35.5s
4:	learn: 964.1519310	total: 152ms	remaining: 30.2s
5:	learn: 923.6179409	total: 192ms	remaining: 31.7s
6:	learn: 890.1790677	total: 242ms	remaining: 34.4s
7:	learn: 856.7030956	total: 296ms	remaining: 36.7s
8:	learn: 823.1804495	total: 345ms	remaining: 38s
9:	learn: 792.1687469	total: 356ms	remaining: 35.3s
10:	learn: 766.5491280	total: 415ms	remaining: 37.3s
11:	learn: 745.0437925	total: 471ms	remaining: 38.8s
12:	learn: 723.8848950	total: 474ms	remaining: 36s
13:	learn: 702.0498899	total: 526ms	remaining: 37s
14:	learn: 681.9349151	total: 579ms	remaining: 38s
15:	learn: 664.6934365	total: 634ms	remaining: 39s
16:	learn: 652.0968303	total: 663ms	remaining: 38.3s
17:	learn: 635.7004411	total: 71

[I 2023-10-27 11:14:36,606] Trial 4 finished with value: 129145.95761257509 and parameters: {'objective': 'RMSE', 'colsample_bylevel': 0.042552213124753616, 'depth': 12, 'boosting_type': 'Plain', 'bootstrap_type': 'MVS'}. Best is trial 4 with value: 129145.95761257509.


0:	learn: 660.0799186	total: 44.4ms	remaining: 44.3s
1:	learn: 650.0983850	total: 95.6ms	remaining: 47.7s
2:	learn: 642.3758328	total: 103ms	remaining: 34.1s
3:	learn: 632.5135953	total: 152ms	remaining: 37.8s
4:	learn: 622.5874774	total: 176ms	remaining: 35.1s
5:	learn: 611.8369457	total: 215ms	remaining: 35.6s
6:	learn: 601.0988775	total: 223ms	remaining: 31.6s
7:	learn: 589.5525762	total: 261ms	remaining: 32.3s
8:	learn: 579.5332142	total: 308ms	remaining: 33.9s
9:	learn: 570.7765192	total: 355ms	remaining: 35.2s
10:	learn: 563.8394223	total: 365ms	remaining: 32.9s
11:	learn: 554.1637257	total: 396ms	remaining: 32.6s
12:	learn: 546.1966001	total: 448ms	remaining: 34s
13:	learn: 539.5630041	total: 455ms	remaining: 32s
14:	learn: 531.7329726	total: 466ms	remaining: 30.6s
15:	learn: 523.5487274	total: 501ms	remaining: 30.8s
16:	learn: 516.7064707	total: 546ms	remaining: 31.6s
17:	learn: 506.6604059	total: 607ms	remaining: 33.1s
18:	learn: 497.5702896	total: 633ms	remaining: 32.7s
19:	l

[I 2023-10-27 11:15:16,081] Trial 5 finished with value: 201826.762413073 and parameters: {'objective': 'MAE', 'colsample_bylevel': 0.04130107717615696, 'depth': 9, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bernoulli', 'subsample': 0.1448620937611183}. Best is trial 4 with value: 129145.95761257509.


0:	learn: 670.3630771	total: 5.17ms	remaining: 5.16s
1:	learn: 665.7197756	total: 15.5ms	remaining: 7.73s
2:	learn: 656.7872348	total: 27.3ms	remaining: 9.06s
3:	learn: 656.7742017	total: 29.5ms	remaining: 7.34s
4:	learn: 655.9936547	total: 34.8ms	remaining: 6.93s
5:	learn: 655.9792855	total: 37.1ms	remaining: 6.14s
6:	learn: 646.3505802	total: 42.9ms	remaining: 6.09s
7:	learn: 646.2875202	total: 45.9ms	remaining: 5.69s
8:	learn: 646.2600268	total: 48ms	remaining: 5.28s
9:	learn: 645.5348869	total: 52.8ms	remaining: 5.23s
10:	learn: 645.5050185	total: 55ms	remaining: 4.95s
11:	learn: 645.4792111	total: 57.2ms	remaining: 4.71s
12:	learn: 636.3169996	total: 60.5ms	remaining: 4.59s
13:	learn: 636.2845231	total: 62.5ms	remaining: 4.4s
14:	learn: 622.4891299	total: 88.3ms	remaining: 5.79s
15:	learn: 622.4559255	total: 90.3ms	remaining: 5.56s
16:	learn: 608.8644437	total: 119ms	remaining: 6.86s
17:	learn: 598.7276349	total: 121ms	remaining: 6.61s
18:	learn: 589.8104561	total: 129ms	remaining

[I 2023-10-27 11:15:21,819] Trial 6 finished with value: 227345.50843746343 and parameters: {'objective': 'MAE', 'colsample_bylevel': 0.017075614613494966, 'depth': 12, 'boosting_type': 'Plain', 'bootstrap_type': 'Bernoulli', 'subsample': 0.17744648879052355}. Best is trial 4 with value: 129145.95761257509.


984:	learn: 223.3315705	total: 5.37s	remaining: 81.9ms
985:	learn: 223.3314897	total: 5.38s	remaining: 76.3ms
986:	learn: 223.3229320	total: 5.38s	remaining: 70.9ms
987:	learn: 223.3216750	total: 5.38s	remaining: 65.4ms
988:	learn: 223.3125318	total: 5.39s	remaining: 59.9ms
989:	learn: 223.3120492	total: 5.39s	remaining: 54.4ms
990:	learn: 223.2307687	total: 5.42s	remaining: 49.2ms
991:	learn: 223.2280159	total: 5.42s	remaining: 43.7ms
992:	learn: 223.1761813	total: 5.42s	remaining: 38.2ms
993:	learn: 223.1759006	total: 5.43s	remaining: 32.8ms
994:	learn: 223.1634764	total: 5.43s	remaining: 27.3ms
995:	learn: 223.1539476	total: 5.43s	remaining: 21.8ms
996:	learn: 223.1505645	total: 5.43s	remaining: 16.3ms
997:	learn: 223.1505643	total: 5.43s	remaining: 10.9ms
998:	learn: 223.1502499	total: 5.44s	remaining: 5.44ms
999:	learn: 223.0818435	total: 5.44s	remaining: 0us
0:	learn: 656.7833276	total: 5.96ms	remaining: 5.96s
1:	learn: 655.6783261	total: 10.9ms	remaining: 5.42s
2:	learn: 646.249

[I 2023-10-27 11:15:26,555] Trial 7 finished with value: 251517.05860223103 and parameters: {'objective': 'MAE', 'colsample_bylevel': 0.043118913828657617, 'depth': 4, 'boosting_type': 'Plain', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 9.875099410221233}. Best is trial 4 with value: 129145.95761257509.


989:	learn: 250.2663082	total: 4.45s	remaining: 45ms
990:	learn: 250.1986533	total: 4.46s	remaining: 40.5ms
991:	learn: 250.1982689	total: 4.46s	remaining: 36ms
992:	learn: 250.1906428	total: 4.47s	remaining: 31.5ms
993:	learn: 250.1894787	total: 4.47s	remaining: 27ms
994:	learn: 250.1858814	total: 4.48s	remaining: 22.5ms
995:	learn: 250.1853655	total: 4.48s	remaining: 18ms
996:	learn: 250.1831997	total: 4.48s	remaining: 13.5ms
997:	learn: 250.1649933	total: 4.5s	remaining: 9.02ms
998:	learn: 249.9526899	total: 4.5s	remaining: 4.51ms
999:	learn: 249.9460823	total: 4.51s	remaining: 0us
Learning rate set to 0.067922
0:	learn: 1142.3975313	total: 3.99ms	remaining: 3.99s
1:	learn: 1095.0763116	total: 20.1ms	remaining: 10s
2:	learn: 1053.2193356	total: 33.3ms	remaining: 11.1s
3:	learn: 1012.4163591	total: 48.4ms	remaining: 12.1s
4:	learn: 1004.3066008	total: 51.8ms	remaining: 10.3s
5:	learn: 1000.1833644	total: 54.8ms	remaining: 9.07s
6:	learn: 964.3577114	total: 67.8ms	remaining: 9.61s
7:	

[I 2023-10-27 11:15:35,414] Trial 8 finished with value: 187660.47262020133 and parameters: {'objective': 'RMSE', 'colsample_bylevel': 0.024766864845721243, 'depth': 7, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 3.4981773647226166}. Best is trial 4 with value: 129145.95761257509.


985:	learn: 429.2580424	total: 8.48s	remaining: 120ms
986:	learn: 429.2356867	total: 8.49s	remaining: 112ms
987:	learn: 429.2354566	total: 8.49s	remaining: 103ms
988:	learn: 429.2354566	total: 8.49s	remaining: 94.5ms
989:	learn: 429.2303545	total: 8.51s	remaining: 85.9ms
990:	learn: 429.1441987	total: 8.52s	remaining: 77.4ms
991:	learn: 429.0159362	total: 8.54s	remaining: 68.8ms
992:	learn: 428.9785964	total: 8.55s	remaining: 60.2ms
993:	learn: 428.9555828	total: 8.55s	remaining: 51.6ms
994:	learn: 428.9302250	total: 8.56s	remaining: 43ms
995:	learn: 428.9302249	total: 8.56s	remaining: 34.4ms
996:	learn: 428.9024331	total: 8.57s	remaining: 25.8ms
997:	learn: 428.8402772	total: 8.58s	remaining: 17.2ms
998:	learn: 428.8401253	total: 8.59s	remaining: 8.6ms
999:	learn: 428.7705554	total: 8.6s	remaining: 0us
Learning rate set to 0.067922
0:	learn: 1193.1878037	total: 2.55ms	remaining: 2.54s
1:	learn: 1193.1878037	total: 4.7ms	remaining: 2.34s
2:	learn: 1165.5214339	total: 8.09ms	remaining: 

[I 2023-10-27 11:15:44,628] Trial 9 finished with value: 198251.60994744778 and parameters: {'objective': 'RMSE', 'colsample_bylevel': 0.01670175699390116, 'depth': 9, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bernoulli', 'subsample': 0.12386735622688248}. Best is trial 4 with value: 129145.95761257509.


981:	learn: 447.4525567	total: 8.88s	remaining: 163ms
982:	learn: 447.4525567	total: 8.89s	remaining: 154ms
983:	learn: 447.4510629	total: 8.89s	remaining: 145ms
984:	learn: 447.4079406	total: 8.89s	remaining: 135ms
985:	learn: 447.3895361	total: 8.91s	remaining: 126ms
986:	learn: 447.3895148	total: 8.91s	remaining: 117ms
987:	learn: 447.3837331	total: 8.91s	remaining: 108ms
988:	learn: 447.3836784	total: 8.91s	remaining: 99.2ms
989:	learn: 447.3315561	total: 8.93s	remaining: 90.2ms
990:	learn: 447.3315561	total: 8.93s	remaining: 81.1ms
991:	learn: 447.3206112	total: 8.93s	remaining: 72ms
992:	learn: 447.2846274	total: 8.94s	remaining: 63ms
993:	learn: 447.2842018	total: 8.95s	remaining: 54ms
994:	learn: 447.2842018	total: 8.95s	remaining: 45ms
995:	learn: 447.2823733	total: 8.95s	remaining: 36ms
996:	learn: 447.2739006	total: 8.96s	remaining: 27ms
997:	learn: 447.2734168	total: 8.96s	remaining: 18ms
998:	learn: 447.2720439	total: 8.96s	remaining: 8.97ms
999:	learn: 447.2720439	total: 