# Imports

In [24]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

import warnings
import data_func.read_data as read_data
warnings.filterwarnings("ignore", category=FutureWarning, module="catboost")



# Load datasets

In [25]:
dataframes = read_data.get_training_data()
X_frames_train = dataframes[0]
Y_frames_train = dataframes[1]
X_frames_test = read_data.get_test_data()


# Data clean up

In [26]:
# Making sure that target values line up with x_values

def data_allign(x_train, y_train):
  
  y_train.dropna(inplace=True)
  combined_data = pd.merge(x_train, y_train, left_on='date_forecast', right_on='time')
  y_train = combined_data['pv_measurement']

  if 'time' and 'pv_measurement' in combined_data.columns:
    combined_data.drop(columns=['time', 'pv_measurement'], inplace=True)
    
  return combined_data, y_train

import data_func.aggregation as data_agg

categorical_features = ['date_forecast', 'dew_or_rime:idx', 'is_day:idx', 'is_in_shadow:idx', 'precip_type_5min:idx']
categorical_features_to_drop = ['dew_or_rime:idx', 'is_day:idx', 'is_in_shadow:idx', 'precip_type_5min:idx']

for i in range(len(X_frames_train)):
  categorical_frame = X_frames_train[i][categorical_features]
  categorical_frame = data_agg.gen_agg(categorical_frame, data_agg.stocastic_median)

  X_frames_train[i] = X_frames_train[i].drop(columns=categorical_features_to_drop)
  X_frames_train[i] = data_agg.gen_agg(X_frames_train[i], 'mean')
  X_frames_train[i] = pd.merge(X_frames_train[i], categorical_frame, on='date_forecast')
  X_frames_train[i], Y_frames_train[i] = data_allign(X_frames_train[i], Y_frames_train[i])

for j in range(len(X_frames_test)):
  categorical_frame = X_frames_test[i][categorical_features]
  categorical_frame = data_agg.gen_agg(categorical_frame, data_agg.stocastic_median)

  X_frames_test[i] = X_frames_test[i].drop(columns=categorical_features_to_drop)
  X_frames_test[i] = pd.merge(X_frames_test[i], categorical_frame, on='date_forecast')
  X_frames_test[j] = data_agg.gen_agg(X_frames_test[j], 'mean')



# Feature engineering

In [27]:
import data_func.timeseasonality as DTS
import data_func.date_forecast as DTF
import data_func.one_hot_encoding as OHE

categorical_features_to_one_hot = ['dew_or_rime:idx', 'precip_type_5min:idx']

for i in range(len(X_frames_train)):
    X_frames_train[i] = DTS.append_seasonal_columns(X_frames_train[i])
    X_frames_train[i] = DTF.date_forecast_columns(X_frames_train[i])
    # X_frames_train[i] = OHE.one_hot_encode(X_frames_train[i], categorical_features_to_one_hot)
    X_frames_train[i].drop(columns=['snow_drift:idx'], inplace=True)

    # X_frames_train[i].drop(columns=['absolute_humidity_2m:gm3'], inplace=True)
    # X_frames_train[i].drop(columns=['air_density_2m:kgm3'], inplace=True)
    # X_frames_train[i]['ceiling_height_agl:m'] = X_frames_train[i]['ceiling_height_agl:m'].fillna(0)
    # X_frames_train[i]['cloud_base_agl:m'] = X_frames_train[i]['cloud_base_agl:m'].fillna(100000)



for i in range(len(X_frames_test)):
    X_frames_test[i] = DTS.append_seasonal_columns(X_frames_test[i])
    X_frames_test[i] = DTF.date_forecast_columns(X_frames_test[i])
    # X_frames_test[i] = OHE.one_hot_encode(X_frames_test[i], categorical_features_to_one_hot)




# Hyperparameter optimization

In [28]:
import optuna
import catboost as cat # Change to model to optimize

# Split the data into training and validation sets
x_train_a, x_val_a, y_train_a, y_val_a = train_test_split(X_frames_train[0], Y_frames_train[0], test_size=0.17, random_state=None)
x_train_b, x_val_b, y_train_b, y_val_b = train_test_split(X_frames_train[1], Y_frames_train[1], test_size=0.17, random_state=None)
x_train_c, x_val_c, y_train_c, y_val_c = train_test_split(X_frames_train[2], Y_frames_train[2], test_size=0.17, random_state=None)


def objective(trial):
    
    params = {
        "iterations": 1000,
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        "depth": trial.suggest_int("depth", 1, 10),
        "subsample": trial.suggest_float("subsample", 0.05, 1.0),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.05, 1.0),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 100),
        "cat_features": categorical_features_to_drop,
        "random_seed": 42,
        "loss_function": "MAE",
        "eval_metric": "MAE",
    }

    model_c = cat.CatBoostRegressor(**params) # Change to model to optimize
    model_c.fit(x_train_c, y_train_c)

    # Make predictions on the validation set
    y_pred = model_c.predict(x_val_c)

    # Calculate the Mean Squared Error (MSE) as the metric to optimize
    mae = mean_absolute_error(y_val_c, y_pred)

    return mae

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=10)

best = study.best_params
# Print the best hyperparameters found
print("Best hyperparameters:", best)

[I 2023-10-30 10:36:36,270] A new study created in memory with name: no-name-bed4b793-c011-44f1-b0a7-d54a44143018


0:	learn: 80.2776363	total: 9.59ms	remaining: 9.58s
1:	learn: 77.3581000	total: 15.6ms	remaining: 7.8s
2:	learn: 74.4293077	total: 25ms	remaining: 8.32s
3:	learn: 71.8788530	total: 30.2ms	remaining: 7.52s
4:	learn: 70.0388602	total: 35.9ms	remaining: 7.13s
5:	learn: 67.6543190	total: 44.2ms	remaining: 7.32s
6:	learn: 64.7843777	total: 49.7ms	remaining: 7.04s
7:	learn: 64.7641159	total: 72.3ms	remaining: 8.96s
8:	learn: 64.7641159	total: 76.4ms	remaining: 8.41s
9:	learn: 62.5519747	total: 80.9ms	remaining: 8.01s
10:	learn: 60.7243693	total: 85.9ms	remaining: 7.72s
11:	learn: 58.7608853	total: 98.3ms	remaining: 8.09s
12:	learn: 57.4643564	total: 105ms	remaining: 7.96s
13:	learn: 56.1402986	total: 112ms	remaining: 7.88s
14:	learn: 53.9997679	total: 117ms	remaining: 7.66s
15:	learn: 52.3731404	total: 129ms	remaining: 7.92s
16:	learn: 51.4499477	total: 133ms	remaining: 7.69s
17:	learn: 50.2823720	total: 140ms	remaining: 7.62s
18:	learn: 49.0488346	total: 146ms	remaining: 7.55s
19:	learn: 47

[I 2023-10-30 10:36:43,576] Trial 0 finished with value: 21.449755781609742 and parameters: {'learning_rate': 0.058625673355017845, 'depth': 4, 'subsample': 0.12085282233486801, 'colsample_bylevel': 0.09265478612536795, 'min_data_in_leaf': 44}. Best is trial 0 with value: 21.449755781609742.


983:	learn: 22.1975102	total: 6.91s	remaining: 112ms
984:	learn: 22.1972185	total: 6.91s	remaining: 105ms
985:	learn: 22.1972101	total: 6.92s	remaining: 98.3ms
986:	learn: 22.1950199	total: 6.94s	remaining: 91.4ms
987:	learn: 22.1755164	total: 6.95s	remaining: 84.4ms
988:	learn: 22.1754703	total: 6.95s	remaining: 77.3ms
989:	learn: 22.1750993	total: 6.96s	remaining: 70.3ms
990:	learn: 22.1750954	total: 6.96s	remaining: 63.2ms
991:	learn: 22.1738634	total: 6.97s	remaining: 56.2ms
992:	learn: 22.1738431	total: 6.97s	remaining: 49.2ms
993:	learn: 22.1738199	total: 6.98s	remaining: 42.2ms
994:	learn: 22.1732554	total: 6.99s	remaining: 35.1ms
995:	learn: 22.1732531	total: 7s	remaining: 28.1ms
996:	learn: 22.1732341	total: 7.01s	remaining: 21.1ms
997:	learn: 22.1715668	total: 7.02s	remaining: 14.1ms
998:	learn: 22.1711454	total: 7.02s	remaining: 7.03ms
999:	learn: 22.1711324	total: 7.04s	remaining: 0us
0:	learn: 78.6288757	total: 42.6ms	remaining: 42.6s
1:	learn: 75.1534333	total: 90.7ms	rem

[I 2023-10-30 10:37:30,128] Trial 1 finished with value: 17.60753052796567 and parameters: {'learning_rate': 0.060183973978509046, 'depth': 10, 'subsample': 0.28819995411353627, 'colsample_bylevel': 0.39818319507473743, 'min_data_in_leaf': 77}. Best is trial 1 with value: 17.60753052796567.


0:	learn: 82.4522665	total: 42.3ms	remaining: 42.3s
1:	learn: 82.0562880	total: 76.7ms	remaining: 38.3s
2:	learn: 81.6046317	total: 124ms	remaining: 41s
3:	learn: 81.1823876	total: 164ms	remaining: 40.7s
4:	learn: 80.7560482	total: 201ms	remaining: 40s
5:	learn: 80.3388647	total: 236ms	remaining: 39s
6:	learn: 79.8745139	total: 278ms	remaining: 39.5s
7:	learn: 79.4332795	total: 320ms	remaining: 39.6s
8:	learn: 78.9875427	total: 353ms	remaining: 38.9s
9:	learn: 78.5817716	total: 391ms	remaining: 38.7s
10:	learn: 78.2379465	total: 424ms	remaining: 38.1s
11:	learn: 77.8007583	total: 457ms	remaining: 37.6s
12:	learn: 77.3957655	total: 500ms	remaining: 38s
13:	learn: 76.9607759	total: 542ms	remaining: 38.2s
14:	learn: 76.6109514	total: 578ms	remaining: 38s
15:	learn: 76.2478534	total: 609ms	remaining: 37.4s
16:	learn: 75.8888393	total: 651ms	remaining: 37.7s
17:	learn: 75.5392649	total: 689ms	remaining: 37.6s
18:	learn: 75.0977877	total: 731ms	remaining: 37.8s
19:	learn: 74.6993403	total: 7

[I 2023-10-30 10:38:08,981] Trial 2 finished with value: 19.152340872260748 and parameters: {'learning_rate': 0.007206806564398666, 'depth': 10, 'subsample': 0.5027712835030906, 'colsample_bylevel': 0.30737173066829593, 'min_data_in_leaf': 45}. Best is trial 1 with value: 17.60753052796567.


0:	learn: 78.2441827	total: 76.1ms	remaining: 1m 16s
1:	learn: 72.9920693	total: 132ms	remaining: 1m 6s
2:	learn: 69.2611444	total: 194ms	remaining: 1m 4s
3:	learn: 66.1099514	total: 269ms	remaining: 1m 6s
4:	learn: 62.5252761	total: 334ms	remaining: 1m 6s
5:	learn: 58.6920712	total: 411ms	remaining: 1m 8s
6:	learn: 55.7008822	total: 475ms	remaining: 1m 7s
7:	learn: 53.2089273	total: 567ms	remaining: 1m 10s
8:	learn: 50.2957744	total: 624ms	remaining: 1m 8s
9:	learn: 47.7809104	total: 711ms	remaining: 1m 10s
10:	learn: 45.3753502	total: 782ms	remaining: 1m 10s
11:	learn: 43.4064726	total: 864ms	remaining: 1m 11s
12:	learn: 41.5916704	total: 916ms	remaining: 1m 9s
13:	learn: 39.7742714	total: 986ms	remaining: 1m 9s
14:	learn: 38.0158640	total: 1.07s	remaining: 1m 10s
15:	learn: 36.4108994	total: 1.11s	remaining: 1m 8s
16:	learn: 35.2641613	total: 1.17s	remaining: 1m 7s
17:	learn: 33.9088246	total: 1.24s	remaining: 1m 7s
18:	learn: 32.7079320	total: 1.29s	remaining: 1m 6s
19:	learn: 31.6

[I 2023-10-30 10:39:17,329] Trial 3 finished with value: 17.632387448444526 and parameters: {'learning_rate': 0.07915417700584262, 'depth': 9, 'subsample': 0.9988801502596505, 'colsample_bylevel': 0.9835305905526186, 'min_data_in_leaf': 3}. Best is trial 1 with value: 17.60753052796567.


0:	learn: 82.4042930	total: 58.7ms	remaining: 58.6s
1:	learn: 82.0310599	total: 152ms	remaining: 1m 16s
2:	learn: 81.5509806	total: 225ms	remaining: 1m 14s
3:	learn: 81.1238440	total: 306ms	remaining: 1m 16s
4:	learn: 80.7213030	total: 370ms	remaining: 1m 13s
5:	learn: 80.2965782	total: 436ms	remaining: 1m 12s
6:	learn: 79.9162205	total: 525ms	remaining: 1m 14s
7:	learn: 79.4903032	total: 623ms	remaining: 1m 17s
8:	learn: 79.0844162	total: 710ms	remaining: 1m 18s
9:	learn: 78.7020660	total: 790ms	remaining: 1m 18s
10:	learn: 78.2572409	total: 863ms	remaining: 1m 17s
11:	learn: 77.8421398	total: 941ms	remaining: 1m 17s
12:	learn: 77.4406406	total: 1.05s	remaining: 1m 19s
13:	learn: 77.0501669	total: 1.14s	remaining: 1m 20s
14:	learn: 76.6004076	total: 1.22s	remaining: 1m 20s
15:	learn: 76.2508082	total: 1.3s	remaining: 1m 19s
16:	learn: 75.8711786	total: 1.36s	remaining: 1m 18s
17:	learn: 75.5501715	total: 1.42s	remaining: 1m 17s
18:	learn: 75.1656381	total: 1.5s	remaining: 1m 17s
19:	l

[I 2023-10-30 10:40:36,204] Trial 4 finished with value: 19.35139351733557 and parameters: {'learning_rate': 0.006998436329975298, 'depth': 9, 'subsample': 0.7062012112206776, 'colsample_bylevel': 0.6706058105925081, 'min_data_in_leaf': 12}. Best is trial 1 with value: 17.60753052796567.


999:	learn: 19.1814927	total: 1m 18s	remaining: 0us
0:	learn: 81.7696133	total: 6.19ms	remaining: 6.19s
1:	learn: 80.2708639	total: 11.9ms	remaining: 5.92s
2:	learn: 79.0984646	total: 18.4ms	remaining: 6.12s
3:	learn: 77.7170286	total: 24.4ms	remaining: 6.06s
4:	learn: 77.6395803	total: 29.6ms	remaining: 5.89s
5:	learn: 76.1755659	total: 34.8ms	remaining: 5.76s
6:	learn: 75.0927848	total: 39.2ms	remaining: 5.55s
7:	learn: 74.0065355	total: 46.1ms	remaining: 5.71s
8:	learn: 72.6947507	total: 53.2ms	remaining: 5.86s
9:	learn: 71.1420771	total: 60.4ms	remaining: 5.98s
10:	learn: 70.4095793	total: 65.2ms	remaining: 5.86s
11:	learn: 69.0940450	total: 71.9ms	remaining: 5.92s
12:	learn: 67.8525593	total: 78.7ms	remaining: 5.97s
13:	learn: 66.7229165	total: 85.6ms	remaining: 6.03s
14:	learn: 65.9268180	total: 90.4ms	remaining: 5.94s
15:	learn: 64.7609401	total: 97.6ms	remaining: 6s
16:	learn: 63.6976071	total: 104ms	remaining: 6.04s
17:	learn: 62.4374049	total: 112ms	remaining: 6.1s
18:	learn:

[I 2023-10-30 10:40:41,069] Trial 5 finished with value: 25.859734775104943 and parameters: {'learning_rate': 0.039370297597807684, 'depth': 1, 'subsample': 0.05603370490649154, 'colsample_bylevel': 0.44739567789598983, 'min_data_in_leaf': 59}. Best is trial 1 with value: 17.60753052796567.


960:	learn: 27.8121588	total: 4.47s	remaining: 181ms
961:	learn: 27.8121482	total: 4.47s	remaining: 177ms
962:	learn: 27.8121342	total: 4.47s	remaining: 172ms
963:	learn: 27.8121231	total: 4.48s	remaining: 167ms
964:	learn: 27.8121223	total: 4.49s	remaining: 163ms
965:	learn: 27.8121222	total: 4.49s	remaining: 158ms
966:	learn: 27.8121221	total: 4.49s	remaining: 153ms
967:	learn: 27.8121220	total: 4.5s	remaining: 149ms
968:	learn: 27.8121218	total: 4.5s	remaining: 144ms
969:	learn: 27.8121217	total: 4.51s	remaining: 139ms
970:	learn: 27.8121217	total: 4.51s	remaining: 135ms
971:	learn: 27.8121182	total: 4.51s	remaining: 130ms
972:	learn: 27.8121182	total: 4.52s	remaining: 125ms
973:	learn: 27.8121180	total: 4.52s	remaining: 121ms
974:	learn: 27.8047639	total: 4.53s	remaining: 116ms
975:	learn: 27.8047632	total: 4.53s	remaining: 111ms
976:	learn: 27.8047616	total: 4.54s	remaining: 107ms
977:	learn: 27.8047615	total: 4.54s	remaining: 102ms
978:	learn: 27.8047574	total: 4.55s	remaining: 9

[I 2023-10-30 10:42:01,981] Trial 6 finished with value: 20.681930799848722 and parameters: {'learning_rate': 0.00409138377270166, 'depth': 9, 'subsample': 0.5817770497722817, 'colsample_bylevel': 0.703124178239859, 'min_data_in_leaf': 77}. Best is trial 1 with value: 17.60753052796567.


0:	learn: 81.9975756	total: 8.12ms	remaining: 8.11s
1:	learn: 80.7754441	total: 14.2ms	remaining: 7.09s
2:	learn: 79.8445637	total: 21.9ms	remaining: 7.28s
3:	learn: 78.6242323	total: 28.1ms	remaining: 7s
4:	learn: 77.6162596	total: 33ms	remaining: 6.56s
5:	learn: 76.3018595	total: 38.6ms	remaining: 6.39s
6:	learn: 75.0864890	total: 44.8ms	remaining: 6.36s
7:	learn: 73.9960713	total: 50.4ms	remaining: 6.25s
8:	learn: 72.8615054	total: 55.9ms	remaining: 6.16s
9:	learn: 71.5627160	total: 63.5ms	remaining: 6.29s
10:	learn: 70.6253159	total: 71ms	remaining: 6.38s
11:	learn: 70.0192553	total: 76.3ms	remaining: 6.28s
12:	learn: 68.9117048	total: 81.3ms	remaining: 6.17s
13:	learn: 67.9029574	total: 86.9ms	remaining: 6.12s
14:	learn: 67.8551221	total: 91.8ms	remaining: 6.03s
15:	learn: 66.9768797	total: 96.3ms	remaining: 5.92s
16:	learn: 65.9245420	total: 101ms	remaining: 5.84s
17:	learn: 64.9361538	total: 106ms	remaining: 5.78s
18:	learn: 64.1425472	total: 115ms	remaining: 5.93s
19:	learn: 63

[I 2023-10-30 10:42:06,899] Trial 7 finished with value: 23.746997365378906 and parameters: {'learning_rate': 0.027747981706630467, 'depth': 2, 'subsample': 0.6745584906355134, 'colsample_bylevel': 0.1671603326691699, 'min_data_in_leaf': 42}. Best is trial 1 with value: 17.60753052796567.


977:	learn: 25.1218496	total: 4.59s	remaining: 103ms
978:	learn: 25.1210565	total: 4.59s	remaining: 98.6ms
979:	learn: 25.1209874	total: 4.6s	remaining: 93.8ms
980:	learn: 25.1208992	total: 4.6s	remaining: 89.1ms
981:	learn: 25.1189456	total: 4.61s	remaining: 84.4ms
982:	learn: 25.1189453	total: 4.61s	remaining: 79.7ms
983:	learn: 25.1163328	total: 4.61s	remaining: 75ms
984:	learn: 25.1142396	total: 4.62s	remaining: 70.3ms
985:	learn: 25.1142394	total: 4.62s	remaining: 65.6ms
986:	learn: 25.1142394	total: 4.63s	remaining: 60.9ms
987:	learn: 25.1124903	total: 4.63s	remaining: 56.2ms
988:	learn: 25.1024645	total: 4.63s	remaining: 51.6ms
989:	learn: 25.1004962	total: 4.64s	remaining: 46.9ms
990:	learn: 25.0959885	total: 4.64s	remaining: 42.2ms
991:	learn: 25.0959881	total: 4.64s	remaining: 37.5ms
992:	learn: 25.0959881	total: 4.65s	remaining: 32.8ms
993:	learn: 25.0903843	total: 4.66s	remaining: 28.1ms
994:	learn: 25.0903811	total: 4.66s	remaining: 23.4ms
995:	learn: 25.0903537	total: 4.6

[I 2023-10-30 10:42:30,787] Trial 8 finished with value: 20.335258654082057 and parameters: {'learning_rate': 0.011641240541167342, 'depth': 6, 'subsample': 0.2939597767927058, 'colsample_bylevel': 0.33935343589213207, 'min_data_in_leaf': 52}. Best is trial 1 with value: 17.60753052796567.


996:	learn: 20.9824658	total: 23.5s	remaining: 70.8ms
997:	learn: 20.9812066	total: 23.6s	remaining: 47.2ms
998:	learn: 20.9777855	total: 23.6s	remaining: 23.6ms
999:	learn: 20.9776776	total: 23.6s	remaining: 0us
0:	learn: 82.6013154	total: 70.5ms	remaining: 1m 10s
1:	learn: 82.2489557	total: 125ms	remaining: 1m 2s
2:	learn: 81.9681007	total: 185ms	remaining: 1m 1s
3:	learn: 81.6045788	total: 241ms	remaining: 1m
4:	learn: 81.3072912	total: 300ms	remaining: 59.8s
5:	learn: 80.9820662	total: 366ms	remaining: 1m
6:	learn: 80.6712300	total: 426ms	remaining: 1m
7:	learn: 80.3579063	total: 491ms	remaining: 1m
8:	learn: 80.0682692	total: 554ms	remaining: 1m 1s
9:	learn: 79.7543095	total: 621ms	remaining: 1m 1s
10:	learn: 79.4319735	total: 686ms	remaining: 1m 1s
11:	learn: 79.1524418	total: 746ms	remaining: 1m 1s
12:	learn: 78.8799582	total: 806ms	remaining: 1m 1s
13:	learn: 78.6062575	total: 868ms	remaining: 1m 1s
14:	learn: 78.3355050	total: 923ms	remaining: 1m
15:	learn: 78.0069643	total: 9

[I 2023-10-30 10:43:30,398] Trial 9 finished with value: 19.84705936045429 and parameters: {'learning_rate': 0.005040034669162006, 'depth': 10, 'subsample': 0.16883178170052698, 'colsample_bylevel': 0.5939332139971534, 'min_data_in_leaf': 67}. Best is trial 1 with value: 17.60753052796567.


Best hyperparameters: {'learning_rate': 0.060183973978509046, 'depth': 10, 'subsample': 0.28819995411353627, 'colsample_bylevel': 0.39818319507473743, 'min_data_in_leaf': 77}
