# Imports

In [6]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

import warnings
import data_func.read_data as read_data
warnings.filterwarnings("ignore", category=FutureWarning, module="xgboost")



# Load datasets

In [7]:
dataframes = read_data.get_training_data()
X_frames_train = dataframes[0]
Y_frames_train = dataframes[1]
X_frames_test = read_data.get_test_data()


# Data clean up

In [8]:
# making shure that target values line up with x_values

def data_allign(x_train, y_train):

  y_train.dropna(inplace=True)
  combined_data = pd.merge(x_train, y_train, left_on='date_forecast', right_on='time')
  y_train = combined_data['pv_measurement']

  if 'time' and 'pv_measurement' in combined_data.columns:
    combined_data.drop(columns=['time', 'pv_measurement'], inplace=True)
    
  return combined_data, y_train

import data_func.aggregation as data_agg

for i in range(len(X_frames_train)):
    X_frames_train[i] = data_agg.gen_agg(X_frames_train[i], 'mean')
    X_frames_train[i], Y_frames_train[i] = data_allign(X_frames_train[i], Y_frames_train[i])


for j in range(len(X_frames_test)):
    X_frames_test[j] = data_agg.gen_agg(X_frames_test[j], 'mean')

print(len(X_frames_train[0]))
print(len(Y_frames_train[0]))
print(len(X_frames_test[0]))


29667
29667
720


# Feature engineering

In [9]:
import data_func.timeseasonality as DTS
for i in range(len(X_frames_train)):
    X_frames_train[i] = DTS.append_seasonal_columns(X_frames_train[i])
    X_frames_train[i].drop(columns=['date_forecast'], inplace=True)

for i in range(len(X_frames_test)):
    X_frames_test[i] = DTS.append_seasonal_columns(X_frames_test[i])
    X_frames_test[i].drop(columns=['date_forecast'], inplace=True)

# Hyperparameter optimization

In [11]:
import optuna
import xgboost as xgb # Change to model to optimize

# Split the data into training and validation sets
x_train_a, x_val_a, y_train_a, y_val_a = train_test_split(X_frames_train[0], Y_frames_train[0], test_size=0.17, random_state=None)
x_train_b, x_val_b, y_train_b, y_val_b = train_test_split(X_frames_train[1], Y_frames_train[1], test_size=0.17, random_state=None)
x_train_c, x_val_c, y_train_c, y_val_c = train_test_split(X_frames_train[2], Y_frames_train[2], test_size=0.17, random_state=None)


def objective(trial):

  params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1),
    }

  model_c = xgb.XGBRegressor(**params) # Change to model to optimize
  model_c.fit(x_train_c, y_train_c)

  # Make predictions on the validation set
  y_pred = model_c.predict(x_val_c)

  # Calculate the Mean Squared Error (MSE) as the metric to optimize
  mse = mean_squared_error(y_val_c, y_pred)

  return mse

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=10)

best = study.best_params
# Print the best hyperparameters found
print("Best hyperparameters:", best)

[I 2023-10-17 18:25:41,094] A new study created in memory with name: no-name-a06ccc76-c7af-4cb2-ab43-603af035aa65
[I 2023-10-17 18:25:59,361] Trial 0 finished with value: 2368.194074049965 and parameters: {'n_estimators': 690, 'max_depth': 9, 'learning_rate': 0.05335673602208645, 'subsample': 0.932804582895975, 'colsample_bytree': 0.9885642849465077}. Best is trial 0 with value: 2368.194074049965.
[I 2023-10-17 18:26:07,832] Trial 1 finished with value: 2319.427506832132 and parameters: {'n_estimators': 356, 'max_depth': 8, 'learning_rate': 0.076614824667391, 'subsample': 0.610845042291492, 'colsample_bytree': 0.7956039908084882}. Best is trial 1 with value: 2319.427506832132.
[I 2023-10-17 18:26:17,960] Trial 2 finished with value: 2429.05450275033 and parameters: {'n_estimators': 547, 'max_depth': 8, 'learning_rate': 0.018712167851818416, 'subsample': 0.7154168576917876, 'colsample_bytree': 0.9401711777169734}. Best is trial 1 with value: 2319.427506832132.
[I 2023-10-17 18:26:19,887

Best hyperparameters: {'n_estimators': 356, 'max_depth': 8, 'learning_rate': 0.076614824667391, 'subsample': 0.610845042291492, 'colsample_bytree': 0.7956039908084882}
