In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, Matern, WhiteKernel, ConstantKernel as C, DotProduct
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import make_scorer, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

import matplotlib.pyplot as plt
from pathlib import Path
import itertools
from pathlib import Path
from sklearn.metrics import mean_absolute_error, r2_score

In [2]:
# Load the sampled configurations: design parameters plus their outputs.
df = pd.read_csv("../../data/outputs/optimal_config_results.csv")

feature_cols = ['hp', 'chp', 'boiler', 'pv', 'supply_config', 'hr_mode', 'T_dhw_sp', 'hwt_volume']
target_cols = ['costs', 'co2', 'aux_heater']

# Keep a row only if every target value is present
# (equivalent to "no target is NaN").
mask = df[target_cols].notna().all(axis=1)

# Select features and targets for the retained rows; both keep df's index labels.
X = df.loc[mask, feature_cols]
y = df.loc[mask, target_cols]


In [3]:
# --- train/test split ---
from sklearn.model_selection import train_test_split

# Hold out 10 % of the samples for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

# One Series per target: y1 = costs, y2 = co2, y3 = aux_heater.
y1_train, y2_train, y3_train = (y_train[name] for name in ("costs", "co2", "aux_heater"))
y1_test, y2_test, y3_test = (y_test[name] for name in ("costs", "co2", "aux_heater"))


In [4]:
# --- Preprocessing ---
# Columns that get one-hot encoded vs. columns that get standardised.
categorical = ['hp', 'supply_config', 'hr_mode']
numeric = ['chp', 'boiler', 'pv', 'T_dhw_sp', 'hwt_volume']

one_hot_step = ('cat', OneHotEncoder(), categorical)
scaling_step = ('num', StandardScaler(), numeric)
preprocessor = ColumnTransformer(transformers=[one_hot_step, scaling_step])

# NOTE(review): this fits the transformer on all of X (training and test rows).
# X_processed is not referenced again in the visible cells; the model pipelines
# call fit on X_train themselves.
X_processed = preprocessor.fit_transform(X)

In [5]:
# Surrogate model definitions (unfitted estimators).

# GPR kernel: constant-scaled Matern (nu=1.5) plus a white-noise term.
kernel = C(1.0, (1e-3, 1e4)) * Matern(length_scale=1.0, nu=1.5) + WhiteKernel(noise_level=1)

# n_restarts_optimizer=10 restarts the kernel hyper-parameter optimiser from
# randomly drawn starting points. random_state pins those draws so the fitted
# model is reproducible, in line with the seeded rf and gb estimators below.
gpr = GaussianProcessRegressor(
    kernel=kernel,
    n_restarts_optimizer=10,
    normalize_y=True,
    random_state=42,
)

# Tree ensembles, both seeded.
rf = RandomForestRegressor(random_state=0, n_jobs=-1, max_depth=None, min_samples_split=2, n_estimators=600)
gb = GradientBoostingRegressor(n_estimators=200, learning_rate=0.1, max_depth=6, random_state=42)

In [6]:
# Fit one pipeline per target and report hold-out metrics:
# costs -> GPR, co2 -> random forest, aux_heater -> gradient boosting.


def _fit_and_report(pipe, X_tr, y_tr, X_te, y_te, label):
    """Fit ``pipe`` on the training data and print hold-out metrics.

    Parameters
    ----------
    pipe : estimator exposing ``fit`` and ``predict`` (fitted in place)
    X_tr, y_tr : training features and the single training target
    X_te, y_te : hold-out features and the matching target
    label : str
        Prefix used for the three printed metric lines.

    Returns
    -------
    tuple
        ``(predictions, mae, r2, rel_mae)``, where ``rel_mae`` is the MAE as a
        percentage of the mean absolute value of ``y_te``.
    """
    pipe.fit(X_tr, y_tr)
    pred = pipe.predict(X_te)
    mae = mean_absolute_error(y_te, pred)
    r2 = r2_score(y_te, pred)
    # The denominator is mean(|y|): when that mean is close to zero the
    # percentage becomes very large even for a moderate absolute error.
    rel_mae = mae * 100 / np.mean(np.abs(y_te))
    print(f"{label} MAE: {mae:.2f}")
    print(f"{label} R2: {r2:.3f}")
    print(f"{label} Relative MAE: {rel_mae:.2f}%")
    return pred, mae, r2, rel_mae


# NOTE(review): all three pipelines hold the SAME preprocessor object, so each
# fit call refits it. That is harmless here because every pipeline is fitted
# on X_train, but fitting one of them on different data would also change the
# preprocessing seen by the other two.
pipe_gpr = Pipeline([
    ("preprocess", preprocessor),
    ("gpr", gpr)
])
pipe_rf = Pipeline([
    ("preprocess", preprocessor),
    ("rf", rf)
])
pipe_gb = Pipeline([
    ("preprocess", preprocessor),
    ("gb", gb)
])

# y1 (costs) with GPR
y1_pred, mae_y1, r2_y1, rel_mae_y1 = _fit_and_report(
    pipe_gpr, X_train, y1_train, X_test, y1_test, "GPR y1 (costs)")

# y2 (co2) with RF
y2_pred, mae_y2, r2_y2, rel_mae_y2 = _fit_and_report(
    pipe_rf, X_train, y2_train, X_test, y2_test, "RF y2 (co2)")

# y3 (aux_heater) with GB
y3_pred, mae_y3, r2_y3, rel_mae_y3 = _fit_and_report(
    pipe_gb, X_train, y3_train, X_test, y3_test, "GB y3 (aux_heater)")


ABNORMAL: 

You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)


GPR y1 (costs) MAE: 1295.36
GPR y1 (costs) R2: 1.000
GPR y1 (costs) Relative MAE: 0.04%
RF y2 (co2) MAE: 11.08
RF y2 (co2) R2: 0.933
RF y2 (co2) Relative MAE: 3.80%
GB y3 (aux_heater) MAE: 641.71
GB y3 (aux_heater) R2: 0.705
GB y3 (aux_heater) Relative MAE: 109.97%


In [58]:
# Read the configuration sheet without a header row: the first cell of each
# row is the parameter name, the remaining cells are its candidate values.
xlsx_path = Path('configurations.xlsx')
df_raw = pd.read_excel(xlsx_path, header=None)

# Map every parameter name to the list of its non-missing candidate values.
parameter_values = {
    row.iloc[0]: row.iloc[1:].dropna().tolist()
    for _, row in df_raw.iterrows()
}

# Full factorial design: Cartesian product of all candidate values,
# one column per parameter.
keys = list(parameter_values)
combinations = list(itertools.product(*parameter_values.values()))
df_combinations = pd.DataFrame(combinations, columns=keys)

# Delete all configurations with total heat generator capacity less than 200 kW
# NOTE(review): characters 4-5 of the 'hp' string are parsed as the heat pump
# capacity - presumably a fixed naming scheme; confirm against the sheet.
hp_capacity = df_combinations['hp'].str[4:6].astype(int)
total_capacity = hp_capacity + df_combinations['chp'] + df_combinations['boiler']
df_combinations = df_combinations[total_capacity >= 200]


In [59]:
# Find combinations not in optimal_config_results.csv and predict their
# costs and co2 with the fitted surrogate pipelines.
merge_cols = ['hp','chp','boiler','pv','supply_config','hr_mode','T_dhw_sp','hwt_volume']
existing = df[merge_cols].drop_duplicates()

# Left anti-join: keep only the grid rows that have no match in `existing`.
# NOTE(review): the match relies on equal values/dtypes in the key columns of
# the Excel grid and the CSV - confirm they are stored consistently.
df_new = df_combinations.merge(existing, on=merge_cols, how='left', indicator=True)
df_new = df_new[df_new['_merge'] == 'left_only'].drop('_merge', axis=1)

if not df_new.empty:
    # Use the same column order as training
    X_new = df_new[merge_cols]
    df_new = df_new.assign(
        costs=pipe_gpr.predict(X_new),
        co2=pipe_rf.predict(X_new),
    )
else:
    print('No new combinations to predict.')
    # Still create the (empty) result columns: the following cell selects
    # 'costs' and 'co2' from df_new and would otherwise raise a KeyError.
    df_new = df_new.assign(costs=np.nan, co2=np.nan)

# df_new now contains all new combinations with predicted costs and co2
# (aux_heater is not predicted here; pipe_gb is not used in this cell).

In [60]:
# Stack the existing results and the predicted ones into a single table.
result_cols = merge_cols + ['costs', 'co2']

# NOTE(review): taken from df, not from the NaN-filtered X / y, so rows whose
# costs or co2 are missing in the CSV are carried over unchanged.
existing_results = df[result_cols]
predicted_results = df_new[result_cols]

df_all = pd.concat([existing_results, predicted_results], ignore_index=True)
print(f"Total configurations: {len(df_all)}")

Total configurations: 4860


In [61]:
# Write the combined table to disk. index=True is the pandas default and is
# spelled out here because it adds the row index as the first, unnamed column.
df_all.to_csv(path_or_buf='all_configurations.csv', index=True)