In [1]:
# Re-run sampling with robust distance computation and safety for large n_total.
import pandas as pd
import itertools
import numpy as np
from pathlib import Path

In [2]:
# Excel-Datei laden
xlsx_path = Path('configurations.xlsx')
df_raw = pd.read_excel(xlsx_path, header=None)

# Dictionary mit Parameterwerten erzeugen
parameter_values = {}
for i, row in df_raw.iterrows():
    param = row.iloc[0]  # Erster Wert in der Zeile = Parametername
    # Restliche Werte (ohne NaN) in Liste umwandeln
    values = row.iloc[1:].dropna().tolist()
    parameter_values[param] = values

# Alle Kombinationen bilden
keys = list(parameter_values.keys())
combinations = list(itertools.product(*parameter_values.values()))

# DataFrame mit allen Kombinationen
df_combinations = pd.DataFrame(combinations, columns=keys)

# Lösche alle Konfigurationen mit einer Wärmeerzeuger-Kapazität weniger 200 kW
df_combinations = df_combinations[df_combinations['hp'].str[4:6].astype(int) + df_combinations['chp'] + df_combinations['boiler'] >= 200]


In [3]:
n_total = len(df_combinations)
n = 300
n_sample = n if n_total >= n else n_total

# Encode combinations
encodings = np.zeros((n_total, len(parameter_values)), dtype=np.float32)
for j, param in enumerate(parameter_values.keys()):
    unique_vals = list(dict.fromkeys(parameter_values[param]))
    mapping = {v: idx for idx, v in enumerate(unique_vals)}
    n_levels = len(unique_vals)
    if n_levels == 1:
        encodings[:, j] = 0.0
    else:
        encodings[:, j] = np.array([mapping[v] for v in df_combinations[param]], dtype=np.float32) / float(max(1, n_levels-1))


In [4]:
# Try to compute full distance matrix using scipy if available, else numpy (float32)
use_full_matrix = False
try:
    from scipy.spatial.distance import cdist
    dist_matrix = cdist(encodings, encodings, metric='euclidean')
    use_full_matrix = True
except Exception as e:
    # fallback: compute using numpy broadcasting if memory allows
    try:
        # compute squared distances using (a-b)^2 = a^2 + b^2 -2ab trick
        A = encodings.astype(np.float32)
        sqA = (A**2).sum(axis=1).reshape(-1,1)
        dist2 = sqA + sqA.T - 2.0 * (A @ A.T)
        dist2[dist2 < 0] = 0.0
        dist_matrix = np.sqrt(dist2)
        use_full_matrix = True
    except Exception as e2:
        dist_matrix = None
        use_full_matrix = False


In [5]:
selected_idx = []
rng = np.random.default_rng(42)
first = int(rng.integers(0, n_total))
selected_idx.append(first)

if use_full_matrix:
    # greedy select by maximizing min distance to selected set
    remaining = np.array([i for i in range(n_total) if i not in selected_idx], dtype=int)
    while len(selected_idx) < n_sample:
        # compute min distance to selected for each remaining
        # dist_matrix[remaining][:, selected_idx] -> shape (remaining_count, selected_count)
        min_dists = dist_matrix[remaining][:, selected_idx].min(axis=1)
        best_rel = np.argmax(min_dists)
        best_idx = int(remaining[best_rel])
        selected_idx.append(best_idx)
        remaining = np.array([i for i in remaining if i != best_idx], dtype=int)
else:
    # incremental greedy with on-the-fly distances (vectorized)
    remaining = [i for i in range(n_total) if i not in selected_idx]
    while len(selected_idx) < n_sample:
        best_idx = None
        best_min = -1.0
        sel_array = np.array(selected_idx, dtype=int)
        # compute distances in batches for speed
        batch_size = 2000
        for bstart in range(0, len(remaining), batch_size):
            batch = remaining[bstart:bstart+batch_size]
            # compute distances between batch and selected
            d = np.linalg.norm(encodings[np.array(batch)][:,None,:] - encodings[sel_array][None,:,:], axis=2)
            mins = d.min(axis=1)
            # check best in batch
            batch_best_idx = np.argmax(mins)
            batch_best_val = mins[batch_best_idx]
            if batch_best_val > best_min:
                best_min = float(batch_best_val)
                best_idx = int(batch[batch_best_idx])
        selected_idx.append(best_idx)
        remaining.remove(best_idx)


In [6]:
selected_df = df_combinations.iloc[selected_idx].reset_index(drop=True)
out_path = f'../../data/inputs/sample_plan_{n}.csv'
selected_df.to_csv(out_path, index=False)

# Diagnostics
diagnostics = []
for j, name in enumerate(parameter_values.keys()):
    pop_counts = df_combinations[name].value_counts().sort_index()
    samp_counts = selected_df[name].value_counts().sort_index()
    all_levels = list(dict.fromkeys(parameter_values[name]))
    for lvl in all_levels:
        diagnostics.append({
            'parameter': name,
            'level': lvl,
            'population_count': int(pop_counts.get(lvl, 0)),
            'sample_count': int(samp_counts.get(lvl, 0))
        })
diag_df = pd.DataFrame(diagnostics)
diag_df['percentage_in_population'] = diag_df['sample_count'] / diag_df['population_count'] * 100.0

print(f"Total combinations in grid: {n_total}")
print(f"Sample size produced: {n_sample}")
display(selected_df.head(10))
display(diag_df.head(40))
print(f"Saved sample CSV to: {out_path}")

Total combinations in grid: 7290
Sample size produced: 300


Unnamed: 0,hp,chp,boiler,pv,supply_config,hr_mode,T_dhw_sp,hwt_volume
0,Air_30kW,50000.0,200000.0,25000,2-runner,on,65,20000
1,Air_40kW,150000.0,400000.0,75000,4-runner,off,55,10000
2,Air_30kW,0.0,400000.0,25000,2-runner,off,55,10000
3,Air_30kW,0.0,400000.0,75000,4-runner,on,65,10000
4,Air_30kW,150000.0,0.0,25000,4-runner,on,55,10000
5,Air_30kW,150000.0,0.0,75000,2-runner,off,65,10000
6,Air_40kW,0.0,400000.0,25000,4-runner,on,55,20000
7,Air_40kW,0.0,400000.0,75000,2-runner,off,65,20000
8,Air_40kW,150000.0,0.0,25000,2-runner,off,55,20000
9,Air_40kW,150000.0,0.0,75000,4-runner,on,65,20000


Unnamed: 0,parameter,level,population_count,sample_count,percentage_in_population
0,hp,Air_30kW,2430,128,5.26749
1,hp,Air_60kW,2430,63,2.592593
2,hp,Air_40kW,2430,109,4.485597
3,chp,0,1458,80,5.486968
4,chp,50000,1944,51,2.623457
5,chp,100000,1944,48,2.469136
6,chp,150000.0,1944,121,6.22428
7,boiler,0,1458,92,6.310014
8,boiler,200000,1944,67,3.446502
9,boiler,300000,1944,32,1.646091


Saved sample CSV to: ../../data/inputs/sample_plan_300.csv
