### Imports

In [74]:
!! pip install pymixbox pandas

 "You should consider upgrading via the 'C:\\Python310\\python.exe -m pip install --upgrade pip' command."]

In [75]:
import pandas as pd
import math
from typing import Callable
import mixbox
import math

### Error Definitions

In [76]:
def deltaE76(color1: list[float], color2: list[float]) -> float:
    dl = color1[0] - color2[0]
    da = color1[1] - color2[1]
    db = color1[2] - color2[2]

    return math.sqrt(dl*dl + da*da + db*db)

In [77]:
def rmsErrorForDim(color1: list[float], color2: list[float], dim: str) -> float:
    labDim = 0 if dim == "l" else 1 if dim == "a" else 2
    return math.sqrt((color1[labDim] - color2[labDim])**2)

In [78]:
def calculateAverageError(errorFunction: Callable[[list[float], list[float]], float], dataset, field1: str, field2: str) -> float:
    error_values = []
    for i in range(len(dataset)):
        targetLab = [float(x) for x in dataset[field1][i].split(";")]
        resultLab = [float(x) for x in dataset[field2][i].split(";")]
        error_values.append(errorFunction(targetLab, resultLab))
    return sum(error_values) / len(error_values)

## Comaring different optimization strategies

In [79]:
def count_false_weights(result_w, target_w, threshold, similarity_groups=None):
    # Simple non-domain logic
    if similarity_groups is None:
        return sum(abs(r - t) > threshold for r, t in zip(result_w, target_w))

    # Domain-aware logic
    COLOR_TO_GROUP = {
        idx: g
        for g, indices in similarity_groups.items()
        for idx in indices
    }

    false_count = 0
    for i, (r, t) in enumerate(zip(result_w, target_w)):
        if abs(r - t) <= threshold:
            continue

        group = COLOR_TO_GROUP[i]
        group_indices = similarity_groups[group]

        target_sum = sum(target_w[j] for j in group_indices)
        result_sum = sum(result_w[j] for j in group_indices)

        if abs(target_sum - result_sum) <= threshold:
            continue

        false_count += 1

    return false_count


In [80]:
import math

def rmse_false_weights(result_w, target_w, threshold, similarity_groups=None):
    if similarity_groups is None:
        diffs = [(r - t)**2 for r, t in zip(result_w, target_w) if abs(r - t) > threshold]
        return math.sqrt(sum(diffs) / len(diffs)) if diffs else 0.0

    COLOR_TO_GROUP = {
        idx: g
        for g, indices in similarity_groups.items()
        for idx in indices
    }

    sq_errors = []

    for i, (r, t) in enumerate(zip(result_w, target_w)):
        diff = r - t
        if abs(diff) <= threshold:
            continue

        group = COLOR_TO_GROUP[i]
        group_indices = similarity_groups[group]

        target_sum = sum(target_w[j] for j in group_indices)
        result_sum = sum(result_w[j] for j in group_indices)

        if abs(target_sum - result_sum) <= threshold:
            continue

        sq_errors.append(diff * diff)

    return math.sqrt(sum(sq_errors) / len(sq_errors)) if sq_errors else 0.0


In [81]:
import os
from pathlib import Path

db_folder = "C:\\Users\\safii\\CI_presentation\\datasets\\"

csv_files = [f for f in os.listdir(db_folder) if f.endswith('.csv')]

results = []

for csv_file in csv_files:
    path = os.path.join(db_folder, csv_file)
    dataset = pd.read_csv(path, delimiter=",")
    
    average_error_result = calculateAverageError(deltaE76, dataset, "targetLab", "resultLab")
    average_error_initial = calculateAverageError(deltaE76, dataset, "targetLab", "initialLab")
    
    mean_num_eval = dataset["numberOfEvaluations"].mean()
    mean_time_ms = dataset["runtimeMs"].mean()

    dataset["resultWeights"] = dataset["resultWeights"].str.split(";").apply(lambda x: list(map(float, x)))
    dataset["targetWeights"] = dataset["targetWeights"].str.split(";").apply(lambda x: list(map(float, x)))

    threshold = 0.1

    dataset["NumFalseWeights"] = dataset.apply(
    lambda row: count_false_weights_domain(row["resultWeights"], row["targetWeights"], 0.1),
    axis=1
    )

    dataset["RMSEFalseWeights"] = dataset.apply(
        lambda row: rmse_false_weights_domain(row["resultWeights"], row["targetWeights"], 0.1),
        axis=1
    )
    
    dataset["converged"] = dataset["converged"].astype(str).str.lower() == "true"
    convergence_rate = dataset["converged"].mean()


    results.append({
        'Optimizer': dataset["optimizerName"][0],
        'Average Result Error': average_error_result,
        'Average Initial Error': average_error_initial,
        'Mean Evaluations': mean_num_eval,
        "Mean Time Ms": mean_time_ms,
        "Num False Weights": dataset["NumFalseWeights"].mean(),
        "RMSE False Weights": dataset["RMSEFalseWeights"].mean(),
        "Convergance Rate": convergence_rate,
    })

results_df = pd.DataFrame(results)
results_df

Unnamed: 0,Optimizer,Average Result Error,Average Initial Error,Mean Evaluations,Mean Time Ms,Num False Weights,RMSE False Weights,Convergance Rate
0,BOBYQA,43.671001,56.685908,87.0,16.85,9.4,0.307499,1.0
1,CMA-ES,5.164211,56.685908,9410.5,123.6,5.15,0.460372,0.0
2,Nelder-Mead,26.004887,56.685908,194.2,0.1,10.05,2.588688,1.0
3,NSGAII,10.189884,56.685908,50000.0,103965.85,3.4,0.479072,1.0
4,Powell,26.004887,56.685908,194.2,1.25,10.05,2.588688,1.0
5,SMSEMOA,12.941743,56.685908,50000.0,34950.85,4.6,0.528263,1.0


## Comparing different optimization parameters for a given strategy

In [87]:
import os
import pandas as pd

def analyze_optimizer_minimal(common_columns, dataset_folder):

    csv_files = [
        f for f in os.listdir(dataset_folder)
    ]

    results = []

    for csv_file in csv_files:
        path = os.path.join(db_folder, csv_file)
        dataset = pd.read_csv(path)

        # --- Compute errors ---
        avg_error_result = calculateAverageError(deltaE76, dataset, "targetLab", "resultLab")
        avg_error_initial = calculateAverageError(deltaE76, dataset, "targetLab", "initialLab")

        # --- Common statistics ---
        runtime_mean = dataset["runtimeMs"].mean()
        dataset["converged"] = dataset["converged"].astype(str).str.lower() == "true"
        convergence_rate = dataset["converged"].mean()

        # --- Detect optimizer-specific parameters ---
        param_columns = dataset.columns.difference(common_columns)

        optimizer_params = {
            col: dataset[col].iloc[0] for col in param_columns
        }

        # --- Add results ---
        row = {
            "Average Result Error": avg_error_result,
            "Average Initial Error": avg_error_initial,
            "Runtime (ms)": runtime_mean,
            "Convergance Rate": convergence_rate,
        }

        # Add optimizer-specific parameters
        row.update(optimizer_params)

        results.append(row)

    return pd.DataFrame(results)


In [88]:
import os
import pandas as pd

def analyze_optimizer(
    dataset_folder,
    common_columns,
    threshold=0.1,
    similarity_groups=None,
    weight_result_col="resultWeights",
    weight_target_col="targetWeights"
):

    csv_files = [f for f in os.listdir(dataset_folder) if f.endswith(".csv")]
    results = []

    for csv_file in csv_files:
        path = os.path.join(dataset_folder, csv_file)
        dataset = pd.read_csv(path)

        # --- Parse weights ---
        dataset[weight_result_col] = dataset[weight_result_col].str.split(";").apply(lambda x: list(map(float, x)))
        dataset[weight_target_col] = dataset[weight_target_col].str.split(";").apply(lambda x: list(map(float, x)))

        # --- Compute domain-aware or simple weight errors ---
        dataset["NumFalseWeights"] = dataset.apply(
            lambda row: count_false_weights(
                row[weight_result_col], row[weight_target_col], threshold, similarity_groups
            ),
            axis=1
        )

        dataset["RMSEFalseWeights"] = dataset.apply(
            lambda row: rmse_false_weights(
                row[weight_result_col], row[weight_target_col], threshold, similarity_groups
            ),
            axis=1
        )

        # --- ΔE metrics ---
        avg_error_result = calculateAverageError(deltaE76, dataset, "targetLab", "resultLab")
        avg_error_initial = calculateAverageError(deltaE76, dataset, "targetLab", "initialLab")

        # --- Convergence ---
        dataset["converged"] = dataset["converged"].astype(str).str.lower() == "true"
        conv_rate = dataset["converged"].mean()

        # --- Common statistics ---
        runtime_mean = dataset["runtimeMs"].mean()

        # --- Detect optimizer-specific parameters automatically ---
        param_columns = dataset.columns.difference(common_columns)
        optimizer_params = {col: dataset[col].iloc[0] for col in param_columns}

        mean_num_eval = dataset["numberOfEvaluations"].mean()

        # --- Assemble row ---
        row = {
            "File": csv_file,
            "Average Result Error": avg_error_result,
            "Average Initial Error": avg_error_initial,
            "Runtime (ms)": runtime_mean,
            "Convergence Rate": conv_rate,
            "Number of Evaluations": mean_num_eval
        }

        row.update(optimizer_params)
        results.append(row)

    return pd.DataFrame(results)


In [91]:
db_folder = "C:\\Users\\safii\\CI_presentation\\datasets\\"

SIMILAR_GROUPS = {
    "yellow": [0, 2, 6],
    "red":    [1, 4],
    "blue":   [3, 8, 10],
    "earth":  [5],
    "white":  [7],
    "black":  [9],
}

COMMON_COLUMNS = [
    "targetLab",
    "resultLab",
    "initialLab",
    "targetWeights",
    "resultWeights",
    "initialWeights",
    "numberOfEvaluations",
    "mixingErrorName",
    "penalties",
    "normalizer",
    "initialGuessType",
    "runtimeMs",
    "converged",
]

COMPARISON_COLUMNS = [
    'optimizerName',
    'Average Result Error',
    'Average Initial Error',
    'Runtime (ms)',
    'Convergence Rate',
    'NumFalseWeights',
    'RMSEFalseWeights',
    "Number of Evaluations"
]


results = analyze_optimizer(
    dataset_folder=db_folder,
    common_columns=COMMON_COLUMNS,
    threshold=0.1,
    similarity_groups=SIMILAR_GROUPS,
)
results[COMPARISON_COLUMNS]


Unnamed: 0,optimizerName,Average Result Error,Average Initial Error,Runtime (ms),Convergence Rate,NumFalseWeights,RMSEFalseWeights,Number of Evaluations
0,BOBYQA,43.671001,56.685908,16.85,1.0,11,0.314357,87.0
1,CMA-ES,5.164211,56.685908,123.6,0.0,3,0.570986,9410.5
2,Nelder-Mead,26.004887,56.685908,0.1,1.0,10,3.384677,194.2
3,NSGAII,10.189884,56.685908,103965.85,1.0,3,0.874155,50000.0
4,Powell,26.004887,56.685908,1.25,1.0,10,3.384677,194.2
5,SMSEMOA,12.941743,56.685908,34950.85,1.0,3,0.47605,50000.0


## Performance Overview
- CMA-ES: lowest error for reasonable time considering number of evaluations and lowest weight error rate.
- NSGAII: second lowest error but way too slow, number of evaluations possibly wrong
- SMSEMOA: same as NSGAII
- Nelder-Mead	and Powell: super fast but way less accurate
- BOBYQA: worst by performance

## Hyperparameter Optimization for CMA-ES

In [102]:

def find_best_parameters(dir_path: str, common_columns: list[str]):
    results = []

    for file in os.listdir(dir_path):
        if not file.endswith(".csv"):
            continue

        full_path = os.path.join(dir_path, file)
        dataset = pd.read_csv(full_path, sep=",")

        avg_error_result = calculateAverageError(deltaE76, dataset, "targetLab", "resultLab")

        param_columns = dataset.columns.difference(common_columns)
        params = {col: dataset[col].iloc[0] for col in param_columns}

        params["avg_error_result"] = avg_error_result
        params["file"] = file

        results.append(params)

    # Convert list of dicts → DataFrame
    results_df = pd.DataFrame(results)

    # --- Find best parameters ---
    best = results_df.loc[results_df["avg_error_result"].idxmin()]

    return results_df, best

### Nelder Mead (fast : > 22 min)

In [104]:
db_folder = "C:\\Users\\safii\\CI_presentation\\datasets\\CMA-ES-Hyperparameter-Runs\\"

COMMON_COLUMNS = [
    "targetLab",
    "resultLab",
    "initialLab",
    "targetWeights",
    "resultWeights",
    "initialWeights",
    "numberOfEvaluations",
    "mixingErrorName",
    "penalties",
    "normalizer",
    "initialGuessType",
    "runtimeMs",
    "converged",
]

find_best_parameters(db_folder, COMMON_COLUMNS)[1]


checkFeasibleCount                         11
diagonalOnly                               11
optimizerName                          CMA-ES
populationMultiplier                       13
sigma                                0.389002
stopFitness                          0.001234
avg_error_result                     3.982888
file                    run-1765058286487.csv
Name: 138, dtype: object

### CMA-ES (slow : > ?)

Overall best parameters for CMA-ES
- checkFeasibleCount                         ?
- diagonalOnly                               ?
- populationMultiplier                       ?
- sigma                                      ?
- stopFitness                                ?
- avg_error_result                           ?