# Load Results

In [None]:
from zubora_gabora.experiment.experiment_loader import ExperimentLoader
from scipy.stats import wilcoxon

# Read the stored results of each algorithm from its experiment directory
ga_loader = ExperimentLoader("experiments/ga/")
ga_experiments = ga_loader.experiment_data
aco_loader = ExperimentLoader("experiments/aco/")
aco_experiments = aco_loader.experiment_data

# Count the distinct experiment ids present in the GA results; the later
# cells loop over ids 1..num_experiments.
distinct_experiment_ids = ga_experiments["experiment_id"].unique()
num_experiments = len(distinct_experiment_ids)


# Show boxplots of results

In [None]:
# Generate boxplots of algorithms
import matplotlib.pyplot as plt
import pandas as pd  # required by pd.concat below
import seaborn as sns

sns.set(style="whitegrid")

# Tag every row with its algorithm so the two result sets stay
# distinguishable once they are merged into a single frame.
ga_experiments["algorithm"] = "GA"
aco_experiments["algorithm"] = "ACO"

# Concatenate the data and give the plotted columns explicit dtypes
experiments = pd.concat([ga_experiments, aco_experiments], ignore_index=True)
experiments = experiments.astype(
    {
        "experiment_id": int,
        "n_evaluations": int,
        "fitness": float,
        "algorithm": str,
    }
)

# For each dataset draw two figures: fitness and number of evaluations
for experiment_id in range(1, num_experiments + 1):
    # Select the rows of this dataset once and reuse them for both plots
    dataset_results = experiments[experiments["experiment_id"] == experiment_id]

    # Plot the fitness boxplot for this dataset
    plt.figure(figsize=(10, 6))
    sns.boxplot(x="algorithm", y="fitness", data=dataset_results)
    plt.title(f"Fitness for dataset {experiment_id}")
    plt.show()

    # Plot the number of evaluations boxplot for this dataset
    plt.figure(figsize=(10, 6))
    sns.boxplot(x="algorithm", y="n_evaluations", data=dataset_results)
    plt.title(f"Number of evaluations for dataset {experiment_id}")
    plt.show()



# Compare each dataset using Wilcoxon signed-rank test

In [None]:
def is_better(a, b):
    """Return True when ``a`` is strictly lower than ``b``.

    Both quantities compared in this cell (mean fitness and mean number of
    evaluations) are reported as "better" when they are lower.
    """
    return a < b


def _is_significant(first, second, level):
    """Return True when the Wilcoxon signed-rank test rejects at ``level``.

    Args:
        first: pandas Series with the measurements of the first algorithm.
        second: pandas Series with the measurements of the second algorithm;
            values are paired with ``first`` by position.
        level: Significance threshold the p-value is compared against.

    Returns:
        True if the p-value is below ``level``, False otherwise. Samples with
        no differing pair are reported as not significant.
    """
    first_values = first.to_numpy()
    second_values = second.to_numpy()
    # wilcoxon() discards zero differences by default and cannot run when no
    # pair differs (older SciPy raises ValueError). Such samples are plainly
    # not significantly different, so answer without calling the test.
    same_shape = first_values.shape == second_values.shape
    if same_shape and not (first_values != second_values).any():
        return False
    return wilcoxon(first_values, second_values).pvalue < level


alpha = 0.05

for i in range(1, num_experiments + 1):
    # Extract those rows where column experiment_id equals i
    ga_experiment = ga_experiments[ga_experiments["experiment_id"] == i]
    aco_experiment = aco_experiments[aco_experiments["experiment_id"] == i]

    # --- Quality: compare the fitness values ---
    ga_fitness = ga_experiment["fitness"]
    aco_fitness = aco_experiment["fitness"]

    print(f"Experiment {i}")
    print(f"GA Fitness: {ga_fitness.mean()} +/- {ga_fitness.std()}")
    print(f"ACO Fitness: {aco_fitness.mean()} +/- {aco_fitness.std()}")

    if is_better(ga_fitness.mean(), aco_fitness.mean()):
        print("GA is better than ACO in Quality")
    else:
        print("ACO is better than GA in Quality")

    if _is_significant(ga_fitness, aco_fitness, alpha):
        print("The difference is statistically significant")
    else:
        print("The difference is not statistically significant")

    # --- Speed: compare the number of evaluations ---
    ga_evaluations = ga_experiment["n_evaluations"]
    aco_evaluations = aco_experiment["n_evaluations"]

    print(f"GA Evaluations: {ga_evaluations.mean()} +/- {ga_evaluations.std()}")
    print(f"ACO Evaluations: {aco_evaluations.mean()} +/- {aco_evaluations.std()}")

    if is_better(ga_evaluations.mean(), aco_evaluations.mean()):
        print("GA is better than ACO in Speed")
    else:
        print("ACO is better than GA in Speed")

    if _is_significant(ga_evaluations, aco_evaluations, alpha):
        print("The difference is statistically significant")
    else:
        print("The difference is NOT statistically significant")
    print("------------------------------------------------")

# Compare all datasets at once using the Friedman aligned-ranks test (with Shaffer post-hoc)

In [None]:
import numpy as np
from stac.nonparametric_tests import friedman_aligned_ranks_test, shaffer_multitest

def is_better(a, b):
    """Return True when ``a`` is strictly lower than ``b``.

    Used below on the rankings returned by the Friedman aligned-ranks test;
    the algorithm with the lower ranking is reported as the better one.
    """
    return a < b


alpha = 0.05

names = ["GA", "ACO"]
names_pos = dict(zip(names, range(len(names))))


def _report_friedman(samples, criterion):
    """Run the Friedman aligned-ranks test and print the outcome.

    Args:
        samples: One sequence of measurements per algorithm, given in the
            same order as ``names``.
        criterion: Label inserted into the printed messages.
    """
    # Convert to plain float arrays so that element access is positional;
    # a pandas Series would be indexed by label instead.
    # NOTE(review): measurements are paired by row position, which assumes
    # both result tables list their runs in the same order - confirm.
    columns = [np.asarray(sample, dtype=float) for sample in samples]
    _, p_value, rankings, pivots = friedman_aligned_ranks_test(*columns)

    if not p_value < alpha:
        print(f"There is no difference in terms of {criterion}")
        return

    # The global test rejected: run the pairwise post-hoc comparisons
    comp, _, _, adpval = shaffer_multitest(dict(zip(names, pivots)))

    for comparison, apv in zip(comp, adpval):
        # Parse the pair's names up front: both branches below print them
        name_l, name_r = (part.strip() for part in comparison.split("vs"))

        if not apv < alpha:
            print(f"There is no difference in terms of {criterion} between {name_l} and {name_r}")
        elif is_better(rankings[names_pos[name_l]], rankings[names_pos[name_r]]):
            print(f"{name_l} is better than {name_r} in terms of {criterion}")
        else:
            print(f"{name_r} is better than {name_l} in terms of {criterion}")


# Comparing quality
# Read the complete result tables (every dataset) instead of the per-dataset
# slice left over from the loop in the previous cell.
ga_fitness = ga_experiments["fitness"]
aco_fitness = aco_experiments["fitness"]
_report_friedman([ga_fitness, aco_fitness], "Quality")


# Comparing speed
ga_evaluations = ga_experiments["n_evaluations"]
aco_evaluations = aco_experiments["n_evaluations"]
_report_friedman([ga_evaluations, aco_evaluations], "Speed")
