# Analyzing Benchmark Results for Validation

In [1]:
import sys
# Add the parent directory to the module search path so that the nucml
# utilities imported in the following cell can be found.
sys.path.append("..")

In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os

# Notebook-wide pandas display limits (columns / rows shown before truncation).
display_limits = {'display.max_columns': 500, 'display.max_rows': 50}
for option_name, limit in display_limits.items():
    pd.set_option(option_name, limit)

# Turn off the chained-assignment warning (the pandas default is 'warn').
pd.options.mode.chained_assignment = None

# Plain white background for seaborn figures.
sns.set_style("white")

import nucml.exfor.data_utilities as exfor_utils
import nucml.evaluation.data_utilities as endf_utils
import nucml.plot.utilities as plot_utils
import nucml.datasets as nuc_data
import nucml.ace.data_utilities as ace_utils
import nucml.model.building_utils as model_building
import nucml.model.utilities as model_utils
import nucml.general_utilities as gen_utils
import nucml.ace.plot as ace_plots

In [3]:
import importlib
# Reload every nucml helper module that was imported above, in the same
# order as before, then report completion.
modules_to_refresh = (
    exfor_utils,
    gen_utils,
    endf_utils,
    plot_utils,
    nuc_data,
    ace_utils,
    model_building,
    model_utils,
    ace_plots,
)
for module in modules_to_refresh:
    importlib.reload(module)
print("Finish re-loading scripts.")

Finish re-loading scripts.


In [4]:
# Directory passed as `saving_dir` to the plotting helpers. In the visible part
# of this notebook it is only referenced by calls that are commented out.
figure_dir = "figures/B0/"

In [5]:
# Scale seaborn's font sizes by 2.5 for all subsequent plots.
sns.set(font_scale=2.5)
# Re-apply the white style after sns.set -- presumably because sns.set resets
# the style to seaborn's default; confirm against the seaborn documentation.
sns.set_style('white')

## Gathering Results from Benchmark Calculations

In [6]:
# Collect the benchmark results found under each KNN run directory.
# Only the B0 and B1 runs are gathered; the B2-B4 directories
# ("ml/KNN_B2/" ... "ml/KNN_B4/") are currently not loaded.
model_results_b0, model_results_b1 = (
    ace_utils.gather_benchmark_results(run_dir)
    for run_dir in ("ml/KNN_B0/", "ml/KNN_B1/")
)

## Analyzing KNN Results

In [7]:
# Read the KNN result tables for the B0 and B1 datasets.
# The B2-B4 files (knn_results_B2.csv ... knn_results_B4.csv) are not loaded.
results_b0, results_b1 = map(
    pd.read_csv,
    (
        "../ML_EXFOR_neutrons/1_KNN/knn_results_B0.csv",
        "../ML_EXFOR_neutrons/1_KNN/knn_results_B1.csv",
    ),
)

In [8]:
# results_b0["scale_energy"] = results_b0.run_name.apply(lambda x: True if "v2" in x else False)
# results_b0 = results_b0[results_b0.normalizer == "minmax"]
# results_b0 = results_b0[results_b0.scale_energy == True]
# results_b0 = results_b0[results_b0.distance_metric == 'manhattan']

In [9]:
# Label every row with the name of the directory that contains its model
# file, then mark the whole table as belonging to dataset 'b0'.
model_dirs_b0 = results_b0['model_path'].map(os.path.dirname)
results_b0['Model'] = model_dirs_b0.map(os.path.basename)
results_b0['dataset'] = 'b0'

In [10]:
# Label every row with the name of the directory that contains its model
# file, then mark the whole table as belonging to dataset 'b1'.
model_dirs_b1 = results_b1['model_path'].map(os.path.dirname)
results_b1['Model'] = model_dirs_b1.map(os.path.basename)
results_b1['dataset'] = 'b1'

In [11]:
# for df, dataset_tag in zip([results_b0, results_b1, results_b2, results_b3, results_b4], ["b0", "b1", "b2", "b3", "b4"]):
#     df['Model'] = df.model_path.apply(lambda x: os.path.basename(os.path.dirname(x)))
#     df['dataset'] = dataset_tag

In [12]:
# Reduce both result tables to the model label, the three MAE metrics and
# the dataset tag. (The B2-B4 tables are not loaded, so they are not trimmed.)
metric_columns = ["Model", "train_mae", "val_mae", "test_mae", "dataset"]
results_b0 = results_b0.loc[:, metric_columns]
results_b1 = results_b1.loc[:, metric_columns]

In [13]:
# Join the benchmark results with the training metrics of the same model,
# matching rows on the shared "Model" column.
# (The B2-B4 datasets are not loaded, so there is nothing to merge for them.)
final_b0 = pd.merge(model_results_b0, results_b0, on="Model")
final_b1 = pd.merge(model_results_b1, results_b1, on="Model")

In [14]:
# Stack the B0 and B1 tables into a single frame.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0. As with append's defaults, each frame keeps its own
# row index (no ignore_index) and columns are not sorted.
final_set = pd.concat([final_b0, final_b1])

# To include further datasets, extend the list:
# final_set = pd.concat([final_b0, final_b1, final_b2, final_b3, final_b4])

# final_set["k"] = final_set.Model.apply(lambda x: x.split("_")[0][1:]).astype(int)

In [15]:
# final_b1["k"] = final_b1.Model.apply(lambda x: x.split("_")[0][1:]).astype(int)

# Parse k out of the model name: take the text before the first "_" and drop
# its leading character (e.g. "k20_distance_..." -> 20).
model_prefix = final_set["Model"].str.split("_").str[0]
final_set["k"] = model_prefix.str[1:].astype(int)


In [16]:
# u233_002_001 = final_set[final_set.Benchmark == "U233_MET_FAST_002_001"].sort_values(by="Deviation_Ana")
# u233_002_002 = final_set[final_set.Benchmark == "U233_MET_FAST_002_002"].sort_values(by="Deviation_Ana")
# u233_001 = final_set[final_set.Benchmark == "U233_MET_FAST_001"].sort_values(by="Deviation_Ana")

In [17]:
# u233_002_001 = final_b1[final_b1.Benchmark == "U233_MET_FAST_002_001"].sort_values(by="Deviation_Ana")[['k', 'K_eff_ana', 'Unc_ana', 'Deviation_Ana', 'train_mae', 'val_mae', 'test_mae']]
# u233_002_002 = final_b1[final_b1.Benchmark == "U233_MET_FAST_002_002"].sort_values(by="Deviation_Ana")[['k', 'K_eff_ana', 'Unc_ana', 'Deviation_Ana', 'train_mae', 'val_mae', 'test_mae']]
# u233_001 = final_b1[final_b1.Benchmark == "U233_MET_FAST_001"].sort_values(by="Deviation_Ana")[['k', 'K_eff_ana', 'Unc_ana', 'Deviation_Ana', 'train_mae', 'val_mae', 'test_mae']]

In [18]:
# Columns kept in each per-benchmark report table.
report_columns = ['Model', 'K_eff_ana', 'Unc_ana', 'Deviation_Ana', 'train_mae', 'val_mae', 'test_mae']


def select_benchmark(benchmark_name):
    """Return the rows of final_set for one benchmark, sorted by ascending Deviation_Ana."""
    benchmark_rows = final_set[final_set.Benchmark == benchmark_name]
    return benchmark_rows.sort_values(by="Deviation_Ana")[report_columns]


u233_002_001 = select_benchmark("U233_MET_FAST_002_001")
u233_002_002 = select_benchmark("U233_MET_FAST_002_002")
u233_001 = select_benchmark("U233_MET_FAST_001")

In [19]:
# Scale Deviation_Ana by 100 (fraction -> percent) in each benchmark table.
# The column is overwritten in place, so running this cell a second time
# multiplies the values by 100 again.
for benchmark_table in (u233_001, u233_002_001, u233_002_002):
    benchmark_table["Deviation_Ana"] = benchmark_table["Deviation_Ana"] * 100

# U233_001

In [41]:
# Print the models selected by get_best_models_df for U233_MET_FAST_001 as a
# LaTeX table, omitting the index column.
best_u233_001 = model_utils.get_best_models_df(u233_001)
print(best_u233_001.to_latex(index=False))

\begin{tabular}{lrrrrrrl}
\toprule
                                      Model &  K\_eff\_ana &  Unc\_ana &  Deviation\_Ana &  train\_mae &  val\_mae &  test\_mae &   tag \\
\midrule
k20\_distance\_manhattan\_minmax\_one\_hot\_B1\_v1 &   0.997505 &  0.00043 &         0.2495 &   0.025814 & 0.121110 &  0.120706 & Train \\
 k9\_distance\_manhattan\_minmax\_one\_hot\_B1\_v1 &   1.005510 &  0.00042 &         0.5510 &   0.025921 & 0.119010 &  0.118578 &   Val \\
 k8\_distance\_manhattan\_minmax\_one\_hot\_B1\_v1 &   1.006040 &  0.00044 &         0.6040 &   0.025952 & 0.119074 &  0.118461 &  Test \\
\bottomrule
\end{tabular}



In [42]:
# u233_001 is sorted by ascending Deviation_Ana, so its first row is the model
# with the smallest deviation; print that row as a LaTeX table.
closest_u233_001 = u233_001.iloc[:1]
print(closest_u233_001.to_latex(index=False))

\begin{tabular}{lrrrrrr}
\toprule
                                      Model &  K\_eff\_ana &  Unc\_ana &  Deviation\_Ana &  train\_mae &  val\_mae &  test\_mae \\
\midrule
k18\_distance\_manhattan\_minmax\_one\_hot\_B1\_v1 &    0.99989 &  0.00045 &          0.011 &   0.025818 & 0.120711 &  0.120294 \\
\bottomrule
\end{tabular}



# U233_002_001

In [67]:
# Print the models selected by get_best_models_df for U233_MET_FAST_002_001 as
# a LaTeX table, omitting the index column.
best_u233_002_001 = model_utils.get_best_models_df(u233_002_001)
print(best_u233_002_001.to_latex(index=False))

\begin{tabular}{rrrrrrrl}
\toprule
 k &  K\_eff\_ana &  Unc\_ana &  Deviation\_Ana &  train\_mae &  val\_mae &  test\_mae &   tag \\
\midrule
20 &   0.998708 &  0.00044 &         0.1292 &   0.025814 & 0.121110 &  0.120706 & Train \\
 9 &   1.006410 &  0.00043 &         0.6410 &   0.025921 & 0.119010 &  0.118578 &   Val \\
 8 &   1.007830 &  0.00042 &         0.7830 &   0.025952 & 0.119074 &  0.118461 &  Test \\
\bottomrule
\end{tabular}



In [68]:
# u233_002_001 is sorted by ascending Deviation_Ana, so its first row is the
# model with the smallest deviation; print that row as a LaTeX table.
closest_u233_002_001 = u233_002_001.iloc[:1]
print(closest_u233_002_001.to_latex(index=False))

\begin{tabular}{rrrrrrr}
\toprule
 k &  K\_eff\_ana &  Unc\_ana &  Deviation\_Ana &  train\_mae &  val\_mae &  test\_mae \\
\midrule
17 &    1.00021 &  0.00044 &          0.021 &   0.025822 & 0.120527 &  0.120098 \\
\bottomrule
\end{tabular}



# U233_002_002

In [69]:
# Print the models selected by get_best_models_df for U233_MET_FAST_002_002 as
# a LaTeX table, omitting the index column.
best_u233_002_002 = model_utils.get_best_models_df(u233_002_002)
print(best_u233_002_002.to_latex(index=False))

\begin{tabular}{rrrrrrrl}
\toprule
 k &  K\_eff\_ana &  Unc\_ana &  Deviation\_Ana &  train\_mae &  val\_mae &  test\_mae &   tag \\
\midrule
20 &    1.00154 &  0.00044 &          0.154 &   0.025814 & 0.121110 &  0.120706 & Train \\
 9 &    1.00874 &  0.00045 &          0.874 &   0.025921 & 0.119010 &  0.118578 &   Val \\
 8 &    1.00996 &  0.00044 &          0.996 &   0.025952 & 0.119074 &  0.118461 &  Test \\
\bottomrule
\end{tabular}



In [70]:
# u233_002_002 is sorted by ascending Deviation_Ana, so its first row is the
# model with the smallest deviation; print that row as a LaTeX table.
closest_u233_002_002 = u233_002_002.iloc[:1]
print(closest_u233_002_002.to_latex(index=False))

\begin{tabular}{rrrrrrr}
\toprule
 k &  K\_eff\_ana &  Unc\_ana &  Deviation\_Ana &  train\_mae &  val\_mae &  test\_mae \\
\midrule
20 &    1.00154 &  0.00044 &          0.154 &   0.025814 &  0.12111 &  0.120706 \\
\bottomrule
\end{tabular}



# Best Models Overall

In [73]:
import numpy as np

In [74]:
# Arithmetic mean of three hand-entered values (displayed as the cell output).
np.mean([0.998075, 1.00086, 1.000200])

0.9997116666666667

In [20]:
# Threshold (in percent) applied further down to the "Error" column of
# model_mean; only rows strictly below it are kept.
# NOTE(review): 0.0288333 appears to equal |1 - 0.99971167| * 100, i.e. the
# percent deviation of the mean computed in the previous cell -- confirm.
ERROR = 0.0288333

In [21]:
# Average every numeric column per model over the rows of final_b1.
# numeric_only=True skips the text columns (e.g. 'dataset'): since pandas 2.0
# the default is numeric_only=False, which raises a TypeError when asked to
# average string columns.
model_mean = final_b1.groupby("Model").mean(numeric_only=True)

In [93]:
# Narrow the per-model averages down to the K_eff_ana column (kept as a
# one-column DataFrame rather than a Series).
model_mean = model_mean.loc[:, ['K_eff_ana']]

In [22]:
# Percent deviation of the mean K_eff_ana from 1: |K_eff_ana - 1| / 1 * 100.
keff_offset = (model_mean["K_eff_ana"] - 1).abs()
model_mean["Error"] = (keff_offset / 1) * 100

In [23]:
# Move the index back into a regular column and renumber the rows.
# Re-running this cell on its own adds another index column each time.
model_mean = model_mean.reset_index(drop=False)

In [24]:
# Show the five rows with the smallest Error.
ranked_by_error = model_mean.sort_values(by="Error")
ranked_by_error.head(5)

Unnamed: 0,Model,K_eff_ana,Unc_ana,K_eff_imp,Unc_imp,Deviation_Ana,Deviation_Imp,train_mae,val_mae,test_mae,Error
9,k19_distance_manhattan_minmax_one_hot_B1_v1,0.99991,0.00044,0.999908,0.000303,0.001956,0.001838,0.025817,0.120899,0.120494,0.008967
10,k1_distance_manhattan_minmax_one_hot_B1_v1,0.999706,0.000427,0.999953,0.000287,0.00154,0.001734,0.029016,0.140824,0.141952,0.029367
11,k20_distance_manhattan_minmax_one_hot_B1_v1,0.999251,0.000437,0.999333,0.0003,0.001776,0.0017,0.025814,0.12111,0.120706,0.0749
8,k18_distance_manhattan_minmax_one_hot_B1_v1,1.000766,0.000437,1.000475,0.0003,0.001081,0.001298,0.025818,0.120711,0.120294,0.0766
7,k17_distance_manhattan_minmax_one_hot_B1_v1,1.000776,0.00044,1.00103,0.000293,0.001004,0.00125,0.025822,0.120527,0.120098,0.077567


In [90]:
# Keep only the models whose Error is strictly below the ERROR threshold.
within_threshold = model_mean["Error"] < ERROR
model_mean = model_mean[within_threshold]

In [91]:
# Print the remaining models, ordered by increasing Error, as a LaTeX table
# without the index column.
ranked_models = model_mean.sort_values(by="Error")
print(ranked_models.to_latex(index=False))

\begin{tabular}{lrr}
\toprule
                                      Model &  K\_eff\_ana &    Error \\
\midrule
k19\_distance\_manhattan\_minmax\_one\_hot\_B1\_v1 &    0.99991 & 0.008967 \\
\bottomrule
\end{tabular}



In [1]:
# ace_plots.knn_dual_plot(
#     knn_robust_euclidean, "k", "train_mae", "val_mae", "Deviation_Ana", save=False, saving_dir=figure_dir)

# # ace_plot_utils.knn_dual_plot(
# #     knn_robust_euclidean, "k", "train_mae", "val_mae", "K_eff_ana", save=False, saving_dir=figure_dir)

# ace_plots.knn_keff_plot(
#     knn_robust_euclidean, "k", "val_mae", "Deviation_Ana", save=True, saving_dir=figure_dir)

In [107]:
# not_to_delete = knn_final.sort_values(by="Deviation_Ana").head()
# not_to_delete = not_to_delete.append(model_utils.get_best_models_df(knn_robust_euclidean))

# to_delete = []

# for i in knn_final.model_path.values:
#     if i not in not_to_delete.model_path.values:
#         to_delete.append(os.path.dirname(i))

# import shutil

# for i in to_delete:
#     shutil.rmtree(i) 