

# Evaluation of Logs
Tensorboard logs play a huge role in evaluating our model performance
This script focuses on getting the necessary information from a log directory to have educated information on the model performance.

The first cell (directly below) focuses on deriving the best F1-Score and Recall model
The second cell focuses on deriving from a directory of several folds (considering this project uses five-fold cross validation) the average across all folds of the necessary metrics

In [2]:
import os
import numpy as np
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator

log_dir = r"D:\tensorboard_logs_fold_1"
#log_dir = r"C:\Users\Admin\Documents\MasterThesis\results\ResNet50_BCos_Heavy\seed_0\tensorboard_logs_fold_1"

event_acc = EventAccumulator(log_dir)
event_acc.Reload()

# List all available scalars
available_tags = event_acc.Tags()["scalars"]
print("Available scalars:", available_tags)

# Extract scalar values per epoch
precision_values = event_acc.Scalars("Metrics/Precision")
accuracy_values = event_acc.Scalars("Accuracy/Validation")
f1_values = event_acc.Scalars("Metrics/F1")
recall_values = event_acc.Scalars("Metrics/Recall")
auc_values = event_acc.Scalars("Metrics/AUC")


# Get epoch-wise values
epochs = [x.step for x in precision_values]
accuracy_scores = [x.value for x in accuracy_values]
f1_scores = [x.value for x in f1_values]
precision_scores = [x.value for x in precision_values]  # Convert to list
recall_scores = [x.value for x in recall_values]  # Convert to list

# Compute mean, min, max precision
mean_precision = np.mean(precision_scores)
minimum_precision = np.min(precision_scores)
maximum_precision = np.max(precision_scores)

print(f"Mean Average Precision (mAP): {mean_precision}")
print(f"Min Precision: {minimum_precision}")
print(f"Max Precision: {maximum_precision}")

print()

mean_recall = np.mean(recall_scores)
minimum_recall = np.min(recall_scores)
maximum_recall = np.max(recall_scores)

print(f"Mean Average Recall (mAR): {mean_recall}")
print(f"Min Recall: {minimum_recall}")
print(f"Max Recall: {maximum_recall}")


# Compute mean, min, max accuracy
mean_accuracy = np.mean(accuracy_scores)
minimum_accuracy = np.min(accuracy_scores)
maximum_accuracy = np.max(accuracy_scores)

print()
print(f"Mean Accuracy: {mean_accuracy}")
print(f"Min Accuracy: {minimum_accuracy}")
print(f"Max Accuracy: {maximum_accuracy}")


best_accuracy_idx = np.argmax(accuracy_scores)
best_acc_epoch = epochs[best_accuracy_idx]
best_acc_f1 = f1_scores[best_accuracy_idx]
best_acc_precision = precision_values[best_accuracy_idx].value
best_acc_recall = recall_values[best_accuracy_idx].value  
best_acc_accuracy = accuracy_values[best_accuracy_idx].value
best_acc_auc = auc_values[best_accuracy_idx].value

print()
print(f"Corresponding Accuracy: {best_acc_accuracy} at Epoch {best_accuracy_idx}")
print(f"Corresponding Precision: {best_acc_precision}")
print(f"Highest Recall: {best_acc_recall}")
print(f"Corresponding F1 Score: {best_acc_f1}")
print(f"Corresponding AUC: {best_acc_auc}")
print()

# Find best epoch based on highest F1 score
best_f1_idx = np.argmax(f1_scores)
best_f1_epoch = epochs[best_f1_idx]
best_f1 = f1_scores[best_f1_idx]
best_f1_precision = precision_values[best_f1_idx].value
best_f1_recall = recall_values[best_f1_idx].value  
best_f1_accuracy = accuracy_values[best_f1_idx].value
best_f1_auc = auc_values[best_f1_idx].value


best_f1_precision = round(best_f1_precision, 4)
best_f1_recall = round(best_f1_recall, 4)
best_f1_accuracy = round(best_f1_accuracy, 4)
best_f1_auc = round(best_f1_auc, 4)
best_f1_f1 = round(best_f1, 4)

print()
print(f"Corresponding Accuracy: {best_f1_accuracy}")
print(f"Corresponding Precision: {best_f1_precision}")
print(f"Corresponding Recall: {best_f1_recall}")
print(f"Highest F1 Score: {best_f1_f1} at Epoch {best_f1_epoch}")
print(f"Corresponding AUC: {best_f1_auc}")

best_recall_idx = np.argmax(recall_scores)
best_recall_epoch = epochs[best_recall_idx]
best_recall_f1 = f1_scores[best_recall_idx]
best_recall_precision = precision_values[best_recall_idx].value
best_recall_recall = recall_values[best_recall_idx].value  
best_recall_accuracy = accuracy_values[best_recall_idx].value
best_recall_auc = auc_values[best_recall_idx].value


best_recall_precision = round(best_recall_precision, 4)
best_recall_recall = round(best_recall_recall, 4)
best_recall_accuracy = round(best_recall_accuracy, 4)
best_recall_auc = round(best_recall_auc, 4)
best_recall_f1 = round(best_recall_f1, 4)


print()
print()
print(f"Corresponding Accuracy: {best_recall_accuracy}")
print(f"Corresponding Precision: {best_recall_precision}")
print(f"Highest Recall: {best_recall_recall} at Epoch {best_recall_epoch}")
print(f"Corresponding F1 Score: {best_recall_f1}")
print(f"Corresponding AUC: {best_recall_auc}")

Available scalars: ['Loss/Train', 'Accuracy/Train', 'Loss/Validation', 'Accuracy/Validation', 'Metrics/Precision', 'Metrics/Recall', 'Metrics/F1', 'Metrics/AUC', 'Learning_Rate']
Mean Average Precision (mAP): 0.3408434331417084
Min Precision: 0.29667606949806213
Max Precision: 0.42765071988105774

Mean Average Recall (mAR): 0.43743633925914766
Min Recall: 0.3488323390483856
Max Recall: 0.48400864005088806

Mean Accuracy: 0.8584404706954956
Min Accuracy: 0.8102619051933289
Max Accuracy: 0.881428599357605

Corresponding Accuracy: 0.881428599357605 at Epoch 2
Corresponding Precision: 0.42765071988105774
Highest Recall: 0.43087950348854065
Corresponding F1 Score: 0.36959731578826904
Corresponding AUC: 0.7989233732223511


Corresponding Accuracy: 0.8814
Corresponding Precision: 0.4277
Corresponding Recall: 0.4309
Highest F1 Score: 0.3696 at Epoch 3
Corresponding AUC: 0.7989


Corresponding Accuracy: 0.8103
Corresponding Precision: 0.3153
Highest Recall: 0.484 at Epoch 5
Corresponding F1 Sco

In [None]:
import os
import numpy as np
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator

log_base_dir = r"C:\Users\Admin\Documents\MasterThesis\results\VinBigData\ResNet_Baseline\light_oversamp\seed_0\tensorboard_logs_fold_"

num_folds = 5  # Number of folds

# Lists to store best values for each fold
best_f1_f1, best_f1_precisions, best_f1_recalls, best_f1_accuracies, best_f1_aucs = [], [], [], [], []
best_recall_f1, best_recall_precisions, best_recall_recalls, best_recall_accuracies, best_recall_aucs = [], [], [], [], []

for fold in range(1, num_folds + 1):
    log_dir = f"{log_base_dir}{fold}"
    import os
    if not os.path.exists(log_dir):
        print(f"CAREFUL {log_dir} does not exist")
        continue
    
    event_acc = EventAccumulator(log_dir)
    event_acc.Reload()
    
    precision_values = event_acc.Scalars("Metrics/Precision")
    accuracy_values = event_acc.Scalars("Accuracy/Validation")
    f1_values = event_acc.Scalars("Metrics/F1")
    recall_values = event_acc.Scalars("Metrics/Recall")
    auc_values = event_acc.Scalars("Metrics/AUC")
    
    epochs = [x.step for x in precision_values]
    accuracy_scores = [x.value for x in accuracy_values]
    f1_scores = [x.value for x in f1_values]
    precision_scores = [x.value for x in precision_values]
    recall_scores = [x.value for x in recall_values]
    auc_scores = [x.value for x in auc_values]
    
    # Best F1 Score Model
    best_f1_idx = np.argmax(f1_scores)
    best_f1_f1.append(f1_scores[best_f1_idx])
    best_f1_precisions.append(precision_scores[best_f1_idx])
    best_f1_recalls.append(recall_scores[best_f1_idx])
    best_f1_accuracies.append(accuracy_scores[best_f1_idx])
    best_f1_aucs.append(auc_scores[best_f1_idx])
    
    # Best Recall Score Model
    best_recall_idx = np.argmax(recall_scores)
    best_recall_f1.append(f1_scores[best_recall_idx])
    best_recall_precisions.append(precision_scores[best_recall_idx])
    best_recall_recalls.append(recall_scores[best_recall_idx])
    best_recall_accuracies.append(accuracy_scores[best_recall_idx])
    best_recall_aucs.append(auc_scores[best_recall_idx])

# Compute mean values across all folds
def compute_mean(lst):
    return round(np.mean(lst), 4) if lst else None

print("\n======= AVERAGE METRICS OVER ALL FOLDS =======")
print("\nBest F1 Model:")
print(f"Mean Accuracy: {compute_mean(best_f1_accuracies)}")
print(f"Mean Precision: {compute_mean(best_f1_precisions)}")
print(f"Mean Recall: {compute_mean(best_f1_recalls)}")
print(f"Mean F1 Score: {compute_mean(best_f1_f1)}")
print(f"Mean AUC: {compute_mean(best_f1_aucs)}")

print("\nBest Recall Model:")
print(f"Mean Accuracy: {compute_mean(best_recall_accuracies)}")
print(f"Mean Precision: {compute_mean(best_recall_precisions)}")
print(f"Mean Recall: {compute_mean(best_recall_recalls)}")
print(f"Mean F1 Score: {compute_mean(best_recall_f1)}")
print(f"Mean AUC: {compute_mean(best_recall_aucs)}")





Best F1 Model:
Mean Accuracy: 0.9505
Mean Precision: 0.5637
Mean Recall: 0.4036
Mean F1 Score: 0.4475
Mean AUC: 0.934

Best Recall Model:
Mean Accuracy: 0.9499
Mean Precision: 0.5329
Mean Recall: 0.4078
Mean F1 Score: 0.4413
Mean AUC: 0.934
