# Evaluation of Logs
TensorBoard logs play a major role in evaluating our model performance.
This notebook extracts the necessary metrics from a log directory so that the model performance can be judged on solid evidence.

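The first cell (directly below) reads the logs of a single fold and reports the metrics of the models with the best accuracy, F1-score and recall.
The second cell takes a directory of several folds (this project uses five-fold cross-validation) and averages the necessary metrics across all folds.
The third cell repeats the fold average for several seeds and reports the mean ± standard deviation across seeds for the best-F1 models.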

In [None]:
import os
import numpy as np
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator
# Single-fold report: read one TensorBoard log directory, print summary
# statistics, then print every metric at the best-accuracy, best-F1 and
# best-recall epochs.
log_dir = r"C:\Users\Admin\Documents\MasterThesis\results\Pneumonia\ResNet50_FLC_15\no_nosamp\seed_0\tensorboard_logs_fold_3"
# Alternative run; uncomment to inspect it instead of the one above.
#log_dir = r"C:\Users\Admin\Documents\MasterThesis\results\Pneumonia\ResNet50_BCos\light_oversamp\seed_0\tensorboard_logs_fold_1"

event_acc = EventAccumulator(log_dir)
event_acc.Reload()

# List all available scalars
available_tags = event_acc.Tags()["scalars"]
print("Available scalars:", available_tags)

# Logged events per metric; each event exposes `.step` and `.value`.
precision_values = event_acc.Scalars("Metrics/Precision")
accuracy_values = event_acc.Scalars("Accuracy/Validation")
f1_values = event_acc.Scalars("Metrics/F1")
recall_values = event_acc.Scalars("Metrics/Recall")
auc_values = event_acc.Scalars("Metrics/AUC")

# Epoch-wise values.
# NOTE(review): the lists below are addressed with one shared position, which
# assumes every tag was logged the same number of times in the same order
# -- TODO confirm against the training loop.
epochs = [x.step for x in precision_values]
accuracy_scores = [x.value for x in accuracy_values]
f1_scores = [x.value for x in f1_values]
precision_scores = [x.value for x in precision_values]
recall_scores = [x.value for x in recall_values]
auc_scores = [x.value for x in auc_values]

# Mean, min and max of the precision over all logged epochs
mean_precision = np.mean(precision_scores)
minimum_precision = np.min(precision_scores)
maximum_precision = np.max(precision_scores)

print(f"Mean Average Precision (mAP): {mean_precision}")
print(f"Min Precision: {minimum_precision}")
print(f"Max Precision: {maximum_precision}")

print()

# Mean, min and max of the recall over all logged epochs
mean_recall = np.mean(recall_scores)
minimum_recall = np.min(recall_scores)
maximum_recall = np.max(recall_scores)

print(f"Mean Average Recall (mAR): {mean_recall}")
print(f"Min Recall: {minimum_recall}")
print(f"Max Recall: {maximum_recall}")

# Mean, min and max of the validation accuracy over all logged epochs
mean_accuracy = np.mean(accuracy_scores)
minimum_accuracy = np.min(accuracy_scores)
maximum_accuracy = np.max(accuracy_scores)

print()
print(f"Mean Accuracy: {mean_accuracy}")
print(f"Min Accuracy: {minimum_accuracy}")
print(f"Max Accuracy: {maximum_accuracy}")

# Display name -> per-epoch values, in the order the report prints them.
metric_scores = {
    "Accuracy": accuracy_scores,
    "Precision": precision_scores,
    "Recall": recall_scores,
    "F1 Score": f1_scores,
    "AUC": auc_scores,
}


def _report_best_epoch(criterion, digits=None):
    """Print all metrics at the epoch where ``criterion`` is highest.

    Args:
        criterion: Key of ``metric_scores`` used to select the epoch.
        digits: Number of decimals to round to; ``None`` prints raw values.

    Returns:
        The list position of the selected epoch.
    """
    best_idx = int(np.argmax(metric_scores[criterion]))
    print()
    for name, scores in metric_scores.items():
        value = scores[best_idx]
        if digits is not None:
            value = round(value, digits)
        if name == criterion:
            # Report the logged step, not the list position: the two differ
            # whenever logging does not start at step 0.
            print(f"Highest {name}: {value} at Epoch {epochs[best_idx]}")
        else:
            print(f"Corresponding {name}: {value}")
    return best_idx


# Best epoch by validation accuracy (values printed unrounded)
best_accuracy_idx = _report_best_epoch("Accuracy")
print()

# Best epoch by F1 score
best_f1_idx = _report_best_epoch("F1 Score", digits=4)
print()

# Best epoch by recall
best_recall_idx = _report_best_epoch("Recall", digits=4)

Available scalars: ['Loss/Train', 'Accuracy/Train', 'Loss/Validation', 'Accuracy/Validation', 'Metrics/Precision', 'Metrics/Recall', 'Metrics/F1', 'Metrics/AUC', 'Learning_Rate']
Mean Average Precision (mAP): 0.6085521618525187
Min Precision: 0.5523560047149658
Max Precision: 0.7599309086799622

Mean Average Recall (mAR): 0.5283458044131597
Min Recall: 0.36575227975845337
Max Recall: 0.6616791486740112

Mean Accuracy: 0.814727379878362
Min Accuracy: 0.7970770001411438
Max Accuracy: 0.8369870781898499

Corresponding Accuracy: 0.8369870781898499 at Epoch 5
Corresponding Precision: 0.7205297946929932
Highest Recall: 0.45220282673835754
Corresponding F1 Score: 0.5556690692901611
Corresponding AUC: 0.8616808652877808


Corresponding Accuracy: 0.8078
Corresponding Precision: 0.5625
Corresponding Recall: 0.6617
Highest F1 Score: 0.6081 at Epoch 2
Corresponding AUC: 0.8392


Corresponding Accuracy: 0.8078
Corresponding Precision: 0.5625
Highest Recall: 0.6617 at Epoch 2
Corresponding F1 Score:

In [2]:
import os
import numpy as np
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator
# Cross-fold collection: for every fold, record all metrics at the epoch with
# the best F1 score and at the epoch with the best recall.
# The fold number is appended directly to this prefix.
log_base_dir = r"C:\Users\Admin\Documents\MasterThesis\results\Pneumonia\ResNet50_FLC\no_nosamp\seed_0\tensorboard_logs_fold_"
# log_base_dir = r"C:\Users\Admin\Documents\MasterThesis\results\Pneumonia\ResNet50_BCos\light_oversamp\seed_0\tensorboard_logs_fold_"
# TODO: still need to download the transformer light no-oversampling baseline logs

num_folds = 5  # Number of folds

# Lists to store best values for each fold
best_f1_f1, best_f1_precisions, best_f1_recalls, best_f1_accuracies, best_f1_aucs = [], [], [], [], []
best_recall_f1, best_recall_precisions, best_recall_recalls, best_recall_accuracies, best_recall_aucs = [], [], [], [], []

for fold in range(1, num_folds + 1):
    log_dir = f"{log_base_dir}{fold}"
    # Missing folds are reported and skipped, so the averages computed from
    # these lists may cover fewer than num_folds folds.
    if not os.path.exists(log_dir):
        print(f"CAREFUL {log_dir} does not exist")
        continue

    event_acc = EventAccumulator(log_dir)
    event_acc.Reload()

    precision_values = event_acc.Scalars("Metrics/Precision")
    accuracy_values = event_acc.Scalars("Accuracy/Validation")
    f1_values = event_acc.Scalars("Metrics/F1")
    recall_values = event_acc.Scalars("Metrics/Recall")
    auc_values = event_acc.Scalars("Metrics/AUC")

    # NOTE(review): one shared position is used across all lists below, which
    # assumes every tag was logged equally often and in the same order
    # -- TODO confirm.
    accuracy_scores = [x.value for x in accuracy_values]
    f1_scores = [x.value for x in f1_values]
    precision_scores = [x.value for x in precision_values]
    recall_scores = [x.value for x in recall_values]
    auc_scores = [x.value for x in auc_values]

    # Best F1 Score Model
    best_f1_idx = np.argmax(f1_scores)
    best_f1_f1.append(f1_scores[best_f1_idx])
    best_f1_precisions.append(precision_scores[best_f1_idx])
    best_f1_recalls.append(recall_scores[best_f1_idx])
    best_f1_accuracies.append(accuracy_scores[best_f1_idx])
    best_f1_aucs.append(auc_scores[best_f1_idx])

    # Best Recall Score Model
    best_recall_idx = np.argmax(recall_scores)
    best_recall_f1.append(f1_scores[best_recall_idx])
    best_recall_precisions.append(precision_scores[best_recall_idx])
    best_recall_recalls.append(recall_scores[best_recall_idx])
    best_recall_accuracies.append(accuracy_scores[best_recall_idx])
    best_recall_aucs.append(auc_scores[best_recall_idx])
# Averaging helper for the per-fold result lists
def compute_mean(lst):
    """Return the mean of ``lst`` rounded to four decimals.

    An empty (or otherwise falsy) ``lst`` yields ``None`` instead of a number.
    """
    if not lst:
        return None
    average = np.mean(lst)
    return round(average, 4)

# Report the fold-averaged metrics, one section per model-selection criterion.
print("\n======= AVERAGE METRICS OVER ALL FOLDS =======")

report_sections = (
    ("\nBest F1 Model:", (
        ("Mean Accuracy", best_f1_accuracies),
        ("Mean Precision", best_f1_precisions),
        ("Mean Recall", best_f1_recalls),
        ("Mean F1 Score", best_f1_f1),
        ("Mean AUC", best_f1_aucs),
    )),
    ("\nBest Recall Model:", (
        ("Mean Accuracy", best_recall_accuracies),
        ("Mean Precision", best_recall_precisions),
        ("Mean Recall", best_recall_recalls),
        ("Mean F1 Score", best_recall_f1),
        ("Mean AUC", best_recall_aucs),
    )),
)

for heading, rows in report_sections:
    print(heading)
    for label, per_fold_values in rows:
        print(f"{label}: {compute_mean(per_fold_values)}")





Best F1 Model:
Mean Accuracy: 0.8115
Mean Precision: 0.5755
Mean Recall: 0.6607
Mean F1 Score: 0.6116
Mean AUC: 0.8522

Best Recall Model:
Mean Accuracy: 0.8036
Mean Precision: 0.5547
Mean Recall: 0.6815
Mean F1 Score: 0.6091
Mean AUC: 0.8478


In [4]:
import os
import numpy as np
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator

# Root directory of one experiment; process_seed looks for
# `seed_<seed>/tensorboard_logs_fold_<fold>` sub-directories beneath it.
log_base_dir = r"C:\Users\Admin\Documents\MasterThesis\results\Pneumonia\ResNet50_Baseline\light_oversamp"
seeds = [0, 1]  # Seeds whose runs are processed and aggregated
num_folds = 5  # Folds are numbered 1..num_folds

def process_seed(seed):
    """Gather the best-epoch validation metrics of every fold for one seed.

    For each fold directory found under ``log_base_dir``, all metrics are read
    from the TensorBoard logs and scaled by 100. The values at the epoch with
    the highest F1 are stored under ``'f1'``, and the values at the epoch with
    the highest recall under ``'recall'``. Missing fold directories are
    reported on stdout and skipped.

    Args:
        seed: Seed identifier used to build the ``seed_<seed>`` folder name.

    Returns:
        ``{'f1': {...}, 'recall': {...}}`` where each inner dict maps
        ``'acc'``, ``'prec'``, ``'rec'``, ``'f1'`` and ``'auc'`` to a list
        with one entry per processed fold.
    """
    # Short metric key -> TensorBoard scalar tag
    metric_tags = {
        'acc': "Accuracy/Validation",
        'prec': "Metrics/Precision",
        'rec': "Metrics/Recall",
        'f1': "Metrics/F1",
        'auc': "Metrics/AUC",
    }
    # Result bucket -> metric key whose maximum selects the epoch
    selectors = (('f1', 'f1'), ('recall', 'rec'))

    seed_metrics = {
        bucket: {key: [] for key in metric_tags}
        for bucket, _ in selectors
    }

    for fold in range(1, num_folds + 1):
        log_dir = os.path.join(log_base_dir, f"seed_{seed}", f"tensorboard_logs_fold_{fold}")
        if os.path.exists(log_dir):
            accumulator = EventAccumulator(log_dir)
            accumulator.Reload()
        else:
            print(f"Missing: {log_dir}")
            continue

        # Per-epoch values of every metric, scaled by 100
        metrics = {
            key: [event.value * 100 for event in accumulator.Scalars(tag)]
            for key, tag in metric_tags.items()
        }

        for bucket, selector in selectors:
            best_idx = np.argmax(metrics[selector])
            for key, values in metrics.items():
                seed_metrics[bucket][key].append(values[best_idx])

    return seed_metrics

def compute_stats(data):
    """Format ``data`` as ``"<mean>% ± <std>%"`` with two decimals each.

    The spread is ``np.std`` with its default settings.
    """
    center = np.mean(data)
    spread = np.std(data)
    return f"{center:.2f}% ± {spread:.2f}%"

# Collect the per-fold metrics of every configured seed
all_seeds = {}
for seed in seeds:
    all_seeds[seed] = process_seed(seed)

# For the best-F1 models, reduce each seed to its mean over folds
metric_keys = ('acc', 'prec', 'rec', 'f1', 'auc')
final_metrics = {key: [] for key in metric_keys}

for seed in seeds:
    seed_data = all_seeds[seed]['f1']
    for key in metric_keys:
        final_metrics[key].append(np.mean(seed_data[key]))

# Per-seed report: fold average of each metric
per_seed_rows = (
    ("Accuracy", 'acc'),
    ("Precision", 'prec'),
    ("Recall", 'rec'),
    ("F1", 'f1'),
    ("AUC", 'auc'),
)
print("=== Individual Seed Results ===")
for seed in seeds:
    print(f"\nSeed {seed} - Best F1 Networks:")
    for label, key in per_seed_rows:
        print(f"{label}: {np.mean(all_seeds[seed]['f1'][key]):.4f}")

# Cross-seed report: mean and standard deviation of the per-seed averages
cross_seed_rows = (
    ("Accuracy", 'acc'),
    ("Precision", 'prec'),
    ("Recall", 'rec'),
    ("F1 Score", 'f1'),
    ("AUC", 'auc'),
)
print("\n=== Final Cross-Seed Statistics ===")
for label, key in cross_seed_rows:
    print(f"{label}: {compute_stats(final_metrics[key])}")


=== Individual Seed Results ===

Seed 0 - Best F1 Networks:
Accuracy: 81.3071
Precision: 56.2861
Recall: 76.9134
F1: 64.9688
AUC: 87.6995

Seed 1 - Best F1 Networks:
Accuracy: 80.7712
Precision: 55.1689
Recall: 78.8420
F1: 64.8963
AUC: 88.2355

=== Final Cross-Seed Statistics ===
Accuracy: 81.04% ± 0.27%
Precision: 55.73% ± 0.56%
Recall: 77.88% ± 0.96%
F1 Score: 64.93% ± 0.04%
AUC: 87.97% ± 0.27%
