<div class="alert alert-block alert-success">

# **1.** Environment Setup

</div>

## 1.1 Import Libraries

In [None]:
import pickle
import zipfile
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow import keras
from keras import regularizers

from classes import *
from functions import *

In [None]:
import tensorflow as tf
from tensorflow.keras import models
from tensorflow.keras import layers
from tensorflow.keras.applications.efficientnet import EfficientNetB0, preprocess_input
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Concatenate, Dropout, Input, Conv2D, MaxPooling2D, Flatten
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler, ReduceLROnPlateau
from keras.metrics import AUC, F1Score, CategoricalAccuracy, TopKCategoricalAccuracy
from sklearn.metrics import f1_score, precision_score, recall_score
from keras import optimizers

<div class="alert alert-block alert-success">

# **2.** Preprocessing

</div>

In [None]:
# Load the train / validation / test dataframes stored as pickles under ../data.
# NOTE: pickle.load can execute arbitrary code; only open files produced by this project.
def _read_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    with open(path, "rb") as handle:
        return pickle.load(handle)


train_df = _read_pickle("../data/train_df.pkl")
val_df = _read_pickle("../data/val_df.pkl")
test_df = _read_pickle("../data/test_df.pkl")

In [None]:
# Restore the pickled family encoder stored next to this notebook.
encoder_path = 'family_encoder.pkl'
with open(encoder_path, 'rb') as encoder_file:
    family_encoder = pickle.load(encoder_file)

# classes_ holds the family names; positions in it are used below as class indices.
class_names = family_encoder.classes_

class_names

In [None]:
# Families with fewer than 25 rows in the training dataframe count as minority classes.
family_counts = train_df['family'].value_counts()
minority_class = family_counts[family_counts < 25].index.to_list()

In [None]:
# Batch size and target image size handed to the Preprocessor.
# NOTE(review): the batch size changes how many steps an epoch is split into
# (roughly N / batch_size), not how many images the model sees per epoch —
# assuming Preprocessor batches the datasets with this value; confirm in its definition.
batch_size = 32
image_size = (224, 224)

preprocess = Preprocessor(image_size=image_size, batch_size=batch_size)

<div class="alert alert-block alert-success">

# **3.** Results Analysis

</div>

In [None]:
# Load the fine-tuned EfficientNet model from its .keras file.
# NOTE(review): `load_model` is not imported explicitly in this notebook; it presumably
# arrives through one of the star imports (`classes` / `functions`) — confirm.
model = load_model("efficient_net_finetuned_final.keras")

In [None]:
# Build the three datasets, passing EfficientNet's `preprocess_input` as preprocessing function.
# NOTE(review): the training split is loaded with mixup, oversampling and shuffling
# enabled and is later used for metrics and confidence analysis — presumably the
# "train" numbers are therefore measured on augmented data; confirm this is intended.
train_ds, _ = preprocess.load_img(
    data_dir="../data/rare_species/train",
    minority_class=minority_class,
    augment="mixup",
    oversampling=True,
    shuffle= True,
    preprocessing_function=preprocess_input)

# Validation split: no augmentation, no shuffling, no minority classes passed.
val_ds, _ = preprocess.load_img(
    data_dir="../data/rare_species/val",
    minority_class=[],
    augment=None,
    shuffle= False,
    preprocessing_function=preprocess_input)

# Test split: same settings as the validation split.
test_ds, _ = preprocess.load_img(
    data_dir="../data/rare_species/test",
    minority_class=[],
    augment=None,
    shuffle= False,
    preprocessing_function=preprocess_input)

In [None]:
# Model outputs for the validation dataset.
# NOTE(review): `y_pred` is reassigned further down before it is read in any visible cell,
# so this prediction pass looks redundant — confirm before removing.
y_pred = model.predict(val_ds)

### Classification report

In [None]:
# Report metrics for the saved fine-tuned model on the training dataset.
# `get_metric` is a project helper; presumably it loads the model file itself — confirm.
# Reminder: train_ds was loaded above with mixup and oversampling enabled.
get_metric(train_ds, "efficient_net_finetuned_final.keras")

In [None]:
# Report metrics for the saved fine-tuned model on the validation dataset.
get_metric(val_ds, "efficient_net_finetuned_final.keras")

### Visualize low-confidence images

In [None]:
# Materialise the training dataset as NumPy arrays.
# Images and labels are taken from the SAME pass over train_ds because the dataset is
# loaded with shuffling: a second pass would pair images with different labels.
# Whole batches are collected and concatenated once, instead of appending one sample
# at a time after `unbatch()`; the resulting arrays have the same shape and dtype.
train_image_batches = []
train_label_batches = []

for batch_images, batch_labels in train_ds:
    train_image_batches.append(batch_images.numpy())
    train_label_batches.append(batch_labels.numpy())

train_images = np.concatenate(train_image_batches)
train_labels = np.concatenate(train_label_batches)

# Predict all train images
pred_probs_all = model.predict(train_images, verbose=1)
y_pred = np.argmax(pred_probs_all, axis=1)  # predicted class index per training image
y_true = train_labels

In [None]:
# Sanity check: dimensions of the collected training label array.
train_labels.shape

In [None]:
# Inspect the collected training image array.
train_images

In [None]:
# Inspect the model outputs predicted for the collected training images.
pred_probs_all

In [None]:
# Highest predicted score per training sample (the "certainty" plotted below).
np.max(pred_probs_all, axis=1)

In [None]:
# Distribution of the model's highest score per sample over the collected training images.
fig, ax = plt.subplots(figsize=(8, 5))
ax.hist(pred_probs_all.max(axis=1), bins=30, edgecolor='black')
ax.set_xlabel('Model Certainty (Max Softmax Probability)')
ax.set_ylabel('Number of Samples')
ax.set_title('Distribution of Model Certainty')
ax.grid(True)
plt.show()

#### Histogram of model confidence, divided by correctly and incorrectly classified images

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Confidence histogram for the collected training images, split by whether the
# prediction was correct, with the per-bin misclassification rate on a second axis.
# Inputs: pred_probs_all and train_labels from the cells above.

# Ground-truth class indices (2-D label rows are collapsed with argmax).
labels = train_labels if train_labels.ndim <= 1 else np.argmax(train_labels, axis=1)

# Predicted class and its score for every sample.
preds = pred_probs_all.argmax(axis=1)
confidences = pred_probs_all.max(axis=1)

# Confidence values of the hits and of the misses.
is_correct = preds == labels
correct_conf = confidences[is_correct]
incorrect_conf = confidences[~is_correct]

# 20 equal-width bins covering [0, 1].
bins = np.linspace(0, 1, 21)

# Samples per bin for each group.
correct_counts = np.histogram(correct_conf, bins=bins)[0]
incorrect_counts = np.histogram(incorrect_conf, bins=bins)[0]
total_counts = correct_counts + incorrect_counts

# Share of misses per bin in percent; bins without any sample are reported as 0.
misclass_rate = np.divide(
    100 * incorrect_counts,
    total_counts,
    out=np.zeros(len(total_counts)),
    where=total_counts > 0,
)

# Mid-point of every bin, used as x position of the rate line.
bin_centers = 0.5 * (bins[:-1] + bins[1:])

fig, ax1 = plt.subplots(figsize=(8, 5))

# Left axis: stacked sample counts.
ax1.hist(
    [correct_conf, incorrect_conf],
    bins=bins,
    stacked=True,
    color=['green', 'red'],
    label=['Correct classification', 'Incorrect classification'],
    edgecolor="black",
)
ax1.set(
    xlabel='Model Confidence (Max Softmax Probability)',
    ylabel='Number of Samples',
)
ax1.legend(loc='upper left')

# Right axis: misclassification percentage per bin.
ax2 = ax1.twinx()
ax2.plot(bin_centers, misclass_rate, 'o-', color='blue', label='Misclassification rate (%)')
ax2.set_ylabel('Misclassification Rate (%)')
ax2.set_ylim(0, 100)
ax2.legend(loc='upper right')

plt.title('Distribution of Model Confidence')
plt.tight_layout()
plt.show()


#### Trying to incorporate phylum

In [None]:
# Materialise the validation dataset as NumPy arrays (val_ds is loaded with shuffle=False).
# Whole batches are collected and concatenated once instead of looping over single samples.
val_image_batches = []
val_label_batches = []

for batch_images, batch_labels in val_ds:
    val_image_batches.append(batch_images.numpy())
    val_label_batches.append(batch_labels.numpy())

val_images = np.concatenate(val_image_batches)
val_labels = np.concatenate(val_label_batches)

# Predict all val images
pred_probs_all_val = model.predict(val_images, verbose=1)
# Fixed: the predicted classes must come from the validation probabilities
# (the original took the argmax of the training probabilities, `pred_probs_all`).
y_pred = np.argmax(pred_probs_all_val, axis=1)
y_true = val_labels

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score

# Build the family -> phylum lookup from the dataset metadata.
meta = pd.read_csv('../data/rare_species/metadata.csv')

# Number of distinct phyla recorded per family; a value above 1 would mean the lookup
# below is ambiguous for that family (the last metadata row wins).
dup = meta.groupby('family')['phylum'].nunique()

family_to_phylum = meta.set_index('family')['phylum'].to_dict()


In [None]:
# --- 2) Prepare true labels & names ---
# val_labels: validation labels with one row per image (collapsed to indices here).
# class_names: maps class index -> family name.
labels = val_labels.argmax(axis=1)  # integer family index per validation image
y_true_fam = np.array([class_names[class_idx] for class_idx in labels])
y_true_phyl = np.array([family_to_phylum[family] for family in y_true_fam])


In [None]:
# --- 3) Baseline predictions & phylum check ---
# Most probable family per validation image, then its name and its phylum.
y_pred = pred_probs_all_val.argmax(axis=1)
y_pred_fam = np.array([class_names[class_idx] for class_idx in y_pred])
y_pred_phyl = np.array([family_to_phylum[family] for family in y_pred_fam])

In [None]:
# Group the class indices by phylum: {phylum: [indices of the families in that phylum]}.
phylum_to_inds = {}
for class_idx, family in enumerate(class_names):
    phylum = family_to_phylum[family]
    if phylum not in phylum_to_inds:
        phylum_to_inds[phylum] = []
    phylum_to_inds[phylum].append(class_idx)


In [None]:
# --- 4) Hierarchical override logic ---
# NOTE: the override relies on the TRUE phylum of each image, so this is an oracle
# experiment (what would happen if the phylum were known), not a deployable rule.
y_pred_hier = y_pred.copy()
wrong_phylum = np.flatnonzero(y_pred_phyl != y_true_phyl)
for sample_idx in wrong_phylum:
    # Restrict to the classes of the true phylum and keep the most probable one.
    candidate_classes = phylum_to_inds[y_true_phyl[sample_idx]]
    best_candidate = np.argmax(pred_probs_all_val[sample_idx][candidate_classes])
    y_pred_hier[sample_idx] = candidate_classes[best_candidate]



In [None]:
# --- 5) Overall accuracy comparison ---
# Same ground truth, scored once with the plain argmax and once with the override.
acc_base, acc_hier = (
    accuracy_score(labels, predicted) for predicted in (y_pred, y_pred_hier)
)
print(f"Baseline overall accuracy:     {acc_base:.4f}")
print(f"Hierarchical override accuracy:{acc_hier:.4f}")

In [None]:
# Per-class reports on the validation labels, before and after the phylum override.
# Both reports use 4 digits so the numbers are directly comparable.
print("Classification report with original predictions:")
print(classification_report(labels, y_pred, digits=4))
# Fixed: this heading used to repeat "original predictions" for the override report.
print("Classification report with hierarchical predictions:")
print(classification_report(labels, y_pred_hier, digits=4))

In [None]:
# --- 6) Per-phylum accuracy improvement ---
# Accuracy restricted to the images of each true phylum, for both prediction sets.
phyla = np.unique(y_true_phyl)
phylum_masks = [y_true_phyl == phylum for phylum in phyla]
base_scores = [accuracy_score(labels[mask], y_pred[mask]) for mask in phylum_masks]
hier_scores = [accuracy_score(labels[mask], y_pred_hier[mask]) for mask in phylum_masks]

# Two bars per phylum, placed left and right of the tick position.
x = np.arange(len(phyla))
width = 0.35

fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(x - width / 2, base_scores, width, label='Baseline')
ax.bar(x + width / 2, hier_scores, width, label='Hierarchical')
ax.set_xticks(x)
ax.set_xticklabels(phyla, rotation=45, ha='right')
ax.set_ylabel('Accuracy')
ax.set_title('Accuracy by Phylum')
ax.legend()
fig.tight_layout()
plt.show()


#### Low-certainty misclassifications

In [None]:
# Least-confident misclassifications on the validation set.
# Fixed: the labels/predictions in scope at this point are the validation ones, but the
# original indexed the TRAINING probabilities and images (`pred_probs_all`,
# `train_images`), so the confidences and pictures did not belong to the listed samples.
# Everything below is derived from the validation arrays only.
n_to_show = 12  # fills the 4 x 3 grid plotted below

# Class indices of the true and predicted families
y_true_labels = np.argmax(val_labels, axis=1)
val_pred_labels = np.argmax(pred_probs_all_val, axis=1)

# Get misclassified indices
misclassified_indices = np.where(y_true_labels != val_pred_labels)[0]

# Confidence (highest predicted probability) of every misclassified sample;
# the next cell plots these values as a histogram.
confidences = np.max(pred_probs_all_val[misclassified_indices], axis=1)

# Ascending sort: the first entries are the least confident mistakes
sorted_indices = np.argsort(confidences)
least_confident = sorted_indices[:n_to_show]
selected_indices = misclassified_indices[least_confident]

# (image, true class index, predicted class index, confidence) per selected sample
images_to_show = [
    (val_images[idx], y_true_labels[idx], val_pred_labels[idx], conf)
    for idx, conf in zip(selected_indices, confidences[least_confident])
]

# Save the misclassified data
misclassified_data = {
    "True Label": [class_names[true_label] for _, true_label, _, _ in images_to_show],
    "Predicted Label": [class_names[pred_label] for _, _, pred_label, _ in images_to_show],
    "Confidence": [confidence for _, _, _, confidence in images_to_show]
}

# Plot
plt.figure(figsize=(16, 10))

for i, (img, true_label, pred_label, confidence) in enumerate(images_to_show):
    plt.subplot(4, 3, i + 1)
    plt.imshow(img.astype("uint8"))
    plt.axis('off')
    plt.title(f"True: {class_names[true_label]}\nPred: {class_names[pred_label]}\nConf: {confidence:.2f}",
              color='red', fontsize=10)

plt.suptitle("Less Confident Misclassifications", fontsize=18)
plt.tight_layout()
plt.show()

In [None]:
# Distribution of the confidences computed in the previous cell (misclassified samples only).
fig, ax = plt.subplots(figsize=(8, 5))
ax.hist(confidences, bins=30, edgecolor='black')
ax.set_xlabel('Model Certainty (Max Softmax Probability)')
ax.set_ylabel('Number of Misclassified Samples')
ax.set_title('Distribution of Model Certainty for Misclassified Samples')
ax.grid(True)
plt.show()

With 202 classes, a model that is completely uncertain (i.e., predicting a uniform distribution over the classes) would have a maximum softmax probability of about 1/202 ≈ 0.005. In practice models are rarely that uniform even when they are uncertain — weak predictions usually still reach a certainty of roughly 0.02–0.1, depending on how peaked the softmax output is.

In [None]:
# Share of validation predictions whose highest probability is below a threshold.
# Fixed: (1) the printed threshold (0.1) did not match the one actually applied (0.019),
# so the message now reports the applied value; (2) the counts were labelled
# "validation" but were computed on the training arrays.
uncertainty_threshold = 0.019

# Maximum probability for ALL validation predictions, one value per image
all_confidences = np.max(pred_probs_all_val, axis=1)

# Total number of validation images
total_val_images = len(val_images)

# Number of predictions below the threshold
num_uncertain_predictions = np.sum(all_confidences < uncertainty_threshold)

print(f"Total validation images: {total_val_images}")
print(f"Number of predictions with certainty < {uncertainty_threshold}: {num_uncertain_predictions}")
print(f"Percentage: {100 * num_uncertain_predictions / total_val_images:.2f}%")

In [None]:
# Show the true label, predicted label and confidence of the selected misclassifications.
misclassified_data

#### Top Correctly Classified

In [None]:
# Most confident correct classifications on the validation set.
# Fixed: the original read the pictures from `test_images`, which is never defined in
# this notebook, and took the confidences from the training probabilities
# (`pred_probs_all`). Everything below is derived from the validation arrays only.
n_to_show = 6  # fills the 2 x 3 grid plotted below

# Class indices of the true and predicted families
y_true_labels = np.argmax(val_labels, axis=1)
val_pred_labels = np.argmax(pred_probs_all_val, axis=1)

# Get indices of correctly classified images
correct_indices = np.where(y_true_labels == val_pred_labels)[0]

# Confidence (highest predicted probability) of every correctly classified image
confidences = np.max(pred_probs_all_val[correct_indices], axis=1)

# Descending sort: the first entries are the most confident hits
sorted_indices = np.argsort(confidences)[::-1]
most_confident = sorted_indices[:n_to_show]
selected_indices = correct_indices[most_confident]

# (image, true class index, predicted class index, confidence) per selected sample
images_to_show = [
    (val_images[idx], y_true_labels[idx], val_pred_labels[idx], conf)
    for idx, conf in zip(selected_indices, confidences[most_confident])
]

# Save the correctly classified data
correctly_classified_data = {
    "True Label": [class_names[true_label] for _, true_label, _, _ in images_to_show],
    "Predicted Label": [class_names[pred_label] for _, _, pred_label, _ in images_to_show],
    "Confidence": [confidence for _, _, _, confidence in images_to_show]
}

# Plot
plt.figure(figsize=(16, 10))

for i, (img, true_label, pred_label, confidence) in enumerate(images_to_show):
    plt.subplot(2, 3, i + 1)
    plt.imshow(img.astype("uint8"))
    plt.axis('off')
    plt.title(f"True: {class_names[true_label]}\nPred: {class_names[pred_label]}\nConf: {confidence:.2f}",
              color='green', fontsize=10)

plt.suptitle("Most Confident Correct Classifications", fontsize=18)
plt.tight_layout()
plt.show()

In [None]:
# Show the true label, predicted label and confidence of the selected correct classifications.
correctly_classified_data

#### Correctly Classified for Specific Class

In [None]:
# Families involved in the selected misclassifications; the loops below iterate over them.
misclassified_data

In [None]:
# For every family that appears as TRUE label among the selected misclassifications,
# call the project helper `show_correct_predictions_for_family` (presumably it displays
# correctly classified examples of that family — confirm in its definition).
# Fixed: `test_images` is never defined in this notebook and `pred_probs_all` holds the
# training probabilities; the labels and predictions passed here are the validation ones,
# so the validation probabilities and images are passed alongside them.
for family in misclassified_data["True Label"]:
    show_correct_predictions_for_family(y_true, y_pred, pred_probs_all_val, val_images, class_names, family_name=family, num_images=6)

In [None]:
# Same helper call for every family that appears as PREDICTED label among the selected
# misclassifications.
# Fixed: `test_images` is never defined in this notebook and `pred_probs_all` holds the
# training probabilities; the labels and predictions passed here are the validation ones,
# so the validation probabilities and images are passed alongside them.
for family in misclassified_data["Predicted Label"]:
    show_correct_predictions_for_family(y_true, y_pred, pred_probs_all_val, val_images, class_names, family_name=family, num_images=6)

In [None]:
# Families of the selected correct classifications; the loop below iterates over them.
correctly_classified_data

In [None]:
# Same helper call for every family among the most confident correct classifications.
# Fixed: `test_images` is never defined in this notebook and `pred_probs_all` holds the
# training probabilities; the labels and predictions passed here are the validation ones,
# so the validation probabilities and images are passed alongside them.
for family in correctly_classified_data["True Label"]:
    show_correct_predictions_for_family(y_true, y_pred, pred_probs_all_val, val_images, class_names, family_name=family, num_images=6)

<div class="alert alert-block alert-success">

# **4.** More

</div>

In [None]:
import pandas as pd
import seaborn as sns
import os
import matplotlib.pyplot as plt


In [None]:
# Display name of each model -> CSV file with its training history
# (paths are relative to the notebook's working directory).
files = {
    "VGG16": "vgg_history_train_until_opt.csv",
    "ResNet50": "resnet50_final_train_history.csv",
    "EfficientNet Baseline 1": "efficient_net_baseline1_history.csv",
    "EfficientNet Baseline 2": "efficient_net_baseline2_history.csv",
    "EfficientNet Final": "efficient_net_final_train_history.csv",
    "EfficientNet Fine-Tuned": "efficient_net_final_finetune_history.csv"
}


In [None]:
def load_history(path):
    """Read a training-history CSV and normalise the accuracy column names.

    Columns named ``acc`` / ``val_acc`` are renamed to ``accuracy`` /
    ``val_accuracy``; every other column is returned unchanged.
    """
    legacy_names = {'acc': 'accuracy', 'val_acc': 'val_accuracy'}
    history = pd.read_csv(path)
    # rename() skips mapping keys that are not present among the columns.
    return history.rename(columns=legacy_names)


In [None]:
# Load every training history and summarise its LAST epoch in one table.
histories = {label: load_history(path) for label, path in files.items()}

# Summary column -> column name in the history CSV.
# The original looked each key up twice (`get(k, get(k, None))`), which is the same as a
# single lookup: Series.get returns None when the history has no such column.
summary_columns = {
    "Train Set Accuracy": "accuracy",
    "Validation Set Accuracy": "val_accuracy",
    "Train Set F1-Score": "f1_score",
    "Validation Set F1-Score": "val_f1_score",
    "Train Set Loss": "loss",
    "Validation Set Loss": "val_loss",
}

summary_list = []
for label, df in histories.items():
    last_epoch = df.iloc[-1]
    # One row per model: its name plus each metric of the final epoch (None if missing)
    row = {"Model": label}
    row.update({column: last_epoch.get(key) for column, key in summary_columns.items()})
    summary_list.append(row)

# Index by model name and order from best to worst validation F1-score
summary_df = (
    pd.DataFrame(summary_list)
    .set_index("Model")
    .sort_values(by="Validation Set F1-Score", ascending=False)
)


### Final Summary

In [None]:
# Order the models from best to worst validation F1-score.
# NOTE(review): the summary is already sorted this way in the cell that builds it,
# so this line repeats that step.
summary_df = summary_df.sort_values(by="Validation Set F1-Score", ascending=False)

In [None]:
# Final-epoch metrics per model, best validation F1-score first.
summary_df

### Plots

In [None]:
# Use the Aptos typeface for all figures below.
# NOTE(review): Aptos has to be installed on the machine that runs the notebook;
# presumably matplotlib falls back to its default font (with a warning) otherwise — confirm.
plt.rcParams['font.family'] = 'Aptos'

In [None]:
# Train vs. validation accuracy of the final epoch, one bar pair per model.
# Fixed: `DataFrame.plot` opens its own figure unless it is given an axes, so the
# original `plt.figure(figsize=(8, 10))` stayed empty and its size was never applied.
# The axes is now created first and passed to pandas.
sns.set_style("white")
fig, ax = plt.subplots(figsize=(8, 10))
summary_df[['Train Set Accuracy', 'Validation Set Accuracy']].plot(
    kind='bar', color=['navy', 'maroon'], legend=False, rot=90, ax=ax)
ax.legend(loc='upper right', fontsize=10)
ax.set_ylabel("Score")
ax.set_ylim(0, 1)
plt.show()


In [None]:
# Train vs. validation loss of the final epoch, one bar pair per model.
# Fixed: `DataFrame.plot` opens its own figure unless it is given an axes, so the
# original `plt.figure(figsize=(8, 10))` stayed empty and its size was never applied.
# The axes is now created first and passed to pandas.
sns.set_style("white")
fig, ax = plt.subplots(figsize=(8, 10))
summary_df[['Train Set Loss', 'Validation Set Loss']].plot(
    kind='bar', color=['navy', 'maroon'], legend=False, rot=90, ax=ax)
ax.legend(loc='upper left', fontsize=10)
ax.set_ylabel("Loss")
ax.set_ylim(0, 2.5)
plt.show()

In [None]:
# Train vs. validation F1-score of the final epoch, one bar pair per model.
# Fixed: `DataFrame.plot` opens its own figure unless it is given an axes, so the
# original `plt.figure(figsize=(8, 10))` stayed empty and its size was never applied.
# The axes is now created first and passed to pandas.
sns.set_style("white")
fig, ax = plt.subplots(figsize=(8, 10))
summary_df[['Train Set F1-Score', 'Validation Set F1-Score']].plot(
    kind='bar', color=['navy', 'maroon'], legend=False, rot=90, ax=ax)
ax.legend(loc='upper right', fontsize=10)
ax.set_ylabel("F1-Score")
ax.set_ylim(0, 1)
plt.show()

### Test Performance

In [None]:
from tensorflow.keras.applications.efficientnet import EfficientNetB0, preprocess_input

In [None]:
# Test split loaded without augmentation or shuffling, with EfficientNet's `preprocess_input`.
# NOTE(review): this repeats the test_ds load from the "Results Analysis" section with
# identical arguments — probably redundant; confirm before removing.
test_ds, _ = preprocess.load_img(
    data_dir="../data/rare_species/test",
    minority_class=[],
    augment=None,
    shuffle= False,
    preprocessing_function=preprocess_input)

In [None]:
# Report metrics for the saved fine-tuned model on the held-out test dataset.
get_metric(test_ds, "efficient_net_finetuned_final.keras")