<div class="alert alert-block alert-success">

# **1.** Environment Setup

</div>

In [3]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Thu Apr 24 21:27:53 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off |   00000000:00:04.0 Off |                    0 |
| N/A   30C    P0             46W /  400W |       0MiB /  40960MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

In [4]:
from psutil import virtual_memory

# Memory of the runtime in decimal gigabytes (bytes / 1e9).
# NOTE: this reads the *total* memory, even though the message below says "available".
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# 20 GB is the cut-off used here to tell a standard runtime from a high-RAM one.
is_high_ram = ram_gb >= 20
print('You are using a high-RAM runtime!' if is_high_ram else 'Not using a high-RAM runtime')

Your runtime has 89.6 gigabytes of available RAM

You are using a high-RAM runtime!


## 1.1 Connect Google Drive

In [5]:
from google.colab import drive
# Mount Google Drive at /content/drive; the project folder used below lives under this mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
import os

# Project folder on the mounted Google Drive; relative paths in later cells resolve against it.
project_dir = '/content/drive/MyDrive/College/MSc/2nd Semester/Deep Learning/project'
os.chdir(project_dir)

# Echo the working directory to confirm the switch took effect
print("Changed directory to:", os.getcwd())

Changed directory to: /content/drive/MyDrive/College/MSc/2nd Semester/Deep Learning/project


## 1.2 Import Libraries

In [7]:
# Google Colab
# !pip install keras_cv

In [8]:
import pandas as pd
import zipfile
import pickle
from sklearn.preprocessing import LabelEncoder
from pathlib import Path
from sklearn.model_selection import train_test_split
from tensorflow import keras
from keras import regularizers
from classes import *
from functions import *

In [9]:
import tensorflow as tf
from tensorflow.keras import models
from tensorflow.keras import layers
from tensorflow.keras.applications.efficientnet import EfficientNetB0, preprocess_input
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.applications.resnet50 import preprocess_input
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Concatenate, Dropout, Input, Conv2D, MaxPooling2D, Flatten
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler, ReduceLROnPlateau
from keras.metrics import AUC, F1Score, CategoricalAccuracy, TopKCategoricalAccuracy
from sklearn.metrics import f1_score, precision_score, recall_score
from keras import optimizers

## 1.3 Import Dataset

The dataset is too large to be stored on GitHub (where our project is hosted). The solution is to create a folder named `data` and place the `rare_species` dataset inside it. The `.gitignore` file ensures this folder is ignored when pulling or pushing changes, so everyone needs to keep a copy of it locally on their own machine. A random seed was used to ensure that the splits stay the same.

In [10]:
# Import Metadata
# The path is relative to the current working directory (the project folder set with os.chdir above).
metadata_path = Path("../data/rare_species/metadata.csv")
df = pd.read_csv(metadata_path)
# Preview the first rows: identifiers, taxonomy (kingdom, phylum, family) and the image file_path.
df.head()

Unnamed: 0,rare_species_id,eol_content_id,eol_page_id,kingdom,phylum,family,file_path
0,75fd91cb-2881-41cd-88e6-de451e8b60e2,12853737,449393,animalia,mollusca,unionidae,mollusca_unionidae/12853737_449393_eol-full-si...
1,28c508bc-63ff-4e60-9c8f-1934367e1528,20969394,793083,animalia,chordata,geoemydidae,chordata_geoemydidae/20969394_793083_eol-full-...
2,00372441-588c-4af8-9665-29bee20822c0,28895411,319982,animalia,chordata,cryptobranchidae,chordata_cryptobranchidae/28895411_319982_eol-...
3,29cc6040-6af2-49ee-86ec-ab7d89793828,29658536,45510188,animalia,chordata,turdidae,chordata_turdidae/29658536_45510188_eol-full-s...
4,94004bff-3a33-4758-8125-bf72e6e57eab,21252576,7250886,animalia,chordata,indriidae,chordata_indriidae/21252576_7250886_eol-full-s...


In [11]:
df.shape  # (n_rows, n_columns) of the metadata table; 11983 rows, i.e. 11983 images

(11983, 7)

<div class="alert alert-block alert-success">

# **2.** Preprocessing

</div>

In [12]:
def _unpickle(path):
    """Return the object stored in the pickle file at ``path``."""
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Train / validation / test DataFrames saved as .pkl files in ../data
train_df = _unpickle("../data/train_df.pkl")
val_df = _unpickle("../data/valid_df.pkl")
test_df = _unpickle("../data/test_df.pkl")

# Label encoders for the family and phylum columns, stored next to the notebook
# (presumably already fitted when they were saved; confirm where they are created)
family_encoder = _unpickle("family_encoder.pkl")
phylum_encoder = _unpickle("phylum_encoder.pkl")

In [13]:
# Minority classes: families that appear in fewer than 25 rows of the training DataFrame
family_counts = train_df['family'].value_counts()
minority_class = family_counts[family_counts < 25].index.to_list()

In [14]:
batch_size = 32 # images per training step: an epoch over N images takes about N / batch_size steps, so a smaller batch means more weight updates per epoch
image_size = (224, 224) # passed to the preprocessor below; presumably the (height, width) images are resized to (confirm in the project code)

# Preprocessor_with_phylum comes from the project's star imports (`classes` / `functions`).
# Its load_img method is used later to build the datasets.
preprocess = Preprocessor_with_phylum(image_size=image_size, batch_size=batch_size)

In [15]:
# Metrics handed to model.compile(...) further down; nothing is compiled in this cell.
verbose = 1  # NOTE(review): not referenced in the cells visible here; presumably used by later fit/evaluate calls, confirm
metrics = [
    CategoricalAccuracy(name="accuracy"),  # top-1 accuracy
    AUC(name="auc"),
    F1Score(average="macro", name="f1_macro"),  # unweighted mean of the per-class F1 scores
    F1Score(average="weighted", name="f1_weighted"),  # per-class F1 scores weighted by class support
    TopKCategoricalAccuracy(k=5, name="top5_accuracy")  # true class among the 5 highest-scoring predictions
]

In [16]:
# num_images = 16 ##
# rows, cols = 4, 4 ##

# plot_batch(train_ds, class_names=class_names, num_images=num_images, rows=rows, cols=cols)

<div class="alert alert-block alert-success">

# **3.** Models

</div>

## EfficientNet + Phylum

### Baseline 1 (without preprocessing and without regularization)

#### Set-up

In [17]:
# Re-import EfficientNet's preprocess_input on purpose: the import cell at the top imports the
# ResNet50 function under the same name afterwards, so without this line `preprocess_input`
# would refer to the ResNet50 version.
from tensorflow.keras.applications.efficientnet import EfficientNetB0, preprocess_input

In [18]:
# Build the training and validation datasets for this baseline.
# Both splits use the same options apart from the data and the shuffling, so the shared
# keyword arguments are collected once and reused.
baseline_load_options = dict(
    augment=None,
    oversampling=False,
    preprocessing_function=preprocess_input,
    family_encoder=family_encoder,
    phylum_encoder=phylum_encoder,
)

# Training split: shuffled; the call also returns the family and phylum class names
train_ds_en_no_proc_no_reg, family_class_names, phylum_class_names = preprocess.load_img(
    train_df, minority_class=[], shuffle=True, **baseline_load_options
)

# Validation split: not shuffled; the class names are discarded
val_ds_en_no_proc_no_reg, _, _ = preprocess.load_img(
    val_df, minority_class=[], shuffle=False, **baseline_load_options
)

#### Run

In [19]:
# Image branch: ImageNet-pretrained EfficientNetB0 without its classification head.
# The spatial size is taken from `image_size` (defined with the preprocessor) so the model
# input always matches what the data pipeline was configured with.
image_input = Input(shape=(*image_size, 3), name="image_input")  # RGB image
base_model = EfficientNetB0(include_top=False, weights="imagenet", input_tensor=image_input)

# Freeze the base model layers so that only the newly added layers are trained
for layer in base_model.layers:
    layer.trainable = False

# Global average pooling turns the 4D feature map into a 2D tensor:
# (batch_size, 1280) for EfficientNetB0
x = GlobalAveragePooling2D()(base_model.output)

# Phylum input: a vector with 5 entries (one per phylum class)
phylum_input = Input(shape=(5,), name="phylum_input")

# Combine image and phylum features: (batch_size, 1280 + 5)
combined = Concatenate()([x, phylum_input])
combined = Dense(256, activation='relu')(combined)  # Fully connected layer
output = Dense(202, activation='softmax')(combined)  # Final classification layer (202 family classes)

# Define the two-input model
model_en_no_proc_no_reg = Model(inputs=[image_input, phylum_input], outputs=output)

# Compile the model
model_en_no_proc_no_reg.compile(
    optimizer=optimizers.RMSprop(learning_rate=1e-4),
    loss=keras.losses.CategoricalCrossentropy(),
    metrics=metrics
)

# Print the model summary
# model_en_no_proc_no_reg.summary()

In [None]:
# Initialize the experiment
# Experiment comes from the project's star imports; the notes below are read off the
# arguments and the training log, so confirm them against the class definition.
experiment_en_no_proc_no_reg = Experiment(
    model=model_en_no_proc_no_reg,
    train_ds=train_ds_en_no_proc_no_reg,
    val_ds=val_ds_en_no_proc_no_reg,
    experiment_name="eff-net_with_phylum_no_proc_no_reg",  # appears as experiment_name in experiment_log.csv
    resume=False,  # presumably starts a fresh run instead of continuing a saved one (TODO confirm)
    steps_per_epoch=263,  # hard-coded; matches the 263 steps per epoch shown in the training output
)

# Run the experiment
# Trains for 25 epochs without extra callbacks and keeps the returned history.
history_en_no_proc_no_reg = experiment_en_no_proc_no_reg.run_experiment(callbacks=None, epochs=25)

Epoch 1/25
[1m263/263[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m157s[0m 273ms/step - accuracy: 0.0746 - auc: 0.6430 - f1_macro: 0.0205 - f1_weighted: 0.0560 - loss: 5.0549 - top5_accuracy: 0.1755 - val_accuracy: 0.2326 - val_auc: 0.8260 - val_f1_macro: 0.0621 - val_f1_weighted: 0.1435 - val_loss: 4.2074 - val_top5_accuracy: 0.4296
Epoch 2/25
[1m263/263[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 185ms/step - accuracy: 0.2657 - auc: 0.8615 - f1_macro: 0.0813 - f1_weighted: 0.1745 - loss: 3.9376 - top5_accuracy: 0.4855 - val_accuracy: 0.3239 - val_auc: 0.8958 - val_f1_macro: 0.1307 - val_f1_weighted: 0.2321 - val_loss: 3.4247 - val_top5_accuracy: 0.5760
Epoch 3/25
[1m263/263[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 184ms/step - accuracy: 0.3622 - auc: 0.9264 - f1_macro: 0.1638 - f1_weighted: 0.2752 - loss: 3.1646 - top5_accuracy: 0.6324 - val_accuracy: 0.4062 - val_auc: 0.9360 - val_f1_macro: 0.2084 - val_f1_weighted: 0.3232 - val_loss: 2.8680 - val_top5

In [25]:
# Load the experiment log
# NOTE: this rebinds `df`, which held the image metadata loaded at the top of the notebook.
df = pd.read_csv('experiment_log.csv')

# Identify the latest experiment (assumes the run that just finished has the highest id)
max_id = df['id'].max()

# Filter the DataFrame to get the rows (one per epoch) of the latest experiment
df_latest_experiment = df[df['id'] == max_id]

# Save the latest experiment log to a CSV file.
# DataFrame.to_csv does not create missing directories, so make sure the results folder exists first.
history_path = Path('phylum_models_results/efficient_net_phylum_no_proc_no_reg_history.csv')
history_path.parent.mkdir(parents=True, exist_ok=True)
df_latest_experiment.to_csv(history_path, index=False)

df_latest_experiment

Unnamed: 0,id,experiment_name,epoch,train_accuracy,val_accuracy,train_loss,val_loss,f1_train_macro,f1_val_macro,f1_train_weighted,f1_val_weighted,top5_train_accuracy,top5_val_accuracy,timestamp
325,12,eff-net_with_phylum_no_proc_no_reg,1,0.1415,0.2326,4.7804,4.2074,0.0365,0.0621,0.0947,,0.279,0.4296,2025-04-24 21:04:44
326,12,eff-net_with_phylum_no_proc_no_reg,2,0.286,0.3239,3.7534,3.4247,0.1026,0.1307,0.1979,,0.5152,0.576,2025-04-24 21:05:33
327,12,eff-net_with_phylum_no_proc_no_reg,3,0.3845,0.4062,3.0423,2.868,0.1891,0.2084,0.3014,,0.6477,0.6611,2025-04-24 21:06:21
328,12,eff-net_with_phylum_no_proc_no_reg,4,0.4714,0.4763,2.5021,2.4492,0.2866,0.296,0.3993,,0.7391,0.7334,2025-04-24 21:07:08
329,12,eff-net_with_phylum_no_proc_no_reg,5,0.541,0.5159,2.0869,2.1355,0.3768,0.3449,0.4815,,0.8121,0.7863,2025-04-24 21:07:56
330,12,eff-net_with_phylum_no_proc_no_reg,6,0.6021,0.5693,1.7783,1.9012,0.4896,0.4442,0.5605,,0.8594,0.8236,2025-04-24 21:08:44
331,12,eff-net_with_phylum_no_proc_no_reg,7,0.6591,0.5943,1.5373,1.7354,0.5871,0.4928,0.6326,,0.8906,0.8425,2025-04-24 21:09:32
332,12,eff-net_with_phylum_no_proc_no_reg,8,0.6983,0.6138,1.3571,1.6164,0.6389,0.5292,0.6779,,0.9111,0.8564,2025-04-24 21:10:20
333,12,eff-net_with_phylum_no_proc_no_reg,9,0.7287,0.6305,1.2164,1.5211,0.6887,0.5569,0.714,,0.9243,0.8715,2025-04-24 21:11:08
334,12,eff-net_with_phylum_no_proc_no_reg,10,0.7568,0.6422,1.0999,1.4525,0.7265,0.5757,0.746,,0.9355,0.8742,2025-04-24 21:11:56
