# Context

In this notebook we make the final run of the project. The idea is to aggregate all the models built for each phylum.

# Imports

In [None]:
from google.colab import drive
import zipfile
# Mount Google Drive so the dataset archive stored there becomes readable.
drive.mount('/content/drive')

# Location of the archive on Drive and the local folder it is unpacked into.
zip_path = '/content/drive/MyDrive/rare_species 1.zip'
extract_path = '/content/rare_species 1'

# Unpack the whole archive; the context manager closes the file afterwards.
with zipfile.ZipFile(zip_path, 'r') as archive:
    archive.extractall(extract_path)

In [None]:
import os
import shutil
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow import keras
from tensorflow import data as tf_data
from tensorflow.keras import layers
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.layers import Rescaling, RandAugment
from sklearn.model_selection import train_test_split

from sklearn.metrics import classification_report

In [None]:


# With colab
folder_path = '/content/rare_species 1/rare_species 1'

# With vscode
# folder_path = '../data/rare_species 1'

# The metadata CSV sits inside the dataset folder, so its path is built from
# folder_path: switching environment only requires changing the line above.
meta = pd.read_csv(os.path.join(folder_path, 'metadata.csv'))

# Splitting the data

## Creating a test set

To have a test set that is not split by phylum, we need to create it before copying each image into the folder of its specific phylum.

In [None]:
# Hold out 10% of the rows as the test set before the per-phylum split.
# `stratify` may only be given once; the split is stratified on 'family' so the
# class proportions are kept in both parts.
# NOTE(review): assumes 'family' is the classification target column — confirm.
meta_train_val, meta_test = train_test_split(
    meta,
    test_size=0.1,
    stratify=meta['family'],
    random_state=42,
)

In [None]:
# With colab
current_locations = '/content/rare_species 1/rare_species 1'
output_root = '/content'

# with vscode
# current_locations = '../data/rare_species 1'
# output_root = '../data'

# Copy every train/validation image to
#   <output_root>/<phylum>/<sub-folder of file_path>/<file name>
# The destination is anchored to output_root instead of the current working
# directory, so the copy lands in the same place regardless of where the
# kernel was started.
# NOTE(review): the dataset-loading cell reads lowercase folder names such as
# "/content/arthropoda"; confirm the `phylum` values use the same case.
for phylum, file_path in zip(meta_train_val['phylum'], meta_train_val['file_path']):
    file_location = os.path.join(current_locations, file_path)

    # Keep the sub-folder structure of file_path under the phylum folder.
    target_folder = os.path.join(output_root, phylum, os.path.dirname(file_path))
    os.makedirs(target_folder, exist_ok=True)  # Make sure the folder exists

    # Final destination path
    destination = os.path.join(target_folder, os.path.basename(file_path))

    # Copy the file if it exists, otherwise report it and carry on.
    if os.path.exists(file_location):
        shutil.copy2(file_location, destination)
    else:
        print(f"Couldn't find the file: {file_location}")

# Final Train, Val, Test, Split

In [None]:
# with colab
path_phylum_athropoda = "/content/arthropoda"
path_phylum_chordata = "/content/chordata"
path_phylum_cnidaria = "/content/cnidaria"
path_phylum_mollusca = "/content/mollusca"

# with vscode
# path_phylum_athropoda = "../data/arthropoda"
# path_phylum_chordata = "../data/chordata"
# path_phylum_cnidaria = "../data/cnidaria"
# path_phylum_mollusca = "../data/mollusca"

image_size = (224, 224)
seed = 42
batch_size = 32


def load_train_val_datasets(directory, validation_split=0.2):
    """Load the training and validation datasets of one phylum folder.

    Args:
        directory: Folder passed to ``keras.utils.image_dataset_from_directory``.
        validation_split: Fraction of the images reserved for validation.

    Returns:
        The ``(train, validation)`` pair produced by ``subset="both"``, built
        with the module-level ``seed``, ``image_size`` and ``batch_size``.
    """
    return keras.utils.image_dataset_from_directory(
        directory,
        validation_split=validation_split,
        subset="both",
        seed=seed,
        image_size=image_size,
        batch_size=batch_size,
    )


# One (train, validation) pair per phylum, all loaded with identical settings.
train_ds_athropoda, val_athropoda = load_train_val_datasets(path_phylum_athropoda)
train_ds_chordata, val_chordata = load_train_val_datasets(path_phylum_chordata)
train_ds_cnidaria, val_cnidaria = load_train_val_datasets(path_phylum_cnidaria)
train_ds_mollusca, val_mollusca = load_train_val_datasets(path_phylum_mollusca)



In [None]:
def display_images(train_ds):
    """Show up to nine images from the first batch of a dataset in a 3x3 grid.

    Args:
        train_ds: Dataset-like object whose ``take(1)`` yields
            ``(images, labels)`` batches. Each label is used as the title of
            its image.
    """
    for images, labels in train_ds.take(1):
        # A batch may hold fewer than nine images (small folder or small batch
        # size); only plot what is actually there instead of indexing past it.
        shown = min(9, len(images))
        plt.figure(figsize=(8, 8))
        for i in range(shown):
            plt.subplot(3, 3, i + 1)
            plt.imshow(np.array(images[i]).astype("uint8"))
            plt.title(int(labels[i]))
            plt.axis("off")
        plt.tight_layout()
        plt.show()

In [None]:
display_images(train_ds_athropoda)
display_images(train_ds_chordata)
display_images(train_ds_cnidaria)
display_images(train_ds_mollusca)

# Data Augmentation

In [None]:
def data_augmentation(images, augmentation_layers):
    """Pass ``images`` through each augmentation layer in order.

    Args:
        images: Value handed to the first layer.
        augmentation_layers: Iterable of callables; each receives the output
            of the previous one.

    Returns:
        The output of the last layer, or ``images`` unchanged when the
        iterable is empty.
    """
    augmented = images
    for apply_layer in augmentation_layers:
        augmented = apply_layer(augmented)
    return augmented

In [None]:
# Placeholder: no augmentation layers are defined for Arthropoda yet.
# NOTE(review): this is a plain list while the Chordata/Mollusca pipelines are
# keras.Sequential objects — confirm which form the Arthropoda model will use.
data_augmentation_athropoda= [
 ## add any you want here
]

# Augmentation pipeline for Chordata, applied to images on the 0-255 scale.
data_augmentation_chordata= keras.Sequential([

    # RandAugment: two randomly chosen augmentation operations per image
    layers.RandAugment(value_range=(0, 255), num_ops=2),

    # geometric changes: flip, rotate, zoom
    layers.RandomFlip("horizontal"),
    layers.RandomRotation(0.2), # random rotation, factor 0.2
    layers.RandomZoom(0.2), # random zoom, factor 0.2


    # colour changes: contrast and brightness
    layers.RandomContrast(0.2, value_range=(0, 255)), # contrast factor 0.2
    layers.RandomBrightness(0.2, (0, 255)), # brightness factor 0.2

    # adds Gaussian noise to the images to reduce overfitting
    # NOTE(review): make_model_chordata applies this pipeline before rescaling,
    # so a stddev of 0.1 on 0-255 pixels may be negligible — confirm intended.
    layers.GaussianNoise(0.1),

])

# Placeholder: no augmentation layers are defined for Cnidaria yet.
data_augmentation_cnidaria= [

 ## add any you want here
]

# Augmentation pipeline for Mollusca: flips, rotation, zoom and contrast.
data_augmentation_mollusca = keras.Sequential([
    layers.RandomFlip("horizontal_and_vertical"),
    layers.RandomRotation(0.2),   # Random rotation, factor 0.2
    layers.RandomZoom(0.2),        # Random zoom in/out, factor 0.2
    layers.RandomContrast(0.2)     # Random contrast change, factor 0.2
])


# Build the models

In [None]:
def make_model_athropoda(input_shape, num_classes):
    """Placeholder for the Arthropoda classifier builder.

    No architecture has been written yet, so the function fails explicitly
    rather than referencing names that do not exist.

    Args:
        input_shape: Shape of one input image (unused for now).
        num_classes: Number of output classes (unused for now).

    Raises:
        NotImplementedError: Always, until the model is defined.
    """
    raise NotImplementedError(
        "make_model_athropoda is not implemented yet: "
        "no architecture has been defined for the Arthropoda model."
    )


In [None]:
def make_model_chordata(input_shape, num_classes):
    """Build the Chordata classifier on a frozen, ImageNet-pretrained MobileNetV2.

    Args:
        input_shape: Shape of one input image, e.g. ``(224, 224, 3)``.
        num_classes: Number of units of the final softmax layer.

    Returns:
        A ``keras.Model`` mapping images to class probabilities. The MobileNetV2
        backbone is also exposed as ``model.base_model`` so it can be unfrozen
        later for fine-tuning.
    """
    inputs = keras.Input(shape=input_shape)

    # Augment first, while the pixels are still on the 0-255 scale.
    x = data_augmentation_chordata(inputs)

    # MobileNetV2's ImageNet weights expect pixels scaled to [-1, 1]
    # (what mobilenet_v2.preprocess_input does), not [0, 1].
    x = Rescaling(scale=1.0 / 127.5, offset=-1.0)(x)

    # Pretrained MobileNetV2 base
    base_model = MobileNetV2(include_top=False, input_tensor=x, weights="imagenet")
    base_model.trainable = False  # Freeze for transfer learning

    # Classification head: pool the feature map, normalise, regularise.
    x = base_model.output
    x = layers.GlobalAveragePooling2D()(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(0.3)(x)

    # L2 penalty on the final layer as an additional guard against overfitting.
    outputs = layers.Dense(
        num_classes,
        activation="softmax",
        kernel_regularizer=keras.regularizers.l2(0.001),
    )(x)

    model = keras.Model(inputs, outputs)
    # Keep a handle on the backbone so it can be unfrozen when fine-tuning.
    model.base_model = base_model

    return model

In [None]:
def make_model_cnidaria(input_shape, num_classes):
    """Placeholder for the Cnidaria classifier builder.

    No architecture has been written yet, so the function fails explicitly
    rather than referencing names that do not exist.

    Args:
        input_shape: Shape of one input image (unused for now).
        num_classes: Number of output classes (unused for now).

    Raises:
        NotImplementedError: Always, until the model is defined.
    """
    raise NotImplementedError(
        "make_model_cnidaria is not implemented yet: "
        "no architecture has been defined for the Cnidaria model."
    )

In [None]:
def make_model_mollusca(input_shape, num_classes):
    """Build the Mollusca classifier on a frozen, ImageNet-pretrained MobileNetV2.

    Args:
        input_shape: Shape of one input image, e.g. ``(224, 224, 3)``.
        num_classes: Number of units of the final softmax layer.

    Returns:
        A ``keras.Model`` mapping images to one probability vector per image.
        The MobileNetV2 backbone is also exposed as ``model.base_model``.
    """
    inputs = keras.Input(shape=input_shape)

    # Apply data augmentation while the pixels are still on the 0-255 scale.
    x = data_augmentation_mollusca(inputs)

    # MobileNetV2's ImageNet weights expect pixels scaled to [-1, 1]
    # (what mobilenet_v2.preprocess_input does), not [0, 1].
    x = Rescaling(scale=1.0 / 127.5, offset=-1.0)(x)

    # Pretrained MobileNetV2 base (frozen)
    base_model = MobileNetV2(include_top=False, input_tensor=x, weights="imagenet")
    base_model.trainable = False  # Freeze for transfer learning

    # Flatten instead of pooling (as required). Without this step the Dense
    # layer would run on the 4D feature map and emit one prediction per
    # spatial position instead of one per image.
    x = base_model.output
    x = layers.Flatten()(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(0.3)(x)  # randomly zeroes 30% of the activations during training

    outputs = layers.Dense(num_classes, activation="softmax")(x)

    model = keras.Model(inputs, outputs)
    # Keep a handle on the backbone so it can be unfrozen when fine-tuning.
    model.base_model = base_model

    return model

# Run The models

## Arthropoda

## Chordata

### First run

In [None]:
# First training stage for Chordata: only the new head is trained here, the
# backbone is frozen inside make_model_chordata.
epochs = 10
model_chordata = make_model_chordata(input_shape=image_size + (3,), num_classes=166)

# Keep only the checkpoint with the highest validation accuracy; "val_acc" is
# the validation value of the metric named "acc" in compile() below.
best_checkpoint = keras.callbacks.ModelCheckpoint(
    "best_model_chordata.keras",
    monitor="val_acc",
    mode="max",
    save_best_only=True,
    verbose=1,
)
callbacks = [best_checkpoint]

# Sparse loss/metric because the datasets provide integer class labels and
# the model already outputs softmax probabilities (from_logits=False).
sparse_loss = keras.losses.SparseCategoricalCrossentropy(from_logits=False)
sparse_accuracy = keras.metrics.SparseCategoricalAccuracy(name="acc")

model_chordata.compile(
    optimizer=keras.optimizers.Adam(learning_rate=1e-3),
    loss=sparse_loss,
    metrics=[sparse_accuracy],
)

model_chordata.fit(
    train_ds_chordata,
    validation_data=val_chordata,
    epochs=epochs,
    callbacks=callbacks,
)

### Fine-tuning

In [None]:
fine_tune_epochs = 20

# Number of layers at the end of the pretrained backbone left trainable.
trainable_tail = 40

# Rebuild the same architecture and start from the best weights saved by the
# first training run.
fine_tune_model = make_model_chordata(input_shape=image_size + (3,), num_classes=166)
fine_tune_model.load_weights("best_model_chordata.keras")

# Unfreeze the backbone, then freeze everything again except its last
# `trainable_tail` layers.
backbone = fine_tune_model.base_model
backbone.trainable = True
for frozen_layer in backbone.layers[:-trainable_tail]:
    frozen_layer.trainable = False

# Recompile with a lower learning rate for fine-tuning.
fine_tune_model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=1e-4),
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=False),
    metrics=[keras.metrics.SparseCategoricalAccuracy(name="acc")],
)

# Uses the `callbacks` list defined in the first-run cell.
fine_tune_model.fit(
    train_ds_chordata,
    validation_data=val_chordata,
    epochs=fine_tune_epochs,
    callbacks=callbacks,
)

## Cnidaria

## Mollusca

# Evaluation

In [None]:
def evaluate_model_predictions(model_path, test_ds):
    """Load a saved model and print a classification report for a dataset.

    Predictions and true labels are collected in a single pass over
    ``test_ds``. Iterating the dataset twice (once to predict, once to read
    the labels) pairs predictions with the wrong labels whenever the dataset
    yields its batches in a different order on each pass.

    Args:
        model_path: Path given to ``keras.models.load_model``.
        test_ds: Iterable of ``(images, labels)`` batches with integer labels.

    Raises:
        ValueError: If ``test_ds`` yields no batches.
    """
    model = keras.models.load_model(model_path)

    y_true_batches = []
    y_pred_batches = []
    for images, labels in test_ds:
        probabilities = model.predict_on_batch(images)
        # The predicted class is the index of the highest probability.
        y_pred_batches.append(np.argmax(probabilities, axis=1))
        y_true_batches.append(np.asarray(labels))

    if not y_true_batches:
        raise ValueError("test_ds yielded no batches; nothing to evaluate.")

    y_true = np.concatenate(y_true_batches, axis=0)
    y_pred = np.concatenate(y_pred_batches, axis=0)

    print(classification_report(y_true, y_pred))