In [15]:
import pandas as pd
import numpy as np
import matplotlib as plt
import sys
import os
from PIL import Image
from glob import glob
from tensorflow.keras.applications import EfficientNetB0
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D
import cv2
import shutil
from sklearn.model_selection import train_test_split

In [2]:
# Root folder of the extracted CUB_200_2011 dataset; every path below lives under it.
dataset_root = '/Users/melissaaprilcastro/FeatherFind/Data/CUB_200_2011'

# --- dataset inputs ---
# folder holding all images (one sub-folder per species)
images_dir = f'{dataset_root}/images'
# maps each image number to its class id
labels_file = f'{dataset_root}/image_class_labels.txt'
# dimensions of the bounding boxes
bounding_boxes_file = f'{dataset_root}/bounding_boxes.txt'
# class labels for the files/folders
classes_file = f'{dataset_root}/classes.txt'
# binary flag per image defining the dataset's train/test split
split_file = f'{dataset_root}/train_test_split.txt'

# --- output folders for the split copies of the images ---
train_dir = f'{dataset_root}/Train'
test_dir = f'{dataset_root}/Test'


In [19]:
# Read the train/test assignment table: space-separated, no header row,
# one line per image holding the image number and a 0/1 training flag.
df_split = pd.read_csv(
    split_file,
    sep=' ',
    header=None,
    names=['image_num', 'is_training'],
)
df_split

Unnamed: 0,image_num,is_training
0,1,0
1,2,1
2,3,0
3,4,1
4,5,1
...,...,...
11783,11784,1
11784,11785,0
11785,11786,0
11786,11787,1


In [20]:
# Read the image -> class table: space-separated, no header row,
# one line per image holding the image number and its class id.
labels_df = pd.read_csv(
    labels_file,
    sep=' ',
    header=None,
    names=['image_num', 'class_id'],
)
labels_df

Unnamed: 0,image_num,class_id
0,1,1
1,2,1
2,3,1
3,4,1
4,5,1
...,...,...
11783,11784,200
11784,11785,200
11785,11786,200
11786,11787,200


In [21]:
# merge the two df
data_df = pd.merge(df_split, labels_df, on='image_num')
data_df

Unnamed: 0,image_num,is_training,class_id
0,1,0,1
1,2,1,1
2,3,0,1
3,4,1,1
4,5,1,1
...,...,...,...
11783,11784,1,200
11784,11785,0,200
11785,11786,0,200
11786,11787,1,200


In [30]:
# Build the train/test path lists from the dataset's own image index.
#
# `image_num` in train_test_split.txt is the image id that CUB-200-2011 assigns
# in images.txt (one "<image_id> <species_folder>/<file_name>" line per image,
# per the dataset README). It is NOT the position at which os.listdir() happens
# to yield a file: listdir order is arbitrary and also counts stray non-image
# files, so numbering files with a running counter attaches the training flag
# to the wrong images while still printing plausible-looking totals.
images_index_file = os.path.join(os.path.dirname(images_dir), 'images.txt')
images_df = pd.read_csv(
    images_index_file,
    sep=' ',
    header=None,
    names=['image_num', 'relative_path'],
)

# A single join replaces one full-DataFrame scan per image; validate= makes a
# duplicated image id fail loudly instead of silently duplicating rows.
split_paths_df = images_df.merge(
    df_split, on='image_num', how='inner', validate='one_to_one'
)
unmatched = len(df_split) - len(split_paths_df)
if unmatched:
    print(f"Warning: {unmatched} split entries have no match in {images_index_file}")

# lists to hold train and test paths
train_paths = []
test_paths = []

for relative_path, is_training in zip(
    split_paths_df['relative_path'], split_paths_df['is_training']
):
    image_path = os.path.join(images_dir, relative_path)

    # Report (rather than silently keep) index entries whose file is missing on disk.
    if not os.path.isfile(image_path):
        print(f"Error processing {relative_path}: file not found")
        continue

    # is_training == 1 marks a training image; anything else goes to the test set
    if is_training == 1:
        train_paths.append(image_path)
    else:
        test_paths.append(image_path)

# print summary
print(f"Total training images: {len(train_paths)}")
print(f"Total testing images: {len(test_paths)}")

Total training images: 5994
Total testing images: 5794


In [13]:
# Collect every image path in the dataset.
#
# The data is laid out as one folder per bird species under the images folder:
#
#   images/
#     <Bird Species 1>/
#       image1 ... imagek
#     <Bird Species 2>/
#       image1 ... imagek
#     ...
#
# The search is rooted at `images_dir` (defined in the paths cell); the previous
# root, `base_path`, is not defined anywhere in this notebook and raises a
# NameError on a fresh kernel.
image_paths = glob(f'{images_dir}/**/*.jpg', recursive=True)
# print total images in data
print(f'total images: {len(image_paths)}')

total images: 11788


In [32]:
# Copy images into per-species sub-folders of an output directory.
def save_images_to_folders(image_paths, output_dir):
    """Copy every image into ``output_dir/<name of the image's parent folder>/``.

    Args:
        image_paths: iterable of paths to image files; the name of the folder
            that directly contains each file is used as its species folder.
        output_dir: directory under which the species folders are created
            (existing folders are reused).

    Returns:
        None. The copied files keep their original file names.
    """
    for source_path in image_paths:
        # Split ".../<species>/<file>" into its folder and file-name parts.
        parent_folder, file_name = os.path.split(source_path)
        species = os.path.basename(parent_folder)

        # Make sure the species folder exists in the output tree.
        destination_folder = os.path.join(output_dir, species)
        os.makedirs(destination_folder, exist_ok=True)

        # Copy the file under the same name.
        destination_path = os.path.join(destination_folder, file_name)
        shutil.copy(source_path, destination_path)

# Write both splits to disk: training images first, then testing images.
for split_paths, split_dir in ((train_paths, train_dir), (test_paths, test_dir)):
    save_images_to_folders(split_paths, split_dir)

In [36]:
# Use EfficientNet as the starting point to fine-tune for bird classification.
#
# Notes on the model (from the Keras EfficientNet documentation):
#   - it takes input images of shape (224, 224, 3);
#   - input data should be in the range [0, 255];
#   - normalization is included as part of the model.
#
# Names used here come from the imports cell: tensorflow (tf), EfficientNetB0,
# Model, and the Dense / GlobalAveragePooling2D layers.
#
# include_top=False: to my understanding this is the usual setting when
# fine-tuning the EfficientNet model.
base_model = EfficientNetB0(
    include_top=False,
    weights='imagenet',
)

2024-09-23 00:51:57.729806: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M3 Pro
2024-09-23 00:51:57.729884: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 18.00 GB
2024-09-23 00:51:57.729892: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 6.00 GB
2024-09-23 00:51:57.729987: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-09-23 00:51:57.730303: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [35]:
# NOTE(review): this whole cell is a single string literal, so none of the code
# inside it runs; the cell only echoes the text back as its output.
# Issues to fix before enabling the draft below:
#   - `base_path` is not defined anywhere in the visible notebook;
#   - `base_peth` and `im_size` are never defined (they look like misspellings
#     of `base_path` and `image_size`);
#   - `path + str(i)` concatenates without a path separator;
#   - the file listing reads the dataset root on every iteration; presumably it
#     should list `data_path` instead -- TODO confirm.
''' for every efficient net model we hace tio resize images to their specified size

for efficient net it is 224 resolution

import cv2

path = base_path
dataset_path = os.listdir(base_path)

image_size = 224

images = []
labels = []

for i in dataset_path:
    data_path = path + str(i)
    filenames = [i for i in os.listdir(base_peth)]
    for f in filenames:
        img = cv2. imread (data_path + '/' + f)
        img = cv2. resize(img, (im_size, im_size))
        images. append (img)
        labels. append(i)
        
'''

" for every efficient net model we hace tio resize images to their specified size\n\nfor efficient net it is 224 resolution\n\nimport cv2\n\npath = base_path\ndataset_path = os.listdir(base_path)\n\nimage_size = 224\n\nimages = []\nlabels = []\n\nfor i in dataset_path:\n    data_path = path + str(i)\n    filenames = [i for i in os.listdir(base_peth)]\n    for f in filenames:\n        img = cv2. imread (data_path + '/' + f)\n        img = cv2. resize(img, (im_size, im_size))\n        images. append (img)\n        labels. append(i)\n        \n"