### 3. Data Augmentation

In [1]:
import pandas as pd
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
import random
import cv2
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import layers, models
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow as tf
from tensorflow.keras.preprocessing import image
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import os
import imblearn
from collections import Counter
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler

2024-02-28 01:56:53.231231: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-02-28 01:56:53.457082: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-28 01:56:53.457291: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-28 01:56:53.497593: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-02-28 01:56:53.568322: I tensorflow/core/platform/cpu_feature_guar

In [2]:
def read_and_crop(path):
    
    # Read the CSV file into a DataFrame
    if path[-1] == '/':
        df = pd.read_csv(path +'_annotations.csv')
    else :
        df = pd.read_csv(path +'/_annotations.csv')

    # Read and store cropped images
    cropped_image_list = []

    for index, row in df.iterrows():
        image_path = row['filename']
        if path[-1] == '/':
            image = Image.open(path + image_path)
        else :
            image = Image.open(path + '/' + image_path)
            

        # Extract cropping coordinates
        xmin, ymin, xmax, ymax = (
            max(0, row['xmin']),
            max(0, row['ymin']),
            min(image.width, row['xmax']),
            min(image.height, row['ymax'])
        )

        # Check if the adjusted coordinates are valid
        if xmin < xmax and ymin < ymax:
            
            # Crop the image
            cropped_image = np.array(image.crop((xmin, ymin, xmax, ymax))) 

            # Reshape the images in the 'cropped_image' column
            cropped_reshape_image = cv2.resize(cropped_image, (171, 138))

            # Store the cropped image in the list
            cropped_image_list.append(cropped_reshape_image)
        else:
            # If the coordinates are invalid, append a placeholder (e.g., None)
            cropped_image_list.append(None)

    # Add a new column to the DataFrame with cropped images
    df['cropped_image'] = cropped_image_list.copy()

    # Filter out rows with None values in the 'cropped_image' column
    df_valid_crops = df.dropna(subset=['cropped_image']).copy()
    
    df_final = df_valid_crops[['class', 'cropped_image']].copy()
    
    # 
    old_class_to_actual_class = {
        'tuna' : 'tuna', 
        'surgeon': 'surgeon', 
        'shark': 'shark', 
        'jack': 'jack', 
        'grouper': 'grouper', 
        'parrot': 'parrot', 
        'snapper': 'snapper',
        'damsel': 'damsel', 
        'trigger': 'trigger', 
        'Zanclidae (Moorish Idol)': 'moorish idol',
        'Scaridae -Parrotfishes-': 'parrot', 
        'Carangidae -Jacks-': 'jack',
        'Scombridae -Tunas-': 'tuna', 
        'Shark -Selachimorpha-': 'shark',
        'Serranidae -Groupers-': 'grouper', 
        'Lutjanidae -Snappers-': 'snapper',
        'Acanthuridae -Surgeonfishes-': 'surgeon', 
        'Pomacentridae -Damselfishes-': 'damsel',
        'Labridae -Wrasse-': 'wrasse', 
        'angel': 'angel', 
        'wrasse': 'wrasse', 
        'Zanclidae -Moorish Idol-': 'moorish idol',
        'Ephippidae -Spadefishes-': 'spade', 
        'Pomacanthidae -Angelfishes-': 'angel',
        'Balistidae -Triggerfishes-': 'trigger', 
        'spade': 'spade'
    }
   
    # Replace old class names with new class names
    df_final['class'] = df_final['class'].replace(old_class_to_actual_class)
    
    return df_final


def print_image_by_classes(df,images_to_plot=2):
    
    # Get unique classes in the DataFrame
    unique_classes = df['class'].unique()

    # Plot two random images for each class
    for class_name in unique_classes:
        # Filter DataFrame based on the current class
        df_same_class = df[df['class'] == class_name]

        # Check the number of images in the current class
        num_images = len(df_same_class)

        if num_images >= images_to_plot:
            # Randomly select two images from the current class
            random_indices = random.sample(df_same_class.index.tolist(), images_to_plot)

            # Plot the two random images for the current class
            fig, axes = plt.subplots(1, images_to_plot, figsize=(10, 5))
            fig.suptitle(f'Two Random Images of Class: {class_name}')

            for i, ax in enumerate(axes):
                ax.imshow(df_same_class.loc[random_indices[i]]['cropped_image'])
                ax.set_title(f"Index: {random_indices[i]}")
                ax.axis('off')

            plt.show()
            continue
            
        elif num_images >= 1:
            # Plot the single image for the current class
            plt.figure(figsize=(5, 5))
            plt.imshow(df_same_class.iloc[0]['cropped_image'])
            plt.title(f'Image of Class: {class_name} (Index: {df_same_class.index[0]})')
            plt.axis('off')
            plt.show()
            continue
            
        else:
            print(f"No images of class '{class_name}' for plotting.")
    

In [3]:
df_train = read_and_crop('data/train/')
df_test = read_and_crop('data/test/')
df_valid = read_and_crop('data/valid/')

In [4]:
# function to ressort the X and y of dataframe
def to_work(df):
    
    class_to_number = {
    'tuna': 0, 
    'surgeon': 1, 
    'shark': 2, 
    'jack': 3, 
    'grouper': 4, 
    'parrot': 5, 
    'snapper': 6,
    'damsel': 7, 
    'trigger': 8, 
    'moorish idol': 9, 
    'wrasse': 10, 
    'angel': 11, 
    'spade': 12
}

    X = np.stack(df['cropped_image'].to_numpy().copy(), axis=0)
    
    # standardize and center data (make my pc crash)
    X = (X / 255) - 0.5
    
    y = df['class'].replace(class_to_number).to_numpy().copy()
    y_cat = to_categorical(y, num_classes=13)  
    
    return X, y, y_cat 

In [5]:
X_train, y_train, y_train_cat = to_work(df_train)

  y = df['class'].replace(class_to_number).to_numpy().copy()


In [6]:
X_train.shape

(2075, 138, 171, 3)

In [7]:
y_train.shape

(2075,)

In [8]:
def oversampling(X,y):
# define dataset
# Reshape each image to a flat vector
    X_flat = X.reshape(X.shape[0], -1)
# summarize class distribution
    print(Counter(y))
# define oversampling strategy
    oversample = RandomOverSampler(sampling_strategy='minority')
# fit and apply the transform
    X_over, y_over = oversample.fit_resample(X_flat, y)
# summarize class distribution
    print(Counter(y_over))
# Reshape the resampled data back to the original image shape
    X_resampled = X_over.reshape(-1, 138, 171, 3)
    return X_resampled, y_over

In [9]:
def oversampling(X, y, iterations=6):
    for i in range(iterations):
        # Reshape each image to a flat vector
        X_flat = X.reshape(X.shape[0], -1)
        
        # Summarize class distribution before oversampling
        print(f"Iteration {i + 1} - Class distribution before oversampling: {Counter(y)}")
        
        # Define oversampling strategy
        oversample = RandomOverSampler(sampling_strategy='minority')
        
        # Fit and apply the transform
        X_over, y_over = oversample.fit_resample(X_flat, y)
        
        # Summarize class distribution after oversampling
        print(f"Iteration {i + 1} - Class distribution after oversampling: {Counter(y_over)}")
        
        # Reshape the resampled data back to the original image shape
        X = X_over.reshape(-1, 138, 171, 3)
        y = y_over
    
    return X, y

# Perform oversampling for 6 iterations
X_resampled, y_resampled = oversampling(X_train, y_train, iterations=6)


Iteration 1 - Class distribution before oversampling: Counter({1: 534, 3: 376, 0: 333, 6: 299, 2: 188, 5: 176, 4: 147, 7: 12, 8: 2, 9: 2, 10: 2, 11: 2, 12: 2})
Iteration 1 - Class distribution after oversampling: Counter({1: 534, 8: 534, 3: 376, 0: 333, 6: 299, 2: 188, 5: 176, 4: 147, 7: 12, 9: 2, 10: 2, 11: 2, 12: 2})
Iteration 2 - Class distribution before oversampling: Counter({1: 534, 8: 534, 3: 376, 0: 333, 6: 299, 2: 188, 5: 176, 4: 147, 7: 12, 9: 2, 10: 2, 11: 2, 12: 2})
Iteration 2 - Class distribution after oversampling: Counter({1: 534, 8: 534, 9: 534, 3: 376, 0: 333, 6: 299, 2: 188, 5: 176, 4: 147, 7: 12, 10: 2, 11: 2, 12: 2})
Iteration 3 - Class distribution before oversampling: Counter({1: 534, 8: 534, 9: 534, 3: 376, 0: 333, 6: 299, 2: 188, 5: 176, 4: 147, 7: 12, 10: 2, 11: 2, 12: 2})
Iteration 3 - Class distribution after oversampling: Counter({1: 534, 8: 534, 9: 534, 10: 534, 3: 376, 0: 333, 6: 299, 2: 188, 5: 176, 4: 147, 7: 12, 11: 2, 12: 2})
Iteration 4 - Class distr

: 

In [None]:
d

In [None]:
X_1 , y_1= oversampling(X_train,y_train)

Counter({1: 534, 3: 376, 0: 333, 6: 299, 2: 188, 5: 176, 4: 147, 7: 12, 8: 2, 9: 2, 10: 2, 11: 2, 12: 2})
Counter({1: 534, 8: 534, 3: 376, 0: 333, 6: 299, 2: 188, 5: 176, 4: 147, 7: 12, 9: 2, 10: 2, 11: 2, 12: 2})


In [None]:
oversampling(X_1, y_1)

Counter({1: 534, 8: 534, 3: 376, 0: 333, 6: 299, 2: 188, 5: 176, 4: 147, 7: 12, 9: 2, 10: 2, 11: 2, 12: 2})
Counter({1: 534, 8: 534, 9: 534, 3: 376, 0: 333, 6: 299, 2: 188, 5: 176, 4: 147, 7: 12, 10: 2, 11: 2, 12: 2})


(array([[[[-0.27254902,  0.03333333,  0.18235294],
          [-0.27254902,  0.03333333,  0.18235294],
          [-0.27254902,  0.03333333,  0.18235294],
          ...,
          [-0.28431373,  0.0254902 ,  0.15490196],
          [-0.29215686,  0.0254902 ,  0.13921569],
          [-0.29215686,  0.0254902 ,  0.13921569]],
 
         [[-0.27647059,  0.02941176,  0.17843137],
          [-0.27647059,  0.02941176,  0.17843137],
          [-0.27647059,  0.02941176,  0.17843137],
          ...,
          [-0.27647059,  0.0254902 ,  0.1627451 ],
          [-0.27647059,  0.0254902 ,  0.14705882],
          [-0.28431373,  0.01764706,  0.14313725]],
 
         [[-0.28039216,  0.0254902 ,  0.1745098 ],
          [-0.28039216,  0.0254902 ,  0.1745098 ],
          [-0.28039216,  0.0254902 ,  0.1745098 ],
          ...,
          [-0.28431373,  0.01764706,  0.16666667],
          [-0.27254902,  0.00980392,  0.15882353],
          [-0.26862745,  0.00196078,  0.15490196]],
 
         ...,
 
         [[-

In [None]:
d

In [None]:
# define oversampling strategy
oversample = RandomOverSampler(sampling_strategy='minority')

In [None]:
# summarize the class distribution
counter = Counter(y_train)
print(f'Original Training : {counter}')

# combining SMOTE & Undersampling strategies
over = SMOTE(sampling_strategy=0.1)
under = RandomUnderSampler(sampling_strategy=0.5)

# creating a pipeline
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)

# transform the dataset
X_train_reassembled, y_train_reassembled = pipeline.fit_resample(X_train, y_train)

# summarize the new class distribution
counter = Counter(y_train_reassembled)
print(f'Training combining SMOTE & Undersampling strategies : {counter}')

Original Training : Counter({1: 534, 3: 376, 0: 333, 6: 299, 2: 188, 5: 176, 4: 147, 7: 12, 8: 2, 9: 2, 10: 2, 11: 2, 12: 2})


NameError: name 'RandomUnderSampler' is not defined

In [None]:
# Display the class distribution in the training set before rebalancing
print("Class distribution in the training set before rebalancing:")


# Reshape each image to a flat vector
X_train_flat = X_train.reshape(X_train.shape[0], -1)

# Rebalance the training set using SMOTE
smote = SMOTE(random_state=42,k_neighbors=min(1, len(X_train_flat) - 1))
X_train_resampled_flat, y_train_resampled = smote.fit_resample(X_train_flat, y_train)

# Reshape the resampled data back to the original image shape
X_train_resampled = X_train_resampled_flat.reshape(-1, 138, 171, 3)

# Display the class distribution in the rebalanced training set
print("\nClass distribution in the rebalanced training set:")
print(pd.Series(y_train_resampled).value_counts())


Class distribution in the training set before rebalancing:


: 