<a href="https://colab.research.google.com/github/pcashman21/feral-cat-census/blob/main/src/notebooks/generate_unusable_images.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook uses TensorFlow data augmentation to generate N transforms of each image marked as unusable in a dataframe read from the CSV file '/content/gdrive/My Drive/Cat images/kaggle_cats/usability_00_01_done.csv'.  The updated dataframe, which also lists the generated images, is written to 'usability_master.csv' in the same folder.  The value of N (`NUMBER_OF_TRANSFORMS`) is set so that we will end up with an approximately equal number of usable and unusable images in the dataframe.

For more information see https://www.tensorflow.org/tutorials/images/data_augmentation.

In [1]:
import os
import sys
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers

In [2]:
# Colab-only step: mount Google Drive at '/content/gdrive'. The data paths used
# later in this notebook all live under this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
def transform_one_unusable_image(row, path_to_read, path_to_write, transformer, number_of_transforms):
    """
    Create and save a number of transformed copies of a single image.

    The source image is read from <path_to_read>/<folder>/<filename>. Copy number i
    is written to <path_to_write>/<folder>/<stem>_transformed_<i><extension>, where
    <stem> and <extension> come from splitting the filename at its LAST dot.

    row: row of dataframe (or any mapping) with keys 'filename' and 'folder'
    path_to_read: path to folder of images
    path_to_write: path to folder to save transformed images; the per-image
        subfolder is created if it does not exist yet
    transformer: image transformer object; it is called with the image array and
        its result must provide a .numpy() method
    number_of_transforms: number of transforms to create
    Returns:
        dataframe with one row per transformed image and the columns 'filename',
        'usable' (always 0), 'folder' and 'generated' (always 1). The dataframe is
        empty, but still has these columns, when number_of_transforms is 0.
    """
    result_columns = ['filename', 'usable', 'folder', 'generated']

    image_file = row['filename']
    image_folder = row['folder']
    image_file_path = os.path.join(path_to_read, image_folder, image_file)
    image = plt.imread(image_file_path)

    # splitext splits at the last dot only and keeps the dot in the extension, so
    # names such as 'cat.01.jpg' keep their full stem ('cat.01') and real
    # extension ('.jpg').
    image_file_name, image_file_extension = os.path.splitext(image_file)
    image_file_name_transformed_root = image_file_name + '_transformed_'

    output_folder = os.path.join(path_to_write, image_folder)
    if number_of_transforms > 0:
        # The write location may differ from the read location, in which case the
        # subfolder might not exist yet.
        os.makedirs(output_folder, exist_ok=True)

    # Collect plain records and build the dataframe once at the end instead of
    # concatenating one-row dataframes inside the loop.
    records = []
    for i in range(number_of_transforms):
        image_file_name_transformed = image_file_name_transformed_root + str(i) + image_file_extension
        image_file_path_transformed = os.path.join(output_folder, image_file_name_transformed)
        image_transformed = transformer(image)
        # Convert tensor to numpy array and save
        plt.imsave(image_file_path_transformed, image_transformed.numpy())
        records.append({
            'filename': image_file_name_transformed,
            'usable': 0,
            'folder': image_folder,
            'generated': 1,
        })
    return pd.DataFrame(records, columns=result_columns)

In [4]:
def transform_unusable_images(df, path_to_read, path_to_write, transformer, number_of_transforms):
    """
    This function takes in a dataframe, selects the unusable images, creates a number of
    transforms of each unusable image, and saves the transforms to a folder.

    Only rows with usable == 0 and generated == 0 are transformed. If the dataframe
    has no 'generated' column, every existing row is treated as not generated. An
    existing 'generated' column is respected, so images produced by an earlier run
    are not transformed again. The dataframe passed in is not modified.

    df: dataframe with image names and usability (columns 'filename', 'usable',
        'folder' and optionally 'generated')
    path_to_read: path to folder of images
    path_to_write: path to folder to save transformed images
    transformer: image transformer object
    number_of_transforms: number of transforms to create

    Returns:
        updated dataframe with transformed images: the original rows followed by one
        row per generated image, with 'usable' and 'generated' cast to int
    """

    # Work on a copy so the caller's dataframe is left untouched.
    df = df.copy()
    if 'generated' not in df.columns:
        df['generated'] = 0 # Add column to indicate if image is generated

    # Select unusable images that are not themselves generated
    needs_transform = (df['usable'] == 0) & (df['generated'] == 0)

    # transform_one_unusable_image returns a dataframe with multiple rows, so the
    # per-image results are collected in a list and concatenated once at the end
    # rather than growing df inside the loop.
    generated_frames = []
    for _, row in df[needs_transform].iterrows():
        df_generated_unusables = transform_one_unusable_image(row, path_to_read, path_to_write, transformer, number_of_transforms)
        if len(df_generated_unusables) > 0:
            generated_frames.append(df_generated_unusables)

    if generated_frames:
        df = pd.concat([df] + generated_frames, axis=0, ignore_index=True)
    df['usable'] = df['usable'].astype(int)
    df['generated'] = df['generated'].astype(int)
    return df

In [5]:
# Side length (height and width) handed to the Resizing layer of the transformer.
# NOTE(review): 224 is the more common input size for image models -- confirm that
# 244 is intended and not a typo.
IMG_SIZE = 244
# Number of transformed copies created for each unusable image.
NUMBER_OF_TRANSFORMS = 20

In [9]:
# Root folder on the mounted drive. The usability CSV is read from here, and the
# source images are read from its per-image 'folder' subdirectories.
PATH_TO_READ = '/content/gdrive/My Drive/Cat images/kaggle_cats'
# Same location as PATH_TO_READ, so transformed images are saved next to their
# originals and the updated CSV is written alongside the input CSV.
PATH_TO_WRITE = '/content/gdrive/My Drive/Cat images/kaggle_cats'

In [7]:
# Image transformer, assembled layer by layer. Layers run in the order they are
# added: the image is first resized and rescaled, then passed through the random
# transforms.
transformer_nn = tf.keras.Sequential()
transformer_nn.add(layers.Resizing(IMG_SIZE, IMG_SIZE))           # IMG_SIZE x IMG_SIZE output
transformer_nn.add(layers.Rescaling(1./255))                      # scale pixel values by 1/255
transformer_nn.add(layers.RandomFlip("horizontal_and_vertical"))  # random flips on both axes
transformer_nn.add(layers.RandomRotation(0.2))                    # random rotation, factor 0.2
transformer_nn.add(layers.RandomTranslation(0.3, 0.3))            # random shift, height/width factor 0.3
transformer_nn.add(layers.RandomZoom(0.2))                        # random zoom, factor 0.2

In [10]:
# Load the usability table and give its three columns the names used by the
# transform functions.
df = (
    pd.read_csv(os.path.join(PATH_TO_READ,'usability_00_01_done.csv'))
    .set_axis(['filename','usable','folder'], axis=1)
)
df.head()

Unnamed: 0,filename,usable,folder
0,00000001_005.jpg,1,CAT_00
1,00000001_008.jpg,0,CAT_00
2,00000001_011.jpg,1,CAT_00
3,00000001_012.jpg,1,CAT_00
4,00000001_016.jpg,1,CAT_00


In [11]:
# Generate and save the transforms for every unusable image; the returned dataframe
# holds the original rows plus one row per generated image.
df = transform_unusable_images(
    df,
    path_to_read=PATH_TO_READ,
    path_to_write=PATH_TO_WRITE,
    transformer=transformer_nn,
    number_of_transforms=NUMBER_OF_TRANSFORMS,
)
df.head()

Unnamed: 0,filename,usable,folder,generated
0,00000001_005.jpg,1,CAT_00,0
1,00000001_008.jpg,0,CAT_00,0
2,00000001_011.jpg,1,CAT_00,0
3,00000001_012.jpg,1,CAT_00,0
4,00000001_016.jpg,1,CAT_00,0


In [12]:
# Write the updated dataframe (original rows plus generated unusable images) to the
# master CSV, replacing any existing file and leaving out the index column.
df.to_csv(
    os.path.join(PATH_TO_WRITE,'usability_master.csv'),
    mode='w',
    header=True,
    index=False,
)