<a href="https://colab.research.google.com/github/pcashman21/feral-cat-census/blob/main/src/notebooks/move_files_to_idg_folder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook takes images from the '/content/gdrive/My Drive/Cat images/kaggle_cats' folder structure (CAT_00, CAT_01) and a dataframe defined in the CSV file '/content/gdrive/My Drive/Cat images/kaggle_cats/usability_master.csv', and copies resized versions of the image files into a train/test folder structure for use by the TensorFlow ImageDataGenerator class (the original files are left in place).  We assume that the images named in the dataframe are approximately balanced between usable and unusable images.  (This was accomplished by running the generate_unusable_images notebook.)

In [9]:
import os
import pandas as pd
from tensorflow.keras.preprocessing.image import load_img, save_img

In [7]:
# Mount Google Drive at /content/gdrive so that the image folders and the CSV
# file referenced later in this notebook are reachable as ordinary file paths.
# Requires the google.colab package, i.e. a Colab runtime.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [19]:
def move_image(x, path_to_read, path_to_write, train_class_limits, test_class_limits, train_class_counts, test_class_counts, target_size=(244, 244)):
    """
    Copy one image, resized, into a train/test folder structure that
        can be used by the ImageDataGenerator class.

    The image is read from path_to_read/<folder>/<filename> and written to
    path_to_write/<train|test>/<unusable|usable>/<n>.jpg.  The training quota
    of the image's class is filled first, then the testing quota.  When both
    quotas are already full the row is skipped without reading the image.
    The source file is never modified or deleted.

    x: row of dataframe containing the image file name ('filename'), the
        sub-folder it lives in ('folder') and its class label ('usable');
        a label equal to 0 means unusable, anything else is treated as usable
    path_to_read: path to directory from which to read images
    path_to_write: path to directory to save images
    train_class_limits: number of images to use for training in each class
    test_class_limits: number of images to use for testing in each class
    train_class_counts: number of images written to training directory for
        each class (updated in place)
    test_class_counts: number of images written to testing directory for
        each class (updated in place)
    target_size: (height, width) handed to load_img when reading the image.
        NOTE(review): the default of 244 is kept from the original code; many
        pretrained image models expect 224 -- confirm which one is intended.

    Returns None.
    """

    # Index 0 of every limits/counts list is the unusable class, index 1 the usable class.
    if x['usable'] == 0:
        class_index, class_dir = 0, 'unusable'
    else:
        class_index, class_dir = 1, 'usable'

    # Decide where the image goes before touching the disk, so that rows which
    # will not be written do not pay for an image decode and resize.
    if train_class_counts[class_index] < train_class_limits[class_index]:
        split_dir, split_counts = 'train', train_class_counts
    elif test_class_counts[class_index] < test_class_limits[class_index]:
        split_dir, split_counts = 'test', test_class_counts
    else:
        return

    # Read in image
    image = load_img(os.path.join(path_to_read, x['folder'], x['filename']), target_size=target_size)

    # Make sure the destination exists; writing into a missing directory fails.
    destination_dir = os.path.join(path_to_write, split_dir, class_dir)
    os.makedirs(destination_dir, exist_ok=True)

    # Files are numbered sequentially within each split/class directory, which
    # keeps output names unique regardless of the source file names.
    # NOTE(review): save_img rescales pixel values by default (scale=True);
    # confirm that this is the intended behaviour for these images.
    image_number = split_counts[class_index] + 1
    save_img(os.path.join(destination_dir, str(image_number) + '.jpg'), image)

    # Only consume a quota slot once the image has actually been written.
    split_counts[class_index] = image_number


In [15]:
def make_folder_for_image_generator(df, path_to_read, path_to_write, train_test_split=0.8):
    """
    Copy the images listed in df into a train/test folder structure that
        can be used by the ImageDataGenerator class.

    For each class (usable == 0 and usable == 1) the first
    int(class size * train_test_split) rows, in dataframe order, go to
    path_to_write/train/<class> and the remaining rows of that class go to
    path_to_write/test/<class>.  The per-image work is delegated to move_image.

    df: dataframe containing the image file names and their class labels
        (columns 'filename', 'folder' and 'usable' are required)
    path_to_read: path to directory from which to read images
    path_to_write: path to directory to save images
    train_test_split: proportion of images to use for training (0 to 1)

    Raises ValueError if train_test_split is outside the range 0 to 1.
    Returns None.
    """

    if not 0 <= train_test_split <= 1:
        raise ValueError('train_test_split must be between 0 and 1, got {!r}'.format(train_test_split))

    # Create the complete directory tree up front so that every class folder
    # exists even when a class ends up with no images in one of the splits.
    for split_dir in ('train', 'test'):
        for class_dir in ('unusable', 'usable'):
            os.makedirs(os.path.join(path_to_write, split_dir, class_dir), exist_ok=True)

    # Count number of images in each class.  .get() keeps a class that is
    # absent from the dataframe at zero instead of raising KeyError.
    class_counts = df['usable'].value_counts()
    class_totals = [int(class_counts.get(label, 0)) for label in (0, 1)]

    train_class_limits = [int(total * train_test_split) for total in class_totals]
    test_class_limits = [total - limit for total, limit in zip(class_totals, train_class_limits)]

    # Running tallies, mutated in place by move_image as images are written.
    train_class_counts = [0, 0]
    test_class_counts = [0, 0]

    # A plain loop rather than DataFrame.apply: the work is done purely for
    # its side effects and there is no result to collect.
    for _, row in df[['usable', 'filename', 'folder']].iterrows():
        move_image(row, path_to_read, path_to_write, train_class_limits, test_class_limits, train_class_counts, test_class_counts)

In [21]:
# Source and destination locations on the mounted Google Drive.  Both live
# under the same 'Cat images' folder, so the shared root is spelled out once.
CAT_IMAGES_ROOT = '/content/gdrive/My Drive/Cat images'
PATH_TO_READ = CAT_IMAGES_ROOT + '/kaggle_cats'
PATH_TO_WRITE = CAT_IMAGES_ROOT + '/ImageDataGenerator_images'

In [10]:
# Load the per-image usability labels that sit next to the image folders,
# then preview the first rows as a sanity check.
usability_csv_path = os.path.join(PATH_TO_READ, 'usability_master.csv')
df = pd.read_csv(usability_csv_path)
df.head()

Unnamed: 0,filename,usable,folder,generated
0,00000001_005.jpg,1,CAT_00,0
1,00000001_008.jpg,0,CAT_00,0
2,00000001_011.jpg,1,CAT_00,0
3,00000001_012.jpg,1,CAT_00,0
4,00000001_016.jpg,1,CAT_00,0


In [22]:
# Populate the train/test folder tree, using 80% of each class for training.
make_folder_for_image_generator(
    df,
    PATH_TO_READ,
    PATH_TO_WRITE,
    train_test_split=0.8,
)