<a href="https://colab.research.google.com/github/myredex/tensorflow_tutorials/blob/master/02_How_to_prepare_dataset_for_tensorflow.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Download sample data and unzip it
!gdown "14ffqV1P6xgVo9WhjqF3TSc7zTyJmSpxu"

# Unzip data
import zipfile
import tensorflow as tf

zip_ref = zipfile.ZipFile("image_sample.zip")
zip_ref.extractall()
zip_ref.close()

Downloading...
From: https://drive.google.com/uc?id=14ffqV1P6xgVo9WhjqF3TSc7zTyJmSpxu
To: /content/image_sample.zip
  0% 0.00/9.42M [00:00<?, ?B/s]100% 9.42M/9.42M [00:00<00:00, 183MB/s]


## Prepare dataset from directory using ImageDataGenerator

In [2]:
# You need directory for example "train" with subfolders "category_1", "category_2" etc. in it
# In each subfolder images belonging to this category
# ImageDataGenerator able to preprocess images and perform data augmentation

# Step 1 Create data generator instance
datagen = tf.keras.preprocessing.image.ImageDataGenerator(rescale=1./255,
                                                                       zoom_range=0.2,
                                                                       rotation_range=0.2)

# Step 2 Generate dataframe
train_data = datagen.flow_from_directory(directory="/content/images",
                                         target_size=(100, 100),
                                         class_mode="categorical",
                                         batch_size=32
                                         )

train_data

Found 6 images belonging to 3 classes.


<keras.src.preprocessing.image.DirectoryIterator at 0x7889adb1be80>

## Prepare dataset from directory using image_dataset_from_directory

In [3]:
from tensorflow.python.ops.gen_dataset_ops import batch_dataset
# You need directory for example "train" with subfolders "category_1", "category_2" etc. in it
# In each subfolder images belonging to this category
# image_dataset_from_directory has less options to preprocess images and perform data augmentation

train_data_2 = tf.keras.preprocessing.image_dataset_from_directory(directory="/content/images",
                                                                   label_mode="categorical",
                                                                   batch_size=32,
                                                                   image_size=(100, 100))

train_data_2

Found 6 files belonging to 3 classes.


<_BatchDataset element_spec=(TensorSpec(shape=(None, 100, 100, 3), dtype=tf.float32, name=None), TensorSpec(shape=(None, 3), dtype=tf.float32, name=None))>

## Prepare dataset from list of images and list of labels

In [4]:
# If you have no sorted images but have list of image's paths and list of labels
import os
# Create empty list
images_paths=[]
# Walk throught directory and collect all image's paths
for file in os.listdir("/content/all_images_in_one_folder"):
  images_paths.append("/content/all_images_in_one_folder" + "/" + file)
images_paths

['/content/all_images_in_one_folder/cherry1.jpeg',
 '/content/all_images_in_one_folder/apple1.jpg',
 '/content/all_images_in_one_folder/banana2.jpg',
 '/content/all_images_in_one_folder/apple2.jpg',
 '/content/all_images_in_one_folder/cherry2.jpg',
 '/content/all_images_in_one_folder/banana1.png']

In [5]:
# Now create list of labels
# The order of labels might NOT match list of paths in your notebook
text_labels_list = ["apple", "cherry", "apple", "cherry", "banana", "banana"]

one_hot_labels_list = [[1, 0, 0], [0, 1, 0], [1, 0, 0], [0, 1, 0], [0, 0, 1], [0, 0, 1]]

In [6]:
# When you have List of image's paths and list of labels you can create tensorflow dataframe

# Step 1 Create preprocessing function
def image_preprocessing(image, label, img_shape=100):

  img = tf.io.read_file(image) # Read file
  img = tf.io.decode_image(img, channels=3, expand_animations = False) # Decode file

  img = tf.image.resize(img, [img_shape, img_shape]) # Resize it
  return tf.cast(img, tf.float32), label # Return image in Float32 datatype

In [7]:
# Step 2 Create dataset
dataset = tf.data.Dataset.from_tensor_slices((images_paths, text_labels_list))
dataset = dataset.map(image_preprocessing, num_parallel_calls=tf.data.AUTOTUNE)
dataset.shuffle(buffer_size=1000).batch(batch_size=32).prefetch(buffer_size=tf.data.AUTOTUNE)

dataset

<_ParallelMapDataset element_spec=(TensorSpec(shape=(100, 100, 3), dtype=tf.float32, name=None), TensorSpec(shape=(), dtype=tf.string, name=None))>