# 📚 Data Preprocessing for Food-101 Dataset

In [None]:
# -- Imports --
import tensorflow as tf
from datasets import load_dataset
import numpy as np
import matplotlib.pyplot as plt

# -- Load Food-101 Dataset --
# Load the 'food101' dataset through Hugging Face `datasets`.
print("Loading Hugging Face 'food101' dataset...")
food_dataset = load_dataset("food101")

# -- Explore Dataset --
# Report how many examples each split used below contains.
for _caption, _split in (
    ("Training samples:", "train"),
    ("Validation samples:", "validation"),
):
    print(_caption, len(food_dataset[_split]))

# -- Preprocessing Functions --
# Target size handed to tf.image.resize inside preprocess().
IMG_SIZE = (224, 224)

In [None]:
def preprocess(example):
    """Resize and normalize a single dataset example.

    Args:
        example: Mapping with an ``'image'`` entry (expected to be a PIL
            image; anything ``np.array`` accepts is passed through as-is)
            and a ``'label'`` entry.

    Returns:
        dict: ``{"image": image, "label": label}`` where ``image`` is a
        float32 tensor resized to ``IMG_SIZE`` and scaled to [0, 1], and
        ``label`` is the input label unchanged.
    """
    raw_image = example['image']
    # Force three channels when given a PIL image: grayscale, palette, RGBA
    # or CMYK inputs would otherwise yield 2-D or non-3-channel arrays, which
    # break tf.image.resize or the (H, W, 3) shape expected downstream.
    if hasattr(raw_image, "convert"):
        raw_image = raw_image.convert("RGB")

    image = tf.convert_to_tensor(np.array(raw_image))  # Convert PIL to tensor
    image = tf.image.resize(image, IMG_SIZE)
    image = tf.cast(image, tf.float32) / 255.0  # Normalize to [0,1]
    label = example['label']
    return {"image": image, "label": label}  # Must return dict for datasets

# -- Apply Preprocessing to Dataset --
# Run preprocess() over the train split, then the validation split.
processed_train, processed_val = [
    food_dataset[split].map(preprocess, remove_columns=['image', 'label'])
    for split in ('train', 'validation')
]

# -- Convert to tf.data.Dataset --
# Number of examples per batch used by hf_to_tf_dataset().
BATCH_SIZE = 32

def hf_to_tf_dataset(dataset, batch_size=None):
    """Wrap an iterable of preprocessed examples as a batched tf.data.Dataset.

    Args:
        dataset: Iterable of mappings with ``"image"`` and ``"label"``
            entries; images must match the shape ``(*IMG_SIZE, 3)``.
        batch_size: Examples per batch. Defaults to the module-level
            ``BATCH_SIZE`` when omitted.

    Returns:
        tf.data.Dataset: Yields ``(image_batch, label_batch)`` tuples, with
        prefetching enabled.
    """
    if batch_size is None:
        batch_size = BATCH_SIZE

    def _examples():
        # Invoked by tf.data each time the dataset is iterated.
        for row in dataset:
            yield {"image": row["image"], "label": row["label"]}

    # Derive the image shape from IMG_SIZE so the signature cannot drift
    # away from the size used by preprocess().
    signature = {
        "image": tf.TensorSpec(shape=(*IMG_SIZE, 3), dtype=tf.float32),
        "label": tf.TensorSpec(shape=(), dtype=tf.int64),
    }

    examples = tf.data.Dataset.from_generator(_examples, output_signature=signature)
    # Unpack each dict into an (image, label) tuple before batching.
    pairs = examples.map(lambda x: (x["image"], x["label"]))
    return pairs.batch(batch_size).prefetch(tf.data.AUTOTUNE)

# Build the batched tf.data pipelines for both splits.
train_tfds = hf_to_tf_dataset(processed_train)
val_tfds = hf_to_tf_dataset(processed_val)

# -- Visualize Few Samples --
# Show up to nine images from the first training batch in a 3x3 grid.
plt.figure(figsize=(10, 8))
for images, labels in train_tfds.take(1):
    # Cap at the batch length so a batch with fewer than 9 images does not
    # raise an out-of-range indexing error.
    n_shown = min(9, int(images.shape[0]))
    for i in range(n_shown):
        plt.subplot(3, 3, i+1)
        plt.imshow(images[i])
        plt.title(f"Class ID: {labels[i].numpy()}")
        plt.axis("off")
plt.show()

print("✅ Data Preprocessing Complete.")
