In [2]:
# Import necessary libraries
import polars as pl
import io
import numpy as np
from PIL import Image
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import tensorflow as tf

# Read every training shard of the food-ingredients dataset from the Hugging Face hub
DATASET_URI = 'hf://datasets/Scuccorese/food-ingredients-dataset/data/train-*.parquet'
df = pl.read_parquet(DATASET_URI)



In [3]:
# Reduce the frame to the two columns used downstream:
#   - "subcategory": the label column
#   - "image": the "bytes" field pulled out of the "image" struct
df = (
    df.drop('category', 'ingredient')  # columns that are not used as labels
    .unnest("image")                   # expose the struct fields as top-level columns
    .select(pl.col("subcategory"), image=pl.col("bytes"))
)

# Re-encode a single image payload as WEBP
def convert_to_webp(image_bytes):
    """Return the WEBP encoding of the image contained in ``image_bytes``.

    Colour-mode handling before encoding:
      * palette images (mode "P") are expanded to "RGBA";
      * images already in "RGB" are saved unchanged;
      * every other mode is converted to "RGB".

    Args:
        image_bytes: encoded image data readable by ``PIL.Image.open``.

    Returns:
        bytes: the image encoded as WEBP.
    """
    encoded = io.BytesIO()
    with Image.open(io.BytesIO(image_bytes)) as source:
        # Decide which colour mode (if any) the image must be converted to
        if source.mode == "P":
            target_mode = "RGBA"
        elif source.mode != "RGB":
            target_mode = "RGB"
        else:
            target_mode = None

        frame = source if target_mode is None else source.convert(target_mode)

        # Encode into the in-memory buffer
        frame.save(encoded, format='WEBP')
    return encoded.getvalue()

# Replace the "image" column with its WEBP re-encoding (one Python call per row)
webp_image = pl.col("image").map_elements(convert_to_webp, return_dtype=pl.Binary)
df = df.with_columns(webp_image)



In [4]:
# Function to decode image bytes into NumPy arrays
def decode_image(image_bytes, target_size=(128, 128)):
    """Decode an encoded image into a normalised float32 RGB array.

    Args:
        image_bytes: encoded image data readable by ``PIL.Image.open``.
        target_size: ``(width, height)`` passed to ``Image.resize``. Defaults to
            ``(128, 128)``, the size the rest of this notebook is built around,
            so existing one-argument calls behave exactly as before.

    Returns:
        numpy.ndarray: float32 array of shape ``(height, width, 3)`` with pixel
        values scaled to the range [0, 1].
    """
    with Image.open(io.BytesIO(image_bytes)) as img:
        # Force three colour channels, then bring every image to a common size
        rgb = img.convert('RGB').resize(target_size)
        # Scale 8-bit pixel values into [0, 1]
        return np.asarray(rgb, dtype=np.float32) / 255.0

# Turn every stored image payload into a normalised float32 array
images = list(map(decode_image, df["image"].to_list()))

# Raw string labels, one per image
labels = df["subcategory"].to_list()

# Step 1: map each label string to an integer id
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

# Step 2: expand the integer ids into dense one-hot rows
one_hot_encoder = OneHotEncoder(sparse_output=False)
label_id_column = encoded_labels.reshape(-1, 1)  # the encoder expects a 2-D column
one_hot_labels = one_hot_encoder.fit_transform(label_id_column)

# Hold out 20% of the samples for testing (fixed seed so the split is repeatable)
split_parts = train_test_split(images, one_hot_labels, test_size=0.2, random_state=42)

# Materialise each part of the split as a float32 NumPy array
X_train, X_test, y_train, y_test = (
    np.array(part, dtype=np.float32) for part in split_parts
)

# Report the resulting shapes
print("X_train shape:", X_train.shape)  # Should be (number of samples, 128, 128, 3)
print("y_train shape:", y_train.shape)  # Should be (number of samples, number of classes)


X_train shape: (5340, 128, 128, 3)
y_train shape: (5340, 28)


In [13]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# X_train / X_test / y_train / y_test are the float32 arrays produced by the
# split in the previous cell. The split is deliberately NOT repeated here:
# re-running train_test_split on `images` would rebind X_train / X_test to
# plain Python lists and throw away the array conversion.

# NOTE(review): `datagen` is configured but nothing below consumes it; the only
# augmentation actually applied to the training data is the random horizontal
# flip in the tf.data pipeline. It is kept so that any later cell referring to
# `datagen` keeps working.
datagen = ImageDataGenerator(
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest'
)

# Training pipeline. Order matters:
#   shuffle -> individual samples are reshuffled every epoch
#   map     -> random horizontal flip per sample
#   batch   -> batches are formed from the shuffled samples
# Shuffling *after* batching would only reorder fixed batches, so every epoch
# would see the same samples grouped together.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((X_train, y_train))
    .shuffle(buffer_size=1000)
    .map(lambda x, y: (tf.image.random_flip_left_right(x), y))
    .batch(32)
    .prefetch(tf.data.AUTOTUNE)  # overlap input preparation with training
)

# Evaluation pipeline: no shuffling or augmentation
test_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(32)

# Class name -> integer id, in the order learned by the label encoder
label_mapping = {
    class_name: class_id
    for class_id, class_name in enumerate(label_encoder.classes_)
}
print("Label Mapping:", label_mapping)

# Define the CNN model
# The width of the output layer is taken from the fitted label encoder rather
# than hard-coded, so the model stays in sync with the labels.
num_classes = len(label_encoder.classes_)

model = tf.keras.Sequential([
    # Explicit input spec: 128x128 RGB images (passing input_shape= to the
    # first layer is deprecated in recent Keras releases)
    tf.keras.Input(shape=(128, 128, 3)),
    # Three conv/pool stages with a growing number of filters
    tf.keras.layers.Conv2D(32, (3, 3), activation='relu'),
    tf.keras.layers.MaxPooling2D((2, 2)),
    tf.keras.layers.Conv2D(64, (3, 3), activation='relu'),
    tf.keras.layers.MaxPooling2D((2, 2)),
    tf.keras.layers.Conv2D(128, (3, 3), activation='relu'),
    tf.keras.layers.MaxPooling2D((2, 2)),
    # Classifier head
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.5),  # Add dropout for regularization
    tf.keras.layers.Dense(num_classes, activation='softmax')  # one unit per class
])

from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam

# Halve the learning rate whenever val_loss has not improved for 3 epochs
lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, verbose=1)

# Adam optimizer with an explicit starting learning rate of 0.001
optimizer = Adam(learning_rate=0.001)

# Compile for multi-class classification on one-hot targets
model.compile(
    optimizer=optimizer,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)



Label Mapping: {np.str_('allium'): 0, np.str_('avocado_like'): 1, np.str_('beans'): 2, np.str_('berries'): 3, np.str_('citrus'): 4, np.str_('cruciferous'): 5, np.str_('flours'): 6, np.str_('fruits'): 7, np.str_('game'): 8, np.str_('leafy'): 9, np.str_('lentils'): 10, np.str_('peas'): 11, np.str_('poultry'): 12, np.str_('pseudocereals'): 13, np.str_('red_meat'): 14, np.str_('refined_grains'): 15, np.str_('root'): 16, np.str_('salt'): 17, np.str_('seafood'): 18, np.str_('spices'): 19, np.str_('sprouted'): 20, np.str_('stem'): 21, np.str_('stone_fruits'): 22, np.str_('sugars'): 23, np.str_('tropical'): 24, np.str_('types'): 25, np.str_('vegetables'): 26, np.str_('whole_grains'): 27}


In [14]:
# Fit the baseline CNN for 30 epochs with the learning-rate scheduler attached.
# NOTE(review): test_dataset doubles as the validation set here, so the
# learning-rate schedule is steered by the same data that is used for the final
# evaluation below; the reported test metrics are not from untouched data.
history = model.fit(
    train_dataset,
    epochs=30,
    validation_data=test_dataset,
    callbacks=[lr_scheduler],
)

# Final metrics on the held-out split
test_loss, test_accuracy = model.evaluate(test_dataset)
print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")

# Persist the trained baseline model to disk
model.save("subcategory_classification_model.h5")

Epoch 1/30
[1m167/167[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m116s[0m 298ms/step - accuracy: 0.0894 - loss: 3.3215 - val_accuracy: 0.1542 - val_loss: 3.0529 - learning_rate: 0.0010
Epoch 2/30
[1m167/167[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 264ms/step - accuracy: 0.1435 - loss: 3.0312 - val_accuracy: 0.1789 - val_loss: 2.8361 - learning_rate: 0.0010
Epoch 3/30
[1m167/167[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 261ms/step - accuracy: 0.1705 - loss: 2.8757 - val_accuracy: 0.2021 - val_loss: 2.7459 - learning_rate: 0.0010
Epoch 4/30
[1m167/167[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 265ms/step - accuracy: 0.1927 - loss: 2.7497 - val_accuracy: 0.2193 - val_loss: 2.6685 - learning_rate: 0.0010
Epoch 5/30
[1m167/167[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 272ms/step - accuracy: 0.2100 - loss: 2.6482 - val_accuracy: 0.2485 - val_loss: 2.5443 - learning_rate: 0.0010
Epoch 6/30
[1m167/167[0m [32m━━━━━━━━━━━━━━━━━━━━[



Test Loss: 2.7984189987182617
Test Accuracy: 0.3697604835033417


In [None]:

# Import the necessary libraries
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Dense, Dropout, GlobalAveragePooling2D, Input, Rescaling
from tensorflow.keras.optimizers import Adam

# Load the pre-trained EfficientNetB0 backbone (ImageNet weights, no classifier head)
base_model = EfficientNetB0(weights='imagenet', include_top=False, input_shape=(128, 128, 3))

# Freeze the backbone so only the new head is trained in the first phase
base_model.trainable = False

# Output width follows the fitted label encoder instead of a hard-coded 28
num_classes = len(label_encoder.classes_)

# Add custom layers on top
model = Sequential([
    Input(shape=(128, 128, 3)),
    # The images in train_dataset / test_dataset were scaled to [0, 1] when they
    # were decoded, but Keras' EfficientNet models include their own input
    # preprocessing and expect raw pixel values in [0, 255]. Undo the earlier
    # scaling here so the pre-trained weights see the value range they were
    # trained on.
    Rescaling(255.0),
    base_model,
    GlobalAveragePooling2D(),
    Dropout(0.5),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(num_classes, activation='softmax')  # one unit per class
])

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001),
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()


In [None]:

# Phase 1: train only the new classification head; the backbone stays frozen
epochs = 15  # Initial training with frozen layers
history = model.fit(train_dataset, validation_data=test_dataset, epochs=epochs, callbacks=[lr_scheduler])

# Phase 2: fine-tune the backbone.
# Unfreeze the backbone, but keep its BatchNormalization layers frozen: with
# trainable=False they keep running in inference mode, so their pre-trained
# moving statistics are not overwritten while fine-tuning.
base_model.trainable = True
for layer in base_model.layers:
    if isinstance(layer, tf.keras.layers.BatchNormalization):
        layer.trainable = False

# Re-compile so the changed trainable flags take effect
model.compile(optimizer=Adam(learning_rate=0.0001),  # Lower learning rate for fine-tuning
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Fine-tune the model
fine_tune_epochs = 10
# history.epoch holds zero-based epoch indices, so the number of completed
# epochs is len(history.epoch). Using history.epoch[-1] here would restart at
# the last finished epoch and run one epoch more than fine_tune_epochs.
initial_epoch = len(history.epoch)
total_epochs = initial_epoch + fine_tune_epochs

history_fine = model.fit(train_dataset, validation_data=test_dataset, initial_epoch=initial_epoch, epochs=total_epochs, callbacks=[lr_scheduler])


In [None]:

# Score the fine-tuned model on the held-out split
evaluation = model.evaluate(test_dataset)
test_loss, test_accuracy = evaluation  # [loss, accuracy], matching the compiled metrics
print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")

# Persist the fine-tuned model to disk
model.save("subcategory_classification_improved_model.h5")
