In [151]:
import polars as pl
import io
import numpy as np
from PIL import Image
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer

# Scuccorese food ingredients data set 
df = pl.read_parquet('hf://datasets/Scuccorese/food-ingredients-dataset/data/train-*.parquet')

# Preprocessing Data Frame
df = df.drop('category', 'subcategory')
# Unnesting the image binary and ingredient
df = df.unnest("image").select(pl.col("ingredient"), pl.col("bytes").alias("image"))

In [152]:
# Dataset contains images in these unique image formats: {'JPEG', 'GIF', 'PNG', 'WEBP'}.
# For simplifying model building and improving training all file formats will be converted
# to webp. Using webp is preferred because webp images have smaller file sizes while retaining
# or improving image quality compared to JPEG and PNG.

def convert_to_webp(image_bytes):
    with Image.open(io.BytesIO(image_bytes)) as im:
        # Check if the image has transparency
        if im.mode == "P":  # Palette-based (e.g., GIFs)
            im = im.convert("RGBA")
        elif im.mode != "RGB":  # For other non-RGB formats
            im = im.convert("RGB")
        
        # Save the image to WebP format in memory
        output = io.BytesIO()
        im.save(output, format='WEBP')
        return output.getvalue()

# Mapping over the image col to convert all images to a webp format.
df = df.with_columns(
    pl.col("image").map_elements(convert_to_webp, return_dtype=pl.Binary)
)

In [153]:
# Function to decode webp into nparrays.
def decode_image(image_bytes):
    img = Image.open(io.BytesIO(image_bytes)).convert('RGB')  # Ensure 3 channels
    img = img.resize((128, 128))  # Resize to a standard size
    img_array = np.array(img) / 255.0  # Normalize to [0, 1]
    return img_array

# Mapping over the image col to convert all images binary to a nparrays.
df = df.with_columns(
    pl.col("image").map_elements(decode_image, return_dtype=pl.Object)
)

In [171]:
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Extract features (images) and labels (ingredients)
images = df["image"].to_list()  # Convert to a numpy array
labels = df["ingredient"].to_list()

# Encode the labels (ingredients) to integers
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

# One-hot encode the labels
one_hot_encoder = OneHotEncoder(sparse_output=False)
one_hot_labels = one_hot_encoder.fit_transform(encoded_labels.reshape(-1, 1))

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(images, one_hot_labels, test_size=0.2, random_state=42)

# Convert to TensorFlow datasets
train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train)).batch(32).shuffle(buffer_size=1000)
test_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(32)

# Check the label mapping for reference
label_mapping = dict(zip(label_encoder.classes_, range(len(label_encoder.classes_))))
#print("Label Mapping:", label_mapping)

# Example TensorFlow CNN model
model = tf.keras.Sequential([
    tf.keras.layers.Conv2D(32, (3, 3), activation='relu', input_shape=(128, 128, 3)),
    tf.keras.layers.MaxPooling2D((2, 2)),
    tf.keras.layers.Conv2D(64, (3, 3), activation='relu'),
    tf.keras.layers.MaxPooling2D((2, 2)),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(len(label_mapping), activation='softmax')
])

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Train the model
model.fit(train_dataset, validation_data=test_dataset, epochs=10)

Label Mapping: {'adzuki beans': 0, 'all-purpose flour': 1, 'allspice': 2, 'almond flour': 3, 'amaranth': 4, 'apricot': 5, 'arugula': 6, 'asparagus': 7, 'avocado': 8, 'avocado oil': 9, 'bamboo shoots': 10, 'banana': 11, 'barley': 12, 'beef': 13, 'beet': 14, 'beluga lentils': 15, 'bergamot': 16, 'bison': 17, 'black beans': 18, 'black cherry': 19, 'black lentils': 20, 'black olives': 21, 'black salt': 22, 'black sapote': 23, 'black-eyed peas': 24, 'blackberry': 25, 'blueberry': 26, 'bok choy': 27, 'boysenberry': 28, 'bread flour': 29, 'breadfruit': 30, 'broccoli': 31, 'broccoli stem': 32, 'brown lentils': 33, 'brown sugar': 34, 'brussels sprouts': 35, 'buckwheat': 36, 'buffalo': 37, 'bulgur': 38, 'cabbage': 39, 'cake flour': 40, 'cane sugar': 41, 'canned anchovies': 42, 'canned apples': 43, 'canned apricots': 44, 'canned artichoke hearts': 45, 'canned baked beans': 46, 'canned beets': 47, 'canned black beans': 48, 'canned cannellini beans': 49, 'canned carrots': 50, 'canned cherries': 51,

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m167/167[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 322ms/step - accuracy: 0.0075 - loss: 5.8789 - val_accuracy: 0.0105 - val_loss: 5.7085
Epoch 2/10
[1m167/167[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 327ms/step - accuracy: 0.0294 - loss: 5.5212 - val_accuracy: 0.0599 - val_loss: 5.1312
Epoch 3/10
[1m167/167[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 326ms/step - accuracy: 0.1717 - loss: 4.2403 - val_accuracy: 0.1272 - val_loss: 4.7775
Epoch 4/10
[1m167/167[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 330ms/step - accuracy: 0.5107 - loss: 2.2786 - val_accuracy: 0.2096 - val_loss: 5.0596
Epoch 5/10
[1m167/167[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 330ms/step - accuracy: 0.8013 - loss: 0.9474 - val_accuracy: 0.2268 - val_loss: 5.7599
Epoch 6/10
[1m167/167[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 328ms/step - accuracy: 0.9284 - loss: 0.4229 - val_accuracy: 0.2440 - val_loss: 6.2473
Epoch 7/10
[1m167/16

<keras.src.callbacks.history.History at 0x31b0b1a30>

In [173]:
test_loss, test_accuracy = model.evaluate(test_dataset)

print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")

[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 89ms/step - accuracy: 0.2494 - loss: 6.0404
Test Loss: 6.021127223968506
Test Accuracy: 0.2522455155849457
