In [123]:
import polars as pl
import io
import numpy as np
from PIL import Image
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer

# Load the Scuccorese food ingredients data set and reduce it to two columns:
# the ingredient label and the encoded image bytes.
df = (
    pl.read_parquet('hf://datasets/Scuccorese/food-ingredients-dataset/data/train-*.parquet')
    # Discard the 'category' and 'subcategory' columns.
    .drop('category', 'subcategory')
    # Unnest the "image" struct so that its "bytes" field becomes a column,
    # then keep that field under the name "image" next to the label.
    .unnest("image")
    .select("ingredient", pl.col("bytes").alias("image"))
)

In [129]:
# The dataset contains images in four different formats: {'JPEG', 'GIF', 'PNG', 'WEBP'}.
# To simplify model building and training, every image is re-encoded to WebP so that the
# rest of the pipeline deals with a single format. WebP typically produces smaller files than
# JPEG and PNG at comparable visual quality; re-encoding cannot improve on the source quality.

def convert_to_webp(image_bytes):
    """Re-encode an encoded image (JPEG, GIF, PNG, WebP, ...) as WebP bytes.

    Images that carry transparency are saved as RGBA so the alpha channel
    survives the conversion; every other image is saved as RGB.

    Args:
        image_bytes: The encoded source image as a bytes object.

    Returns:
        bytes: The image re-encoded in WebP format.

    Raises:
        PIL.UnidentifiedImageError: If the bytes cannot be identified as an image.
    """
    with Image.open(io.BytesIO(image_bytes)) as im:
        # Transparency is stored either as a dedicated alpha band ...
        alpha_band = im.mode in ("RGBA", "LA", "PA")
        # ... or as a "transparency" entry in the image info (palette index
        # or colour key), which convert("RGBA") turns into a real alpha band.
        keyed_transparency = (
            im.mode in ("1", "L", "P", "RGB") and "transparency" in im.info
        )
        target_mode = "RGBA" if (alpha_band or keyed_transparency) else "RGB"

        # Normalise every other mode (palette, greyscale, CMYK, ...) to the
        # chosen target; images already in that mode are saved untouched.
        if im.mode != target_mode:
            im = im.convert(target_mode)

        # Save the image to WebP format in memory
        output = io.BytesIO()
        im.save(output, format='WEBP')
        return output.getvalue()

# Re-encode every entry of the image column as WebP, overwriting the column.
df = df.with_columns(
    image=pl.col("image").map_elements(convert_to_webp, return_dtype=pl.Binary),
)

In [135]:
# Function to decode encoded image bytes (here: WebP) into NumPy arrays.
def decode_image(image_bytes, size=(128, 128)):
    """Decode an encoded image into a normalised RGB NumPy array.

    Args:
        image_bytes: The encoded image as a bytes object.
        size: Target ``(width, height)`` passed to ``Image.resize``.
            Defaults to ``(128, 128)``.

    Returns:
        numpy.ndarray: Float array of shape ``(height, width, 3)`` whose
        values are the 8-bit channel values divided by 255, i.e. in [0, 1].

    Raises:
        PIL.UnidentifiedImageError: If the bytes cannot be identified as an image.
    """
    # The context manager releases the decoder as soon as the pixels have
    # been extracted, mirroring how convert_to_webp opens its input.
    with Image.open(io.BytesIO(image_bytes)) as im:
        # Ensure 3 channels.
        # NOTE(review): convert('RGB') drops an alpha channel without
        # compositing, so transparent regions keep whatever RGB values are
        # stored beneath them - confirm this is acceptable for training.
        rgb = im.convert('RGB')
        resized = rgb.resize(size)  # Resize to a standard size
        return np.array(resized) / 255.0  # Normalize to [0, 1]

# Swap each encoded image in the image column for its decoded NumPy array.
df = df.with_columns(
    image=pl.col("image").map_elements(decode_image, return_dtype=pl.Object),
)

In [137]:
# Text preview of the frame: ingredient labels next to the decoded image arrays.
print(df)

shape: (6_676, 2)
┌───────────────┬─────────────────────────────────┐
│ ingredient    ┆ image                           │
│ ---           ┆ ---                             │
│ str           ┆ object                          │
╞═══════════════╪═════════════════════════════════╡
│ spinach       ┆ [[[0.71764706 0.82352941 0.654… │
│ spinach       ┆ [[[0.9372549  0.85098039 0.725… │
│ spinach       ┆ [[[0.99607843 0.99607843 0.996… │
│ spinach       ┆ [[[0.29803922 0.25098039 0.243… │
│ spinach       ┆ [[[0.73333333 0.75294118 0.647… │
│ …             ┆ …                               │
│ pickling salt ┆ [[[1. 1. 1.]                    │
│               ┆   [1. 1. 1.]                    │
│               ┆   [1…                           │
│ pickling salt ┆ [[[1. 1. 1.]                    │
│               ┆   [1. 1. 1.]                    │
│               ┆   [1…                           │
│ pickling salt ┆ [[[1. 1. 1.]                    │
│               ┆   [1. 1. 1.]                