In [12]:
import os
import pandas as pd

In [13]:
# Folder that find_image() searches for product photos. It is created when
# missing so the os.path checks further down never hit a non-existent directory.
IMG_DIR = "product_images"
os.makedirs(IMG_DIR, exist_ok=True)

# Input table for this notebook; every later cell adds columns to this frame.
df = pd.read_csv("preprocessed_foodfacts_phase2.csv")

In [14]:
# --- product_id and image lookup ---

def extract_code_from_url(url):
    """Extract the product code from the ``/product/<code>/`` segment of a URL.

    Parameters
    ----------
    url : Any
        Product page URL. Non-string values (e.g. NaN or None) are stringified
        first, so they simply fail the ``/product/`` check.

    Returns
    -------
    str or None
        The code with surrounding whitespace removed, or ``None`` when the URL
        has no ``/product/`` segment or that segment is empty.
    """
    _, marker, tail = str(url).partition("/product/")
    if not marker:
        return None

    # Keep only the path segment right after /product/. When the URL ends at
    # the code, a query string or fragment can be glued onto it, so cut those
    # off as well instead of letting them leak into the ID.
    segment = tail.split("/", 1)[0]
    for separator in ("?", "#"):
        segment = segment.split(separator, 1)[0]

    # An empty segment (".../product/") is not a usable ID: report it as
    # "no code" rather than returning an empty string.
    return segment.strip() or None

def make_product_id(row):
    """
    Build a string ID for one product row.

    Resolution order:
      1. The code embedded in ``row["url"]`` (preferred, because it matches
         the filenames in product_images/).
      2. The digits of ``row["barcode"]``.
      3. ``"idx_<row label>"`` built from ``row.name``.

    Parameters
    ----------
    row : pandas.Series
        One DataFrame row, as passed by ``df.apply(..., axis=1)``. Missing
        ``url`` / ``barcode`` entries are tolerated.

    Returns
    -------
    str
        The product ID; never empty.
    """
    # Prefer URL-based code
    code_from_url = extract_code_from_url(row.get("url", ""))
    if code_from_url:
        return code_from_url

    # Fallback: cleaned barcode.
    raw_barcode = row.get("barcode", "")
    if isinstance(raw_barcode, float) and raw_barcode.is_integer():
        # A numeric column yields floats such as 8009320000000.0. Going
        # through int() drops the ".0" (and any exponent notation) instead of
        # letting the fractional zero be appended as an extra digit below.
        # NaN and inf are not integers and fall through to the string path.
        bc = str(int(raw_barcode))
    else:
        bc = str(raw_barcode).strip()
        # Same clean-up for barcodes that arrive as text like "123.0".
        whole, dot, fraction = bc.partition(".")
        if dot and whole.strip().isdigit() and fraction.strip("0") == "":
            bc = whole.strip()

    if bc and bc.lower() not in ("nan", "none"):
        bc_digits = "".join(ch for ch in bc if ch.isdigit())
        if bc_digits:
            return bc_digits

    # Last fallback: index label of the row
    return f"idx_{row.name}"

# axis=1 applies make_product_id once per row, so it can read several columns
# (url, barcode) of the same product and fall back to the row's index label.
df["product_id"] = df.apply(make_product_id, axis=1)

In [15]:
def find_image(row):
    """
    Locate an image file in IMG_DIR for this row.

    Candidate filenames, tried in this order:
      - product_id.jpg / product_id.png
      - code-from-URL.jpg / .png (only when it differs from product_id)
      - basename of main_image_url (query string removed)

    Parameters
    ----------
    row : pandas.Series
        One DataFrame row; the ``product_id``, ``url`` and ``main_image_url``
        entries are read and may be missing.

    Returns
    -------
    str or None
        ``os.path.join(IMG_DIR, <filename>)`` for the first candidate that is
        an existing regular file, otherwise ``None``.
    """
    candidates = []

    # product_id-based filenames
    pid = str(row.get("product_id", "")).strip()
    if pid:
        candidates.append(pid + ".jpg")
        candidates.append(pid + ".png")

    # URL-based code (extra safety, though usually == product_id)
    url_code = extract_code_from_url(row.get("url", ""))
    if url_code and url_code != pid:
        candidates.append(url_code + ".jpg")
        candidates.append(url_code + ".png")

    # filename from original main_image_url (unlikely to match, but cheap to try)
    img_url = row.get("main_image_url", "")
    if isinstance(img_url, str) and img_url.strip():
        base = os.path.basename(img_url.split("?")[0])
        if base:
            candidates.append(base)

    # dict.fromkeys drops duplicate names while keeping first-seen order.
    for fname in dict.fromkeys(candidates):
        path = os.path.join(IMG_DIR, fname)
        # isfile rather than exists: a basename such as ".." or the name of a
        # sub-folder exists on disk but is not an image file.
        if os.path.isfile(path):
            return path

    return None

In [16]:
# One lookup per row; find_image returns None when no candidate file exists.
df["image_path"] = df.apply(find_image, axis=1)
# Integer flag derived from image_path: 1 where a path was found, else 0.
df["has_image"] = df["image_path"].notna().astype(int)

In [17]:
# Spot-check the new product_id / image_path / has_image columns on the first rows.
df.head()

Unnamed: 0,url,product_name,barcode,brand,quantity,serving_size,nutriscore_letter,nova_group,ingredients_text,allergens,...,log_sugars_100g,log_salt_100g,brand_cleaned,allergens_cleaned,ingredients_text_cleaned,countries_cleaned,additives_cleaned,product_id,image_path,has_image
0,https://world.openfoodfacts.org/product/800931...,Mais per Pop Corn – La Casetta di Campagna – 4...,8009320000000.0,La Casetta di Campagna,400 g,,1.0,4.0,Italian: Mais,,...,1.223775,0.019803,la casetta di campagna,,mais,italy,,8009318300675,product_images\8009318300675.jpg,1
1,https://world.openfoodfacts.org/product/004122...,Raw almonds – H-E-B Organics – 8 oz,41220020000.0,H-E-B Organics,8 oz,,1.0,4.0,,,...,1.46626,0.0,h-e-b organics,,,united states,,41220017751,product_images\0041220017751.jpg,1
2,https://world.openfoodfacts.org/product/406644...,Linsenwaffeln gesalzen – DmBio – 100g,4066450000000.0,DmBio,100g,,1.0,3.0,"German: 99,5% Linsenmehl*, 0,5 % Meersalz. aus...",,...,0.955511,0.405465,dmbio,,"99, 5% linsenmehl*, 0, 5 % meersalz. aus biolo...",germany,,4066447256581,product_images\4066447256581.jpg,1
3,https://world.openfoodfacts.org/product/007681...,Whole Almonds – Diamond of california – 10 oz,76811340000.0,Diamond of california,10 oz,1/4 cup (30 g) (30 GRM),1.0,4.0,Almonds,Nuts,...,1.465568,0.0,diamond of california,tree_nuts,almonds,united states,,76811342127,product_images\0076811342127.jpg,1
4,https://world.openfoodfacts.org/product/003068...,Whole Raw Almonds – Tree Of Life Inc.,30684010000.0,Tree Of Life Inc.,,1 ONZ (28 g),1.0,4.0,Organic whole raw almonds,Nuts,...,1.519513,0.0,tree of life,tree_nuts,organic whole raw almonds,united states,,30684009013,,0


In [18]:
from PIL import Image

In [19]:
# Side length, in pixels, of the square canvases written by resize_with_padding.
TARGET_SIZE = 160
# Output folder for the resized copies; created up front so saving never fails
# on a missing directory.
RESIZED_DIR = "images_160"
os.makedirs(RESIZED_DIR, exist_ok=True)

def resize_with_padding(path_in):
    """Letterbox one image onto a square canvas and save it in RESIZED_DIR.

    The image is scaled so its longer side equals TARGET_SIZE (aspect ratio
    kept), pasted centred on a black TARGET_SIZE x TARGET_SIZE RGB canvas and
    written as JPEG under the same basename as the input.

    Parameters
    ----------
    path_in : str or Any
        Path of the source image. Non-string values (e.g. None / NaN for rows
        without an image) and paths that do not exist are skipped.

    Returns
    -------
    str or None
        Path of the resized file, or ``None`` when there is no input image or
        processing failed (a RuntimeWarning is emitted in the failure case).

    Notes
    -----
    An already existing output file is reused without being regenerated.
    NOTE(review): the output keeps the input's filename, so a ``.png`` input
    produces JPEG data under a ``.png`` name - confirm downstream readers do
    not rely on the extension.
    """
    if not isinstance(path_in, str) or not os.path.exists(path_in):
        return None

    filename = os.path.basename(path_in)
    path_out = os.path.join(RESIZED_DIR, filename)

    # Cache: a previous run already produced this file.
    if os.path.exists(path_out):
        return path_out

    try:
        # The context manager closes the source file handle; convert() loads
        # the pixel data and returns an independent image, so `img` stays
        # usable after the block.
        with Image.open(path_in) as src:
            img = src.convert("RGB")
        w, h = img.size

        # scale to fit within TARGET_SIZE while keeping aspect ratio.
        # round() instead of int(): float error can leave the long side a hair
        # below TARGET_SIZE, which truncation would turn into TARGET_SIZE - 1.
        # The clamp to >= 1 keeps very elongated images from collapsing to a
        # 0-pixel side, which resize() rejects.
        scale = TARGET_SIZE / max(w, h)
        new_w = min(TARGET_SIZE, max(1, round(w * scale)))
        new_h = min(TARGET_SIZE, max(1, round(h * scale)))
        img = img.resize((new_w, new_h), Image.Resampling.BILINEAR)

        # create square canvas and paste centered
        background = Image.new("RGB", (TARGET_SIZE, TARGET_SIZE), (0, 0, 0))  # or (255,255,255)
        offset_x = (TARGET_SIZE - new_w) // 2
        offset_y = (TARGET_SIZE - new_h) // 2
        background.paste(img, (offset_x, offset_y))

        background.save(path_out, format="JPEG")
        return path_out
    except Exception as exc:
        import warnings

        # Stay best-effort (one bad image must not abort the whole apply), but
        # say which file failed instead of hiding it.
        warnings.warn(f"Could not resize {path_in!r}: {exc!r}", RuntimeWarning)

        # path_out did not exist before this call (checked above), so anything
        # there now is a partial write; remove it so the cache check does not
        # mistake it for a finished image on the next run.
        try:
            if os.path.exists(path_out):
                os.remove(path_out)
        except OSError:
            pass
        return None

In [20]:
# Resize every located image; rows without an image_path, or whose image could
# not be processed, get None.
df["image_160_path"] = df["image_path"].apply(resize_with_padding)
# Integer flag derived from image_160_path: 1 where a resized file is available, else 0.
df["has_image_160"] = df["image_160_path"].notna().astype(int)

In [21]:
# (rows, columns) after adding the ID and image columns - a quick check before export.
df.shape

(3185, 57)

In [22]:
# index=False keeps the DataFrame index out of the exported file.
# NOTE(review): product_id can carry leading zeros (URL codes such as
# 0041220017751); read_csv infers numeric dtypes by default, so consumers of
# this file probably need dtype={"product_id": str} - confirm downstream.
df.to_csv("preprocessedPhase3FoodFacts.csv", index=False)