In [1]:
import os
import pandas as pd

In [4]:
# --- configuration: input/output locations ---
# Folder that holds the product images. It is created up front (a no-op when
# it already exists) so that later code can probe it for files.
IMG_DIR = "product_images"
os.makedirs(IMG_DIR, exist_ok=True)

# Input table, named here so the dataset can be swapped in one place.
DATA_CSV = "preprocessed_foodfacts_phase2.csv"
df = pd.read_csv(DATA_CSV)

In [5]:
# --- product_id and image lookup ---

def extract_code_from_url(url):
    """
    Extract the product code from the /product/<code>/ segment of a URL.

    Args:
        url: Product page URL. Any value is accepted; it is converted with
            str() first, so missing values (None, NaN) simply fail to match.

    Returns:
        The code as a string with surrounding whitespace removed, or None
        when the URL has no "/product/" segment or that segment is empty.
    """
    url = str(url)
    if "/product/" not in url:
        return None

    # Take the path segment right after the first "/product/".
    segment = url.split("/product/", 1)[1].split("/", 1)[0]

    # A URL that ends at the code may carry a query string or fragment
    # ("/product/123?x=1"); drop it so the code stays usable as a filename.
    for sep in ("?", "#"):
        segment = segment.split(sep, 1)[0]

    code = segment.strip()
    # Callers test the result for truthiness, so an empty code maps to None.
    return code or None

def make_product_id(row):
    """
    Build a product identifier for one DataFrame row.

    Resolution order:
      1. the code embedded in the row's "url" (it matches the filenames in
         product_images/),
      2. the digits of the row's "barcode",
      3. "idx_<row label>" as a last resort.

    Args:
        row: A row as passed by DataFrame.apply(..., axis=1); only .get()
            and .name are used.

    Returns:
        The identifier as a string.
    """
    # Prefer URL-based code
    code_from_url = extract_code_from_url(row.get("url", ""))
    if code_from_url:
        return code_from_url

    # Fallback: cleaned barcode (digits only).
    raw_barcode = row.get("barcode", "")
    # If the barcode was parsed as a float (pandas does this for numeric
    # columns containing missing values), 123.0 must become "123" *before*
    # the digit filter; otherwise the filter keeps the trailing zero and
    # yields "1230". is_integer() is False for NaN/inf, so those pass
    # through unchanged and are rejected further down.
    if isinstance(raw_barcode, float) and raw_barcode.is_integer():
        raw_barcode = int(raw_barcode)
    bc = str(raw_barcode).strip()

    # Same artifact when the value arrives as text, e.g. "123.0".
    whole, dot, fraction = bc.partition(".")
    if dot and whole.isdigit() and not fraction.strip("0"):
        bc = whole

    if bc and bc.lower() not in ["nan", "none"]:
        bc_digits = "".join(ch for ch in bc if ch.isdigit())
        if bc_digits:
            return bc_digits

    # Last fallback: index
    return f"idx_{row.name}"

# Attach an identifier to every row (one make_product_id call per row).
df["product_id"] = df.apply(make_product_id, axis="columns")

In [6]:
def find_image(row):
    """
    Return the path of this row's image inside IMG_DIR, or None if absent.

    Filenames are probed in this order; the first one that exists wins:
      1. <product_id>.jpg, <product_id>.png
      2. <code-from-URL>.jpg / .png, when that code differs from product_id
      3. the basename of main_image_url (query string removed)
    """
    extensions = (".jpg", ".png")

    # 1. names derived from product_id
    pid = str(row.get("product_id", "")).strip()
    names = [pid + ext for ext in extensions] if pid else []

    # 2. names derived from the URL code, only if it adds something new
    url_code = extract_code_from_url(row.get("url", ""))
    if url_code and url_code != pid:
        names.extend(url_code + ext for ext in extensions)

    # 3. original filename of main_image_url (rarely matches, cheap to try)
    img_url = row.get("main_image_url", "")
    if isinstance(img_url, str) and img_url.strip():
        base = os.path.basename(img_url.split("?")[0])
        if base:
            names.append(base)

    # dict.fromkeys drops repeated names while keeping first-seen order.
    for fname in dict.fromkeys(names):
        path = os.path.join(IMG_DIR, fname)
        if os.path.exists(path):
            return path

    return None

In [7]:
# Resolve each row to a local image file (None when nothing is found) ...
df["image_path"] = df.apply(find_image, axis="columns")
# ... and expose availability as a 0/1 flag.
df["has_image"] = (~df["image_path"].isna()).astype(int)