In [None]:
import os

def get_all_files_in_directory(directory_path):
    """Return the bare filename of every file found under ``directory_path``.

    The tree is traversed recursively with ``os.walk``. Only file names are
    collected (no directory component), so the same name can appear more than
    once when it exists in several sub-directories.
    """
    return [
        entry
        for _dirpath, _dirnames, entries in os.walk(directory_path)
        for entry in entries
    ]

In [1]:
import pandas as pd
# Load the product catalogue from the JSON file in the working directory.
products = pd.read_json('elaf_products_with_combinedSeq.json')

# Print a per-column summary (non-null counts and dtypes) of the loaded frame.
products.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16556 entries, 0 to 16555
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           16556 non-null  object
 1   categoryId   16556 non-null  object
 2   brandId      14444 non-null  object
 3   name         16556 non-null  object
 4   qty          11688 non-null  object
 5   uom          11521 non-null  object
 6   size         10954 non-null  object
 7   price        16530 non-null  object
 8   combinedSeq  16556 non-null  int64 
dtypes: int64(1), object(8)
memory usage: 1.1+ MB


In [17]:
import os
import time
import logging
import pandas as pd

# Configure the root logger: INFO level, lines formatted as "timestamp [LEVEL] message".
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(levelname)s] %(message)s'
)

# Wall-clock start; the total execution time logged at the end of this cell is measured from here.
start_time = time.time()

# Load products
products = pd.read_json('elaf_products_with_combinedSeq.json')
logging.info(f"Loaded {len(products)} products.")

# Collect image filenames (extension included) from a directory tree
def get_all_images_with_extension(directory_path):
    """Return the filenames of all image files found under ``directory_path``.

    The tree is walked recursively with ``os.walk``. A file counts as an image
    when its name, compared case-insensitively, ends in .png, .jpg, .jpeg, .gif
    or .bmp. Names are returned as found on disk (original case, extension kept,
    no directory component).
    """
    allowed_suffixes = ('.png', '.jpg', '.jpeg', '.gif', '.bmp')
    return [
        entry
        for _dirpath, _dirnames, entries in os.walk(directory_path)
        for entry in entries
        if entry.lower().endswith(allowed_suffixes)
    ]

# Gather every image filename under the photo directory (path is relative to the working directory).
images = get_all_images_with_extension("./Products-photos/all_images/")
logging.info(f"Found {len(images)} images.")

# Regular search function (keep extension)
def find_matching_images(name_dict, images):
    """Return the image filenames whose stem starts with one of the product's names.

    Args:
        name_dict: Mapping that may hold the product name under the keys
            ``'ar'`` and ``'en'``. Missing, ``None`` or empty names are skipped.
        images: Iterable of image filenames (with extension).

    Returns:
        A list of filenames from ``images`` (extension kept, original case).
        Matches for the ``'ar'`` name come first, then ``'en'``, each in the
        order of ``images``. The comparison is case-insensitive and is done
        against the filename without its extension. Every filename appears at
        most once, even when both names are a prefix of the same file.
    """
    # Lower-cased names to search for, in the fixed 'ar' -> 'en' priority order.
    needles = [
        name.lower()
        for name in (name_dict.get(key) for key in ('ar', 'en'))
        if name
    ]
    if not needles:
        return []

    # Compute each lower-cased stem once per call instead of once per language key.
    stems = [(os.path.splitext(img)[0].lower(), img) for img in images]

    matches = []
    seen = set()  # filenames already collected, so overlapping names do not duplicate them
    for needle in needles:
        for stem, img in stems:
            if img not in seen and stem.startswith(needle):
                seen.add(img)
                matches.append(img)  # keep extension
    return matches

# Build the product-id -> matched-image-filenames mapping
logging.info("Starting product-image matching (regular search with extension)...")
product_images = {}
# Walk the index, id and name columns in lockstep; each name is a dict like {"ar": ..., "en": ...}.
for idx, product_id, name_dict in zip(products.index, products['id'], products['name']):
    matched_images = find_matching_images(name_dict, images)
    # Only products with at least one matching image are recorded.
    if matched_images:
        product_images[product_id] = matched_images
    # Progress heartbeat every 1000 rows (row 0 is skipped).
    if idx > 0 and idx % 1000 == 0:
        logging.info(f"Processed {idx} products...")

# Save results
# One record per matched product. The columns are declared explicitly so the frame
# still has `product_id` and `images` when nothing matched; a column-less empty
# frame would make a later merge on `product_id` fail with a KeyError.
df_matches = pd.DataFrame(
    [{"product_id": pid, "images": imgs} for pid, imgs in product_images.items()],
    columns=["product_id", "images"],
)
# force_ascii=False keeps non-ASCII text (e.g. Arabic filenames) readable instead of \u-escaped.
df_matches.to_json("product_image_matches_with_extension.json", orient='records', force_ascii=False, indent=4)

logging.info(f"Matching completed for {len(df_matches)} products.")
logging.info(f"Total execution time: {time.time() - start_time:.2f} seconds")


2025-12-10 10:33:57,290 [INFO] Loaded 16556 products.
2025-12-10 10:33:57,318 [INFO] Found 13410 images.
2025-12-10 10:33:57,319 [INFO] Starting product-image matching (regular search with extension)...
2025-12-10 10:34:17,688 [INFO] Processed 1000 products...
2025-12-10 10:34:37,947 [INFO] Processed 2000 products...
2025-12-10 10:34:58,629 [INFO] Processed 3000 products...
2025-12-10 10:35:19,565 [INFO] Processed 4000 products...
2025-12-10 10:35:40,672 [INFO] Processed 5000 products...
2025-12-10 10:36:00,874 [INFO] Processed 6000 products...
2025-12-10 10:36:21,064 [INFO] Processed 7000 products...
2025-12-10 10:36:41,251 [INFO] Processed 8000 products...
2025-12-10 10:37:01,415 [INFO] Processed 9000 products...
2025-12-10 10:37:21,579 [INFO] Processed 10000 products...
2025-12-10 10:37:41,677 [INFO] Processed 11000 products...
2025-12-10 10:38:01,783 [INFO] Processed 12000 products...
2025-12-10 10:38:21,968 [INFO] Processed 13000 products...
2025-12-10 10:38:42,179 [INFO] Processe

In [18]:

# Attach each product's matched image list to the product table.
# Requires `products` and `df_matches` from the earlier cells.
combined_df = products.merge(
    df_matches,
    how='left',               # Keep all products; those without a match get a null `images`
    left_on='id',             # products' ID column
    right_on='product_id'     # df_matches' ID column
).drop(columns=['product_id'])  # `product_id` duplicates `id` after the merge

# Check results
# info() prints its own report and returns None, so it is called directly;
# wrapping it in print() would emit an extra "None" line.
combined_df.info()
print(combined_df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16556 entries, 0 to 16555
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           16556 non-null  object
 1   categoryId   16556 non-null  object
 2   brandId      14444 non-null  object
 3   name         16556 non-null  object
 4   qty          11688 non-null  object
 5   uom          11521 non-null  object
 6   size         10954 non-null  object
 7   price        16530 non-null  object
 8   combinedSeq  16556 non-null  int64 
 9   images       12943 non-null  object
dtypes: int64(1), object(9)
memory usage: 1.3+ MB
None
                                     id       categoryId brandId  \
0  42eb3284-11b3-47f0-b382-cc52318c7393  meat-appetizers    None   
1  5263c656-e95d-4cfa-a72a-4554ef7dc208  meat-appetizers    None   
2  d54d27ea-8b8c-40dc-beae-9503b7d3cff2  meat-appetizers    None   
3  d3532c6f-4651-442b-85b2-103372b94a5b  meat-appetizers    None   
4  8

In [19]:
# Write the combined product/image table as a UTF-8 JSON array of records.
# force_ascii=False keeps non-ASCII characters as-is rather than \u-escaping them.
with open('elaf_products_with_image_matches.json', 'w', encoding='utf-8') as f:
    combined_df.to_json(f, force_ascii=False, orient='records', indent=4)