In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time

# Configure a headless Chrome session used by the scraper below.
options = webdriver.ChromeOptions()
for chrome_flag in (
    "--headless",  # no visible browser window
    "--disable-gpu",
    "--window-size=1920,1080",
):
    options.add_argument(chrome_flag)

# The Service is built from whatever ChromeDriverManager().install() returns.
chrome_service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=chrome_service, options=options)

# Category page to scrape (example: Chairs)
IKEA_URL = "https://www.ikea.com/us/en/cat/chairs-fu002/"

def scrape_ikea_products():
    """Scrape the IKEA_URL category page and save the products to a CSV file.

    Uses the module-level ``driver``. The page is loaded and scrolled until
    ``document.body.scrollHeight`` stops changing, then every element matching
    ``[data-ref-id]`` is read for a name, price, image URL and link. Elements
    for which any of those lookups fails are reported and skipped.

    Side effects:
        * Quits the module-level ``driver`` — also when scraping raises — so a
          new driver has to be created before this function is called again.
        * Writes ``ikea_products.csv`` to the current working directory.
    """
    products = []

    try:
        driver.get(IKEA_URL)
        time.sleep(5)  # Allow page to load

        # Keep scrolling to the bottom until the page height stops changing.
        last_height = driver.execute_script("return document.body.scrollHeight")
        while True:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(3)
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

        # Candidate product containers
        items = driver.find_elements(By.CSS_SELECTOR, "[data-ref-id]")

        for item in items:
            try:
                name = item.find_element(By.CSS_SELECTOR, "span:not([class])").text
                price = item.find_element(By.CSS_SELECTOR, "span.pip-price__integer").text
                image = item.find_element(By.CSS_SELECTOR, "img").get_attribute("src")
                link = item.find_element(By.CSS_SELECTOR, "a").get_attribute("href")

                products.append({"Name": name, "Price": price, "Image URL": image, "Link": link})
            except Exception as e:
                # Best-effort: one unreadable element must not abort the run.
                print("Skipping product due to error:", e)
                continue
    finally:
        # Always release the browser, otherwise a failed run leaves a
        # headless Chrome process behind.
        driver.quit()

    # Save to CSV
    df = pd.DataFrame(products)
    df.to_csv("ikea_products.csv", index=False)
    print(f"✅ Scraped {len(products)} products and saved to 'ikea_products.csv'.")

# Run the scrape. The function quits the shared `driver` when it finishes,
# so the driver has to be created again before this call is repeated.
scrape_ikea_products()

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time

# Headless Chrome configuration; the last flag sets an explicit user-agent string.
options = webdriver.ChromeOptions()
for chrome_flag in (
    "--headless",  # no visible browser window
    "--disable-gpu",
    "--window-size=1920,1080",
    "user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
):
    options.add_argument(chrome_flag)

chrome_service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=chrome_service, options=options)

# Category display name -> category page URL, in the order they are scraped.
ikea_categories = dict(
    [
        ("Chairs", "https://www.ikea.com/us/en/cat/chairs-fu002/"),
        ("Sofas", "https://www.ikea.com/us/en/cat/sofas-sectionals-fu003/"),
        ("Tables", "https://www.ikea.com/us/en/cat/tables-desks-fu004/"),
        ("Beds", "https://www.ikea.com/us/en/cat/beds-mattresses-fu005/"),
        ("Storage", "https://www.ikea.com/us/en/cat/storage-organization-fu006/"),
        ("Lighting", "https://www.ikea.com/us/en/cat/lighting-fu007/"),
    ]
)

# Collects the product dicts returned for every category.
all_products = list()

def scrape_ikea_category(category_name, url):
    """Scrape one category page with the module-level ``driver``.

    The page is loaded and scrolled until ``document.body.scrollHeight`` stops
    changing. Every element matching ``[data-ref-id]`` is then read for a
    name, price, image URL and link; elements where any lookup fails are
    counted and skipped.

    Args:
        category_name: Label stored in the "Category" field of each result and
            used in the progress messages.
        url: Address of the category page to load.

    Returns:
        A list of dicts with the keys "Category", "Name", "Price",
        "Image URL" and "Link" (possibly empty).
    """
    print(f"🔄 Scraping {category_name}...")
    driver.get(url)
    time.sleep(5)  # Allow page to load

    # Scroll down until the page height stops changing
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(3)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    # Extract product details
    items = driver.find_elements(By.CSS_SELECTOR, "[data-ref-id]")
    category_products = []
    skipped = 0

    for item in items:
        try:
            name = item.find_element(By.CSS_SELECTOR, "span:not([class])").text  # Product name
            price = item.find_element(By.CSS_SELECTOR, "span.pip-price__integer").text  # Price
            image = item.find_element(By.CSS_SELECTOR, "img").get_attribute("src")  # Image URL
            link = item.find_element(By.CSS_SELECTOR, "a").get_attribute("href")  # Product link

            category_products.append({"Category": category_name, "Name": name, "Price": price, "Image URL": image, "Link": link})
        except Exception:
            # Best-effort skip. `Exception` (rather than a bare `except:`)
            # lets KeyboardInterrupt/SystemExit stop a long scrape.
            skipped += 1
            continue

    if skipped:
        print(f"⚠️ Skipped {skipped} elements without complete product data in {category_name}.")
    print(f"✅ Scraped {len(category_products)} products from {category_name}.")
    return category_products

# Scrape every category. The browser is shut down in `finally` so that a
# failure in one category does not leave a headless Chrome process running.
try:
    for category, url in ikea_categories.items():
        all_products.extend(scrape_ikea_category(category, url))
finally:
    driver.quit()

# Save all data to CSV (only reached when every category was scraped)
df = pd.DataFrame(all_products)
df.to_csv("ikea_all_products.csv", index=False)
print(f"🎉 Scraped {len(all_products)} total products and saved to 'ikea_all_products.csv'.")

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time

# Region label -> site root URL. Category paths are appended to these roots.
ikea_regions = dict(
    [
        ("USA", "https://www.ikea.com/us/en/"),
        ("UK", "https://www.ikea.com/gb/en/"),
        ("Germany", "https://www.ikea.com/de/de/"),
        ("France", "https://www.ikea.com/fr/fr/"),
        ("Canada", "https://www.ikea.com/ca/en/"),
        ("Australia", "https://www.ikea.com/au/en/"),
    ]
)

# Category label -> path relative to a region root.
# NOTE(review): the same paths are used for every region, including the
# German and French sites — confirm they resolve there.
ikea_categories = dict(
    [
        ("Chairs", "cat/chairs-fu002/"),
        ("Sofas", "cat/sofas-sectionals-fu003/"),
        ("Tables", "cat/tables-desks-fu004/"),
        ("Beds", "cat/beds-mattresses-fu005/"),
        ("Storage", "cat/storage-organization-fu006/"),
        ("Lighting", "cat/lighting-fu007/"),
    ]
)

# Collects the product dicts from every (region, category) scrape.
all_products = list()

def scrape_ikea_category(region, base_url, category_name, category_url):
    """Scrape one category page of one region in its own Chrome session.

    A new headless Chrome driver is started on every call and is always shut
    down before returning, including when loading or scraping raises. The page
    at ``base_url + category_url`` is scrolled until
    ``document.body.scrollHeight`` stops changing; every element matching
    ``[data-ref-id]`` is then read for a name, price, image URL and link.
    Elements where any lookup fails are counted and skipped.

    Args:
        region: Label stored in the "Region" field and used in messages.
        base_url: Site root for the region; ``category_url`` is appended to it.
        category_name: Label stored in the "Category" field and used in messages.
        category_url: Category path appended to ``base_url``.

    Returns:
        A list of dicts with the keys "Region", "Category", "Name", "Price",
        "Image URL" and "Link" (possibly empty).
    """
    # Start a NEW browser session each time
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")  # Run in background
    options.add_argument("--disable-gpu")
    options.add_argument("--window-size=1920,1080")
    options.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36")

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    category_products = []
    skipped = 0

    try:
        full_url = base_url + category_url
        print(f"🔄 Scraping {category_name} from {region} - {full_url}")
        driver.get(full_url)
        time.sleep(5)  # Allow page to load

        # Scroll until the page height stops changing
        last_height = driver.execute_script("return document.body.scrollHeight")
        while True:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(3)
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

        # Extract product details
        items = driver.find_elements(By.CSS_SELECTOR, "[data-ref-id]")

        for item in items:
            try:
                name = item.find_element(By.CSS_SELECTOR, "span:not([class])").text
                price = item.find_element(By.CSS_SELECTOR, "span.pip-price__integer").text
                image = item.find_element(By.CSS_SELECTOR, "img").get_attribute("src")
                link = item.find_element(By.CSS_SELECTOR, "a").get_attribute("href")

                category_products.append({
                    "Region": region,
                    "Category": category_name,
                    "Name": name,
                    "Price": price,
                    "Image URL": image,
                    "Link": link
                })
            except Exception:
                # Best-effort skip. `Exception` (rather than a bare `except:`)
                # lets KeyboardInterrupt/SystemExit stop a long scrape.
                skipped += 1
                continue
    finally:
        # Close this call's browser even on failure; with one session per
        # call, leaked Chrome processes would otherwise pile up quickly.
        driver.quit()

    if skipped:
        print(f"⚠️ Skipped {skipped} elements without complete product data in {category_name} ({region}).")
    print(f"✅ Scraped {len(category_products)} products from {category_name} in {region}.")

    return category_products

# Build the full list of (region, base URL, category, category path) jobs,
# regions outermost, then run them one by one. Each call starts its own browser.
scrape_jobs = [
    (region_label, region_root, category_label, category_path)
    for region_label, region_root in ikea_regions.items()
    for category_label, category_path in ikea_categories.items()
]
for scrape_job in scrape_jobs:
    all_products += scrape_ikea_category(*scrape_job)

# Write everything that was collected to a single CSV file
df = pd.DataFrame(all_products)
df.to_csv("ikea_all_regions.csv", index=False)
print(f"🎉 Scraped {len(all_products)} total products from multiple regions and saved to 'ikea_all_regions.csv'.")

In [None]:
import os
import requests
import pandas as pd

# Load product data
df = pd.read_csv("ikea_all_regions.csv")

# Create folder for images
IMAGE_FOLDER = "ikea_images"
os.makedirs(IMAGE_FOLDER, exist_ok=True)

# Outcome counters, so the closing message reflects what actually happened
downloaded_count = 0
failed_count = 0

# Download images
for index, row in df.iterrows():
    image_url = row["Image URL"]

    # Handle missing product names. The check runs on the raw cell value:
    # after str() a missing value is the text "nan" and pd.isna() can no
    # longer detect it.
    raw_name = row["Name"]
    if pd.isna(raw_name) or str(raw_name).strip() == "nan":
        product_name = f"product_{index}"  # Use a default name
    else:
        product_name = str(raw_name)

    # Clean filename: no path separators, at most 50 characters. The row index
    # is appended so that equal names do not overwrite each other.
    product_name = product_name.replace("/", "-").replace("\\", "-")[:50]
    image_path = f"{IMAGE_FOLDER}/{product_name}_{index}.jpg"

    # A missing URL is read back from the CSV as NaN; there is nothing to fetch.
    if pd.isna(image_url):
        failed_count += 1
        print(f"❌ Failed to download: {product_name}")
        continue

    try:
        response = requests.get(image_url, timeout=10)
        if response.status_code == 200:
            with open(image_path, "wb") as file:
                file.write(response.content)
            downloaded_count += 1
            print(f"✅ Downloaded: {product_name}")
        else:
            failed_count += 1
            print(f"❌ Failed to download: {product_name}")
    except Exception as e:
        # Best-effort: one bad URL or network error must not stop the batch.
        failed_count += 1
        print(f"⚠️ Error downloading {product_name}: {e}")

# Report real totals instead of claiming unconditional success
if failed_count:
    print(f"⚠️ Finished: {downloaded_count} images downloaded, {failed_count} failed.")
else:
    print(f"🎉 All {downloaded_count} images downloaded successfully!")

In [None]:
import torch
import clip
from PIL import Image
import os
import pandas as pd
import numpy as np
import faiss

# Load CLIP model
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

# Load IKEA product data
df = pd.read_csv("ikea_all_regions.csv")

# Embeddings and the image path belonging to each embedding, in the same order
image_embeddings = []
image_paths = []

# Define image folder
IMAGE_FOLDER = "ikea_images"

# Process each image
for index, row in df.iterrows():
    # Rebuild the filename exactly as the download step wrote it, including
    # the "product_<index>" fallback for rows without a name. Without the
    # fallback those files would be looked up as "nan_<index>.jpg" and missed.
    raw_name = row["Name"]
    if pd.isna(raw_name) or str(raw_name).strip() == "nan":
        image_name = f"product_{index}"
    else:
        image_name = str(raw_name)
    image_name = image_name.replace("/", "-").replace("\\", "-")[:50]
    image_path = f"{IMAGE_FOLDER}/{image_name}_{index}.jpg"

    # Rows whose image was never downloaded are skipped
    if not os.path.exists(image_path):
        continue

    try:
        # The context manager closes the file handle once preprocessing has
        # turned the picture into a tensor.
        with Image.open(image_path) as pil_image:
            image = preprocess(pil_image).unsqueeze(0).to(device)
        with torch.no_grad():
            embedding = model.encode_image(image).cpu().numpy()
        image_embeddings.append(embedding)
        image_paths.append(image_path)
        print(f"✅ Processed {image_name}")
    except Exception as e:
        # Best-effort: an unreadable image is reported and left out of the index.
        print(f"⚠️ Error processing {image_name}: {e}")

if not image_embeddings:
    # np.vstack([]) would fail with an unhelpful message; say what went wrong.
    raise ValueError(
        f"No images from '{IMAGE_FOLDER}' could be embedded; run the image download step first."
    )

# Convert embeddings to one 2-D NumPy array. FAISS works on float32, and the
# encoder output may be half precision when the model runs on CUDA.
image_embeddings = np.vstack(image_embeddings).astype("float32")

# Save embeddings using FAISS for fast search
faiss_index = faiss.IndexFlatL2(image_embeddings.shape[1])
faiss_index.add(image_embeddings)

faiss.write_index(faiss_index, "ikea_clip_index.faiss")
np.save("ikea_image_paths.npy", image_paths)

print("🎉 CLIP embeddings saved! Now we can search IKEA images using text or images.")