In [14]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import time
import pandas as pd
import numpy as np

In [37]:
# Start a Chrome WebDriver session and open the Jumbo product overview page.
start_url = "https://www.jumbo.com/producten/"
driver = webdriver.Chrome()
driver.get(start_url)

def scroll_page(driver, pause=1.0, max_scrolls=5):
    """Scroll to the bottom of the page until its height stops growing.

    After every scroll the function sleeps ``pause`` seconds and re-reads
    ``document.body.scrollHeight``. It stops as soon as the height is unchanged
    or once ``max_scrolls`` scrolls have been performed, whichever comes first.

    Args:
        driver: Object exposing ``execute_script(script)``.
        pause: Seconds to sleep after each scroll.
        max_scrolls: Upper bound on the number of scrolls.

    Returns:
        None.
    """
    read_height = "return document.body.scrollHeight"
    jump_to_bottom = "window.scrollTo(0, document.body.scrollHeight);"

    previous_height = driver.execute_script(read_height)

    for _attempt in range(max_scrolls):
        driver.execute_script(jump_to_bottom)
        time.sleep(pause)

        current_height = driver.execute_script(read_height)
        if current_height == previous_height:
            # Nothing new was appended to the page; further scrolling is pointless.
            return
        previous_height = current_height

def wait_for_products(driver, timeout=20):
    """Wait until at least one product title link is present in the DOM.

    Args:
        driver: Selenium WebDriver to poll.
        timeout: Maximum number of seconds to wait.

    Returns:
        True if an element matching ``div.product-container a.title-link``
        appeared within ``timeout`` seconds, False if the wait timed out.
    """
    product_link = (By.CSS_SELECTOR, "div.product-container a.title-link")
    try:
        WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located(product_link)
        )
    except TimeoutException:
        # Only a timeout means "no products". A bare except would also swallow
        # KeyboardInterrupt and report a broken driver session as an empty page.
        return False
    return True


# Fixed pause before looking for the consent button (presumably to let the
# page and its cookie banner finish rendering).
time.sleep(10)

# Best-effort: click the first button whose label contains "akkoord".
try:
    for btn in driver.find_elements(By.TAG_NAME, "button"):
        # The label is lower-cased first, so one lower-case needle matches
        # every capitalisation; a check for "Akkoord" could never succeed.
        if "akkoord" in btn.text.lower():
            btn.click()
            print("Cookies geaccepteerd (via tekst)")
            break
except Exception as exc:
    # Not fatal for the scrape, but report it instead of hiding it. Catching
    # Exception rather than using a bare except lets KeyboardInterrupt through.
    print(f"Cookiebanner niet geaccepteerd: {exc}")

# Wait until the category cards are present on the page.
WebDriverWait(driver, 15).until(
    EC.presence_of_all_elements_located((By.CSS_SELECTOR, "a.category-card"))
)

soup = BeautifulSoup(driver.page_source, "html.parser")

# href of every category card; cards without an href yield None here.
category_paths = [card.get("href") for card in soup.select("a.category-card")]

# Keep only links below /producten/. The display name is derived from the last
# path segment of the link (dashes become spaces, words are title-cased).
categories = [
    {
        "categorie": path.rstrip("/").split("/")[-1].replace("-", " ").title(),
        "categorie_url": urljoin("https://www.jumbo.com", path),
    }
    for path in category_paths
    if path and path.startswith("/producten/")
]

# One dict per scraped product. Initialised here so the cell works on a fresh
# kernel and a re-run starts from an empty list instead of appending duplicates.
rows = []

# CSS selectors tried, in order, to locate the "next page" button.
next_page_selectors = [
    'button[data-testid="next-page-button"]',
    'button[name="next"]',
    'button[aria-label*="volgende"]',
]


def _parse_product(article, category):
    """Build one product record from an ``<article>`` element.

    Args:
        article: BeautifulSoup tag for a single ``<article>``.
        category: Dict with the keys "categorie" and "categorie_url".

    Returns:
        A dict with the product fields, or None when the article contains no
        ``a.title-link`` element.
    """
    title_tag = article.select_one("a.title-link")
    if not title_tag:
        return None

    image_tag = article.select_one("div.product-image img")
    whole = article.select_one("div.current-price span.whole")
    fractional = article.select_one("div.current-price span.fractional")
    ppu = article.select_one("div.price-per-unit")

    return {
        "supermarkt": "Jumbo",
        "categorie": category["categorie"],
        "categorie_url": category["categorie_url"],
        "productnaam": title_tag.get_text(strip=True),
        # The price is rendered as two spans; join them with a decimal comma.
        "productprijs": (
            f"{whole.text.strip()},{fractional.text.strip()}"
            if whole and fractional else None
        ),
        "productprijs_per_unit": ppu.get_text(strip=True) if ppu else None,
        # .get() so an <img> without a src attribute yields None, not a KeyError.
        "product_foto_url": image_tag.get("src") if image_tag else None,
        "product_url": urljoin("https://www.jumbo.com", title_tag.get("href", "")),
    }


for category in categories:
    print(f"\n{category['categorie']}")
    driver.get(category["categorie_url"])
    time.sleep(2)

    while True:
        # Wait until product links are actually present.
        try:
            WebDriverWait(driver, 30).until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, "article a.title-link")
                )
            )
        except TimeoutException:
            # Only a timeout means "no products"; other errors should surface.
            print("Geen producten in deze categorie (subcategorie?)")
            break

        # Small scroll to trigger lazy loading.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(1)

        soup = BeautifulSoup(driver.page_source, "html.parser")

        page_rows = []
        for article in soup.select("article"):
            record = _parse_product(article, category)
            if record is not None:
                page_rows.append(record)
        rows.extend(page_rows)

        print(f"{len(page_rows)} producten gescraped")

        # Pagination
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(1.5)

        # find_elements returns an empty list when nothing matches, so no
        # exception handling is needed to probe the candidate selectors.
        next_button = None
        for selector in next_page_selectors:
            matches = driver.find_elements(By.CSS_SELECTOR, selector)
            if matches:
                next_button = matches[0]
                break

        if not next_button:
            print("Geen volgende pagina knop (laatste pagina)")
            break

        # Remember the first product title before clicking, to detect a page change.
        first_product_before = driver.find_element(
            By.CSS_SELECTOR, "article a.title-link"
        ).text

        # Scroll to the bottom again before clicking the button.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(1)

        # Click through JavaScript rather than WebElement.click().
        driver.execute_script("arguments[0].click();", next_button)

        # Wait until the first product title differs from the one seen before the click.
        try:
            WebDriverWait(driver, 10).until(
                lambda d: d.find_element(
                    By.CSS_SELECTOR, "article a.title-link"
                ).text != first_product_before
            )
            print("Volgende pagina geladen")
        except TimeoutException:
            print("Klik deed niets, laatste pagina")
            break
                

✅ Cookies geaccepteerd (via tekst)

🛒 Kerst
   → 24 producten gescraped
   → Volgende pagina geladen
   → Volgende pagina geladen
   → 24 producten gescraped
   → Volgende pagina geladen
   → Volgende pagina geladen
   → 24 producten gescraped
   → Volgende pagina geladen
   → Volgende pagina geladen
   → 24 producten gescraped
   → Volgende pagina geladen
   → Volgende pagina geladen
   → 24 producten gescraped
   → Volgende pagina geladen
   → Volgende pagina geladen
   → 24 producten gescraped
   → Volgende pagina geladen
   → Volgende pagina geladen
   → 24 producten gescraped
   → Volgende pagina geladen
   → Volgende pagina geladen
   → 24 producten gescraped
   → Volgende pagina geladen
   → Volgende pagina geladen
   → 24 producten gescraped
   → Volgende pagina geladen
   → Volgende pagina geladen
   → 24 producten gescraped
   → Volgende pagina geladen
   → Volgende pagina geladen
   → 24 producten gescraped
 

In [38]:
# Turn the scraped records into a table and save it as UTF-8 CSV without the
# index column.
jumbo_csv_path = "jumbo_producten.csv"

df = pd.DataFrame(rows)
df.to_csv(jumbo_csv_path, index=False, encoding="utf-8")

print("\nCSV aangemaakt")


CSV aangemaakt


In [26]:
# Reload the scraped products from the CSV file.
df = pd.read_csv("jumbo_producten.csv")

# Rename the store brand: "Jumbo" and its possessive "Jumbo's" both become
# "Huismerk". The possessive is matched together with the brand name, so a
# "'s" elsewhere in a product name (for example another brand's possessive)
# is left intact instead of being stripped from every name.
df['productnaam'] = df['productnaam'].str.replace(
    r"Jumbo(?:'s)?", "Huismerk", regex=True
)

df.head()

Unnamed: 0,supermarkt,categorie,categorie_url,productnaam,productprijs,productprijs_per_unit,product_foto_url,product_url
0,Jumbo,Kerst,https://www.jumbo.com/producten/kerst/,Huismerk - Feeststol met Amandelspijs - 750 g,399,"5,32/kilo",https://www.jumbo.com/dam-images/fit-in/360x36...,https://www.jumbo.com/producten/jumbo-s-feests...
1,Jumbo,Kerst,https://www.jumbo.com/producten/kerst/,"Huismerk Massieve Kransjes Melk, Puur & Wit 150 g",269,"17,93/kilo",https://www.jumbo.com/dam-images/fit-in/360x36...,https://www.jumbo.com/producten/jumbo-massieve...
2,Jumbo,Kerst,https://www.jumbo.com/producten/kerst/,Huismerk Brunch Broodjes Mix 12 Stuks 300 g,199,"6,63/kilo",https://www.jumbo.com/dam-images/fit-in/360x36...,https://www.jumbo.com/producten/jumbo-brunch-b...
3,Jumbo,Kerst,https://www.jumbo.com/producten/kerst/,Huismerk Kransjes Gesuikerd 200 g,139,"6,95/kilo",https://www.jumbo.com/dam-images/fit-in/360x36...,https://www.jumbo.com/producten/jumbo-kransjes...
4,Jumbo,Kerst,https://www.jumbo.com/producten/kerst/,Huismerk Carpaccio Truffel,415,"29,02/kilo",https://www.jumbo.com/dam-images/fit-in/360x36...,https://www.jumbo.com/producten/jumbo-carpacci...


In [5]:
# Keep only the text after the "per kilo" label. Values that do not contain
# the label (other units, or values already cleaned by an earlier run of this
# cell) keep their current text instead of turning into NaN, which makes the
# cell safe to re-run.
after_label = df['productprijs_per_unit'].str.split("per kilo", n=1).str[1]
df['productprijs_per_unit'] = after_label.fillna(df['productprijs_per_unit'])
df.head()

Unnamed: 0,supermarkt,categorie,categorie_url,productnaam,productprijs,productprijs_per_unit,product_foto_url,product_url
0,Jumbo,Kerst,https://www.jumbo.com/producten/kerst/,Jumbo's - Feeststol met Amandelspijs - 750 g,399,"5,32/kilo",https://www.jumbo.com/dam-images/fit-in/360x36...,https://www.jumbo.com/producten/jumbo-s-feests...
1,Jumbo,Kerst,https://www.jumbo.com/producten/kerst/,"Jumbo Massieve Kransjes Melk, Puur & Wit 150 g",269,"17,93/kilo",https://www.jumbo.com/dam-images/fit-in/360x36...,https://www.jumbo.com/producten/jumbo-massieve...
2,Jumbo,Kerst,https://www.jumbo.com/producten/kerst/,Jumbo Brunch Broodjes Mix 12 Stuks 300 g,199,"6,63/kilo",https://www.jumbo.com/dam-images/fit-in/360x36...,https://www.jumbo.com/producten/jumbo-brunch-b...
3,Jumbo,Kerst,https://www.jumbo.com/producten/kerst/,Jumbo Kransjes Gesuikerd 200 g,139,"6,95/kilo",https://www.jumbo.com/dam-images/fit-in/360x36...,https://www.jumbo.com/producten/jumbo-kransjes...
4,Jumbo,Kerst,https://www.jumbo.com/producten/kerst/,Jumbo Carpaccio Truffel,415,"29,02/kilo",https://www.jumbo.com/dam-images/fit-in/360x36...,https://www.jumbo.com/producten/jumbo-carpacci...


In [27]:
# Independent copy of the Jumbo table, so changes to df_ah do not touch df.
df_ah = df.copy()  # deep=True is the pandas default
df_ah.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21590 entries, 0 to 21589
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   supermarkt             21590 non-null  object
 1   categorie              21590 non-null  object
 2   categorie_url          21590 non-null  object
 3   productnaam            21590 non-null  object
 4   productprijs           21561 non-null  object
 5   productprijs_per_unit  12779 non-null  object
 6   product_foto_url       21587 non-null  object
 7   product_url            21590 non-null  object
dtypes: object(8)
memory usage: 1.3+ MB


In [28]:
# Relabel the copied rows: "Jumbo" in the supermarkt column becomes "Albert Heijn".
df_ah = df_ah.assign(
    supermarkt=df_ah["supermarkt"].str.replace("Jumbo", "Albert Heijn", regex=False)
)

In [29]:
# Replace the decimal comma with a decimal point in the price strings.
prices_with_point = df_ah["productprijs"].str.replace(",", ".", regex=False)
df_ah["productprijs"] = prices_with_point

In [30]:
# Parse the prices as numbers, scale each one by its own factor drawn uniformly
# from [0.9, 1.1), and round to two decimals.
# A fixed seed makes the factors reproducible across kernel restarts.
# NOTE: re-running this cell without re-creating df_ah scales the prices again.
price_rng = np.random.default_rng(42)
df_ah['productprijs'] = pd.to_numeric(df_ah['productprijs'])
df_ah['productprijs'] = (
    df_ah['productprijs'] * price_rng.uniform(0.9, 1.1, size=len(df_ah))
).round(2)

In [31]:
# Show the first rows of df_ah to check the relabelled and rescaled columns.
df_ah.head()

Unnamed: 0,supermarkt,categorie,categorie_url,productnaam,productprijs,productprijs_per_unit,product_foto_url,product_url
0,Albert Heijn,Kerst,https://www.jumbo.com/producten/kerst/,Huismerk - Feeststol met Amandelspijs - 750 g,3.65,"5,32/kilo",https://www.jumbo.com/dam-images/fit-in/360x36...,https://www.jumbo.com/producten/jumbo-s-feests...
1,Albert Heijn,Kerst,https://www.jumbo.com/producten/kerst/,"Huismerk Massieve Kransjes Melk, Puur & Wit 150 g",2.93,"17,93/kilo",https://www.jumbo.com/dam-images/fit-in/360x36...,https://www.jumbo.com/producten/jumbo-massieve...
2,Albert Heijn,Kerst,https://www.jumbo.com/producten/kerst/,Huismerk Brunch Broodjes Mix 12 Stuks 300 g,2.13,"6,63/kilo",https://www.jumbo.com/dam-images/fit-in/360x36...,https://www.jumbo.com/producten/jumbo-brunch-b...
3,Albert Heijn,Kerst,https://www.jumbo.com/producten/kerst/,Huismerk Kransjes Gesuikerd 200 g,1.35,"6,95/kilo",https://www.jumbo.com/dam-images/fit-in/360x36...,https://www.jumbo.com/producten/jumbo-kransjes...
4,Albert Heijn,Kerst,https://www.jumbo.com/producten/kerst/,Huismerk Carpaccio Truffel,4.22,"29,02/kilo",https://www.jumbo.com/dam-images/fit-in/360x36...,https://www.jumbo.com/producten/jumbo-carpacci...


In [40]:
# Load the Plus and Dirk product tables; both files are decoded as Latin-1.
plus, dirk = [
    pd.read_csv(csv_name, encoding="latin1")
    for csv_name in ('plus.csv', 'dirk.csv')
]

In [44]:
# Stack all supermarket tables into one frame. ignore_index=True gives the
# result a fresh 0..n-1 index instead of repeating each source frame's labels.
# NOTE(review): df keeps productprijs as read from the CSV while df_ah holds
# rounded floats — confirm the combined column is meant to mix both formats.
eind_df = pd.concat([df, df_ah, plus, dirk], ignore_index=True)
eind_df.to_csv('eind_data.csv', index=False)