In [20]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.edge.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from webdriver_manager.microsoft import EdgeChromiumDriverManager
from selenium.webdriver.edge.options import Options
import numpy as np

def scrape_product_links():
    """Collect product links from the page currently loaded in ``driver``.

    Looks up every ``<a>`` element whose ``href`` attribute starts with
    ``/Produkt`` using the module-level ``driver``, prints each link that is
    found and returns them in the order the lookup produced them. Duplicates
    are kept; callers that need distinct links must de-duplicate.

    Returns:
        list: the ``href`` value of each matching anchor, or an empty list
        (after printing a notice) when the page has no matching anchors.
    """
    product_elements = driver.find_elements(By.XPATH, "//a[starts-with(@href, '/Produkt')]")

    # find_elements() signals "nothing matched" with an empty list; it does not
    # raise NoSuchElementException, so the empty case has to be checked here.
    if not product_elements:
        print("Could not find products on this page.")
        return []

    products = []
    for product in product_elements:
        product_link = product.get_attribute('href')
        print(f'Found product: {product_link}')
        products.append(product_link)

    return products

def go_to_next_page():
    """Navigate the module-level ``driver`` to the next results page.

    Finds the anchor marked ``data-testid="pagination-next-page"`` on the
    current page and loads the URL in its ``href`` attribute.

    Returns:
        bool: ``True`` after navigating to the next page; ``False`` when the
        pagination anchor is absent or carries no ``href`` to follow.
    """
    try:
        next_button = driver.find_element(By.CSS_SELECTOR, 'a[data-testid="pagination-next-page"]')
    except NoSuchElementException:
        print("No more pages left.")
        return False

    href_value = next_button.get_attribute('href')
    # get_attribute() returns None when the attribute is missing. Passing that
    # through str() would navigate to the literal URL "None", so a link without
    # a target is treated as the end of pagination instead.
    if not href_value:
        print("No more pages left.")
        return False

    driver.get(href_value)
    print("Going to the next page.")
    return True

# Set up Edge options for headless mode
edge_options = Options()
edge_options.add_argument("--headless")
edge_options.add_argument("--disable-gpu")  # Recommended for headless mode
edge_options.add_argument("--no-sandbox")   # Optional for headless mode

# Initialize the Edge WebDriver with headless options
driver = webdriver.Edge(service=Service(EdgeChromiumDriverManager().install()), options=edge_options)

# implicitly_wait() does not pause the script: it sets, for the rest of the
# session, how long element lookups keep polling before giving up. Configure it
# once, before the first page is loaded, so the first results page gets the
# same grace period as every later one.
driver.implicitly_wait(0.5)

# Main scraping loop
all_products = []

# URL of the search result page
url = 'https://www.rossmann.pl/szukaj?CategoryId=13049&Search=krem%20do%20twarzy'

try:
    driver.get(url)

    while True:
        all_products.extend(scrape_product_links())

        # Try to go to the next page, if it fails, break the loop
        if not go_to_next_page():
            break
finally:
    # Close the driver even when scraping fails part-way, so no headless
    # browser process is left running.
    driver.quit()


# Write each distinct link on its own line; np.unique drops the duplicates
# collected across pages and returns the remaining links sorted.
with open("links.txt", 'w') as outfile:
    outfile.writelines((str(product)+'\n' for product in np.unique(all_products)))

Found product: https://www.rossmann.pl/Produkt/Kremy-do-twarzy/SVR-krem-do-twarzy-Sensifine-AR-Teintee-ujednolicajacy-krem-redukujacy-zaczerwienienia-40-ml,2082373,13049
Found product: https://www.rossmann.pl/Produkt/Kremy-do-twarzy/SVR-krem-do-twarzy-Sensifine-AR-Teintee-ujednolicajacy-krem-redukujacy-zaczerwienienia-40-ml,2082373,13049
Found product: https://www.rossmann.pl/Produkt/Kremy-do-twarzy/FlosLek-Laboratorium-stopNaczynka-krem-do-twarzy-nawilzajaco-wzmacniajacy-50-ml,2064001,13049
Found product: https://www.rossmann.pl/Produkt/Kremy-do-twarzy/FlosLek-Laboratorium-stopNaczynka-krem-do-twarzy-nawilzajaco-wzmacniajacy-50-ml,2064001,13049
Found product: https://www.rossmann.pl/Produkt/Kremy-do-twarzy/Eveline-Cosmetics-Magic-Lift-krem-do-twarzy-multi-liftingujacy-modelujacy-owal-twarzy-na-noc-50-,396428,13049
Found product: https://www.rossmann.pl/Produkt/Kremy-do-twarzy/Eveline-Cosmetics-Magic-Lift-krem-do-twarzy-multi-liftingujacy-modelujacy-owal-twarzy-na-noc-50-,396428,13049


In [21]:
from selenium import webdriver
from selenium.webdriver.edge.service import Service
from selenium.webdriver.edge.options import Options
from selenium.webdriver.common.by import By
from webdriver_manager.microsoft import EdgeChromiumDriverManager
import pandas as pd
import re
from bs4 import BeautifulSoup

# Set up Edge options for headless mode
edge_options = Options()
edge_options.add_argument("--headless")
edge_options.add_argument("--disable-gpu")  # Recommended for headless mode
edge_options.add_argument("--no-sandbox")   # Optional for headless mode

# Initialize the Edge WebDriver with headless options
driver = webdriver.Edge(service=Service(EdgeChromiumDriverManager().install()), options=edge_options)

# implicitly_wait() does not pause the script: it sets how long element lookups
# keep polling before giving up. Configure it once, before the first page is
# loaded, so the first product page is looked up with the same timeout as the rest.
driver.implicitly_wait(0.2)

# Path to the file containing product links
product_links_file_path = 'links.txt'

# Upper bound on the number of product pages visited in one run
MAX_LINKS = 100

# Read all links into a list
with open(product_links_file_path, 'r') as file:
    links = [line.strip() for line in file if line.strip()]  # Read and clean up links

total_links = len(links)
print(f"Total links to process: {total_links}")

data = []

try:
    # Process each link, stopping after MAX_LINKS pages
    for index, link in enumerate(links[:MAX_LINKS], start=1):
        print(f"Processing link {index} of {total_links}: {link}")

        driver.get(link)

        try:
            # Locate the button with the text "Składniki"
            button = driver.find_element(By.XPATH, '//button[span[text()="Składniki"]]')
            # Step up two levels from the button to the container that is searched for the <p> element
            parent_div = button.find_element(By.XPATH, '../..')  # Adjust XPath if needed to navigate to the parent container
            # NOTE(review): the class name carries a build-generated looking suffix
            # ("--76j9I"); confirm it still matches the live page if every lookup fails.
            p_element = parent_div.find_element(By.CSS_SELECTOR, 'p.styles-module_productDescriptionContent--76j9I')
            ingredients_html = p_element.get_attribute('innerHTML')

            # Strip the markup and collapse runs of whitespace into single spaces
            soup = BeautifulSoup(ingredients_html, 'html.parser')
            text = soup.get_text(separator=' ', strip=True)
            cleaned_text = re.sub(r'\s+', ' ', text).strip()

            # Store the cleaned text, not the raw HTML it was derived from
            data.append({'Product_url': link, 'Ingredients': cleaned_text})

        except Exception as e:
            # Best-effort scraping: record a placeholder and move on, but report
            # what went wrong so failures can be diagnosed from the output.
            reason = str(e).partition('\n')[0]
            print(f"Error processing link {link}: {type(e).__name__}: {reason}")
            data.append({'Product_url': link, 'Ingredients': 'Error retrieving ingredients'})

finally:
    driver.quit()
    # Persist whatever was collected, even when the run stopped early.
    df = pd.DataFrame(data)
    # NOTE(review): the file is named .tsv but fields are separated by ';' —
    # kept as-is because downstream readers may depend on it; confirm and align.
    df.to_csv("ingredients.tsv", sep=';', index=False)


Total links to process: 672
Processing link 1 of 672: https://www.rossmann.pl/Produkt/Kosmetyki-z-SPF/Beauty-of-Joseon-krem-do-twarzy-Relief-Sun-SPF50-50-ml,2078518,19372
Error processing link https://www.rossmann.pl/Produkt/Kosmetyki-z-SPF/Beauty-of-Joseon-krem-do-twarzy-Relief-Sun-SPF50-50-ml,2078518,19372
Processing link 2 of 672: https://www.rossmann.pl/Produkt/Kosmetyki-z-SPF/Beauty-of-Joseon-sztyft-do-twarzy-i-ciala-Matte-Sun-SPF50-18-g,2078517,19372
Error processing link https://www.rossmann.pl/Produkt/Kosmetyki-z-SPF/Beauty-of-Joseon-sztyft-do-twarzy-i-ciala-Matte-Sun-SPF50-18-g,2078517,19372
Processing link 3 of 672: https://www.rossmann.pl/Produkt/Kosmetyki-z-SPF/Fluff-Superfood-krem-do-twarzy-z-filtrem-UV-SPF-50-50-ml,379135,19372
Error processing link https://www.rossmann.pl/Produkt/Kosmetyki-z-SPF/Fluff-Superfood-krem-do-twarzy-z-filtrem-UV-SPF-50-50-ml,379135,19372
Processing link 4 of 672: https://www.rossmann.pl/Produkt/Kosmetyki-z-SPF/Missha-Aqua-Sun-krem-do-twarzy-SPF