In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.edge.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from webdriver_manager.microsoft import EdgeChromiumDriverManager
from selenium.webdriver.edge.options import Options
import numpy as np

def scrape_product_links():
    """Collect the product-page URLs linked from the currently loaded page.

    Uses the module-level ``driver`` and looks for anchors whose ``href``
    attribute starts with ``/Produkt``.

    Returns:
        list[str]: The ``href`` value of every matching anchor, in the order
        the driver returned them. Empty when the page has no matching anchors.
    """
    # find_elements() returns an empty list when nothing matches (it does not
    # raise NoSuchElementException), so the empty case is checked explicitly.
    product_elements = driver.find_elements(By.XPATH, "//a[starts-with(@href, '/Produkt')]")
    if not product_elements:
        print("Could not find products on this page.")
        return []

    products = []
    for product in product_elements:
        product_link = product.get_attribute('href')
        if not product_link:
            # get_attribute() may return None; skip it so callers only
            # ever receive usable URL strings.
            continue
        print(f'Found product: {product_link}')
        products.append(product_link)

    return products

def go_to_next_page():
    """Navigate the module-level ``driver`` to the next results page.

    Looks up the pagination anchor marked
    ``data-testid="pagination-next-page"`` and loads its ``href``.

    Returns:
        bool: True after the next page has been requested; False when the
        anchor is missing or carries no ``href`` to follow.
    """
    try:
        next_button = driver.find_element(By.CSS_SELECTOR, 'a[data-testid="pagination-next-page"]')
    except NoSuchElementException:
        print("No more pages left.")
        return False

    href_value = next_button.get_attribute('href')
    if not href_value:
        # Without this guard a missing href would be stringified and the
        # driver would be asked to load the literal URL "None".
        print("No more pages left.")
        return False

    driver.get(href_value)

    print("Going to the next page.")

    return True

# Set up Edge options for headless mode
edge_options = Options()
edge_options.add_argument("--headless")
edge_options.add_argument("--disable-gpu")  # Recommended for headless mode
edge_options.add_argument("--no-sandbox")   # Optional for headless mode

# Initialize the Edge WebDriver with headless options
driver = webdriver.Edge(service=Service(EdgeChromiumDriverManager().install()), options=edge_options)

# Links collected from every results page (may contain duplicates)
all_products = []

# URL of the search result page
url = 'https://www.rossmann.pl/szukaj?CategoryId=13049&Search=krem%20do%20twarzy'

try:
    # implicitly_wait() does not pause the script; it sets how long element
    # look-ups keep retrying before giving up. It is a session-wide setting,
    # so it is configured once, before the first page is loaded, which also
    # covers the very first results page.
    driver.implicitly_wait(0.5)
    driver.get(url)

    # Main scraping loop: scrape the current page, then follow the
    # "next page" link until there is none.
    while True:
        all_products.extend(scrape_product_links())

        if not go_to_next_page():
            break
finally:
    # Always shut the browser down, even if scraping failed part-way,
    # so no headless Edge process is left behind.
    driver.quit()

# De-duplicate (np.unique also sorts) and write one link per line.
# Empty/None entries are dropped first: they are not links and mixing None
# with strings would make the sort inside np.unique fail.
unique_links = np.unique([product for product in all_products if product])

with open("links.txt", 'w') as outfile:
    outfile.writelines((str(product)+'\n' for product in unique_links))

In [None]:
from selenium import webdriver
from selenium.webdriver.edge.service import Service
from selenium.webdriver.edge.options import Options
from selenium.webdriver.common.by import By
from webdriver_manager.microsoft import EdgeChromiumDriverManager
import pandas as pd
import re
from bs4 import BeautifulSoup

# Path to the file containing product links
product_links_file_path = 'links.txt'

# Upper bound on how many links are visited in one run
max_links = 100

# Read all links into a list. This happens before the browser is started so
# that a missing or unreadable links file cannot leave a browser running.
with open(product_links_file_path, 'r') as file:
    links = [line.strip() for line in file if line.strip()]  # Read and clean up links

total_links = len(links)
print(f"Total links to process: {total_links}")

# One record per visited link: {'Product_url': ..., 'Ingredients': ...}
data = []

# Set up Edge options for headless mode
edge_options = Options()
edge_options.add_argument("--headless")
edge_options.add_argument("--disable-gpu")  # Recommended for headless mode
edge_options.add_argument("--no-sandbox")   # Optional for headless mode

# Initialize the Edge WebDriver with headless options
driver = webdriver.Edge(service=Service(EdgeChromiumDriverManager().install()), options=edge_options)

try:
    # implicitly_wait() is not a sleep: it sets how long element look-ups
    # keep retrying. It applies to the whole session, so set it once up front.
    driver.implicitly_wait(0.2)

    # Process each link, stopping after max_links of them
    for index, link in enumerate(links[:max_links], start=1):
        print(f"Processing link {index} of {total_links}: {link}")

        driver.get(link)

        try:
            # Locate the button with the text "Składniki"
            button = driver.find_element(By.XPATH, '//button[span[text()="Składniki"]]')
            # Find the parent container of the button and locate the <p> element
            parent_div = button.find_element(By.XPATH, '..//..')  # Adjust XPath if needed to navigate to the parent container
            p_element = parent_div.find_element(By.CSS_SELECTOR, 'p.styles-module_productDescriptionContent--76j9I')
            ingredients_text = p_element.get_attribute('innerHTML')

            # Strip the markup and collapse whitespace so a single plain-text
            # line is stored instead of raw HTML.
            soup = BeautifulSoup(ingredients_text, 'html.parser')
            text = soup.get_text(separator=' ', strip=True)
            cleaned_text = re.sub(r'\s+', ' ', text).strip()

            data.append({'Product_url': link, 'Ingredients': cleaned_text})

        except Exception as e:
            # Best effort: record a placeholder and carry on with the next
            # link, but report what kind of failure it was.
            print(f"Error processing link {link}: {type(e).__name__}")
            data.append({'Product_url': link, 'Ingredients': 'Error retrieving ingredients'})

finally:
    try:
        driver.quit()
    finally:
        # Save whatever was collected even if closing the browser failed.
        # Explicit columns keep the header present when nothing was scraped.
        df = pd.DataFrame(data, columns=['Product_url', 'Ingredients'])
        df.to_csv("ingredients.tsv", sep=';', index=False)


In [14]:
import pandas as pd

ingredients_df = pd.read_csv("cleaned_ingredients.csv", sep=";")

# Step 1: Split every ingredient list on the delimiter characters.
# Rows with a missing 'Ingredients' value are dropped first: .str.split()
# leaves them as NaN, which cannot be iterated in the flatten step below.
split_elements = ingredients_df['Ingredients'].dropna().str.split(r'[,.•*]')

# Step 2: Flatten the list of lists, trimming whitespace and discarding the
# empty pieces produced by adjacent or trailing delimiters
flattened_elements = [item.strip() for sublist in split_elements for item in sublist if item.strip()]

# Step 3: Convert the list to a set to remove duplicates
unique_elements = set(flattened_elements)

# Step 4: Create a new DataFrame from the unique elements.
# Sorted so the output file is identical from run to run (set iteration
# order is not stable across interpreter sessions).
unique_df = pd.DataFrame(sorted(unique_elements), columns=['unique_elements'])

unique_df.to_csv("unique_ingredients", index=False)

In [51]:
import pandas as pd
import re

# Step 1: Load the data
ingredients_df = pd.read_csv("cleaned_ingredients.csv", sep=";")

# Step 2: Define the dictionary for replacements
# Maps a canonical name to the spellings that should be rewritten to it.
# The spellings are applied in the order listed, so later entries can match
# text produced by earlier ones (e.g. "AQUA (WATER)" -> "WATER (WATER)" -> "WATER").
replace_dict = {
    "WATER": ["AQUA/WATER", "AQUA", "AQUA (WATER)", "WATER / WATER", "WATER (WATER)"]
    # Add more replacements here as needed
}

# Step 3: Apply transformations to the 'Ingredients' column
ingredients_df['Ingredients'] = ingredients_df['Ingredients'].str.upper()  # Convert to uppercase
ingredients_df['Ingredients'] = ingredients_df['Ingredients'].str.replace(r'\*', '', regex=True)  # Remove "*"

# Step 4: Apply replacements based on the replace_dict
# Flatten the dictionary into a single {regex: replacement} mapping.
# Each spelling is escaped and wrapped in look-arounds so it only matches as a
# whole term, never as part of a longer word (e.g. "AQUA" inside "AQUAXYL").
flat_replace_dict = {
    rf'(?<!\w){re.escape(pattern)}(?!\w)': replacement
    for replacement, patterns in replace_dict.items()
    for pattern in patterns
}

# Perform replacements one pattern at a time, in insertion order, so the
# result does not depend on how a bulk dict replace orders overlapping patterns.
for pattern, replacement in flat_replace_dict.items():
    ingredients_df['Ingredients'] = ingredients_df['Ingredients'].str.replace(pattern, replacement, regex=True)

# Step 5: Split the 'Ingredients' column by multiple delimiters.
# Missing values are dropped first; .str.split() would leave them as NaN,
# which cannot be iterated when flattening.
split_elements = ingredients_df['Ingredients'].dropna().str.split(r'[,.•*]+', expand=False)

# Step 6: Flatten the list of lists and clean up whitespace
flattened_elements = [item.strip() for sublist in split_elements for item in sublist if item.strip()]

# Step 7: Count occurrences of each unique ingredient
ingredient_counts = pd.Series(flattened_elements).value_counts()

# Step 8: Create a DataFrame from the unique ingredients and their counts
unique_df = pd.DataFrame({
    'unique_elements': ingredient_counts.index,
    'count': ingredient_counts.values
})

# Step 9: Save the unique ingredients and their counts to a CSV file
unique_df.to_csv("unique_ingredients.csv", index=False)
