In [1]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import bs4
import requests
import time
import pandas as pd

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
def setting_up_webdriver(driver_path, url, secs):
    """
    
    """
    # Loading webdriver
    browser = webdriver.Chrome(driver_path)
    browser.get(url)
    
    # Accepting cookies
    try:
        btn_cookies = browser.find_element_by_css_selector("button.ui-button.ui-button--small.ui-button--primary.ui-button--positive")
        btn_cookies.click()
    except:
        pass
    time.sleep(secs)
    
    # Adding postal code
    pst = browser.find_element_by_name("postalCode")
    btn = browser.find_element_by_css_selector("button.button.button-primary.button-big")
    pst.send_keys("28004")
    btn.click()
    time.sleep(secs)
    
    return browser

In [3]:
def make_categories_list(root_url, start, end):
    """
    
    """
    return [root_url + str(x) for x in range(start, end+1)]

In [4]:
def get_html(webdriver):
    """
    Given a webdriver object, return its full raw html in a BeautifulSoup object
    
    Parameters:
        webdriver: Selenium webdriver object
        
    Returns:
        BeautifulSoup object
    """
    html = webdriver.page_source
    soup = bs4.BeautifulSoup(html)
    return soup

In [5]:
def parse_product(category, subcategory, prod):
    """
    Parse a Beautiful Soup product button returning the item's features 
    
    Parameters:
        Category: String. Category
        subcategory: String. Subcategory
        prod: Beautiful Soup object to parse
        
    Returns:
        Parsed data in a list. E.g.:
        ['Aceite, vinagre y sal', 'Aceite de oliva', 'Aceite de oliva 0,4º Hacendado', 'Garrafa 5 L','10,60 €', 
        'https://prod-mercadona.imgix.net/images/236a405e60648cf189d4a4b3a3403983.jpg?fit=crop&h=300&w=300']
    
    """
    img_link = prod.img["src"]
    name = prod.h4.text
    specs = prod.find_all("span", attrs={"class": "footnote1-r"})
    specs = "".join([spec.text for spec in specs])
    price = prod.find("div", attrs={"class": "product-price"}).p.text
    
    return [category, subcategory, name, specs, price, img_link]

In [6]:
def get_raw_data(soup):
    """
    
    """
    # Fetch main category.
    category = soup.find("div", attrs={"class": "category-detail__name"}).h3.text
    
    # Fetch subcategories.
    sections = soup.find_all("section", attrs={"class": "section"})
    
    # Fetch every product and its features.
    results = []
    for section in sections:
        name = section.h3.text
        products = section.find_all("button", attrs={"class": "product-cell__content-link"})
        section_results = [parse_product(category=category, subcategory=name, prod=product) for product in products]
        results.extend(section_results)
    
    return results

In [7]:
def products2pandas(products_list):
    """
    
    """
    cols = ["Category", "Subcategory", "Product", "Specs", "Price", "img_link"]
    df = pd.DataFrame(data=products_list, columns=cols)
    return df

In [8]:
def get_category_page(webdriver, link, secs):
    """
    
    """
    webdriver.get(link)
    time.sleep(secs)
    if link[-3:] == webdriver.current_url[-3:]:
        return webdriver
    else:
        return None

In [9]:
def main(start_page, end_page):
    """
    
    """
    driver = "../bin/chromedriver"
    url = "https://tienda.mercadona.es/"
    categories_url = "https://tienda.mercadona.es/categories/"
    
    browser = setting_up_webdriver(driver, url, 2)
    ctgry_list = make_categories_list(categories_url, start_page, end_page)
    
    final_results, not_found = [], []
    for link in ctgry_list:
        current_page = get_category_page(browser, link, 2)
        if current_page is not None: 
            soup = get_html(current_page)
            results = get_raw_data(soup)
            final_results.extend(results)
        else:
            not_found.append(link)
    
    browser.close()
    browser.quit()
    
    df = products2pandas(final_results)
    return df

In [10]:
df = main(1, 300)
df # page 31 does not work
# Explicit wait https://www.youtube.com/watch?v=sv0fKrzEOKQ

AttributeError: 'NoneType' object has no attribute 'text'

In [None]:
browser = setting_up_webdriver(driver, url, 1)
ctgry_list = make_categories_list("https://tienda.mercadona.es/categories/", 111, 114)
ctgry_list

In [None]:
final_results = []
not_found = []
for link in ctgry_list:
    current_page = get_category_page(browser, link, 1)
    if current_page is not None: 
        soup = get_html(current_page)
        results = get_raw_data(soup)
        final_results.extend(results)
    else:
        not_found.append(link)

In [None]:
df = products2pandas(final_results)
df

In [None]:
not_found

In [None]:
browser.close()
browser.quit()

In [None]:
soup = get_html(browser)
results = get_raw_data(soup)

In [None]:
results

In [None]:
url = "https://tienda.mercadona.es/categories/112"
browser.get(url)

In [None]:
browser.current_url

In [None]:
category = soup.find("div", attrs={"class": "category-detail__name"}).h3.text
category

In [None]:
sections = soup.find_all("section", attrs={"class": "section"})
len(sections)

In [None]:
results = []
for section in sections:
    name = section.h3.text
    products = section.find_all("button", attrs={"class": "product-cell__content-link"})
    section_results = [parse_product(category=category, subcategory=name, prod=product) for product in products]
    results.extend(section_results)

In [None]:
results

In [None]:
sections[0].h3.text

In [None]:
products = sections[0].find_all("button", attrs={"class": "product-cell__content-link"})

In [None]:
products[0]

In [None]:
products[0].img["src"]
products[0].h4.text
specs = products[0].find_all("span", attrs={"class": "footnote1-r"})
specs = "".join([spec.text for spec in specs])
specs
price = products[0].find("div", attrs={"class": "product-price"}).p.text
price