In [21]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import bs4
import requests
import time
import datetime
import pandas as pd

from IPython.core.interactiveshell import InteractiveShell
# Notebook display setting: with "all", IPython echoes the value of every
# top-level expression in a cell instead of only the last one.
InteractiveShell.ast_node_interactivity = "all"

In [22]:
def setting_up_webdriver(driver_path, url, secs, postal_code="28004"):
    """
    Create a Selenium Chrome browser, open `url` and submit a postal code.

    Parameters:
        driver_path: String. Google Chrome webdriver file path.
        url: String. Mercadona webpage.
        secs: Integer. Wait time (seconds) after handling the cookie banner
            and again after submitting the postal code.
        postal_code: String. Value typed into the "postalCode" field.
            Default "28004", the value that was previously hard-coded.

    Return:
        Selenium browser object.

    Raises:
        The lookups of the postal code field and its submit button are not
        guarded, so any error they raise propagates to the caller.
    """
    # Local import: keeps this function self-contained without touching the
    # file-level import list.
    from selenium.common.exceptions import WebDriverException

    # Loading webdriver
    browser = webdriver.Chrome(driver_path)
    browser.get(url)

    # Accepting cookies. This is deliberately best effort (the banner may be
    # absent), but only Selenium errors are ignored so that Ctrl-C and
    # programming errors are no longer silently swallowed.
    try:
        btn_cookies = browser.find_element_by_css_selector(
            "button.ui-button.ui-button--small.ui-button--primary.ui-button--positive"
        )
        btn_cookies.click()
    except WebDriverException:
        pass
    time.sleep(secs)

    # Adding postal code
    pst = browser.find_element_by_name("postalCode")
    btn = browser.find_element_by_css_selector("button.button.button-primary.button-big")
    pst.send_keys(postal_code)
    btn.click()
    time.sleep(secs)

    return browser

In [23]:
def make_categories_list(root_url, start, end=None):
    """
    Given a root url, append each number of the range start..end to it.
    E.g.: ['nico.com/1', 'nico.com/2', 'nico.com/3', 'nico.com/4', 'nico.com/5']

    Parameters:
        root_url: String. Base url.
        start: Integer. First value of the range.
        end: Integer. Last value of the range (inclusive). Default None.
            If end is None, returns a list with a single item built from start.

    Return:
        List of urls. Empty when end is smaller than start.
        E.g.: make_categories_list("www.test.com/", 1, 3)
        ['www.test.com/1', 'www.test.com/2', 'www.test.com/3']
    """
    # Identity check: `None` is a singleton, and `==` could be hijacked by a
    # custom __eq__ on the argument.
    if end is None:
        return [root_url + str(start)]
    return [root_url + str(x) for x in range(start, end + 1)]

In [24]:
def get_html(webdriver):
    """
    Given a webdriver object, return its full raw html in a BeautifulSoup object

    Parameters:
        webdriver: Selenium webdriver object. Note that inside this function
            the parameter name hides the imported selenium `webdriver` module.

    Returns:
        BeautifulSoup object
    """

    html = webdriver.page_source
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed (and warns), so results could differ
    # between machines. "html.parser" ships with Python.
    soup = bs4.BeautifulSoup(html, "html.parser")
    return soup

In [25]:
def parse_product(category, subcategory, prod):
    """
    Extract the features of one product from its Beautiful Soup button element.

    Parameters:
        category: String. Category the product belongs to.
        subcategory: String. Subcategory the product belongs to.
        prod: Beautiful Soup object holding the product markup.

    Returns:
        List with six items, in this order:
        [category, subcategory, product name, specs, price, image link]. E.g.:
        ['Aceite, vinagre y sal', 'Aceite de oliva', 'Aceite de oliva 0,4º Hacendado', 'Garrafa 5 L','10,60 €',
        'https://prod-mercadona.imgix.net/images/236a405e60648cf189d4a4b3a3403983.jpg?fit=crop&h=300&w=300']
    """

    image_url = prod.img["src"]
    title = prod.h4.text

    # Every "footnote1-r" span contributes to the specs text; the pieces are
    # concatenated without a separator.
    spec_spans = prod.find_all("span", attrs={"class": "footnote1-r"})
    spec_text = "".join(span.text for span in spec_spans)

    # The price is the text of the <p> inside the "product-price" div.
    price_box = prod.find("div", attrs={"class": "product-price"})
    price_text = price_box.p.text

    return [category, subcategory, title, spec_text, price_text, image_url]

In [26]:
def get_category_page(webdriver, link, secs):
    """
    Open a category link and report whether the browser stayed on it.

    Parameters:
        webdriver: Selenium object.
        link: String. Link to open. Not found links return a generic not found page.
        secs: Integer. Wait time in seconds after requesting the link.

    Return:
        The same Selenium object when the last three characters of the
        browser's current url match the last three characters of `link`;
        None otherwise (treated as a not found page).
    """

    webdriver.get(link)
    time.sleep(secs)

    # Only the trailing three characters are compared.
    # NOTE(review): presumably a not-found page ends up on a different url;
    # confirm against the site.
    requested_tail = link[-3:]
    landed_tail = webdriver.current_url[-3:]
    if requested_tail != landed_tail:
        return None
    return webdriver

In [27]:
def get_raw_data(soup):
    """
    Return a list of lists. Each element represents one item of the current page.

    Parameters:
        soup: BeautifulSoup object. Current raw HTML to parse.

    Return:
        List of lists. Each element of the list comes from the parse_product function.
        E.g.: [['Lechuga y ensalada preparada', 'Lechuga', 'Lechuga iceberg', 'Pieza 1 ud.', '0,79 €',
        'https://prod-mercadona.imgix.net/images/e047afab489d1c6a9ffc9e7da122ce3f.jpg?fit=crop&h=300&w=300'],
        ...]

    Raises:
        AttributeError: if the category name element, or the <h3> of a section
            that is iterated, is missing from the page.
    """

    # Fetch main category.
    category = soup.find("div", attrs={"class": "category-detail__name"}).h3.text

    # Fetch subcategories.
    sections = soup.find_all("section", attrs={"class": "section"})
    try:
        sections[0].h3.text  # probing the web page arrangement
    # Some pages have a different arrangement, like https://tienda.mercadona.es/categories/31
    # IndexError covers the case where no <section class="section"> exists at
    # all; AttributeError the case where the first one has no <h3>.
    except (AttributeError, IndexError):
        sections = soup.find_all("div", attrs={"class": "category-section"})

    # Fetch every product and its features.
    results = []
    for section in sections:
        name = section.h3.text
        products = section.find_all("button", attrs={"class": "product-cell__content-link"})
        results.extend(
            parse_product(category=category, subcategory=name, prod=product)
            for product in products
        )

    return results

In [28]:
def products2pandas(products_list):
    """
    Build a data frame from a list of parsed products.

    Parameters:
        products_list: List. List coming from get_raw_data function; each
            item supplies one row with six values.

    Return:
        Pandas data frame whose columns are, in order:
        "Category", "Subcategory", "Product", "Specs", "Price", "img_link"
    """

    column_names = [
        "Category",
        "Subcategory",
        "Product",
        "Specs",
        "Price",
        "img_link",
    ]
    return pd.DataFrame(products_list, columns=column_names)

In [29]:
def save_df(df, path):
    """
    Write the products data frame to `path` as csv and print a confirmation.

    Parameters:
        df: Pandas data frame.
        path: String. Relative or Absolute path where the file should be stored.

    Returns:
        None. The csv is written without the index column.
    """
    df.to_csv(path, index=False)
    confirmation = f"Dataframe saved at {path}"
    print(confirmation)

In [30]:
def main(start_page, end_page=None):
    """
    Parse Mercadona webpage returning a pandas dataframe with all available products and its features.

    Parameters:
        start_page: Integer. First category number to visit.
        end_page: Integer. Last category number to visit (inclusive).
            Default None, in which case only start_page is visited.

    Return:
        Pandas data frame with one row per product, plus a leading
        "Supermarket" column and a trailing "Time" column (run date,
        YYYY-MM-DD). The same data frame is also saved as
        ../data/<supermarket>_<run date>.csv.
    """
    driver = "../bin/chromedriver"
    url = "https://tienda.mercadona.es/"
    categories_url = "https://tienda.mercadona.es/categories/"
    supermarket = "Mercadona"

    # Build the link list first so bad arguments fail before a browser is started.
    ctgry_list = make_categories_list(categories_url, start_page, end_page)

    # Initializing webdriver
    browser = setting_up_webdriver(driver, url, 3)

    # Parsing webpage
    final_results, not_found = [], []
    try:
        for link in ctgry_list:
            current_page = get_category_page(browser, link, 2)
            if current_page is None:
                not_found.append(link)
                continue
            soup = get_html(current_page)
            final_results.extend(get_raw_data(soup))
    finally:
        # Terminating webdriver even when a page fails to parse, so no
        # browser process is left behind. quit() ends the whole session.
        browser.quit()

    if not_found:
        print(f"{len(not_found)} category pages not found and skipped")

    # Create a pandas df using the list of products as data
    df = products2pandas(final_results)

    # Add run time and supermarket to the data frame
    run_time = datetime.datetime.now().strftime("%Y-%m-%d")
    df.insert(loc=0, column="Supermarket", value=supermarket)
    df["Time"] = run_time

    # Save data frame as CSV
    save_df(df, f"../data/{supermarket}_{run_time}.csv")

    return df

In [31]:
# Scrape category pages 1 through 300 (both inclusive) and keep the result.
df = main(1, 300)

Dataframe saved at ../data/Mercadona_2021-01-09.csv
