## B) Realizar web scraping

```
Realizar web scraping con por lo menos 100 repuestos de la página a continuación. Explicar la metodología y el código utilizado.
https://lamartine.cl/
```

In [1]:
import os
import random
import time

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [2]:
# Paginated catalogue URL; `page_num` is filled in with str.format().
URL = "https://lamartine.cl/productos/page/{page_num}/"

# Product grid: the link (<a>) directly inside each product <li> of a catalogue page.
CSS_SELECTOR_PRODUCT_GRID_ITEM = "li[class*='product type-product'] > a"

# Selectors below are applied to a single product's detail page.
# Title
CSS_SELECTOR_PRODUCT_TITLE = "div[class='summary entry-summary'] > h1[class='product_title entry-title']"
# SKU
CSS_SELECTOR_PRODUCT_SKU = "span > span[class='sku']"
# Price: original (the struck-through <del> amount)
CSS_SELECTOR_PRODUCT_PRICE_ORIGINAL = "del > span > bdi"
# Price: current (the <ins> amount)
CSS_SELECTOR_PRODUCT_PRICE_CURRENT = "ins > span > bdi"
# Stock
CSS_SELECTOR_PRODUCT_STOCK = "div > p[class*='stock']" # Text looks like "34 disponibles"; the number is the first space-separated token.
# Categories
CSS_SELECTOR_PRODUCT_CATEGORIES = "div > span[class='posted_in'] > a" # A product can have several categories, so query this with find_elements (plural).
# Description
CSS_SELECTOR_PRODUCT_DESCRIPTION = "div[role='tabpanel'][id='tab-description'] > p"

#### Get driver

In [3]:
def initialize_driver(headless=False):
    """Start a Chrome WebDriver session and return it.

    The chromedriver executable is looked up at the relative path
    ``driver/chromedriver-win64/chromedriver.exe``.

    Args:
        headless: When true, Chrome is additionally started with the
            ``--headless`` and ``--disable-gpu`` flags.

    Returns:
        The ``webdriver.Chrome`` instance that was created.
    """
    # Flags that are always applied, in the order they are passed to Chrome.
    flags = [
        "--disable-notifications",
        "--no-sandbox",
        "--disable-dev-shm-usage",
    ]
    # For images (left disabled): "--blink-settings=imagesEnabled=false"

    if headless:
        flags += ["--headless", "--disable-gpu"]

    chrome_options = Options()
    for flag in flags:
        chrome_options.add_argument(flag)

    driver_binary = os.path.join("driver", "chromedriver-win64", "chromedriver.exe")
    return webdriver.Chrome(
        service=Service(driver_binary),
        options=chrome_options,
    )

#### Open/close tabs

In [4]:
def open_new_tab(driver, url=None):
    """Open a blank browser tab, focus it, and optionally load `url` in it.

    Args:
        driver: WebDriver-like object controlling the browser.
        url: Address to load in the new tab; nothing is loaded when it is
            falsy (``None`` or an empty string).
    """
    driver.execute_script("window.open('');")

    # The tab that was just opened is the last handle in the list.
    newest_handle = driver.window_handles[-1]
    driver.switch_to.window(newest_handle)

    if not url:
        return
    driver.get(url)

def close_all_other_tabs(driver):
    """Close every tab except the first one and leave the first one focused.

    The handle list is captured once before anything is closed. Reading
    ``driver.window_handles`` again by index after each ``close()`` (as an
    index-based loop would) sees a shrinking list, which skips tabs and then
    raises ``IndexError`` as soon as three or more tabs are open.

    Args:
        driver: WebDriver-like object controlling the browser.
    """
    handles = list(driver.window_handles)
    if not handles:
        # No window left to return to; nothing to do.
        return

    main_handle = handles[0]
    for handle in handles[1:]:
        driver.switch_to.window(handle)
        driver.close()

    # Focus must be moved back explicitly after the focused tab was closed.
    driver.switch_to.window(main_handle)

#### Find element/elements

In [5]:
def find_element(element, by, value, timeout=10):
    """Wait until an element matching the locator is present and return it.

    Args:
        element: Search root handed to ``WebDriverWait`` (the scraper passes
            the driver itself).
        by: Locator strategy, e.g. ``By.CSS_SELECTOR``.
        value: Locator expression for that strategy.
        timeout: Maximum number of seconds to wait.

    Returns:
        Whatever ``presence_of_element_located`` yields once it succeeds.
    """
    locator = (by, value)
    wait = WebDriverWait(element, timeout)
    return wait.until(EC.presence_of_element_located(locator))

def find_elements(element, by, value, timeout=10):
    """Wait until at least one element matches the locator and return all matches.

    Args:
        element: Search root handed to ``WebDriverWait`` (the scraper passes
            the driver itself).
        by: Locator strategy, e.g. ``By.CSS_SELECTOR``.
        value: Locator expression for that strategy.
        timeout: Maximum number of seconds to wait.

    Returns:
        Whatever ``presence_of_all_elements_located`` yields once it succeeds.
    """
    locator = (by, value)
    wait = WebDriverWait(element, timeout)
    return wait.until(EC.presence_of_all_elements_located(locator))

#### Get product data (or empty string if it is not shown)

In [6]:
def _text_or_default(driver, css_selector, default="", timeout=10):
    """Return the text of the first element matching `css_selector`, or `default` on failure."""
    try:
        return find_element(driver, By.CSS_SELECTOR, css_selector, timeout).text
    except Exception:
        # `Exception` (not a bare except) so KeyboardInterrupt still stops the scrape.
        return default


def get_product_data(driver, timeout=10):
    """Scrape the fields of the product page currently loaded in `driver`.

    Every field is looked up independently; a field that cannot be found or
    parsed falls back to a default instead of aborting the whole product.
    Each lookup waits up to `timeout` seconds, so a page missing several
    fields takes correspondingly longer.

    Args:
        driver: WebDriver whose focused tab shows a product detail page.
        timeout: Seconds to wait for each individual field (default 10).

    Returns:
        dict with the keys ``title``, ``SKU``, ``price_original``,
        ``price_current`` and ``description`` (str, ``""`` when missing),
        ``stock`` (int, ``0`` when missing or not numeric) and
        ``categories`` (list of str, ``[]`` when missing).
    """
    product_dict = {}

    product_dict['title'] = _text_or_default(driver, CSS_SELECTOR_PRODUCT_TITLE, timeout=timeout)
    product_dict['SKU'] = _text_or_default(driver, CSS_SELECTOR_PRODUCT_SKU, timeout=timeout)
    product_dict['price_original'] = _text_or_default(driver, CSS_SELECTOR_PRODUCT_PRICE_ORIGINAL, timeout=timeout)
    product_dict['price_current'] = _text_or_default(driver, CSS_SELECTOR_PRODUCT_PRICE_CURRENT, timeout=timeout)

    try:
        # Stock text looks like "34 disponibles": keep the leading number.
        stock_text = find_element(driver, By.CSS_SELECTOR, CSS_SELECTOR_PRODUCT_STOCK, timeout).text
        product_dict['stock'] = int(stock_text.split(" ")[0])
    except Exception:
        # Element missing, or its first token is not a number.
        product_dict['stock'] = 0

    try:
        category_links = find_elements(driver, By.CSS_SELECTOR, CSS_SELECTOR_PRODUCT_CATEGORIES, timeout)
        product_dict['categories'] = [link.text for link in category_links]
    except Exception:
        product_dict['categories'] = []

    product_dict['description'] = _text_or_default(driver, CSS_SELECTOR_PRODUCT_DESCRIPTION, timeout=timeout)

    return product_dict


### Run the scraper

In [7]:
# Maximum number of catalogue pages to visit.
pages_to_scrape = 4

# Stop as soon as this many products have been collected.
# NOTE(review): the task statement asks for at least 100 products; this limit
# (and pages_to_scrape) presumably needs raising for the final run — confirm.
products_to_scrape = 15

# Accumulates one dict per scraped product.
products = []

In [8]:
driver = initialize_driver()

for page_num in range(1, pages_to_scrape + 1):
    if len(products) >= products_to_scrape:
        break

    # Load the paginated product grid.
    driver.get(URL.format(page_num=page_num))

    try:
        grid_links = find_elements(driver, By.CSS_SELECTOR, CSS_SELECTOR_PRODUCT_GRID_ITEM)
    except TimeoutException:
        # No product link appeared within the wait: treat it as the end of the catalogue.
        print(f"Page {page_num:3} has no products, stopping.")
        break

    # Read every href up front so no live element reference has to survive
    # the tab switching done below.
    product_urls = [link.get_attribute("href") for link in grid_links]

    for product_url in product_urls:
        print(f"Page {page_num:3} (product {(len(products)+1):4}/{products_to_scrape}) - URL: {product_url}.")

        # Open the product in its own tab so the grid page stays loaded.
        open_new_tab(driver, product_url)

        # Extract the data and keep it.
        products.append(get_product_data(driver))

        # Random pause between products to keep the request rate low.
        time.sleep(random.randint(2, 3))

        # Close the product tab and return to the grid.
        close_all_other_tabs(driver)

        if len(products) >= products_to_scrape:
            break


Page    1 (product   1/15) - URL: https://lamartine.cl/producto/aceite-atf6-acdelco-dexron-vi-acdelco/.
Page    1 (product   2/15) - URL: https://lamartine.cl/producto/aceite-de-caja-de-transferencia-946ml-auto-track-ii-acdelco/.
Page    1 (product   3/15) - URL: https://lamartine.cl/producto/aceite-0w20-sintetico-pennzoil-dexos-1-mopar/.
Page    1 (product   4/15) - URL: https://lamartine.cl/producto/aceite-0w20-sintetico-williams-dexos-1-946ml-usa/.
Page    1 (product   5/15) - URL: https://lamartine.cl/producto/aceite-0w40-sintetico-motores-srt-pennzoil/.
Page    1 (product   6/15) - URL: https://lamartine.cl/producto/aceite-10w40-1-ltra-semi-sintetico-acdelco/.
Page    1 (product   7/15) - URL: https://lamartine.cl/producto/aceite-10w40-4litros-semi-sintetico-acdelco/.
Page    1 (product   8/15) - URL: https://lamartine.cl/producto/aceite-10w40-4litros-sintetico-mando-korea/.
Page    1 (product   9/15) - URL: https://lamartine.cl/producto/aceite-15w40-diesel-mopar-946ml/.
Page    1

In [9]:
# Show the scraped records (one dict per product) as the cell output.
products

[{'title': 'ACEITE ATF+6 DEXRON VI ACDELCO',
  'SKU': 'AC010095',
  'price_original': '$16,000',
  'price_current': '$12,000',
  'stock': 55,
  'categories': ['Aceites', 'Lubricantes', 'Transmisión'],
  'description': 'AC010095  // Acdelco 19420007  //  10-9300  //'},
 {'title': 'ACEITE CAJA TRANSFERENCIA AUTO-TRACK II (1 LITRO) ACDELCO',
  'SKU': 'AC010200',
  'price_original': '$16,000',
  'price_current': '$12,000',
  'stock': 3,
  'categories': ['Aceites', 'Aditivos', 'Lubricantes', 'Transmisión'],
  'description': 'AC010200  //  88900402  //  10-4017'},
 {'title': 'ACEITE 0W20 SINTETICO PENNZOIL DEXOS 1 946ml',
  'SKU': 'AC010050',
  'price_original': '$22,000',
  'price_current': '$16,500',
  'stock': 0,
  'categories': ['Aceites', 'Lubricantes'],
  'description': '68152004PB //   AC010050  //  550036541  //  68152004PB'},
 {'title': 'ACEITE 0W20 SINTÉTICO WILLIAMS DEXOS 1 946ml USA',
  'SKU': 'AC010051',
  'price_original': '$13,000',
  'price_current': '$9,750',
  'stock': 14,


In [10]:
# Shut the browser down. quit() ends the whole WebDriver session (all windows
# plus the chromedriver process); close() would only close the focused window.
try:
    driver.quit()
except Exception as exc:
    # Best effort: the browser may already be gone, or was never started.
    print(f"Browser was not shut down cleanly: {exc}")