In [None]:
!pip install -r requirements.txt

In [2]:
from os import environ
from dotenv import load_dotenv
from pprint import pprint
from time import sleep
from pymongo import MongoClient

# Selenium Web Scraping
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

# Scraping Simulando Interacción Humana

Tecnologías:  
- `Selenium` (Python / Javascript)
- `Puppeteer` (Javacript)


Estas librerías emplean un `Driver` del navegador que queremos utilizar para la automatización.

En este taller, utilizaremos `Google Chrome`, por lo tanto debemos instalar el driver adecuado.

Afortunadamente existen librerías como `Webdriver Manager` que nos permiten instalar el driver un solo comando en Python.

Vamos a crear una función que nos permita instanciar el driver para nuestra automatización. `Headless` es una opción que permite ocular el navegador. Se utiliza bastante en ambientes en `donde no hay interfaz gráfica`, como por ejemplo una máquina en `EC2` o un `Función Lambda`


In [3]:
def get_webdriver(headless: bool = False) -> webdriver:
    options = Options()

    if headless:
        options.add_argument("--headless")
        options.add_argument("--window-size=1920,1080")

    return webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=options)

In [4]:
driver = get_webdriver()

/bin/sh: line 1: google-chrome: command not found
/bin/sh: line 1: google-chrome: command not found


In [5]:
driver.get('https://webscraper.io/test-sites/e-commerce/allinone')

In [6]:
all_rows = driver.find_elements(by=By.CLASS_NAME, value="row")
row_with_items = all_rows[2]
urls = row_with_items.find_elements(by=By.TAG_NAME, value="a")

print([element.text for element in urls])

['Dell Inspiron...', 'Acer Predator...', 'Asus VivoBook...']


In [7]:
first_url = urls[0].get_attribute('href')
driver.get(first_url)

In [8]:
record = {}

record["product_name"] = driver.find_element(by=By.CLASS_NAME, value="title").text
record["product_description"] = driver.find_element(by=By.CLASS_NAME, value="description").text
record["product_price"] = driver.find_element(by=By.CLASS_NAME, value="price").text

record

{'product_name': 'Dell Inspiron 15 (7567) Black',
 'product_description': 'Dell Inspiron 15 (7567) Black, 15.6" FHD, Core i7-7700HQ, 8GB, 1TB, GeForce GTX 1050 Ti 4GB, Linux + Windows 10 Home',
 'product_price': '$1144.2'}

Podemos crear un funcion que haga scraping de la pagina de un producto.
Pasamos el `driver` y el `url` de la pagina del producto

In [9]:
GRUPO = "X"
def scrap_product_page(driver: webdriver, url: str):
    driver.get(url)
    record = {}

    record["group_name"] = f"GRUPO-{GRUPO}"
    record["product_name"] = driver.find_element(by=By.CLASS_NAME, value="title").text
    record["product_description"] = driver.find_element(by=By.CLASS_NAME, value="description").text
    record["product_price"] = driver.find_element(by=By.CLASS_NAME, value="price").text

    return record

Ahora itentemos scrapear todos los productos de telefonos/touch

In [10]:
# Vamos a la pagina de telefonos touch
driver.get('https://webscraper.io/test-sites/e-commerce/allinone/phones/touch')
sleep(3)

results = []

# Iteramos por cada producto y lo guardamos en results
all_rows = driver.find_elements(by=By.CLASS_NAME, value="row")
row_with_items = all_rows[2]
tag_a_elements = row_with_items.find_elements(by=By.TAG_NAME, value="a")
urls = [element.get_attribute('href') for element in tag_a_elements]

for url in urls:
    result = scrap_product_page(driver=driver, url=url)
    results.append(result)
    pprint(results)
    sleep(1)


[{'group_name': 'GRUPO-X',
  'product_description': '7 day battery',
  'product_name': 'Nokia 123',
  'product_price': '$24.99'}]
[{'group_name': 'GRUPO-X',
  'product_description': '7 day battery',
  'product_name': 'Nokia 123',
  'product_price': '$24.99'},
 {'group_name': 'GRUPO-X',
  'product_description': '3.2" screen',
  'product_name': 'LG Optimus',
  'product_price': '$57.99'}]
[{'group_name': 'GRUPO-X',
  'product_description': '7 day battery',
  'product_name': 'Nokia 123',
  'product_price': '$24.99'},
 {'group_name': 'GRUPO-X',
  'product_description': '3.2" screen',
  'product_name': 'LG Optimus',
  'product_price': '$57.99'},
 {'group_name': 'GRUPO-X',
  'product_description': '5 mpx. Android 5.0',
  'product_name': 'Samsung Galaxy',
  'product_price': '$93.99'}]
[{'group_name': 'GRUPO-X',
  'product_description': '7 day battery',
  'product_name': 'Nokia 123',
  'product_price': '$24.99'},
 {'group_name': 'GRUPO-X',
  'product_description': '3.2" screen',
  'product_name

Finalmente lo guardamos en una base de datos

In [11]:
# Load .env file
load_dotenv()

True

In [12]:
# Connect to MongoDB
client = MongoClient(environ["DB_CONNECTION_STRING"])

# Specify the database and collection
db = client["AYPMD"]
collection = db["SCRAPING_TEST"]

In [13]:
collection.insert_many(results)

InsertManyResult([ObjectId('664e75316b6b09b550979e69'), ObjectId('664e75316b6b09b550979e6a'), ObjectId('664e75316b6b09b550979e6b'), ObjectId('664e75316b6b09b550979e6c'), ObjectId('664e75316b6b09b550979e6d'), ObjectId('664e75316b6b09b550979e6e'), ObjectId('664e75316b6b09b550979e6f'), ObjectId('664e75316b6b09b550979e70'), ObjectId('664e75316b6b09b550979e71')], acknowledged=True)

In [14]:
# Define the query (bring all items of my group)
query = {}

# Execute the query and retrieve 3 documents
query_results = collection.find(query).limit(3)
list(query_results)

[{'_id': ObjectId('664e5fe6b3469a3526b134cb'),
  'product_name': 'Asus VivoBook X441NA-GA190',
  'product_description': 'Asus VivoBook X441NA-GA190 Chocolate Black, 14", Celeron N3450, 4GB, 128GB SSD, Endless OS, ENG kbd',
  'product_price': '$295.99'},
 {'_id': ObjectId('664e6240ad2d7c27c743f966'),
  'group_name': 'GRUPO-X',
  'product_name': 'Nokia 123',
  'product_description': '7 day battery',
  'product_price': '$24.99'},
 {'_id': ObjectId('664e6240ad2d7c27c743f967'),
  'group_name': 'GRUPO-X',
  'product_name': 'LG Optimus',
  'product_description': '3.2" screen',
  'product_price': '$57.99'}]

In [15]:
driver.close()
client.close()