In [None]:
!pip install -r requirements.txt

In [4]:
from os import environ
from dotenv import load_dotenv
from pprint import pprint
from time import sleep
from datetime import datetime
from pymongo import MongoClient

# Selenium Web Scraping
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

# Scraping Simulando Interacción Humana

Tecnologías:  
- `Selenium` (Python / Javascript)
- `Puppeteer` (Javacript)


Estas librerías emplean un `Driver` del navegador que queremos utilizar para la automatización.

En este taller, utilizaremos `Google Chrome`, por lo tanto debemos instalar el driver adecuado.

Afortunadamente existen librerías como `Webdriver Manager` que nos permiten instalar el driver un solo comando en Python.

Vamos a crear una función que nos permita instanciar el driver para nuestra automatización. `Headless` es una opción que permite ocular el navegador. Se utiliza bastante en ambientes en `donde no hay interfaz gráfica`, como por ejemplo una máquina en `EC2` o un `Función Lambda`


In [5]:
def get_webdriver(headless: bool = False) -> webdriver:
    options = Options()

    if headless:
        options.add_argument("--headless")
        options.add_argument("--window-size=1920,1080")

    return webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=options)

In [6]:
driver = get_webdriver()

In [7]:
driver.get('https://webscraper.io/test-sites/e-commerce/allinone')

In [8]:
all_rows = driver.find_elements(by=By.CLASS_NAME, value="row")
row_with_items = all_rows[2]
urls = row_with_items.find_elements(by=By.TAG_NAME, value="a")

print([element.text for element in urls])

['Sony Xperia', 'Dell Latitude...', 'Apple MacBook...']


In [9]:
first_url = urls[0].get_attribute('href')
driver.get(first_url)

In [10]:
record = {}

record["product_name"] = driver.find_element(by=By.CLASS_NAME, value="title").text
record["product_description"] = driver.find_element(by=By.CLASS_NAME, value="description").text
record["product_price"] = float(driver.find_element(by=By.CLASS_NAME, value="price").text.strip("$"))
record["timestamp"] = datetime.now()

record

{'product_name': 'Sony Xperia',
 'product_description': 'GPS, waterproof',
 'product_price': 118.99,
 'timestamp': datetime.datetime(2025, 6, 5, 0, 32, 43, 997349)}

Podemos crear un funcion que haga scraping de la pagina de un producto.
Pasamos el `driver` y el `url` de la pagina del producto

In [11]:
GRUPO = "X"
def scrap_product_page(driver: webdriver, url: str):
    driver.get(url)
    record = {}

    record["group_name"] = f"GRUPO-{GRUPO}"
    record["method"] = "selenium"
    record["product_name"] = driver.find_element(by=By.CLASS_NAME, value="title").text
    record["product_description"] = driver.find_element(by=By.CLASS_NAME, value="description").text
    record["product_price"] = float(driver.find_element(by=By.CLASS_NAME, value="price").text.strip("$"))
    record["timestamp"] = datetime.now()

    return record

Ahora itentemos scrapear todos los productos de telefonos/touch

In [12]:
# Vamos a la pagina de telefonos touch
driver.get('https://webscraper.io/test-sites/e-commerce/allinone/phones/touch')
sleep(3)

results = []

# Iteramos por cada producto y lo guardamos en results
all_rows = driver.find_elements(by=By.CLASS_NAME, value="row")
row_with_items = all_rows[2]
tag_a_elements = row_with_items.find_elements(by=By.TAG_NAME, value="a")
urls = [element.get_attribute('href') for element in tag_a_elements]

for url in urls:
    result = scrap_product_page(driver=driver, url=url)
    results.append(result)
    pprint(results)
    sleep(1)


[{'group_name': 'GRUPO-X',
  'method': 'selenium',
  'product_description': '7 day battery',
  'product_name': 'Nokia 123',
  'product_price': 24.99,
  'timestamp': datetime.datetime(2025, 6, 5, 0, 32, 47, 599204)}]
[{'group_name': 'GRUPO-X',
  'method': 'selenium',
  'product_description': '7 day battery',
  'product_name': 'Nokia 123',
  'product_price': 24.99,
  'timestamp': datetime.datetime(2025, 6, 5, 0, 32, 47, 599204)},
 {'group_name': 'GRUPO-X',
  'method': 'selenium',
  'product_description': '3.2" screen',
  'product_name': 'LG Optimus',
  'product_price': 57.99,
  'timestamp': datetime.datetime(2025, 6, 5, 0, 32, 48, 802494)}]
[{'group_name': 'GRUPO-X',
  'method': 'selenium',
  'product_description': '7 day battery',
  'product_name': 'Nokia 123',
  'product_price': 24.99,
  'timestamp': datetime.datetime(2025, 6, 5, 0, 32, 47, 599204)},
 {'group_name': 'GRUPO-X',
  'method': 'selenium',
  'product_description': '3.2" screen',
  'product_name': 'LG Optimus',
  'product_pri

Finalmente lo guardamos en una base de datos

In [13]:
# Load .env file
load_dotenv()

True

In [14]:
# Connect to MongoDB
client = MongoClient(environ["DB_CONNECTION_STRING"])

# Specify the database and collection
db = client["AYPMD"]
collection = db["SCRAPING_TEST"]

In [15]:
collection.insert_many(results)

InsertManyResult([ObjectId('68411dfa2cb2d0b9548f82e1'), ObjectId('68411dfa2cb2d0b9548f82e2'), ObjectId('68411dfa2cb2d0b9548f82e3'), ObjectId('68411dfa2cb2d0b9548f82e4'), ObjectId('68411dfa2cb2d0b9548f82e5'), ObjectId('68411dfa2cb2d0b9548f82e6'), ObjectId('68411dfa2cb2d0b9548f82e7'), ObjectId('68411dfa2cb2d0b9548f82e8'), ObjectId('68411dfa2cb2d0b9548f82e9')], acknowledged=True)

In [16]:
# Define the query (bring all items of my group)
query = {}

# Execute the query and retrieve 3 documents
query_results = collection.find(query).limit(3)
list(query_results)

[{'_id': ObjectId('68411d909bfe6368c214f63a'),
  'group_name': 'GRUPO-X',
  'method': 'requests',
  'product_name': 'Nokia 123',
  'product_description': '7 day battery',
  'product_price': 24.99,
  'timestamp': datetime.datetime(2025, 6, 5, 0, 31, 2, 373000)},
 {'_id': ObjectId('68411d909bfe6368c214f63b'),
  'group_name': 'GRUPO-X',
  'method': 'requests',
  'product_name': 'LG Optimus',
  'product_description': '3.2" screen',
  'product_price': 57.99,
  'timestamp': datetime.datetime(2025, 6, 5, 0, 31, 3, 557000)},
 {'_id': ObjectId('68411d909bfe6368c214f63c'),
  'group_name': 'GRUPO-X',
  'method': 'requests',
  'product_name': 'Samsung Galaxy',
  'product_description': '5 mpx. Android 5.0',
  'product_price': 93.99,
  'timestamp': datetime.datetime(2025, 6, 5, 0, 31, 4, 646000)}]

In [17]:
driver.close()
client.close()