In [None]:
!pip install -r requirements.txt

In [10]:
from os import environ
from dotenv import load_dotenv
from pprint import pprint
from time import sleep
from datetime import datetime
from pymongo import MongoClient

# Requests Web Scraping
import requests
from bs4 import BeautifulSoup

# Scraping Interactuando con Peticiones HTTP

Tecnologías:  
- `Requests`: simula peticiones que hace un navegador
- `BeautifulSoup`: parser de HTML  

Estas librerías nos permiten realizar scraping de manera más directa y eficiente en comparación con Selenium, ya que no requieren la automatización de un navegador completo. Solo replicamos las peticiones HTTP que hace el navegador.

Para ver las peticiones que queremos replicar, podemso monitorear la pestaña de `Network` del navegador.

Si bien, en `requests` podemos hacer un llamado nuevo haciendo simplemente

```python
requests.get(url)
```

No es recomendable si vamos a hacer multiples llamado en el sitio y queremos consistencia entre los llamados.


Es preferible hacer uso de una sesión y luego hacer un get con esa sesión

```python
session = requests.Session()
```

In [2]:
session = requests.Session()
response = session.get('https://webscraper.io/test-sites/e-commerce/allinone')
response.status_code

200

Luego queremos parsear el la respuesta por `BeautifulSoup`, ya que esta librería nos permite interactuar facilmente con HTML

In [3]:
html_content = response.text
soup = BeautifulSoup(html_content, 'html.parser')

In [4]:
# Encontramos todos los elementos con la clase "row"
all_rows = soup.find_all(class_='row')

# Seleccionamos la tercera fila (índice 2)
row_with_items = all_rows[2]

# Encontramos todos los elementos <a> dentro de esa fila
urls = row_with_items.find_all('a')

# Imprimimos el texto de cada elemento <a>
print([element.text for element in urls])

['\n\t\t\t\t\t\tSony Xperia\n\t\t\t\t\t', '\n\t\t\t\t\t\tDell Latitude...\n\t\t\t\t\t', '\n\t\t\t\t\t\tApple MacBook...\n\t\t\t\t\t']


In [5]:
href= urls[0]["href"]
url = f'https://webscraper.io{href}'
url

'https://webscraper.io/test-sites/e-commerce/allinone/product/5'

In [6]:
response = session.get(url)

In [7]:
response.content

b'<!DOCTYPE html>\n<html lang="en">\n<head>\n\t<!-- Google Tag Manager -->\n<script nonce="8NG09fbDhHEjdagmNR4Oa2goFC0Kno3t">(function (w, d, s, l, i) {\n\t\tw[l] = w[l] || [];\n\t\tw[l].push({\n\t\t\t\'gtm.start\':\n\t\t\t\tnew Date().getTime(), event: \'gtm.js\'\n\t\t});\n\t\tvar f = d.getElementsByTagName(s)[0],\n\t\t\tj = d.createElement(s), dl = l != \'dataLayer\' ? \'&l=\' + l : \'\';\n\t\tj.async = true;\n\t\tj.src =\n\t\t\t\'https://www.googletagmanager.com/gtm.js?id=\' + i + dl;\n\t\tf.parentNode.insertBefore(j, f);\n\t})(window, document, \'script\', \'dataLayer\', \'GTM-NVFPDWB\');</script>\n<!-- End Google Tag Manager -->\n\t<title>Web Scraper - The #1 web scraping extension</title>\n\t<meta charset="utf-8">\n\t<meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">\n\n\t<meta name="keywords"\n\t\t  content="web scraping,Web Scraper,Chrome extension,Crawling,Cross platform scraper"/>\n\t<meta name="description"\n\t\t  content="The most popular web scraping extension.

In [None]:
# Parseamos el contenido HTML de la primera URL
soup_first_url = BeautifulSoup(response.text, 'html.parser')

# Extraemos los detalles del producto
record = {}
record["product_name"] = soup_first_url.find(class_='title').get_text(strip=True)
record["product_description"] = soup_first_url.find(class_='description').get_text(strip=True)
record["product_price"] = float(soup_first_url.find(class_='price').get_text(strip=True).strip("$"))
record["timestamp"] = datetime.now()

record

{'product_name': 'Sony Xperia',
 'product_description': 'GPS, waterproof',
 'product_price': 118.99,
 'timestamp': datetime.datetime(2025, 6, 5, 0, 29, 49, 463978)}

Podemos crear un funcion que haga scraping de la pagina de un producto.
Esta vez solo pasamos el `url` de la pagina del producto.

In [16]:
GRUPO = "X"
def scrap_product_page(session: requests.Session, url: str):
    response = session.get(url)
    # Parseamos el contenido HTML de la primera URL
    soup_first_url = BeautifulSoup(response.text, 'html.parser')

    # Extraemos los detalles del producto
    record = {}
    
    record["group_name"] = f"GRUPO-{GRUPO}"
    record["method"] = "requests"
    record["product_name"] = soup_first_url.find(class_='title').get_text(strip=True)
    record["product_description"] = soup_first_url.find(class_='description').get_text(strip=True)
    record["product_price"] = float(soup_first_url.find(class_='price').get_text(strip=True).strip("$"))
    record["timestamp"] = datetime.now()

    return record

scrap_product_page(session=session, url=url)

{'group_name': 'GRUPO-X',
 'method': 'requests',
 'product_name': 'Iphone',
 'product_description': 'Silver',
 'product_price': 899.99,
 'timestamp': datetime.datetime(2025, 6, 5, 0, 30, 55, 766608)}

Ahora itentemos scrapear todos los productos de telefonos/touch

In [17]:
# Vamos a la pagina de telefonos touch
response = session.get('https://webscraper.io/test-sites/e-commerce/allinone/phones/touch')
sleep(3)


# Parse the HTML content
soup = BeautifulSoup(response.content, 'html.parser')

# Find all <a> tags within the desired elements
all_rows = soup.find_all(class_="row")
row_with_items = all_rows[2]
tag_a_elements = row_with_items.find_all('a', href=True)

# Extract URLs
urls = [f'https://webscraper.io{element["href"]}' for element in tag_a_elements]

# List to store results
results = []

# Scraping each URL
for url in urls:
    result = scrap_product_page(session=session, url=url)
    results.append(result)
    pprint(results)
    sleep(1)

results

[{'group_name': 'GRUPO-X',
  'method': 'requests',
  'product_description': '7 day battery',
  'product_name': 'Nokia 123',
  'product_price': 24.99,
  'timestamp': datetime.datetime(2025, 6, 5, 0, 31, 2, 373676)}]
[{'group_name': 'GRUPO-X',
  'method': 'requests',
  'product_description': '7 day battery',
  'product_name': 'Nokia 123',
  'product_price': 24.99,
  'timestamp': datetime.datetime(2025, 6, 5, 0, 31, 2, 373676)},
 {'group_name': 'GRUPO-X',
  'method': 'requests',
  'product_description': '3.2" screen',
  'product_name': 'LG Optimus',
  'product_price': 57.99,
  'timestamp': datetime.datetime(2025, 6, 5, 0, 31, 3, 557756)}]
[{'group_name': 'GRUPO-X',
  'method': 'requests',
  'product_description': '7 day battery',
  'product_name': 'Nokia 123',
  'product_price': 24.99,
  'timestamp': datetime.datetime(2025, 6, 5, 0, 31, 2, 373676)},
 {'group_name': 'GRUPO-X',
  'method': 'requests',
  'product_description': '3.2" screen',
  'product_name': 'LG Optimus',
  'product_price':

[{'group_name': 'GRUPO-X',
  'method': 'requests',
  'product_name': 'Nokia 123',
  'product_description': '7 day battery',
  'product_price': 24.99,
  'timestamp': datetime.datetime(2025, 6, 5, 0, 31, 2, 373676)},
 {'group_name': 'GRUPO-X',
  'method': 'requests',
  'product_name': 'LG Optimus',
  'product_description': '3.2" screen',
  'product_price': 57.99,
  'timestamp': datetime.datetime(2025, 6, 5, 0, 31, 3, 557756)},
 {'group_name': 'GRUPO-X',
  'method': 'requests',
  'product_name': 'Samsung Galaxy',
  'product_description': '5 mpx. Android 5.0',
  'product_price': 93.99,
  'timestamp': datetime.datetime(2025, 6, 5, 0, 31, 4, 646087)},
 {'group_name': 'GRUPO-X',
  'method': 'requests',
  'product_name': 'Nokia X',
  'product_description': 'Andoid, Jolla dualboot',
  'product_price': 109.99,
  'timestamp': datetime.datetime(2025, 6, 5, 0, 31, 5, 735545)},
 {'group_name': 'GRUPO-X',
  'method': 'requests',
  'product_name': 'Sony Xperia',
  'product_description': 'GPS, waterpro

Finalmente lo guardamos en una base de datos

In [18]:
# Load .env file
load_dotenv()

True

In [19]:
# Connect to MongoDB
client = MongoClient(environ["DB_CONNECTION_STRING"])

# Specify the database and collection
db = client["AYPMD"]
collection = db["SCRAPING_TEST"]

In [20]:
collection.insert_many(results)

InsertManyResult([ObjectId('68411d909bfe6368c214f63a'), ObjectId('68411d909bfe6368c214f63b'), ObjectId('68411d909bfe6368c214f63c'), ObjectId('68411d909bfe6368c214f63d'), ObjectId('68411d909bfe6368c214f63e'), ObjectId('68411d909bfe6368c214f63f'), ObjectId('68411d909bfe6368c214f640'), ObjectId('68411d909bfe6368c214f641'), ObjectId('68411d909bfe6368c214f642')], acknowledged=True)

In [21]:
# Define the query (bring all items of my group)
query = {}

# Execute the query and retrieve 3 documents
query_results = collection.find(query).limit(3)
list(query_results)

[{'_id': ObjectId('68411bf3b2b2cde1d7f87d5b'),
  'group_name': 'GRUPO-X',
  'product_name': 'Nokia 123',
  'product_description': '7 day battery',
  'product_price': 24.99,
  'timestamp': datetime.datetime(2025, 6, 5, 0, 24, 1, 527000)},
 {'_id': ObjectId('68411bf3b2b2cde1d7f87d5c'),
  'group_name': 'GRUPO-X',
  'product_name': 'LG Optimus',
  'product_description': '3.2" screen',
  'product_price': 57.99,
  'timestamp': datetime.datetime(2025, 6, 5, 0, 24, 3, 324000)},
 {'_id': ObjectId('68411bf3b2b2cde1d7f87d5d'),
  'group_name': 'GRUPO-X',
  'product_name': 'Samsung Galaxy',
  'product_description': '5 mpx. Android 5.0',
  'product_price': 93.99,
  'timestamp': datetime.datetime(2025, 6, 5, 0, 24, 5, 60000)}]

In [16]:
client.close()