In [6]:
from datetime import datetime
import json
import re
import requests
import pandas as pd

# Base address of the shop; hrefs scraped from the home page are appended to it.
url = "https://www.elabastecedor.com.ar/"

# All patterns are raw strings so regex escapes such as \d reach the regex
# engine untouched instead of being treated as (invalid) string escapes.

# Captures (href, label) for every category link.
routes_pattern = r"<a href='([^']+)'><b>([^<]+)</b></a>"

# Captures (codigo, marca, nombre, id, precio) from each product form tag.
products_pattern = (
    r"<form data-codigo='([^']+)' data-marca='([^']+)' data-nombre='([^']+)' "
    r"data-id='([^']+)' data-precio='([^']+)' class='produItem' name='form1' method='post'>"
)

# Captures the href of the page link that immediately follows the active one.
pagination_pattern = r'class="active"> \d+ </a></li><li> <a href="([^"]+)"> \d+ </a>'

# Output files are prefixed with the run date (YYYYMMDD).
start_time = datetime.now()
date = f"{start_time:%Y%m%d}"
routes_filename = f"{date}_routes.json"
products_filename = f"{date}_products.csv"

# get routes

# A timeout keeps the script from blocking forever on an unresponsive server.
with requests.get(url, timeout=30) as response:
    response.raise_for_status()
    content = response.text
    # The home page may not set a cookie; .get avoids a KeyError in that case.
    cookie = response.headers.get("Set-Cookie")

# Each match is an (href, label) pair taken from a category link.
routes_results = re.findall(routes_pattern, content)

# Map the stripped category label to its absolute URL.
urls = {
    route_name.strip(): url + route_url
    for route_url, route_name in routes_results
}

# export routes

# Persist the response headers alongside the discovered routes.
with open(routes_filename, "w") as file:
    obj = dict(
        headers=dict(response.headers),
        urls=urls,
    )
    json.dump(obj, file)

# get products

# Headers sent with every product-page request; the cookie received from the
# home page is forwarded only when the server actually provided one.
headers = {}
if cookie:
    headers["Cookie"] = cookie
headers["Upgrade-Insecure-Requests"] = "1"

def get_products(route_name, route_url):
    """Collect the products listed under one category, following pagination.

    Args:
        route_name: Category label stored in each product's ``categoria`` key.
        route_url: Address of the first listing page. It may be absolute or
            relative to the module-level ``url``.

    Returns:
        A list of dicts with the keys ``id``, ``codigo``, ``marca``,
        ``nombre``, ``precio`` (float) and ``categoria``.

    Raises:
        requests.HTTPError: If a page responds with an error status.
        ValueError: If a scraped price cannot be converted to float.
    """
    # Imported locally so the function does not depend on a file-level import.
    from urllib.parse import urljoin

    products = []
    seen_pages = set()
    next_url = route_url

    # Iterate instead of recursing so long categories cannot exhaust the stack.
    while next_url is not None:
        # urljoin leaves absolute URLs untouched and resolves relative hrefs
        # against the site root, so the base URL is never prepended twice.
        page_url = urljoin(url, next_url)
        if page_url in seen_pages:
            # Guard against pagination links that loop back to a visited page.
            break
        seen_pages.add(page_url)

        print(next_url)
        with requests.get(page_url, headers=headers, timeout=30) as response:
            response.raise_for_status()
            content = response.text

        results = re.findall(products_pattern, content)
        for codigo, marca, nombre, product_id, precio in results:
            products.append(dict(
                id=product_id,
                codigo=codigo,
                marca=marca,
                nombre=nombre,
                # Commas are removed from the scraped price before parsing.
                precio=float(precio.replace(",", "")),
                categoria=route_name,
            ))

        # The pattern captures the link that follows the active page, if any.
        pages = re.findall(pagination_pattern, content)
        next_url = pages[0] if pages else None

    return products

# Crawl every category in turn and flatten the per-category lists into one.
products = [
    item
    for route_name, route_url in urls.items()
    for item in get_products(route_name, route_url)
]

# export products

# Drop exact duplicate rows, then write the CSV with a header and no index.
df = pd.DataFrame(products).drop_duplicates()
df.to_csv(products_filename, header=True, index=False)

https://www.elabastecedor.com.ar/almacen-aceites
https://www.elabastecedor.com.ar/almacen-aderezos
https://www.elabastecedor.com.ar/almacen-apto-celiacos
https://www.elabastecedor.com.ar/almacen-arroces
https://www.elabastecedor.com.ar/almacen-pascuas
https://www.elabastecedor.com.ar/almacen-azucar
https://www.elabastecedor.com.ar/almacen-bizcochuelos-para-preparar
https://www.elabastecedor.com.ar/almacen-bizcochuelos-preparados
https://www.elabastecedor.com.ar/almacen-budines-magdalenas
https://www.elabastecedor.com.ar/almacen-cacaos
https://www.elabastecedor.com.ar/almacen-cafes
https://www.elabastecedor.com.ar/almacen-caldos
https://www.elabastecedor.com.ar/almacen-cereales
https://www.elabastecedor.com.ar/almacen-conservas-carnes
https://www.elabastecedor.com.ar/almacen-conservas-pescado
https://www.elabastecedor.com.ar/almacen-conservas-legumbres
https://www.elabastecedor.com.ar/almacen-edulcoran
https://www.elabastecedor.com.ar/almacen-encurtido
https://www.elabastecedor.com.ar/a