In [3]:
# Scraping de la página 1 de Vivino

import requests
from bs4 import BeautifulSoup
import re 

# Vivino "explore" listing for one saved search. The `e` query parameter is
# copied verbatim from the site; its encoding is not interpreted here.
url = "https://www.vivino.com/es/explore?e=eJzLLbI1VMvNzLM1UMtNrLA1NTBQS660TS1WS7Z1DQ1SKwDKpqfZliUWZaaWJOao5Rel2KakFier5SdV2hYUZSanqpUXR8faGloYqBUBaRMDUwBHtxqD"
# Browser-like User-Agent sent with the request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0 Safari/537.36",
}

# Without a timeout, requests waits forever if the server never answers,
# which would leave the cell hanging with no error.
resp = requests.get(url, headers=headers, timeout=30)
resp.raise_for_status()  # stop here on an HTTP error status
soup = BeautifulSoup(resp.text, "html.parser")

# NOTE(review): the recorded output of this cell reports that no wine was
# found — presumably the listing is not present in the HTML returned to a
# plain HTTP client; confirm before relying on these selectors.

# 1) Wine name: <div> elements whose class starts with "wineInfoVintage__truncate"
name_elems = soup.select('div[class^="wineInfoVintage__truncate"]')
names = [el.get_text(strip=True) for el in name_elems]

# 1b) Vintage: <div> elements whose class starts with "wineInfoVintage__vintage"
vintage_elems = soup.select('div[class^="wineInfoVintage__vintage"]')
vintages = [el.get_text(strip=True) for el in vintage_elems]

# 2) Rating: raw text only; conversion to float happens further down
rating_elems = soup.select('div[class^="vivinoRating__averageValue"]')
ratings_raw = [el.get_text(strip=True) for el in rating_elems]

def limpiar_rating(texto):
    """Convert a scraped rating string into a float.

    Both a decimal comma and a decimal point are accepted ("4,2" or "4.2").

    Args:
        texto: Raw rating text; may be None or empty.

    Returns:
        The rating as a float, or None when the text is missing, blank or
        not a number (for example a "-" placeholder).
    """
    if not texto:
        return None
    try:
        return float(str(texto).strip().replace(",", "."))
    except ValueError:
        # One unparseable rating should not abort the whole scrape.
        return None

# Convert every raw rating text to a float (limpiar_rating yields None for
# empty text).
ratings = list(map(limpiar_rating, ratings_raw))

# 3) Price: stripped text of each <div> whose class starts with
#    "addToCartButton__price"
price_elems = soup.select('div[class^="addToCartButton__price"]')
prices_raw = [nodo.get_text(strip=True) for nodo in price_elems]

def limpiar_precio(texto):
    """Extract the first numeric amount from a scraped price string.

    Handles inputs such as 'EUR\\xa03,95', 'EUR 12,50', '19,90 €' and also
    amounts with thousands separators such as '1.234,56 €' or '1,234.56'.

    Separator rules:
      * both '.' and ',' present -> the right-most one is the decimal mark;
      * the same separator repeated -> all of them are thousands separators;
      * a single separator followed by exactly three digits, after a leading
        group of one to three digits not starting with '0' ('1.250') -> it is
        a thousands separator;
      * any other single separator -> decimal mark.

    Args:
        texto: Raw price text; may be None or empty.

    Returns:
        The amount as a float, or None when there is no text or no digits.
    """
    if not texto:
        return None

    match = re.search(r"\d+(?:[.,]\d+)*", str(texto))
    if not match:
        return None

    numero = match.group(0)
    partes = re.split(r"[.,]", numero)
    separadores = re.findall(r"[.,]", numero)

    if not separadores:
        return float(numero)

    if len(set(separadores)) > 1:
        # Mixed '.' and ',': the last separator is the decimal mark.
        ultimo_es_decimal = True
    elif len(separadores) > 1:
        # '1.234.567': a decimal mark cannot appear more than once.
        ultimo_es_decimal = False
    else:
        parece_miles = (
            len(partes[1]) == 3
            and len(partes[0]) <= 3
            and not partes[0].startswith("0")
        )
        ultimo_es_decimal = not parece_miles

    decimales = partes.pop() if ultimo_es_decimal else ""
    entero = "".join(partes)
    return float(f"{entero}.{decimales}") if decimales else float(entero)

# Turn each raw price text into a float (or None when no number is found).
prices = list(map(limpiar_precio, prices_raw))


# Combine the four lists position by position. zip() stops at the shortest
# list, so only positions present in all four become records.
# NOTE(review): the lists are selected independently from the page, so an
# entry lacking one field would shift every later value — confirm that each
# listed wine always has name, vintage, rating and price.
wines = [
    {
        "titulo": f"{nombre} ({anada})",  # name + vintage
        "nombre": nombre,
        "vintage": anada,
        "rating": nota,
        "precio": precio
    }
    for nombre, anada, nota, precio in zip(names, vintages, ratings, prices)
]

if wines:
    for vino in wines:
        print(vino)
else:
    print("No se ha encontrado ningún vino. Puede que hayan cambiado las clases CSS o el HTML que devuelve la página.")


No se ha encontrado ningún vino. Puede que hayan cambiado las clases CSS o el HTML que devuelve la página.


In [None]:
# Recorrer hasta pág 92
# Scraping páginas 1 a 92 de Vivino

import requests
from bs4 import BeautifulSoup
import re
import time

# Explore URL for the saved search; scrape_page() appends "&page=N" to it
# for every page after the first.
BASE_URL = "https://www.vivino.com/es/explore?e=eJzLLbI1VMvNzLM1UMtNrLA1NTBQS660TS1WS7Z1DQ1SKwDKpqfZliUWZaaWJOao5Rel2KakFier5SdV2hYUZSanqpUXR8faGloYqBUBaRMDUwBHtxqD"

# Request headers: a browser-like User-Agent, written as adjacent string
# literals that Python joins into a single value.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/123.0 Safari/537.36"
    ),
}

def limpiar_rating(texto):
    """Convert a scraped rating string into a float.

    Both a decimal comma and a decimal point are accepted ("4,2" or "4.2").

    Args:
        texto: Raw rating text; may be None or empty.

    Returns:
        The rating as a float, or None when the text is missing, blank or
        not a number (for example a "-" placeholder).
    """
    if not texto:
        return None
    try:
        return float(str(texto).strip().replace(",", "."))
    except ValueError:
        # One unparseable rating should not abort a multi-page scrape.
        return None

def limpiar_precio(texto):
    """Extract the first numeric amount from a scraped price string.

    Handles inputs such as 'EUR\\xa03,95', 'EUR 12,50', '19,90 €' and also
    amounts with thousands separators such as '1.234,56 €' or '1,234.56'.

    Separator rules:
      * both '.' and ',' present -> the right-most one is the decimal mark;
      * the same separator repeated -> all of them are thousands separators;
      * a single separator followed by exactly three digits, after a leading
        group of one to three digits not starting with '0' ('1.250') -> it is
        a thousands separator;
      * any other single separator -> decimal mark.

    Args:
        texto: Raw price text; may be None or empty.

    Returns:
        The amount as a float, or None when there is no text or no digits.
    """
    if not texto:
        return None

    match = re.search(r"\d+(?:[.,]\d+)*", str(texto))
    if not match:
        return None

    numero = match.group(0)
    partes = re.split(r"[.,]", numero)
    separadores = re.findall(r"[.,]", numero)

    if not separadores:
        return float(numero)

    if len(set(separadores)) > 1:
        # Mixed '.' and ',': the last separator is the decimal mark.
        ultimo_es_decimal = True
    elif len(separadores) > 1:
        # '1.234.567': a decimal mark cannot appear more than once.
        ultimo_es_decimal = False
    else:
        parece_miles = (
            len(partes[1]) == 3
            and len(partes[0]) <= 3
            and not partes[0].startswith("0")
        )
        ultimo_es_decimal = not parece_miles

    decimales = partes.pop() if ultimo_es_decimal else ""
    entero = "".join(partes)
    return float(f"{entero}.{decimales}") if decimales else float(entero)

def scrape_page(page: int, timeout: float = 30.0) -> list:
    """Fetch one results page and return the wines parsed from its HTML.

    Args:
        page: Page number. Page 1 requests BASE_URL as is; any other page
            requests BASE_URL with "&page=<page>" appended.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the whole run.

    Returns:
        A list of dicts with keys "titulo", "nombre", "vintage", "rating",
        "precio" and "pagina". The list is empty when any of the four
        selectors matches nothing.

    Raises:
        requests.HTTPError: if the response has an HTTP error status.
        requests.Timeout: if the server does not answer within `timeout`.
    """
    # Same search on every page; only the page number changes.
    url = BASE_URL if page == 1 else BASE_URL + f"&page={page}"

    resp = requests.get(url, headers=headers, timeout=timeout)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    def _textos(selector):
        """Return the stripped text of every element matching `selector`."""
        return [el.get_text(strip=True) for el in soup.select(selector)]

    # 1) Wine name
    names = _textos('div[class^="wineInfoVintage__truncate"]')
    # 1b) Vintage
    vintages = _textos('div[class^="wineInfoVintage__vintage"]')
    # 2) Rating
    ratings = [limpiar_rating(r) for r in _textos('div[class^="vivinoRating__averageValue"]')]
    # 3) Price
    prices = [limpiar_precio(p) for p in _textos('div[class^="addToCartButton__price"]')]

    # Quick debug line with the number of matches per field
    print(f"[DEBUG] Página {page}: "
          f"names={len(names)}, vintages={len(vintages)}, "
          f"ratings={len(ratings)}, prices={len(prices)}")

    # The four lists are matched by position only. Different lengths mean at
    # least one entry lacks a field, so later values may belong to another
    # wine; make that visible instead of truncating silently.
    if len({len(names), len(vintages), len(ratings), len(prices)}) > 1:
        print(f"[AVISO] Página {page}: los recuentos no coinciden; "
              f"los datos pueden quedar desalineados.")

    # zip() stops at the shortest list, same as taking the minimum length.
    return [
        {
            "titulo": f"{n} ({v})",
            "nombre": n,
            "vintage": v,
            "rating": r,
            "precio": p,
            "pagina": page,
        }
        for n, v, r, p in zip(names, vintages, ratings, prices)
    ]

# ---- MAIN LOOP: pages 1 to 92 ----

# Accumulates the wines of every scraped page.
all_wines = []

for page in range(1, 93):  # the range end is exclusive, so 92 is the last page
    print(f"\nScrapeando página {page}...")
    wines = scrape_page(page)

    if wines:
        print(f"[INFO] Página {page}: {len(wines)} vinos")
        all_wines += wines

        time.sleep(2)  # short pause between requests to go easy on the server
        continue

    # An empty page ends the crawl.
    print(f"[INFO] Página {page} sin vinos, paro aquí.")
    break

print(f"\n[RESUMEN] Total vinos recogidos: {len(all_wines)}")
# Show only the first 10 records as a sanity check.
for w in all_wines[:10]:
    print(w)


In [None]:
import pandas as pd

# Tabulate the collected wines: one row per dict in all_wines.
df_vinos0 = pd.DataFrame(data=all_wines)
vista_previa = df_vinos0.head()
print(vista_previa)

# Write the whole table to an Excel workbook, leaving out the row index.
df_vinos0.to_excel("vinos_vivino0.xlsx", index=False)

In [None]:

# Otra forma de hacerlo: usando la API de Vivino

import requests
import time

# Endpoint queried once per page, and the last page to request.
API_URL = "https://www.vivino.com/api/explore/explore"
LAST_PAGE = 92

# Browser-like User-Agent sent with every API request.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/123.0 Safari/537.36"
}

# Base query parameters, taken (per the original note) from the href of the
# site's buttons. The "page" parameter is added per request.
PARAMS_BASE = {
    # Main country code (Spain)
    "country_code": "ES",
    # Wines from Spain
    "country_codes[]": "es",
    "currency_code": "EUR",
    "min_rating": 0,
    "order_by": "price",
    "order": "desc",
    "price_range_min": 0,
    # Upper bound standing for "all prices" in the original search
    "price_range_max": 500,
    "discount_prices": "false",
    # Style: Ribera del Duero red (Spain)
    "wine_style_ids[]": "180",
    # Region: Ribera del Duero
    "region_ids[]": "405",
    # Wine type: red
    "wine_type_ids[]": "1",
    "wsa_year": "null",
}

# Filled in by the paging loop with one dict per wine.
all_wines = []

# Request the API page by page, flattening each match into one record in
# all_wines. The loop stops early at the first page without matches.
for page in range(1, LAST_PAGE + 1):
    # Same filters on every request; only the page number changes.
    params = {**PARAMS_BASE, "page": page}

    print(f"Scrapeando página {page}...")

    # Without a timeout, requests waits forever if the server never answers,
    # which would stall the whole crawl with no error.
    r = requests.get(API_URL, params=params, headers=HEADERS, timeout=30)
    r.raise_for_status()
    data = r.json()

    matches = data["explore_vintage"]["matches"]
    if not matches:
        print(f"Página {page} sin resultados, paro aquí.")
        break

    for t in matches:
        vintage = t["vintage"]
        wine = vintage["wine"]

        wine_name = wine["name"]
        year = vintage["year"]
        rating = vintage["statistics"]["ratings_average"]

        # "price" may be absent or null; fall back to an empty dict so that
        # the amount simply becomes None.
        price_info = t.get("price") or {}
        price = price_info.get("amount")   # taken as delivered; no text parsing

        titulo_completo = f"{wine_name} ({year})"

        all_wines.append({
            "titulo": titulo_completo,
            "nombre": wine_name,
            "vintage": year,
            "rating": rating,
            "precio": price,
            "pagina": page,
        })

    print(f"  -> {len(matches)} vinos en página {page}")
    time.sleep(0.5)   # short pause between requests to go easy on the server

print(f"\nTotal de vinos recogidos: {len(all_wines)}")

# Show the first 10 records as a sanity check.
for w in all_wines[:10]:
    print(w)


In [None]:
#Generar DF y pasarlo a excel

import pandas as pd

# Tabulate the wines collected from the API: one row per dict in all_wines.
df_vinos = pd.DataFrame(data=all_wines)
vista_previa = df_vinos.head()
print(vista_previa)

# Write the whole table to an Excel workbook, leaving out the row index.
df_vinos.to_excel("vinos_vivino.xlsx", index=False)
