In [1]:
import os
import time
from concurrent.futures import ThreadPoolExecutor
from concurrent.futures import TimeoutError as FuturesTimeoutError

import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [3]:
driver = webdriver.Chrome()  # or use any other driver like Firefox
driver.get('https://www.farfetch.com/es/shopping/women/clothing-1/items.aspx')  # Target URL
# NOTE(review): `links` is not referenced by any cell visible here; kept in
# case another cell relies on it.
links = []

# The cookie banner may not be in the DOM yet when get() returns, so wait
# (up to 10 s) until the "accept all" button is clickable instead of looking
# it up immediately.
accept_button = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable(
        (By.CSS_SELECTOR, 'button[data-testid="Button_PrivacySettingsBanner_AcceptAll"]')
    )
)

# Click the button to accept the cookies
accept_button.click()

def scroll_page(driver, links_set, SCROLL_PAUSE=2, min_img=150, scroll_step=1415.6):
    """Scroll the current catalog page step by step and harvest product links.

    After every scroll step the anchors under ``#catalog-grid`` are read and
    each previously unseen ``href`` is added to ``links_set`` (mutated in
    place) and printed.

    Args:
        driver: Selenium WebDriver already positioned on a catalog page.
        links_set: Set of product URLs collected so far; new URLs are added.
        SCROLL_PAUSE: Seconds to sleep after each scroll step.
        min_img: Scrolling stops once ``links_set`` holds at least this many
            links. The batch visible at that moment is harvested in full, so
            the set may end up slightly larger than ``min_img``.
        scroll_step: Vertical distance, in pixels, advanced per step.
            NOTE(review): the default 1415.6 is the value previously
            hard-coded here; its origin is not documented.

    Returns:
        None. Results are delivered through ``links_set``.

    Raises:
        ValueError: If ``scroll_step`` is not positive (the loop could never
            reach the bottom of the page).
    """
    if scroll_step <= 0:
        raise ValueError("scroll_step must be positive")

    position = 0

    # Re-read the document height on every pass: the page may grow as more
    # products load, so a height captured once up front can stop too early.
    while position < driver.execute_script("return document.body.scrollHeight"):
        position += scroll_step
        # window.scrollTo takes (x, y): keep the horizontal offset at 0 and
        # only move vertically.
        driver.execute_script("window.scrollTo(0, arguments[0]);", position)
        time.sleep(SCROLL_PAUSE)

        # Collect the products currently present in the grid
        try:
            anchors = WebDriverWait(driver, 10).until(
                EC.presence_of_all_elements_located((By.XPATH, '//*[@id="catalog-grid"]/li/div/a'))
            )
            for anchor in anchors:
                href = anchor.get_attribute("href")
                if href and href not in links_set:
                    links_set.add(href)
                    print(f"[{len(links_set)}] {href}")
        except Exception:
            # Best effort: a wait timeout or an element lookup failure ends
            # this page. `Exception` (not a bare except) keeps Ctrl-C /
            # kernel interrupts working.
            print('No se encuentran más artículos')
            break

        # Target reached: stop scrolling instead of walking the rest of the page.
        if len(links_set) >= min_img:
            break

def scroll_and_collect(driver, links_set, min_img=150, max_empty_pages=2,
                       base_url='https://www.farfetch.com/es/shopping/women/clothing-1/items.aspx'):
    """Collect product links page by page until ``min_img`` links are gathered.

    The page currently loaded in ``driver`` is scrolled first; afterwards the
    function navigates to ``{base_url}?page=2``, ``?page=3``, ... and scrolls
    each of them once.

    Args:
        driver: Selenium WebDriver, expected to already show the first
            catalog page.
        links_set: Set that accumulates product URLs (mutated in place).
        min_img: Minimum number of links to collect before stopping.
        max_empty_pages: Stop after this many consecutive pages that
            contribute no new link, so the loop ends when the catalog runs
            out of pages instead of requesting pages forever.
        base_url: Catalog URL to which ``?page=N`` is appended.

    Returns:
        list: The collected links (a snapshot of ``links_set``).
    """
    num_page = 1
    empty_pages = 0

    while len(links_set) < min_img:
        count_before = len(links_set)

        # Forward min_img so the per-page scroll works towards the caller's
        # target rather than scroll_page's own default.
        scroll_page(driver, links_set, min_img=min_img)

        print('Se han recolectado los links de esta página')
        print(len(links_set))

        if len(links_set) >= min_img:
            break

        # Termination guard: several pages in a row without anything new
        # means there is nothing left to collect.
        if len(links_set) == count_before:
            empty_pages += 1
            if empty_pages >= max_empty_pages:
                print(f'🚫 {empty_pages} páginas seguidas sin artículos nuevos; se detiene la recolección.')
                break
        else:
            empty_pages = 0

        # Move on to the next page; it is scrolled at the top of the next
        # loop iteration (once, not twice).
        num_page += 1
        href = f'{base_url}?page={num_page}'
        print(href)

        try:
            driver.get(href)
        except Exception as exc:
            print(f"🚫 No se pudo cargar la página {num_page}: {exc}")
            break
        time.sleep(3)

        # Close the modal if one shows up
        try:
            cancel_button = WebDriverWait(driver, 3).until(
                EC.element_to_be_clickable((By.XPATH, '//*[@id="root"]/div[3]/div[2]/div[4]/div/button'))
            )
            cancel_button.click()
        except Exception:
            print('No se ha encontrado pop-up')  # No modal

    print(f'Se han recolectado {len(links_set)} artículos')
    return list(links_set)

In [4]:
def get_image_sources(driver, product_links):
    """Visit each product page and return the ``src`` of its main image.

    Args:
        driver: Selenium WebDriver used to load the product pages.
        product_links: Indexable sequence of product page URLs.

    Returns:
        list: Image URLs, in visiting order. Products whose page fails to
        load, whose image is not found within 10 s, or whose ``src`` is
        empty are skipped.
    """
    image_xpath = '//*[@id="content"]/div[1]/div[1]/div[1]/div/div[1]/button/img'
    found_sources = []

    for index in range(len(product_links)):
        product_url = product_links[index]
        try:
            driver.get(product_url)
            time.sleep(1)

            locator = (By.XPATH, image_xpath)
            img = WebDriverWait(driver, 10).until(EC.presence_of_element_located(locator))
            src = img.get_attribute("src")
            print(f'[{index}] Obteniendo imagen {product_url}: {src}')
            if src:
                found_sources.append(src)
        except Exception as e:
            # Any failure on one product is reported and the loop moves on.
            print(f"⚠️ Error al obtener imagen de {product_url}: {e}. Saltando a la siguiente")

    return found_sources

def save_image_sources_txt(image_sources, filename="image_sources.txt"):
    """Write the image URLs to ``filename``, one per line.

    Args:
        image_sources: Iterable of image URLs.
        filename: Destination text file; it is overwritten.
    """
    with open(filename, "w") as out_file:
        out_file.writelines(f"{src}\n" for src in image_sources)
    print(f"📝 Fuentes de imagen guardadas en {filename}")

def read_image_sources_txt(filename="image_sources.txt"):
    """Read image URLs from ``filename``, one per line.

    Args:
        filename: Text file to read.

    Returns:
        list: One entry per line of the file, with surrounding whitespace
        (including the line break) stripped. Blank lines yield empty strings.
    """
    with open(filename, "r") as in_file:
        return [line.strip() for line in in_file]

def save_image(src_filename_tuple, timeout=15):
    """Download one image and store it under ``fashion_images/``.

    Failures (non-200 status, network error, write error) are printed and
    swallowed so that one bad URL does not abort a batch.

    Args:
        src_filename_tuple: ``(src, filename)`` pair: the image URL and the
            file name to write inside the ``fashion_images`` directory. The
            directory must already exist.
        timeout: Timeout in seconds passed to ``requests.get``. Without it a
            stalled connection would block the calling worker thread
            indefinitely.

    Returns:
        None.
    """
    src, filename = src_filename_tuple
    path = os.path.join("fashion_images", filename)

    print(f"Descargando imagen {src}")
    try:
        response = requests.get(src, timeout=timeout)
        if response.status_code == 200:
            with open(path, "wb") as f:
                f.write(response.content)
            print(f"✅ Imagen guardada: {filename}")
        else:
            print(f"❌ Error al descargar {src}")
    except Exception as e:
        print(f"⚠️ Error al descargar {src}: {e}")


def download_images(max_workers=5, task_timeout=60):
    """Download every image listed in the sources file, in parallel.

    The URLs are read with ``read_image_sources_txt()`` (default file name),
    so that file must have been written beforehand, e.g. with
    ``save_image_sources_txt(get_image_sources(driver, product_links))``.
    Images are stored as ``fashion_images/image_<i>.png``.
    NOTE(review): the ``.png`` extension is applied regardless of the URL's
    own extension; confirm downstream code does not rely on the real format.

    Args:
        max_workers: Number of download threads.
        task_timeout: Seconds to wait for each download's result before
            reporting it as too slow. The worker thread is not cancelled and
            the pool still waits for it on exit.

    Returns:
        None.
    """
    os.makedirs("fashion_images", exist_ok=True)

    print("🕵️ Obteniendo URLs de imágenes...")
    image_sources = read_image_sources_txt()

    # Pair every URL with its target file name
    src_filename_list = [(src, f'image_{i}.png') for i, src in enumerate(image_sources)]

    print("🚀 Descargando en paralelo...")

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # submit() (rather than map()) gives one future per task, so a timeout
        # can be handled for each download individually.
        futures = [executor.submit(save_image, src_filename) for src_filename in src_filename_list]

        for future in futures:
            try:
                future.result(timeout=task_timeout)
            except FuturesTimeoutError:
                # Future.result raises concurrent.futures.TimeoutError, which
                # is only an alias of the builtin TimeoutError from
                # Python 3.11 on; catching the futures class works everywhere.
                print("⚠️ La descarga ha tardado demasiado y se ha omitido.")

    print(f"\n🔚 Descarga completa: {len(src_filename_list)} imágenes.")

In [None]:
# Collect at least 300 product links, starting from the page currently loaded
# in `driver`. `collected_links` is filled in place, so links gathered so far
# survive even if the call is interrupted.
collected_links = set()
product_links = scroll_and_collect(driver, collected_links, min_img=300)
len(product_links)

In [None]:
# Download the images listed in image_sources.txt into fashion_images/.
download_images()

🕵️ Obteniendo URLs de imágenes...
🚀 Descargando en paralelo...
Descargando imagen https://cdn-images.farfetch-contents.com/27/76/30/56/27763056_57680900_1000.jpg
Descargando imagen https://cdn-images.farfetch-contents.com/21/29/57/63/21295763_52555392_1000.jpg
Descargando imagen https://cdn-images.farfetch-contents.com/23/72/99/91/23729991_54212619_1000.jpg
Descargando imagen https://cdn-images.farfetch-contents.com/24/70/89/62/24708962_55242703_1000.jpg
Descargando imagen https://cdn-images.farfetch-contents.com/25/07/13/17/25071317_55288550_1000.jpg
✅ Imagen guardada: image_0.png
Descargando imagen https://cdn-images.farfetch-contents.com/27/33/09/21/27330921_57457080_1000.jpg
✅ Imagen guardada: image_2.png
Descargando imagen https://cdn-images.farfetch-contents.com/23/77/42/73/23774273_54212485_1000.jpg
✅ Imagen guardada: image_1.png
Descargando imagen https://cdn-images.farfetch-contents.com/25/37/10/30/25371030_57316721_1000.jpg
✅ Imagen guardada: image_4.png
Descargando imagen ht