## My version

In [None]:
%pip install webdriver_manager

In [2]:
import os
import random
import urllib
import urllib.request
from os import path
from random import choice

import numpy as np
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from tqdm import tqdm
from webdriver_manager.chrome import ChromeDriverManager

In [3]:
class zalandoQuery:
    """Build a zalando.it search URL and fetch pages with headless Chrome.

    Filters are stored as attributes through the ``set*`` methods and
    assembled into ``self.url`` by :meth:`build_query`.  Filter values are
    inserted into the URL verbatim, so callers must pass them already
    URL-encoded (e.g. ``"t%2Dshirt"``).
    """

    def __init__(self):
        # Path filters (become URL path segments)
        self.gender = None
        self.color = None

        # Query-string filters
        self.q = None
        self.price_from = None
        self.price_to = None
        self.pattern = None

        self.BASE_URL = "https://www.zalando.it/"
        # Last URL produced by build_query(); None until it is called
        self.url = None

    def setGender(self, gender):
        """Set the gender."""
        self.gender = gender

    def setQ(self, q):
        """Set the query (search parameter)."""
        self.q = q

    def setColor(self, color):
        """Set the color of the item."""
        self.color = color

    def setPrice_from(self, price):
        """Set the lowest price."""
        self.price_from = price

    def setPrice_to(self, price):
        """Set the highest price."""
        self.price_to = price

    def setPatter(self, pattern):
        """Set the pattern of an item."""
        self.pattern = pattern

    # Correctly spelled alias; ``setPatter`` is kept for existing callers.
    setPattern = setPatter

    def build_query(self):
        """Assemble ``self.url`` from the filters that are currently set.

        Unset (falsy) filters are skipped.  Values are converted with
        ``str()``, so prices may be given either as numbers or as strings.
        """
        named_filters = (
            ("q", self.q),
            ("pattern", self.pattern),
            ("price_from", self.price_from),
            ("price_to", self.price_to),
        )
        params = [f"{name}={value}" for name, value in named_filters if value]

        url = self.BASE_URL
        if self.gender:
            url += f"abbigliamento%2d{self.gender}/"
        if self.color:
            url += f"_{self.color}/"
        if params:
            url += "?" + "&".join(params)

        self.url = url

    def _fetch_page(self, url):
        """Load *url* in headless Chrome and return ``(html, parsed_html)``.

        The browser is always shut down, even when loading the page fails,
        so no Chrome/chromedriver process is left behind.
        """
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument("--headless=new")
        chrome = webdriver.Chrome(
            service=Service(ChromeDriverManager().install()),
            options=chrome_options,
        )
        try:
            chrome.get(url)
            html = chrome.page_source
        finally:
            chrome.quit()

        return html, BeautifulSoup(html, features="html.parser")

    def get_articles(self):
        """Build the query URL and retrieve the corresponding page.

        Returns:
            A ``(html, parsed_html)`` tuple: the raw page source and the
            same source parsed by BeautifulSoup.
        """
        self.build_query()
        return self._fetch_page(self.url)

    def get_articles_from_external_link(self, link):
        """Retrieve the page at *link*.

        Returns:
            A ``(html, parsed_html)`` tuple: the raw page source and the
            same source parsed by BeautifulSoup.
        """
        return self._fetch_page(link)


In [4]:
# TODO: set this to the folder where the downloaded images are stored
# (the path must not contain white spaces). The folder must already exist.
dataset_path = "" 

In [5]:
# Candidate values for each part of a query
gender_list = ["uomo", "donna"]
color_list = [
    "nero", "marrone", "viola", "giallo", "verde",
    "bianco", "rosso", "argento", "oro",
]
product_list = [
    "t%2Dshirt", "pantalone", "vestito", "camicia", "scarpe", "gioielli",
    "cintura", "giacca", "cappello", "occhiali", "zaino", "borsa",
    "valigia", "orologio", "intimo", "perizoma", "mutande", "reggiseno",
]
price_range = [4, 200]

# Number of queries to run
number_of_samples = 1


def _random_sample():
    """Draw one random (gender, color, product) triple."""
    return (choice(gender_list), choice(color_list), choice(product_list))


# One random combination per query
samples_combination = [_random_sample() for _ in range(number_of_samples)]

assert len(samples_combination) == number_of_samples

In [6]:
# Scrape loop: for every sampled (gender, color, product) triple, run one
# search, visit every product page found and store the product's images in
# a new numbered sub-folder of ``dataset_path``.

# Help page that passes the link filter below but is not a product
_GIFT_CARD_LINK = "https://www.zalando.it/aiuto/Buoni-sconto-e-buoni-regalo/Buoni-regalo-SisalPay-or-5.html"
# Classes of the anchors that link to the items on a result page
_ITEM_ANCHOR_CLASSES = ['CKDt_l', 'LyRfpJ', 'JT3_zV', 'q84f1m', '_2dqvZS']
# Class string of the static images on an item page
_ITEM_IMG_CLASS = "KxHAYs lystZ1 FxZV-M _2Pvyxl JT3_zV EKabf7 mo6ZnF _1RurXL mo6ZnF _7ZONEy"
# The images of interest have an alt text ending with a number
_DIGITS = tuple("0123456789")


def _price_span(product):
    """Return the width of the price window to use for *product*."""
    if product in ("gioielli", "borsa"):
        # Jewels and bags get a much wider price range
        return 400
    if product == "orologio":
        # Watches get a wider price range
        return 200
    return 10


def _next_item_number(root):
    """Return the first free item number in *root*.

    Only entries whose name is a plain number are considered, so stray
    files do not break the run; an empty folder starts at 0.
    """
    numbers = [int(name) for name in os.listdir(root) if name.isdecimal()]
    return max(numbers, default=-1) + 1


def _download_image(url, destination):
    """Download *url* and write the raw bytes to the file *destination*."""
    with urllib.request.urlopen(url) as remote:
        payload = remote.read()
    with open(destination, 'wb') as local_file:
        local_file.write(payload)


for sample in samples_combination:
    gender, color, product = sample

    # For each sample, get a random price range
    price_min = random.randint(price_range[0], price_range[1])
    price_max = price_min + _price_span(product)

    # Build the query
    query_builder = zalandoQuery()
    query_builder.setGender(gender)
    query_builder.setColor(color)
    query_builder.setQ(product)
    query_builder.setPrice_from(str(price_min))
    query_builder.setPrice_to(str(price_max))

    # Send the query to the Zalando website and get the page
    page, response = query_builder.get_articles()

    # For each item retrieved by the query, save the link to its html page
    items_links = set()
    for anchor in response.find_all('a', class_=_ITEM_ANCHOR_CLASSES):
        link = anchor.get('href')
        # Anchors without href and the default help link are not items
        if not link or link == _GIFT_CARD_LINK:
            continue
        if link.startswith("https://www.zalando.it") and link.endswith(".html"):
            items_links.add(link)

    # De-duplicated links in a stable order
    items_links = sorted(items_links)

    if not items_links:
        continue

    # Continue numbering after the items already in the dataset
    item_number = _next_item_number(dataset_path)

    # For each item retrieved from the query
    for item_link in tqdm(items_links):

        # Get the html page of the current item
        page, response = query_builder.get_articles_from_external_link(item_link)

        # Get all the static images in the item page (images of the same item)
        images_links = set()
        for img_tag in response.find_all('img', class_=_ITEM_IMG_CLASS):
            alt_text = img_tag.get("alt") or ""
            src = img_tag.get("src")
            if src and alt_text.endswith(_DIGITS):
                images_links.add(src)

        # An item is kept only if it has at least 2 images
        if len(images_links) < 2:
            continue

        item_dir = path.join(dataset_path, str(item_number))
        # If the directory already exists it should be empty, thus delete it.
        # os.rmdir raises if it is not empty, which protects existing data.
        if path.exists(item_dir):
            os.rmdir(item_dir)
        # Create the directory for the current product
        os.mkdir(item_dir)

        image_number = 0
        for img_link in sorted(images_links):
            try:
                _download_image(img_link, path.join(item_dir, f'{image_number}.jpg'))
            except Exception as err:
                # Best effort: one bad image must not abort the whole run,
                # but the failure is reported instead of silently dropped.
                tqdm.write(f'Skipping image {img_link}: {err!r}')
                continue
            image_number += 1

        # Go to the next item directory
        item_number += 1


100%|██████████| 19/19 [03:56<00:00, 12.47s/it]
