In [1]:
import requests
from bs4 import BeautifulSoup
import os
import cloudscraper
import shutil
import time
import re
from tqdm import tqdm
import pandas as pd

In [2]:
def tile_to_dict(tile):
    item = dict()
    item['title'] = tile.findAll('a')[0]['title']
    item['data-itemid'] = tile['data-itemid']
    item['url'] = tile['data-monetate-producturl']
    try:
        item['colours'] = [a.find('img')['alt'] for a in tile.findAll('li')]
    except Exception as e:
        print(e)
        item['colours'] = 'Exception'
    return item

### Parsing tiles

In [3]:
FOLDER_PERSON = "deep/person"
FOLDER_NO_PERSON = "deep/no_person"
FOLDER_TEST = "deep"
def get_images(url:str, id:str):
    scraper = cloudscraper.create_scraper(delay=2)
    response = scraper.get(url)
    bs = BeautifulSoup(response.content, 'html.parser')
    imgs = bs.findAll('picture', attrs={'class': 'swiper-zoomable'})
    img_urls = [a['data-highres-images'] for a in imgs]
    for img_url in enumerate(img_urls):
        response = scraper.get(img_url[1])
        filename = f'{id}_{img_url[0]}.jpg'
        if 'alternate' in img_url[1]:
            dst_file = os.path.join(FOLDER_PERSON, filename)
        elif 'lifestyle' in img_url[1]:
            dst_file = os.path.join(FOLDER_NO_PERSON, filename)
        else:
            dst_file = os.path.join(FOLDER_TEST, filename)
        with open(dst_file, "wb") as f:
            f.write(response.content)

In [4]:
def scrape_images(url):
    print('Getting items info')
    tiles = []
    for i in range(999):
        scraper = cloudscraper.create_scraper(delay=10)
        print(i*32)
        url = url.replace('?', f'?start={i*32}&')
        response = scraper.get(url)
        if response.status_code == 403:
            print('auth error')
            break
        soup = BeautifulSoup(response.content, 'html.parser')
        product_tiles = soup.select('.product-tile')
        t = [tile_to_dict(a) for a in product_tiles]
        if len(t) == 0:
            print('no tiles')
            break
        tiles.extend(t)
    df = pd.DataFrame(tiles)
    df = df.drop_duplicates(subset=['data-itemid'], keep='first').reset_index(drop=True)
    print('Saving images')
    for a in tqdm(zip(df.url, df['data-itemid']), total=len(df)):
        get_images(a[0], a[1])
    return df

In [5]:
URL = r"https://www.ralphlauren.nl/en/men/clothing/hoodies-sweatshirts/10204?webcat=men%7Cclothing%7Cmen-clothing-hoodies-sweatshirts"
df = scrape_images(URL)
df

Getting items info
0
32
64
96
'NoneType' object is not subscriptable
128
'NoneType' object is not subscriptable
'NoneType' object is not subscriptable
160


In [None]:
# tiles = []
# for i in range(999):
#     scraper = cloudscraper.create_scraper(delay=10)
#     print(i*32)
#     url = URL.replace('?', f'?start={i*32}&')
#     response = scraper.get(url)
#     if response.status_code == 403:
#         print('auth error')
#         break
#     soup = BeautifulSoup(response.content, 'html.parser')
#     product_tiles = soup.select('.product-tile')
#     t = [tile_to_dict(a) for a in product_tiles]
#     if len(t) == 0:
#         print('no tiles')
#         break
#     tiles.extend(t)
# df = pd.DataFrame(tiles)

In [None]:
# for a in tqdm(zip(df.url, df['data-itemid']), total=len(df)):
#     get_images(a[0], a[1])

In [None]:
# URL = r"https://www.ralphlauren.nl/en/men/clothing/hoodies-sweatshirts/10204?webcat=men%7Cclothing%7Cmen-clothing-hoodies-sweatshirts"
# scraper = cloudscraper.create_scraper(delay=10)
# response = scraper.get(URL)
# bs = BeautifulSoup(response.content, 'html.parser')
# product_tiles = bs.select('.product-tile')
# tiles = [tile_to_dict(a) for a in product_tiles]