In [1]:
import os
import re
import shutil
from urllib.parse import quote

import cloudscraper
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

In [2]:
def replace_special_characters(s):
    """Return ``s`` with every run of characters other than ASCII letters,
    digits or underscores collapsed into a single underscore."""
    sanitized = re.sub(r'[^a-zA-Z0-9_]+', '_', s)
    return sanitized

In [3]:
def tile_to_dict(tile):
    """Turn one product tile element into a plain dict.

    The result has the keys 'title' (the ``title`` attribute of the tile's
    first ``<a>``), 'data-itemid' and 'url' (read from the tile's own
    attributes) and 'colors' (the ``alt`` text of the ``<img>`` found in each
    ``<li>``). If collecting the colours raises, the error is printed and
    'colors' is set to None.
    """
    first_link = tile.findAll('a')[0]
    record = {
        'title': first_link['title'],
        'data-itemid': tile['data-itemid'],
        'url': tile['data-monetate-producturl'],
    }
    try:
        swatch_names = []
        for entry in tile.findAll('li'):
            # entry.find('img') is None when the <li> has no image, which raises here
            swatch_names.append(entry.find('img')['alt'])
    except Exception as err:
        print(err, 'probably no colors')
        swatch_names = None
    record['colors'] = swatch_names
    return record

In [4]:
# Destination directories used by save_images, selected by a substring of the image URL.
FOLDER_PERSON = "deep/person"  # URLs containing 'alternate'
FOLDER_NO_PERSON = "deep/no_person"  # URLs containing 'lifestyle'
FOLDER_TEST = "deep"  # every other URL

def save_images(img_urls, id):
    """Download each URL in ``img_urls`` and write it to ``{id}_{index}.jpg``.

    The destination folder depends on the URL text: 'alternate' goes to
    FOLDER_PERSON, 'lifestyle' goes to FOLDER_NO_PERSON, anything else goes to
    FOLDER_TEST. Folders are created on demand.

    Args:
        img_urls: iterable of image URL strings.
        id: file-name prefix. The name shadows the builtin but is kept so
            existing callers keep working.

    A response with an error status is reported and skipped, so an error page
    is never stored as a ``.jpg``. The index of a skipped image is not reused.
    Connection errors raised by the scraper propagate to the caller.
    """
    scraper = cloudscraper.create_scraper()
    for index, img_url in enumerate(img_urls):
        if 'alternate' in img_url:
            folder = FOLDER_PERSON
        elif 'lifestyle' in img_url:
            folder = FOLDER_NO_PERSON
        else:
            folder = FOLDER_TEST

        response = scraper.get(img_url)
        if not response.ok:
            print(f'skipping {img_url}: HTTP {response.status_code}')
            continue

        # open() does not create missing directories
        os.makedirs(folder, exist_ok=True)
        dst_file = os.path.join(folder, f'{id}_{index}.jpg')
        with open(dst_file, "wb") as f:
            f.write(response.content)


def _highres_image_urls(scraper, page_url):
    """Fetch ``page_url`` and return the ``data-highres-images`` value of every
    ``<picture class="swiper-zoomable">`` element that carries that attribute."""
    response = scraper.get(page_url)
    soup = BeautifulSoup(response.content, 'html.parser')
    pictures = soup.find_all('picture', attrs={'class': 'swiper-zoomable'})
    # Skip pictures without the attribute instead of failing the whole product.
    return [pic['data-highres-images'] for pic in pictures
            if pic.has_attr('data-highres-images')]


def get_images(url: str, id: str, colors: list):
    """Download the images of one product page, once per colour if colours are given.

    Args:
        url: product page URL. For per-colour downloads its ``colorname=``
            query parameter is replaced; a URL without that parameter is
            requested unchanged for every colour.
        id: file-name prefix handed to save_images. With colours, the
            sanitised colour name is appended as ``{id}_{colour}``.
        colors: colour names, or None / empty to download the page as given.
    """
    scraper = cloudscraper.create_scraper()
    if not colors:
        save_images(_highres_image_urls(scraper, url), id)
        return

    for color in colors:
        # Percent-encode the name so spaces and '&' cannot break the query string.
        color_param = 'colorname=' + quote(color, safe='')
        # A callable replacement keeps re.sub from interpreting backslashes in the name.
        color_url = re.sub(r'colorname=([^&]+)(?=$|&)', lambda _match: color_param, url)
        img_urls = _highres_image_urls(scraper, color_url)
        save_images(img_urls, id + '_' + replace_special_characters(color))


In [5]:
def scrape_images(url, page_size=32):
    """Collect every product tile of a listing, save the table, then download the images.

    Args:
        url: listing URL. The ``start`` offset is inserted as the first query
            parameter of each page request.
        page_size: step between consecutive ``start`` offsets (default 32).

    Returns:
        DataFrame with one row per unique 'data-itemid' and the columns
        'title', 'data-itemid', 'url' and 'colors'. It is also pickled to
        'df.pkl' before the image download starts. If no tile was found, an
        empty frame is returned and nothing is written or downloaded.
    """
    print('Getting items info')
    scraper = cloudscraper.create_scraper()
    tiles = []
    for page in range(999):  # hard cap; the loop normally ends on the first empty page
        start = page * page_size
        print(start)
        # Build every page URL from the original ``url`` so the offsets do not
        # pile up (?start=64&start=32&...), and cope with a URL that has no query.
        if '?' in url:
            page_url = url.replace('?', f'?start={start}&', 1)
        else:
            page_url = f'{url}?start={start}'
        response = scraper.get(page_url)
        if response.status_code == 403:
            print('auth error')
            break
        soup = BeautifulSoup(response.content, 'html.parser')
        page_tiles = [tile_to_dict(tile) for tile in soup.select('.product-tile')]
        if not page_tiles:
            print('no tiles')
            break
        tiles.extend(page_tiles)

    df = pd.DataFrame(tiles, columns=['title', 'data-itemid', 'url', 'colors'])
    if df.empty:
        # Nothing scraped: do not overwrite an earlier df.pkl with an empty table.
        return df
    df = df.drop_duplicates(subset=['data-itemid'], keep='first').reset_index(drop=True)
    df.to_pickle('df.pkl')

    print('Saving images')
    rows = zip(df['url'], df['data-itemid'], df['colors'])
    for item_url, item_id, item_colors in tqdm(rows, total=len(df)):
        get_images(item_url, item_id, item_colors)
    return df

In [15]:
def scrape_images_frompos(df:pd.DataFrame, pos:int):
    """Call get_images for every row of ``df`` from positional index ``pos``
    to the end, showing a progress bar."""
    remaining = df.iloc[pos:, :]
    rows = zip(remaining.url, remaining['data-itemid'], remaining.colors)
    for page_url, item_id, color_names in tqdm(rows, total=len(remaining)):
        get_images(page_url, item_id, color_names)

In [6]:
# Listing page to scrape. scrape_images collects the product tiles, pickles the
# table to df.pkl, downloads the images and returns the table.
URL = r"https://www.ralphlauren.nl/en/men/clothing/hoodies-sweatshirts/10204?webcat=men%7Cclothing%7Cmen-clothing-hoodies-sweatshirts"
df = scrape_images(URL)
df

Getting items info
0
32
64
96
'NoneType' object is not subscriptable probably no colors
128
160
'NoneType' object is not subscriptable probably no colors
'NoneType' object is not subscriptable probably no colors
192
224
256
288
no tiles
Saving images


 96%|█████████▋| 264/274 [54:23<02:03, 12.36s/it]  


ConnectionError: HTTPSConnectionPool(host='www.rlmedia.io', port=443): Max retries exceeded with url: /is/image/PoloGSI/s7-1473088_alternate10?$rl_pdp_mob_zoom$ (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000002CB3E149AE0>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))

In [16]:
# Resume the image download from the table saved in df.pkl. The previous run
# stopped with a ConnectionError at 264/274, so restart at position 263.
df = pd.read_pickle('df.pkl')
scrape_images_frompos(df, 263)

100%|██████████| 11/11 [01:43<00:00,  9.45s/it]


In [17]:
# Display the product table.
df

Unnamed: 0,title,data-itemid,url,colors
0,Triple-Pony Plaid-Print Fleece Hoodie,624792,https://www.ralphlauren.nl/en/triple-pony-plai...,[]
1,Polo Ski Fleece Hoodie,624775,https://www.ralphlauren.nl/en/polo-ski-fleece-...,[]
2,The RL Fleece Hoodie,490877v1,https://www.ralphlauren.nl/en/the-rl-fleece-ho...,"[Holiday Red, Athletic Green, Harrison Blue]"
3,Double-Knit Pullover,624750v1,https://www.ralphlauren.nl/en/double-knit-pull...,"[Army Olive, Aviator Navy, Montana Khaki]"
4,Snowflake Estate-Rib Jumper,590613v1,https://www.ralphlauren.nl/en/snowflake-estate...,"[Heritage Snowflake New Fo, Heritage Snowflake..."
...,...,...,...,...
269,Colour-Blocked Stretch Jersey Pullover,625356,https://www.ralphlauren.nl/en/colour-blocked-s...,[]
270,Stretch Jersey Half-Zip Pullover,639982,https://www.ralphlauren.nl/en/stretch-jersey-h...,[]
271,Cotton-Blend-Fleece Hoodie,458755,https://www.ralphlauren.nl/en/cotton-blend-fle...,[]
272,Polo Bear Color-Blocked Fleece Hoodie,647816,https://www.ralphlauren.nl/en/polo-bear-color-...,[]
