In [25]:
import os
import re
import shutil
import time
from urllib.parse import quote

import cloudscraper
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

In [26]:
def tile_to_dict(tile):
    """Flatten one product tile element into a plain dict.

    Reads the ``title`` attribute of the tile's first anchor together with the
    tile's own ``data-itemid`` and ``data-monetate-producturl`` attributes.
    Colour names are the ``alt`` attribute of the ``img`` found inside each
    ``li`` of the tile; if that lookup raises for any entry, the error is
    printed and ``colors`` is stored as ``None`` for the whole tile.

    Returns:
        dict with keys ``title``, ``data-itemid``, ``url`` and ``colors``
        (in that order).
    """
    first_link = tile.findAll('a')[0]
    record = {
        'title': first_link['title'],
        'data-itemid': tile['data-itemid'],
        'url': tile['data-monetate-producturl'],
    }
    try:
        swatch_names = []
        for entry in tile.findAll('li'):
            # entry.find('img') is None when the <li> holds no image, which
            # makes the subscript raise and sends us to the fallback below.
            swatch_names.append(entry.find('img')['alt'])
        record['colors'] = swatch_names
    except Exception as e:
        print(e, 'probably no colors')
        record['colors'] = None
    return record

### Parsing tiles

In [36]:
# Download destinations; save_images picks one from keywords in the image URL.
FOLDER_TEST = "deep"                 # fallback when neither keyword matches
FOLDER_PERSON = "deep/person"        # URLs containing 'alternate'
FOLDER_NO_PERSON = "deep/no_person"  # URLs containing 'lifestyle'

def save_images(img_urls, id):
    """Download every image URL and write it to disk as ``<id>_<index>.jpg``.

    The destination folder is chosen from the URL text: ``'alternate'`` goes
    to ``FOLDER_PERSON``, ``'lifestyle'`` to ``FOLDER_NO_PERSON`` and anything
    else to ``FOLDER_TEST``. Missing folders are created on demand. A response
    with an error status is reported and skipped rather than being written
    out as a bogus ``.jpg``; the index of a skipped URL is not reused, so the
    remaining filenames stay stable.

    Args:
        img_urls: iterable of image URL strings.
        id: filename prefix (shadows the builtin; name kept for callers).
    """
    scraper = cloudscraper.create_scraper()
    for index, img_url in enumerate(img_urls):
        if 'alternate' in img_url:
            folder = FOLDER_PERSON
        elif 'lifestyle' in img_url:
            folder = FOLDER_NO_PERSON
        else:
            folder = FOLDER_TEST

        response = scraper.get(img_url)
        if not response.ok:
            # Do not persist an HTML error page under an image filename.
            print(f'skipping {img_url}: HTTP {response.status_code}')
            continue

        # open() does not create parent directories, so make sure it exists.
        os.makedirs(folder, exist_ok=True)
        with open(os.path.join(folder, f'{id}_{index}.jpg'), "wb") as f:
            f.write(response.content)


def _fetch_image_urls(scraper, page_url):
    """Return the ``data-highres-images`` value of each zoomable picture on a page."""
    response = scraper.get(page_url)
    page = BeautifulSoup(response.content, 'html.parser')
    pictures = page.findAll('picture', attrs={'class': 'swiper-zoomable'})
    return [picture['data-highres-images'] for picture in pictures]


def get_images(url: str, id: str, colors: list) -> None:
    """Download the product images for one item, once per colour when given.

    With a non-empty ``colors`` list, the value of the ``colorname=`` query
    parameter in ``url`` is replaced by each colour in turn and the images of
    that page are saved under the prefix ``<id>_<colour with spaces as _>``.
    The colour is percent-encoded before it goes into the URL. If ``url`` has
    no ``colorname=`` parameter the substitution leaves it unchanged. With no
    colours, the page at ``url`` is fetched once and saved under ``id``.

    Args:
        url: product page URL.
        id: filename prefix passed on to ``save_images``.
        colors: list of colour names, or ``None`` / empty for a single fetch.
    """
    scraper = cloudscraper.create_scraper()
    if not colors:
        save_images(_fetch_image_urls(scraper, url), id)
        return

    for color in colors:
        # The encoded name must be the one used in the URL; previously it was
        # computed and then overwritten with the raw name.
        color_param = 'colorname=' + quote(color, safe='')
        color_page = re.sub(r'colorname=([^&]+)(?=$|&)', color_param, url)
        save_images(
            _fetch_image_urls(scraper, color_page),
            id + '_' + color.replace(' ', '_'),
        )


In [28]:
def scrape_images(url, page_size=32, max_pages=999):
    """Crawl a paginated product listing, then download every item's images.

    Listing pages are requested with a ``start=<offset>`` query parameter
    until a page yields no ``.product-tile`` elements, a 403 is returned, or
    ``max_pages`` pages have been fetched. Tiles are converted with
    ``tile_to_dict``, de-duplicated on ``data-itemid`` (first occurrence
    wins), pickled to ``df.pkl`` in the working directory, and each remaining
    row is passed to ``get_images``.

    Args:
        url: listing URL; may or may not already contain a query string.
        page_size: offset step between consecutive pages.
        max_pages: upper bound on the number of listing pages requested.

    Returns:
        DataFrame of the de-duplicated items; an empty DataFrame (and no
        pickle, no downloads) when nothing was scraped.
    """
    print('Getting items info')
    scraper = cloudscraper.create_scraper()
    tiles = []
    for page in range(max_pages):
        start = page * page_size
        print(start)
        # Build every page URL from the untouched listing URL. Reassigning
        # `url` here used to stack up start= parameters page after page.
        if '?' in url:
            page_url = url.replace('?', f'?start={start}&', 1)
        else:
            page_url = f'{url}?start={start}'
        response = scraper.get(page_url)
        if response.status_code == 403:
            print('auth error')
            break
        soup = BeautifulSoup(response.content, 'html.parser')
        page_tiles = [tile_to_dict(tile) for tile in soup.select('.product-tile')]
        if not page_tiles:
            print('no tiles')
            break
        tiles.extend(page_tiles)

    if not tiles:
        # drop_duplicates(subset=['data-itemid']) raises KeyError on a frame
        # without that column, so stop here when nothing was found.
        return pd.DataFrame()

    df = pd.DataFrame(tiles)
    df = df.drop_duplicates(subset=['data-itemid'], keep='first').reset_index(drop=True)
    df.to_pickle('df.pkl')
    print('Saving images')
    rows = zip(df.url, df['data-itemid'], df.colors)
    for product_url, item_id, colors in tqdm(rows, total=len(df)):
        get_images(product_url, item_id, colors)
    return df

In [29]:
# Listing page to crawl: men's hoodies & sweatshirts on ralphlauren.nl.
URL = r"https://www.ralphlauren.nl/en/men/clothing/hoodies-sweatshirts/10204?webcat=men%7Cclothing%7Cmen-clothing-hoodies-sweatshirts"
# Run the crawl (item metadata first, then image downloads); the bare `df`
# on the last line renders the resulting frame as the cell output.
df = scrape_images(url=URL)
df

Getting items info
0
32
64
96
'NoneType' object is not subscriptable probably no colors
128
160
'NoneType' object is not subscriptable probably no colors
'NoneType' object is not subscriptable probably no colors
192
224
256
288
no tiles
Saving images


  1%|          | 2/272 [00:36<1:21:36, 18.13s/it]


KeyboardInterrupt: 