In [4]:
import requests
from bs4 import BeautifulSoup
import os
import re


HEADERS = {
    "Accept": "*/*",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "en-GB,en;q=0.9",
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.4.1 Safari/605.1.15"
}

In [9]:
def get_images_from_page(keyword, page_num, family='creative'):

    # Assert the family is either 'creative' or 'editorial'
    assert family in ['creative', 'editorial'], "Family must be either 'creative' or 'editorial'"

    # Modify the keyword to be used in the URL
    keyword = keyword.replace(' ', '%20')

    url = f"https://www.gettyimages.nl/search/2/image?family={family}&page={page_num}&phrase={keyword}&sort=best"
    print(f"Scraping: {url}")
    response = requests.get(url, headers=HEADERS)
    soup = BeautifulSoup(response.text, 'html.parser')

    images_info = []

    # Find all possible <img> tags inside <picture> or standalone <img> tags
    picture_tags = soup.find_all('picture')
    img_tags = soup.find_all('img')

    for picture_tag in picture_tags:
        img_tag = picture_tag.find('img')
        if img_tag:
            image_url = img_tag.get('src')
            alt_text = img_tag.get('alt')
            if image_url and alt_text:
                images_info.append((image_url, alt_text))

    for img_tag in img_tags:
        image_url = img_tag.get('src')
        alt_text = img_tag.get('alt')
        if image_url and alt_text:
            images_info.append((image_url, alt_text))

    return images_info

def download_image(image_url, alt_text, folder_name):
    try:
        response = requests.get(image_url, stream=True)
        if response.status_code == 200:
            image_name = alt_text.replace(" ", "_").replace("/", "_")[:50] + ".jpg"
            image_path = os.path.join(folder_name, image_name)
            
            with open(image_path, 'wb') as file:
                for chunk in response.iter_content(1024):
                    file.write(chunk)
            print(f"Downloaded: {image_name}")
        else:
            print(f"Failed to download: {image_url}")
    except Exception as e:
        print(f"Error downloading {image_url}: {str(e)}")

def scrape_images(keyword, max_pages=5, family='creative'):
    folder_name = keyword.replace(' ', '_')
    if not os.path.exists(folder_name):
        os.makedirs(folder_name)

    for page in range(1, max_pages + 1):
        print(f"Scraping page {page}")
        images_info = get_images_from_page(keyword, page, family=family)

        for image_url, alt_text in images_info:
            download_image(image_url, alt_text, folder_name)

In [10]:
keyword = "dress"  # Replace with your keyword
max_pages = 100  # Adjust the number of pages to scrape
scrape_images(keyword, max_pages, family='editorial')


Scraping page 1
Scraping: https://www.gettyimages.nl/search/2/image?family=editorial&page=1&phrase=dress&sort=best
Downloaded: Adina_Schjolden-Pedersen_wears_white_dress,_blazer.jpg
Downloaded: Adina_Schjolden-Pedersen_wears_white_dress,_blazer.jpg
Downloaded: Emilie_Billington_wears_grey_dress,_black_leather_.jpg
Downloaded: Emilie_Billington_wears_grey_dress,_black_leather_.jpg
Downloaded: Emilie_Billington_wears_grey_dress,_black_leather_.jpg
Downloaded: Emilie_Billington_wears_grey_dress,_black_leather_.jpg
Downloaded: Emilie_Billington_wears_grey_dress,_black_leather_.jpg
Downloaded: Emilie_Billington_wears_grey_dress,_black_leather_.jpg
Downloaded: Emilie_Billington_wears_grey_dress,_black_leather_.jpg
Downloaded: Emilie_Billington_wears_grey_dress,_black_leather_.jpg
Downloaded: Guest_wears_white_dress,_jacket,_pink_Valentino_ba.jpg
Downloaded: Guest_wears_white_dress,_jacket,_pink_Valentino_ba.jpg
Downloaded: Guest_wears_white_dress,_jacket,_pink_Valentino_ba.jpg
Downloaded: Gu

KeyboardInterrupt: 