In [1]:
import os
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import os
import json

## Helpers

In [2]:
def load_names_from_file(file_path):
    """
    Load names from a text file and return them as a list.

    Parameters:
    file_path (str): The path to the text file containing the names.

    Returns:
    list: A list of names.
    """
    names_list = []

    # Open and read the file
    with open(file_path, 'r') as file:
        # Read each line in the file
        for line in file:
            # Strip leading/trailing whitespace and add to the list
            names_list.append(line.strip())

    return names_list

In [3]:
HEADERS = {
    "Accept": "*/*",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "en-GB,en;q=0.9",
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.4.1 Safari/605.1.15"
}

def get_images_from_page(keyword, page_num, family='creative'):
    assert family in ['creative', 'editorial'], "Family must be either 'creative' or 'editorial'"
    keyword = keyword.replace(' ', '%20')
    url = f"https://www.gettyimages.nl/search/2/image?family={family}&page={page_num}&phrase={keyword}&sort=best"
    print(f"Scraping: {url}")
    response = requests.get(url, headers=HEADERS)
    soup = BeautifulSoup(response.text, 'html.parser')

    images_info = []
    picture_tags = soup.find_all('picture')
    img_tags = soup.find_all('img')

    # for picture_tag in picture_tags:
    #     img_tag = picture_tag.find('img')
    #     if img_tag:
    #         image_url = img_tag.get('src')
    #         alt_text = img_tag.get('alt')
    #         if image_url and alt_text:
    #             images_info.append((image_url, alt_text))

    for img_tag in img_tags:
        image_url = img_tag.get('src')
        alt_text = img_tag.get('alt')
        if image_url and alt_text:
            images_info.append((image_url, alt_text))

    return images_info

def download_image(image_url, folder_name, keyword, image_id):
    try:
        response = requests.get(image_url, stream=True)
        if response.status_code == 200:
            timestamp = datetime.now().strftime("%d%m%Y")
            keyword_modified = keyword.replace(' ', '_')
            image_name = f"{keyword_modified}_{timestamp}_{image_id}.jpg"
            image_path = os.path.join(folder_name, image_name)
            
            with open(image_path, 'wb') as file:
                for chunk in response.iter_content(1024):
                    file.write(chunk)
            print(f"Downloaded: {image_name}")
            return image_name
        else:
            print(f"Failed to download: {image_url}")
            return None
    except Exception as e:
        print(f"Error downloading {image_url}: {str(e)}")
        return None

def save_metadata_to_json(metadata, folder_name):
    json_file_path = os.path.join(folder_name, "metadata.json")
    with open(json_file_path, 'w') as json_file:
        json.dump(metadata, json_file, indent=4)
    print(f"Metadata saved to {json_file_path}")

def scrape_images(keyword, max_pages=5, family='creative'):
    folder_name = keyword.replace(' ', '_')
    folder_name = "x_" + folder_name
    if not os.path.exists(folder_name):
        os.makedirs(folder_name)

    image_id = 1
    metadata = {"images": []}
    for page in range(1, max_pages + 1):
        print(f"Scraping page {page}")
        images_info = get_images_from_page(keyword, page, family=family)

        for image_url, alt_text in images_info:
            image_name = download_image(image_url, folder_name, keyword, image_id)
            if image_name:
                metadata["images"].append({
                    "filename": image_name,
                    "caption": alt_text,
                    "match": False,
                    "design": "NONE",
                    "score": 0.0,
                    "URL": image_url
                })
                image_id += 1

            save_metadata_to_json(metadata, folder_name)

In [5]:
file_path = 'names.txt'
names = load_names_from_file(file_path)

for name in names:
    scrape_images(name, max_pages=5, family='editorial')
    print(f"Scraped images for {name}")

Scraping page 1
Scraping: https://www.gettyimages.nl/search/2/image?family=editorial&page=1&phrase=Denzel%20Washington&sort=best
Error downloading /components/global-nav/static/static/GettyHeaderLogo-4c344fa4f9e47c257bea.svg: Invalid URL '/components/global-nav/static/static/GettyHeaderLogo-4c344fa4f9e47c257bea.svg': No scheme supplied. Perhaps you meant https:///components/global-nav/static/static/GettyHeaderLogo-4c344fa4f9e47c257bea.svg?
Metadata saved to x_Denzel_Washington/metadata.json
Error downloading /components/global-nav/static/static/UnsplashForBrands-00c7af5aed68b4b7f3f3.svg: Invalid URL '/components/global-nav/static/static/UnsplashForBrands-00c7af5aed68b4b7f3f3.svg': No scheme supplied. Perhaps you meant https:///components/global-nav/static/static/UnsplashForBrands-00c7af5aed68b4b7f3f3.svg?
Metadata saved to x_Denzel_Washington/metadata.json
Downloaded: Denzel_Washington_03092024_1.jpg
Metadata saved to x_Denzel_Washington/metadata.json
Downloaded: Denzel_Washington_0309

KeyboardInterrupt: 

In [None]:
# # Keyword to search for
# keyword = "the beatles magical mystery tour"

# # Number of pages to scrape
# max_pages = 1

# # Scrape images
# scrape_images(keyword, max_pages, family='editorial')
