In [1]:
import os
import json
import logging
import requests
from urllib.parse import urlparse
import mimetypes

In [2]:
# Configure the root logger once for the whole notebook: INFO and above,
# timestamped, sent both to a log file and to the cell output.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("image_fetch.log"),  # persistent copy of the run log
        logging.StreamHandler()               # echo to stderr so messages show under the cell
    ]
)

In [3]:
# Credentials are read from the environment only. A key must never be embedded
# in the notebook: notebooks are shared and committed, and any key that was
# previously hardcoded here should be treated as leaked and revoked.
API_KEY = os.getenv('API_KEY')
# The search-engine id identifies which Programmable Search Engine to query;
# it can be overridden through the environment.
SEARCH_ENGINE_ID = os.getenv('SEARCH_ENGINE_ID', 'b6f2650fd0921483a')

if not API_KEY:
    # Warn instead of raising so the remaining cells can still be defined;
    # API calls made without a key are rejected by the service and logged.
    logging.warning("API_KEY environment variable is not set; Custom Search requests will fail.")

In [4]:
# Presumably the path of a combined JSON list of every fetched URL.
# NOTE(review): not referenced by any cell visible here — confirm it is still needed.
MASTER_FILE = "all_fetched_urls.json"

In [5]:
def fetch_images_with_categories(category, num_results, state_file="page_state.txt", timeout=10):
    """Fetch image URLs for a category from the Google Custom Search API.

    Pagination resumes from the start index stored for ``category`` in
    ``state_file``; the advanced index is written back afterwards, so repeated
    calls walk forward through the results instead of re-fetching the same page.

    Args:
        category (str): Search query.
        num_results (int): Maximum number of image URLs to return.
        state_file (str, optional): JSON file holding the per-category start
            index. Defaults to "page_state.txt".
        timeout (float, optional): Seconds to wait for each API response before
            giving up. Defaults to 10.

    Returns:
        list[str]: Up to ``num_results`` image URLs. If a request fails after
        some pages were already fetched, the URLs collected so far are returned
        (and the page state is saved); if nothing was fetched, an empty list.
    """
    url = "https://www.googleapis.com/customsearch/v1"

    # Load the last page state; a category never seen before starts at index 1.
    page_state = load_page_state(state_file)
    first_start = start = page_state.get(category, 1)
    image_urls = []

    # NOTE(review): the Custom Search JSON API documents a cap on how deep a
    # query can be paged; once the persisted start index passes that cap the
    # request is expected to be rejected — confirm and consider resetting state.
    try:
        while len(image_urls) < num_results:
            params = {
                'key': API_KEY,
                'cx': SEARCH_ENGINE_ID,
                'q': category,
                'searchType': 'image',
                'num': min(10, num_results - len(image_urls)),  # Fetch up to 10 results at a time
                'start': start,
            }
            # Without a timeout a stalled connection would block the notebook forever.
            response = requests.get(url, params=params, timeout=timeout)
            response.raise_for_status()
            items = response.json().get('items', [])
            if not items:
                break  # the API has no further results for this query
            image_urls.extend(item['link'] for item in items)
            start += len(items)
    except requests.exceptions.RequestException as e:
        logging.error(f"API request failed for {category}: {e}")
        if start == first_start:
            # Nothing was fetched, so there is no progress worth recording.
            return []
        # Otherwise fall through: keep the pages already paid for and record
        # how far pagination got, so the next run resumes after them.

    # Save the updated page state
    page_state[category] = start
    save_page_state(page_state, state_file)

    return image_urls[:num_results]

def load_page_state(state_file):
    """Read the saved pagination state.

    Args:
        state_file (str): Path to the JSON state file.

    Returns:
        The decoded JSON content of ``state_file``, or an empty dict when the
        file does not exist.
    """
    if not os.path.exists(state_file):
        return {}
    with open(state_file, "r") as handle:
        return json.loads(handle.read())

def save_page_state(page_state, state_file):
    """Save the page state to a file.

    Args:
        page_state (dict): Pagination state to persist; must be JSON-serializable.
        state_file (str): Path of the JSON file to write.

    Raises:
        TypeError: If ``page_state`` cannot be serialized to JSON. The existing
            file, if any, is left untouched in that case.
    """
    # Serialize before opening: mode "w" truncates the file immediately, so a
    # serialization error after the open would wipe the previously saved state.
    payload = json.dumps(page_state)
    with open(state_file, "w") as file:
        file.write(payload)


def save_urls_to_file(urls, file_path):
    """Save a list of URLs to a JSON file.

    Args:
        urls (list[str]): List of URLs to save.
        file_path (str): Path to the JSON file to save the URLs to.

    Notes:
        The URLs are merged with any URLs already stored in the file and
        de-duplicated. Existing entries keep their position and new ones are
        appended in the order given, so the file content is deterministic.
    """
    existing_urls = load_urls_from_file(file_path)
    # dict.fromkeys removes duplicates while preserving first-seen order
    # (a plain set would reshuffle the file on every save).
    all_urls = list(dict.fromkeys([*existing_urls, *urls]))
    # Serialize before opening: mode "w" truncates the file immediately, so a
    # serialization error after the open would lose the stored URLs.
    payload = json.dumps(all_urls)
    with open(file_path, "w") as file:
        file.write(payload)

def load_urls_from_file(file_path):
    """Read previously saved URLs from a JSON file.

    Args:
        file_path (str): Location of the JSON file to read.

    Returns:
        The decoded JSON content of ``file_path`` (a list of URL strings when
        written by ``save_urls_to_file``), or an empty list when the file does
        not exist.
    """
    if not os.path.exists(file_path):
        return []
    with open(file_path, "r") as source:
        return json.loads(source.read())

def _image_file_name(url, content_type):
    """Derive a local file name for ``url``, using ``content_type`` when the URL path has no extension."""
    # A path ending in "/" has an empty basename; use a placeholder stem so the
    # result is a regular file name rather than just an extension.
    file_name = os.path.basename(urlparse(url).path) or "image"
    if not os.path.splitext(file_name)[1]:
        mime_type = content_type.split(';')[0].strip()
        # guess_extension returns None for empty or unrecognised types; fall
        # back to .jpg instead of failing on ``str + None``.
        file_name += mimetypes.guess_extension(mime_type) or '.jpg'
    return file_name


def download_images_from_urls(urls, folder, timeout=10):
    """Download images from a list of URLs and save them to a specified folder.

    Args:
        urls (list[str]): List of image URLs to download.
        folder (str): Path to the folder where images will be saved.
        timeout (float, optional): Seconds to wait on each connection/read
            before giving up on that URL. Defaults to 10.

    The folder is created if it does not exist. Each image is saved under the
    last component of its URL path; when that name has no extension, one is
    derived from the response's Content-Type header, defaulting to ``.jpg``.
    A failure on one URL is logged and does not stop the remaining downloads.
    """
    os.makedirs(folder, exist_ok=True)
    for url in urls:
        try:
            # stream=True lets the status be checked before the body is pulled;
            # the context manager releases the connection in every case.
            with requests.get(url, stream=True, allow_redirects=True, timeout=timeout) as response:
                response.raise_for_status()
                file_name = _image_file_name(url, response.headers.get('Content-Type', ''))
                # Read the whole body first so a transfer error never leaves a
                # partially written file on disk.
                content = response.content
            filepath = os.path.join(folder, file_name)
            with open(filepath, 'wb') as file:
                file.write(content)
            logging.info(f"Downloaded: {filepath}")
        except Exception as e:
            # Best effort by design: record the failure and continue with the next URL.
            logging.error(f"Failed to download {url}: {e}")

def fetch_and_save_images(categories, num_results_per_category, base_folder="images", state_file="page_state.txt"):
    """Fetch image URLs per category, record the new ones, and download them.

    For each category the layout under ``base_folder`` is::

        <base_folder>/<category>/urls.json   # every URL seen so far
        <base_folder>/<category>/images/     # downloaded files

    Args:
        categories (list[str]): List of categories to search for.
        num_results_per_category (int): Number of results to fetch per category.
        base_folder (str, optional): Base folder where images will be saved. Defaults to "images".
        state_file (str, optional): File path used to persist the pagination state. Defaults to "page_state.txt".
    """
    os.makedirs(base_folder, exist_ok=True)
    for category in categories:
        category_folder = os.path.join(base_folder, category)
        urls_file = os.path.join(category_folder, "urls.json")
        images_folder = os.path.join(category_folder, "images")
        # Creating the leaf directory also creates category_folder.
        os.makedirs(images_folder, exist_ok=True)

        fetched_urls = fetch_images_with_categories(category, num_results_per_category, state_file)
        # Read the known URLs once (not once per fetched URL) and use a set so
        # each membership test is constant time.
        known_urls = set(load_urls_from_file(urls_file))
        # dict.fromkeys drops repeats within this batch while keeping order, so
        # the same image is not downloaded twice.
        new_urls = [url for url in dict.fromkeys(fetched_urls) if url not in known_urls]
        if new_urls:
            save_urls_to_file(new_urls, urls_file)
            download_images_from_urls(new_urls, images_folder)
        else:
            logging.info(f"No new images found for category: {category}.")

In [6]:
# Run parameters: one search query per category, and how many image results to
# request for each of them.
categories = ["cats", "dogs", "birds", "horses", "wildlife", "flowers", "cars", "landscapes", "food"]
num_results_per_category = 10
# NOTE(review): file_path is not used by any cell visible here — confirm it is still needed.
file_path="retrieved_urls.json"

In [7]:
# Run the fetch-and-download pipeline for every category, using the default
# output folder ("images") and pagination state file ("page_state.txt").
fetch_and_save_images(categories, num_results_per_category)

2025-01-15 21:15:45,812 - INFO - Fetching 10 images for category: cats
2025-01-15 21:15:46,476 - INFO - Fetched 10 image URLs for query: cats
2025-01-15 21:15:46,478 - INFO - Saved 10 new URLs for cats to received_images\cats\retrieved_urls_day_cats.json
2025-01-15 21:15:46,867 - INFO - Downloaded: received_images\cats\images\two-different-breeds-of-cats-side-by-side-outdoors-in-the-garden.jpg
2025-01-15 21:15:47,767 - INFO - Downloaded: received_images\cats\images\NationalGeographic_2572187_square.jpg
2025-01-15 21:15:49,698 - INFO - Downloaded: received_images\cats\images\Orange-colored-cat-yawns-displaying-teeth.jpg
2025-01-15 21:15:50,023 - INFO - Downloaded: received_images\cats\images\NationalGeographic_1468962_square.jpg
2025-01-15 21:15:51,613 - INFO - Downloaded: received_images\cats\images\VIER%20PFOTEN_2020-10-07_00138-2890x2000-1920x1329.jpg
2025-01-15 21:15:52,119 - INFO - Downloaded: received_images\cats\images\NationalGeographic_2572187_3x4.jpg
2025-01-15 21:15:53,511 - 