In [7]:
import requests
import os
import json
import mimetypes
import logging
from urllib.parse import urlparse

In [2]:
# Route log records to both a persistent file and the notebook's output stream.
logging.basicConfig(
    handlers=[logging.FileHandler("image_fetch.log"), logging.StreamHandler()],
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

In [3]:
# The API key is a credential: read it from the environment only and never
# embed a fallback value in the notebook.
# NOTE(review): any key that was ever committed with this notebook should be rotated.
API_KEY = os.getenv('API_KEY')
# The search engine id is an identifier rather than a secret, so a default is kept.
SEARCH_ENGINE_ID = os.getenv('SEARCH_ENGINE_ID', 'b6f2650fd0921483a')

if not API_KEY:
    logging.warning("API_KEY environment variable is not set; image search requests will fail.")

In [9]:
# The Custom Search JSON API accepts `num` values from 1 to 10 only.
MAX_RESULTS_PER_REQUEST = 10


def fetch_images_with_categories(category, num_results, start=1, timeout=10):
    """Query the Google Custom Search JSON API for image URLs.

    Args:
        category: Search query string.
        num_results: Number of URLs wanted. Values above 10 are clamped to 10
            because the API rejects larger `num` values.
        start: 1-based index of the first result to return (for paging).
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A list of image URL strings. The list is empty when `num_results` is
        below 1, when the request fails, or when the response has no items.
    """
    if num_results < 1:
        logging.warning(f"No images requested for query: {category}")
        return []

    url = "https://www.googleapis.com/customsearch/v1"
    params = {
        'key': API_KEY,
        'cx': SEARCH_ENGINE_ID,
        'q': category,
        'searchType': 'image',
        'num': min(num_results, MAX_RESULTS_PER_REQUEST),
        'start': start,
    }

    try:
        response = requests.get(url, params=params, timeout=timeout)
        response.raise_for_status()
        data = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # HTTP error messages include the request URL, which carries the API
        # key as a query parameter; keep the key out of the log file.
        message = str(e)
        if API_KEY:
            message = message.replace(API_KEY, '***')
        logging.error(f"API request failed for {category}: {message}")
        return []

    # Skip result items that do not carry a 'link' field instead of raising.
    image_urls = [item['link'] for item in data.get('items', []) if item.get('link')]
    logging.info(f"Fetched {len(image_urls)} image URLs for query: {category}")
    return image_urls
    
def fetch_unique_images(query, num_results, file_path="retrieved_urls.json"):
    """Fetch up to `num_results` image URLs that have not been retrieved before.

    URLs already stored in `file_path` are skipped, and so are URLs already
    collected during this call, so the returned list has no duplicates.

    Args:
        query: Search query string.
        num_results: Maximum number of new URLs to collect.
        file_path: JSON file holding the history of retrieved URLs.

    Returns:
        The list of newly collected URLs. They are also merged into `file_path`.
    """
    seen_urls = load_previous_urls(file_path)
    new_urls = []

    while len(new_urls) < num_results:
        remaining = num_results - len(new_urls)
        fetched_urls = fetch_images_with_categories(query, remaining)

        added = 0
        for url in fetched_urls:
            if url in seen_urls:
                continue
            # Track URLs gathered in this run too: a repeated query returns
            # the same results, which would otherwise be appended again.
            seen_urls.add(url)
            new_urls.append(url)
            added += 1

        # Stop when a request contributes nothing new; retrying would loop forever.
        if not added:
            break

    save_new_urls(new_urls, file_path)
    return new_urls

def fetch_images(categories, getting_amount=0, total_limit=1000, file_path="retrieved_urls.json"):
    """Fetch unique image URLs for every category.

    Args:
        categories: Iterable of search query strings. May be empty.
        getting_amount: Number of unique URLs to request per category.
        total_limit: Accepted for backward compatibility. It is not applied to
            the number of URLs fetched.
            NOTE(review): confirm whether a per-run cap was intended.
        file_path: JSON file holding the history of retrieved URLs.

    Returns:
        A dict mapping each category to the list of new URLs fetched for it.
        The dict is empty when `categories` is empty.
    """
    # No per-category quota is derived from total_limit: the old division by
    # len(categories) was never used and crashed on an empty category list.
    all_images = {}

    for category in categories:
        print(f"Fetching {getting_amount} unique images for category: {category}")
        images = fetch_unique_images(category, getting_amount, file_path)
        all_images[category] = images

    return all_images

def load_previous_urls(file_path="retrieved_urls.json"):
    """Return the URLs stored in the JSON file as a set.

    An empty set is returned when `file_path` does not exist.
    """
    if not os.path.exists(file_path):
        return set()

    with open(file_path, "r") as handle:
        stored_urls = json.load(handle)
    return set(stored_urls)

def save_new_urls(new_urls, file_path="retrieved_urls.json"):
    """Merge `new_urls` into the URL history stored at `file_path`.

    The file is rewritten with the union of its previous content and `new_urls`.
    """
    known_urls = load_previous_urls(file_path)
    merged_urls = known_urls | set(new_urls)
    with open(file_path, "w") as handle:
        json.dump(list(merged_urls), handle)

def _image_file_name(url, content_type):
    """Build a local file name for `url`, adding an extension when it has none."""
    # A URL whose path ends in '/' has an empty basename; use a generic stem.
    file_name = os.path.basename(urlparse(url).path) or "image"

    if not os.path.splitext(file_name)[1]:
        mime_type = content_type.split(';')[0].strip().lower() if content_type else ''
        # guess_extension returns None for unknown types; fall back to '.jpg'.
        ext = mimetypes.guess_extension(mime_type) if mime_type else None
        file_name += ext or '.jpg'

    return file_name


def download_images(json_file="retrieved_urls.json", folder="images", timeout=30):
    """Download every URL listed in `json_file` into `folder`.

    Failures are logged per URL and do not stop the remaining downloads.

    Args:
        json_file: JSON file containing a list of image URLs.
        folder: Destination directory. It is created if it does not exist.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        None. Returns early (after logging) when `json_file` is missing or
        contains no URLs.
    """
    if not os.path.exists(json_file):
        logging.error(f"JSON file {json_file} not found.")
        return

    with open(json_file, "r") as file:
        image_urls = json.load(file)

    if not image_urls:
        logging.warning(f"No URLs found in {json_file}.")
        return

    os.makedirs(folder, exist_ok=True)

    for url in image_urls:
        try:
            # The context manager releases the connection even when a step fails.
            with requests.get(url, stream=True, allow_redirects=True, timeout=timeout) as response:
                response.raise_for_status()
                file_name = _image_file_name(url, response.headers.get('Content-Type', ''))
                # Read the body before opening the target file so a failed
                # transfer does not leave an empty file behind.
                payload = response.content

            filepath = os.path.join(folder, file_name)
            with open(filepath, 'wb') as file:
                file.write(payload)

            logging.info(f"Downloaded: {filepath}")

        except Exception as e:
            # Best effort: log the failure and continue with the next URL.
            logging.error(f"Failed to download {url}: {e}")

In [5]:
# Search queries (one per category), URLs requested per category,
# and the JSON file that records every URL retrieved so far.
categories = [
    "cats",
    "dogs",
    "birds",
    "horses",
    "wildlife",
    "flowers",
    "cars",
    "landscapes",
    "food",
]
num_results = 10
file_path = "retrieved_urls.json"

In [7]:
# Collect new image URLs for each configured category.
image_urls_by_category = fetch_images(
    categories=categories,
    getting_amount=num_results,
    total_limit=1000,
    file_path=file_path,
)

Fetching 5 unique images for category: cats


2025-01-14 09:51:31,415 - INFO - Fetched 5 image URLs for query: cats


Fetching 5 unique images for category: dogs


2025-01-14 09:51:31,962 - INFO - Fetched 5 image URLs for query: dogs


In [11]:
# Download every URL recorded in the history file into a local folder.
category_folder = "images"
download_images(
    json_file=file_path,
    folder=category_folder,
)

2025-01-14 10:06:56,695 - INFO - Downloaded: images\domestic-dog_thumb_3x2.jpg
2025-01-14 10:06:57,237 - INFO - Downloaded: images\GettyImages-1445770180.jpg
2025-01-14 10:06:58,786 - INFO - Downloaded: images\GettyImages-598175960-cute-dog-headshot.jpg
2025-01-14 10:07:01,186 - INFO - Downloaded: images\VIER%20PFOTEN_2020-10-07_00138-2890x2000-1920x1329.jpg
2025-01-14 10:07:02,091 - INFO - Downloaded: images\Orange-colored-cat-yawns-displaying-teeth.jpg
2025-01-14 10:07:03,286 - INFO - Downloaded: images\NationalGeographic_2572187_square.jpg
2025-01-14 10:07:03,502 - INFO - Downloaded: images\two-different-breeds-of-cats-side-by-side-outdoors-in-the-garden.jpg
2025-01-14 10:07:04,085 - INFO - Downloaded: images\NationalGeographic_2572187_3x4.jpg
2025-01-14 10:07:04,123 - ERROR - Failed to download https://dogsforbetterlives.org/wp-content/uploads/2022/12/trains-both-shelter-dogs-and-purpose-bred-dogs-as-service-dogs.webp: ('Connection aborted.', ConnectionResetError(10054, 'An existin