## Relevant Libraries

A note here: to scrape Google Images I'll be using Selenium, because the Google Images search results page is dynamic (more results are loaded as you scroll).

In [2]:
import requests
import hashlib
import io
import os
from PIL import Image
import time
from selenium import webdriver

### Helper Functions for fetching Google Image Urls

This will make more sense after reading through the rest of the code.

In [3]:
def initiate_selenium_and_search_google(web_driver_path, query:str):
    """Start a Chrome WebDriver and open the Google Images results for ``query``.

    Parameters
    ----------
    web_driver_path : str
        Path to the chromedriver executable, passed to ``webdriver.Chrome``
        as ``executable_path``.
    query : str
        Search term. It is URL-encoded before being placed in the search
        URL, so spaces and characters such as ``&`` or ``#`` cannot corrupt
        the request.

    Returns
    -------
    The started driver, already navigated to the results page. The caller
    owns the driver and should call ``quit()`` on it when done.

    Raises
    ------
    Any exception raised while loading the page is re-raised after the
    browser has been shut down.
    """
    # Function-scope import keeps this cell self-contained.
    from urllib.parse import quote_plus

    google_image_query= 'https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img'
    search_url = google_image_query.format(q=quote_plus(query))

    WEB_DRIVER = webdriver.Chrome(executable_path=web_driver_path)

    try:
        WEB_DRIVER.get(search_url)
    except Exception:
        # Do not leave an orphaned browser process behind if navigation fails.
        WEB_DRIVER.quit()
        raise

    return WEB_DRIVER

In [4]:
def scroll_to_bottom_of_page(web_driver, sleep_between_interaction):
    """Scroll the browser window to the end of the document, then pause.

    Parameters
    ----------
    web_driver
        Driver object exposing ``execute_script``.
    sleep_between_interaction : float
        Seconds to sleep after the scroll has been issued.
    """
    scroll_script = 'window.scrollTo(0, document.body.scrollHeight);'
    web_driver.execute_script(scroll_script)

    # Pause before the caller touches the page again.
    time.sleep(sleep_between_interaction)

In [5]:
def click_load_more_button(web_driver):
    """Click the element matching ``.mye4qd`` (the "load more" button) if present.

    Does nothing when no such element is on the page.

    Parameters
    ----------
    web_driver
        Driver object exposing ``find_elements_by_css_selector`` and
        ``execute_script``.
    """
    # The plural lookup returns an empty list when nothing matches, whereas
    # find_element_by_css_selector raises NoSuchElementException, which made
    # the original truthiness check unreachable in the "absent" case.
    load_more_buttons = web_driver.find_elements_by_css_selector('.mye4qd')

    if load_more_buttons:
        # Click through JavaScript rather than WebElement.click().
        web_driver.execute_script('document.querySelector(".mye4qd").click();')

## Main Helper Functions

### Fetch Image Urls From Google Search

The three helper functions defined above keep this function readable and easier to follow.

In [6]:
def fetch_google_images_urls(query:str, web_driver_path, sleep_between_interaction=1, number_of_images=100, max_wait_for_new_results=10):
    """Collect image URLs from a Google Images search.

    The results page is scrolled (and its "load more" button clicked) until
    at least ``number_of_images`` thumbnails are present or the page stops
    producing new ones. Each thumbnail is then clicked and the ``src`` of
    every ``img.n3VNCb`` element containing ``http`` is recorded.

    Parameters
    ----------
    query : str
        Search term.
    web_driver_path : str
        Path to the chromedriver executable.
    sleep_between_interaction : float
        Seconds to sleep after each scroll and after each thumbnail click.
    number_of_images : int
        Number of thumbnails to load and of URLs to collect before stopping.
    max_wait_for_new_results : float
        Seconds to keep scrolling without any new thumbnail appearing before
        giving up. Without this limit the loop never ends when the search
        has fewer results than ``number_of_images``.

    Returns
    -------
    set of str
        The collected URLs. It may hold fewer than ``number_of_images``
        entries, or slightly more when one click exposes several URLs.
    """
    WEB_DRIVER = initiate_selenium_and_search_google(web_driver_path, query=query)

    image_urls = set()
    results_start = 0
    number_results = 0
    # Pre-initialised so the extraction loop below is well-defined even when
    # the scrolling loop never runs (number_of_images <= 0).
    thumbnail_results = []

    try:
        last_progress = time.monotonic()

        while number_results < number_of_images:

            scroll_to_bottom_of_page(
                WEB_DRIVER,
                sleep_between_interaction=sleep_between_interaction)

            thumbnail_results = WEB_DRIVER.find_elements_by_css_selector('img.Q4LuWd')
            found_now = len(thumbnail_results)

            if found_now > number_results:
                last_progress = time.monotonic()
            elif time.monotonic() - last_progress >= max_wait_for_new_results:
                # Time-based rather than iteration-based so that a very small
                # sleep_between_interaction does not cause an early exit
                # while the page is still loading.
                number_results = found_now
                break

            number_results = found_now

            click_load_more_button(WEB_DRIVER)

        print(f'Found {number_results} search results.', end=' ')
        print(f'Extracting links from {results_start} to {number_results}')

        for image in thumbnail_results[results_start:number_results]:
            try:
                image.click()
                time.sleep(sleep_between_interaction)
            except Exception:
                # Best effort: a thumbnail that cannot be clicked is skipped.
                continue

            actual_images = WEB_DRIVER.find_elements_by_css_selector('img.n3VNCb')

            for actual_image in actual_images:
                # Read the attribute once instead of three round-trips.
                src = actual_image.get_attribute('src')
                if src and 'http' in src:
                    image_urls.add(src)

            if len(image_urls) >= number_of_images:
                break

        print('Extracted URLS from', len(image_urls), 'results')

    finally:
        # Always shut the browser down, including when an exception escapes.
        WEB_DRIVER.quit()

    return image_urls

### Take URLs of Google Images and download them

In [7]:
def download_images(folder_path:str, url:str, timeout=10):
    """Download one image from ``url`` and save it as a JPEG in ``folder_path``.

    The file name is the first 10 hex characters of the SHA-1 of the
    downloaded bytes plus ``.jpg``, so identical content maps to the same
    file. Failures are reported with ``print`` rather than raised.

    Parameters
    ----------
    folder_path : str
        Existing directory to write the image into.
    url : str
        Address of the image to download.
    timeout : float
        Seconds ``requests.get`` may wait before giving up, so that one
        unresponsive host cannot stall the whole run.

    Returns
    -------
    bool
        ``True`` if the image was saved, ``False`` otherwise.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Treat HTTP error statuses as download failures instead of trying
        # to decode an error page as an image.
        response.raise_for_status()
        image_content = response.content

    except Exception as e:
        print(f'Error could not download {url} - {e}')
        # Nothing was downloaded, so there is nothing to save. Falling
        # through would hit an undefined variable and print a second,
        # misleading error.
        return False

    try:
        image_file = io.BytesIO(image_content)
        image = Image.open(image_file).convert('RGB')
        file_path = os.path.join(folder_path, hashlib.sha1(image_content).hexdigest()[:10] + '.jpg')
        with open(file_path, 'wb') as f:
            image.save(f, 'JPEG', quality=85)

        print(f"SUCCESS - saved {url} - as file {file_path}")
        return True

    except Exception as e:
        print(f"ERROR - couldn't save {url} - {e}")
        return False


# Putting It All together

This function collects image URLs from Google Images, downloads each image, and saves it in a folder that I specify.

In [8]:
def search_and_download_google_images(search_term:str, driver_path:str, target_path='./image', number_of_images=5, sleep_between_interaction=0.001):
    """Search Google Images for ``search_term`` and download the results.

    Images are written to a sub-folder of ``target_path`` named after the
    lower-cased search term with spaces replaced by underscores.

    Parameters
    ----------
    search_term : str
        What to search for.
    driver_path : str
        Path to the chromedriver executable.
    target_path : str
        Parent directory for the per-search sub-folder.
    number_of_images : int
        Number of image URLs to request from the search.
    sleep_between_interaction : float
        Seconds to pause between browser interactions. The default matches
        the value that used to be hard-coded here; raise it if the page
        needs more time to load.
    """
    target_folder = os.path.join(target_path, '_'.join(search_term.lower().split(' ')))

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(target_folder, exist_ok=True)

    list_of_urls = fetch_google_images_urls(
        query=search_term,
        web_driver_path=driver_path,
        sleep_between_interaction=sleep_between_interaction,
        number_of_images=number_of_images)

    for url in list_of_urls:
        download_images(target_folder, url)

    # Counts every file in the folder, including ones left by earlier runs.
    print(len(os.listdir(target_folder)), 'pictures downloaded')

## Now let's finally scrape some images

Check your current directory and see if the images were scraped correctly

You're going to need a Selenium Chrome driver (chromedriver) to make these functions work.

In [9]:
# Path to the chromedriver executable. Set the CHROMEDRIVER_PATH environment
# variable to run the notebook on another machine without editing this cell;
# the machine-specific path below is only the fallback.
DRIVER_PATH = os.environ.get(
    'CHROMEDRIVER_PATH',
    r'C:/Users/Mostafa Elbannan/Desktop/Programming/Web_scraping/chromedriver')

### Let's download some dolphin images

In [9]:
# Ask for 600 dolphin images; target_path is left at its default, so files go to a 'dolphin' sub-folder of './image'.
search_and_download_google_images(search_term='dolphin', driver_path=DRIVER_PATH, number_of_images=600)

rlay-align=bottom%2Cleft&overlay-width=100p&overlay-base64=L2ltZy9zdGF0aWMvb3ZlcmxheXMvdGctZGVmYXVsdC5wbmc&enable=upscale&s=5cd7f8ca387971c0202d3b20cce38c93 - as file ./image\dolphin\d106c76fc0.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn%3AANd9GcT-JSZG_2rh_k4dELaQbjnCi0FDH0bJyc260A&usqp=CAU - as file ./image\dolphin\87bc09d6e4.jpg
SUCCESS - saved https://images.theconversation.com/files/46930/original/8x6wnb39-1398261971.jpg?ixlib=rb-1.1.0&q=45&auto=format&w=1200&h=1200.0&fit=crop - as file ./image\dolphin\a2a4d7d5c4.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn%3AANd9GcQhJqdRhdzrLBiC6lDfTj-GMTya-YRPJZjGcw&usqp=CAU - as file ./image\dolphin\42a993f458.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn%3AANd9GcQhrGAfw26tpIrE6MrPuiDe0P7qPEUreIY-pQ&usqp=CAU - as file ./image\dolphin\e460ab1e21.jpg
SUCCESS - saved https://fh-sites.imgix.net/sites/1580/2019/10/12072324/Common-Dolphin-0081_copyright-Dolphin-Safari.jpg?auto=compre

### Let's download some kitty images (I know you're excited)

In [10]:
# Ask for 600 cat images; target_path is left at its default, so files go to a 'cat' sub-folder of './image'.
search_and_download_google_images(search_term='cat', driver_path=DRIVER_PATH, number_of_images=600)

age\cat\6df6f2e657.jpg
SUCCESS - saved https://secureservercdn.net/72.167.241.134/d9c.726.myftpupload.com/wp-content/uploads/Tootles3.jpeg - as file ./image\cat\8bfd81943b.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn%3AANd9GcSPigFfT-RO7U6PSFXQM3FtuKM4sSRHQT21-g&usqp=CAU - as file ./image\cat\2f44f07b2b.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn%3AANd9GcTAyAVzvBflsZz-kInpwN-4z5InESkQhEvqQw&usqp=CAU - as file ./image\cat\fabd226eb2.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn%3AANd9GcT4_4BEcMcs6MKIbFHnlkaUQgGj2_SE8VgF4g&usqp=CAU - as file ./image\cat\7985359c98.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn%3AANd9GcR74-Z7DH1UbTmbwePmJbsopDdBl5Q5EEm2zw&usqp=CAU - as file ./image\cat\aa6b4ce889.jpg
SUCCESS - saved https://i.insider.com/5aa10ca0d877e618008b4678?width=1100&format=jpeg&auto=webp - as file ./image\cat\b8f364ac84.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn%3AANd9G