# Google Image Scraper

*This notebook scrapes images from Google Images using a set of search keywords.*

In [1]:
import os
import json
import requests
from tqdm import tqdm
import time

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
def starter(headless=False):
    """Start Chrome, reject Google's cookies and disable SafeSearch.

    The XPath selectors match the French button labels ('Tout refuser',
    'Désactiver'), so the pages are expected to be served in French.

    Args:
        headless: When true, Chrome is launched with the ``--headless`` flag.

    Returns:
        The ``webdriver.Chrome`` instance, left on the SafeSearch page.

    Raises:
        Exception: Any error raised while configuring the session is
            propagated after the browser has been shut down.
    """
    # Initialize driver
    options = Options()
    if headless:
        options.add_argument("--headless")
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=options)

    try:
        # Element lookups below wait up to 5 seconds before failing.
        driver.implicitly_wait(5)

        # Go to Google
        driver.get("https://www.google.com")

        # Reject cookies
        driver.find_element(By.XPATH, "//button[.//div[contains(text(), 'Tout refuser')]]").click()

        # Disable SafeSearch
        driver.get("https://www.google.com/safesearch")
        driver.find_element(By.XPATH, "//div[contains(text(), 'Désactiver')]").click()
    except BaseException:
        # The caller never receives the driver on failure, so it must be
        # closed here or the Chrome process would be leaked.
        driver.quit()
        raise

    return driver

In [3]:
def download_image(image_url, save_path):
    """Download a JPEG/PNG image to ``<save_path>.<extension>``.

    The query string is stripped from the URL before the extension is
    inspected and before the request is sent.

    Args:
        image_url: URL of the image; ``None`` or an empty string is rejected.
        save_path: Destination path without the file extension.

    Returns:
        True when the image was fetched with a successful HTTP status and
        written to disk; False for a missing URL, an unsupported extension,
        a network/HTTP error, or a file-system error.
    """
    # An <img> element may have no src attribute at all.
    if not image_url:
        return False

    url = image_url.split("?")[0]
    # Lower-cased so that ".JPG" / ".PNG" are accepted as well.
    extension = url.rsplit(".", 1)[-1].lower()
    if extension not in ("jpg", "jpeg", "png"):
        return False

    try:
        response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=3)
        # Do not store 4xx/5xx error pages as if they were images.
        response.raise_for_status()
        with open(f"{save_path}.{extension}", 'wb') as f:
            f.write(response.content)
    except (requests.RequestException, OSError):
        return False
    return True

In [4]:
def keyword_search(driver, keyword, category_path):
    """Search Google Images for ``keyword`` and download the results.

    Files are written as ``<category_path>/<keyword_with_underscores>_<n>``
    plus the image extension, where ``n`` counts successful downloads
    starting at 0.

    Args:
        driver: Selenium driver used to browse the result page.
        keyword: Search phrase.
        category_path: Existing directory that receives the images.
    """
    # Local import keeps this cell self-contained.
    from urllib.parse import quote_plus

    print(f"Looking for '{keyword}' images...")
    # quote_plus turns spaces into '+' and also escapes '&', '#', etc.,
    # which would otherwise corrupt the query string.
    driver.get(f"https://www.google.com/search?q={quote_plus(keyword)}&tbm=isch")

    # Scroll down
    for _ in range(5):
        time.sleep(5)
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

    # Find images
    time.sleep(5)
    images = driver.find_elements(By.XPATH, "//div[@id='islrg']//div[@role='listitem']")
    print(f"- {len(images)} images found")

    # Download images
    file_stem = keyword.replace(' ', '_')
    downloaded = 0
    for image in tqdm(images):
        image.click()
        image_details = driver.find_elements(By.XPATH, "//img[@jsname='kn3ccd']")
        # Only proceed when exactly one matching <img> is found.
        if len(image_details) != 1:
            continue
        image_url = image_details[0].get_attribute("src")
        success = download_image(
            image_url=image_url,
            save_path=f"{category_path}/{file_stem}_{downloaded}")
        if success:
            downloaded += 1

    # `downloaded` is already the number of saved files (no +1 needed).
    print(f"{downloaded} images downloaded.")

In [5]:
# Root directory that receives the downloaded images.
IMAGE_PATH = "../../data/google_image/"

# JSON is UTF-8 encoded; being explicit avoids mis-decoding non-ASCII
# keywords on platforms whose default encoding is not UTF-8.
with open("keywords/google_image_keywords.json", encoding="utf-8") as f:
    # Presumably maps each class name to its list of search keywords.
    keywords = json.load(f)

print(f"Number of classes: {len(keywords)}")
print(f"Number of search keywords: {sum(len(searches) for searches in keywords.values())}")

Number of classes: 30
Number of search keywords: 122


In [8]:
# Very long! Each keyword needs several minutes of browsing and downloading.
driver = starter(headless=True)

try:
    for category in keywords:

        print(f"\n¤ ¤ ¤ ¤ ¤ Class {category} ¤ ¤ ¤ ¤ ¤")

        # Creates category folder (no-op when it already exists)
        category_path = os.path.join(IMAGE_PATH, category)
        os.makedirs(category_path, exist_ok=True)

        # Download images
        for keyword in keywords[category]:
            keyword_search(driver, keyword, category_path)
finally:
    # A headless browser has no window to close by hand, so always shut it
    # down, even when the run is interrupted or fails midway.
    driver.quit()


¤ ¤ ¤ ¤ ¤ Class traveling ¤ ¤ ¤ ¤ ¤
Looking for 'travel luggage' images...
- 400 images found


100%|██████████| 400/400 [08:40<00:00,  1.30s/it]


312 images downloaded.
Looking for 'packing bags' images...
- 400 images found


100%|██████████| 400/400 [07:01<00:00,  1.05s/it]


345 images downloaded.

¤ ¤ ¤ ¤ ¤ Class working ¤ ¤ ¤ ¤ ¤
Looking for 'working' images...
- 400 images found


100%|██████████| 400/400 [10:00<00:00,  1.50s/it]


353 images downloaded.
Looking for 'computer' images...
- 400 images found


100%|██████████| 400/400 [09:40<00:00,  1.45s/it]


342 images downloaded.
Looking for 'work meeting' images...
- 400 images found


100%|██████████| 400/400 [08:37<00:00,  1.29s/it]


360 images downloaded.
Looking for 'work office' images...
- 400 images found


100%|██████████| 400/400 [08:48<00:00,  1.32s/it]

335 images downloaded.



