# Google Image Scraper

*This notebook is to scrap images from Google Image using a set of search keywords.*

In [1]:
import os
import json
import requests
from io import BytesIO
from tqdm import tqdm
import time
from PIL import Image

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
def starter(headless=False):
    "Initialize the web-driver"

    # Initialize driver
    service = Service(ChromeDriverManager().install())
    options = Options()
    if headless:
        options.add_argument("--headless")
    driver = webdriver.Chrome(service=service, options=options)
    driver.implicitly_wait(5)

    # Go to Google
    driver.get("https://www.google.com")

    # Reject cookies
    driver.find_element(By.XPATH, "//button[.//div[contains(text(), 'Tout refuser')]]").click()

    # Disable SafeSearch
    driver.get("https://www.google.com/safesearch")
    driver.find_element(By.XPATH, "//div[contains(text(), 'Désactiver')]").click()
    
    return driver

In [3]:
def download_image(image_url, save_path):
    "Download an image from its URL"
    
    url = image_url.split("?")[0]
    extension = url.split(".")[-1]
    if extension in ["jpg", "jpeg", "png"]:
        try:
            response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=1)
            image = Image.open(BytesIO(response.content))
            resized_image = image.resize((500, 500))
            resized_image.save(f"{save_path}.{extension}")
            return True
        except:
            return False
    return False

In [4]:
def download_keyword_images(driver, keyword, category_path):
    "Download images based on a keyword search"

    print(f"Looking for '{keyword}' images...")
    url = f"https://www.google.com/search?q={keyword.replace(' ', '+')}&tbm=isch"
    driver.get(url)

    # Scroll down
    for i in range(5):
        time.sleep(5)
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

    # Find images
    time.sleep(5)
    images = driver.find_elements(By.XPATH, "//div[@id='islrg']//div[@role='listitem']")
    print(f"- {len(images)} images found")

    # Download images
    idx = 0
    for image in tqdm(images):
        image.click()
        image_details = driver.find_elements(By.XPATH, "//img[@jsname='kn3ccd']")
        if len(image_details) == 1:
            image_url = image_details[0].get_attribute("src")
            success = download_image(
                image_url=image_url, 
                save_path=f"{category_path}/{keyword.replace(' ', '_')}_{idx}")
            if success:
                idx += 1

    print(f"{idx + 1} images downloaded.")

In [5]:
IMAGE_PATH = "../../data/google_image/"

# Load keywords
with open("keywords/google_image_keywords.json") as f:
    keywords = json.load(f)

print(f"Number of classes: {len(keywords)}")
print(f"Number of search keywords: {len([search for category in keywords.values() for search in category])}")

Number of classes: 30
Number of search keywords: 122


In [6]:
# Very long!
driver = starter(headless=True)

for category in keywords:
    
    print(f"\n¤ ¤ ¤ ¤ ¤ Class {category} ¤ ¤ ¤ ¤ ¤")
    
    # Creates category folder
    category_path = IMAGE_PATH + category
    if not os.path.exists(category_path):
        os.makedirs(category_path)
    
    # Download images
    for keyword in keywords[category]:
        download_keyword_images(driver, keyword, category_path)


¤ ¤ ¤ ¤ ¤ Class reading ¤ ¤ ¤ ¤ ¤
Looking for 'book' images...
- 400 images found


100%|██████████| 400/400 [17:12<00:00,  2.58s/it]


311 images downloaded.
Looking for 'books' images...
- 400 images found


100%|██████████| 400/400 [14:52<00:00,  2.23s/it]


330 images downloaded.
Looking for 'book reading' images...
- 400 images found


100%|██████████| 400/400 [16:17<00:00,  2.44s/it]


308 images downloaded.
Looking for 'home library' images...
- 400 images found


100%|██████████| 400/400 [15:26<00:00,  2.32s/it]


333 images downloaded.
Looking for 'reading ebook' images...
- 400 images found


100%|██████████| 400/400 [15:38<00:00,  2.35s/it]


310 images downloaded.
Looking for 'magazines' images...
- 400 images found


100%|██████████| 400/400 [15:00<00:00,  2.25s/it]


335 images downloaded.

¤ ¤ ¤ ¤ ¤ Class running ¤ ¤ ¤ ¤ ¤
Looking for 'running' images...
- 400 images found


100%|██████████| 400/400 [15:05<00:00,  2.26s/it]


311 images downloaded.
Looking for 'running shoes' images...
- 400 images found


100%|██████████| 400/400 [11:06<00:00,  1.67s/it]


326 images downloaded.
Looking for 'running gear' images...
- 400 images found


100%|██████████| 400/400 [15:05<00:00,  2.26s/it]


298 images downloaded.
Looking for 'running equipment' images...
- 400 images found


100%|██████████| 400/400 [15:56<00:00,  2.39s/it]


313 images downloaded.

¤ ¤ ¤ ¤ ¤ Class sport ¤ ¤ ¤ ¤ ¤
Looking for 'amateur soccer' images...
- 400 images found


100%|██████████| 400/400 [16:51<00:00,  2.53s/it]


271 images downloaded.
Looking for 'badminton' images...
- 400 images found


100%|██████████| 400/400 [16:52<00:00,  2.53s/it]


302 images downloaded.
Looking for 'padel tennis' images...
- 400 images found


100%|██████████| 400/400 [15:21<00:00,  2.30s/it]


337 images downloaded.
Looking for 'tennis' images...
- 400 images found


100%|██████████| 400/400 [15:02<00:00,  2.26s/it]


296 images downloaded.

¤ ¤ ¤ ¤ ¤ Class train ¤ ¤ ¤ ¤ ¤
Looking for 'in a train' images...
- 400 images found


100%|██████████| 400/400 [17:21<00:00,  2.60s/it]


288 images downloaded.
Looking for 'train view window' images...
- 400 images found


100%|██████████| 400/400 [16:05<00:00,  2.41s/it]


281 images downloaded.
Looking for 'train seat window' images...
- 400 images found


100%|██████████| 400/400 [17:59<00:00,  2.70s/it] 


245 images downloaded.
Looking for 'train station' images...
- 400 images found


100%|██████████| 400/400 [18:53<00:00,  2.83s/it]


294 images downloaded.

¤ ¤ ¤ ¤ ¤ Class traveling ¤ ¤ ¤ ¤ ¤
Looking for 'travel luggage' images...
- 400 images found


100%|██████████| 400/400 [13:57<00:00,  2.09s/it]


299 images downloaded.
Looking for 'packing bags' images...
- 400 images found


100%|██████████| 400/400 [11:59<00:00,  1.80s/it]


315 images downloaded.

¤ ¤ ¤ ¤ ¤ Class working ¤ ¤ ¤ ¤ ¤
Looking for 'working' images...
- 400 images found


100%|██████████| 400/400 [14:48<00:00,  2.22s/it]


327 images downloaded.
Looking for 'computer' images...
- 400 images found


100%|██████████| 400/400 [21:07<00:00,  3.17s/it]


252 images downloaded.
Looking for 'work meeting' images...
- 400 images found


100%|██████████| 400/400 [17:15<00:00,  2.59s/it]


315 images downloaded.
Looking for 'work office' images...
- 400 images found


100%|██████████| 400/400 [17:45<00:00,  2.66s/it]

312 images downloaded.



