# 6.3 IMDB images

3. Suppose we want to build a dataset of gender-labeled images for a computer vision task.
4. Your tasks are the following:
   * Collect 10k male/female images from: https://www.imdb.com
   * Make sure to render the whole page using Selenium, and then use BeautifulSoup to scrape the images
   * Create a separate folder for each gender (male/female)
   * Each image will be named after the person in the picture


# Class with scraping functions

In [26]:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service
from selenium import webdriver
import time
import os
import requests
from bs4 import BeautifulSoup
import json

class IMDBScraper():
    """Scrape portrait images from IMDB name-search result pages.

    Pages are rendered with a Selenium-driven Firefox instance, parsed with
    BeautifulSoup, and every image found is stored under
    ``./imdb_images/<gender>/``. ``image_dict`` maps each person's name to a
    ``(gender, filename, img_url)`` tuple.

    NOTE: both ``image_dict`` and the file names are keyed on the person's
    name, so two people sharing a name end up in the same entry / file.
    """

    # Locations of the geckodriver executable and the Firefox binary
    GECKODRIVER_PATH = r"C:\Webdriver\geckodriver.exe"
    FIREFOX_BINARY_PATH = r"C:\Program Files\Mozilla Firefox\firefox.exe"
    # Number of results requested per search page
    BATCH_SIZE = 250
    # Seconds to wait after clicking 'next' so the new page can load
    PAGE_LOAD_WAIT_SECONDS = 10
    # Seconds before an image request is abandoned
    DOWNLOAD_TIMEOUT_SECONDS = 30

    # constructor
    def __init__(self):
        """Create an empty image dictionary and start the webdriver."""
        self.image_dict = {}
        self.driver = self.create_webdriver()

    def save_dictionary_to_JSON(self):
        """Write ``image_dict`` to ``imdb_images.json`` in the working directory."""
        print("Saving dictionary as JSON")
        with open('imdb_images.json', 'w') as json_file:
            json.dump(self.image_dict, json_file)

    def create_webdriver(self):
        """Create and return a Firefox webdriver instance."""
        options = Options()
        options.binary_location = self.FIREFOX_BINARY_PATH
        service = Service(executable_path=self.GECKODRIVER_PATH)
        return webdriver.Firefox(service=service, options=options)

    # download and save an image
    def download_image(self, url, gender, filename, timeout=None):
        """Download ``url`` to ``./imdb_images/<gender>/<filename>``.

        Does nothing when the target file already exists.

        Args:
            url: Address of the image.
            gender: Name of the sub-folder the image is stored in.
            filename: File name of the image inside that folder.
            timeout: Seconds passed to ``requests.get``; defaults to
                ``DOWNLOAD_TIMEOUT_SECONDS``.

        Raises:
            requests.RequestException: If the request fails or the server
                answers with an error status.
        """
        if timeout is None:
            timeout = self.DOWNLOAD_TIMEOUT_SECONDS
        # create directory path for the gender
        dir_path = os.path.join("./imdb_images", gender)
        file_path = os.path.join(dir_path, filename)
        os.makedirs(dir_path, exist_ok=True)
        # do nothing if file was already downloaded earlier
        if os.path.exists(file_path):
            return
        # Stream into a temporary file first: an aborted download must not
        # leave a truncated image that a later run would mistake for a
        # finished one.
        partial_path = file_path + ".part"
        try:
            with requests.get(url, stream=True, timeout=timeout) as response:
                response.raise_for_status()
                with open(partial_path, "wb") as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        f.write(chunk)
            os.replace(partial_path, file_path)
        finally:
            # Only still present when something went wrong above
            if os.path.exists(partial_path):
                os.remove(partial_path)

    @staticmethod
    def _build_filename(name):
        """Turn a person's name into a file name ending in ``.jpg``."""
        filename = name.strip().replace(' ', '_').replace('.', '_').replace('?', '') + ".jpg"
        return ''.join(c for c in filename if c.isalnum() or c.isspace() or c in ('.', '_', '-'))

    def _parse_entries(self, page_source):
        """Return the ``(name, img_url)`` pairs found in a result page.

        List items that lack an image URL or a name are skipped.
        """
        soup = BeautifulSoup(page_source, "html.parser")
        entries = []
        for image_element in soup.select('.lister-item-image img'):
            img_url = image_element.get('src')
            # Walk from the image to the header link holding the name
            lister_item = image_element.find_parent('div', class_='lister-item')
            content = None if lister_item is None else lister_item.find('div', class_='lister-item-content')
            header = None if content is None else content.find('h3', class_='lister-item-header')
            link = None if header is None else header.find('a')
            if not img_url or link is None:
                continue
            name = link.text.strip()
            if name:
                entries.append((name, img_url))
        return entries

    def _go_to_next_page(self):
        """Click the 'next' link and wait; return False if there is none."""
        # Click 'next' button (the &start=... parameter works only for the first 10k images)
        buttons = self.driver.find_elements(By.XPATH, '//a[@class="lister-page-next next-page"]')
        if not buttons:
            return False
        buttons[0].click()
        time.sleep(self.PAGE_LOAD_WAIT_SECONDS)
        return True

    def scrape_images(self, gender, n_images_to_download):
        """Collect images for one gender, page by page.

        Stops as soon as ``n_images_to_download`` images have been processed
        or when the result list has no further page. An image counts as
        processed whether it was downloaded now or was already on disk. The
        dictionary is written to JSON when the method ends, including when
        it ends because of an error or a keyboard interrupt.

        Args:
            gender: Value of the ``gender`` search parameter; also used as
                the name of the folder the images are stored in.
            n_images_to_download: Number of images to process.
        """
        n_downloaded = 0
        batch_nr = 1
        self.driver.get(
            f"https://www.imdb.com/search/name/?gender={gender}&count={self.BATCH_SIZE}&start=1&ref_=rlm"
        )
        try:
            while n_downloaded < n_images_to_download:
                start_time = time.time()
                n_in_batch = 0
                for name, img_url in self._parse_entries(self.driver.page_source):
                    filename = self._build_filename(name)
                    # download and save the image
                    self.download_image(img_url, gender, filename)
                    # update dictionary
                    self.image_dict[name] = (gender, filename, img_url)
                    n_downloaded += 1
                    n_in_batch += 1
                    # are we done?
                    if n_downloaded >= n_images_to_download:
                        break
                # Print some progress info
                n_seconds = time.time() - start_time
                print(f"Batch nr {batch_nr} ({n_in_batch} images) took {round(n_seconds,1)} seconds; current repo size = {len(self.image_dict)}")
                batch_nr += 1
                # Only load another page when more images are still needed
                if n_downloaded >= n_images_to_download:
                    break
                if not self._go_to_next_page():
                    break
        finally:
            # Save the dictionary as a JSON file, even after a failure
            self.save_dictionary_to_JSON()

    def print_image_dict(self):
        """Print name, gender and image file name of every collected entry."""
        for name, (gender, image_file_name, _img_url) in self.image_dict.items():
            print(f"Name: {name}\nGender: {gender}\nImage File Name: {image_file_name}\n")

    def close_driver(self):
        """Quit the webdriver."""
        self.driver.quit()


# Main script

In [27]:
# Example usage
scraper = IMDBScraper()
try:
    scraper.scrape_images('female', 10000)
    scraper.scrape_images('male', 10000)
finally:
    # Always shut the browser down, even after an error or a manual interrupt
    scraper.close_driver()

#scraper.print_image_dict()


Batch nr 1 (250 images) took 2.5 seconds; current repo size = 250
Batch nr 2 (250 images) took 0.3 seconds; current repo size = 500
Batch nr 3 (250 images) took 0.3 seconds; current repo size = 750
Batch nr 4 (250 images) took 0.4 seconds; current repo size = 999
Batch nr 5 (250 images) took 0.3 seconds; current repo size = 1249
Batch nr 6 (250 images) took 0.4 seconds; current repo size = 1499
Batch nr 7 (250 images) took 0.3 seconds; current repo size = 1749
Batch nr 8 (250 images) took 0.3 seconds; current repo size = 1999
Batch nr 9 (250 images) took 0.4 seconds; current repo size = 2249
Batch nr 10 (250 images) took 0.3 seconds; current repo size = 2499
Batch nr 11 (250 images) took 0.4 seconds; current repo size = 2749
Batch nr 12 (250 images) took 0.3 seconds; current repo size = 2999
Batch nr 13 (250 images) took 0.3 seconds; current repo size = 3249
Batch nr 14 (250 images) took 0.4 seconds; current repo size = 3499
Batch nr 15 (250 images) took 0.3 seconds; current repo size 

KeyboardInterrupt: 