# How to (quickly) build a deep learning image dataset
https://www.pyimagesearch.com/2018/04/09/how-to-quickly-build-a-deep-learning-image-dataset/

In [1]:
from requests import exceptions
import requests
import cv2
import os

## Settings

In [2]:
query = 'paella'
output_path = 'output/'

In [3]:
# Set your Microsoft Cognitive Services API key along with (1) the
# maximum number of results for a given search and (2) the group size
# for results (maximum of 50 per request)
API_KEY = "XXX"
MAX_RESULTS = 100
GROUP_SIZE = 50

# Set the endpoint API URL
URL = "https://api.cognitive.microsoft.com/bing/v7.0/images/search"

# When attempting to download images from the web both the Python
# programming language and the requests library have a number of
# exceptions that can be thrown so let's build a list of them now
# so we can filter on them
EXCEPTIONS = set([IOError, FileNotFoundError,
    exceptions.RequestException, exceptions.HTTPError,
    exceptions.ConnectionError, exceptions.Timeout])

## Make the search

In [4]:
# Store the search term in a convenience variable then set the
# headers and search parameters
term = query
headers = {"Ocp-Apim-Subscription-Key" : API_KEY}
params = {"q": term, "offset": 0, "count": GROUP_SIZE}

# Make the search
print("Searching Bing API for '{}'".format(term))
search = requests.get(URL, headers=headers, params=params)
search.raise_for_status()

# Grab the results from the search, including the total number of
# estimated results returned by the Bing API
results = search.json()
estNumResults = min(results["totalEstimatedMatches"], MAX_RESULTS)
print("{} total results for '{}'".format(estNumResults, term))

Searching Bing API for 'paella'
100 total results for 'paella'


In [5]:
# Initialize the total number of images downloaded thus far
total = 0

# Loop over the estimated number of results in `GROUP_SIZE` groups
for offset in range(0, estNumResults, GROUP_SIZE):
    # Update the search parameters using the current offset, then
    # make the request to fetch the results
    print("Making request for group {}-{} of {}...".format(offset, offset + GROUP_SIZE, estNumResults))
    params["offset"] = offset
    search = requests.get(URL, headers=headers, params=params)
    search.raise_for_status()
    results = search.json()
    print("Saving images for group {}-{} of {}...".format(offset, offset + GROUP_SIZE, estNumResults))
    
    # Loop over the results
    for v in results["value"]:
        # Try to download the image
        try:
            # Make a request to download the image
            print("Fetching: {}".format(v["contentUrl"]))
            r = requests.get(v["contentUrl"], timeout=30)
            
            # Build the path to the output image
            ext = v["contentUrl"][v["contentUrl"].rfind("."):]
            p = os.path.sep.join([output_path, "{}{}".format(str(total).zfill(8), ext)])
            
            # Write the image to disk
            f = open(p, "wb")
            f.write(r.content)
            f.close()
            
        # Catch any errors that would not unable us to download the image
        except Exception as e:
            # Check to see if our exception is in our list of exceptions to check for
            if type(e) in EXCEPTIONS:
                print("Skipping: {}".format(v["contentUrl"]))
                continue
                
        # Try to load the image from disk
        image = cv2.imread(p)
        
        # If the image is `None` then we could not properly load the
        # image from disk (so it should be ignored)
        if image is None:
            print("Deleting: {}".format(p))
            os.remove(p)
            continue
        
        # Update the counter
        total += 1

Making request for group 0-50 of 100...
Saving images for group 0-50 of 100...
Fetching: https://upload.wikimedia.org/wikipedia/commons/thumb/5/57/Homemade_Paella_with_lots_of_seafood.jpg/1200px-Homemade_Paella_with_lots_of_seafood.jpg
Fetching: https://upload.wikimedia.org/wikipedia/commons/thumb/1/12/Paella_de_marisco_01.jpg/1200px-Paella_de_marisco_01.jpg
Fetching: http://i.kinja-img.com/gawker-media/image/upload/s--M4OE0GV9--/851002898017268549.jpg
Fetching: https://img1.cookinglight.timeinc.net/sites/default/files/styles/4_3_horizontal_-_1200x900/public/image/2017/03/main/shrimp-paella-1705p55.jpg?itok=KpoZxRRP
Fetching: https://www.simplyrecipes.com/wp-content/uploads/2018/07/Seafood-Paella-LEAD-VERTICAL.jpg
Fetching: https://o.aolcdn.com/images/dims3/GLOB/legacy_thumbnail/1200x630/format/jpg/quality/85/http%3A%2F%2Fo.aolcdn.com%2Fhss%2Fstorage%2Fmidas%2F6411f3782c0151d45f62312d57ad12ac%2F204207047%2F476864622.jpg
Fetching: https://gbc-cdn-public-media.azureedge.net/img64614.1426

Fetching: http://3.bp.blogspot.com/-otyIaXXN4BU/U1j1aaGTPLI/AAAAAAAAAVE/eMlMjinRXM8/s1600/paella4.jpg
Fetching: http://cdn.shopify.com/s/files/1/0654/1551/files/SeafoodPaella_1024x1024.jpg?7856200432557518239
Fetching: http://1.bp.blogspot.com/-Jga_g7ynRR4/TxKuVpBiiwI/AAAAAAAAA_0/0BSSXx9WRmA/s1600/Leftbanker-Mi+Paella.jpeg
Fetching: https://i.ytimg.com/vi/DQGSkkDpFB0/maxresdefault.jpg
Fetching: https://media.blueapron.com/recipes/2456/square_newsletter_images/1500659446-8-0012-2411/821_FPV_Veg-Paella_74289_WEB_SQ_hi_res.jpg
Fetching: http://www.bbcgoodfood.com/sites/default/files/styles/recipe/public/recipe/recipe-image/2018/06/paella.jpg?itok=7-8NZbQE
Deleting: output//00000075.jpg?itok=7-8NZbQE
Fetching: https://upload.wikimedia.org/wikipedia/commons/thumb/1/1e/Paella_de_mariscos.jpg/1200px-Paella_de_mariscos.jpg
Fetching: https://2.bp.blogspot.com/_E9gfBDmEIws/S-CNZb3yTKI/AAAAAAAAAY8/p9Bs8b6KA_s/s1600/IMG_0182.JPG
Fetching: https://hipfoodiemom.com/wp-content/uploads/2015/10/IMG_097