In [11]:
# Photo Source URL description: https://www.flickr.com/services/api/misc.urls.html

# Find the id of a group by group name: https://www.flickr.com/services/api/explore/flickr.urls.lookupGroup
# Get photos from a group: https://www.flickr.com/services/api/explore/flickr.groups.pools.getPhotos

# flickr photo search api: https://www.flickr.com/services/api/explore/flickr.photos.search

# Flickr photo licenses explained: https://www.flickr.com/creativecommons/
# Flick photo licenses: https://www.flickr.com/services/api/explore/flickr.photos.licenses.getInfo

# sample download scripts: https://gist.github.com/zmwangx/b1c16b197b5416143c7a
#                          https://www.programcreek.com/python/example/6468/flickrapi.FlickrAPI
#% reset

In [12]:
import os
import sys

import math
import time
import pickle

from urllib.request import urlretrieve
from multiprocessing import Pool
import flickrapi

In [20]:
api_key = #''
secret = #''

SyntaxError: invalid syntax (<ipython-input-20-71f9eb60a1ba>, line 1)

In [14]:
def get_flickr_search_base_config(pagenum):
    """
    Generate base param configuration for Flickr API query.
    
    Params:
        pagenum (int): Index (starting from 1) of the result page to fetch.  
    
    Returns: 
        A Python dictionary containing the configuration
    """
    
    config = dict(per_page=100,
                  content_type=1,
                  license='1,2,3,4,5,6,9,10',
                  extras='url_o,url_c',
                  sort='relevance',
                 )

    # add pagenum to config if has valid value  
    if not pagenum is None:
        config['page'] = pagenum
        
    return config

In [15]:
def download_flickr_img(photos, download_dir='/disks/data/datasets/dloaded/', 
                        im_size = 'c', max_num = None, show_log = False, worker_id = None):
    """
    
    Downloads images from Flickr.
    
    Params: 
        photos (sequence): Sequence of 'photo' elements (python dictionary) from Flickr API image search result.
        download_dir (str): Location where the downloaded images will be stored. (current directory).
        im_size (str): One of 'm', 'c', 'o', 'b' etc. See https://www.flickr.com/services/api/misc.urls.html for details.
        max_num (int): Maximum number of images to download from the given sequence (None). 
        show_log (Bool): Enables logging (False).
        worker_id (int): For tracking the worker this invokation belongs to (None).
        
    Returns: Does not return any value    
    """
    
    if show_log and not worker_id is None:
        print(f"image downloader#{worker_id} received list of {len(photos)} images ")
        
    # create the download directory if does not exist
    if not os.path.exists(download_dir):
        os.makedirs(download_dir)
        
    # limit maximum number of images to be downloaded if was specified
    if max_num is None:
        pass
    else:
        photos = photos[:min(max_num, len(photos))]
    
    # build image URL from component information contained in the photo elements
    def build_im_url(photo, im_size):
        try:
            farm_id = photo['farm']
            server_id = photo['server']  
            im_id = photo['id'] 
            secret = photo['secret'] 
            im_sz = im_size
        except:
            return None
        
        # format: https://farm{farm-id}.staticflickr.com/{server-id}/{id}_{secret}_[mstzb].jpg
        return f'https://farm{farm_id}.staticflickr.com/{server_id}/{im_id}_{secret}_{im_sz}.jpg'  

    
    for photo in photos:
        #img_url = photo.get('url_c') or photo.get('url_o') or build_im_url(photo, im_size)
        img_url = build_im_url(photo, im_size)
    
        # move on if image is not available
        if img_url is None:
            continue
        
        # build image name from location directory and image id
        img_name = os.path.join(download_dir, img_url.split('/')[-1])
        
        # download and save image
        urlretrieve(img_url, img_name)
        
        if show_log:
            print('Downloaded '+img_name)
        

In [16]:
def download_flickr_img_url_by_tag(tags, pagenum = None, show_log = False):
    """
    Downloads information about images matching the tag(s) 
    
    Params:
        tags (str): Comma separated tag words to search with
        #tags_conjunct (Bool): Indicates if intended tag combination is 'AND' (True).
        #search_title (Bool): Indicates if should search within the image title (True).
        pagenum (int): Index (starting from 1) of the result page to fetch (None).                       
        show_log (Bool): Enables logging (False).
    
    Returns: 
        dictionary containing the query result
    """
    
    # search with 'AND'ed tags (conjunction of tags)
    tags_conjunct = True
    
    # do search within title
    search_title = True 
    
    
    # initialize flickr api
    flickr = flickrapi.FlickrAPI(api_key, secret,format='parsed-json',
                               store_token=False,
                               cache=True)
    
    # generate base configuration for the flickr query    
    config = get_flickr_search_base_config(pagenum)
    
    # add tag query specific params
    if search_title:
        config['text'] = " ".join(tags.split(","))
        
    config['tags'] = tags
    config['tag_mode'] = 'all' if tags_conjunct else 'any' 

    # fetch image infos
    result = flickr.photos.search(**config)
    
    if(show_log):
        print(f"Downloading page: {pagenum}")

    return result


def download_flickr_img_by_group(group_name, pagenum = None, show_log = False):

    """
    Downloads information about images from a given Flickr group
    
    Params: 
        group_name (str): Name of the flickr group.
        pagenum (int): Index (starting from 1) of the result page to fetch (None).                       
        show_log (Bool): Enables logging (False).
    
    Returns: 
        dictionary containing the query result 
    """

    # initialize flickr api
    flickr = flickrapi.FlickrAPI(api_key, secret,format='parsed-json', store_token=False, cache=True)
    
    # look up the id of the group by provided group name
    groupinfo = flickr.urls.lookupGroup(url='https://www.flickr.com/groups/' + group_name + '/')
    
    # extract the group id from the result
    group_id = groupinfo.get('group').get('id')
        
    # generate base configuration for the flickr query    
    config = get_flickr_search_base_config(pagenum)

    # add the group id to the config
    config['group_id'] = group_id
    
    # retrieve information of the photos from the group
    result = flickr.groups.pools.getPhotos(**config)

    if(show_log):   
        print(f"Downloading page: {pagenum}")
    
    return result

In [17]:
# https://stackoverflow.com/questions/11546858/python-multiprocessing-keyword-arguments
# https://stackoverflow.com/questions/34031681/passing-kwargs-with-multiprocessing-pool-map

def downloader_wrapper(arg):
    """
    Wrapper for the image downloader.
    
    Params:
        arg (tuple): arguments (iterable, kwargs) to be passed to the image downloader.
        
    Returns: forwards image downloader result.     
    """
    
    args, kwargs = arg
    
    return download_flickr_img(args, **kwargs)

In [18]:
def get_worker_args(num_processes, query_photos_dict, max_img_dload_cnt_per_worker, download_dir, im_size, max_num):
    """
        Creates a list of arguments for the image downloader processes
        
        Params:
            num_processes (int): The intended number of image downloader processes
            query_photos_dict (dictionary): Containing Flickr ids of the images as keys and 'photo' elements as values 
                               that contain information such as farm-id, secret, etc.
            max_img_dload_cnt_per_worker (int): The maximum number of images to be assigned to a worker process.
            download_dir (str): The base directory for saving the downloaded images. Each process has its own subdirectory
                          under this dir in which it saves the images.
            im_size (str): One of the available image sizes (e.g. 'c', 'm', 'b', 'o') in Flickr.
            
        Returns:
            A list, each element of which corresponds to the argument to be passed to an image downloader process. 
    """
    
    arg = []
    
    # Retrieve the photo elements as a list
    query_photos = list(query_photos_dict.values())
    
    for i in range(num_processes):
        # create the keyword arguments for the processes
        kwargs = {   
                     'download_dir' : os.path.join(download_dir,str(i+1)),
                     'im_size' : im_size,
                     'max_num' : max_num,
                     'worker_id': i+1,
                 }
        
        # compute the section end points' of the photo element list to be used for this worker process    
        photo_start_idx = max_img_dload_cnt_per_worker*i
        photo_end_idx = max_img_dload_cnt_per_worker*(i+1)
        
        # The regular argument and the keyword argument will be sent as argument to a process
        arg.append((query_photos[photo_start_idx:photo_end_idx], kwargs))        
        
    return arg

In [19]:

# Recommended way for creating a multi-process program in python
if __name__ == '__main__':

    
    # number of images to download
    num_images_to_download = 500 

    # Specify whether to download from a group or by tags
    config = 'tags' # possible values: 'group' or 'tags'

    # base download directory
    download_dir = f'{gdrive_root}/dataset/test_dataset'


    # name of the group (if downloading from a group, ignored otherwise)
    group_name = 'portrait-gallery'

    # tags to be used to searching (if downloading using search tags, ignored otherwise)
    tags = 'tigers'

    
    # maximum number of images to assign to an image downloader process
    MAX_LOAD_PER_IMG_DLOADER = 200 
    
    # maximum number of images information to be contained in a query result page.
    # NOTE: Flickr sends the same set of image information if used a number higher than 100.
    MAX_IMG_PER_PAGE = 100    
    
    # hack since api downloads 100 less than the specified number of images
    num_images_to_download +=100 
    
    # finds the number of pages needed to be downloaded
    num_pages = int(math.ceil(num_images_to_download/MAX_IMG_PER_PAGE))

    print(f'Number of pages needed for {num_images_to_download} images: {num_pages}')
    
    # settings for downloading images from a group or by search keywords
    configs = { 'group': {
                            'search_criteria' : group_name,
                            'download_dir' : download_dir,    
                            'image_info_save_file' : 'query_results_bygroup__' + group_name,
                            'url_downloader' : download_flickr_img_by_group, # image info downloader function for groups
                        },
               'tags': {
                            'search_criteria' : tags,
                            'download_dir' : download_dir,    
                            'image_info_save_file' : 'query_results_bytags__' + ".".join(tags.split(",")),
                            'url_downloader' : download_flickr_img_url_by_tag, # image info downloader function for tags
                        }
              }
    
    # download image information if no saved version exists on disk
    if not os.path.exists(configs[config]['image_info_save_file']):
        print('image_info_save_file does not exist')
        
        # execute the query for finding out the number of pages available 
        result = configs[config]['url_downloader'](configs[config]['search_criteria'])

        # Number of pages in the result    
        pagecount = result['photos']['pages']

        print(f"Total available images: {result['photos']['total']}")
        print(f'Total available pages: {pagecount}')

        results = []

        # download image info (can't be done parallely since can't instantiate more than one Flickr api instances with 
        # the same api key)
        for i in range( min(num_pages, pagecount) ):
            
            # download image info as a dictionary and append to a list  
            results.append(configs[config]['url_downloader'](configs[config]['search_criteria'], i+1, show_log = True))

            # enforced delay not to overwhelm the Flickr API  
            time.sleep(0.5)

        # construct a dictionary of returned photos to eliminate ducplicate images (based on image id)      
        query_photos = { photo['id']:photo for result in results for photo in result['photos']['photo']}

        # compute the number of image information obtained       
        actual_total_image_count = len(query_photos.items()) # Flickr returns 100 less images for some reason

        print(f'Obtained total {actual_total_image_count} image information')

        # save the search result dictionary to disk to save time should we need to re-download      
        pickle.dump(query_photos, open(configs[config]['image_info_save_file'],'wb'))

    else:                  
        print('image_info_save_file exists')
              
        # load saved search result containing photo elements
        query_photos = pickle.load(open(configs[config]['image_info_save_file'],'rb'))      
    
        print(f"Loaded image info file from disk for {config} {configs[config]['search_criteria']}")
              
        # compute the number of image information obtained       
        actual_total_image_count = len(query_photos.items()) # Flickr returns 100 less images for some reason
        
        if( num_images_to_download < actual_total_image_count):
            im_ids = list(query_photos.keys())
            im_ids = im_ids[:num_images_to_download]
            photos = list(query_photos.values())
            photos = photos[:num_images_to_download]
              
            query_photos = dict(zip(im_ids, photos))
              
            actual_total_image_count = num_images_to_download  
    
    print(f'Attempting to download total {actual_total_image_count} images')          
              
    # update num_process to reflect actual image count
    num_processes = int(math.ceil(actual_total_image_count/MAX_LOAD_PER_IMG_DLOADER))
              
    print(f'Number of image downloaders: {num_processes}')
              
    # arrange the image downloader arguments for multi-process environment          
    args = get_worker_args(num_processes, query_photos, MAX_LOAD_PER_IMG_DLOADER, configs[config]['download_dir'], 
                            'c', None)
    
    # for suppressing process output
    def mute():
        sys.stdout = open(os.devnull, 'w')   
              
    start = time.time()
              
    with Pool(num_processes) as p:
        p.map(downloader_wrapper, args);
    
    end = time.time()
    
    print(f'Elapsed time {end-start} seconds')
              

Number of pages needed for 600 images: 6
image_info_save_file does not exist
Total available images: 48431
Total available pages: 485
Downloading page: 1
Downloading page: 2
Downloading page: 3
Downloading page: 4
Downloading page: 5
Downloading page: 6
Obtained total 600 image information
Attempting to download total 600 images
Number of image downloaders: 3
Elapsed time 121.71158790588379 seconds
