# Image collection from Bing

In [2]:
import hashlib
import json
import math
import os
import re
import urllib

import numpy as np
import requests
from tqdm import tqdm

In [3]:
BING_API_KEY = 'YOUR_API_KEY'

In [None]:
project_name = "cat"

path_save_dir = f'../data/{project_name}'
os.makedirs(path_save_dir, exist_ok=True)

path_image_dir = os.path.join(path_save_dir, 'images')
os.makedirs(path_image_dir, exist_ok=True)

In [5]:
def fetch_image_urls(
    query: str, num_images_required: int, query_language: str = 'ja-JP', num_images_per_transaction: int = 30
) -> list[str]:
    if num_images_per_transaction > 150:
        raise ValueError('num_images_per_transaction must be less than 150')
    offset_count = math.floor(num_images_required / num_images_per_transaction)

    endpoint = 'https://api.bing.microsoft.com/v7.0/images/search'
    headers = {'Ocp-Apim-Subscription-Key': BING_API_KEY,
            'Content-Type': 'multipart/form-data'}

    image_infos = []
    for offset in tqdm(np.arange(offset_count)):
        params = {
            'q': query, 'mkt': query_language, 'count': num_images_per_transaction,
            'offset': offset * num_images_per_transaction,
        }

        try:
            response = requests.get(endpoint, headers=headers, params=params)
            response.raise_for_status()
            image_infos.append(response.json())
        except Exception as e:
            print(e)
            continue

    urls = []
    for info in image_infos:
        for values in info['value']:
            unquoted_url = urllib.parse.unquote(values['contentUrl'])
            img_url = re.search(r'(http[^\?]+)(\?|.*)', unquoted_url)
            if img_url:
                urls.append(img_url.groups()[0])

    return urls

In [34]:
keyword = '猫'
num_images_required = 50

image_urls = fetch_image_urls(keyword, num_images_required)
print(f'# of images: {len(image_urls)}')

# of images: 30


In [30]:
valid_extensions = ['jpg', 'jpeg', 'gif', 'png', 'bmp']

image_url_paths = {}
for url in image_urls:
    image_extension = url.split('.')[-1]
    if image_extension.lower() not in valid_extensions:
        continue

    hashed_url = hashlib.sha3_256(url.encode('utf-8')).hexdigest()
    path_image = os.path.join(path_image_dir, hashed_url + '.' + image_extension.lower())
    if url not in image_url_paths.keys():
        image_url_paths[url] = path_image

In [27]:
for url, path_image in tqdm(image_url_paths.items()):
    try:
        response = requests.get(url, allow_redirects=True, timeout=10)
        if response.status_code != 200:
            continue

        with open(path_image, "wb") as f:
            f.write(response.content)

    except Exception as e:
        print(e)
        continue

 44%|████▍     | 12/27 [00:18<00:20,  1.33s/it]

HTTPConnectionPool(host='pic22.nipic.com', port=80): Max retries exceeded with url: /20120718/4499633_230543624000_2.jpg (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x108df4290>: Failed to resolve 'pic22.nipic.com' ([Errno 8] nodename nor servname provided, or not known)"))


 52%|█████▏    | 14/27 [00:20<00:12,  1.08it/s]

HTTPConnectionPool(host='pica.nipic.com', port=80): Max retries exceeded with url: /2008-05-29/2008529215619143_2.jpg (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x10e756090>: Failed to resolve 'pica.nipic.com' ([Errno 8] nodename nor servname provided, or not known)"))


 81%|████████▏ | 22/27 [00:42<00:07,  1.41s/it]

HTTPConnectionPool(host='pic14.nipic.com', port=80): Max retries exceeded with url: /20110506/3320946_063914334000_2.jpg (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x10e755790>: Failed to resolve 'pic14.nipic.com' ([Errno 8] nodename nor servname provided, or not known)"))


100%|██████████| 27/27 [00:52<00:00,  1.94s/it]


In [39]:
metadata = {}
metadata['keyword'] = keyword
metadata["image_url_paths"] = {url: path for url, path in image_url_paths.items() if os.path.exists(path)}

with open(os.path.join(path_save_dir, 'metadata.json'), mode='w') as f:
    json.dump(metadata, f, indent=4, separators=(',', ': '))