In [1]:
import os
from backoff import on_exception, expo
import json
import requests
from collections import Counter
from datetime import datetime
from ratelimit import limits, sleep_and_retry, RateLimitException
import time
import urllib.request
import asyncio
import aiohttp
import nest_asyncio
from async_timeout import timeout
from requests.exceptions import HTTPError

In [2]:
# Output locations for the crawl: JSON checkpoints go to data_path, downloaded
# images go to image_path. Both keep their trailing separator because later
# cells build file names by plain string concatenation.
data_path = "D:\\METIS\\data\\"
image_path = "D:\\METIS\\images\\"

# Create the directories up front. Without this a missing folder makes every
# download fail (and the crawl loop would hide that), and the first checkpoint
# write would raise FileNotFoundError.
os.makedirs(data_path, exist_ok=True)
os.makedirs(image_path, exist_ok=True)

# Accumulators filled in by get_image() and get_object_details() respectively.
images_saved = []
met_objects = []

In [3]:
# Fetch the complete list of object IDs published by the collection API.
url = 'https://collectionapi.metmuseum.org/public/collection/v1/objects'
# requests has no default timeout, so an unresponsive server would otherwise
# hang this cell forever. The listing is large, hence the generous value.
response = requests.get(url, allow_redirects=True, timeout=60)
# Fail loudly on an HTTP error instead of hitting a confusing KeyError below.
response.raise_for_status()
all_objects = response.json()
objectIDs = all_objects['objectIDs']

In [4]:
# Sanity check: how many object IDs the listing endpoint returned.
len(objectIDs)

474632

In [5]:
@sleep_and_retry
@on_exception(expo, RateLimitException, max_tries=8)
@limits(calls=300, period=300)
def get_object_details(object_id, img_url=True):
    """Fetch one object's metadata record and append it to ``met_objects``.

    Calls are capped at 300 per 300 seconds by ``@limits``. When the cap is
    hit the call is retried with exponential backoff (up to 8 tries) and
    ``@sleep_and_retry`` then waits for the window to reset, so a rate-limit
    hit delays the call rather than failing it.

    Args:
        object_id: ID of the object to look up.
        img_url: When true (default), return the record's ``'primaryImage'``
            value; otherwise return ``None``.

    Returns:
        The ``'primaryImage'`` entry of the record, or ``None`` when
        ``img_url`` is false.

    Raises:
        requests.exceptions.HTTPError: The API answered with a 4xx/5xx status.
        requests.exceptions.RequestException: Network failure or timeout.
        KeyError: The record has no ``'primaryImage'`` field.
    """
    url = 'https://collectionapi.metmuseum.org/public/collection/v1/objects/{}'.format(object_id)
    response = requests.get(url, timeout=10)
    # Reject error responses *before* recording anything, so error payloads
    # never end up mixed into met_objects.
    response.raise_for_status()
    response_json = response.json()
    met_objects.append(response_json)

    if img_url:
        return response_json['primaryImage']
    return None

@sleep_and_retry
@on_exception(expo, RateLimitException, max_tries=8)
@limits(calls=300, period=300)
def get_image(object_id, url):
    """Download ``url`` to ``image_path`` and log the download in ``images_saved``.

    The file is stored as ``<image_path><object_id>.jpg``. Calls are capped at
    300 per 300 seconds; ``@sleep_and_retry`` (matching get_object_details)
    makes a rate-limit hit wait for the next window instead of raising once
    the backoff retries are exhausted, which previously dropped the image.

    Args:
        object_id: ID used to name the saved file.
        url: Image URL to download. A falsy value (empty string / ``None``)
            means there is nothing to download and the call is a no-op.

    Returns:
        None.

    Raises:
        urllib.error.URLError / OSError: The download or the file write failed.
    """
    if not url:
        # NOTE(review): presumably the API uses an empty 'primaryImage' for
        # objects without a downloadable image - confirm against the API docs.
        return None

    filename = image_path + "{}.jpg".format(object_id)
    urllib.request.urlretrieve(url, filename)
    images_saved.append({
                        'objectID': object_id,
                        'url': url,
                        'filename': filename
                        })

In [6]:
# Smoke test on a single object: look up its record, then download its image.
img_url = get_object_details(object_id=1567)
get_image(object_id=1567, url=img_url)

In [7]:
# Show what the smoke test recorded (object ID, source URL, local file name).
images_saved

[{'objectID': 1567,
  'url': 'https://images.metmuseum.org/CRDImages/ad/original/79684.jpg',
  'filename': 'D:\\METIS\\images\\1567.jpg'}]

In [8]:
# Full crawl: fetch the metadata record and the primary image for every object
# ID, checkpointing both accumulators to disk every 1000 objects.
images_saved = []
met_objects = []
failed_count = 0  # objects skipped because fetching or downloading raised

start = datetime.now()

for i, object_id in enumerate(objectIDs):
    if i % 1000 == 0:
        # Periodic checkpoint so an interrupted run keeps what it has so far.
        with open(data_path + "images_saved.json", "w") as write_file:
            json.dump(images_saved, write_file)

        with open(data_path + "met_objects.json", "w") as write_file:
            json.dump(met_objects, write_file)

        print("{}: {} elapsed".format(i, datetime.now() - start))

    try:
        img_url = get_object_details(object_id)
        get_image(object_id, img_url)
    except Exception:
        # Best-effort crawl: one bad object must not abort the run, so the
        # error is counted and skipped. Catching Exception rather than using a
        # bare except keeps KeyboardInterrupt working, so the cell can still
        # be stopped.
        failed_count += 1

# Final checkpoint: the in-loop one only fires on multiples of 1000, so
# without this the tail of the crawl would never reach disk.
with open(data_path + "images_saved.json", "w") as write_file:
    json.dump(images_saved, write_file)

with open(data_path + "met_objects.json", "w") as write_file:
    json.dump(met_objects, write_file)

print("done: {} images saved, {} objects failed, {} elapsed".format(
    len(images_saved), failed_count, datetime.now() - start))

In [9]:
# Crawl one slice of the object list. The bounds are named once so the slice
# and the progress counter cannot drift apart.
batch_start, batch_end = 360000, 380000
failed_count = 0  # objects skipped because fetching or downloading raised

start = datetime.now()

# NOTE(review): images_saved / met_objects are not reset in this cell, so the
# checkpoints also contain whatever earlier cells left in them - confirm that
# this accumulation is intended.
for i, object_id in enumerate(objectIDs[batch_start:batch_end]):
    if i % 1000 == 0:
        # Periodic checkpoint so an interrupted run keeps what it has so far.
        with open(data_path + "images_saved_4.json", "w") as write_file:
            json.dump(images_saved, write_file)

        with open(data_path + "met_objects_4.json", "w") as write_file:
            json.dump(met_objects, write_file)

        print("{}: {} elapsed".format(i + batch_start, datetime.now() - start))

    try:
        img_url = get_object_details(object_id)
        get_image(object_id, img_url)
    except Exception:
        # Best-effort crawl: count and skip the failing object. Catching
        # Exception rather than using a bare except keeps KeyboardInterrupt
        # working, so the cell can still be stopped.
        failed_count += 1

# Final checkpoint: the in-loop one only fires on multiples of 1000, so
# without this the last objects of the batch would never reach these files.
with open(data_path + "images_saved_4.json", "w") as write_file:
    json.dump(images_saved, write_file)

with open(data_path + "met_objects_4.json", "w") as write_file:
    json.dump(met_objects, write_file)

print("done: {} images saved, {} objects failed, {} elapsed".format(
    len(images_saved), failed_count, datetime.now() - start))

360000: 0:00:00.012017 elapsed
361000: 0:15:42.241953 elapsed
362000: 0:32:49.916585 elapsed
363000: 0:50:38.806487 elapsed
364000: 1:06:24.231644 elapsed
365000: 1:24:20.361207 elapsed
366000: 1:42:00.416819 elapsed
367000: 2:00:03.400775 elapsed
368000: 2:18:28.241076 elapsed
369000: 2:34:16.020850 elapsed
370000: 2:53:04.037591 elapsed
371000: 3:09:42.198654 elapsed
372000: 3:26:56.418064 elapsed
373000: 3:44:55.918755 elapsed
374000: 4:00:13.614045 elapsed
375000: 4:18:36.851527 elapsed
376000: 4:36:07.220203 elapsed
377000: 4:52:40.691663 elapsed
378000: 5:11:26.946943 elapsed
379000: 5:28:29.255199 elapsed


In [10]:
# Persist both accumulators for this batch as JSON files under data_path.
for json_name, payload in (
    ("images_saved_360.json", images_saved),
    ("met_objects_360.json", met_objects),
):
    with open(data_path + json_name, "w") as write_file:
        json.dump(payload, write_file)

In [11]:
# Number of image downloads recorded in images_saved.
len(images_saved)

6237