In [1]:
import os
from backoff import on_exception, expo
import json
import requests
from collections import Counter
from datetime import datetime
from ratelimit import limits, sleep_and_retry, RateLimitException
import time
import urllib.request
import asyncio
import aiohttp
import nest_asyncio
from async_timeout import timeout
from requests.exceptions import HTTPError

In [2]:
data_path = "D:\\METIS\\data\\"
image_path = "D:\\METIS\\images\\"

In [3]:
url = 'https://collectionapi.metmuseum.org/public/collection/v1/objects'
response = requests.get(url, allow_redirects=True)
all_objects = json.loads(response.text)
objectIDs = all_objects['objectIDs']

with open(data_path + 'all_objects.txt', 'w', encoding='utf8') as f:
    f.write(str(all_objects))

In [4]:
def background(f):
    def wrapped(*args, **kwargs):
        return asyncio.get_event_loop().run_in_executor(None, f, *args, **kwargs)

    return wrapped

@sleep_and_retry
@on_exception(expo, RateLimitException, max_tries=8)
@limits(calls=300, period=300)
def get_object_details(object_id):
    url ='https://collectionapi.metmuseum.org/public/collection/v1/objects/{}'.format(object_id)
    response = requests.get(url, timeout=10)
    response_json = json.loads(response.text)
    met_objects.append(response_json)
    return response_json['primaryImage']
            
def get_image(filename, obj_id):
    #obj_id = img_url.split('/')[-1]
    img_url = 'https://images.metmuseum.org/CRDImages/ep/original/{}.jpg'.format(obj_id)

    im = urllib.request.urlretrieve(url, filename, timeout=10)
    images_saved.append({
                        'objectID': obj_id,
                        'url': img_url,
                        'filename': filename
                        })
    

In [None]:
https://collectionapi.metmuseum.org/public/collection/v1/objects/1567

In [5]:
range(10)
objectIDs[0]

1

In [6]:
len(objectIDs)

474495

In [7]:
images_saved = []
met_objects = []

In [None]:
start = datetime.now()
for i in range(len(objectIDs)):
    obj_id = objectIDs[i]
    
    if i % 1000 == 0:
        print("{}: {} elapsed".format(i, datetime.now() - start))
        print("{} saved responses, {} saved images".format(len(met_objects), len(images_saved)))
        print()
        
        with open(data_path + "images_saved.json", "w") as write_file:
            json.dump(images_saved, write_file)   
            
        with open(data_path + "met_objects.json", "w") as write_file:
            json.dump(met_objects, write_file)   
        
    try:
        img_url = get_object_details(obj_id)
        get_image(image_path + "{}.jpg".format(obj_id, img_url), img_url)
        
    except:
        pass

0: 0:00:00.001005 elapsed
0 saved responses, 0 saved images

1000: 0:14:59.862793 elapsed
900 saved responses, 0 saved images

2000: 0:30:02.823348 elapsed
1800 saved responses, 0 saved images

3000: 0:45:02.816543 elapsed
2700 saved responses, 0 saved images

4000: 1:05:02.928380 elapsed
3900 saved responses, 0 saved images

5000: 1:20:02.884573 elapsed
4800 saved responses, 0 saved images

6000: 1:35:02.946756 elapsed
5700 saved responses, 0 saved images

7000: 1:55:02.930847 elapsed
6900 saved responses, 0 saved images

8000: 2:10:02.958282 elapsed
7800 saved responses, 0 saved images

9000: 2:25:03.126647 elapsed
8700 saved responses, 0 saved images

10000: 2:45:03.036016 elapsed
9900 saved responses, 0 saved images

11000: 3:00:03.050863 elapsed
10800 saved responses, 0 saved images

12000: 3:15:03.097136 elapsed
11700 saved responses, 0 saved images

13000: 3:35:03.084906 elapsed
12900 saved responses, 0 saved images

14000: 3:50:03.108324 elapsed
13800 saved responses, 0 saved i

In [None]:
total_time = datetime.now() - start
print("Total time to scrape: {}".format(total_time))