In [1]:
import os
from backoff import on_exception, expo
import json
import requests
from collections import Counter
from datetime import datetime
from ratelimit import limits, sleep_and_retry, RateLimitException
import time
import urllib.request

In [2]:
# Local working directories for the scrape. Both values end with a path
# separator because other cells build file names by plain string concatenation.
_METIS_ROOT = "D:\\METIS\\"
data_path = _METIS_ROOT + "data\\"
image_path = _METIS_ROOT + "images\\"

In [3]:
# Read the list of object records (each carries 'object_id', 'img_url' and
# 'api_url' keys, as the preview below shows) from urls.json.
urls_file = data_path + 'urls.json'
with open(urls_file, mode='r', encoding='utf-8') as handle:
    data = json.loads(handle.read())

# Preview the first record.
data[0]

{'object_id': '435868',
 'img_url': 'https://storage.cloud.google.com/gcs-public-data--met/435868/0.jpg',
 'api_url': 'https://collectionapi.metmuseum.org/public/collection/v1/objects/435868'}

In [4]:
@sleep_and_retry
@on_exception(expo, RateLimitException, max_tries=8)
@limits(calls=80, period=1)
def get_met_response(link, return_page=True, timeout=30):
    """Fetch ``link`` with an HTTP GET, limited to 80 calls per 1-second period.

    The decorator stack handles client-side throttling: ``limits`` caps the
    call rate, ``on_exception`` retries on ``RateLimitException`` with
    exponential backoff (at most 8 tries), and ``sleep_and_retry`` wraps both.

    Parameters
    ----------
    link : str
        URL to request.
    return_page : bool, default True
        When true, return the response body as text. When false, return the
        ``requests.Response`` object itself so the caller can inspect status,
        headers or binary content.
    timeout : float or tuple, default 30
        Passed to ``requests.get``. Without a timeout a stalled connection
        would block the caller indefinitely.

    Returns
    -------
    str or requests.Response
        Body text if ``return_page`` is true, otherwise the response object.

    Raises
    ------
    requests.exceptions.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.exceptions.RequestException
        On connection failures or timeouts (not retried by the decorators,
        which only react to ``RateLimitException``).
    """
    response = requests.get(link, timeout=timeout)

    # Fail here on HTTP errors rather than handing an error body to the caller,
    # who would otherwise try to parse it as a normal object record.
    response.raise_for_status()

    if return_page:
        return response.text
    return response

In [8]:
# Fetch the API record of the first object and display it as a parsed dict.
first_api_url = data[0]['api_url']
page = get_met_response(first_api_url, return_page=True)
json.loads(page)

{'objectID': 435868,
 'isHighlight': True,
 'accessionNumber': '61.101.1',
 'accessionYear': '1961',
 'isPublicDomain': True,
 'primaryImage': 'https://images.metmuseum.org/CRDImages/ep/original/DP231550.jpg',
 'primaryImageSmall': 'https://images.metmuseum.org/CRDImages/ep/web-large/DP231550.jpg',
 'additionalImages': ["https://images.metmuseum.org/CRDImages/ep/original/CP d'orsay.jpg"],
 'constituents': [{'constituentID': 161761,
   'role': 'Artist',
   'name': 'Paul Cézanne',
   'constituentULAN_URL': 'http://vocab.getty.edu/page/ulan/500004793',
   'constituentWikidata_URL': 'https://www.wikidata.org/wiki/Q35548',
   'gender': ''}],
 'department': 'European Paintings',
 'objectName': 'Painting',
 'title': 'The Card Players',
 'culture': '',
 'period': '',
 'dynasty': '',
 'reign': '',
 'portfolio': '',
 'artistRole': 'Artist',
 'artistPrefix': '',
 'artistDisplayName': 'Paul Cézanne',
 'artistDisplayBio': 'French, Aix-en-Provence 1839–1906 Aix-en-Provence',
 'artistSuffix': '',
 'a

In [5]:
def get_img(link, image_path):
    """Download the image at ``link`` and save it under ``image_path``.

    The file is named after the last ``/``-separated component of ``link``.
    ``.jpg`` is appended only when that component has no extension of its own,
    so ``.../DP231550.jpg`` is stored as ``DP231550.jpg`` rather than
    ``DP231550.jpg.jpg``.

    Parameters
    ----------
    link : str
        URL of the image to download.
    image_path : str
        Destination directory prefix. It is concatenated directly with the
        file name, so it must already end with a path separator.

    Returns
    -------
    str
        Path of the file that was written.

    Raises
    ------
    ValueError
        If ``link`` is empty (e.g. an object record without a primary image).
    urllib.error.URLError
        If the download fails.
    """
    from urllib.parse import quote

    if not link:
        raise ValueError("get_img() needs a non-empty image URL")

    image_name = link.split('/')[-1]
    if not os.path.splitext(image_name)[1]:
        image_name += ".jpg"
    filename = image_path + image_name

    # Image URLs may contain spaces or other characters that urllib refuses;
    # percent-encode them while leaving existing escapes and URL delimiters alone.
    safe_link = quote(link, safe="%/:=&?~#+!$,;'@()*[]")

    urllib.request.urlretrieve(safe_link, filename)
    return filename

In [6]:
# Collect the Met API endpoint of every object, in the order they appear in `data`.
links = [record['api_url'] for record in data]

In [7]:
# Try the full fetch -> parse -> download path on one object (the second link)
# before running it over the whole list.
response = get_met_response(links[1], return_page=True)
metadata = json.loads(response)
img_url = metadata['primaryImage']
get_img(img_url, image_path)

In [8]:
# Accumulators filled by the scraping loop: object ids that could not be
# processed, and the raw API response text of those that could.
failed, successful = [], []

In [9]:
# Number of objects processed between writes of the result files. Rewriting the
# full, ever-growing list after every single object makes total I/O grow
# quadratically with the number of objects scraped.
CHECKPOINT_EVERY = 100


def _save_if_grown(records, filename, saved_sizes):
    """Rewrite ``data_path + filename`` if ``records`` grew since the last save."""
    if len(records) != saved_sizes.get(filename):
        with open(data_path + filename, "w") as write_file:
            json.dump(records, write_file)
        saved_sizes[filename] = len(records)


def _checkpoint(saved_sizes):
    """Persist the ``successful`` and ``failed`` lists to their JSON files."""
    _save_if_grown(successful, "met_responses.json", saved_sizes)
    _save_if_grown(failed, "failed_responses.json", saved_sizes)


# A file is only (re)written once its list has gained entries during this run.
saved_sizes = {
    "met_responses.json": len(successful),
    "failed_responses.json": len(failed),
}

start = datetime.now()
try:
    # just based on previous scrape
    # NOTE(review): the comment above is from the original cell; it presumably
    # explains the 1000-object interval and 5 s pause below -- confirm.
    for count, link in enumerate(links):

        # The object id is the last path component of the API URL.
        obj_id = link.split('/')[-1]

        if count % 1000 == 0:
            elapsed = datetime.now() - start
            print("{}: {} | elapsed: {}".format(count, time.ctime(), str(elapsed)))
            time.sleep(5)

        try:
            response = get_met_response(link, return_page=True)
            img_url = json.loads(response)['primaryImage']
            get_img(img_url, image_path)

            successful.append(response)
        except Exception:
            # Best-effort scrape: record the object and carry on. Catching
            # Exception (not a bare except) lets Ctrl-C / kernel interrupt
            # stop the loop instead of being logged as a failed object.
            failed.append(obj_id)

        if (count + 1) % CHECKPOINT_EVERY == 0:
            _checkpoint(saved_sizes)
finally:
    # Always flush what has been collected, including when the run is
    # interrupted or an unexpected error escapes the loop.
    _checkpoint(saved_sizes)

0: Tue Nov 24 11:17:41 2020 | elapsed: 0:00:00.001003
1000: Tue Nov 24 11:33:44 2020 | elapsed: 0:16:02.685069
2000: Tue Nov 24 11:59:55 2020 | elapsed: 0:42:14.363116
3000: Tue Nov 24 12:35:13 2020 | elapsed: 1:17:31.715816
4000: Tue Nov 24 13:24:51 2020 | elapsed: 2:07:10.552258
5000: Tue Nov 24 14:29:16 2020 | elapsed: 3:11:35.319156
6000: Tue Nov 24 16:04:50 2020 | elapsed: 4:47:09.192939
7000: Tue Nov 24 17:50:17 2020 | elapsed: 6:32:35.837919
8000: Tue Nov 24 19:55:10 2020 | elapsed: 8:37:28.871622
9000: Tue Nov 24 22:27:29 2020 | elapsed: 11:09:48.485093
10000: Wed Nov 25 02:33:56 2020 | elapsed: 15:16:14.918637
11000: Wed Nov 25 06:58:40 2020 | elapsed: 19:40:59.072542
12000: Wed Nov 25 11:57:12 2020 | elapsed: 1 day, 0:39:30.928924
13000: Wed Nov 25 16:39:36 2020 | elapsed: 1 day, 5:21:55.035404
14000: Wed Nov 25 22:06:55 2020 | elapsed: 1 day, 10:49:14.040969
15000: Thu Nov 26 03:40:07 2020 | elapsed: 1 day, 16:22:26.072959
16000: Thu Nov 26 09:30:26 2020 | elapsed: 1 day, 22