In [2]:
import pandas as pd
import re

import requests
import json
from pprint import pprint
import time

In [46]:
# require object id for cat and dog, unable to avoid overlap with other words

cat_url = "https://collectionapi.metmuseum.org/public/collection/v1/search?q=cat"
cat_id = requests.get(cat_url).json()

dog_url = "https://collectionapi.metmuseum.org/public/collection/v1/search?q=dog"
dog_id = requests.get(dog_url).json()

In [47]:
# combine object id list
id_list = cat_id['objectIDs'] + dog_id['objectIDs']

cat_n = len(cat_id['objectIDs'])
dog_n = len(dog_id['objectIDs'])

print(f'cat objects {cat_n}')
print(f'dog objects {dog_n}')
print(f'total objects {len(id_list)}')

cat objects 45903
dog objects 5463
total objects 51366


In [23]:
# require each object's data, due to api shut down in middle, there are multiple file for api request
object_json = {}
id_n = len(id_list)

for i in range(i,id_n):
    temp_url = f'https://collectionapi.metmuseum.org/public/collection/v1/objects/{id_list[i]}'
    temp_response = requests.get(temp_url).json()
    object_json[f'id_{id_list[i]}'] = temp_response
    
    time.sleep(0.01) # less than 70 calls/second
    
    if (i+1) % 1000 == 0 :
        print( f'{i+1} / {id_n}' )
        
print("----------complete-----")

47000 / 51366
48000 / 51366
49000 / 51366
50000 / 51366
51000 / 51366
----------complete-----


In [25]:
# save json file. due to api shut down for several times, we have multiple files
with open("../data/original_data/original_api_output_4.json", "w") as outfile:
    json.dump(object_json, outfile)

In [25]:
# help file to assinge geolocation
map_df = pd.read_csv( "../data/original_data/map_point.csv" )

# function to clean data
def CleanData(object_json , start_json) :
    for o_id in object_json :
        try: # SKIP - "not valid object"

            temp_object = {}
            object_one = object_json[o_id]
            # object_one['testline'] = "cat's"

            # test if the object fit word 'cat|cats' or 'dog|dogs' , there are lots overlapping in search, ie. category
            test_cat = bool(re.search( '[^a-z](cat|cats)[^a-z]' , str(object_one) , re.IGNORECASE )) # team cat
            test_dog = bool(re.search( '[^a-z](dog|dogs)[^a-z]' , str(object_one) , re.IGNORECASE )) # team dog

            if any( [test_cat , test_dog] ) : # SKIP - objects with overlapping words

                # team
                if all( [test_cat , test_dog] ):
                    temp_object['team'] = 'both' # team both
                elif test_cat :
                    temp_object['team'] = 'cat'
                elif test_dog :
                    temp_object['team'] = 'dog'
                else :
                    temp_object['team'] = 'NA' # based on loop design, should not have team NA

                # basic object info
                temp_object['objectID'] = object_one['objectID']
                temp_object['displayGallery'] = object_one['GalleryNumber']
                temp_object['objectName'] = object_one['objectName']
                temp_object['objectURL'] = object_one['objectURL']
                temp_object['objectImage'] = object_one['primaryImageSmall']
                if len(temp_object['objectImage']) > 0 :
                    temp_object['isImage'] = True
                else:
                    temp_object['isImage'] = False
                temp_object['title'] = object_one['title']

                temp_object['artist'] = object_one['artistDisplayName']
                temp_object['department'] = object_one['department']
                temp_object['isHighlight'] = object_one['isHighlight']
                temp_object['classification'] = object_one['classification']
                temp_object['medium'] = object_one['medium']
                temp_object['culture'] = object_one['culture']
                temp_object['period'] = object_one['period']
                temp_object['region'] = object_one['region']

                # Object Year and dateing 
                temp_year = int(object_one['objectBeginDate'])
                temp_object['year'] = temp_year
                
                # YearDecade
                temp_object['yearDecade'] = f'{(temp_year // 10)*10}s'

                # YearCentury. BC, AD
                temp_century = (abs(temp_year -1) // 100) + 1
                try:
                    if temp_year < 0 :
                        temp_object['yearCentury'] = f'{ abs(temp_century) } BC'
                        temp_object['yearCenturyInt'] = -temp_century *100
                    elif temp_year >= 0 :
                        temp_object['yearCentury'] = f'{ temp_century } AD'
                        temp_object['yearCenturyInt'] = temp_century *100
                except:
                    temp_object['yearCentury'] = 'NA'
                
                # YearMultipleCentury
                try:
                    if temp_year < 0 :
                        temp_object['yearCenturyMultiple'] = "B.C."
                    elif temp_year < 500 :
                        temp_object['yearCenturyMultiple'] = "1st to 5th Century"
                    elif temp_year < 1000 :
                        temp_object['yearCenturyMultiple'] = "6th to 10th Century"
                    elif temp_year < 1500 :
                        temp_object['yearCenturyMultiple'] = "11th to 15th Century"
                    elif temp_year >= 1500 :
                        temp_object['yearCenturyMultiple'] = "16th to 21st Century"
                except:
                    temp_object['yearCenturyMultiple'] = 'NA'                        

                # Object Country, use object country if not blank, otherwise use artistNationality or culture
                try:
                    if len( object_one['country'] ) > 0:
                        temp_country = re.findall( '[\w\s\.]+' , object_one['country'] )[0].strip()
                    elif len( object_one['artistNationality'] ) > 0:
                        temp_country = re.findall( '[\w\s\.]+' , object_one['artistNationality'] )[0].strip()
                    else:
                        temp_country = re.findall( '[\w\s\.]+' , object_one['culture'] )[0].strip()
                except:
                    temp_country = 'NA'

                temp_object['country'] = temp_country
                
                # TBC - countryGeo, convert country to formal country with Geocode. Geocode file is prepared based on met_data's country list.
                try:
                    temp_object['geoCode'] = map_df.loc[ map_df['country'] == temp_country]["geoCode"].item()
                except:
                    temp_object['geoCode'] = "NA"
                
                # TBC - Continent
                try:
                    temp_object['continent'] = 'NA'
                except:
                    pass
                
                # Object Tags. combine multiple tags to one string
                try:
                    object_tags = object_one['tags']
                    tmep_tags = []
                    for one_tag in object_tags:
                        tmep_tags.append(one_tag['term'])
                    temp_object['tags'] = ', '.join(tmep_tags)
                except:
                    temp_object['tags'] = 'NA'

                # add object pass test, with selected content to clean database
                start_json.append(temp_object)
                # print( o_id, temp_object['team'] )
        except:
            pass
                
    return start_json

In [26]:
# read all files and select object with proper key words

start_json = []

def ReadJson( path ):
    start_n = len(start_json)
    print( path )
    path = open(path)
    object_json = json.load( path )
    print(f'original count {len(object_json)}')
    clean_json = CleanData(object_json , start_json) # use function to clean and select data
    print(f'clean count {len(start_json) - start_n}' )
    return start_json

clean_1 = ReadJson( '../data/original_data/original_api_output_1.json' )
clean_2 = ReadJson( '../data/original_data/original_api_output_2.json' )
clean_3 = ReadJson( '../data/original_data/original_api_output_3.json' )
clean_4 = ReadJson( '../data/original_data/original_api_output_4.json' )
    
print(f'total clean count {len(start_json)}' )

with open("../data/met_data.json", "w") as outfile:
    json.dump(start_json, outfile)

../data/original_data/original_api_output_1.json
original count 28172
clean count 1576
../data/original_data/original_api_output_2.json
original count 1067
clean count 16
../data/original_data/original_api_output_3.json
original count 17288
clean count 797
../data/original_data/original_api_output_4.json
original count 4815
clean count 3475
total clean count 5864


In [27]:
# data review
met_df = pd.read_json("../data/met_data.json")
met_df.to_csv("../data/original_data/met_data.csv") # to convert sqlit
met_df

Unnamed: 0,team,objectID,displayGallery,objectName,objectURL,objectImage,isImage,title,artist,department,...,region,year,yearDecade,yearCentury,yearCenturyInt,yearCenturyMultiple,country,geoCode,continent,tags
0,cat,545971,134,"Figurine, cat",https://www.metmuseum.org/art/collection/searc...,https://images.metmuseum.org/CRDImages/eg/web-...,True,Cat,,Egyptian Art,...,,-664,-670s,7 BC,-600,B.C.,Egypt,c_13,,Cats
1,cat,203392,512,Toy,https://www.metmuseum.org/art/collection/searc...,https://images.metmuseum.org/CRDImages/es/web-...,True,Cat,Saint James's Factory,European Sculpture and Decorative Arts,...,,1750,1750s,18 AD,1800,16th to 21st Century,British,c_11,,Cats
2,cat,544118,134,"Statuette, cat",https://www.metmuseum.org/art/collection/searc...,https://images.metmuseum.org/CRDImages/eg/web-...,True,Cat Statuette intended to contain a mummified cat,,Egyptian Art,...,,-332,-340s,4 BC,-300,B.C.,Egypt,c_13,,Cats
3,cat,49698,,Folding fan mounted as an album leaf,https://www.metmuseum.org/art/collection/searc...,,False,Cat,Zhang Yuguang,Asian Art,...,,1900,1900s,19 AD,1900,16th to 21st Century,Chinese,c_18,,Cats
4,cat,199446,,Figure,https://www.metmuseum.org/art/collection/searc...,https://images.metmuseum.org/CRDImages/es/web-...,True,Cat,John Astbury,European Sculpture and Decorative Arts,...,,1740,1740s,18 AD,1800,16th to 21st Century,,,,Cats
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5859,dog,724696,,Print,https://www.metmuseum.org/art/collection/searc...,,False,"The Dog Ball...., from ""L'Illustration""","CHAM (Amédée Charles Henri, Comte de Noé)",Drawings and Prints,...,,1844,1840s,19 AD,1900,16th to 21st Century,French,c_17,,
5860,dog,381472,,Print,https://www.metmuseum.org/art/collection/searc...,,False,"The Dog and Ox, from Aesop's Fables",Francis Barlow,Drawings and Prints,...,,1755,1750s,18 AD,1800,16th to 21st Century,British,c_11,,
5861,dog,634111,,Print,https://www.metmuseum.org/art/collection/searc...,,False,Section of The Print at Lower Right Depicting ...,Titian (Tiziano Vecellio),Drawings and Prints,...,,1549,1540s,16 AD,1600,16th to 21st Century,Italian,c_26,,
5862,dog,397041,,Print,https://www.metmuseum.org/art/collection/searc...,,False,The Dog Which Carried Round His Neck His Maste...,Gustave Doré,Drawings and Prints,...,,1868,1860s,19 AD,1900,16th to 21st Century,French,c_17,,
