Initial Exploration of Artsy.net API 

Downloading images and uploading to a bucket

In [1]:
# Import cell
import requests
import os
from google.cloud import storage
import pandas as pd
import shutil

In [2]:
# Notebook-wide configuration, read from environment variables.

# API token, GCP project and bucket name come from your env file;
# if you have not set one up, replace these lookups with your own values.
# Any variable that is not set comes back as None.
API_XAPP_TOKEN = os.getenv("API_XAPP_TOKEN")
GCP_PROJECT = os.getenv("GCP_PROJECT")
BUCKET_NAME = os.getenv("BUCKET_NAME")

# This one does not need the .env file: it was configured on the first day
# and points at a JSON key file on each local machine.
# Print it to check that it is set.
GOOGLE_APPLICATION_CREDENTIALS = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")

# Manual step: in the cloud console, create a folder called images_raw in your bucket.


In [3]:
# Fetch a small sample of artworks from the Artsy API.
# The request asks for 5 items via the `size` parameter.

url = 'https://api.artsy.net/api/artworks'
params = {'size': '5',
          'xapp_token': API_XAPP_TOKEN}

# Use a timeout so a stalled connection cannot hang the notebook, and fail
# loudly on an HTTP error (e.g. a missing/expired token) instead of hitting
# an opaque KeyError on the error payload further down.
http_response = requests.get(url, params=params, timeout=30)
http_response.raise_for_status()
response = http_response.json()

# get list of artworks only
items = response['_embedded']['artworks']


In [4]:
# Empty frame that fixes the artwork column order.
# NOTE(review): the same template is re-created in a later cell before it is
# filled, so this cell has no lasting effect.
artworks_df = pd.DataFrame(
    columns=(
        'artwork_id',
        'title',
        'category',
        'medium',
        'date',
        'height_cm',
        'width_cm',
        'image_versions',
        'image_url',
        'collecting_institution',
        'image_url_is_template',
    )
)

In [6]:

# iterate through artworks and get relevant info

def _artwork_record(artwork):
    """Flatten one artwork payload from the API into a single-level dict.

    Parameters
    ----------
    artwork : dict
        One element of the ``_embedded.artworks`` list.

    Returns
    -------
    dict
        Identification, description and cm dimensions of the artwork, plus
        its image fields. When the image information is missing, the image
        fields fall back to ``[]``, ``''`` and ``False``.

    Raises
    ------
    KeyError
        If one of the non-image fields is absent from the payload.
    """
    dimensions_cm = artwork['dimensions']['cm']
    record = {
        'artwork_id': artwork['id'],
        'title': artwork['title'],
        'category': artwork['category'],
        'medium': artwork['medium'],
        'date': artwork['date'],
        'height_cm': dimensions_cm['height'],
        'width_cm': dimensions_cm['width'],
        'depth_cm': dimensions_cm['depth'],
        'diameter_cm': dimensions_cm['diameter'],
        'collecting_institution': artwork['collecting_institution'],
    }

    # Not every artwork has images. Only a missing or empty field counts as
    # "no image"; anything else (including Ctrl-C) is allowed to propagate.
    try:
        image_versions = artwork['image_versions']
        image_link = artwork['_links']['image']
        image_fields = {
            'image_versions': image_versions,
            'image_url': image_link['href'],
            'image_url_is_template': image_link['templated'],
        }
    except (KeyError, TypeError):
        # All three fields are reset together, even if only one was missing.
        image_fields = {
            'image_versions': [],
            'image_url': '',
            'image_url_is_template': False,
        }
    record.update(image_fields)
    return record


artworks = [_artwork_record(artwork) for artwork in items]
    



In [7]:
# Empty frame that fixes the column order of the artwork table.
# depth_cm and diameter_cm are listed explicitly (last, where they already end
# up after the records are appended) so the template matches every key that
# the extraction loop collects.
artworks_df = pd.DataFrame(
    columns=(
        'artwork_id',
        'title',
        'category',
        'medium',
        'date',
        'height_cm',
        'width_cm',
        'image_versions',
        'image_url',
        'collecting_institution',
        'image_url_is_template',
        'depth_cm',
        'diameter_cm',
    )
)

In [8]:
# Append all collected records in a single concat. Growing the frame one row
# at a time copies the whole frame on every iteration (quadratic in the
# number of artworks). Columns keep the template's order; keys the template
# does not list are added after them.
if artworks:
    artworks_df = pd.concat([artworks_df, pd.DataFrame(artworks)], ignore_index=True)

In [14]:
# Show every field of the first artwork as a Series.
artworks_df.iloc[0]

artwork_id                                         4d8b92eb4eb68a1b2c000968
title                                                   Der Kuss (The Kiss)
category                                                           Painting
medium                                          Oil and gold leaf on canvas
date                                                              1907-1908
height_cm                                                             180.1
width_cm                                                              180.1
image_versions            [large, larger, medium, medium_rectangle, norm...
image_url                 https://d32dm0rphc51dk.cloudfront.net/NOpIAwQa...
collecting_institution            Österreichische Galerie Belvedere, Vienna
image_url_is_template                                                  True
depth_cm                                                               None
diameter_cm                                                            None
Name: 0, dtype: object

In [24]:
# Start from an empty ./data folder: wipe and recreate it when it is already
# there, otherwise just create it.
path = './data'
if os.path.exists(path):
    shutil.rmtree(path)
    os.mkdir(path)
    print("Folder deleted and recreated")
else:
    os.mkdir(path)
    print("Folder %s created!" % path)
  

Folder deleted and recreated


In [25]:
# Loop through the artworks: download the "medium" rendition of each one to
# the local data/ folder, then upload that file to the bucket under images_raw/.
# The local copies are kept (the delete at the bottom of the loop is disabled).

# The storage client and the bucket are the same for every image, so they are
# created once here instead of once per artwork.
storage_client = storage.Client.from_service_account_json(GOOGLE_APPLICATION_CREDENTIALS, project=GCP_PROJECT)
bucket = storage_client.get_bucket(BUCKET_NAME)

for _, artwork_row in artworks_df.iterrows():
    # only artworks that offer a medium version are processed
    if 'medium' not in artwork_row['image_versions']:
        continue

    title = artwork_row['title']

    # the image url is a template; fill in the version we want
    url = artwork_row['image_url'].replace('{image_version}', 'medium')
    print(url)

    # The local file is named after the artwork title. A path separator inside
    # the title would point into a sub-directory that does not exist, so it is
    # replaced for the local copy only; the blob name keeps the title as is.
    safe_title = f"{title}".replace('/', '_').replace(os.sep, '_')
    file_name = f"data/{safe_title}.jpg"

    # Download the image. A failed request is skipped rather than saved,
    # otherwise the error body would be uploaded as if it were a JPEG.
    image_response = requests.get(url, timeout=30)
    if not image_response.ok:
        print(f"Skipping {title!r}: download failed with HTTP {image_response.status_code}")
        continue
    data = image_response.content

    # store the image bytes locally; `with` closes the file even on error
    with open(file_name, 'wb') as local_file:
        local_file.write(data)

    # create the blob and set its content type
    blob = bucket.blob(f"images_raw/{title}.jpg")
    blob.content_type = 'image/jpeg'

    # upload file
    with open(file_name, 'rb') as local_file:
        blob.upload_from_file(local_file)

    #delete file
    #os.remove(file_name)
    
    
  

https://d32dm0rphc51dk.cloudfront.net/NOpIAwQa-3r51Cg9qXKbfA/medium.jpg
https://d32dm0rphc51dk.cloudfront.net/m4X41Fun8gpDjn7Gat9cUg/medium.jpg
https://d32dm0rphc51dk.cloudfront.net/IG8ZLvVmZgQiTn2zK0Bp8w/medium.jpg
https://d32dm0rphc51dk.cloudfront.net/5L1xjKC_und1uiKCpUPHhw/medium.jpg
https://d32dm0rphc51dk.cloudfront.net/zFA7cwdkWxbIrmuAAd21VA/medium.jpg


In [2]:
import os
# NOTE(review): absolute path to the notebook inside one machine's checkout;
# it will not resolve on other machines.
cd = '/home/mollyppl/code/molpl/artsy-fartsci-back/notebooks/project_start_off.ipynb'
# Path of encoded_images/encoded_images.npy next to the notebook.
os.path.join(os.path.split(cd)[0], '.', 'encoded_images', 'encoded_images.npy')

'/home/mollyppl/code/molpl/artsy-fartsci-back/notebooks/./encoded_images/encoded_images.npy'