# Collecting Art Data

In [3]:
import requests
import json
from dotenv import load_dotenv
import os

In [4]:


# Load environment variables from the .env file
load_dotenv(os.path.join(os.getcwd(), '..', '.env'))

True

current code that works, creates a json document with name, place, year, topic, object type and culture. although most are not visual artworks 

In [5]:
import requests
import json
import re

# Define the API details
BASE_URL = "https://api.si.edu/openaccess/api/v1.0/search"
API_KEY =  os.getenv('SMITHSONIAN_TOKEN')

# Define country variations
COUNTRIES = ["US", "United States", "the United States", "USA", "the United States of America", "America"]

# JSON file to store results
OUTPUT_FILE = "artworks.json"

# Function to normalize dates
def parse_date(raw_date):
    if not raw_date:
        return "Unknown"
    
    # Handle century formats
    century_match = re.search(r"(\d+)(st|nd|rd|th)\scentury", raw_date, re.IGNORECASE)
    if century_match:
        return century_match.group(0)
    
    # Handle decades
    decade_match = re.search(r"(\d{4}s)", raw_date)
    if decade_match:
        return decade_match.group(0)
    
    # Handle approximate dates
    approx_match = re.search(r"(around|circa|ca\.?)\s?(\d+)", raw_date, re.IGNORECASE)
    if approx_match:
        return approx_match.group(2)
    
    # Extract year or fallback to raw string
    year_match = re.search(r"\d{4}", raw_date)
    return year_match.group(0) if year_match else raw_date

# Function to fetch data from the API
def fetch_artworks_by_country_and_date(country_names):
    results = []
    
    for country in country_names:
        # Simplify the query to check for any artwork (no specific topics)
        query = f"country:{country} AND date:*"
        params = {
            "q": query,
            "start": 0,
            "rows": 100,
            "api_key": API_KEY
        }
        
        try:
            response = requests.get(BASE_URL, params=params)
            response.raise_for_status()
            data = response.json()
            
            # Debug: print the raw response to check for issues
            print(f"Response from {country}: {data}")
            
            if "response" in data and "rows" in data["response"]:
                for item in data["response"]["rows"]:
                    freetext = item.get("content", {}).get("freetext", {})
                    
                    # Extract object_type
                    object_type = [ot.get("content", "Unknown") for ot in freetext.get("objectType", [])]
                    
                    artwork = {
                        "name": item.get("title", "Unknown"),
                        "place": country,
                        "date": parse_date(freetext.get("date", [{"content": "Unknown"}])[0]["content"]),
                        "topic": [t.get("content", "Unknown") for t in freetext.get("topic", [])],
                        "culture": [c.get("content", "Unknown") for c in freetext.get("culture", [])],
                        "object_type": object_type  # Added object_type to the artwork dictionary
                    }
                    results.append(artwork)
        except requests.exceptions.RequestException as e:
            print(f"Error fetching data for country {country}: {e}")
    
    return results

# Main logic
def main():
    print("Fetching artworks...")
    artworks = fetch_artworks_by_country_and_date(COUNTRIES)
    
    if artworks:
        with open(OUTPUT_FILE, "w") as f:
            json.dump(artworks, f, indent=4)
        print(f"Fetched {len(artworks)} artworks. Results saved to {OUTPUT_FILE}.")
    else:
        print("No artworks found.")

if __name__ == "__main__":
    main()


Fetching artworks...
Response from US: {'status': 200, 'responseCode': 1, 'response': {'rows': [], 'facets': {}, 'rowCount': 0, 'message': 'no results found'}}
Response from United States: {'status': 200, 'responseCode': 1, 'response': {'rows': [], 'facets': {}, 'rowCount': 0, 'message': 'no results found'}}
Response from the United States: {'status': 200, 'responseCode': 1, 'response': {'rows': [], 'facets': {}, 'rowCount': 0, 'message': 'no results found'}}
Response from USA: {'status': 200, 'responseCode': 1, 'response': {'rows': [], 'facets': {}, 'rowCount': 0, 'message': 'no results found'}}
Response from the United States of America: {'status': 200, 'responseCode': 1, 'response': {'rows': [{'id': 'sro-1663946979444-1663947024669-0', 'title': 'The mineral industries of the United States, vol. 1 . The energy resources of the United States: A field for reconstruction, pt. 2: Fertilizers: An interpretation of the situation in the United States', 'unitCode': 'SLA_SRO', 'type': 'edanmd

code below works but returns data that are not typical artworks such as manuscripts

In [5]:
import requests
import json
import re

# Define the API details
BASE_URL = "https://api.si.edu/openaccess/api/v1.0/search"
API_KEY = os.getenv('SMITHSONIAN_TOKEN')

# Define country variations
COUNTRIES = ["US", "United States", "the United States", "USA", "the United States of America", "America"]

# JSON file to store results
OUTPUT_FILE = "artworks.json"

# Function to normalize dates
def parse_date(raw_date):
    if not raw_date:
        return "Unknown"
    # Handle century formats
    century_match = re.search(r"(\d+)(st|nd|rd|th)\scentury", raw_date, re.IGNORECASE)
    if century_match:
        return century_match.group(0)
    # Handle decades
    decade_match = re.search(r"(\d{4}s)", raw_date)
    if decade_match:
        return decade_match.group(0)
    # Handle approximate dates
    approx_match = re.search(r"(around|circa|ca\.?)\s?(\d+)", raw_date, re.IGNORECASE)
    if approx_match:
        return approx_match.group(2)
    # Extract year or fallback to raw string
    year_match = re.search(r"\d{4}", raw_date)
    return year_match.group(0) if year_match else raw_date

# Function to fetch data from the API
def fetch_artworks_by_country_and_date(country_names):
    results = []
    for country in country_names:
        query = f"country:{country} AND date:*"
        params = {
            "q": query,
            "start": 0,
            "rows": 100,
            "api_key": API_KEY
        }
        try:
            response = requests.get(BASE_URL, params=params)
            response.raise_for_status()
            data = response.json()
            
            if "response" in data and "rows" in data["response"]:
                for item in data["response"]["rows"]:
                    freetext = item.get("content", {}).get("freetext", {})
                    artwork = {
                        "name": item.get("title", "Unknown"),
                        "place": country,
                        "date": parse_date(freetext.get("date", [{"content": "Unknown"}])[0]["content"]),
                        "topic": [t.get("content", "Unknown") for t in freetext.get("topic", [])],
                        "culture": [c.get("content", "Unknown") for c in freetext.get("culture", [])]
                    }
                    results.append(artwork)
        except requests.exceptions.RequestException as e:
            print(f"Error fetching data for country {country}: {e}")
    return results

# Main logic
def main():
    print("Fetching artworks...")
    artworks = fetch_artworks_by_country_and_date(COUNTRIES)
    if artworks:
        with open(OUTPUT_FILE, "w") as f:
            json.dump(artworks, f, indent=4)
        print(f"Fetched {len(artworks)} artworks. Results saved to {OUTPUT_FILE}.")
    else:
        print("No artworks found.")

if __name__ == "__main__":
    main()


Fetching artworks...
Fetched 100 artworks. Results saved to artworks.json.
