In [1]:
import requests

In [2]:
import pandas as pd

In [3]:
from bs4 import BeautifulSoup

In [4]:
# Fetch the collection's "discover" landing page so its overall HTML can be inspected.
# The timeout keeps the cell from hanging indefinitely when the server does not answer,
# and raise_for_status() stops on an HTTP error instead of parsing an error page.
response = requests.get("https://www.rijksmuseum.nl/en/collection/discover", timeout=30)
response.raise_for_status()
html_string = response.text

document = BeautifulSoup(html_string, "html.parser")

In [5]:
# Show the parsed page as the cell output so its markup can be inspected
document

<!DOCTYPE html>
<html data-capo="" lang="en"><head><meta charset="utf-8"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<title>Discover the collection - Rijksmuseum</title>
<link href="/_collection/entry.B0dTSblS.css" rel="stylesheet"/>
<link href="/_collection/_...CBIr6fYz.css" rel="stylesheet"/>
<link href="/_collection/useTabs.DxfvFMOB.css" rel="stylesheet"/>
<link href="/_collection/useFocusTrap.PtEbhjV-.css" rel="stylesheet"/>
<link href="/_collection/index.BENb9xkj.css" rel="stylesheet"/>
<link href="/_collection/index.CbsG4FDN.css" rel="stylesheet"/>
<link href="/_collection/index.Drq4oQSe.css" rel="stylesheet"/>
<link href="/_collection/index.BBXKVJlY.css" rel="stylesheet"/>
<link href="/_collection/index.R-u0Schx.css" rel="stylesheet"/>
<link href="/_collection/index.Do7o4_Cy.css" rel="stylesheet"/>
<link href="/_collection/Base.CPiFPDn5.css" rel="stylesheet"/>
<link href="/_collection/useModalStore.CETig8YK.css" rel="stylesheet"/>
<link href="/_collec

In [6]:
# Limit the scope of the data to one theme of the Rijksmuseum collection: "Portraiture".
# The timeout keeps the cell from hanging indefinitely when the server does not answer,
# and raise_for_status() stops on an HTTP error instead of parsing an error page.
response = requests.get(
    "https://www.rijksmuseum.nl/en/collection/node/Portraiture--a7c5ba17a2c44f96a25b7c8e0f6fa33d?collectionSearchContext=Art&page=1&sortingType=Popularity&facets[0].id=a7c5ba17a2c44f96a25b7c8e0f6fa33d&facets[0].nodeRelationType=HasRijksTheme",
    timeout=30,
)
response.raise_for_status()
html_string = response.text

document = BeautifulSoup(html_string, "html.parser")

In [7]:
# Show the parsed page as the cell output so its markup can be inspected
document

<!DOCTYPE html>
<html data-capo="" lang="en"><head><meta charset="utf-8"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<title>Portraiture - Rijksmuseum</title>
<link href="/_collection/entry.B0dTSblS.css" rel="stylesheet"/>
<link href="/_collection/default.DVL-a1YW.css" rel="stylesheet"/>
<link href="/_collection/index.Caumw-Rq.css" rel="stylesheet"/>
<link href="/_collection/useFocusTrap.PtEbhjV-.css" rel="stylesheet"/>
<link href="/_collection/index.C35lhP4s.css" rel="stylesheet"/>
<link href="/_collection/index.C5EJz4Mt.css" rel="stylesheet"/>
<link href="/_collection/Base.CPiFPDn5.css" rel="stylesheet"/>
<link href="/_collection/useModalStore.CETig8YK.css" rel="stylesheet"/>
<link href="/_collection/useObjectActionsMenu.CjmQHHRA.css" rel="stylesheet"/>
<link href="/_collection/useUserCreationsStore.BA1ae6s5.css" rel="stylesheet"/>
<link href="/_collection/index.D38s1jtw.css" rel="stylesheet"/>
<link href="/_collection/index.CsiaHZmA.css" rel="stylesheet"/>

In [9]:
import csv

# Helper and entry point that extract one artwork's metadata for the CSV file
def _split_subtitle(subtitle_text):
    """Split a subtitle into ``(technique, date)``.

    The subtitle is split on ``', '``; the second-to-last part is taken as
    the technique and the last part as the date. Both are ``"Unknown"`` when
    the subtitle has fewer than two parts.
    """
    parts = subtitle_text.split(', ')
    if len(parts) >= 2:
        return parts[-2], parts[-1]
    return "Unknown", "Unknown"


def scrape_artwork_info(url, timeout=30):
    """Download one artwork page and extract its metadata.

    Args:
        url: Address of the artwork page to scrape.
        timeout: Seconds to wait for the server, passed to ``requests.get``.
            Without it a single unresponsive page would block the whole
            scrape indefinitely.

    Returns:
        A dict with the keys ``'title'``, ``'technique'``, ``'date'``,
        ``'description'`` and ``'image_url'``. An element that is missing
        from the page is reported with a placeholder string instead of
        raising. The HTTP status code is not checked, so an error page
        simply yields placeholder values.

    Raises:
        requests.exceptions.RequestException: propagated from
            ``requests.get``, e.g. when the request times out.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Title of the portrait
    title_tag = soup.find("h1", class_="heading-4 object-title")
    title = title_tag.text.strip() if title_tag is not None else "Title not found"

    # Technique and date share a single comma-separated subtitle line
    subtitle_tag = soup.find("p", class_="body object-subtitle")
    if subtitle_tag is not None:
        technique, date = _split_subtitle(subtitle_tag.text.strip())
    else:
        technique, date = "Unknown", "Unknown"

    description_tag = soup.find("p", class_="body-large")
    if description_tag is not None:
        description = description_tag.text.strip()
    else:
        description = "Description not found"

    # The link to the portrait's image is read from the tag's data-src attribute
    image_tag = soup.find("img", class_="lazy-image")
    if image_tag is not None:
        image_url = image_tag.attrs.get('data-src', "Image URL not found")
    else:
        image_url = "Image URL not found"

    return {
        'title': title,
        'technique': technique,
        'date': date,
        'description': description,
        'image_url': image_url
    }

# Pages to scrape: individual portraits from the Rijksmuseum. Every address is the
# object page of one artwork followed by the same search-context query string.
urls = [
    "https://www.rijksmuseum.nl/en/collection/object/" + object_slug
    + "?collectionSearchContext=Art&page=1&sortingType=Popularity"
    + "&facets[0].id=a7c5ba17a2c44f96a25b7c8e0f6fa33d"
    + "&facets[0].nodeRelationType=HasRijksTheme"
    for object_slug in (
        "Self-portrait-as-the-Apostle-Paul--4faa97ed774e6e3f81b76cf3aed6226d",
        "Self-portrait--72f97ac66c33f86b161cd51d62f7d365",
        "Portrait-of-Marie-Jeanette-de-Lange--443eae859f95c387ab0ad79562c98340",
        "Isabella--ead5f623d828c7250ce5413e809b3551",
    )
]

# Run the scraper over every selected page, collecting one metadata record per URL
artworks_data = [scrape_artwork_info(page_url) for page_url in urls]

# Export the collected records to a CSV file: a header row, then one row per artwork
csv_filename = "portraits_metadata_rijksmuseum.csv"
fieldnames = ["title", "technique", "date", "description", "image_url"]

with open(csv_filename, "w", newline="", encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(artworks_data)
