# Met scraping code

What you will need to adjust before running:
- f_path: path to the Met txt database
- output_folder: path to the folder where the images should be saved (the big parquet file is saved into the current working directory)

have fun! 🌱

## Setting up scraping environment

In [1]:
import os
import time
import random
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

## Scraping

### Setting up db

In [2]:
# Location of the Met txt database; fill this in before running.
f_path = ""
data = pd.read_csv(filepath_or_buffer=f_path)
# Folder the downloaded images are written to; fill this in before running.
output_folder = ""

In [None]:
# Keep just the id and url columns, selected by position (the scraping loop
# later reads them by name as 'Object ID' and 'Link Resource').
cols = [4, 47]
# .copy() makes scrape_dt an independent DataFrame, so adding and filling the
# 'im_link' column does not raise pandas' SettingWithCopyWarning and never
# depends on whether the selection is a view or a copy of `data`.
scrape_dt = data[data.columns[cols]].copy()
# Placeholder for the scraped image URL; filled in row by row while scraping.
scrape_dt["im_link"] = None

### Looping through the database, scraping and saving images

The code processes the database in batches of 15 rows (this can be changed by assigning a different value to batch_size).
For each batch, it:
- scrapes the image url associated with each id in the Met txt catalogue and pastes the image url into the dataframe
- saves the images into the predefined output_folder as ID + .jpg
- writes the dataframe into a parquet file so the progress will not be lost if the process crashes
- sleeps for a random interval before continuing to the next batch, so that we don't exceed the allowed query maximum of the Met site

code was tested on the first 2000 rows


In [4]:
# scrape_dt must include the columns: 'Link Resource', 'im_link', and 'Object ID'.
#
# For every batch of rows this cell:
#   1. requests each 'Link Resource' page and stores the artwork image URL in
#      'im_link' (NaN when the request fails or no image element is found),
#   2. downloads each found image to output_folder as "<Object ID>.jpg",
#   3. rewrites the whole DataFrame to one Parquet file in the working directory,
#   4. sleeps for a random 1-5 seconds before starting the next batch.

os.makedirs(output_folder, exist_ok=True)
big_parquet_file = "complete_data.parquet"

batch_size = 15
total_rows = len(scrape_dt)

# requests waits forever by default; a timeout keeps one stalled connection
# from hanging the whole run. A timeout raises a RequestException subclass,
# so it is handled like any other failed request below.
request_timeout = 30  # seconds
download_chunk_size = 8192  # bytes per write while streaming an image

# A single Session reuses connections across the many requests made here.
with requests.Session() as session:
    for start in range(0, total_rows, batch_size):
        end = min(start + batch_size, total_rows)
        print(f"Processing rows {start} to {end - 1}...")

        # Slice the index positionally so .loc receives real row labels even
        # when scrape_dt does not carry a default 0..n-1 index.
        batch_labels = scrape_dt.index[start:end]

        # --- Scraping Phase ---
        for index in batch_labels:
            url = scrape_dt.loc[index, "Link Resource"]
            try:
                response = session.get(url, timeout=request_timeout)
                response.raise_for_status()  # Throws error on bad responses
            except requests.RequestException as e:
                print(f"Request failed for {url}: {e}")
                scrape_dt.loc[index, "im_link"] = np.nan
                continue

            soup = BeautifulSoup(response.text, "html.parser")
            elements = soup.select(
                "img#artwork__image.artwork__image.js-artwork__image.gtm__artwork__image"
            )
            if elements:
                image_link = elements[0].get("src", np.nan)
                scrape_dt.loc[index, "im_link"] = image_link
                print(f"Row {index}: Found image link: {image_link}")
            else:
                print(f"Row {index}: No image found for {url}. Setting to NA.")
                scrape_dt.loc[index, "im_link"] = np.nan

        # --- Image Downloading Phase ---
        for index in batch_labels:
            image_url = scrape_dt.loc[index, "im_link"]
            image_id = scrape_dt.loc[index, "Object ID"]
            if not isinstance(image_url, str) or image_url.lower() in ["nan", "none"]:
                print(
                    f"Skipping row {index}: No valid image link for Object ID {image_id}."
                )
                continue

            image_filename = os.path.join(output_folder, f"{image_id}.jpg")
            # Stream into a temporary file and rename only on success, so an
            # interrupted transfer never leaves a truncated .jpg behind.
            partial_filename = image_filename + ".part"
            try:
                # The context manager releases the streamed connection even
                # when the download fails part-way through.
                with session.get(
                    image_url, stream=True, timeout=request_timeout
                ) as response:
                    response.raise_for_status()
                    with open(partial_filename, "wb") as file:
                        for chunk in response.iter_content(download_chunk_size):
                            file.write(chunk)
                os.replace(partial_filename, image_filename)
            except requests.RequestException as e:
                if os.path.exists(partial_filename):
                    os.remove(partial_filename)
                print(
                    f"Failed to download image for Object ID {image_id} from {image_url}: {e}"
                )
            else:
                print(f"Downloaded image for Object ID {image_id} to {image_filename}")

        # --- Save the Entire DataFrame to a Single Parquet File ---
        # Rewritten after every batch so scraped links survive a crash.
        scrape_dt.to_parquet(big_parquet_file, index=False)
        print(f"Updated the complete Parquet file: {big_parquet_file}")

        # --- Take a Random Snooze Before Next Batch ---
        sleep_time = random.uniform(1, 5)  # Sleep between 1 and 5 seconds
        print(
            f"Batch {start} to {end - 1} complete. Sleeping for {sleep_time:.2f} seconds...\n"
        )
        time.sleep(sleep_time)

Processing rows 0 to 14...
Row 0: No image found for http://www.metmuseum.org/art/collection/search/1. Setting to NA.
Row 1: No image found for http://www.metmuseum.org/art/collection/search/2. Setting to NA.
Row 2: No image found for http://www.metmuseum.org/art/collection/search/3. Setting to NA.
Row 3: No image found for http://www.metmuseum.org/art/collection/search/4. Setting to NA.
Row 4: No image found for http://www.metmuseum.org/art/collection/search/5. Setting to NA.
Row 5: No image found for http://www.metmuseum.org/art/collection/search/6. Setting to NA.
Row 6: Found image link: https://collectionapi.metmuseum.org/api/collection/v1/iiif/7/20225/restricted
Row 7: Found image link: https://collectionapi.metmuseum.org/api/collection/v1/iiif/8/20226/restricted
Row 8: No image found for http://www.metmuseum.org/art/collection/search/9. Setting to NA.
Row 9: No image found for http://www.metmuseum.org/art/collection/search/10. Setting to NA.
Row 10: No image found for http://www.