# Setting up the scraping environment

In [21]:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
import pandas as pd
import requests
import numpy as np
from tqdm import tqdm
import os


# Looping through the object pages to gather image links

## Loading the database and creating a new table for the scraped links


In [12]:
# Read in the database, limited to the first 20 rows.
# NOTE: f_path is a placeholder - set it to the location of the CSV before running.
f_path = ""
# Pass the path variable and the row limit as real arguments; wrapping them in
# quotes would make pandas look for a file literally named "f_path, nrows = 20".
data = pd.read_csv(f_path, nrows=20)


In [None]:
# New table containing just the id and the url.
# Columns are picked by position; the later cells read them back by the names
# "Object ID" and "Link Resource" - TODO confirm positions 4 and 47 match those
# names if the CSV layout ever changes.
cols = [4, 47]

# .copy() makes scrape_dt an independent frame, so adding a column below does not
# raise SettingWithCopyWarning on pandas versions that emit it.
scrape_dt = data[data.columns[cols]].copy()

# Empty column that the scraping loop fills with the image URL of each row.
scrape_dt["im_link"] = None

## (ChatGPT-modified) code for scraping the image links into the `scrape_dt` DataFrame


In [22]:
# Loop through the rows of scrape_dt. For each row, request the page in
# "Link Resource", look up the artwork <img> element in the HTML and store its
# "src" attribute in the "im_link" column.
# If the request fails or the page has no such element (there are a lot of
# missing pictures on the subpages), the cell is set to NaN instead.
for index, row in scrape_dt.iterrows():
    url = row['Link Resource']  # Get the URL from the current row

    try:
        # Without a timeout requests waits indefinitely, so a single stalled
        # page would hang the whole loop. A timeout raises
        # requests.exceptions.Timeout, which the handler below catches.
        response = requests.get(url, timeout=30)
        response.raise_for_status()  # Raises an error for 4xx and 5xx responses
    except requests.RequestException as e:
        print(f"Request failed for {url}: {e}")
        scrape_dt.loc[index, "im_link"] = np.nan  # If request fails, set im_link to NA
        continue  # Skip to the next row

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    # First element matching the artwork image selector, or None if there is none
    element = soup.select_one('img#artwork__image.artwork__image.js-artwork__image.gtm__artwork__image')

    if element is not None:
        # Read the 'src' attribute directly; default to NA if the attribute is missing
        image = element.get("src", np.nan)
        scrape_dt.loc[index, "im_link"] = image  # Assign the scraped link to the 'im_link' column
        print(image)  # Print the extracted image link for debugging
    else:
        # If no element is found, set the column to NA instead of skipping
        print(f"No image found for {url}. Setting to NA.")
        scrape_dt.loc[index, "im_link"] = np.nan


No image found for http://www.metmuseum.org/art/collection/search/1. Setting to NA.
No image found for http://www.metmuseum.org/art/collection/search/2. Setting to NA.
No image found for http://www.metmuseum.org/art/collection/search/3. Setting to NA.
No image found for http://www.metmuseum.org/art/collection/search/4. Setting to NA.
No image found for http://www.metmuseum.org/art/collection/search/5. Setting to NA.
No image found for http://www.metmuseum.org/art/collection/search/6. Setting to NA.
https://collectionapi.metmuseum.org/api/collection/v1/iiif/7/20225/restricted
https://collectionapi.metmuseum.org/api/collection/v1/iiif/8/20226/restricted
No image found for http://www.metmuseum.org/art/collection/search/9. Setting to NA.
No image found for http://www.metmuseum.org/art/collection/search/10. Setting to NA.
No image found for http://www.metmuseum.org/art/collection/search/11. Setting to NA.
No image found for http://www.metmuseum.org/art/collection/search/12. Setting to NA.
N

## Downloading images to the output directory (each image is saved as `<Object ID>.jpg`)

In [26]:
# Folder where images will be saved.
# NOTE: placeholder - set it to a real directory before running; os.makedirs("")
# raises FileNotFoundError.
output_folder = ""

os.makedirs(output_folder, exist_ok=True)  # Create the folder if it doesn't exist

# Download every image link in scrape_dt to "<output_folder>/<Object ID>.jpg".
# Rows without a usable link are skipped; failed downloads are reported and skipped.
for index, row in scrape_dt.iterrows():
    image_url = row["im_link"]  # Get the image link from the column
    image_id = row["Object ID"]  # Use the ID column for naming

    # Skip if the image link is NA or None (NaN is a float, so it fails the str check)
    if not isinstance(image_url, str) or image_url.lower() in ["nan", "none"]:
        print(f"Skipping row {index}: No valid image link.")
        continue

    image_filename = os.path.join(output_folder, f"{image_id}.jpg")
    # Download into a temporary name first, so an interrupted transfer never
    # leaves a truncated file under the final .jpg name.
    partial_filename = image_filename + ".part"

    try:
        # The timeout stops a stalled server from hanging the loop. The with-block
        # closes the streamed response (and releases its connection) even when
        # the download fails part-way.
        with requests.get(image_url, stream=True, timeout=30) as response:
            response.raise_for_status()  # Ensure request was successful

            with open(partial_filename, "wb") as file:
                for chunk in response.iter_content(1024):  # Download in chunks
                    file.write(chunk)

        # Only a completely written file is moved to its final name
        os.replace(partial_filename, image_filename)
        print(f"Downloaded: {image_filename}")

    except requests.RequestException as e:
        # Remove whatever was written before the failure
        if os.path.exists(partial_filename):
            os.remove(partial_filename)
        print(f"Failed to download {image_url} for ID {image_id}: {e}")




Skipping row 0: No valid image link.
Skipping row 1: No valid image link.
Skipping row 2: No valid image link.
Skipping row 3: No valid image link.
Skipping row 4: No valid image link.
Skipping row 5: No valid image link.
Downloaded: /Users/zoeungvari/databases/met_images/7.jpg
Downloaded: /Users/zoeungvari/databases/met_images/8.jpg
Skipping row 8: No valid image link.
Skipping row 9: No valid image link.
Skipping row 10: No valid image link.
Skipping row 11: No valid image link.
Skipping row 12: No valid image link.
Skipping row 13: No valid image link.
Skipping row 14: No valid image link.
Skipping row 15: No valid image link.
Skipping row 16: No valid image link.
Skipping row 17: No valid image link.
Skipping row 18: No valid image link.
Skipping row 19: No valid image link.


In [24]:
# Show the first 10 rows of scrape_dt to check which rows received an image link.
scrape_dt.head(10)



Unnamed: 0,Object ID,Link Resource,im_link
0,1,http://www.metmuseum.org/art/collection/search/1,
1,2,http://www.metmuseum.org/art/collection/search/2,
2,3,http://www.metmuseum.org/art/collection/search/3,
3,4,http://www.metmuseum.org/art/collection/search/4,
4,5,http://www.metmuseum.org/art/collection/search/5,
5,6,http://www.metmuseum.org/art/collection/search/6,
6,7,http://www.metmuseum.org/art/collection/search/7,https://collectionapi.metmuseum.org/api/collec...
7,8,http://www.metmuseum.org/art/collection/search/8,https://collectionapi.metmuseum.org/api/collec...
8,9,http://www.metmuseum.org/art/collection/search/9,
9,10,http://www.metmuseum.org/art/collection/search/10,
