# Scraping Pace Gallery

Wir scrapen im Folgenden verschiedene Informationen der Pace Gallery.

## Sammle URLs

Wir sammeln in einem ersten Schritt die URLs der Ausstellungen. Dies betrifft sowohl vergangene und aktuelle als auch kommende Ausstellungen.

In [10]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
import time
import pandas as pd
from webdriver_manager.chrome import ChromeDriverManager

from selenium.common.exceptions import WebDriverException


def _click_through_pagination(driver):
    """Click the `.pagination__button` element until it can no longer be used.

    The loop ends when the button is missing, disabled, hidden, or when any
    WebDriver error occurs while interacting with it.
    """
    while True:
        try:
            button = driver.find_element(By.CSS_SELECTOR, ".pagination__button")
            # Bring the button into the viewport before clicking it.
            driver.execute_script(
                "arguments[0].scrollIntoView({behavior: 'smooth', block: 'center'});",
                button,
            )
            time.sleep(1)
            if not (button.is_enabled() and button.is_displayed()):
                break
            button.click()
        except WebDriverException:
            # Every selenium error (no such element, stale element, intercepted
            # click, ...) derives from WebDriverException. Unlike a bare
            # `except:`, this lets KeyboardInterrupt and real bugs propagate.
            break
        # Pause after a successful click before looking for the button again.
        time.sleep(1.5)


def _scroll_until_height_stable(driver):
    """Scroll to the bottom of the page until `document.body.scrollHeight` stops growing."""
    last_height = 0
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(1.5)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height


# Listing pages whose exhibition links are harvested.
sites = [
    "https://www.pacegallery.com/exhibitions/status/archive/",
    "https://www.pacegallery.com/exhibitions/"
]

options = Options()
options.add_argument("--start-maximized")

# The driver is intentionally left open: the detail-scraping cell further down
# reuses it.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
# Not used in this cell; kept in case other cells rely on it.
wait = WebDriverWait(driver, 10)
urls_comp = []

for url in sites:
    driver.get(url)
    time.sleep(1)

    # The archive listing is expanded via its pagination button, the other
    # listing by scrolling to the bottom.
    if "archive" in url:
        _click_through_pagination(driver)
    else:
        _scroll_until_height_stable(driver)

    elements = driver.find_elements(By.CSS_SELECTOR, "a.archive-list__link")
    # Read each href exactly once; every get_attribute call is a WebDriver round trip.
    hrefs = (el.get_attribute("href") for el in elements)
    urls_comp.extend(href for href in hrefs if href)

# De-duplicate while keeping first-seen order, so repeated runs yield the same
# ordering (a plain set() would shuffle the URLs arbitrarily).
urls_comp = list(dict.fromkeys(urls_comp))


In [12]:
urls_comp

['https://www.pacegallery.com/exhibitions/lee-kun-yong-3/',
 'https://www.pacegallery.com/exhibitions/zhang-huan-9/',
 'https://www.pacegallery.com/exhibitions/sam-gilliam-watercolors-2/',
 'https://www.pacegallery.com/exhibitions/david-byrne-2/',
 'https://www.pacegallery.com/exhibitions/mary-corse-seen-unseen/',
 'https://www.pacegallery.com/exhibitions/david-hockney-3/',
 'https://www.pacegallery.com/exhibitions/lucas-samaras-18/',
 'https://www.pacegallery.com/exhibitions/adolph-gottlieb/',
 'https://www.pacegallery.com/exhibitions/barry-flanagan-2/',
 'https://www.pacegallery.com/exhibitions/the-monster-curated-by-robert-nava/',
 'https://www.pacegallery.com/exhibitions/beatriz-milhazes-mistura-sagrada/',
 'https://www.pacegallery.com/exhibitions/wang-guangle-2/',
 'https://www.pacegallery.com/exhibitions/julian-schnabel-6/',
 'https://www.pacegallery.com/exhibitions/julian-schnabel-victory/',
 'https://www.pacegallery.com/exhibitions/richard-avedon-3/',
 'https://www.pacegallery.

Wir gehen im Folgenden über die URLs und sammeln folgende Variablen:

- `name`: Name des Künstlers
- `titel`: Titel der Ausstellung
- `date`: Datum der Ausstellung
- `location`: Ort der Ausstellung
- `about`: Text über die Ausstellung
- `url`: URL der Ausstellung
- `html`: HTML der Ausstellung

In [None]:
from tqdm import tqdm
import random

from selenium.common.exceptions import WebDriverException
# `EC` is used below but was never imported in this notebook; the resulting
# NameError was hidden by bare `except:` clauses, leaving name/titel empty.
from selenium.webdriver.support import expected_conditions as EC


def _wait_for_text(driver, css_selector, timeout=5):
    """Return the stripped text of the first element matching *css_selector*.

    Waits up to *timeout* seconds for the element to be present in the DOM.
    Returns ``pd.NA`` if it does not appear in time or cannot be read.
    """
    try:
        element = WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, css_selector))
        )
        return element.text.strip()
    except WebDriverException:
        # TimeoutException and stale-element errors are WebDriverException subclasses.
        return pd.NA


# CSS selectors tried in order for the "about" text; the first one yielding
# non-empty text wins.
# NOTE(review): the second selector is a positional path and looks brittle —
# confirm it still matches the intended container.
about_selectors = [
    ".exhibition-intro__right-content",
    "body > div.content-container.js-content-container > article > div > div:nth-child(8) > div > div > div > div"
]

data = []

for url in tqdm(urls_comp, desc="scraping exhibitions"):
    artist = title = date = location = about_text = html = pd.NA

    try:
        driver.get(url)
    except WebDriverException as err:
        # One unreachable page must not discard everything collected so far:
        # record an all-NA row for this URL and move on.
        tqdm.write(f"could not load {url}: {type(err).__name__}")
        data.append({
            "name": artist,
            "titel": title,
            "date": date,
            "location": location,
            "about": about_text,
            "url": url,
            "html": html
        })
        continue
    time.sleep(4.5)

    # Artist and exhibition title from the hero section.
    artist = _wait_for_text(driver, ".hero__text-heading")
    title = _wait_for_text(driver, ".hero__text-sub-heading")

    # Date + location: taken from the 2nd and 3rd non-empty line of the hero body.
    try:
        raw_body = driver.find_element(By.CSS_SELECTOR, ".hero__text-body").get_attribute("innerText")
        # get_attribute may return None; treat that as "no text".
        lines = [line.strip() for line in (raw_body or "").split("\n") if line.strip()]
        date = lines[1] if len(lines) > 1 else pd.NA
        location = lines[2] if len(lines) > 2 else pd.NA
    except WebDriverException:
        date = location = pd.NA

    # Optional read-more button: click it if present so the about container
    # is in its expanded state; its absence is not an error.
    try:
        button = driver.find_element(By.CSS_SELECTOR, ".js-read-more-btn")
        driver.execute_script("arguments[0].scrollIntoView({behavior: 'smooth', block: 'center'});", button)
        time.sleep(0.5)
        button.click()
        time.sleep(0.5)
    except WebDriverException:
        pass

    # About text from the first container that yields non-empty text.
    for sel in about_selectors:
        try:
            text = driver.find_element(By.CSS_SELECTOR, sel).text.strip()
        except WebDriverException:
            continue
        if text:
            about_text = text
            break

    # Full page source, stored alongside the parsed fields.
    try:
        html = driver.page_source
    except WebDriverException:
        html = pd.NA

    data.append({
        "name": artist,
        "titel": title,
        "date": date,
        "location": location,
        "about": about_text,
        "url": url,
        "html": html
    })

df = pd.DataFrame(data)


scraping exhibitions:   0%|          | 0/1198 [00:00<?, ?it/s]

In [33]:
df

Unnamed: 0,name,titel,date,location,about,url,html
0,DRIFT,"Materialism: Past, Present, Future","Nov 5 – Dec 18, 2021",New York,The presentation will include sculptures from ...,https://www.pacegallery.com/exhibitions/drift-3/,"<html class=""no-js"" lang=""en"" data-whatinput=""..."
1,Chuck Close,Recent Paintings,"Nov 2 – Dec 7, 1991",New York,,https://www.pacegallery.com/exhibitions/chuck-...,"<html class=""no-js"" lang=""en"" data-whatinput=""..."
2,Robert Whitman,Shading,"Jan 16 – Feb 21, 2004",New York,,https://www.pacegallery.com/exhibitions/robert...,"<html class=""no-js"" lang=""en"" data-whatinput=""..."
3,Richard Learoyd,A Loathing of Clocks and Mirrors,"Mar 7 – Apr 26, 2025",New York,,https://www.pacegallery.com/exhibitions/richar...,"<html class=""no-js"" lang=""en"" data-whatinput=""..."
4,Xiao Yu,Oblivion,"Sep 12 – Oct 24, 2015",Beijing,,https://www.pacegallery.com/exhibitions/xiao-y...,"<html class=""no-js"" lang=""en"" data-whatinput=""..."
5,Claes Oldenburg,New Work,"Sep 18 – Nov 17, 1992",New York,,https://www.pacegallery.com/exhibitions/claes-...,"<html class=""no-js"" lang=""en"" data-whatinput=""..."
6,Lee Ufan,"From Point, From Line, From Wind","Sep 15 – Oct 31, 2015",London,,https://www.pacegallery.com/exhibitions/lee-uf...,"<html class=""no-js"" lang=""en"" data-whatinput=""..."
7,Lee Krasner,Paintings 1959-1962,"Feb 3 – Mar 10, 1979",New York,,https://www.pacegallery.com/exhibitions/lee-kr...,"<html class=""no-js"" lang=""en"" data-whatinput=""..."
8,Yin Xiuzhen,Nowhere to Land,"Jul 20 – Nov 30, 2013",Beijing,,https://www.pacegallery.com/exhibitions/yin-xi...,"<html class=""no-js"" lang=""en"" data-whatinput=""..."
9,Summer '99,,"Jul 9 – Aug 13, 1999",New York,,https://www.pacegallery.com/exhibitions/summer...,"<html class=""no-js"" lang=""en"" data-whatinput=""..."
