# Scraping Pace Gallery

Wir scrapen im Folgenden verschiedene Informationen der Pace Gallery.

## Sammle URLs

Wir sammeln in einem ersten Schritt die URLs der Ausstellungen. Dies betrifft sowohl vergangene, aktuelle als auch kommende Ausstellungen.

In [None]:
import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

# Listing pages whose exhibition links are collected: the archive page and
# the main exhibitions page.
sites = [
    "https://www.pacegallery.com/exhibitions/status/archive/",
    "https://www.pacegallery.com/exhibitions/"
]

options = Options()
options.add_argument("--start-maximized")

# One shared Chrome session; `driver` and `wait` stay open because later
# cells keep using them.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
wait = WebDriverWait(driver, 10)
urls_comp = []

pagination_button = (By.CSS_SELECTOR, ".pagination__button")

for url in sites:
    driver.get(url)
    time.sleep(1)

    if "archive" in url:
        # Click the pagination button again and again (presumably it loads
        # further archive entries) until it is no longer clickable.
        while True:
            try:
                button = wait.until(EC.element_to_be_clickable(pagination_button))
                driver.execute_script("arguments[0].scrollIntoView({behavior: 'smooth', block: 'center'});", button)
                time.sleep(1)
                # Look the button up again after scrolling before clicking it.
                button = wait.until(EC.element_to_be_clickable(pagination_button))
                button.click()
                time.sleep(1)
            except TimeoutException:
                # No clickable button within the wait timeout: this is the
                # regular end of the pagination, not an error.
                break
            except WebDriverException as exc:
                # Any other Selenium failure also ends pagination (best
                # effort), but is reported instead of being hidden.
                print(f"Stopping pagination on {url}: {type(exc).__name__}")
                break

    elements = driver.find_elements(By.CSS_SELECTOR, ".archive-list__heading")
    # Read each href once (every get_attribute call is a browser round trip)
    # and drop elements without a link.
    hrefs = (el.get_attribute("href") for el in elements)
    urls_comp.extend(href for href in hrefs if href)

# De-duplicate while keeping first-seen order so reruns yield the same order.
urls_comp = list(dict.fromkeys(urls_comp))

Wir gehen im Folgenden über die URLs und sammeln folgende Variablen:

- `name`: Name des Künstlers
- `titel`: Titel der Ausstellung
- `date`: Datum der Ausstellung
- `location`: Ort der Ausstellung
- `about`: Text über die Ausstellung
- `url`: URL der Ausstellung
- `html`: HTML der Ausstellung

In [None]:
from tqdm import tqdm

def _text_or_na(root, css):
    """Return the stripped text of the first element matching *css* under *root*.

    *root* is the driver or an element to search in. ``pd.NA`` is returned
    when the lookup or text access fails with a Selenium error.
    """
    try:
        return root.find_element(By.CSS_SELECTOR, css).text.strip()
    except WebDriverException:
        return pd.NA


def _date_and_location(driver):
    """Return ``(date, location)`` read from the page's ``h3.type-m-md`` heading.

    The date is the heading's inner HTML before the first ``<br>``, with the
    wrapping spans removed and whitespace collapsed. The location is the text
    of a link inside the heading, falling back to the heading's last text
    line. Values that cannot be read are ``pd.NA``.
    """
    try:
        h3 = driver.find_element(By.CSS_SELECTOR, "h3.type-m-md")
        h3_html = h3.get_attribute("innerHTML")
    except WebDriverException:
        return pd.NA, pd.NA
    if h3_html is None:
        return pd.NA, pd.NA

    first_part = h3_html.split("<br>")[0]
    first_part = first_part.replace('<span class="inline-block whitespace-nowrap">', '').replace('</span>', '')
    date = " ".join(first_part.split())

    location = _text_or_na(h3, "a")
    if location is pd.NA:
        # No usable link in the heading: use the last line of its text.
        try:
            location = h3.text.strip().split("\n")[-1]
        except WebDriverException:
            location = pd.NA
    return date, location


def _expand_about(driver):
    """Best-effort click on the button in the ``#about`` section.

    Presumably the button expands the full text; failures are ignored because
    the text is read afterwards either way.
    """
    try:
        button = driver.find_element(By.CSS_SELECTOR,
            "#about > div > div > div:nth-child(1) > div > button")
        driver.execute_script("arguments[0].scrollIntoView({behavior: 'smooth'});", button)
        time.sleep(0.5)
        button.click()
        time.sleep(0.5)
    except WebDriverException:
        pass


def _scrape_exhibition(driver, url):
    """Load *url* and return one row dict for the exhibitions table.

    The row has the keys ``name``, ``titel``, ``date``, ``location``,
    ``about``, ``url`` and ``html``. Every field that cannot be read is
    ``pd.NA``; a page that fails to load yields a row with only ``url`` set.
    """
    row = {
        "name": pd.NA,
        "titel": pd.NA,
        "date": pd.NA,
        "location": pd.NA,
        "about": pd.NA,
        "url": url,
        "html": pd.NA,
    }

    try:
        driver.get(url)
    except WebDriverException as exc:
        # One unreachable page must not abort the whole run.
        tqdm.write(f"Could not load {url}: {type(exc).__name__}")
        return row
    time.sleep(1.5)

    row["name"] = _text_or_na(driver, "#page-header > div > h1")
    row["titel"] = _text_or_na(driver, "#page-header > div > h2")
    row["date"], row["location"] = _date_and_location(driver)

    _expand_about(driver)
    row["about"] = _text_or_na(driver, "#about > div > div > div:nth-child(1) > div > div")

    try:
        row["html"] = driver.page_source
    except WebDriverException:
        pass

    return row


data = []

# Append row by row (instead of a comprehension) so the rows collected so far
# remain in `data` if the run is interrupted.
for url in tqdm(urls_comp, desc="scraping exhibitions"):
    data.append(_scrape_exhibition(driver, url))

df = pd.DataFrame(data)
print(df)


scraping exhibitions:   7%|▋         | 101/1382 [08:34<1:43:02,  4.83s/it]

In [None]:
# Evaluate `df` as the cell's last expression so the notebook renders the
# scraped exhibitions table as this cell's output.
df

In [None]:
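# NOTE(review): the line below is a raw CSS selector pointing into the
# "#works" section of a page (possibly copied from the browser dev tools).
# Python treats it as a comment only because it starts with "#"; it is not
# used by the code above — confirm whether it is still needed.
#works > div.mx-auto.w-full.max-w-screen-xl-nopad.md\:px-4 > div > div:nth-child(1) > div > div > div > div > figure:nth-child(1) > div > div > div > div > div.w-full.overflow-hidden.max-sm\:aspect-h-1.max-sm\:aspect-w-\[--aspect-ratio\].sm\:flex.sm\:h-full.sm\:max-h-full.sm\:w-auto.sm\:max-w-full.sm\:items-center.sm\:justify-center > div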