# Scraping Gagosian

Wir scrapen im Folgenden verschiedene Informationen der Galerie Gagosian.

## Sammle URLs

Wir sammeln in einem ersten Schritt die URLs der Ausstellungen. Dies betrifft sowohl vergangene und aktuelle als auch kommende Ausstellungen.

In [1]:
import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import (
    NoSuchElementException,
    TimeoutException,
    WebDriverException,
)
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

# Listing pages to harvest: current, upcoming and archived exhibitions.
sites = [
    "https://gagosian.com/exhibitions/",
    "https://gagosian.com/exhibitions/upcoming/",
    "https://gagosian.com/exhibitions/archive/?as_list",
]

options = Options()
options.add_argument("--start-maximized")

# The driver is not closed in this cell because the detail-page scraping
# further down reuses the same browser session.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
wait = WebDriverWait(driver, 10)
urls_comp = []

for url in sites:
    driver.get(url)
    time.sleep(1)

    if "archive" in url:
        # Scroll to the bottom and click the button inside #exhibitions until
        # it stops becoming clickable (presumably a "load more" control --
        # verify against the live page).
        while True:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(1)
            try:
                button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#exhibitions button")))
            except TimeoutException:
                # No clickable button within the wait window: stop paging.
                break
            try:
                button.click()
            except WebDriverException:
                # Click failed (e.g. intercepted or stale element): stop paging
                # and keep whatever has been loaded so far.
                break
            time.sleep(1)

    elements = driver.find_elements(By.CSS_SELECTOR, "#exhibitions section > div > a")
    # Read each href once; every get_attribute call is a round-trip to the browser.
    hrefs = (el.get_attribute("href") for el in elements)
    urls = [href for href in hrefs if href]
    urls_comp.extend(urls)

# Drop duplicates while keeping first-seen order so reruns yield a stable list.
urls_comp = list(dict.fromkeys(urls_comp))

Wir gehen im Folgenden die URLs durch und sammeln folgende Variablen:

- `name`: Name des Künstlers
- `titel`: Titel der Ausstellung
- `date`: Datum der Ausstellung
- `location`: Ort der Ausstellung
- `about`: Text über die Ausstellung
- `url`: URL der Ausstellung
- `html`: HTML der Ausstellung

In [2]:
from tqdm import tqdm

def _text_or_na(driver, selector):
    """Return the stripped text of the first element matching *selector*, or pd.NA.

    Any Selenium error (element missing, stale, ...) yields pd.NA so that one
    absent field does not abort the whole scrape.
    """
    try:
        return driver.find_element(By.CSS_SELECTOR, selector).text.strip()
    except WebDriverException:
        return pd.NA


# Selector prefix shared by the button and the text container in the #about section.
_ABOUT_BASE = "#about > div > div > div:nth-child(1) > div"

data = []

for url in tqdm(urls_comp, desc="scraping exhibitions"):
    driver.get(url)
    time.sleep(1.5)

    # Fields stay pd.NA unless the corresponding element is found.
    date = location = html = pd.NA

    artist = _text_or_na(driver, "#page-header > div > h1")
    title = _text_or_na(driver, "#page-header > div > h2")

    try:
        h3 = driver.find_element(By.CSS_SELECTOR, "h3.type-m-md")
        h3_html = h3.get_attribute("innerHTML")
        if h3_html is not None:
            # The date is the markup before the first <br>; strip the wrapping
            # span tags and collapse whitespace.
            date_parts = h3_html.split("<br>")[0]
            date_parts = date_parts.replace('<span class="inline-block whitespace-nowrap">', '')
            date_parts = date_parts.replace('</span>', '')
            date = " ".join(date_parts.split())
            try:
                location = h3.find_element(By.CSS_SELECTOR, "a").text.strip()
            except NoSuchElementException:
                # No link inside the heading: fall back to its last text line.
                location = h3.text.strip().split("\n")[-1]
    except WebDriverException:
        pass  # heading missing or unreadable: keep whatever was set so far

    try:
        # Click the button in the about section before reading its text
        # (presumably expands the full description -- verify against the page).
        button = driver.find_element(By.CSS_SELECTOR, _ABOUT_BASE + " > button")
        driver.execute_script("arguments[0].scrollIntoView({behavior: 'smooth'});", button)
        time.sleep(0.5)
        button.click()
        time.sleep(0.5)
    except WebDriverException:
        pass  # no button or click failed: read the text as-is

    about_text = _text_or_na(driver, _ABOUT_BASE + " > div")

    try:
        html = driver.page_source
    except WebDriverException:
        pass

    data.append({
        "name": artist,
        "titel": title,
        "date": date,
        "location": location,
        "about": about_text,
        "url": url,
        "html": html,
    })

df = pd.DataFrame(data)
print(df)


scraping exhibitions: 100%|██████████| 1382/1382 [1:55:04<00:00,  5.00s/it] 

                                      name  \
0     DESERT PAINTERS OF AUSTRALIA PART II   
1          ALBERTO GIACOMETTI | YVES KLEIN   
2                               ED PASCHKE   
3                              JOHN CURRIN   
4                             BRIAN CLARKE   
...                                    ...   
1377                            JONAS WOOD   
1378                              INSIGHT?   
1379                      GREGORY CREWDSON   
1380                             ED RUSCHA   
1381                            CY TWOMBLY   

                                                  titel  \
0     With Works from the Collection of Steve Martin...   
1                             In Search of the Absolute   
2                                                  <NA>   
3                                              Memorial   
4                                                Lamina   
...                                                 ...   
1377                              




In [None]:
import os

# Switch to the repository folder so the relative output path below resolves there.
# NOTE(review): this absolute path is machine-specific; adjust it when running elsewhere.
os.chdir(r"c:\Users\Hueck\OneDrive\Dokumente\GitHub\analyse_art_gallerys")
print(os.getcwd())
# to_csv does not create missing parent directories, so make sure "data" exists.
os.makedirs("data", exist_ok=True)
# "utf-8-sig" prepends a byte-order mark to the UTF-8 output.
df.to_csv("data/gagosian.csv", index=False, encoding="utf-8-sig")




c:\Users\Hueck\OneDrive\Dokumente\GitHub\analyse_art_gallerys
