# Wikiart Scraper

#### Description

The following script loads the wikiart.org page that lists all of a single artist's works. It uses Selenium (driving Firefox) to scroll down the page slowly so that all of the thumbnails are loaded. Then it downloads every thumbnail image and saves a resized copy of each. Test first to make sure the page loads all the images for a single artist, then set `DO_DOWNLOAD = True` and run.

#### Requirements

- `requests`
- `beautifulsoup4`
- `selenium` and the corresponding Firefox driver
- `pillow`


#### Ethical Scraping

Wikiart makes its contents available under Fair Use (for more copyright information, visit wikiart.org). And since we are only downloading the image thumbnails, we feel that this does not put undue stress on their servers.

In [None]:
import os
import time
from pathlib import Path

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import ElementNotInteractableException

from PIL import Image

In [None]:
# Scraper configuration

# --- URL construction -------------------------------------------------------
# An artist page URL is built as PREFIX + hyphenated artist name + SUFFIX.
WIKIART_URL_PREFIX = 'https://www.wikiart.org/en/'
WIKIART_URL_SUFFIX = '/all-works#!#filterName:all-paintings-chronologically,resultType:masonry'

# --- Page scrolling ---------------------------------------------------------
# Initial scroll Y position, and the amount added to it on every scroll step.
Y_SCROLL_DISTANCE = 500
# Number of scroll steps performed between attempts to click "load more".
N_SCROLLS_PER_LOAD_LENGTH = 4
# Value passed to time.sleep() before every scroll step.
STANDARD_WAIT_TIME = 2
# Scrolling stops once this many "load more" attempts have failed.
N_FAILURES_BEFORE_STOP = 4

# --- Image handling ---------------------------------------------------------
# Number of leading <img> tags on the page that are skipped when downloading.
N_INITIAL_IMAGES_TO_SKIP = 3
# Size passed to Image.resize() when producing the resized copy.
RESIZE_DIMS = (224, 224)
# Root directory under which the output folders are created.
SAVE_PATH = '.'

# Dry-run switch: while False the scrape only prints the image URLs it found.
# Set this to True once it seems like it's working correctly.
DO_DOWNLOAD = False

In [None]:
def construct_wikiart_URL(artist_name):
    """Return the wikiart.org "all works" URL for *artist_name*.

    The name is lower-cased and every space is turned into a hyphen; the
    result is placed between WIKIART_URL_PREFIX and WIKIART_URL_SUFFIX.
    """
    slug = artist_name.lower().replace(' ', '-')
    return ''.join((WIKIART_URL_PREFIX, slug, WIKIART_URL_SUFFIX))

def construct_artist_path(artist_type, artist_name):
    """Return the relative folder path ``<artist_type>/<compact artist name>``.

    The compact name is *artist_name* with all spaces and hyphens removed.
    """
    # One pass that deletes both characters instead of two chained replaces.
    drop_separators = str.maketrans('', '', ' -')
    folder_name = artist_name.translate(drop_separators)
    return os.path.join(artist_type, folder_name)

def makedir_safe(save_dir):
    """Create *save_dir* together with any missing parent directories.

    When ``os.makedirs`` reports that the path already exists, a notice is
    printed and the error is not propagated. Returns None.
    """
    already_present = False
    try:
        os.makedirs(save_dir)
    except FileExistsError:
        already_present = True

    if already_present:
        print(save_dir, 'already exists')

def scroll_down_to_bottom(driver):
    """Scroll the page loaded in *driver* step by step, clicking "load more".

    The loop alternates between N_SCROLLS_PER_LOAD_LENGTH scroll steps (each
    preceded by a STANDARD_WAIT_TIME sleep) and one attempt to click the
    element with class ``load-more-phrase``. It ends once
    N_FAILURES_BEFORE_STOP click attempts have failed. A click attempt fails
    when the element is missing or cannot be interacted with.

    Args:
        driver: a Selenium WebDriver that already has the target page open.

    Returns:
        None.

    NOTE(review): the loop only ends through failed clicks; if the button
    stays clickable indefinitely this never returns -- confirm against the
    site's behaviour.
    """
    # Imported here so this function is self-contained with respect to the
    # file-level import cell; selenium is already a dependency of the file.
    from selenium.common.exceptions import NoSuchElementException

    def scroll_one_load_length(driver, scrollYPos):
        """Perform one batch of scroll steps and return the next Y position."""
        for _ in range(N_SCROLLS_PER_LOAD_LENGTH):
            # Wait before each step so the page has time to load thumbnails.
            time.sleep(STANDARD_WAIT_TIME)
            driver.execute_script("window.scrollTo(0, " + str(scrollYPos) + ")")
            scrollYPos += Y_SCROLL_DISTANCE
        return scrollYPos

    def load_more(driver, failToLoadCount):
        """Try to click "load more"; return the updated failure count."""
        try:
            # find_element_by_class_name() was removed in Selenium 4.3. The
            # generic find_element() with the 'class name' locator strategy
            # (the value of By.CLASS_NAME) works on Selenium 3 and 4.
            driver.find_element('class name', 'load-more-phrase').click()
        except (ElementNotInteractableException, NoSuchElementException):
            # A missing button is treated like a non-clickable one: both are
            # counted as a failed attempt rather than aborting the scrape.
            return failToLoadCount + 1
        return failToLoadCount

    scrollYPos = Y_SCROLL_DISTANCE
    failToLoadCount = 0

    while failToLoadCount < N_FAILURES_BEFORE_STOP:
        scrollYPos = scroll_one_load_length(driver, scrollYPos)
        failToLoadCount = load_more(driver, failToLoadCount)
        print('failToLoadCount', failToLoadCount)

In [None]:
# Artists to scrape, grouped by category. Names are written with plain
# spaces; they are converted to URL and folder names by the helper functions.

abstractArtists = [
    'Frank Stella', 'Ellsworth Kelly', 'Sol LeWitt', 'Piet Mondrian',
    'Sophie Taeuber-Arp', 'Wassily Kandinsky', 'Paul Klee', 'Josef Albers',
    'Kazimir Malevich', 'Hilma af Klint',
]

surrealArtists = [
    'Yves Tanguy', 'Salvador Dali', 'Joan Miro', 'Rene Magritte',
    'Max Ernst', 'Giorgio de Chirico', 'Remedios Varo',
]

# Category name -> list of artist names; the category is used as the
# first component of each artist's output folder.
artists = dict(abstract=abstractArtists, surreal=surrealArtists)

In [None]:
# scrape
#
# For every artist: open their "all works" page, scroll until all thumbnails
# are loaded, collect the <img> sources and (when DO_DOWNLOAD is True) save
# each image plus a copy resized to RESIZE_DIMS.

# Passed to requests.get() as its timeout so a stalled download cannot hang
# the whole run.
DOWNLOAD_TIMEOUT = 30

driver = webdriver.Firefox()

# try/finally guarantees the browser is shut down even if scraping fails
# part-way through.
try:
    driver.maximize_window()

    for artistType, artistList in artists.items():

        for artistName in artistList:

            print('scraping for', artistName)

            subPath = construct_artist_path(artistType, artistName)

            saveDir = Path(SAVE_PATH) / 'scrapedArt' / subPath
            saveDirResize = Path(SAVE_PATH) / 'scrapedArtResize' / subPath

            makedir_safe(saveDir)
            makedir_safe(saveDirResize)

            targetURL = construct_wikiart_URL(artistName)

            driver.get(targetURL)

            scroll_down_to_bottom(driver)

            soup = BeautifulSoup(driver.page_source, "html.parser")
            imageLinks = [tag.get('src') for tag in soup.select('img')]

            for imageURL in imageLinks[N_INITIAL_IMAGES_TO_SKIP:]:

                # <img> tags without a src attribute yield None; skip them
                # instead of failing in os.path.basename().
                if not imageURL:
                    continue

                fileName = os.path.basename(imageURL)
                if not fileName:
                    # URL ends in '/', so there is no file name to save under.
                    continue

                savePath = saveDir / fileName
                savePathResize = saveDirResize / fileName

                if not DO_DOWNLOAD:
                    print('we would have downloaded and resized', imageURL)
                    continue

                try:
                    response = requests.get(imageURL, timeout=DOWNLOAD_TIMEOUT)
                    # Do not save an HTTP error page as if it were an image.
                    response.raise_for_status()
                except requests.RequestException as err:
                    # One bad image should not abort the whole scrape.
                    print('failed to download', imageURL, err)
                    continue

                with open(savePath, 'wb') as handler:
                    handler.write(response.content)

                # Context manager releases the image's file handle.
                with Image.open(savePath) as image:
                    image.resize(RESIZE_DIMS).save(savePathResize)
finally:
    # quit() ends the WebDriver session and closes every window it opened.
    driver.quit()