## Scraping WikiArt by Style, Movement, and Genre

In [1]:
# import os
# import bs4
# import urllib
# import urllib.request
# from bs4 import BeautifulSoup
# import itertools
# from multiprocessing.dummy import Pool as ThreadPool
# import multiprocessing

In [2]:
import itertools
import multiprocessing
import os
import re
import urllib.parse
import urllib.request
from multiprocessing.dummy import Pool as ThreadPool

import requests
from bs4 import BeautifulSoup

# Style

In [3]:
def get_painting_links_by_style(style_name, max_paintings):
    """
    Collects links to the paintings listed on the WikiArt page of a style.

    @param style_name the style name as it appears in the WikiArt URL, e.g. 'expressionism'
    @param max_paintings the maximum number of painting links to return
    @return a tuple (paintings, count): a list of at most max_paintings site-relative
            painting paths (each starting with '/en/') and the length of that list;
            ([], 0) when the page does not contain the expected listing
    @raise requests.exceptions.HTTPError if the style page cannot be retrieved
    """
    STYLE_URL = 'https://www.wikiart.org/en/paintings-by-style/{style}'.format(style=style_name)
    style_page = requests.get(STYLE_URL)

    # check for request error
    try:
        style_page.raise_for_status()
    except requests.exceptions.HTTPError:
        print("Error trying to retrieve {}".format(style_page.url))
        raise

    soup = BeautifulSoup(style_page.text, 'lxml')

    # soup.find returns None when the div is absent and the attribute may be
    # missing too; treat both as "nothing listed" instead of crashing.
    container = soup.find('div', class_='artworks-by-dictionary')
    init_data = container.get('ng-init') if container is not None else None
    if not init_data:
        print("Style", style_name + ":", 0, "paintings found.")
        return [], 0

    # NOTE(review): the first 10 characters are presumably a fixed prefix of the
    # ng-init expression that precedes the data - confirm against the live page.
    links = init_data[10:]
    links = links.replace("'", "\"")
    links = links.replace("\n", "")

    paintings = re.findall(r"\"paintingUrl\" \: \"(/en/.*?)\", \"artistUrl\"", links)

    # keep at most max_paintings links
    paintings = paintings[:max_paintings]
    print("Style", style_name + ":", len(paintings), "paintings found.")
    return paintings, len(paintings)

# Movement

In [4]:
def get_artist_links_by_movement(movement_name, max_paintings):
    """
    Collects links to artists of a movement until their combined number of artworks
    would exceed max_paintings.

    Artists are taken in page order; collection stops at the first artist whose
    artworks would push the running total above max_paintings.

    @param movement_name the movement name as it appears in the WikiArt URL, e.g. 'impressionism'
    @param max_paintings the maximum combined number of artworks of the returned artists
    @return a tuple (artists, total_paintings): a list of artist hrefs and the sum of
            the artwork counts listed for those artists
    @raise requests.exceptions.HTTPError if the movement page cannot be retrieved
    """
    MOVEMENT_URL = 'https://www.wikiart.org/en/artists-by-art-movement/{movement}/text-list'.format(movement=movement_name)
    movement_page = requests.get(MOVEMENT_URL)

    # check for request error
    try:
        movement_page.raise_for_status()
    except requests.exceptions.HTTPError:
        print("Error trying to retrieve {}".format(movement_page.url))
        raise

    soup = BeautifulSoup(movement_page.text, 'lxml')
    artists = []
    total_paintings = 0

    # soup.find returns None when the listing is absent; treat that as no artists
    listing = soup.find("div", class_="masonry-text-view masonry-text-view-all")
    entries = listing.find_all('li') if listing is not None else []

    for artist in entries:
        # Read the count from the first span mentioning artworks. Matching the
        # digits works for both "1 artwork" and "26 artworks", unlike slicing
        # off a fixed-length suffix.
        num_paintings = None
        for info in artist.find_all('span'):
            if 'artwork' not in info.text:
                continue
            count = re.search(r'\d[\d,]*', info.text)
            if count:
                num_paintings = int(count.group().replace(',', ''))
                break

        anchor = artist.find('a')
        if num_paintings is None or anchor is None:
            # entry without a usable artwork count or link: skip it
            continue

        if total_paintings + num_paintings > max_paintings:
            break
        artists.append(anchor.get('href'))
        total_paintings += num_paintings

    print(movement_name + ":", len(artists), "artists and", total_paintings, "paintings found.")
    return artists, total_paintings

In [5]:
def get_painting_links(artist_link):
    """
    Collects the hrefs of all works listed on an artist's all-works text list.

    @param artist_link the site-relative artist path, e.g. '/en/jules-dupre'
    @return a list with the href of every link found in the painting list rows
    @raise requests.exceptions.HTTPError if the artist page cannot be retrieved
    """
    url = "https://www.wikiart.org{artist_link}/all-works/text-list".format(artist_link=artist_link)
    response = requests.get(url)

    # check for request error
    try:
        response.raise_for_status()
    except requests.exceptions.HTTPError as err:
        print("Error trying to retrieve {}".format(response.url))
        raise err

    parsed = BeautifulSoup(response.text, 'lxml')

    # every link of every row in the painting list, in page order
    rows = parsed.find_all('li', {'class': 'painting-list-text-row'})
    painting_paths = [anchor.get('href') for row in rows for anchor in row.find_all('a')]

    print(len(painting_paths), "images found for", artist_link)
    return painting_paths

In [6]:
def save_painting(directory, painting_link):
    """
    Downloads the image(s) shown on a painting page into a directory.

    Files that already exist in the directory are not downloaded again. Pages or
    images that cannot be retrieved are reported and skipped.

    @param directory the directory in which to save the image; created if missing
    @param painting_link the site-relative path of the painting page, e.g. '/en/artist/title'
    """
    os.makedirs(directory, exist_ok=True)
    PAINTING_URL = 'https://www.wikiart.org{painting_path}'.format(painting_path=painting_link)
    r_painting_page = requests.get(PAINTING_URL)
    if not r_painting_page.ok:
        print("Error trying to retrieve {}".format(r_painting_page.url))
        return

    soup = BeautifulSoup(r_painting_page.text, 'lxml')
    for img in soup.find_all('img', {'class': 'ms-zoom-cursor'}):
        src = img.get('src')
        if not src:
            continue
        # drop everything after '!' (presumably a size/variant suffix - confirm)
        img_url = src.split('!')[0]
        filename = img_url.split('/')[-1]

        outfile = os.path.join(directory, filename)
        if os.path.exists(outfile):
            # already downloaded
            continue

        # Only the URL is passed: a second positional argument to requests.get
        # is `params` and would be appended to the URL as a query string.
        r = requests.get(img_url)
        if not r.ok:
            # do not write an error response to disk as if it were an image
            print("Error trying to retrieve {}".format(r.url))
            continue
        with open(outfile, 'wb') as f:
            f.write(r.content)

In [7]:
def save_movements(movements, data_directory):
    """
    Saves paintings by movement.

    @param movements a list of tuple pairs of movement and maximum number of paintings of that movement
    @param data_directory a directory in which to save each folder of movement paintings in data_directory/movement
    """
    for movement, num in movements:
        movement_directory = os.path.join(data_directory, movement)
        artists, total_paintings = get_artist_links_by_movement(movement, num)
        if not artists:
            print("No artists found.")
            # skip only this movement; the remaining movements are still processed
            continue
        for a in artists:
            paintings = get_painting_links(a)
            if not paintings:
                print("No paintings found.")
                # skip only this artist; the remaining artists are still processed
                continue
            print("Saving...")
            for p in paintings:
                save_painting(movement_directory, p)
        print("Finished saving", total_paintings, movement, "paintings.")
    print("Finished saving all paintings.")

In [15]:
def save_styles(styles, data_directory):
    """
    Saves paintings by style.
    Currently can only save max 60 featured paintings per style due to the website.
    TODO: load all paintings before saving

    @param styles a list of tuple pairs of style and maximum number of paintings of that style
    @param data_directory a directory in which to save each folder of style paintings in data_directory/style
    """
    for style, num in styles:
        style_directory = os.path.join(data_directory, style)
        paintings, total_paintings = get_painting_links_by_style(style, num)
        if not paintings:
            print("No paintings found.")
            # skip only this style; the remaining styles are still processed
            continue
        print("Saving...")
        for p in paintings:
            save_painting(style_directory, p)
        print("Finished saving", total_paintings, style, "paintings.")
    print("Finished saving all paintings.")

In [12]:
def main(query, data_directory, by_movement = True):
    """
    Creates the data directory if needed and saves the queried paintings into it.

    @param query a list of tuple pairs of name and maximum number of paintings
    @param data_directory the directory in which the paintings are saved
    @param by_movement if true the names in query are movements, otherwise styles
    """
    directory_missing = not os.path.exists(data_directory)
    if directory_missing:
        os.makedirs(data_directory)

    # choose the saver matching the kind of query
    saver = save_movements if by_movement else save_styles
    saver(query, data_directory)

In [13]:
# Save impressionism paintings (at most 40 in total) under ../data
DATA_DIR = "../data"
movements = [('impressionism', 40)]
main(movements, DATA_DIR, by_movement=True)

impressionism: 1 artists and 26 paintings found.
26 images found for /en/jules-dupre
Saving...
Finished saving 26 impressionism paintings.
Finished saving all paintings.


In [16]:
# Save expressionism paintings (at most 40) under ../data
DATA_DIR = "../data"
styles = [('expressionism', 40)]
main(styles, DATA_DIR, by_movement=False)

Style expressionism: 40 paintings found.
Saving...
Finished saving 40 expressionism paintings.
Finished saving all paintings.


In [None]:
# Access the HTML of the page for a given movement and page number (both are used to build the URL),
# find the URLs of all images hosted on that page (using the page layout as of June 2017),
# and return a list of all painting URLs
def soupit(j, movement):
    """
    Collects the image URLs found on one page of a movement listing.

    The page text is split on whitespace. Every token after the first 'data'
    token that contains 'https' is taken as an image URL, until a token
    containing '}];' is reached.

    @param j the page number appended to the URL
    @param movement the name appended to the artists-by-art-movement URL
    @return a list of image URLs rewritten to start with 'http'; an empty list
            if the page could not be retrieved or parsed
    """
    try:
        url = "https://www.wikiart.org/en/artists-by-art-movement/" + movement + "/" + str(j)
        with urllib.request.urlopen(url) as html:
            soup = BeautifulSoup(html, 'lxml')
        urls = []
        found = False
        for token in str(soup.find_all()).split():
            # The flag must stay set once 'data' has been seen; recomputing it
            # for every token meant no URL could ever be collected.
            if token == 'data':
                found = True
            if not found:
                continue
            if '}];' in token:
                break
            if 'https' in token:
                # drop the leading 6 and trailing 2 characters of the token
                # (presumably '"https' and '",') and prefix 'http'
                urls.append("http" + token[6:-2])
        return urls
    except Exception as e:
        print('Failed to find the following movment page combo: ' + movement + str(j), str(e))
        return []

In [None]:
#Given a url for an image, we download and save the image while also recovering information about the painting in the saved name depending on the length of the file.split('/') information (which corresponds to how much information is available)
def dwnld(web,genre):
    """
    Downloads one image into the genre folder, naming it after its URL path.

    The save name depends on how many '/'-separated parts the URL has:
    5 parts give genre/<last part without extension>.jpg, 6 or 7 parts give
    genre/<second to last part>+<last part without extension>.jpg, and any
    other count gives an empty save name.

    @param web a pair (index, url) of the running number and the URL of the image
    @param genre the folder in which the image is saved
    """
    index, url = web
    segments = url.split('/')
    stem = segments[-1].split('.')[0]
    depth = len(segments)

    if depth == 5:
        savename = genre + "/" + stem + ".jpg"
    elif depth in (6, 7):
        savename = genre + "/" + segments[-2] + "+" + stem + ".jpg"
    else:
        savename = ''

    print(genre + str(index))
    # A failure here is probably caused by a nonstandard unicode character in the
    # name of the painting, so the path is percent-quoted and the download retried.
    try:
        urllib.request.urlretrieve(url, savename)
    except Exception:
        pieces = urllib.parse.urlsplit(url)
        pieces = pieces._replace(path=urllib.parse.quote(pieces.path))
        url = urllib.parse.urlunsplit(pieces)
        try:
            urllib.request.urlretrieve(url, savename)
            print('Suceeded on second try for '+ url)
        except Exception:
            print('We failed on the second try for ' + url)
In [None]:
#We can run both the url retrieving code and the image downloading code in parallel, and we set up the logic for that here
def for_genre(genre,num):
    """
    Collects the image URLs of pages 1 to num - 1 of a genre and downloads them,
    running both steps on a pool of threads.

    @param genre the name passed to soupit for the page URLs and to dwnld as save folder
    @param num the exclusive upper bound of the page numbers to visit
    """
    # at least one worker, also on a single-core machine where cpu_count() - 1 is 0
    workers = max(1, multiprocessing.cpu_count() - 1)

    # starmap blocks until every page has been processed
    with ThreadPool(workers) as pool:
        results = pool.starmap(soupit, zip(range(1, num), itertools.repeat(genre)))

    # flatten the per-page results, skipping pages that yielded nothing
    new_results = [url for page in results if page for url in page]

    with ThreadPool(workers) as pool:
        pool.starmap(dwnld, zip(enumerate(new_results), itertools.repeat(genre)))

In [1]:
# A list of genres hosted on wikiart.org together with the number of pages to pull images from;
# the numbers were set by manual inspection and only approximate how many pages each genre contains
# (genre name, page count) pairs passed to for_genre below
genres = [
    ('portrait', 250),
    ('landscape', 250),
    ('genre-painting', 250),
    ('abstract', 250),
    ('religious-painting', 140),
    ('cityscape', 110),
    ('figurative', 75),
    ('still-life', 50),
    ('symbolic-painting', 50),
    ('nude-painting-nu', 50),
    ('mythological-painting', 35),
    ('marina', 30),
    ('flower-painting', 30),
    ('animal-painting', 30),
]

# create one folder per genre in the working directory, then fill it
for genre_name, page_count in genres:
    if not os.path.exists("./" + genre_name):
        os.mkdir(genre_name)
    for_genre(genre_name, page_count)

NameError: name 'genres' is not defined