In [1]:
from bs4 import BeautifulSoup, Comment
from itertools import compress
from joblib import Parallel, delayed
from ethnicolr import census_ln, pred_census_ln, pred_wiki_name
import requests
import json
import re
import pandas as pd
import numpy as np
import time
import math
import util
%matplotlib inline

Using TensorFlow backend.


# MoMA exhibits & Artists Exhibited at the MoMA
This script is used to build the database of exhibitions and the database of artists exhibited.

## Part 1 - Web scraping
As of 4/6/18, the MoMA has had 4968 exhibits.

This script will create two dataframes and output them as files in the path 'data/':
- data/artist.pkl
- data/exhibition.pkl

The exhibition dataset is a set of all exhibitions that have been hosted by the The Museum of Modern Art, MoMA PS1, or moma.org. Exhibitions can be scraped from urls such as: [https://www.moma.org/calendar/exhibitions/100](https://www.moma.org/calendar/exhibitions/100), where 100 represents the id of some exhibit.

The artist dataset is created by compiling all artists across all exhibits and removing any duplication. This set of artists will be different from the dataset of artists in their [collection](https://www.moma.org/collection/) since the MoMA does not have to collect an artist's work in order to have shown them. For each exhibit, i, we scrape data from a url : https://www.moma.org/artists?exhibition_id=i


## Part 2 - Getting Race and Gender
The second part of this notebook uses the data that was scraped and assigns a race and gender to each artist.

### Race Data
To get an artist's race, I use a two fold method:

__Part One__:

First I use the python library: [__ethnicolr__](https://github.com/appeler/ethnicolr), that matches names to race. This python library provides several bi-char (Smith ==> sm, mi, it, th) deep learning models that use an LSTM architecture. The specific model I chose is based on wikipedia data as it uses the most international dataset to train the model. It has a model performance of 80% accuracy and 83% recall.

__Part Two__:

For the second part, I try to increase the accuraccy specifically on American artists using the data from the [US-Census](https://api.census.gov/data/2010/surname.html).

To take a conservative stance, I only reassign artists whose race is predicted to be 'white' from _ethnicolr_. If _ethnicolr_ predicts a non-white race, I keep the race assignment as is. This means that I will end up with an under-estimation of white artists, and an over estimation of non-white artists. 

The first part is to figure out what part of the artist_name string is the lastname. To do this, I start from the last word of the artist_name, and iteratively check whether or not the word has a match in the lastname_race_df.

For example, if we get the name "Millie Bobby Brown", 
1. I will start by checking whether or not 'Brown' maps to some name in the lastname_race_df 
2. if so, I will assign the artist with a race, otherwise, check to see whether or not 'Bobby' maps to some name in the lastname_race_df
3. if so, I will assign the artist with a race, otherwise, check to see whether or not 'Millie' maps to some name in the lastname_race_df
4. if so, I will assign the artist with a race, otherwise, we keep the race prediction of _ethnicolr_

The race assignment is done by randomly sampling from probabilities provided in the US Census dataset. This will mean that on each run, there is a chance that the race assigned to each artist will be different.

For example, if we were to assign the lastname "Brown" to a race, we will start by looking for the probability distribution of races. We then randomly sample from this distribution to get our prediction:


### Gender Data
To get an artist's gender, I used the web service: [__genderize.io__](https://www.genderize.io). This service simply takes in a name and spits out a gender, and the probability of its accuracy. 

# Code

Some global variables:

In [2]:
# There have been 4968 exhibitions archived on the MoMA's website as of 4/7/2018
total_exhibitions = 4968

# print progress and errors when scaping data
print_progress_and_errors = False

# scrape a small set of sample data instead of the full data set that would take several hours
pull_sample_data = True

Some helper functions for parsing through scraped strings

In [3]:
def _strip_html(text):
    return re.sub('<[^<]+?>', '', text).strip()

def _strip_non_numbers(text):
    return re.sub("[^0-9]", "", text).strip()

`_parse_exhibit` is used to parse through each exhibit on the MoMA's website. For each exhibit, the function will find key information about the exhibit. This function will also call on the `_parse_exhibit_artists` which will get the list of all artists who participated in the exhibit.

In [4]:
def _parse_exhibit(exhibit_index, print_progress=True):
    """
    Get the following attributes of the exhibit:
    - name of exhibit
    - date text of exhibit
    - year of exhibit
    - which museum (MoMA, MoMA PS1, online...)
    - press release text
    - list of artists (indexed on artist full name)
    """

    if print_progress == True:
        if (exhibit_index % 10) == 0:
            print(str(exhibit_index) + ', ', end='')
    
    try:
        page = requests.get("https://www.moma.org/calendar/exhibitions/%s"%(exhibit_index))
        soup = BeautifulSoup(page.content, 'html.parser')
        
        # get exhibit title
        title = _strip_html(str(soup.find('h1', {'class': 'page-header__title'})))
        
        # get exhibit year and date_full_text
        date_full_text = _strip_html(str(soup.find('h2', {'class': 'page-header__subheading--narrow'})))
        year = ''
        r = re.findall('.*([1-3][0-9]{3})', date_full_text)
        if r:
            year = str(r.pop())
        else:
            year = '-1'
        
        # get which museum (MoMA, PS1, Online...)
        museum = _strip_html(str(soup.find('p', {'class': 'calendar-tile__location--title center'})))
        
        # get press release
        press_release_container = soup.find('div', {'class': 'container-uneven--2 body-copy--simple'})
        press_release = " ".join([str(text) for text in press_release_container.find_all('p')])
    
        # get artists with _parse_exhibit_artists
        exhibit_artists_dict = _parse_exhibit_artists(exhibit_index)
        artists = ", ".join(list(exhibit_artists_dict['artist_name']))

        return title, year, date_full_text, museum, press_release, artists, exhibit_artists_dict
    
    except Exception:
        if print_progress == True:
            print("Error[%i], "%exhibit_index, end='')
        pass

The `_parse_exhibit_artists` function will parse through each artist in a particular exhibit, and retreive specific information for each artist.

In [5]:
def _parse_exhibit_artists(exhibit_index):
    """
    Get the follow attributes of the artists in an exhibit:
    - name of artist
    - associated gender & ethnicity
    - nationality
    - number of exhibitions
    - number of "works online"
    """
    page = requests.get("https://www.moma.org/artists?exhibition_id=%s"%(exhibit_index))
    soup = BeautifulSoup(page.content, 'html.parser')
    artist_tiles = soup.find("div", {"class": "tile-container"})
    
    try:
        artist_names = artist_tiles.findAll("div", {"class": "caption--artist__name center balance-text"})
        artist_nationalities_and_dates = artist_tiles.findAll("div", {"class": "caption--artist__date center balance-text"})
        artist_exhibitions_and_work_online = artist_tiles.findAll("div", {"class": "caption--artist__count center"})

        name_arr, nationality_arr, exhibitions_arr, work_online_arr = ([] for i in range(4))
        for i in range(len(artist_names)):
            # get name
            name = _strip_html(str(artist_names[i]))

            # get nationality if available, None otherwise
            nationality_raw = _strip_html(str(artist_nationalities_and_dates[i])).split(', ')
            nationality_filter = [x.isalnum() for x in nationality_raw]
            nationality_list = list(compress(nationality_raw, nationality_filter))
            nationality = nationality_list[0] if len(nationality_list) > 0 else ""

            exhibitions_and_work_online_raw = _strip_html(str(artist_exhibitions_and_work_online[i])).split(', ')

            exhibitions_filter = ["exhibition" in x for x in exhibitions_and_work_online_raw]
            exhibitions_list = list(compress(exhibitions_and_work_online_raw, nationality_filter))
            exhibitions = _strip_non_numbers(exhibitions_list[0]) if len(exhibitions_list) > 0 else 0

            work_online_filter = ["online" in x for x in exhibitions_and_work_online_raw]
            work_online_list = list(compress(exhibitions_and_work_online_raw, work_online_filter))
            work_online = _strip_non_numbers(work_online_list[0]) if len(work_online_list) > 0 else 0

            name_arr.append(name)
            nationality_arr.append(nationality)
            exhibitions_arr.append(exhibitions)
            work_online_arr.append(work_online)

        return {
            'artist_name': name_arr,
            'nationality': nationality_arr,
            'exhibitions': exhibitions_arr,
            'work_online': work_online_arr
        }
    
    except Exception:
        pass

This function will loop through exhibit indexes 1 to `total_exhibitions`, running the `_parse_exhibit` and `_parse_exhibit_artists` for each index. 

WARNING: This function is extremely costly, taking up to 3-4 hours to execute (on a single thread).

In [31]:
""" COMPUTATIONALLY EXPENSIVE """
# build exhibit dataframe
start_time = time.time()

# using joblib [not fully tested]
# exhibits = Parallel(n_jobs=2)(delayed(_parse_exhibit)(j+1) for j in range(50))

if pull_sample_data:
    exhibits = [_parse_exhibit(j+1, print_progress_and_errors) for j in range(100, 110)]
else:
    exhibits = [_parse_exhibit(j+1, print_progress_and_errors) for j in range(total_exhibitions - 1)]

exhibits = [exhibit for exhibit in exhibits if exhibit is not None]

titles = [exhibit[0] for exhibit in exhibits]
years = [exhibit[1] for exhibit in exhibits]
date_full_text = [exhibit[2] for exhibit in exhibits]
museum = [exhibit[3] for exhibit in exhibits]
press_release = [exhibit[4] for exhibit in exhibits]
artists = [exhibit[5] for exhibit in exhibits]
artist_dict = [exhibit[6] for exhibit in exhibits]

exhibition_df = pd.DataFrame(data={
    'exhibition_title': pd.Series(titles, dtype=str),
    'year': pd.Series(years, dtype=int),
    'date_full_text': pd.Series(date_full_text, dtype=str),
    'artists': pd.Series(artists, dtype=str),
    'museum': pd.Series(museum, dtype=str),
    'press_release': pd.Series(press_release, dtype=str)
})

artist_df = pd.DataFrame()
for d in artist_dict:
    artist_df = artist_df.append(pd.DataFrame(d))
artist_df = artist_df.reset_index().drop("index", axis=1)
    
print()
print("--- %s seconds ---" % (time.time() - start_time))


--- 17.17165780067444 seconds ---


Clean up `artist_df` and assign a race & gender with probabiliy for each artist.

In [33]:
# split artist name into first_name and last_name
# if last name is not there, use first name as last name
artist_df['first_name'] = artist_df['artist_name'].apply(lambda x: x.split(' ', 1)[0])
artist_df['last_name'] = artist_df['artist_name'].apply(lambda x: x.split(' ', 1)[1] if len(x.split(' ', 1)) > 1 else x.split(' ', 1)[0])

# use pred_wiki_ln
artist_df = pred_wiki_name(artist_df, lname_col="last_name", fname_col="first_name")

# drop pcts and rename race to ethnicity
artist_df = artist_df.drop(list(artist_df.columns)[-13:], axis=1)
artist_df.rename(index=str, columns={"race": "ethnicity"}, inplace=True)

def generalize_race(text):
    if "EastAsian" in text:
        return "asian"
    elif "Indian" in text:
        return "indian"
    elif "African" in text:
        return "black"
    elif "Hispanic" in text:
        return "hispanic"
    else:
        return "white"
    
artist_df['race'] = artist_df['ethnicity'].apply(generalize_race)

In [34]:
artist_df.tail(10)

Unnamed: 0,artist_name,exhibitions,nationality,work_online,first_name,last_name,ethnicity,race
237,Karl Schmidt-Rottluff,44,German,68,Karl,Schmidt-Rottluff,"GreaterEuropean,WestEuropean,Germanic",white
238,Richard Serra,58,American,64,Richard,Serra,"GreaterEuropean,British",white
239,David Smith,59,American,21,David,Smith,"GreaterEuropean,British",white
240,Nancy Spero,15,American,13,Nancy,Spero,"GreaterEuropean,WestEuropean,Hispanic",hispanic
241,Frank Stella,72,American,156,Frank,Stella,"GreaterEuropean,British",white
242,Antoni Tàpies,28,Spanish,33,Antoni,Tàpies,"GreaterEuropean,British",white
243,Édouard Vuillard,59,French,39,Édouard,Vuillard,"GreaterEuropean,WestEuropean,French",white
244,Andy Warhol,132,American,244,Andy,Warhol,"GreaterEuropean,British",white
245,Terry Winters,26,American,132,Terry,Winters,"GreaterEuropean,British",white
246,Frank Gohlke,13,American,5,Frank,Gohlke,"GreaterEuropean,British",white


In [9]:
""" COMPUTATIONALLY EXPENSIVE ($)"""
# this operation requires using the genderize.io end point - which costs money...
def get_genders(names):
    url = ""
    cnt = 0
    if not isinstance(names,list):
        names = [names,]

    for name in names:
        # scrub name
        if str(name) == None or str(name) == 'nan':
            name = "Unknown"
        else:
            name = re.sub("[\(\[].*?[\)\]]", "", name).strip()
            name = re.sub('[^A-Za-z0-9]+', ' ', name).replace("&amp;", "")
        
        if url == "":
            url = "name[0]=" + name
        else:
            cnt += 1
            url = url + "&name[" + str(cnt) + "]=" + name

    key = "7dc4df2d75de0a9624773093c6717b50"
    req = requests.get("https://api.genderize.io/?" + url + "&apikey="+key)
    results = json.loads(req.text)
    
    retrn = []
    for result in results:
        if result["gender"] is not None:
            retrn.append((result["gender"], result["probability"]))
        else:
            retrn.append((u'None',u'0.0'))
    return retrn    


# batch calls to get_genders() into name groupings of len=10 (to minimize times it needs to hit the endpoint)
l = len(artist_df)
remainder = l % 10
artist_genders = []
for i in range(1, math.floor(l/10) + 1):
    next_10_artists = artist_df.iloc[int(i*10 - 10):int(i*10), 4] # column index 4 is first_name
    artist_genders.extend(get_genders(list(next_10_artists)))
    if i == math.floor(l/10):
        remaining_artist = artist_df.iloc[int(i*10):int(i*10 + remainder + 1), 4] # column index 4 is first_name
        artist_genders.extend(get_genders(list(remaining_artist)))
        
# save genders/gender_prob as separate dataframe
gender_df = pd.DataFrame(artist_genders, columns=["gender", "gender_prob"])

# attach genders dataframe to artist_df
artist_df = pd.concat([artist_df, gender_df], axis=1)

  result = result.union(other)


In [12]:
artist_df = artist_df.drop_duplicates()

In [13]:
artist_df = artist_df.drop(['gender_prob', 'first_name', 'last_name'], axis=1)

In [29]:
lastname_race_df = util.get_race_dist_of_lastname()
lastname_race_df.tail()

Unnamed: 0,lastname,white,asian,mix,aian,black,hispanic
98909,erdrich,0.74,0.0,0.05,0.12,0.0,0.09
98910,egues,0.06,0.0,0.0,0.0,0.0,0.93
98911,dotan,0.88,0.0,0.0,0.0,0.0,0.08
98912,dionizio,0.92,0.0,0.0,0.0,0.0,0.07
98913,donlea,0.94,0.0,0.0,0.0,0.0,0.06


In [15]:
# filter on american artists only
american_artist_df = artist_df[artist_df['nationality'] == 'American']
non_american_artist_df = artist_df[artist_df['nationality'] != 'American']

In [18]:
""" COMPUTATIONALLY EXPENSIVE """
# create mapping with `lastname_race_df` to find assign a race to each artists
american_artist_df['race'] = american_artist_df.apply(lambda row: pd.Series(util.get_race_from_full_name(row['artist_name'], row['race'], lastname_race_df)), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [19]:
artist_df = american_artist_df.append(non_american_artist_df)

In [20]:
artist_df['exhibitions'] = artist_df['exhibitions'].apply(pd.to_numeric, errors='ignore')

In [21]:
artist_df = artist_df.sort_values(by='exhibitions', ascending=False).reset_index().drop(['index'], axis=1)

In [36]:
def check_race_and_gender(row):
    """
    manual checked american & non-american artists of years: 1957, 1977, 1997 & 2017
    """
    
    white_male_list = [\
        "sol lewitt", "ellsworth kelly", "edward ruscha", "john marin", "philip guston", "jonathan borofsky", \
        "william brice", "robert morris", "ben shahn", "e. mcknight kauffer", "robert andrew parker", "jan müller", \
        "jules pascin", "william t. wiley", "mark rothko", "varujan boghosian", "raoul hague", "joseph glasco", \
        "robert wilson", "garry winogrand", "albert alcalay", "gandy brodie", "frank lloyd wright", "raimund abraham", \
        "morris graves", "christopher wool", "merce cunningham", "louis michel eilshemius", "robert brownjohn", \
        "woody vasulka", "sam francis","robert indiana", "william wegman", "gordon matta-clark", "mel bochner", \
        "willem de kooning", "milton avery", "felix gonzalez-torres", "douglas huebler", "r. buckminster fuller", \
        "tom wesselmann", "terry allen", "william bailey", "robert mapplethorpe", "richard foreman", "rafael ferrer", \
        "pirkle jones", "philippe halsman", "peter campus", "charles atlas", "douglas davis", "caldecot chubb", \
        "ben schonzeit", "jared bark", "jerry uelsmann", "abraham walkowitz", "peter young", "roger brown", \
        "tim rollins", "leon polk smith", "david park", "charles fahlen", "barry le va", "ian (hugh guiler) hugo", \
        "king vidor", "walter lang", "walter burley griffin", "val telberg", "paul taylor", "r. crumb", "remy charlip", \
        "robert breer", "bruce graham", "busby berkeley", "charles gwathmey", "charles j. brabin", "david levinthal", \
        "albert herbert", "allan mccollum", "eugene masselink", "george cukor", "willy mucha", "théo van rysselberghe", \
        "théophile-alexandre steinlen", "rené magritte", "hans namuth", "tom wesselmann", "mark di suvero", \
        "robert watts", "abraham walkowitz", "russell lee", "robert capa", "barry le va", "jerry uelsmann", \
        "george nelson", "joel meyerowitz", "william lescaze", "francis bruguière", "philip evergood", "jim shaw", \
        "george him", "günther förg", "rené robert bouché", "esteban vicente", "peter grippe", "mark grotjahn", \
        "john hejduk", "cornell capa", "robert gwathmey", "robert heinecken", "ed emshwiller", "donald sultan", \
        "gregory amenoff", "nathan george horwitt", "maurice sterne", "jean charlot", "timothy o'sullivan", \
        "alton pickens", "louis faurer", "richard neutra", "louis lozowick", "alfred leslie", "morris louis", \
        "kim jones", "carleton e. watkins", "william vandivert", "jules olitski", "louis schanker", \
        "walter dorwin teague", "edward kienholz", "nathan lyons", "walter robinson", "larry poons", "john steuart curry", \
        "alphonse mucha", "robert laurent", "bill beckley", "alfred eisenstaedt", "harwell hamilton harris", \
        "thomas wilfred", "ron davis", "geoffrey hendricks", "alain kirili"]
    
    white_female_list = [\
        "helen frankenthaler", "lee bontecou", "yvonne rainer", "imogen cunningham", "charmion von wiegand", "angelo testa" \
        "louise nevelson", "adrian piper", "elizabeth murray", "mona hatoum", "lee krasner", "lois long", \
        "claire (claire mahl) moore", "alexis smith", "trisha brown", "lucinda childs", "margaret c. anderson", \
        "vera (vera neumann)", "susan weil", "noémi raymond", "pat passlof", "jan groover", "elaine de kooning", \
        "nancy holt", "susan weil", "wanda gág", "elizabeth peyton", "judith joy ross", "louise dahl-wolfe", \
        "jane dickson"]
    
    asian_female_list = [\
        "elizabeth mcfadden", "tomiyo sasaki"]
    
    asian_male_list = [\
        "shusaku arakawa", "lee ufan", "wifredo lam", "ai weiwei", "chinn yuen-yuei", "thomas han", \
        "jenova (xinghan) chen", "eikoh hosoe", "ken domon", "lee jong-ok"]
    
    black_male_list = [\
        "david hammons", "kingelez", "raymond saunders", "melvin edwards", "sam gilliam", "Cameron Rowland", \
        "terry adkins", "jacob lawrence", "gordon parks"]
    
    black_female_list = [\
        "minnie evans", "kara walker", "xaviera simmons", "carrie mae weems", "alma woodsey thomas", \
        "barbara chase-riboud"]
    
    hispanic_female_list = [\
        "andrea bowers", "andrea fraser"]
    
    hispanic_male_list = [\
        "rufino tamayo"]
    
    none_list = [\
        "velox ward", "dudley huppler", "schilli maier", "maxi cohen", "richard w. landis", "skip blumberg", \
        "joel fisher", "eve sonneman", "mia ferrara", "john h. lickert", "william c. gannett", "robert p. gottlieb", \
        "orlando giannini", "daniel larossa", "alfred w. fielding", "arthur a. aykanian", "don weinreich", \
        "janet stein", "eliza montgomery", "elizabeth mock", "roland baladi"]
    
    artist_name = str(row['artist_name'].lower())
    if "nknown" in artist_name \
        or "nonymous" in artist_name \
        or "rtist" in artist_name \
        or "ystem" in artist_name \
        or "group" in artist_name \
        or "corp" in artist_name \
        or ", " in artist_name \
        or "ssociates" in artist_name \
        or "tudio" in artist_name \
        or "esearch" in artist_name \
        or "imension" in artist_name \
        or "skidmore" in artist_name \
        or "rchitect" in artist_name \
        or artist_name in none_list:
        return None, None
    
    if artist_name in white_male_list:
        return "white", "male"
    
    if artist_name in white_female_list:
        return "white", "female"
    
    if artist_name in asian_female_list:
        return "asian", "female"
    
    if artist_name in asian_male_list:
        return "asian", "male"
    
    if artist_name in black_male_list:
        return "black", "male"
    
    if artist_name in black_female_list:
        return "black", "female"
    
    if artist_name in hispanic_female_list:
        return "hispanic", "female"
    
    if artist_name in hispanic_male_list:
        return "hispanic", "male"
    
    return row['race'], row['gender']

artist_df[['race', 'gender']] = artist_df.apply(lambda x: pd.Series(check_race_and_gender(x)), axis=1)

Check that dataframes look as expected.

In [None]:
print(artist_df.shape)
artist_df.head()

In [None]:
print(exhibition_df.shape)
exhibition_df.head()

Save dataframes as pickled files

In [None]:
# artist_df.to_pickle('data/artist_%s.pkl' % str(time.strftime("%m%d%Y")))
# exhibition_df.to_pickle('data/exhibition_%s.pkl' % str(time.strftime("%m%d%Y")))