# Art Institute of Chicago Data Cleaning

Relevant imports:

In [1]:
import numpy as np
import pandas as pd
import json
import os
import glob
from geopy.geocoders import Nominatim
from time import sleep
from geopy.exc import GeocoderTimedOut
import pycountry_convert as pc

Loading in the JSON file containing artwork id, title, artist, and department:

In [3]:
aic = pd.read_json('../../artic-api-data/getting-started/allArtworks.jsonl', lines=True)

Loading in extra JSON files for each artwork containing artwork id and place of origin information:

In [5]:
id_and_origin = []
path = '../../artic-api-data/json/artworks'
for filename in glob.glob(os.path.join(path, '*.json')):    
    with open(filename, encoding='utf-8', mode='r') as currentFile:
        data=currentFile.read().replace('\n', '')
        art_id = keyword = json.loads(data)["id"]
        place_of_origin = json.loads(data)["place_of_origin"]
        acquisition_date = json.loads(data)["fiscal_year"]
        id_and_origin.append(tuple((art_id, place_of_origin, acquisition_date)))

id_and_origin[:10]

[(116936, None, 1982),
 (57896, 'New York City', None),
 (127292, 'United States', None),
 (120557, 'United States', None),
 (3721, 'Belgium', 1930),
 (2833, 'Santa Valley', 1957),
 (143079, 'United States', 1996),
 (108928, 'Switzerland', None),
 (49888, 'France', 1928),
 (98178, 'United States', None)]

Converting the list of tuples created in the previous step to a dataframe that can be merged with the original dataframe I loaded in:

In [6]:
id_origin_df = pd.DataFrame(id_and_origin, columns =['id', 'place_of_origin', 'acquisition_date'])
aic_and_origin = pd.merge(aic, id_origin_df, on='id')
aic_origin_drops = aic_and_origin.dropna(subset=['place_of_origin']).reset_index()

In [7]:
# number of unusable rows so far
print(len(aic_and_origin) - len(aic_origin_drops))

9492


Importing a list of countries, so that naming conventions will be the same across datasets

In [10]:
countries_csv = pd.read_csv('../data/interim/CountriesCSV.csv')
countries = list(countries_csv['name.common'])

Splitting the data into two groups, art whose origin location is already a country, and art whose location data isn't

In [11]:
id_and_noncountry = []
id_and_country = []

for i in range(len(aic_origin_drops)):
    if aic_origin_drops['place_of_origin'][i] not in countries:
        id_and_noncountry.append((aic_origin_drops['id'][i], aic_origin_drops['place_of_origin'][i]))
    else:
        id_and_country.append((aic_origin_drops['id'][i], aic_origin_drops['place_of_origin'][i]))

cities = [x[1] for x in id_and_noncountry]
id_country_df = pd.DataFrame(id_and_country, columns =['id', 'country']).dropna()

In [12]:
# number of rows that do not need modification for use
countries_present = [x[1] for x in id_and_country]
print(len(countries_present))

# percentage of rows that do not need modification for use
print(len(countries_present)/len(aic_and_origin))

96337
0.725516628509459


The place of origin information provided in the individual JSON files is a mix of country and city names. Using the geopy package, the following cell converts the place of origin information to an address that contains a country name. The geopy package uses the Nominatim package to look up the provided locations, which has a low limit of the number of requests it can process at a time. To get around this, I created a work-around for a timed out error, which causes the cell to take many hours to run.

In [147]:
# no longer necessary since it has been downloaded


geolocator = Nominatim(user_agent = "cleaning")


def do_geocode(address, attempt=1, max_attempts=5):
    try:
        return geolocator.geocode(address, language="en")
    except GeocoderTimedOut:
        if attempt <= max_attempts:
            sleep(1.1)
            return do_geocode(address, attempt=attempt+1)
        raise


region_and_geopy = []
for i in range(len(set(cities))):
    location = do_geocode(cities[i])
    pre_geopy_name = cities[i]
    if location is not None:
        region_and_geopy.append((pre_geopy_name,location.address))

region_geopy_df = pd.DataFrame(region_and_geopy, columns =['noncountry', 'geopy']).dropna()
region_geopy_df.to_csv('noncountry_geopy.csv')


Loading in the downloaded csv that was created using the previous code

In [13]:
region_geopy_df = pd.read_csv('../data/interim/noncountry_geopy.csv').drop(labels=['Unnamed: 0'], axis=1)

Getting the country name from the geopy location using regex, and combining country, place of origin, and id for easier merging with the main dataframe

In [14]:
region_geopy_df['geopy_country'] = region_geopy_df['geopy'].str.extract("([^,]*$)")
for i in range(len(region_geopy_df)):
    region_geopy_df.loc[i, 'geopy_country'] = region_geopy_df['geopy_country'][i].strip()

In [15]:
id_region_df = pd.DataFrame(id_and_noncountry, columns =['id', 'noncountry']).dropna()
id_region_geopy = pd.merge(id_region_df, region_geopy_df, on='noncountry', how='inner')
id_region_geopy.drop_duplicates(subset=['id'], inplace=True)
id_country_geopy = id_region_geopy[['id', 'geopy_country']].rename(columns={'geopy_country': 'country'})

Stacking the dataframes of the artwork that did not need the location name modified and the dataframe of the artwork that did, then merging the country information with the main dataframe

In [16]:
combined_countries = pd.concat([id_country_df, id_country_geopy], ignore_index=True)

In [17]:
aic_and_country_origin = pd.merge(aic_origin_drops, combined_countries, on='id', how='left')
aic_and_country_origin.head()

Unnamed: 0,index,id,title,main_reference_number,department_title,artist_title,place_of_origin,acquisition_date,country
0,0,4,Priest and Boy,1880.1,Prints and Drawings,Lawrence Carmichael Earle,United States,,United States
1,1,9,"Interior of St. Mark's, Venice",1887.232,Arts of the Americas,David Dalhoff Neal,Munich,,Germany
2,2,11,Self-Portrait,1887.234,Arts of the Americas,Walter Shirlaw,United States,1988.0,United States
3,3,16,The Fall of the Giants,1887.249,Prints and Drawings,Salvator Rosa,Italy,,Italy
4,4,19,"View of Ponte Lugano on the Anio, from Views o...",1887.252,Prints and Drawings,Giovanni Battista Piranesi,Italy,,Italy


Making sure naming conventions match between datasets, and getting rid of columns that have unusable locations

In [18]:
aic_and_country_origin['country'].replace({'Democratic Republic of the Congo':'DR Congo',
                                           "Côte d'Ivoire": 'Ivory Coast',
                                           'Palestinian Territory': 'Palestine',
                                           'East Timor': 'Timor-Leste'}, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  aic_and_country_origin['country'].replace({'Democratic Republic of the Congo':'DR Congo',


In [19]:
drop_list_origin = ['Unknown Place', 'Roman Empire', 'Bohemia']
drop_list_country = ['Europe', 'Africa', 'Asia','North America','Gulf of Mexico','South America']

In [23]:
aic_and_country_origin = aic_and_country_origin[~aic_and_country_origin['place_of_origin'].isin(drop_list_origin)]
aic_data = aic_and_country_origin[~aic_and_country_origin['country'].isin(drop_list_country)].dropna(subset=['country'])

In [24]:
# number of unusable rows after all of that cleaning
print(len(aic_and_origin) - len(aic_data))

# percentage of usable rows after all of that cleaning
print(len(aic_data)/len(aic_and_origin))

12489
0.9059449933726955


Using a python package, creating tuples for every country and its' corresponding continent to allow for the ability to do a breakdown by general region.

In [25]:
def country_to_continent(country_name):
    try:
        country_alpha2 = pc.country_name_to_country_alpha2(country_name)
        country_continent_code = pc.country_alpha2_to_continent_code(country_alpha2)
        country_continent_name = pc.convert_continent_code_to_continent_name(country_continent_code)
        return country_continent_name
    except KeyError:
        return None

In [27]:
country_and_continent = []

countries_list = list(set(aic_data['country']))
for i in range(len(countries_list)):
    if country_to_continent(countries_list[i]) is not None:
        country_and_continent.append((countries_list[i],country_to_continent(countries_list[i])))

country_and_continent.append(('Timor-Leste', 'Asia'))
country_and_continent.append(('DR Congo', 'Africa'))
country_and_continent.append(('Tibet', 'Asia'))

In [28]:
country_and_continent_df = pd.DataFrame(country_and_continent, columns =['country', 'continent']).dropna()
aic_continent = pd.merge(aic_data, country_and_continent_df, on='country', how='left')
aic_continent.head()

Unnamed: 0,index,id,title,main_reference_number,department_title,artist_title,place_of_origin,acquisition_date,country,continent
0,0,4,Priest and Boy,1880.1,Prints and Drawings,Lawrence Carmichael Earle,United States,,United States,North America
1,1,9,"Interior of St. Mark's, Venice",1887.232,Arts of the Americas,David Dalhoff Neal,Munich,,Germany,Europe
2,2,11,Self-Portrait,1887.234,Arts of the Americas,Walter Shirlaw,United States,1988.0,United States,North America
3,3,16,The Fall of the Giants,1887.249,Prints and Drawings,Salvator Rosa,Italy,,Italy,Europe
4,4,19,"View of Ponte Lugano on the Anio, from Views o...",1887.252,Prints and Drawings,Giovanni Battista Piranesi,Italy,,Italy,Europe


Finally, saving the clean data

In [421]:
aic_continent.to_csv('../data/processed/aic_data.csv')

### Bonus: adding continent data to the other dataset

In [414]:
met = pd.read_csv('../data/processed/CleanMETData.csv')

In [392]:
country_and_continent = []

countries_list = list(set(met['MappedCountry'].dropna()))
for i in range(len(countries_list)):
    if country_to_continent(countries_list[i]) is not None:
        country_and_continent.append((countries_list[i],country_to_continent(countries_list[i])))

country_and_continent.append(('Timor-Leste', 'Asia'))
country_and_continent.append(('DR Congo', 'Africa'))
country_and_continent.append(('Tibet', 'Asia'))

In [394]:
country_and_continent_df = pd.DataFrame(country_and_continent, columns =['MappedCountry', 'continent']).dropna()
met_continent = pd.merge(met, country_and_continent_df, on='MappedCountry', how='left')
met_continent.to_csv('../data/processed/CleanMetData_continents.csv')