### MET Database Cleaning

In [23]:
import pandas as pd
import numpy as np
from functools import reduce

In [24]:
# Load the raw MET sample export. on_bad_lines='skip' makes the parser drop
# malformed rows instead of raising, so the frame may hold fewer rows than the file.
met_csv_path = 'METsample.csv'
og_met_df = pd.read_csv(filepath_or_buffer=met_csv_path, on_bad_lines='skip')

# Display the available column labels before choosing which ones to keep.
og_met_df.columns

Index(['Object Number', 'Is Highlight', 'Is Timeline Work', 'Is Public Domain',
       'Object ID', 'Gallery Number', 'Department', 'AccessionYear',
       'Object Name', 'Title', 'Culture', 'Period', 'Dynasty', 'Reign',
       'Portfolio', 'Constituent ID', 'Artist Role', 'Artist Prefix',
       'Artist Display Name', 'Artist Display Bio', 'Artist Suffix',
       'Artist Alpha Sort', 'Artist Nationality', 'Artist Begin Date',
       'Artist End Date', 'Artist Gender', 'Artist ULAN URL',
       'Artist Wikidata URL', 'Object Date', 'Object Begin Date',
       'Object End Date', 'Medium', 'Dimensions', 'Credit Line',
       'Geography Type', 'City', 'State', 'County', 'Country', 'Region',
       'Subregion', 'Locale', 'Locus', 'Excavation', 'River', 'Classification',
       'Rights and Reproduction', 'Link Resource', 'Object Wikidata URL',
       'Metadata Date', 'Repository', 'Tags', 'Tags AAT URL',
       'Tags Wikidata URL', 'Random'],
      dtype='object')

In [25]:
# Columns kept for the country-cleaning work below.
met_columns = [
    'Title', 'Department', 'AccessionYear', 'Artist Display Name',
    'Artist Nationality', 'Culture', 'Country', 'Region', 'Subregion',
    'Classification', 'Tags', 'Object Wikidata URL',
]

# Take an explicit copy: later cells add columns to met_df, and doing that on a
# plain slice of og_met_df raises SettingWithCopyWarning.
met_df = og_met_df[met_columns].copy()
met_df.head()

Unnamed: 0,Title,Department,AccessionYear,Artist Display Name,Artist Nationality,Culture,Country,Region,Subregion,Classification,Tags,Object Wikidata URL
0,Calligraphic Exercise in Spanish,Drawings and Prints,2014.0,Anonymous,,,,,,Albums|Drawings|Ornament & Architecture,,
1,"Les Spectacles de Paris, ou, Calendrier histor...",The Libraries,,Joseph de Laporte|Duchesne,|French,,France,,,,,
2,Set of Sword Fittings (Mitokoromono) with Two ...,Arms and Armor,1945.0,Gotō Jōshin,Japanese,Japanese,,,,Sword Furniture,,https://www.wikidata.org/wiki/Q116250603
3,Coat,Costume Institute,2005.0,Christian Lacroix|Christian Lacroix|Birger Chr...,French|French|Scandinavian,French,,,,,,
4,Churinga,"Arts of Africa, Oceania, and the Americas",1979.0,,,Mulga Downs Cave,Australia,Western Desert,,Wood-Sculpture,,


In [26]:
# Show the column labels currently present in met_df.
met_df.columns

Index(['Title', 'Department', 'AccessionYear', 'Artist Display Name',
       'Artist Nationality', 'Culture', 'Country', 'Region', 'Subregion',
       'Classification', 'Tags', 'Object Wikidata URL'],
      dtype='object')

In [27]:
# Columns consulted for a country-like value, in priority order: for each row the
# first column in this tuple that is not missing supplies 'country_combd'.
country_classification_columns = ('Country', 'Culture', 'Artist Nationality', 'Region', 'Subregion', 'Tags')

# Seed with the highest-priority column and fold in only the remaining ones;
# combine_first keeps the accumulated value and fills its gaps from the next column.
primary_column, *fallback_columns = country_classification_columns
country_combd = reduce(
    lambda combined, column: combined.combine_first(met_df[column]),
    fallback_columns,
    met_df[primary_column],
)

# assign() builds a new frame, so no column is written into a slice of the raw
# data (avoids SettingWithCopyWarning) and re-running the cell gives the same result.
met_df = met_df.assign(country_combd=country_combd)
met_df.head()

Unnamed: 0,Title,Department,AccessionYear,Artist Display Name,Artist Nationality,Culture,Country,Region,Subregion,Classification,Tags,Object Wikidata URL,country_combd
0,Calligraphic Exercise in Spanish,Drawings and Prints,2014.0,Anonymous,,,,,,Albums|Drawings|Ornament & Architecture,,,
1,"Les Spectacles de Paris, ou, Calendrier histor...",The Libraries,,Joseph de Laporte|Duchesne,|French,,France,,,,,,France
2,Set of Sword Fittings (Mitokoromono) with Two ...,Arms and Armor,1945.0,Gotō Jōshin,Japanese,Japanese,,,,Sword Furniture,,https://www.wikidata.org/wiki/Q116250603,Japanese
3,Coat,Costume Institute,2005.0,Christian Lacroix|Christian Lacroix|Birger Chr...,French|French|Scandinavian,French,,,,,,,French
4,Churinga,"Arts of Africa, Oceania, and the Americas",1979.0,,,Mulga Downs Cave,Australia,Western Desert,,Wood-Sculpture,,,Australia


In [28]:
# Tally the missing entries in each column of met_df.
missing_per_column = met_df.isna().sum()
missing_per_column

Title                    8884
Department                  0
AccessionYear            1219
Artist Display Name     62517
Artist Nationality      62517
Culture                 85752
Country                126584
Region                 140290
Subregion              143162
Classification          24330
Tags                    90312
Object Wikidata URL    128546
country_combd            1718
dtype: int64

In [29]:
# Columns of the country reference table used to build the alias lookup:
# 'name.common' is the target label, the others are treated as aliases for it.
country_term_columns = [
    'name.common', 'name.official', 'capital',
    'altSpellings', 'demonyms.eng.m', 'demonyms.eng.f',
]

# Read the reference table and keep only those columns, in the order listed.
country_terms = pd.read_csv('countries.csv').loc[:, country_term_columns]


In [30]:
# Lookup from each alias found in country_terms (every column except
# 'name.common') to that row's 'name.common' value. When the same alias occurs
# in several rows, the last row wins.
# NOTE(review): a cell holding several comma-separated values (possibly
# 'altSpellings' or 'capital') is stored as a single key and only matches a value
# identical to the whole cell -- confirm against countries.csv.
country_map = {}

for _, row in country_terms.iterrows():
    common_name = row['name.common']
    if pd.isna(common_name):
        # Without a target label, mapping aliases would only erase values.
        continue

    for col in country_terms.columns:
        if col == 'name.common':
            continue

        alias = row[col]
        # Skip empty cells: a NaN (or blank) key would make a later
        # `.replace(country_map)` turn missing values into a country name.
        if pd.isna(alias) or not str(alias).strip():
            continue

        country_map[alias] = common_name


In [34]:
# Only string aliases may act as lookup keys; a NaN key in country_map would
# otherwise turn missing values into a country name.
alias_to_country = {alias: name for alias, name in country_map.items() if isinstance(alias, str)}

# Reduce multi-valued entries (e.g. 'French|French|Scandinavian') to their first
# alternative. Alternatives are separated by '|', ',' or the standalone word 'or'
# (the \b keeps words that merely start with 'or' intact).
# Leading separators are dropped first so '|French' yields 'French' rather than ''.
first_alternative = (
    met_df['country_combd']
    .str.replace(r'^[\s|,]+', '', regex=True)
    .str.split(r'[|,]| or\b')
    .str[0]
    .str.strip()
)

# A blank result carries no information; where() leaves it (and NaN) as missing.
first_alternative = first_alternative.where(first_alternative.str.len() > 0)

# Translate known aliases to the common country name; unknown values pass through.
country_cleaned = first_alternative.map(alias_to_country).fillna(first_alternative)

# Both columns receive the cleaned value, as before. assign() builds a new frame,
# so nothing is written into a slice of the raw data (no SettingWithCopyWarning).
met_df = met_df.assign(country_combd=country_cleaned, CountryCleaned=country_cleaned)

In [37]:
from IPython.display import FileLink

# Persist the cleaned frame next to the notebook (the index is written as the
# first, unnamed column), then render a clickable download link as the cell output.
cleaned_met_csv = 'cleaned_met.csv'
met_df.to_csv(path_or_buf=cleaned_met_csv)
FileLink(path=cleaned_met_csv)