In [4]:
import numpy as np
import pandas as pd
from fuzzywuzzy import process
# pd.set_option('display.max_columns', None)

# pd.set_option('display.max_rows', None)

In [5]:
# Load the records; the first CSV column is used as the row index.
data = pd.read_csv("./art_mod1.csv", index_col=0)

# Number of rows in which each artist name appears.
count_by_artist = data["Artist"].value_counts()

# Distinct, non-null artist names in sorted order.
artist_names = sorted(data["Artist"].dropna().unique())

### Unify Birth Years

In [6]:
# Make the (Artist, Birth) combination unique: when one artist name carries
# several different birth years, overwrite every row of that artist with the
# year that occurs most often.
for artist in artist_names:
    artist_rows = data["Artist"] == artist
    # value_counts() ignores missing values, so this holds recorded years only.
    birth_years = data.loc[artist_rows, "Birth"].value_counts()
    # 1 distinct year -> already consistent.
    # 0 distinct years -> every Birth is missing; there is nothing to vote on
    # and idxmax() would raise on the empty Series, so skip as well.
    if len(birth_years) <= 1:
        continue
    true_birth = birth_years.idxmax()
    data.loc[artist_rows, "Birth"] = true_birth

# Lookup table: artist name -> birth year.
# NOTE: despite the name, only the "Birth" column ends up in this dict.
artist_birthdeath = (
    data.loc[:, "Artist":"Birth"]
    .drop_duplicates()
    .set_index("Artist")
    .to_dict()["Birth"]
)

In [7]:
# Show the distinct rows of the "Artist".."Birth" columns, ordered by artist name.
(
    data.loc[:, "Artist":"Birth"]
    .drop_duplicates()
    .sort_values(by="Artist")
)

Unnamed: 0,Artist,Birth
19600,A Hai,1963
9689,A. Balasubramaniam,1971
1830,A. R. Penck,1939
21621,A. Ramachandran,1935
1536,A.R. Penck,1939
21742,A.R.Penck,1939
10310,Aad De Haas,1920
12269,Aaron Curry,1972
33885,Aaron Fowler,1988
23424,Aaron Garber-Maikovska,1978


### Correct Artist Name Typos

In [8]:
# Group near-identical artist names (typos, spacing, re-ordered words).
# similar_names maps the canonical spelling -- the variant that occurs most
# often in the dataframe -- to the list of its other spellings.
# wrong_names collects every spelling already classified as a variant.
similar_names = {}
wrong_names = []

for i, name in enumerate(artist_names):
    # If the name was already classified as a typo, skip it.
    if name in wrong_names:
        continue

    # Fuzzy-match this name against every other name; each match is a
    # (choice, score) pair.
    other_names = artist_names[:i] + artist_names[i + 1:]
    fuzzy_process = process.extract(name, other_names)

    # Keep matches scoring at least 95 whose birth year equals this name's,
    # so that two different artists with similar names are not merged.
    # Built as a new list: removing items from a list while iterating over it
    # skips the element that follows each removal.
    # A missing (NaN) birth year never compares equal, so such matches are
    # dropped, exactly as the `!=` test did before.
    candidates = [
        match[0]
        for match in fuzzy_process
        if match[1] >= 95 and artist_birthdeath[match[0]] == artist_birthdeath[name]
    ]

    if not candidates:
        continue

    # The name itself competes for "canonical"; it goes first so that it wins
    # ties (list.index returns the first maximum).
    candidates.insert(0, name)
    use_count = [count_by_artist[candi] for candi in candidates]
    most_used_idx = use_count.index(max(use_count))
    correct_name = candidates[most_used_idx]
    candidates.remove(correct_name)

    # Merge into any variants already recorded for this canonical name rather
    # than overwriting them, so earlier findings are not lost.
    known_variants = similar_names.setdefault(correct_name, [])
    new_variants = [candi for candi in candidates if candi not in known_variants]
    known_variants.extend(new_variants)
    # Keep track of typos so they are skipped when the loop reaches them.
    wrong_names.extend(new_variants)

In [10]:
# Only the final expression of a cell is rendered, so the count has to be
# printed explicitly; the mapping itself is displayed as the cell result.
print(len(similar_names))
similar_names

{'A. R. Penck': ['A.R. Penck', 'A.R.Penck'],
 'Abdul Hadi El-Gazzar': ['Abdel Hadi El-Gazzar'],
 'Abdel Qader Al-Rais': ['Abdul Qader Al-Rais'],
 'Abdulrahim  Sharif': ['Abdulrahim Sharif'],
 'Adel El-Siwi': ['Adel El Siwi'],
 'Francis Bacon': ['After Francis Bacon'],
 'Willem De Kooning': ['After Willem De Kooning'],
 'Agustin Cardenas': ['Augustin Cardenas'],
 'Ai Weiwei': ['Weiwei Ai'],
 'Ai Xuan': ['Xuan Ai'],
 'Alfi Jumaldi': ['Jumaldi Alfi'],
 'Alfredo Esquillo': ['Alfredo Esquillo Jr.'],
 'Allan Mccollum': ['Allan Mc Collum'],
 'Allen Jones, R.A.': ['Allen Jones R.A.', 'Allen Jones'],
 'Zoran Antonio Music': ['Antonio Zoran Music'],
 'Aref El-Rayess': ['Aref El Rayess'],
 'Atsuko Tanaka': ['Tanaka Atsuko'],
 'Azra Aghighi Bakhshayesh': ['Azra Aghighi Bakhshayeshi'],
 'Bahman Mohasses': ['Bahman Mohassess'],
 'He Baili': ['Baili He'],
 'Baltasar Lobo': ['Balthasar Lobo'],
 'Barry Le Va': ['Barry Leva'],
 'Behjat Sadr': ['Behdjat Sadr'],
 'Bhupen Khakhar': ['Bhupen  Khakhar'],
 'B

In [14]:
# Rewrite every variant spelling in the Artist column to its canonical name.
for canonical_name, variants in similar_names.items():
    is_variant = data.Artist.isin(variants)
    data.loc[is_variant, "Artist"] = canonical_name

In [15]:
# Number of distinct values left in the Artist column.
data["Artist"].drop_duplicates().size

4797

### Unify Death Years

In [202]:
# One row per distinct combination of the columns "Artist" through "Death".
# NOTE(review): the original cell listed the following artists without
# explanation -- presumably names whose death year needed checking:
# Jean-Paul Riopelle, Kees Van Bohemen, Theo Meier, Roger Raveel, Rene Portocarrero, Prunella Clough, Nicolas Schoffer, Lin Fengmian, Ilya Bolotowsky, Cesar
artist_death = data.loc[:, "Artist":"Death"].drop_duplicates(keep="first")

In [1]:
# Needs `artist_death` from the preceding cell -- run the notebook in order.
# Keep rows that have a death year, then sum the remaining columns per artist.
sub1 = (
    artist_death[artist_death["Death"].notna()]
    .groupby("Artist")
    .sum()
)
# NOTE(review): presumably a summed Birth >= 2000 flags an artist that still
# has more than one distinct row -- confirm before relying on it.
#sub1.loc[sub1.Birth >= 2000,:]

NameError: name 'artist_death' is not defined

In [10]:
# Fix the misses individually by web searching.
# Artist name -> death year, applied to every row of that artist.
manual_death_years = {
    "Abdul Hadi El-Gazzar": 1966,
    "Alfonso Ossorio": 1990,
    "Andy Warhol": 1987,
    "Antoni Tapies": 2012,
    "Bhupen Khakhar": 2003,
    "Carl-Henning Pedersen": 2007,
    "Cesar": 1998,
    "Chen Wen Hsi": 1991,
    "Christian Dotremont": 1979,
    "Daan Van Golden": 2017,
    "David Budd": 1991,
    "Donald Evans": 1977,
    "Eduardo Kingman": 1997,
    "Emilio Tadini": 2002,
    "Fausto Melotti": 1986,
    "Frank Avray Wilson": 2009,
    "Georgette Chen": 1993,
    "Germaine Richier": 1959,
    "Hanne Darboven": 2009,
    "Ilya Bolotowsky": 1981,
    # Was 1904, which is Noguchi's birth year; he died in 1988.
    "Isamu Noguchi": 1988,
    "Jean-Paul Riopelle": 2002,
    "Karel Appel": 2006,
    "Kees Van Bohemen": 1986,
    "Lin Fengmian": 1991,
    "Liu Kang": 2004,
    "Nicolas Schoffer": 1992,
    "Prunella Clough": 1999,
    "Reinhoud D'Haese": 2007,
    "Rene Portocarrero": 1985,
    "Robert Graham": 2008,
    "Roger Raveel": 2013,
    "Roy Lichtenstein": 1997,
    "Sam Francis": 1994,
    "Theo Meier": 1982,
    "U San Win": 1981,
    "Yves Klein": 1962,
}

for artist, death_year in manual_death_years.items():
    data.loc[data.Artist == artist, "Death"] = death_year

In [24]:
# Hand-entered Birth/Death values, keyed by artist name. Most entries look
# like collaborations (two or more people under one name): they get a single
# Birth year and no Death year. Only the columns listed for a name are touched.
manual_overrides = {
    "Luo Brothers": {"Death": None},
    "Jake And Dinos Chapman": {"Birth": 1964, "Death": None},
    "Thukral And Tagra": {"Birth": 1977, "Death": None},
    "Pierre Et Gilles": {"Birth": 1951, "Death": None},
    "Nate Lowman And Aaron Young": {"Birth": 1975, "Death": None},
    "Nate Lowman And Josh Smith": {"Birth": 1977, "Death": None},
    # The second assignment used to target "Birth" again, wiping the year
    # that had just been set; like every other pair here it clears "Death".
    "Zhou Chunya + Jaime Hayon": {"Birth": 1964, "Death": None},
    "Jenny Saville And Glen Luchford": {"Birth": 1969, "Death": None},
    # This artist is also in remove_artist below, so the value does not
    # survive the drop; kept so the cell's intent stays visible.
    "Manuel Rodriguez Lozano": {"Birth": 1896},
    "Guyton \\ Walker": {"Birth": 1970, "Death": None},
    "Rene Portocarrero": {"Death": 1985},
}

for artist, overrides in manual_overrides.items():
    artist_rows = data.Artist == artist
    for column, value in overrides.items():
        data.loc[artist_rows, column] = value

# Artists whose rows are removed from the dataset entirely.
remove_artist = [
    "Equipo Cronica",
    "Vertical Submarine",
    "Manuel Rodriguez Lozano",
    "Alexander Calder",
    "Josef Albers",
    "Jean Fautrier",
    "Lucio Fontana",
]

data.drop(data[data.Artist.isin(remove_artist)].index, inplace=True)

In [25]:
# Write the cleaned table, with its header row, to art_mod2.csv.
data.to_csv(path_or_buf="art_mod2.csv", header=True)