# Clean the Data
We check the data that we have gathered. See what is usable and what is not.

In [1]:
import pandas as pd
import numpy as np
import glob
import warnings

In [2]:
data_file_path = "./data/"

## Read the Text-based Data
We read the data files generated in the previous notebook.

In [3]:
# define the dataframe to hold all data
combined_artists = pd.DataFrame()

# loop through all of the csv files that we have created in the /data/artists directory. read
# them into a temporary dataframe. append the temporary dataframe to the end of the dataframe 
# to hold all data.
for artist_csv_file in glob.glob("./data/artists/*.csv"):
    artist_data = pd.read_csv(artist_csv_file, sep = ";", index_col = 0)
    combined_artists = combined_artists.append(artist_data)

We combine the data from the _.csv_ files into a single dataframe.

In [4]:
# when we read in the data from the various csv files the index of the new combined
# dataset is contructed from the original indexes. this means that for each index value
# there are multiple instances. this is not so good for our index. but, these original
# index values maybe useful in the future as they provide as they provide a unique
# value per artist. so, the original index will be transformed into a new column named
# "artist_artwork_id" and a new index will be generated for the entire dataset

# create the "artist_artwork_id" from the column
combined_artists["artist_artwork_id"] = combined_artists.index  

# delete the old index and create a new one
combined_artists.reset_index(level = None, drop = True, inplace = True)

## Inspect the Data
A quick visual inspection reveals that the data is in a resonable state. There is a need for some cleaning. But, not an excessive amount.

In [5]:
combined_artists.head(20)

Unnamed: 0,artist,genre,media,style,title,url,year,artist_artwork_id
0,M.C. Escher,symbolic painting,,surrealism,Bookplate Bastiaan Kist,https://uploads4.wikiart.org/images/m-c-escher...,1916,0
1,M.C. Escher,flower painting,,realism,Chrysanthemum,https://uploads3.wikiart.org/images/m-c-escher...,1916,1
2,M.C. Escher,portrait,,expressionism,Escher's Father,https://uploads5.wikiart.org/images/m-c-escher...,1916,2
3,M.C. Escher,portrait,,expressionism,Head of a Child,https://uploads5.wikiart.org/images/m-c-escher...,1916,3
4,M.C. Escher,portrait,,expressionism,Baby,https://uploads3.wikiart.org/images/m-c-escher...,1917,4
5,M.C. Escher,still life,,expressionism,Mascot,https://uploads5.wikiart.org/images/m-c-escher...,1917,5
6,M.C. Escher,animal painting,,expressionism,Hen with Egg,https://uploads0.wikiart.org/images/m-c-escher...,1917,6
7,M.C. Escher,still life,,expressionism,NOT DETECTED,https://uploads0.wikiart.org/images/m-c-escher...,1917,7
8,M.C. Escher,portrait,,expressionism,Portrait of a Man,https://uploads7.wikiart.org/images/m-c-escher...,1917,8
9,M.C. Escher,landscape,,realism,"Railway Bridge, Oosterbeek",https://uploads6.wikiart.org/images/m-c-escher...,1917,9


In [6]:
# count the number of populated fields for each column
combined_artists.count()

artist               3369
genre                3331
media                1754
style                3360
title                3369
url                  3369
year                 2023
artist_artwork_id    3369
dtype: int64

## Clean the Column: _genre_
First let's take a look at all of the unique values that we have for _genre_.

In [7]:
# create a list of all unique values within the genre column
all_genres = combined_artists["genre"].unique()
all_genres

array(['symbolic painting', 'flower painting', 'portrait', 'still life',
       'animal painting', 'landscape', 'self-portrait', 'vanitas',
       'allegorical painting', 'poster', 'tessellation',
       'religious painting', 'nude painting (nu)', 'genre painting',
       'illustration', 'figurative', 'abstract', 'sketch and study',
       'cityscape', 'capriccio', 'design', 'marina', 'quadratura',
       'mythological painting', 'sculpture', 'literary painting', nan,
       'history painting', 'cityscape,marina', 'interior', 'caricature',
       'battle painting', 'nude painting (nu),religious painting',
       'abstract,figurative', 'advertisement',
       'portrait,sketch and study', 'still life,sculpture', 'pastorale',
       'bird-and-flower painting', 'landscape,figurative', 'installation',
       'mural', 'furniture,design', 'wildlife painting'], dtype=object)

We can remove the few that are empty or represent art in forms that are not compatible with this project, _e.g. sculpture and installation._

In [8]:
# remove Remove records with no genre: NaN
combined_artists = combined_artists[combined_artists["genre"].isnull() == False]

# count results
combined_artists.genre.count()

3331

In [9]:
# remove Remove records with a genre of installation - we are not interested in these works of art
combined_artists = combined_artists[combined_artists["genre"] != "installation"]

# count results
combined_artists.genre.count()

3329

In [10]:
# remove Remove records with a genre of sculpture - we are not interested in these works of art
combined_artists = combined_artists[combined_artists["genre"] != "sculpture"]

# count results
combined_artists.genre.count()

3311

In [11]:
# remove Remove records with a genre of quadratura & mural - we are not interested in these works of art
combined_artists = combined_artists[combined_artists["genre"] != "quadratura"]
combined_artists = combined_artists[combined_artists["genre"] != "mural"]

# count results
combined_artists.genre.count()

3308

In [12]:
# remove Remove records with a genre of furniture design - we are not interested in these works of art
combined_artists = combined_artists[combined_artists["genre"] != "furniture,design"]

# count results
combined_artists.genre.count()

3307

Now we can standarize some tof the labels.

In [13]:
# reduce the number of unique values by grouping pairs together
combined_artists["genre"] = combined_artists["genre"].apply(lambda x: "portrait"           if x == "self-portrait"                         else x)
combined_artists["genre"] = combined_artists["genre"].apply(lambda x: "portrait"           if x == "portrait,sketch and study"             else x)
combined_artists["genre"] = combined_artists["genre"].apply(lambda x: "nude-painting"      if x == "nude painting (nu)"                    else x)
combined_artists["genre"] = combined_artists["genre"].apply(lambda x: "nude-painting"      if x == "nude painting (nu),religious painting" else x)
combined_artists["genre"] = combined_artists["genre"].apply(lambda x: "marina"             if x == "cityscape,marina"                      else x)
combined_artists["genre"] = combined_artists["genre"].apply(lambda x: "still-life"         if x == "vanitas"                               else x)
combined_artists["genre"] = combined_artists["genre"].apply(lambda x: "still-life"         if x == "still life,sculpture"                  else x)
combined_artists["genre"] = combined_artists["genre"].apply(lambda x: "religious-painting" if x == "allegorical"                           else x)
combined_artists["genre"] = combined_artists["genre"].apply(lambda x: "religious-painting" if x == "llegorical painting"                   else x)
combined_artists["genre"] = combined_artists["genre"].apply(lambda x: "abstract"           if x == "abstract,figurative"                   else x)
combined_artists["genre"] = combined_artists["genre"].apply(lambda x: "wildlife-painting"  if x == "animal painting"                       else x)
combined_artists["genre"] = combined_artists["genre"].apply(lambda x: "wildlife-painting"  if x == "bird-and-flower painting"              else x)
combined_artists["genre"] = combined_artists["genre"].apply(lambda x: "furniture"          if x == "furniture,design"                      else x)
combined_artists["genre"] = combined_artists["genre"].apply(lambda x: "landscape"          if x == "landscape,figurative"                  else x)

In [14]:
# replace spaces with hyphons
combined_artists["genre"] = combined_artists["genre"].str.replace(" ", "-")

A final look at our cleaned _genre_ labels.

In [15]:
# display all genres
all_genres = combined_artists["genre"].unique()
all_genres

array(['symbolic-painting', 'flower-painting', 'portrait', 'still-life',
       'wildlife-painting', 'landscape', 'allegorical-painting', 'poster',
       'tessellation', 'religious-painting', 'nude-painting',
       'genre-painting', 'illustration', 'figurative', 'abstract',
       'sketch-and-study', 'cityscape', 'capriccio', 'design', 'marina',
       'mythological-painting', 'literary-painting', 'history-painting',
       'interior', 'caricature', 'battle-painting', 'advertisement',
       'pastorale'], dtype=object)

## Clean the Column: _media_ 
As only one third of the fields in this column are populated and I see no easy method of identifying the media type. This column will be dropped.

In [16]:
combined_artists.drop("media", axis = "columns", inplace = True)

In [17]:
combined_artists.head()

Unnamed: 0,artist,genre,style,title,url,year,artist_artwork_id
0,M.C. Escher,symbolic-painting,surrealism,Bookplate Bastiaan Kist,https://uploads4.wikiart.org/images/m-c-escher...,1916,0
1,M.C. Escher,flower-painting,realism,Chrysanthemum,https://uploads3.wikiart.org/images/m-c-escher...,1916,1
2,M.C. Escher,portrait,expressionism,Escher's Father,https://uploads5.wikiart.org/images/m-c-escher...,1916,2
3,M.C. Escher,portrait,expressionism,Head of a Child,https://uploads5.wikiart.org/images/m-c-escher...,1916,3
4,M.C. Escher,portrait,expressionism,Baby,https://uploads3.wikiart.org/images/m-c-escher...,1917,4


## Clean the Column: _style_ 
On to the styles. Unlike the other matadata values, _style_ can actually contain more than one value. Therefore we must extract the data of this column as a collection of lists and process accordingly. <br/>But first, correct the empty values.

In [18]:
# find all column style is NaN records
combined_artists[combined_artists["style"].isnull() == True]

Unnamed: 0,artist,genre,style,title,url,year,artist_artwork_id
634,L. S. Lowry,still-life,,Chinese Lantern and Oranges,https://uploads6.wikiart.org/00169/images/l-s-...,1908,1
3181,Roy Lichtenstein,abstract,,Untitled,https://uploads8.wikiart.org/00101/images/roy-...,1965,66


We have just 2 records with a _Null_ values. So, let's manually update them.

In [19]:
#start with L. S Lowry and search for similar records
combined_artists[(combined_artists["artist"] == "L. S. Lowry") & (combined_artists["genre"] == "still-life")]

Unnamed: 0,artist,genre,style,title,url,year,artist_artwork_id
633,L. S. Lowry,still-life,realism,Still Life,https://uploads7.wikiart.org/00169/images/l-s-...,1906,0
634,L. S. Lowry,still-life,,Chinese Lantern and Oranges,https://uploads6.wikiart.org/00169/images/l-s-...,1908,1


In [20]:
# disable warning
pd.set_option('mode.chained_assignment', None)

# the only other still life by L. S Lowry is of sytle: realism. So, let's assign that
combined_artists["style"].loc[ ( (combined_artists["artist"] == "L. S. Lowry") & (combined_artists["title"] == "Chinese Lantern and Oranges") ) ] = "realism"

# re-enable warning
pd.reset_option('mode.chained_assignment')

In [21]:
# check the change was made
combined_artists[combined_artists["title"] == "Chinese Lantern and Oranges"]

Unnamed: 0,artist,genre,style,title,url,year,artist_artwork_id
634,L. S. Lowry,still-life,realism,Chinese Lantern and Oranges,https://uploads6.wikiart.org/00169/images/l-s-...,1908,1


In [22]:
#now switch to Roy Lichtenstein and search for similar records
combined_artists[(combined_artists["artist"] == "Roy Lichtenstein") & (combined_artists["genre"] == "abstract")]

Unnamed: 0,artist,genre,style,title,url,year,artist_artwork_id
3115,Roy Lichtenstein,abstract,pop art,The Valve,https://uploads7.wikiart.org/00101/images/roy-...,1954,0
3155,Roy Lichtenstein,abstract,pop art,Non Objective I,https://uploads4.wikiart.org/00304/images/roy-...,1964,40
3166,Roy Lichtenstein,abstract,abstract art,Nonobjective II,https://uploads7.wikiart.org/images/roy-lichte...,1964,51
3181,Roy Lichtenstein,abstract,,Untitled,https://uploads8.wikiart.org/00101/images/roy-...,1965,66
3182,Roy Lichtenstein,abstract,abstract art,Big painting VI,https://uploads4.wikiart.org/images/roy-lichte...,1965,67
3184,Roy Lichtenstein,abstract,abstract art,Little big painting,https://uploads5.wikiart.org/images/roy-lichte...,1965,69
3188,Roy Lichtenstein,abstract,abstract art,Yellow brushstroke I,https://uploads3.wikiart.org/images/roy-lichte...,1965,73
3189,Roy Lichtenstein,abstract,abstract art,Yellow landscape,https://uploads8.wikiart.org/images/roy-lichte...,1965,74
3195,Roy Lichtenstein,abstract,abstract art,Yellow and green brushstrokes,https://uploads3.wikiart.org/images/roy-lichte...,1966,80
3196,Roy Lichtenstein,abstract,abstract art,Brushstrokes,https://uploads7.wikiart.org/images/roy-lichte...,1966,81


In [23]:
# we see that it could be pop art, abstract art, cubism or something else. The title is set to unknown.
# so, as we have no clues, we shall delete this record.
combined_artists = combined_artists[combined_artists["style"].isnull() == False]

In [24]:
# check the change was made
combined_artists[(combined_artists["artist"] == "Roy Lichtenstein") & (combined_artists["genre"] == "abstract")]

Unnamed: 0,artist,genre,style,title,url,year,artist_artwork_id
3115,Roy Lichtenstein,abstract,pop art,The Valve,https://uploads7.wikiart.org/00101/images/roy-...,1954,0
3155,Roy Lichtenstein,abstract,pop art,Non Objective I,https://uploads4.wikiart.org/00304/images/roy-...,1964,40
3166,Roy Lichtenstein,abstract,abstract art,Nonobjective II,https://uploads7.wikiart.org/images/roy-lichte...,1964,51
3182,Roy Lichtenstein,abstract,abstract art,Big painting VI,https://uploads4.wikiart.org/images/roy-lichte...,1965,67
3184,Roy Lichtenstein,abstract,abstract art,Little big painting,https://uploads5.wikiart.org/images/roy-lichte...,1965,69
3188,Roy Lichtenstein,abstract,abstract art,Yellow brushstroke I,https://uploads3.wikiart.org/images/roy-lichte...,1965,73
3189,Roy Lichtenstein,abstract,abstract art,Yellow landscape,https://uploads8.wikiart.org/images/roy-lichte...,1965,74
3195,Roy Lichtenstein,abstract,abstract art,Yellow and green brushstrokes,https://uploads3.wikiart.org/images/roy-lichte...,1966,80
3196,Roy Lichtenstein,abstract,abstract art,Brushstrokes,https://uploads7.wikiart.org/images/roy-lichte...,1966,81
3197,Roy Lichtenstein,abstract,abstract art,Modern painting with clef,https://uploads8.wikiart.org/images/roy-lichte...,1967,82


Now we view the list of unique values. Notice that some values are themselves comma seperated lists. 

In [25]:
# display all sytles
all_styles = combined_artists["style"].unique()
all_styles

array(['surrealism', 'realism', 'expressionism', 'cubism', 'op art',
       'art nouveau (modern)', 'northern renaissance', 'art deco',
       'naïve art (primitivism)', 'abstract expressionism',
       'action painting', 'abstract expressionism,action painting',
       'symbolism,post-impressionism', 'symbolism', 'post-impressionism',
       'post-impressionism,expressionism',
       'post-impressionism,impressionism',
       'naïve art (primitivism),post-impressionism',
       'expressionism,art deco', 'expressionism,naïve art (primitivism)',
       'naïve art (primitivism),expressionism',
       'naïve art (primitivism),impressionism',
       'naïve art (primitivism),surrealism', 'impressionism',
       'fauvism,neo-impressionism', 'fauvism', 'neo-impressionism',
       'neoplasticism', 'romanticism', 'minimalism',
       'color field painting', 'post-impressionism,symbolism', 'japonism',
       'japonism,post-impressionism', 'expressionism,symbolism',
       'cubism,post-impression

In [26]:
# correct the "op art" typo(s)
combined_artists["style"] = combined_artists["style"].apply(lambda x: "pop art" if x       == "op art"      else x)
combined_artists["style"] = combined_artists["style"].apply(lambda x: "modern"  if x[0:11] == "art nouveau" else x)
combined_artists["style"] = combined_artists["style"].apply(lambda x:              x.replace(" (", ",")           )

In [27]:
# remove rounded brackets "()"
combined_artists["style"] = combined_artists["style"].apply(lambda x: x.replace("(",",").replace(")","").replace(" ","-"))

In [28]:
# redefine and display all sytles
all_styles = combined_artists["style"].unique()
all_styles

array(['surrealism', 'realism', 'expressionism', 'cubism', 'pop-art',
       'modern', 'northern-renaissance', 'art-deco',
       'naïve-art,primitivism', 'abstract-expressionism',
       'action-painting', 'abstract-expressionism,action-painting',
       'symbolism,post-impressionism', 'symbolism', 'post-impressionism',
       'post-impressionism,expressionism',
       'post-impressionism,impressionism',
       'naïve-art,primitivism,post-impressionism',
       'expressionism,art-deco', 'expressionism,naïve-art,primitivism',
       'naïve-art,primitivism,expressionism',
       'naïve-art,primitivism,impressionism',
       'naïve-art,primitivism,surrealism', 'impressionism',
       'fauvism,neo-impressionism', 'fauvism', 'neo-impressionism',
       'neoplasticism', 'romanticism', 'minimalism',
       'color-field-painting', 'post-impressionism,symbolism', 'japonism',
       'japonism,post-impressionism', 'expressionism,symbolism',
       'cubism,post-impressionism', 'neoclassicism',
  

We split the values that are lists.

In [29]:
# for each record. split the string containing a comma separated list of styles into a list
# and then remove leading and trailing space from each item within the list.
combined_artists["style"] = combined_artists["style"].apply(lambda x: [s.strip() for s in str(x).split(",")])

In [None]:
# create list of unique values from the styles strings. we take the unique list
# of string values. but some of these are actually comma separated strings of 
# values. so we convert the list into a string and replace various charaters to
# give us a combined string of comma separted values. now we split the string
# into a list and remove leading and trailing spaces from any of the list items.
# at the point our list contains duplicates. so these are removed by converting 
# it into a set and then back into a list. then the list is sorted into alphabetical
# order just to be pretty
unique_styles = sorted(list(set([s.strip() for s in str(all_styles).replace("\n", "").replace("' '",",").replace("[","").replace("]","").replace("'","").split(",")])))

We can now view the set of unique _sytle_ values.

In [31]:
# view the list of styles
unique_styles

['abstract-art',
 'abstract-expressionism',
 'academicism',
 'action-painting',
 'analytical-cubism',
 'art-deco',
 'color-field-painting',
 'cubism',
 'expressionism',
 'fauvism',
 'impressionism',
 'japonism',
 'minimalism',
 'modern',
 'naïve-art',
 'neo-impressionism',
 'neoclassicism',
 'neoplasticism',
 'northern-renaissance',
 'pointillism',
 'pop-art',
 'post-impressionism',
 'primitivism',
 'realism',
 'romanticism',
 'surrealism',
 'symbolism',
 'synthetic-cubism',
 'ukiyo-e']

In [32]:
# count the list of styles
len(unique_styles)

29

As an Artwork maybe assigned more than one _style_ lets quickly look at how many occurances of each style label are defined.

In [33]:
# loop through all of the artistic styles
for artistic_style in unique_styles:
    # total the occurances for each style
    print("".join([ artistic_style.ljust(30), str(len(combined_artists[combined_artists["style"].apply(lambda x: artistic_style in x)]))]))

abstract-art                  18
abstract-expressionism        36
academicism                   1
action-painting               43
analytical-cubism             48
art-deco                      85
color-field-painting          3
cubism                        235
expressionism                 390
fauvism                       10
impressionism                 293
japonism                      3
minimalism                    54
modern                        17
naïve-art                     256
neo-impressionism             5
neoclassicism                 86
neoplasticism                 34
northern-renaissance          1
pointillism                   2
pop-art                       266
post-impressionism            147
primitivism                   256
realism                       79
romanticism                   520
surrealism                    478
symbolism                     34
synthetic-cubism              56
ukiyo-e                       268


__NOTE:__ there are some values with very low _(even just a single)_ occurance(_s_). It could be argued that these should be removed. I have chosen to keep them. As the number of chosen Artists is small. Sould airworks from more Artists be added later. These values may be of more relevance.

## Clean the Column: _title_ 
We know from the eariler check that there are no empty fields for this column. but a quick look at the data reviels that there are some artworks with the title listes as _NOT DETECTED_. We shall replace this with the value _Unknown_.

In [34]:
# replace "NOT DETECTED" with the value "Unknown"
combined_artists["title"] = combined_artists["title"].apply(lambda x: "Unknown" if x == "NOT DETECTED"else x)

## Clean the Column: _year_
The _year_ is converted to a simple 4 digit number. In the case where we do not have a valid year, a _forward-fill_ approach is taken. The artworks are listed on a chronological order on the _wikiart.org_ website and the data is read in the order that it appears on the website. It is important to only forward fill with year by Artist. In the case of an artwork having no year and the artist has no work of art prior. Then the year of the earlist work of art by that artist is recorded. <br/>__NOTE:__ as the year is only required to provide a _Ball-part_ time period. This method is fine in this case.

In [35]:
# if the first 4 characture of the year data are numeric then use them otherwise set to 0
combined_artists["year"] = combined_artists["year"].apply(lambda x: str(x)[0:4] if str(x)[0:4].isnumeric() else 0)

# convert all years to integers
combined_artists["year"] = combined_artists["year"].astype(float).astype(int)

In [36]:
# create a dict with the lostet recorded year of artwork creation for each artis
artist_year_data = {}
for artist in combined_artists["artist"].unique():
    artist_year_data[artist] = min(combined_artists[(combined_artists["artist"] == artist) & (combined_artists["year"] > 0)]["year"])

In [37]:
# a home grown version of ffill that will forward fill. but, only using values for the specific artist.
# in the event of year not being populated for the first artwork of an artist the minimum recorded year 
# of an artwork for that artist will be inserted.
def intelligent_ffill(artist, year = 0):

    # update the ffill value of the artist if a value greater tha the current value is provided
    if  int(year) > int(artist_year_data[artist]):
        artist_year_data[artist] = int(year)
        
    return artist_year_data[artist]

In [38]:
# apply the intelligent forward fill function
combined_artists["year"] = combined_artists.apply(lambda x: intelligent_ffill(x["artist"], x["year"]), axis=1)

## Check that each field of all the columns is populated

In [39]:
# count results
combined_artists.count()

artist               3306
genre                3306
style                3306
title                3306
url                  3306
year                 3306
artist_artwork_id    3306
dtype: int64

## Write the _combined_artists_ DataFrame to CSV File

In [40]:
combined_artists.to_csv("".join([data_file_path, "combined_artists/combined_artists.csv"]))