# WikiArt dataset: cleaning and extending

WikiArt: metadata for the art pieces that are stored as pictures on WikiArt (wikiart.org)

In [12]:
import pandas as pd
import numpy as np

In [13]:
# Raw WikiArt metadata. From Kaggle if I remember correctly, but it can be
# downloaded from other sites as well.
raw_pieces_csv = 'datasets/originals/wikiart_art_pieces.csv'
df2 = pd.read_csv(raw_pieces_csv)

# Number of rows per artist, most frequent first.
pieces_per_artist = df2['artist'].value_counts()
print(pieces_per_artist)

artist
Vincent van Gogh              1931
Nicholas Roerich              1843
Pierre-Auguste Renoir         1412
Claude Monet                  1367
Giovanni Battista Piranesi    1352
                              ... 
Arthur Lowe                      2
Vahram Gayfedjian                2
Gisela Colon                     2
Zakar Zakarian                   2
Ferenc Pinter                    1
Name: count, Length: 3209, dtype: int64


In [14]:
# Artist labels whose rows are removed from the dataset.
drop_artist = [
    'Byzantine Mosaics',
    'Orthodox Icons',
    'Romanesque Architecture',
    'Viking art',
    'Master of the Small Landscapes',
    'Fayum portrait',
]

# Filter the rows first, then remove the link/file columns and renumber the
# index once at the end of the chain.
is_dropped_artist = df2['artist'].isin(drop_artist)
df2_refined = (
    df2.loc[~is_dropped_artist]
    .drop(columns=['url', 'img', 'file_name'])
    .reset_index(drop=True)
)

# There are no NaN values in the artist column. This can be re-checked with:
# df2[df2['artist'].isna()]

# The 15 most frequent styles, the total row count, and a small sample of rows.
style_counts = df2_refined['style'].value_counts()
print(style_counts.iloc[0:15])
print("\nAmount of pictures:", len(df2_refined))
df2_refined.iloc[10000:10006]
# Later we drop tags too.
# A recommendation is to drop style == "Unknown"

style
Impressionism             16083
Realism                   15764
Romanticism               15010
Expressionism             11455
Post-Impressionism         8147
Baroque                    7496
Art Nouveau (Modern)       7382
Surrealism                 6988
Unknown                    6900
Symbolism                  5224
Neoclassicism              4360
Abstract Expressionism     3909
Rococo                     3537
Northern Renaissance       3046
Cubism                     2530
Name: count, dtype: int64

Amount of pictures: 175313


Unnamed: 0,artist,style,genre,movement,tags
10000,Frans Snyders,Baroque,animal painting,Baroque,['Mythology']
10001,Frans Snyders,Baroque,animal painting,Baroque,"['animals', 'birds']"
10002,Frans Snyders,Baroque,still life,Baroque,"['fruits-and-vegetables', 'dishware-and-cutler..."
10003,Frans Snyders,Baroque,still life,Baroque,"['fruits-and-vegetables', 'dishware-and-cutler..."
10004,Frans Snyders,Baroque,still life,Baroque,['food-and-beverages']
10005,Frans Snyders,Baroque,literary painting,Baroque,"['animals', 'foxes', 'herons', 'Wildlife', 'Te..."


In [12]:
# Write the refined dataset to disk; the row index is not stored in the file.
refined_csv_path = 'datasets/wikiart_paintings_refined.csv'
df2_refined.to_csv(refined_csv_path, index=False)

## Group artists

One way to group:
<details><summary><u>First method</u></summary>
<p>
    
```python
df2_grouped = (df2_refined.drop(columns=['tags'])).groupby(['style','artist', 'movement'])['movement'].count()
df2_grouped = df2_grouped.to_frame()
df2_grouped
#Kind of hard to deal with this frame because it has a multi-level (MultiIndex) row index, so you have to go through the index levels to get at the rows.
```

</p>
</details>

Better method:

In [18]:
# Count the rows for every (style, artist, movement) combination and flatten
# the result into an ordinary frame with a 'count' column.
df2_grouped2 = (
    df2_refined
    .drop(columns=['tags'])
    .pivot_table(index=['style', 'artist', 'movement'], aggfunc='size')
    .reset_index(name='count')
)
df2_grouped2.head(5)

Unnamed: 0,style,artist,movement,count
0,Abstract Art,Ad Reinhardt,Abstract Expressionism,15
1,Abstract Art,Adnan Coker,Abstract Art,25
2,Abstract Art,Akkitham Narayanan,Abstract Art,17
3,Abstract Art,Alberto Magnelli,Abstract Art,19
4,Abstract Art,Alekos Kontopoulos,Social Realism,26


In [None]:
# Optional probe for a single artist:
#print(df2_grouped2[df2_grouped2['artist']=="Henri Matisse"])

# First 15 distinct movements and how many distinct movements there are.
movement_names = df2_grouped2['movement'].unique()
print(movement_names[0:15], '...', '\n Movements:', len(movement_names), '\n')

# Optional probe for the styles found inside one movement:
#print('Baroque movement styles:\n', df2_grouped2[df2_grouped2['movement']=='Baroque']['style'].value_counts())

# Summary of the artist column (count / unique / top / freq).
artist_summary = df2_grouped2['artist'].describe()
print("Artists:\n", artist_summary)

# All groups of one example artist, largest group first.
picasso_rows = df2_grouped2.loc[df2_grouped2['artist'] == "Pablo Picasso"]
picasso_rows.sort_values(by=['count'], ascending=False)

In [None]:
print("Amount of groups:", len(df2_grouped2), '\n')

# Groups with more than 1200 rows.
group_sizes = df2_grouped2['count']
biggest_groups = df2_grouped2[group_sizes > 1200]
print("Highest style-artist combinations:\n", biggest_groups, '\n\n')

# Groups whose size equals the smallest size present; show the first six and
# how many such groups exist.
fewest = min(group_sizes)
smallest_groups = df2_grouped2[group_sizes == fewest]
print(smallest_groups[0:6], "\n ... \n Rows with 1 count:", len(smallest_groups), '\n\n')

# Groups whose style is "Unknown", largest first.
unknown_style = df2_grouped2[df2_grouped2['style'] == "Unknown"]
print("Unknown styles amount per artist:\n", unknown_style.sort_values(by=['count'], ascending=False))

In [15]:
# Store the per-(style, artist, movement) counts as CSV, without the row index.
grouped_csv_path = 'datasets/wikiart_artists_styles_grouped.csv'
df2_grouped2.to_csv(grouped_csv_path, index=False)

## Fetching birthplaces
We gathered the birthplace of artists from the Wikipedia API.
The fetching mechanism is stored in *wikiart_birthplace_fetch_script.py*; for a more detailed explanation, see the notebook (*wikiart_birthplace_fetch_notebook.ipynb*). The general output is *wikiart_paintings_with_artist_birthplaces.csv*. A cleaned version is *wikiart_paintings_with_artist_birthplaces_cleaned.csv*, which has some birthplaces added manually and from which the remaining artists with no birthplace have been dropped.

In [16]:
# Paintings joined with artist birthplaces: the cleaned file and the general
# (non-cleaned) output of the fetch step.
cleaned_birthplaces_csv = "datasets/wikiart_paintings_with_artist_birthplaces_cleaned.csv"
noncleaned_birthplaces_csv = "datasets/wikiart_paintings_with_artist_birthplaces.csv"

paintings_with_birthplaces = pd.read_csv(cleaned_birthplaces_csv)
paintings_with_birthplaces_noncleaned = pd.read_csv(noncleaned_birthplaces_csv)

paintings_with_birthplaces.head(n=3)

Unnamed: 0,artist,style,genre,movement,tags,url,img,file_name,birth_place
0,Andrei Rublev,Moscow school of icon painting,religious painting,Byzantine Art,"['Christianity', 'saints-and-apostles', 'angel...",https://www.wikiart.org/en/andrei-rublev/angel...,https://uploads5.wikiart.org/images/andrei-rub...,692-angel-presents-monk-pachomius-cenobitic-mo...,Grand Principality of Moscow
1,Andrei Rublev,Moscow school of icon painting,religious painting,Byzantine Art,"['Christianity', 'Old-Testament', 'Daniel', 'p...",https://www.wikiart.org/en/andrei-rublev/proph...,https://uploads8.wikiart.org/images/andrei-rub...,693-prophet-daniel.jpg,Grand Principality of Moscow
2,Andrei Rublev,Moscow school of icon painting,miniature,Byzantine Art,"['Christianity', 'saints-and-apostles', 'Khitr...",https://www.wikiart.org/en/andrei-rublev/st-jo...,https://uploads7.wikiart.org/images/andrei-rub...,694-st-john-the-evangelist.jpg,Grand Principality of Moscow


In [31]:
# One row per distinct (artist, birth_place) pair, renumbered from 0.
artist_birthplace_pairs = paintings_with_birthplaces.loc[:, ['artist', 'birth_place']]
artists = artist_birthplace_pairs.drop_duplicates().reset_index(drop=True)
artists

Unnamed: 0,artist,birth_place
0,Andrei Rublev,Grand Principality of Moscow
1,Ivan Rutkovych,Zolochiv Raion
2,Facundus,León
3,Hildegard of Bingen,Bermersheim vor der Höhe
4,Nicholas of Verdun,Verdun
...,...,...
2931,Toshio Saeki,Tokyo
2932,Emily Kame Kngwarreye,Northern Territory
2933,Johnny Warangkula Tjupurrula,Napperby Station
2934,Norval Morrisseau,Greenstone


The birthplaces of some artists were added manually where they were missing; these are stored in *datasets/artist_birthplaces_manually_created.csv*. This covers every artist with at least 100 paintings in the dataset, plus some other artists. About 200 are still missing, although not all of them are real artists (e.g. 'Viking art' or 'Fayum portrait'):

In [18]:
# Artists present in the non-cleaned birthplace file but absent from `artists`
# (which is derived from the cleaned file), in order of first appearance.
#
# The names already covered are collected into a set once. The previous
# expression re-evaluated artists['artist'].unique() and scanned that array
# for every candidate name, which made the lookup quadratic for no benefit.
known_artists = set(artists['artist'].unique())
x = [
    a
    for a in paintings_with_birthplaces_noncleaned['artist'].unique()
    if a not in known_artists
]
x

['Ende',
 'Herrad of Landsberg',
 'Claricia',
 'Viking art',
 'Toros Roslin',
 'Fayum portrait',
 'Il Sassetta (Stefano di Giovanni)',
 'Nuno Gonçalves',
 'Jean Hey',
 'Cristovao de Figueiredo',
 'Master of the Small Landscapes',
 'Isaac Fuller',
 'Jacob Peter Gowy',
 'Jan Dirksz Both',
 'Simon Ushakov',
 'Cornelis Norbertus Gysbrechts',
 'Johann Georg Pinzel',
 'Joseph Duplessis',
 'Carl-Ludwig Johann Christineck',
 'Marcos Zapata',
 'Claude-Joseph Vernet',
 'Mikhail Shibanov',
 'John Frederick Herring Sr.',
 'Fyodor Solntsev',
 'Édouard De Bièfve',
 'Rosario Weiss Zorrilla',
 'George Hemming Mason',
 'Michela De Vito',
 'Alexey  Bogolyubov',
 'Richard Caton Woodville Sr.',
 'Johann Koler',
 'August Friedrich Schenck',
 'Berthold  Woltze',
 'Rafael García Hispaleto (El Hispaleto)',
 'Alfred Concanen',
 'Mary Josephine Walters',
 'Edward R. Taylor',
 'Richard Caton Woodville Jr.',
 'Herbert Gustave Schmalz (Herbert Carmichael)',
 'Giovanni Battista Torriglia',
 'Angelo Zoffoli',
 'Var

## Checking for issues
In the Art500k datasets, a common issue is artists stored with different names (e.g. "Rembrandt" and "Rembrandt van Rijn"). Let's check if this is the case here too:

In [40]:
# Each entry lists the spellings probed for one painter. An artist row matches
# an entry when its name contains any of the listed spellings; the strings are
# treated as regular expressions by str.contains.
spelling_probes = [
    ["Rembrandt"],
    ["Gogh"],
    ["Picasso"],
    ["(?i)Da Vinci"],  # inline flag makes this probe case-insensitive
    ["Michelangelo"],
    ["Monet"],
    ["Renoir"],
    ["Dali", "Dalí"],
    ["Cezanne", "Cézanne"],
]

# Print, per painter, every distinct stored name that matched and how often it
# occurs in `artists`.
for spellings in spelling_probes:
    name_hits = artists["artist"].str.contains(spellings[0])
    for alternative in spellings[1:]:
        name_hits = name_hits | artists["artist"].str.contains(alternative)
    print(artists[name_hits]["artist"].value_counts())

artist
Rembrandt          1
Rembrandt Peale    1
Name: count, dtype: int64
artist
Vincent van Gogh    1
Name: count, dtype: int64
artist
Pablo Picasso    1
Name: count, dtype: int64
artist
Leonardo da Vinci    1
Name: count, dtype: int64
artist
Michelangelo               1
Michelangelo Pistoletto    1
Name: count, dtype: int64
artist
Claude Monet    1
Name: count, dtype: int64
artist
Pierre-Auguste Renoir    1
Name: count, dtype: int64
artist
Salvador Dali    1
Name: count, dtype: int64
artist
Paul Cezanne    1
Name: count, dtype: int64


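Apparently not: each name appears only once. (Rembrandt Peale and Michelangelo Pistoletto are different artists, not alternative spellings of Rembrandt and Michelangelo.)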

One issue is that some possibly important painters may be missing. In the raw data, I did not find any match for Chagall (or Shagal).