# Import libraries and files

In [1]:
import numpy as np
import pandas as pd
import os 
import shutil 
import time
import re

In [2]:
# Raise the pandas display limits so wide/long frames are rendered in full.
for option_name, option_value in (
    ('display.max_rows', 200),
    ('display.max_columns', 200),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

In [3]:
# Load the per-image metadata table that every later cell works on.
art = pd.read_csv(filepath_or_buffer="all_data_info.csv")

## Specific checks

In [28]:
# Show the metadata row for one specific image file.
art.loc[art['new_filename'] == '2965.jpg']

Unnamed: 0,artist,date,genre,pixelsx,pixelsy,size_bytes,source,style,title,artist_group,in_train,new_filename,date_mod,century,max_century
74474,Stefan Luchian,,landscape,592.0,476.0,71887.0,wikiart,Impressionism,After Rain,train_and_test,False,2965.jpg,-100,0,20


In [29]:
# Paul Cezanne works that are not flagged as training images.
# Both conditions are combined into one mask: the previous art[...][...] form
# applied the second, full-length mask to an already filtered frame, which
# forces pandas to reindex it and emits a UserWarning.
art[(art['artist'] == 'Paul Cezanne') & art['in_train'].eq(False)]

  """Entry point for launching an IPython kernel.


Unnamed: 0,artist,date,genre,pixelsx,pixelsy,size_bytes,source,style,title,artist_group,in_train,new_filename,date_mod,century,max_century
30593,Paul Cezanne,1878,genre painting,3176.0,2618.0,1139110.0,wikiart,Post-Impressionism,Four Bathers,train_and_test,False,19446.jpg,1878,19,20
30594,Paul Cezanne,1894,still life,3176.0,2606.0,737724.0,wikiart,Post-Impressionism,"Curtain, Jug and Fruit",train_and_test,False,94593.jpg,1894,19,20
30598,Paul Cezanne,1877,symbolic painting,3176.0,2600.0,958500.0,wikiart,Impressionism,The Eternal Woman,train_and_test,False,64480.jpg,1877,19,20
30608,Paul Cezanne,c.1905,landscape,3176.0,2552.0,836544.0,wikiart,Cubism,Montagne Sainte-Victoire and the Black Chateau,train_and_test,False,71902.jpg,1905,20,20
30609,Paul Cezanne,c.1890,landscape,3176.0,2551.0,811453.0,wikiart,Post-Impressionism,Mont Sainte-Victoire,train_and_test,False,11113.jpg,1890,19,20
30612,Paul Cezanne,c.1875,genre painting,3176.0,2540.0,884549.0,wikiart,Impressionism,Afternoon in Naples,train_and_test,False,7959.jpg,1875,19,20
30621,Paul Cezanne,1865,still life,3176.0,2497.0,582879.0,wikiart,Romanticism,Still Life with Bread and Eggs,train_and_test,False,71756.jpg,1865,19,20
30623,Paul Cezanne,1890,landscape,3176.0,2473.0,920524.0,wikiart,Post-Impressionism,House with Red Roof,train_and_test,False,34815.jpg,1890,19,20
30630,Paul Cezanne,c.1866,portrait,2536.0,3061.0,856153.0,wikiart,Romanticism,Portrait of Uncle Dominique as a Monk,train_and_test,False,64759.jpg,1866,19,20
30636,Paul Cezanne,c.1895,self-portrait,2536.0,3001.0,1097996.0,wikiart,Post-Impressionism,Self-Portrait,train_and_test,False,22331.jpg,1895,19,20


## EDA

### Analysis over time

In [6]:
def extract_year(x):
    """Return the first run of digits in ``x`` as an int, or -100 if there is none.

    ``x`` is converted with ``str`` before searching, so non-string inputs
    such as floats, NaN or None are accepted. Only the first digit run is
    used (e.g. ``'c.1905'`` -> 1905, ``'1900.0'`` -> 1900).

    Returns:
        int: the parsed number, or the sentinel -100 when no digits are found.
    """
    match = re.search(r'\d+', str(x))
    # Explicit no-match check instead of a bare ``except`` so unrelated
    # errors (and KeyboardInterrupt) are no longer silently swallowed.
    if match is None:
        return -100
    return int(match.group(0))

# Parse a numeric year out of the free-text date, bucket it into a century
# (the -100 "no year" sentinel lands in bucket 0), then count rows per century.
art['date_mod'] = art['date'].apply(extract_year)
art['century'] = art['date_mod'] // 100 + 1
art.groupby('century')[['in_train']].count()

Unnamed: 0_level_0,in_train
century,Unnamed: 1_level_1
0,26366
2,1
10,2
11,5
12,49
13,36
14,255
15,1989
16,3093
17,3494


In [7]:
# Attach each artist's latest century to all of that artist's rows.
# Presumably this lets undated works (century 0) be placed via the artist's
# dated works - confirm against how max_century is used downstream.
max_century = (
    art[['artist', 'century']]
    .groupby(['artist'])
    .max()
    .reset_index()
    .rename(columns={'century': 'max_century'})
)

# Drop a max_century column left over from an earlier run first; otherwise
# re-executing this cell would produce max_century_x / max_century_y.
art = art.drop(columns='max_century', errors='ignore').merge(max_century, on='artist')

# Row count per latest-century bucket.
art[['max_century', 'in_train']].groupby(['max_century']).count()

Unnamed: 0_level_0,in_train
max_century,Unnamed: 1_level_1
0,1667
11,7
12,42
13,56
14,327
15,1874
16,4532
17,4505
18,3288
19,18130


In [8]:
# Sum of the in_train flag per latest-century bucket.
art.groupby('max_century')[['in_train']].sum()

Unnamed: 0_level_0,in_train
max_century,Unnamed: 1_level_1
0,1293.0
11,7.0
12,34.0
13,41.0
14,246.0
15,1390.0
16,3574.0
17,3416.0
18,2511.0
19,13934.0


In [9]:
# Combined row count of the 150 artists with the most rows among artists
# whose max_century is greater than 18.
(
    art.loc[art['max_century'] > 18, ['artist', 'in_train']]
    .groupby('artist')
    .count()
    .sort_values(by='in_train', ascending=False)
    .head(150)
    .sum()
)

in_train    42640
dtype: int64

In [10]:
# Gap check: artists whose max_century is 0 (no row of theirs yielded a year),
# ordered by how many rows they have.
(
    art.loc[art['max_century'] == 0, ['artist', 'in_train']]
    .groupby('artist')
    .count()
    .sort_values(by='in_train', ascending=False)
    .head(150)
)

Unnamed: 0_level_0,in_train
artist,Unnamed: 1_level_1
Antoine Blanchard,178
Maurice Utrillo,148
John Miller,80
Edmund Dulac,68
William Orpen,68
Charles Tunnicliffe,60
Adam Baltatu,60
John Austen,57
Mustafa Duzgunman,50
Jacob van Strij,42


## Summaries

In [11]:
# Rows per (style, artist) pair as a flat frame; the 'pixelsx' column holds
# the count of non-null pixelsx values in each group.
summary = (
    art[['artist', 'style', 'pixelsx']]
    .groupby(['style', 'artist'])
    .count()
    .reset_index()
)

In [12]:
# Top 10 artists by number of rows
(
    art[['artist', 'in_train']]
    .groupby('artist')
    .count()
    .sort_values(by='in_train', ascending=False)
    .head(10)
)

Unnamed: 0_level_0,in_train
artist,Unnamed: 1_level_1
John Singer Sargent,500
Rembrandt,500
Pablo Picasso,500
Ivan Shishkin,500
Ivan Aivazovsky,500
Giovanni Battista Piranesi,500
Gustave Dore,500
Pierre-Auguste Renoir,500
Ilya Repin,500
Marc Chagall,500


In [13]:
# Top 10 genres by number of rows
(
    art[['genre', 'in_train']]
    .groupby('genre')
    .count()
    .sort_values(by='in_train', ascending=False)
    .head(10)
)

Unnamed: 0_level_0,in_train
genre,Unnamed: 1_level_1
portrait,16847
landscape,15006
genre painting,14260
abstract,9498
religious painting,7429
cityscape,5348
sketch and study,3644
illustration,3202
still life,3132
symbolic painting,2545


In [14]:
# Top 10 styles by number of rows
(
    art[['style', 'in_train']]
    .groupby('style')
    .count()
    .sort_values(by='in_train', ascending=False)
    .head(10)
)

Unnamed: 0_level_0,in_train
style,Unnamed: 1_level_1
Impressionism,10643
Realism,10523
Romanticism,9285
Expressionism,7013
Post-Impressionism,5778
Art Nouveau (Modern),4899
Baroque,4400
Surrealism,4167
Symbolism,3476
Rococo,2733


### Filter for image upload

In [15]:
### Filter for at least 100 images

#art['category_combination'] = art['genre'] + "_" + art['style'] + "_" + art['artist']
#filt = art[art['in_train'] == True] 
#filt = filt[['category_combination','in_train']].groupby(['category_combination']).count().sort_values(by = 'in_train', ascending= False).head(100).reset_index()
#top_combs = filt.category_combination.to_list()
#df_filt = art[art['in_train'] == True][art['category_combination'].isin(top_combs)]
#df_filt.head()

In [16]:
### Filter for the top 100 artists with max_century > 18, then top genres and styles


def top_values(frame, column, n):
    """Return the ``n`` most frequent values of ``column`` in ``frame``.

    Frequency is the number of non-null ``in_train`` entries per value;
    the result is a list ordered from most to least frequent.
    """
    counts = frame[[column, 'in_train']].groupby([column]).count()
    ranked = counts.sort_values(by='in_train', ascending=False).head(n)
    return ranked.reset_index()[column].to_list()


# Training rows of artists whose max_century is greater than 18.
# One combined mask replaces art[...][...]: the chained form applied a
# full-length mask to an already filtered frame, which makes pandas reindex
# the mask and emit a UserWarning.
print(art.shape)
art_filt = art[(art['max_century'] > 18) & art['in_train'].eq(True)]
print(art_filt.shape)

# Filter top artists
top_100 = top_values(art_filt, 'artist', 100)
art_filt = art_filt[art_filt['artist'].isin(top_100)]
print(art_filt.shape)

# Filter top genres (computed on the artist-filtered frame)
top_15 = top_values(art_filt, 'genre', 15)
art_filt = art_filt[art_filt['genre'].isin(top_15)]
print(art_filt.shape)

# Filter top styles (computed on the artist- and genre-filtered frame)
top_15 = top_values(art_filt, 'style', 15)
art_filt = art_filt[art_filt['style'].isin(top_15)]
print(art_filt.shape)

# Manual drop of artists after other filters applied
remove_list = ['Albert Bierstadt', 'George Romney']
art_filt = art_filt[~art_filt['artist'].isin(remove_list)]
print(art_filt.shape)

(103250, 15)
(66921, 15)
(26326, 15)
(24305, 15)
(21792, 15)
(21763, 15)


  


In [17]:
# Keep only the file name and the three label columns.
label_columns = ['new_filename', 'genre', 'style', 'artist']
train_df = art_filt.loc[:, label_columns]
train_df.shape

(21763, 4)

In [31]:
# Rows per genre in the training selection, largest first.
(
    train_df
    .groupby('genre')
    .count()
    .sort_values(by='style', ascending=False)
)

Unnamed: 0_level_0,new_filename,style,artist
genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
landscape,4847,4847,4847
portrait,4405,4405,4405
genre painting,3620,3620,3620
cityscape,1467,1467,1467
sketch and study,1212,1212,1212
illustration,1005,1005,1005
symbolic painting,960,960,960
design,834,834,834
still life,737,737,737
nude painting (nu),614,614,614


In [22]:
# Write the training labels to disk without the index column, then preview them.
train_df.to_csv(path_or_buf="train_df.csv", index=False)
train_df.head()

Unnamed: 0,new_filename,genre,style,artist
364,99442.jpg,marina,Romanticism,Ivan Aivazovsky
365,81750.jpg,marina,Romanticism,Ivan Aivazovsky
366,82140.jpg,marina,Romanticism,Ivan Aivazovsky
367,74871.jpg,marina,Romanticism,Ivan Aivazovsky
369,7390.jpg,marina,Romanticism,Ivan Aivazovsky


### Create file transfer

In [20]:
# File names of every image selected for training.
file_list = train_df['new_filename'].tolist()

In [21]:
# Copy every selected image from the training folder into a separate folder.
# NOTE(review): these are machine-specific absolute paths; consider moving
# them to a configuration cell before sharing the notebook.
start_time = time.time()
source_path = 'C:\\Users\\Mike\\Documents\\Analytics\\Personal projects\\huge_art_collection\\train\\'
destination_path = 'C:\\Users\\Mike\\Documents\\Analytics\\Personal projects\\huge_art_collection\\copy_files\\'

# Make sure the target folder exists so the first copy cannot fail on it.
os.makedirs(destination_path, exist_ok=True)

for file_name in file_list:
    shutil.copyfile(
        os.path.join(source_path, file_name),
        os.path.join(destination_path, file_name),
    )

end_time = time.time()
print("time taken: ", end_time - start_time)

time taken:  1043.89142537117


In [45]:
# All rows with genre 'design' and style 'Art Nouveau (Modern)'.
# A single combined mask avoids the UserWarning raised by art[...][...],
# where the second full-length mask had to be reindexed to the filtered frame.
art[(art['genre'] == 'design') & (art['style'] == 'Art Nouveau (Modern)')]

  """Entry point for launching an IPython kernel.


Unnamed: 0,artist,date,genre,pixelsx,pixelsy,size_bytes,source,style,title,artist_group,in_train,new_filename,date_mod,century,max_century
4390,Odilon Redon,c.1907,design,840.0,1113.0,311877.0,wikiart,Art Nouveau (Modern),Decoration,train_and_test,True,11932.jpg,1907,20,20
4472,Odilon Redon,c.1909,design,646.0,1008.0,172282.0,wikiart,Art Nouveau (Modern),Design for a Prayer Rug,train_and_test,False,53185.jpg,1909,20,20
5804,Henri de Toulouse-Lautrec,c.1896,design,603.0,768.0,86582.0,wikiart,Art Nouveau (Modern),Couverture for Elles,train_and_test,True,63838.jpg,1896,19,20
5836,Henri de Toulouse-Lautrec,1893,design,700.0,601.0,289688.0,wikiart,Art Nouveau (Modern),Coverage of the Original Print,train_and_test,True,49929.jpg,1893,19,20
5973,Konstantin Somov,1907,design,761.0,1000.0,220236.0,wikiart,Art Nouveau (Modern),Title Page of 'Theatre',train_and_test,True,19537.jpg,1907,20,20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101257,Emile Galle,1900.0,design,311.0,570.0,54040.0,wikiart,Art Nouveau (Modern),Fireflies,train_and_test,True,39922.jpg,1900,20,20
101258,Emile Galle,1900.0,design,346.0,480.0,101305.0,wikiart,Art Nouveau (Modern),"Vase mit Irisblüten, Frankreich",train_and_test,True,11096.jpg,1900,20,20
101892,Jester,1925,design,475.0,667.0,85352.0,wikiart,Art Nouveau (Modern),"Costume designs for ""Petrushka"" by Stravinsky ...",train_and_test,True,74749.jpg,1925,20,20
102417,"Nikolai Evreinov ""Pro Scena Suo..""",1915,design,454.0,646.0,103982.0,wikiart,Art Nouveau (Modern),Draft for a book cover,train_and_test,True,99511.jpg,1915,20,20


In [37]:
# Non-null value count of every column, per genre.
art.groupby('genre').count()

Unnamed: 0_level_0,artist,date,pixelsx,pixelsy,size_bytes,source,style,title,artist_group,in_train,new_filename,date_mod,century,max_century
genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
abstract,9498,8148,9498,9498,9498,9498,9465,9498,9498,9498,9498,9498,9498,9498
advertisement,82,67,82,82,82,82,82,82,82,82,82,82,82,82
allegorical painting,1034,762,1034,1034,1034,1034,1033,1034,1034,1034,1034,1034,1034,1034
animal painting,1571,1025,1571,1571,1571,1571,1552,1571,1571,1571,1571,1571,1571,1571
battle painting,358,210,358,358,358,358,358,358,358,358,358,358,358,358
bijinga,95,59,95,95,95,95,94,95,95,95,95,95,95,95
bird-and-flower painting,119,49,119,119,119,119,118,119,119,119,119,119,119,119
calligraphy,160,12,160,160,160,160,155,160,160,160,160,160,160,160
capriccio,236,185,236,236,236,236,236,236,236,236,236,236,236,236
caricature,231,138,231,231,231,231,231,231,231,231,231,231,231,231
