___
### IMPORTS

In [105]:
# Modules used for data handling / test
import pickle


# Modules used for EDA
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns


# Modules used for ML
from sklearn import preprocessing

In [106]:
# Modules settings: render matplotlib figures inline in the notebook
%matplotlib inline

___
### GETTING DATASET

In [107]:
# Load the pickled museum dataset from disk.
# NOTE: pickle.load executes arbitrary code from the file, so only use it on trusted data.
with open('./data/large_museum/large_museum', mode='rb') as raw_file:
    museum = pickle.load(raw_file)

In [108]:
# Preview the loaded dataset
museum

Unnamed: 0,img_ID,artist,height,width,whitespace,chiaroscuro,color_01,color_02,color_03,color_04,color_05,color_06,color_07,color_08,color_09,color_10
0,9223372032559824886,caravaggio,250,211,0.39810,0.00659,#3F0000,#FFFF7F,#3F003F,#BF3F3F,#FFBF7F,#7F0000,#BF7F3F,#3F0000,#FFFFBF,#BFBFBF
1,186636,caravaggio,250,239,0.39833,0.01178,#003F00,#BF7F7F,#3F0000,#FFBF7F,#7F0000,#3F7F3F,#3F003F,#FFFFFF,#BF7F00,#3F3F00
2,186724,caravaggio,250,169,0.39763,0.01400,#3F0000,#BFBFBF,#7F3F7F,#003F3F,#003F00,#3F3F3F,#FFFFFF,#7F3F3F,#BF7F7F,#FFFF7F
3,186639,caravaggio,250,347,0.39885,0.02416,#3F3F00,#BF7F00,#7F7F3F,#FFBF7F,#7F003F,#3F003F,#3F3F3F,#7F3F00,#3F3F3F,#BF7F3F
4,186671,caravaggio,250,328,0.39878,0.01706,#7F003F,#BF7F7F,#00003F,#7F0000,#3F0000,#FFBFBF,#3F0000,#003F00,#7F7F3F,#FF7F3F
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4633,229217,velazquez,250,362,0.39890,0.03147,#000000,#7F7F7F,#00003F,#BF7F7F,#003F3F,#7F003F,#7F7F00,#3F3F00,#FFFFBF,#7F0000
4634,229182,velazquez,250,192,0.39792,191.00000,#7F7F00,#000000,#BFBF00,#FF3F00,#BF0000,#7F0000,#FFBF00,#FF003F,#7F3F00,#FF7F00
4635,229183,velazquez,250,204,0.39804,0.00714,#BFBF7F,#3F0000,#003F3F,#7F7F00,#000000,#FFBFBF,#3F3F3F,#3F7F7F,#BF3F3F,#7F0000
4636,9223372032559882013,velazquez,250,138,0.39710,0.01824,#BF7F7F,#3F0000,#7F3F00,#003F00,#BF3F00,#7F7F7F,#3F3F3F,#FFFFBF,#3F7F00,#003F00


___
### BASIC EDA

The `height` column appears to be constant (250 in every row shown), so it carries no information by itself; the height/width ratio, however, may be useful. Let's clean up!

In [109]:
# Aspect ratio (height over width) of every image
aspect_ratio = museum['height'].div(museum['width'])
museum['ratio'] = aspect_ratio

# 'height' is no longer needed once the ratio has been stored
museum.drop(columns='height', inplace=True)

In [110]:
# Check the first rows after replacing 'height' with 'ratio'
museum.head()

Unnamed: 0,img_ID,artist,width,whitespace,chiaroscuro,color_01,color_02,color_03,color_04,color_05,color_06,color_07,color_08,color_09,color_10,ratio
0,9223372032559824886,caravaggio,211,0.3981,0.00659,#3F0000,#FFFF7F,#3F003F,#BF3F3F,#FFBF7F,#7F0000,#BF7F3F,#3F0000,#FFFFBF,#BFBFBF,1.184834
1,186636,caravaggio,239,0.39833,0.01178,#003F00,#BF7F7F,#3F0000,#FFBF7F,#7F0000,#3F7F3F,#3F003F,#FFFFFF,#BF7F00,#3F3F00,1.046025
2,186724,caravaggio,169,0.39763,0.014,#3F0000,#BFBFBF,#7F3F7F,#003F3F,#003F00,#3F3F3F,#FFFFFF,#7F3F3F,#BF7F7F,#FFFF7F,1.47929
3,186639,caravaggio,347,0.39885,0.02416,#3F3F00,#BF7F00,#7F7F3F,#FFBF7F,#7F003F,#3F003F,#3F3F3F,#7F3F00,#3F3F3F,#BF7F3F,0.720461
4,186671,caravaggio,328,0.39878,0.01706,#7F003F,#BF7F7F,#00003F,#7F0000,#3F0000,#FFBFBF,#3F0000,#003F00,#7F7F3F,#FF7F3F,0.762195


___
### FEATURE ENGINEERING

Another valuable piece of information is the *'school'* or style of the artist. Some artists practiced several styles, but I'll just use the most significant one and reduce it to two groups: classics and vanguards.

| Artist                 | Movement                    | Group
| :---                   | :---                        | :---
| Caravaggio             | Baroque                     | Classic
| Edgar Degas            | Impressionism               | Classic
| Francisco de Goya      | Romanticism                 | Classic
| Katsushika Hokusai     | Ukiyo-e                     | Classic
| Frida Kahlo            | Surrealism                  | Vanguards
| Wassily Kandinsky      | Expressionism               | Classic
| Gustav Klimt           | Art Nouveau                 | Vanguards
| Roy Lichtenstein       | Pop Art                     | Vanguards
| Piet Mondrian          | De Stijl                    | Vanguards
| Claude Monet           | Impressionism               | Classic
| Pablo Picasso          | Cubism                      | Vanguards
| Jackson Pollock        | Abstract Expressionism      | Vanguards
| Joaquín Sorolla        | Impressionism               | Classic
| Diego Velazquez        | Baroque                     | Classic
| Andy Warhol            | Pop Art                     | Vanguards

In [111]:
# Artists assigned to the "vanguards" group; every other artist counts as a "classic"
vanguards = ['kahlo', 'klimt', 'lichtenstein', 'mondrian',
             'picasso', 'pollock', 'warhol']

# Encode the group as a string flag: '1' for vanguards, '0' for classics
is_vanguard = museum['artist'].isin(vanguards)
museum['group'] = is_vanguard.map({True: '1', False: '0'})

In [112]:
# Check the first rows, now including the new 'group' column
museum.head()

Unnamed: 0,img_ID,artist,width,whitespace,chiaroscuro,color_01,color_02,color_03,color_04,color_05,color_06,color_07,color_08,color_09,color_10,ratio,group
0,9223372032559824886,caravaggio,211,0.3981,0.00659,#3F0000,#FFFF7F,#3F003F,#BF3F3F,#FFBF7F,#7F0000,#BF7F3F,#3F0000,#FFFFBF,#BFBFBF,1.184834,0
1,186636,caravaggio,239,0.39833,0.01178,#003F00,#BF7F7F,#3F0000,#FFBF7F,#7F0000,#3F7F3F,#3F003F,#FFFFFF,#BF7F00,#3F3F00,1.046025,0
2,186724,caravaggio,169,0.39763,0.014,#3F0000,#BFBFBF,#7F3F7F,#003F3F,#003F00,#3F3F3F,#FFFFFF,#7F3F3F,#BF7F7F,#FFFF7F,1.47929,0
3,186639,caravaggio,347,0.39885,0.02416,#3F3F00,#BF7F00,#7F7F3F,#FFBF7F,#7F003F,#3F003F,#3F3F3F,#7F3F00,#3F3F3F,#BF7F3F,0.720461,0
4,186671,caravaggio,328,0.39878,0.01706,#7F003F,#BF7F7F,#00003F,#7F0000,#3F0000,#FFBFBF,#3F0000,#003F00,#7F7F3F,#FF7F3F,0.762195,0


In [113]:
# Number of images (non-null img_ID values) in each group
museum.groupby('group')['img_ID'].count()

group
0    2827
1    1444
Name: img_ID, dtype: int64

___

The color columns hold HEX color codes stored as `object`-dtype strings. I'll use a label encoder to turn them into integers.

In [114]:
# Inspect column dtypes and non-null counts
museum.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4271 entries, 0 to 4637
Data columns (total 17 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   img_ID       4271 non-null   object 
 1   artist       4271 non-null   object 
 2   width        4271 non-null   int64  
 3   whitespace   4271 non-null   float64
 4   chiaroscuro  4271 non-null   float64
 5   color_01     4271 non-null   object 
 6   color_02     4271 non-null   object 
 7   color_03     4271 non-null   object 
 8   color_04     4271 non-null   object 
 9   color_05     4271 non-null   object 
 10  color_06     4271 non-null   object 
 11  color_07     4271 non-null   object 
 12  color_08     4271 non-null   object 
 13  color_09     4271 non-null   object 
 14  color_10     4271 non-null   object 
 15  ratio        4271 non-null   float64
 16  group        4271 non-null   object 
dtypes: float64(3), int64(1), object(13)
memory usage: 600.6+ KB


In [115]:
# Number of distinct colors per color column.
# Columns are selected by name (color_01 ... color_10) instead of by position, so the
# result cannot silently refer to other columns if the frame layout changes.
museum.filter(regex=r'^color_\d{2}$').nunique()

color_01    87
color_02    83
color_03    88
color_04    87
color_05    88
color_06    90
color_07    90
color_08    89
color_09    94
color_10    94
dtype: int64

In [116]:
# Find unique colors in the ten color columns.
# The columns are picked by name (color_01 ... color_10) rather than by position, and
# the values are flattened column by column (order='F') so that pd.unique returns them
# in order of first appearance, column after column.
color_values = museum.filter(regex=r'^color_\d{2}$').to_numpy().ravel(order='F')
unique_colors = pd.unique(color_values).tolist()

# Count colors found
print(len(unique_colors))

103


In [117]:
# Convert the collected colors into a 1D NumPy array with object dtype
unique_colors = np.asarray(unique_colors).astype(object)

unique_colors

array(['#3F0000', '#003F00', '#3F3F00', '#7F003F', '#003F3F', '#FFBF7F',
       '#3F3F3F', '#BF7F7F', '#00003F', '#000000', '#3F003F', '#7F0000',
       '#7F3F00', '#FFBFBF', '#BFBF7F', '#7F7F7F', '#FF7F7F', '#FFFFFF',
       '#7F3F3F', '#7F7F3F', '#BFBF00', '#7FBFBF', '#BF7FBF', '#FFFF7F',
       '#BF0000', '#FFBFFF', '#FFBF3F', '#7F7F00', '#3F7F3F', '#FFBF00',
       '#FFFF3F', '#007F3F', '#FFFFBF', '#BFBFBF', '#BF3F00', '#7F3F7F',
       '#BF7F3F', '#FF7F3F', '#3F7F7F', '#BFFF7F', '#BF3F3F', '#BFBF3F',
       '#3F7F00', '#7FBF3F', '#3F3F7F', '#7FBF00', '#7FBF7F', '#3FBF7F',
       '#BF3F7F', '#7F7FBF', '#BF7F00', '#FF3F3F', '#BFFFBF', '#BFFFFF',
       '#7FBFFF', '#FF7F00', '#FF3F7F', '#BFBFFF', '#FF3F00', '#3F007F',
       '#FF0000', '#003F7F', '#7FFFBF', '#3FBFBF', '#7FFF7F', '#7F3FBF',
       '#7FFFFF', '#00007F', '#003FBF', '#FFFF00', '#00BF7F', '#3F7FBF',
       '#BF003F', '#0000BF', '#FF7FBF', '#3F3FBF', '#007F7F', '#00BFBF',
       '#007FBF', '#007F00', '#7F7FFF', '#3FBFFF', 

In [118]:
# Create the label encoder and fit it on the unique colors in one step
# (LabelEncoder.fit returns the fitted encoder itself)
label_encoder = preprocessing.LabelEncoder().fit(unique_colors)

# Encode the learned classes to display the integer codes in use
label_encoder.transform(label_encoder.classes_)

array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102])

In [119]:
# HEX color columns and the names of their integer-encoded counterparts
col_names = ['color_{:02d}'.format(n) for n in range(1, 11)]
encoded_col_names = ['c{:02d}'.format(n) for n in range(1, 11)]

# Encode every HEX column with the fitted label encoder
for idx, hex_col in enumerate(col_names):
    museum[encoded_col_names[idx]] = label_encoder.transform(museum[hex_col])

# Drop the columns with HEX values now that the encoded versions exist
museum.drop(columns=col_names, inplace=True)

# Show result
museum

Unnamed: 0,img_ID,artist,width,whitespace,chiaroscuro,ratio,group,c01,c02,c03,c04,c05,c06,c07,c08,c09,c10
0,9223372032559824886,caravaggio,211,0.39810,0.00659,1.184834,0,18,100,19,65,95,39,69,18,101,76
1,186636,caravaggio,239,0.39833,0.01178,1.046025,0,5,70,18,95,39,28,19,102,68,22
2,186724,caravaggio,169,0.39763,0.01400,1.479290,0,18,76,45,6,5,23,102,44,70,100
3,186639,caravaggio,347,0.39885,0.02416,0.720461,0,22,68,49,95,40,19,23,43,23,69
4,186671,caravaggio,328,0.39878,0.01706,0.762195,0,40,70,1,39,18,96,18,5,49,89
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4633,229217,velazquez,362,0.39890,0.03147,0.690608,0,0,50,1,70,6,40,48,22,101,39
4634,229182,velazquez,192,0.39792,191.00000,1.302083,0,48,0,73,85,62,39,93,84,43,88
4635,229183,velazquez,204,0.39804,0.00714,1.225490,0,75,18,6,48,0,96,23,29,65,39
4636,9223372032559882013,velazquez,138,0.39710,0.01824,1.811594,0,70,18,43,5,64,50,23,101,27,5


___

Now it's time to apply a standard scaler to `width`, `whitespace`, `chiaroscuro` and `ratio`.

In [120]:
col_names = ['width', 'whitespace', 'chiaroscuro', 'ratio']

encoded_col_names = ['width_std', 'whitespace_std', 'chiaroscuro_std', 'ratio_std']


# Keep the fitted scaler in a variable instead of discarding it, so the same
# mean/variance can later be applied to unseen data or inverted
scaler = preprocessing.StandardScaler()

# Fit on the numeric columns and store the standardized values under new names
museum[encoded_col_names] = scaler.fit_transform(museum[col_names])

# NOTE(review): 'chiaroscuro' contains at least one extreme value (191.0, while the
# other rows shown lie around 0.01-0.03); it dominates the column's mean/std, so
# confirm whether it should be cleaned before scaling

# Drop original columns
museum.drop(labels=col_names, axis=1, inplace=True)

# Show result
museum

Unnamed: 0,img_ID,artist,group,c01,c02,c03,c04,c05,c06,c07,c08,c09,c10,width_std,whitespace_std,chiaroscuro_std,ratio_std
0,9223372032559824886,caravaggio,0,18,100,19,65,95,39,69,18,101,76,-0.595676,-0.555053,-0.121122,0.366144
1,186636,caravaggio,0,5,70,18,95,39,28,19,102,68,22,-0.312410,-0.271210,-0.120839,-0.002834
2,186724,caravaggio,0,18,76,45,6,5,23,102,44,70,100,-1.020574,-1.135080,-0.120717,1.148857
3,186639,caravaggio,0,22,68,49,95,40,19,23,43,23,69,0.780186,0.370522,-0.120163,-0.868238
4,186671,caravaggio,0,40,70,1,39,18,96,18,5,49,89,0.587970,0.284135,-0.120551,-0.757302
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4633,229217,velazquez,0,0,50,1,70,6,40,48,22,101,39,0.931936,0.432227,-0.119765,-0.947593
4634,229182,velazquez,0,48,0,73,85,62,39,93,84,43,88,-0.787892,-0.777191,10.295127,0.677812
4635,229183,velazquez,0,75,18,6,48,0,96,23,29,65,39,-0.666492,-0.629099,-0.121092,0.474214
4636,9223372032559882013,velazquez,0,70,18,43,5,64,50,23,101,27,5,-1.334190,-1.789152,-0.120486,2.032178


___
### BALANCE DATASET

In [130]:
# Class balance: number of images per group.
# 'img_ID' is selected by name rather than with .iloc[:, 0], so the count does not
# depend on which column happens to come first in the frame.
museum.groupby('group')['img_ID'].count()

group
0    2827
1    1444
Name: img_ID, dtype: int64

___
### EXPORT RESULTS

In [121]:
# Persist the processed dataset as a pickle file
with open('./data/large_museum/large_museum_clean', mode='wb') as clean_file:
    pickle.dump(museum, clean_file)