# Creating a database with artists from image filenames and cleaned art styles

In [1]:
import pandas as pd

In [2]:
from sklearn.preprocessing import MultiLabelBinarizer

In [3]:
# Load the artist metadata table. The preview printed later in this notebook shows
# columns such as id, name, years, genre, nationality, bio, wikipedia and paintings.
# The path is relative, so it resolves against the kernel's working directory.
df = pd.read_csv('../raw_data/artists.csv')

## 1. Creating broader art style labels

In [4]:
# Lookup table from a raw `genre` string to its simplified label string.
# Keys are exact raw strings; artists with several genres use comma-separated
# labels with no space after the comma, and the values follow the same format.
# Most entries map to themselves; the entries that fold a sub-style into a
# broader label are marked inline.
genre_mapping = {
    "Expressionism": "Expressionism",
    "Expressionism,Abstractionism": "Expressionism,Abstractionism",
    "Social Realism,Muralism": "Realism,Muralism",  # Social Realism -> Realism
    "Impressionism": "Impressionism",
    "Surrealism,Impressionism": "Surrealism,Impressionism",
    "Surrealism": "Surrealism",
    "Realism,Impressionism": "Realism,Impressionism",
    "Byzantine Art": "Byzantine Art",
    "Post-Impressionism": "Impressionism",  # Post-Impressionism -> Impressionism
    "Symbolism,Art Nouveau": "Symbolism,Art Nouveau",
    "Northern Renaissance": "Renaissance",  # all Renaissance variants -> Renaissance
    "Suprematism": "Suprematism",
    "Symbolism": "Symbolism",
    "Cubism": "Cubism",
    "Baroque": "Baroque",
    "Romanticism": "Romanticism",
    "Primitivism,Surrealism": "Primitivism,Surrealism",
    "Mannerism": "Mannerism",
    "Primitivism": "Primitivism",
    "Proto Renaissance": "Renaissance",
    "Early Renaissance": "Renaissance",
    "High Renaissance": "Renaissance",
    "Impressionism,Post-Impressionism": "Impressionism",  # both parts collapse to one label
    "High Renaissance,Mannerism": "Renaissance,Mannerism",
    "Realism": "Realism",
    "Symbolism,Expressionism": "Symbolism,Expressionism",
    "Expressionism,Abstractionism,Surrealism": "Expressionism,Abstractionism,Surrealism",
    "Neoplasticism": "Neoplasticism",
    "Pop Art": "Pop Art",
    "Symbolism,Post-Impressionism": "Symbolism,Impressionism",
    "Abstract Expressionism": "Expressionism"  # Abstract Expressionism -> Expressionism
}

In [5]:
def merge_genres(genre):
    """Return the simplified style label for ``genre``.

    The value is looked up in the module-level ``genre_mapping`` table.
    A value with no entry in the table is returned unchanged.
    """
    return genre_mapping.get(genre, genre)

In [6]:
# Store the simplified label in a new column; the raw `genre` column is left untouched.
df["genre_simplified"] = df["genre"].map(merge_genres)

In [7]:
df["genre_simplified"]

0                               Expressionism
1                Expressionism,Abstractionism
2                            Realism,Muralism
3                               Impressionism
4                    Surrealism,Impressionism
5                                  Surrealism
6                       Realism,Impressionism
7                               Byzantine Art
8                               Impressionism
9                       Symbolism,Art Nouveau
10                                Renaissance
11                                Suprematism
12                                  Symbolism
13                                     Cubism
14                                    Baroque
15                              Impressionism
16                                Romanticism
17                     Primitivism,Surrealism
18                                  Mannerism
19                                Renaissance
20                              Impressionism
21                                

In [8]:
# Split each comma-separated label string into a list of labels, trimming
# surrounding whitespace and discarding pieces that are empty after trimming.
artists_genres = df["genre_simplified"].apply(
    lambda labels: [label for label in map(str.strip, labels.split(',')) if label]
)

In [9]:
# Encode the per-artist label lists as one indicator column per distinct label.
mlb = MultiLabelBinarizer()
genres_encoded = mlb.fit_transform(artists_genres)

# Wrap the encoded matrix in a frame that shares df's index, so the two can be
# placed side by side later; column names are the labels the binarizer learned.
genres_dataframe = pd.DataFrame(
    data=genres_encoded,
    index=df.index,
    columns=mlb.classes_,
)
print(mlb.classes_)

['Abstractionism' 'Art Nouveau' 'Baroque' 'Byzantine Art' 'Cubism'
 'Expressionism' 'Impressionism' 'Mannerism' 'Muralism' 'Neoplasticism'
 'Pop Art' 'Primitivism' 'Realism' 'Renaissance' 'Romanticism'
 'Suprematism' 'Surrealism' 'Symbolism']


In [10]:
# Place the indicator columns to the right of the artist metadata columns.
encoded_genres_dataframe = pd.concat(
    objs=[df, genres_dataframe],
    axis="columns",
)
encoded_genres_dataframe.head()

Unnamed: 0,id,name,years,genre,nationality,bio,wikipedia,paintings,genre_simplified,Abstractionism,...,Muralism,Neoplasticism,Pop Art,Primitivism,Realism,Renaissance,Romanticism,Suprematism,Surrealism,Symbolism
0,0,Amedeo Modigliani,1884 - 1920,Expressionism,Italian,Amedeo Clemente Modigliani (Italian pronunciat...,http://en.wikipedia.org/wiki/Amedeo_Modigliani,193,Expressionism,0,...,0,0,0,0,0,0,0,0,0,0
1,1,Vasiliy Kandinskiy,1866 - 1944,"Expressionism,Abstractionism",Russian,Wassily Wassilyevich Kandinsky (Russian: Васи́...,http://en.wikipedia.org/wiki/Wassily_Kandinsky,88,"Expressionism,Abstractionism",1,...,0,0,0,0,0,0,0,0,0,0
2,2,Diego Rivera,1886 - 1957,"Social Realism,Muralism",Mexican,Diego María de la Concepción Juan Nepomuceno E...,http://en.wikipedia.org/wiki/Diego_Rivera,70,"Realism,Muralism",0,...,1,0,0,0,1,0,0,0,0,0
3,3,Claude Monet,1840 - 1926,Impressionism,French,Oscar-Claude Monet (; French: [klod mɔnɛ]; 14 ...,http://en.wikipedia.org/wiki/Claude_Monet,73,Impressionism,0,...,0,0,0,0,0,0,0,0,0,0
4,4,Rene Magritte,1898 - 1967,"Surrealism,Impressionism",Belgian,René François Ghislain Magritte (French: [ʁəne...,http://en.wikipedia.org/wiki/René_Magritte,194,"Surrealism,Impressionism",0,...,0,0,0,0,0,0,0,0,1,0


## 2. Extracting artist names

In [11]:
import os
import sys

# Make the parent directory (the project root, relative to the kernel's working
# directory) importable so that project packages can be imported below.
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path each time.
project_root = os.path.abspath('..')
if project_root not in sys.path:
    sys.path.append(project_root)

In [12]:
# Project-local helper; it is importable only after the project root has been
# added to sys.path.
from preprocessing_package.extract_artist_name import create_image_artist_df

# Build the image -> artist table from the resized image folder. The preview below
# shows two columns, image_path and artist_name.
# NOTE(review): presumably the artist name is parsed from each filename
# (e.g. "Gustav_Klimt_113.jpg" -> "Gustav Klimt") — confirm against the helper.
image_artist_df = create_image_artist_df(images_root="../raw_data/resized")
image_artist_df.head()

Unnamed: 0,image_path,artist_name
0,../raw_data/resized/Gustav_Klimt_113.jpg,Gustav Klimt
1,../raw_data/resized/Vincent_van_Gogh_388.jpg,Vincent van Gogh
2,../raw_data/resized/Amedeo_Modigliani_24.jpg,Amedeo Modigliani
3,../raw_data/resized/Edgar_Degas_455.jpg,Edgar Degas
4,../raw_data/resized/Edgar_Degas_333.jpg,Edgar Degas


## 3. Merging databases

In [13]:
# Lowercase both name columns so the join below does not depend on capitalisation.
# Note that both source frames are modified in place by these assignments.
encoded_genres_dataframe["name"] = encoded_genres_dataframe["name"].str.lower()
image_artist_df["artist_name"] = image_artist_df["artist_name"].str.lower()

# Left join: every image row is kept and receives the metadata of its artist.
# validate="many_to_one" makes pandas raise a MergeError if an artist name occurs
# more than once on the metadata side, instead of silently duplicating image rows.
merged_df = image_artist_df.merge(
    encoded_genres_dataframe,
    left_on="artist_name",
    right_on="name",
    how="left",
    validate="many_to_one",
)

In [14]:
# Sanity check on the join: each row's artist_name must equal the metadata name it
# was matched with. A row without a match has a missing name and yields False here.
names_match = merged_df['artist_name'].eq(merged_df['name'])
print(names_match.all())

True


In [15]:
# 'name' duplicates 'artist_name' after the join, so it is removed.
# errors='ignore' keeps this cell safe to re-run: merged_df is reassigned here, so on
# a second execution 'name' is already gone and a plain drop would raise KeyError.
merged_df = merged_df.drop(columns=['name'], errors='ignore')

In [16]:
merged_df.head()

Unnamed: 0,image_path,artist_name,id,years,genre,nationality,bio,wikipedia,paintings,genre_simplified,...,Muralism,Neoplasticism,Pop Art,Primitivism,Realism,Renaissance,Romanticism,Suprematism,Surrealism,Symbolism
0,../raw_data/resized/Gustav_Klimt_113.jpg,gustav klimt,9,1862 - 1918,"Symbolism,Art Nouveau",Austrian,"Gustav Klimt (July 14, 1862 – February 6, 1918...",http://en.wikipedia.org/wiki/Gustav_Klimt,117,"Symbolism,Art Nouveau",...,0,0,0,0,0,0,0,0,0,1
1,../raw_data/resized/Vincent_van_Gogh_388.jpg,vincent van gogh,8,1853 – 1890,Post-Impressionism,Dutch,Vincent Willem van Gogh (Dutch: [ˈvɪnsɛnt ˈʋɪl...,http://en.wikipedia.org/wiki/Vincent_van_Gogh,877,Impressionism,...,0,0,0,0,0,0,0,0,0,0
2,../raw_data/resized/Amedeo_Modigliani_24.jpg,amedeo modigliani,0,1884 - 1920,Expressionism,Italian,Amedeo Clemente Modigliani (Italian pronunciat...,http://en.wikipedia.org/wiki/Amedeo_Modigliani,193,Expressionism,...,0,0,0,0,0,0,0,0,0,0
3,../raw_data/resized/Edgar_Degas_455.jpg,edgar degas,30,1834 - 1917,Impressionism,French,Edgar Degas (US: or UK: ; born Hilaire-Germai...,http://en.wikipedia.org/wiki/Edgar_Degas,702,Impressionism,...,0,0,0,0,0,0,0,0,0,0
4,../raw_data/resized/Edgar_Degas_333.jpg,edgar degas,30,1834 - 1917,Impressionism,French,Edgar Degas (US: or UK: ; born Hilaire-Germai...,http://en.wikipedia.org/wiki/Edgar_Degas,702,Impressionism,...,0,0,0,0,0,0,0,0,0,0


In [17]:
merged_df.columns

Index(['image_path', 'artist_name', 'id', 'years', 'genre', 'nationality',
       'bio', 'wikipedia', 'paintings', 'genre_simplified', 'Abstractionism',
       'Art Nouveau', 'Baroque', 'Byzantine Art', 'Cubism', 'Expressionism',
       'Impressionism', 'Mannerism', 'Muralism', 'Neoplasticism', 'Pop Art',
       'Primitivism', 'Realism', 'Renaissance', 'Romanticism', 'Suprematism',
       'Surrealism', 'Symbolism'],
      dtype='object')

In [18]:
# Save the merged image/artist/genre table to CSV.
# index=False leaves the row index out of the file, so no extra unnamed index
# column appears when the file is read back.
merged_df.to_csv("../raw_data/merged_df.csv", index=False)

In [20]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7610 entries, 0 to 7609
Data columns (total 28 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   image_path        7610 non-null   object
 1   artist_name       7610 non-null   object
 2   id                7610 non-null   int64 
 3   years             7610 non-null   object
 4   genre             7610 non-null   object
 5   nationality       7610 non-null   object
 6   bio               7610 non-null   object
 7   wikipedia         7610 non-null   object
 8   paintings         7610 non-null   int64 
 9   genre_simplified  7610 non-null   object
 10  Abstractionism    7610 non-null   int64 
 11  Art Nouveau       7610 non-null   int64 
 12  Baroque           7610 non-null   int64 
 13  Byzantine Art     7610 non-null   int64 
 14  Cubism            7610 non-null   int64 
 15  Expressionism     7610 non-null   int64 
 16  Impressionism     7610 non-null   int64 
 17  Mannerism     