In [88]:
import pandas as pd
import numpy as np
from PIL import Image
import os
import io

import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

In [89]:
# Load and join the dataframes 
wiki_art_imgs = pd.read_csv("wikiart_df.csv")
aic_art_imgs = pd.read_csv("aic_images.csv")


In [90]:
aic_art_imgs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4122 entries, 0 to 4121
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   artwork_type_title  4122 non-null   object
 1   style_title         4122 non-null   object
 2   id                  4122 non-null   int64 
 3   image_id            4122 non-null   object
 4   title               4122 non-null   object
 5   search_term         4122 non-null   object
 6   file_name           4122 non-null   object
dtypes: int64(1), object(6)
memory usage: 225.5+ KB


In [91]:
wiki_art_imgs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4314 entries, 0 to 4313
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                4314 non-null   object 
 1   title             4314 non-null   object 
 2   url               4314 non-null   object 
 3   artistUrl         4314 non-null   object 
 4   artistName        4314 non-null   object 
 5   artistId          4314 non-null   object 
 6   completitionYear  4313 non-null   float64
 7   width             4314 non-null   int64  
 8   image             4314 non-null   object 
 9   height            4314 non-null   int64  
 10  Style             4254 non-null   object 
dtypes: float64(1), int64(2), object(8)
memory usage: 370.9+ KB


Check and clean the wiki art file images and df, the aic was already checked in its
data collection notebook.

In [92]:
# Create file_name column by adding .jpg extension to id
wiki_art_imgs['file_name'] = wiki_art_imgs['id'] + '.jpg'
print("file_name column added.")

# First check if any ids are None
wiki_art_imgs = wiki_art_imgs.dropna(subset=['id'])

# Check which files exist in the directory
image_dir = 'downloaded_images_wikiart'
exists_mask = wiki_art_imgs['file_name'].apply(lambda x: os.path.exists(os.path.join(image_dir, x)))

# Filter to keep only rows where files exist
wiki_art_imgs = wiki_art_imgs[exists_mask]

# Let's check if there are any duplicates while we're at it
duplicate_count = wiki_art_imgs['file_name'].duplicated().sum()

if duplicate_count > 0:
    print(f"Found {duplicate_count} duplicate entries")
else:
    print("No duplicate filenames found.")

file_name column added.
No duplicate filenames found.


In [93]:
wiki_art_imgs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4314 entries, 0 to 4313
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                4314 non-null   object 
 1   title             4314 non-null   object 
 2   url               4314 non-null   object 
 3   artistUrl         4314 non-null   object 
 4   artistName        4314 non-null   object 
 5   artistId          4314 non-null   object 
 6   completitionYear  4313 non-null   float64
 7   width             4314 non-null   int64  
 8   image             4314 non-null   object 
 9   height            4314 non-null   int64  
 10  Style             4254 non-null   object 
 11  file_name         4314 non-null   object 
dtypes: float64(1), int64(2), object(9)
memory usage: 404.6+ KB


In [94]:
# Drop columns 

wiki_art_imgs = wiki_art_imgs.drop(columns=['id','title','url','artistUrl','artistName','artistId','completitionYear',
                                            'width','image','height'])
aic_art_imgs = aic_art_imgs.drop(columns=['artwork_type_title','id','image_id','title','style_title'])

In [95]:
imgs = pd.concat([wiki_art_imgs, aic_art_imgs], axis=0)

In [96]:
imgs.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8436 entries, 0 to 4121
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Style        4254 non-null   object
 1   file_name    8436 non-null   object
 2   search_term  4122 non-null   object
dtypes: object(3)
memory usage: 263.6+ KB


### Standardize the styles to use for classification

In [97]:

imgs['Style'].value_counts()

Style
Art Informel              568
Cubism                    561
Abstract Expressionism    557
Conceptual Art            535
Expressionism             515
Contemporary              494
Baroque                   359
Contemporary Realism      346
Early Renaissance         319
Name: count, dtype: int64

In [98]:
imgs['search_term'].value_counts()

search_term
Modern Art                617
ukiyo-e                   536
Pop Art                   476
Mannerism                 382
Post-Impressionism        304
Cubism                    260
Realism                   195
early renaissance         123
neo-romantic              122
late renaissance          118
Baroque                   106
abstract figures           98
northern renaissance       88
Abstract Expressionism     83
Art Informel               59
Surrealism                 56
Conceptual Art             54
Modernism                  52
Rococo                     45
abstract patterns          44
high renaissance           43
mannerism                  42
Symbolism                  35
abstract objects           33
Fauvism                    22
abstract motifs            22
contemporary               19
abstract shapes            18
abstract imagist           15
abstract forms             11
lyrical abstraction        11
Neoclassicism               7
Abstract                    

In [99]:

def clean_art_styles(df):
    """
    Cleans art styles in a dataframe by:
    1. Dropping rows where search_term contains 'abstract' (except 'Abstract Expressionism')
    2. Capitalizing first letter of each word in search_term
    3. Filling empty Style values with search_term values
    
    Parameters:
    df (pandas.DataFrame): DataFrame with 'search_term' and 'Style' columns
    
    Returns:
    pandas.DataFrame: Cleaned DataFrame
    """
    # Create a copy of the DataFrame
    df = df.copy()
    
    # Drop rows containing 'abstract' except 'Abstract Expressionism'
    # First, create a mask for rows containing 'abstract'
    abstract_mask = df['search_term'].fillna('').str.contains('abstract', case=False)

    # Then, create a mask for rows containing 'Abstract Expressionism'
    expressionism_mask = df['search_term'].fillna('').str.contains('Abstract Expressionism', case=False)
    
    # Keep rows that either don't contain 'abstract' or contain 'Abstract Expressionism'
    df = df[~abstract_mask | expressionism_mask]
    
    # Capitalize first letter of each word in search_term
    df['search_term'] = df['search_term'].apply(lambda x: ' '.join(word.capitalize() for word in x.split()) if isinstance(x, str) else x)
    
    # Fill empty Style values with search_term values
    df.loc[df['Style'].isna(), 'Style'] = df.loc[df['Style'].isna(), 'search_term']
    
    return df


# Usage

imgs_cleaned = clean_art_styles(imgs)

In [100]:
imgs_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8176 entries, 0 to 4121
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Style        8116 non-null   object
 1   file_name    8176 non-null   object
 2   search_term  3862 non-null   object
dtypes: object(3)
memory usage: 255.5+ KB


In [101]:
# Drop the search_term column
imgs_cleaned = imgs_cleaned.drop(columns='search_term')
# Drop any remaining entries that don't have a Style term
imgs_cleaned = imgs_cleaned.dropna()


In [102]:
imgs_cleaned['Style'].value_counts()

Style
Cubism                    821
Abstract Expressionism    640
Art Informel              627
Modern Art                617
Conceptual Art            589
Ukiyo-e                   536
Expressionism             519
Contemporary              513
Pop Art                   476
Baroque                   465
Early Renaissance         442
Mannerism                 424
Contemporary Realism      346
Post-impressionism        304
Realism                   195
Neo-romantic              122
Late Renaissance          118
Northern Renaissance       88
Surrealism                 56
Modernism                  52
Rococo                     45
High Renaissance           43
Symbolism                  37
Fauvism                    22
Neoclassicism               7
Modernist                   4
Romantic                    4
Op Art                      3
Minimalism                  1
Name: count, dtype: int64

In [103]:
# Drop Styles with too few entries
# Filter the 'Style' column based on value counts
style_counts = imgs_cleaned['Style'].value_counts()

# Keep only the styles that occur more than 10 times
imgs_filtered = imgs_cleaned[imgs_cleaned['Style'].isin(style_counts[style_counts > 10].index)]

Calculate basic statistics (# of classes, images per class, image sizes)

In [104]:
imgs_filtered['Style'].value_counts()

Style
Cubism                    821
Abstract Expressionism    640
Art Informel              627
Modern Art                617
Conceptual Art            589
Ukiyo-e                   536
Expressionism             519
Contemporary              513
Pop Art                   476
Baroque                   465
Early Renaissance         442
Mannerism                 424
Contemporary Realism      346
Post-impressionism        304
Realism                   195
Neo-romantic              122
Late Renaissance          118
Northern Renaissance       88
Surrealism                 56
Modernism                  52
Rococo                     45
High Renaissance           43
Symbolism                  37
Fauvism                    22
Name: count, dtype: int64

Visualizations
- Create sample visualizations of images across different styles
- [ ] Generate distribution plots of image properties
- [ ] Analyze color distributions
- [ ] Create dimensionality reduction visualizations (t-SNE, UMAP)

In [111]:

def plot_style_pie(df):
    """
    Creates an interactive pie chart of art style distribution using Plotly
    """
    style_counts = df['Style'].value_counts()
    
    fig = go.Figure(data=[go.Pie(
        labels=style_counts.index,
        values=style_counts.values,
        hole=.3,
        hovertemplate="<b>%{label}</b><br>" +
                      "Count: %{value}<br>" +
                      "Percentage: %{percent}<extra></extra>"
    )])
    
    fig.update_layout(
        title="Art Style Distribution (Original Imageset)",
        width=800,
        height=800
    )
    return fig


#Usage:
plot_style_pie(imgs_filtered)


Resize Images

Normalize Pixel Values

Apply Augmentations