In [1]:
import os
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
import shutil
import numpy as np

In [2]:
# PART ONE: DATA PREPROCESSING & PREPARATION

In [3]:
# Reading the image metadata table (one row per image) and previewing its first rows

classesCsvPath = './archive/classes.csv'
originalDataset = pd.read_csv(classesCsvPath)
originalDataset.head()

Unnamed: 0,filename,artist,genre,description,phash,width,height,genre_count,subset
0,Abstract_Expressionism/aaron-siskind_acolman-1...,aaron siskind,['Abstract Expressionism'],acolman-1-1955,bebbeb018a7d80a8,1922,1382,1,train
1,Abstract_Expressionism/aaron-siskind_chicago-6...,aaron siskind,['Abstract Expressionism'],chicago-6-1961,d7d0781be51fc00e,1382,1746,1,train
2,Abstract_Expressionism/aaron-siskind_glouceste...,aaron siskind,['Abstract Expressionism'],gloucester-16a-1944,9f846e5a6c639325,1382,1857,1,train
3,Abstract_Expressionism/aaron-siskind_jerome-ar...,aaron siskind,['Abstract Expressionism'],jerome-arizona-1949,a5d691f85ac5e4d0,1382,1849,1,train
4,Abstract_Expressionism/aaron-siskind_kentucky-...,aaron siskind,['Abstract Expressionism'],kentucky-4-1951,880df359e6b11db1,1382,1625,1,train


In [4]:
# Writing resized greyscale copies of the images to a separate directory for training/testing the CNN (the original files are only read, never modified).
# The copies are stored as 8-bit image files, so scaling pixel values to [0, 1] has to happen when they are loaded for training.

originalDirectory = './archive/'
outputDirectory = './resizedData/'

# exist_ok=True makes this a no-op when the directory is already there (same idiom as the per-folder makedirs in the copy loop)
os.makedirs(outputDirectory, exist_ok=True)

def preprocessImageCopies(imagePath, outputPath, newImageSize=(256, 256), colorMode='L'):
    """Write a resized, mode-converted copy of a single image.

    Opens ``imagePath``, converts it to ``colorMode``, resizes it to
    ``newImageSize`` and saves the result to ``outputPath``. The source file
    is only read.

    The copy is saved with 8-bit pixel values (0-255). A [0, 1] scaling cannot
    survive being written to such a file, so it is intentionally not attempted
    here; apply it when the images are loaded for training.

    Args:
        imagePath: Path of the image to read.
        outputPath: Path the processed copy is written to.
        newImageSize: ``(width, height)`` of the copy in pixels.
        colorMode: Pillow mode passed to ``convert``. The default ``'L'``
            produces greyscale; pass ``'RGB'`` to keep colour.

    Returns:
        None. Any exception raised while processing is printed together with
        ``imagePath`` and swallowed, so one bad file does not stop a batch run.
    """
    try:
        with Image.open(imagePath) as img:
            imgResized = img.convert(colorMode).resize(newImageSize)
            # The previous version divided by 255.0 and multiplied back before
            # casting to uint8. That round trip is at best a no-op, and the
            # truncating cast can shift a pixel down by one whenever the float
            # product lands just under the integer. Keep the pixels as uint8.
            imgArray = np.array(imgResized, dtype=np.uint8)
            # Rebuild the image from the pixel array alone, as before.
            imgProcessed = Image.fromarray(imgArray)
            imgProcessed.save(outputPath)
    except Exception as e:
        # Deliberately broad: this runs over the whole dataset and should
        # report unreadable files rather than abort.
        print(f"Error processing {imagePath}: {e}")

# Mirror the source folder layout under the output directory and write a
# preprocessed copy of every image file found along the way.
imageExtensions = ('.png', '.jpg', '.jpeg')
for root, dirs, files in os.walk(originalDirectory):
    # Recreate this folder at the same relative position in the output tree.
    outputSubdirectory = os.path.join(outputDirectory, os.path.relpath(root, originalDirectory))
    os.makedirs(outputSubdirectory, exist_ok=True)
    # Extension match is case-insensitive; anything else is skipped.
    imageFiles = [name for name in files if name.lower().endswith(imageExtensions)]
    for file in imageFiles:
        preprocessImageCopies(os.path.join(root, file), os.path.join(outputSubdirectory, file))

In [5]:
# Creating a copy of the data that will be manipulated (saving the original data from edits).
# .copy() is required: a plain assignment would only bind a second name to the same DataFrame, so column edits would also change originalDataset.

preprocessedData = originalDataset.copy()

In [6]:
# Updating the filenames to account for the directory the resized copies were written to.
# The frame is rebuilt from originalDataset on every run, so re-running this cell never stacks a second prefix
# and originalDataset keeps its relative paths. outputDirectory already ends with '/', so plain concatenation is enough.

preprocessedData = originalDataset.assign(filename=outputDirectory + originalDataset['filename'])
preprocessedData.head()

Unnamed: 0,filename,artist,genre,description,phash,width,height,genre_count,subset
0,./resizedData/Abstract_Expressionism/aaron-sis...,aaron siskind,['Abstract Expressionism'],acolman-1-1955,bebbeb018a7d80a8,1922,1382,1,train
1,./resizedData/Abstract_Expressionism/aaron-sis...,aaron siskind,['Abstract Expressionism'],chicago-6-1961,d7d0781be51fc00e,1382,1746,1,train
2,./resizedData/Abstract_Expressionism/aaron-sis...,aaron siskind,['Abstract Expressionism'],gloucester-16a-1944,9f846e5a6c639325,1382,1857,1,train
3,./resizedData/Abstract_Expressionism/aaron-sis...,aaron siskind,['Abstract Expressionism'],jerome-arizona-1949,a5d691f85ac5e4d0,1382,1849,1,train
4,./resizedData/Abstract_Expressionism/aaron-sis...,aaron siskind,['Abstract Expressionism'],kentucky-4-1951,880df359e6b11db1,1382,1625,1,train


In [7]:
# Counting the null entries in every column to confirm no metadata is missing

nullMask = preprocessedData.isnull()
numEntriesNull = nullMask.sum()
print(numEntriesNull)

filename       0
artist         0
genre          0
description    0
phash          0
width          0
height         0
genre_count    0
subset         0
dtype: int64


In [8]:
# One-Hot encoding the art styles pertaining to each image.
# The guard keeps the cell safe to re-run: after the first run the 'genre' column is gone and get_dummies would raise a KeyError.
# NOTE(review): 'genre' holds stringified lists (e.g. "['Abstract Expressionism']") and the 'genre_count' column suggests
# some rows may list several styles. get_dummies gives every distinct string its own column, so such rows would not share
# a column with their single-style counterparts - confirm whether multi-label parsing is needed.

if 'genre' in preprocessedData.columns:
    preprocessedData = pd.get_dummies(preprocessedData, columns=['genre'])

In [9]:
# Folding the small 'uncertain artist' subset into 'train' so every row is either train or test;
# the counts are printed before and after to make the change visible.

subsetCountsBefore = preprocessedData['subset'].value_counts()
print(subsetCountsBefore)
print()
preprocessedData['subset'] = preprocessedData['subset'].replace({'uncertain artist': 'train'})
subsetCountsAfter = preprocessedData['subset'].value_counts()
print(subsetCountsAfter)

subset
train               63998
test                16000
uncertain artist       44
Name: count, dtype: int64

subset
train    64042
test     16000
Name: count, dtype: int64


In [12]:
# Splitting the data into training and testing sets

# The one-hot label columns created by get_dummies are prefixed 'genre_'.
# 'genre_count' shares that prefix but is ordinary metadata, so it is excluded explicitly.
labelColumns = [
    column for column in preprocessedData.columns
    if column.startswith('genre_') and column != 'genre_count'
]
assert labelColumns, "No one-hot genre columns found - run the encoding cell first"

trainData = preprocessedData[preprocessedData['subset'] == 'train']
testData = preprocessedData[preprocessedData['subset'] == 'test']
# Every row must land in exactly one split; a leftover subset value would otherwise be dropped silently.
assert len(trainData) + len(testData) == len(preprocessedData), "Rows with an unexpected 'subset' value"

# Features: the metadata columns only. The label columns are removed here as well;
# previously they stayed in X_train/X_test, leaking the target into the features.
X_train = trainData.drop(columns=['subset'] + labelColumns)
y_train = trainData[labelColumns]

X_test = testData.drop(columns=['subset'] + labelColumns)
y_test = testData[labelColumns]

In [None]:
# PART TWO: BUILDING & TRAINING THE MODEL

In [None]:
# PART THREE: TESTING THE MODEL & EVALUATING PERFORMANCE

In [None]:
# (optional) PART FOUR: UI TO INTERACT WITH THE MODEL