In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
artworks = pd.read_csv('https://media.githubusercontent.com/media/MuseumofModernArt/collection/master/Artworks.csv')

In [3]:
artworks.columns

Index(['Title', 'Artist', 'ConstituentID', 'ArtistBio', 'Nationality',
       'BeginDate', 'EndDate', 'Gender', 'Date', 'Medium', 'Dimensions',
       'CreditLine', 'AccessionNumber', 'Classification', 'Department',
       'DateAcquired', 'Cataloged', 'ObjectID', 'URL', 'ThumbnailURL',
       'Circumference (cm)', 'Depth (cm)', 'Diameter (cm)', 'Height (cm)',
       'Length (cm)', 'Weight (kg)', 'Width (cm)', 'Seat Height (cm)',
       'Duration (sec.)'],
      dtype='object')

In [4]:
# Select Columns.
artworks = artworks[['Artist', 'Nationality', 'Gender', 'Date', 'Department',
                    'DateAcquired', 'URL', 'ThumbnailURL', 'Height (cm)', 'Width (cm)']]

# Convert URL's to booleans.
artworks['URL'] = artworks['URL'].notnull()
artworks['ThumbnailURL'] = artworks['ThumbnailURL'].notnull()

# Drop films and some other tricky rows.
artworks = artworks[artworks['Department']!='Film']
artworks = artworks[artworks['Department']!='Media and Performance Art']
artworks = artworks[artworks['Department']!='Fluxus Collection']

# Drop missing data.
artworks = artworks.dropna()

In [5]:
artworks.head()

Unnamed: 0,Artist,Nationality,Gender,Date,Department,DateAcquired,URL,ThumbnailURL,Height (cm),Width (cm)
0,Otto Wagner,(Austrian),(Male),1896,Architecture & Design,1996-04-09,True,True,48.6,168.9
1,Christian de Portzamparc,(French),(Male),1987,Architecture & Design,1995-01-17,True,True,40.6401,29.8451
2,Emil Hoppe,(Austrian),(Male),1903,Architecture & Design,1997-01-15,True,True,34.3,31.8
3,Bernard Tschumi,(),(Male),1980,Architecture & Design,1995-01-17,True,True,50.8,50.8
4,Emil Hoppe,(Austrian),(Male),1903,Architecture & Design,1997-01-15,True,True,38.4,19.1


In [6]:
# Get data types.
artworks.dtypes

Artist           object
Nationality      object
Gender           object
Date             object
Department       object
DateAcquired     object
URL                bool
ThumbnailURL       bool
Height (cm)     float64
Width (cm)      float64
dtype: object

In [7]:
artworks['DateAcquired'] = pd.to_datetime(artworks.DateAcquired)
artworks['YearAcquired'] = artworks.DateAcquired.dt.year
artworks['YearAcquired'].dtype

dtype('int64')

In [22]:
# Remove multiple nationalities, genders, and artists.
artworks.loc[artworks['Gender'].str.contains('\) \('), 'Gender'] = '\(multiple_persons\)'
artworks.loc[artworks['Nationality'].str.contains('\) \('), 'Nationality'] = '\(multiple_nationalities\)'
artworks.loc[artworks['Artist'].str.contains(','), 'Artist'] = 'Multiple_Artists'

# Convert dates to start date, cutting down number of distinct examples.
artworks['Date'] = pd.Series(artworks.Date.str.extract(
    '([0-9]{4})', expand=False))[:-1]

# Final column drops and NA drop.
X = artworks.drop(['Department', 'DateAcquired', 'Artist', 'Nationality', 'Date'], 1)

# Create dummies separately.
artists = pd.get_dummies(artworks.Artist)
nationalities = pd.get_dummies(artworks.Nationality)
dates = pd.get_dummies(artworks.Date)

# Concat with other variables, but artists slows this wayyyyy down so we'll keep it out for now
X = pd.get_dummies(X, sparse=True)
X = pd.concat([X, nationalities, dates], axis=1)
X = X.sample(frac=0.005)
Y = artworks.Department
Y = Y.sample(frac=0.005)

In [23]:
# Alright! We've done our prep, let's build the model.
# Neural networks are hugely computationally intensive.
# This may take several minutes to run.

# Import the model.
from sklearn.neural_network import MLPClassifier

# Establish and fit the model, with a single, 1000 perceptron layer.
mlp = MLPClassifier(hidden_layer_sizes=(10000,))
mlp.fit(X, Y)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(10000,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [24]:
mlp.score(X, Y)

0.6303939962476548

In [25]:
Y.value_counts()/len(Y)

Drawings & Prints        0.641651
Photography              0.215760
Architecture & Design    0.108818
Painting & Sculpture     0.033771
Name: Department, dtype: float64

In [26]:
from sklearn.model_selection import cross_val_score
cross_val_score(mlp, X, Y, cv=5)

array([0.63888889, 0.21296296, 0.21495327, 0.60952381, 0.64761905])

A lot of differences in consistncies in cross_val_scores, this happens with 10, 100, and 1000 hidden layers.
Using just 5% of the data, the Accuracy score was 62%@10 layers, 62%@100 layers, and 62@1000


Look at how low the crass val scores go with just a small amount of data

5% at 1000 layers = 0.62827715, 0.22492971, 0.56285178, 0.62910798, 0.23120301
5% at 100 layers = 0.62640449, 0.25585754, 0.62757974, 0.62816901, 0.6268797
5% at 10 layers = 0.36891386, 0.62605436, 0.62945591, 0.63004695, 0.6287594

But at
.05% of the data at 10K layers I get an accuracy score of 63%, 
with crossval scores of: 0.63888889, 0.21296296, 0.21495327, 0.60952381, 0.64761905

In [27]:
Y

73        Architecture & Design
18284         Drawings & Prints
135108        Drawings & Prints
96888         Drawings & Prints
82872               Photography
108774        Drawings & Prints
91228         Drawings & Prints
88773         Drawings & Prints
609       Architecture & Design
77456         Drawings & Prints
45129               Photography
20727         Drawings & Prints
56620         Drawings & Prints
20766         Drawings & Prints
122051        Drawings & Prints
32485         Drawings & Prints
47771               Photography
122476        Drawings & Prints
87032         Drawings & Prints
56739         Drawings & Prints
41699               Photography
87312         Drawings & Prints
29698         Drawings & Prints
84253         Drawings & Prints
96992         Drawings & Prints
100127        Drawings & Prints
5737      Architecture & Design
24986         Drawings & Prints
9425          Drawings & Prints
62890         Drawings & Prints
                  ...          
55137   

In [28]:
X

Unnamed: 0,URL,ThumbnailURL,Height (cm),Width (cm),YearAcquired,Gender_(),Gender_(Female),Gender_(Male),Gender_(male),Gender_\(multiple_persons\),...,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018
46131,True,True,39.200000,39.200000,1975,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
83096,True,True,0.000000,0.000000,2006,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
26889,False,False,29.900000,24.300000,1964,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
14679,True,True,17.400000,20.600000,1952,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
68278,True,True,14.500000,23.000000,1943,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
59462,True,True,21.900000,16.000000,1951,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
49490,True,True,22.200000,30.100000,1997,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
49618,False,False,16.400000,24.600000,2000,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
112962,True,True,0.000000,2.000000,2010,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
106122,True,True,94.000000,73.000000,2011,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [31]:
X.columns

Index(['URL', 'ThumbnailURL', 'Height (cm)', 'Width (cm)', 'YearAcquired',
       'Gender_()', 'Gender_(Female)', 'Gender_(Male)', 'Gender_(male)',
       'Gender_\(multiple_persons\)',
       ...
       '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017',
       '2018'],
      dtype='object', length=317)