# Drill: Playing with layers

Now it's your turn. Using the space below, experiment with different hidden layer structures. You can try this on a subset of the data to improve runtime. See how things vary. See what seems to matter the most. Feel free to manipulate other parameters as well.

### Let's see if we can build an MLP model to classify which department a piece should go into

In [6]:
############ Imports #################
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# Model Infrastructure
import time
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

# Import the model.
from sklearn.neural_network import MLPClassifier



In [2]:
######## Bring In Data ################
# Read the Museum of Modern Art collection CSV straight from its hosted URL
# and report how long the download + parse took.
start_time = time.time()
artworks = pd.read_csv(
    'https://media.githubusercontent.com/media/MuseumofModernArt/collection/master/Artworks.csv'
)
elapsed_seconds = time.time() - start_time

print("-- Execution time: %s seconds ---" % elapsed_seconds)

-- Execution time: 6.166257858276367 seconds ---


In [3]:
######### Clean the Data ##############
# Turns the raw `artworks` frame into a feature matrix `X` and a target `Y`
# (the Department column). Side products `artists`, `nationalities` and `dates`
# hold the one-hot encodings of the corresponding columns.
start_time = time.time()

# Select columns. The explicit .copy() gives us an independent frame, so the
# column assignments below never write into a slice of the raw data
# (avoids SettingWithCopyWarning).
artworks = artworks[['Artist', 'Nationality', 'Gender', 'Date', 'Department',
                     'DateAcquired', 'URL', 'ThumbnailURL', 'Height (cm)', 'Width (cm)']].copy()

# Convert URLs to booleans: True when a link is present, False when it is missing.
artworks['URL'] = artworks['URL'].notnull()
artworks['ThumbnailURL'] = artworks['ThumbnailURL'].notnull()

# Drop films and some other tricky rows.
excluded_departments = ['Film', 'Media and Performance Art', 'Fluxus Collection']
artworks = artworks[~artworks['Department'].isin(excluded_departments)]

# Drop rows with any missing value; copy again so later assignments are safe.
artworks = artworks.dropna().copy()

# Convert timestamps and dates.
artworks['DateAcquired'] = pd.to_datetime(artworks['DateAcquired'])
artworks['YearAcquired'] = artworks['DateAcquired'].dt.year

# Remove multiple nationalities, genders, and artists.
# The pattern matches the ") (" that separates several parenthesised entries.
# Raw strings are used so "\)" / "\(" are not parsed as (invalid) Python string
# escapes. NOTE: the replacement labels keep their literal backslashes so the
# resulting category names are unchanged.
multi_entry_pattern = r'\) \('
artworks.loc[artworks['Gender'].str.contains(multi_entry_pattern), 'Gender'] = r'\(multiple_persons\)'
artworks.loc[artworks['Nationality'].str.contains(multi_entry_pattern), 'Nationality'] = r'\(multiple_nationalities\)'
artworks.loc[artworks['Artist'].str.contains(','), 'Artist'] = 'Multiple_Artists'

# Convert dates to start date, cutting down number of distinct examples.
# Keeps the first four-digit year found in each Date string. Every row is
# assigned: the previous `[:-1]` slice silently turned the last row's year
# into NaN when the result was aligned back onto the frame.
artworks['Date'] = artworks['Date'].str.extract('([0-9]{4})', expand=False)

# Final column drops. No rows are removed here: a Date without a four-digit
# year is NaN and simply gets an all-zero row in the `dates` dummies below.
# `axis` is passed by keyword because newer pandas rejects it positionally.
X = artworks.drop(['Department', 'DateAcquired', 'Artist', 'Nationality', 'Date'], axis=1)

# Create dummies separately.
# `artists` is built but deliberately left out of X (see below).
artists = pd.get_dummies(artworks['Artist'])
nationalities = pd.get_dummies(artworks['Nationality'])
dates = pd.get_dummies(artworks['Date'])

# Concat with other variables, but artists slows this wayyyyy down so we'll keep it out for now
X = pd.get_dummies(X, sparse=True)
X = pd.concat([X, nationalities, dates], axis=1)

Y = artworks['Department']


print("-- Execution time: %s seconds ---" % (time.time() - start_time))

-- Execution time: 1.413816213607788 seconds ---


In [7]:
# Summarise the feature matrix: row index range, column count, dtypes and memory use.
X.info()

<class 'pandas.core.sparse.frame.SparseDataFrame'>
Int64Index: 104673 entries, 0 to 133542
Columns: 312 entries, URL to 2017
dtypes: bool(2), float64(2), int64(1), uint8(307)
memory usage: 38.6 MB


In [4]:
########### Build the Model ################
start_time = time.time()
# Establish and fit the model, with a single, 1000 perceptron layer.
# random_state fixes the weight initialisation and batch sampling so that this
# fit (and any later scoring that clones `mlp`) is reproducible across runs;
# without it every run produces different numbers.
mlp = MLPClassifier(hidden_layer_sizes=(1000,), random_state=42)
mlp.fit(X, Y)
print("-- Execution time: %s seconds ---" % (time.time() - start_time))

-- Execution time: 39.94540023803711 seconds ---


In [5]:
# Accuracy on the same X, Y the model was fitted on, i.e. training accuracy,
# not an estimate of performance on unseen data.
mlp.score(X, Y)

0.5093386068995825

In [8]:
# 5-fold cross-validation: returns one held-out accuracy score per fold.
cross_val_score(mlp, X, Y, cv=5)



array([0.42002197, 0.60560757, 0.52825412, 0.60435676, 0.46694057])