In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
modern = pd.read_csv("modern_met_clean.csv")

Most of the feature engineering that we do in this notebook is creating dummies for the categorical values that exist with the features. Object Date has already been cleaned in an earlier notebook so we don't have to work with this and in the end, this feature turns out to be an important one down the line. 

So let's create some dummies. 

In [3]:
modern.columns

Index(['Is Highlight', 'Department', 'Object Name', 'Artist Role',
       'Artist Nationality', 'Medium', 'Classification', 'Object Year'],
      dtype='object')

In [4]:
modern.head(3)

Unnamed: 0,Is Highlight,Department,Object Name,Artist Role,Artist Nationality,Medium,Classification,Object Year
0,False,American Paintings and Sculpture,Other Object,Other Artist Role,Other Nationality,Other Medium,Other Class,1665
1,False,American Paintings and Sculpture,Other Object,Artist,Other Nationality,Other Medium,Other Class,1800
2,False,American Paintings and Sculpture,Other Object,Artist,Other Nationality,Other Medium,Other Class,1867


In [5]:
modern.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 215649 entries, 0 to 215648
Data columns (total 8 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   Is Highlight        215649 non-null  bool  
 1   Department          215649 non-null  object
 2   Object Name         215649 non-null  object
 3   Artist Role         215649 non-null  object
 4   Artist Nationality  215649 non-null  object
 5   Medium              215649 non-null  object
 6   Classification      215649 non-null  object
 7   Object Year         215649 non-null  int64 
dtypes: bool(1), int64(1), object(6)
memory usage: 11.7+ MB


In [6]:
# Let's rename those columns to easier ones to work with. 
modern.rename(columns = {'Is Highlight': "highlighted", 'Department': "department", 'Object Name':"obj_name",\
                         'Artist Role': "artist_role",'Artist Nationality': "artist_nationality", \
                         'Object Year': "obj_year", 'Medium': "medium", 'Classification': "class"}, inplace = True)

**"department"**

In [7]:
modern = pd.get_dummies(columns=['department'], drop_first=True, data=modern)
modern.columns

Index(['highlighted', 'obj_name', 'artist_role', 'artist_nationality',
       'medium', 'class', 'obj_year', 'department_Drawings and Prints',
       'department_European Paintings',
       'department_Modern and Contemporary Art', 'department_Photographs',
       'department_Robert Lehman Collection', 'department_The Libraries'],
      dtype='object')

In [9]:
modern.rename(columns = {'department_Drawings and Prints': "department_drawings_and_prints", \
       'department_European Paintings': "department_european_paintings", \
        'department_Modern and Contemporary Art': 'department_modern_and_cont_art', \
        'department_Photographs': 'department_photographs',\
       'department_Robert Lehman Collection': "department_robert_lehman", \
        'department_The Libraries': 'department_libraries'}, inplace = True)

**"obj_name"**

In [10]:
modern = pd.get_dummies(columns=["obj_name"], drop_first=True, data=modern)
modern.columns

Index(['highlighted', 'artist_role', 'artist_nationality', 'medium', 'class',
       'obj_year', 'department_drawings_and_prints',
       'department_european_paintings', 'department_modern_and_cont_art',
       'department_photographs', 'department_robert_lehman',
       'department_libraries', 'obj_name_Other Object', 'obj_name_Painting',
       'obj_name_Photograph', 'obj_name_Print', 'obj_name_Sculpture',
       'obj_name_Watercolor'],
      dtype='object')

In [11]:
modern.rename(columns = {'obj_name_Other Object': "other_obj_name", 'obj_name_Painting': "obj_name_painting", \
                         'obj_name_Photograph': "obj_name_photo", 'obj_name_Print': 'obj_name_print', \
                         'obj_name_Sculpture': "obj_name_sculpture", 'obj_name_Watercolor': "obj_name_watercolor"}, \
              inplace = True)

**"artist_role"**

In [12]:
modern = pd.get_dummies(columns=["artist_role"], drop_first=True, data=modern)
modern.columns

Index(['highlighted', 'artist_nationality', 'medium', 'class', 'obj_year',
       'department_drawings_and_prints', 'department_european_paintings',
       'department_modern_and_cont_art', 'department_photographs',
       'department_robert_lehman', 'department_libraries', 'other_obj_name',
       'obj_name_painting', 'obj_name_photo', 'obj_name_print',
       'obj_name_sculpture', 'obj_name_watercolor', 'artist_role_Author',
       'artist_role_Other Artist Role', 'artist_role_Publisher'],
      dtype='object')

In [13]:
modern.rename(columns = {'artist_role_Author': "artist_role_author", \
       'artist_role_Other Artist Role': "artist_role_other", \
                         'artist_role_Publisher': "artist_role_publisher"}, inplace = True)

**"artist_nationality"**

In [14]:
modern = pd.get_dummies(columns=["artist_nationality"], drop_first=True, data=modern)
modern.columns

Index(['highlighted', 'medium', 'class', 'obj_year',
       'department_drawings_and_prints', 'department_european_paintings',
       'department_modern_and_cont_art', 'department_photographs',
       'department_robert_lehman', 'department_libraries', 'other_obj_name',
       'obj_name_painting', 'obj_name_photo', 'obj_name_print',
       'obj_name_sculpture', 'obj_name_watercolor', 'artist_role_author',
       'artist_role_other', 'artist_role_publisher',
       'artist_nationality_British/German/Dutch/Netherlandish/Spanish',
       'artist_nationality_French', 'artist_nationality_Italian',
       'artist_nationality_Other Nationality'],
      dtype='object')

In [15]:
modern.rename(columns = {'artist_nationality_British/German/Dutch/Netherlandish/Spanish': "artist_nationality_bgdns",
       'artist_nationality_French': "artist_nationality_french", \
        'artist_nationality_Italian': 'artist_nationality_italian',
       'artist_nationality_Other Artist Nationality': "artist_nationality_other"}, inplace = True)

**"obj_date"**

Not much to do here, did the cleaning in the Cleaning Notebook. 

In [16]:
modern.obj_year.value_counts()

0       24647
1888     7654
1900     4558
1890     4391
1889     3850
        ...  
1404        1
1403        1
1461        1
1395        1
1432        1
Name: obj_year, Length: 644, dtype: int64

**"medium"**

In [17]:
modern = pd.get_dummies(columns=["medium"], drop_first=True, data=modern)
modern.columns

Index(['highlighted', 'class', 'obj_year', 'department_drawings_and_prints',
       'department_european_paintings', 'department_modern_and_cont_art',
       'department_photographs', 'department_robert_lehman',
       'department_libraries', 'other_obj_name', 'obj_name_painting',
       'obj_name_photo', 'obj_name_print', 'obj_name_sculpture',
       'obj_name_watercolor', 'artist_role_author', 'artist_role_other',
       'artist_role_publisher', 'artist_nationality_bgdns',
       'artist_nationality_french', 'artist_nationality_italian',
       'artist_nationality_Other Nationality', 'medium_Gelatin silver print',
       'medium_Illustrated book', 'medium_Marble', 'medium_Oil on canvas',
       'medium_Oil on wood', 'medium_Other Medium'],
      dtype='object')

In [18]:
modern.rename(columns = {'medium_Gelatin silver print': "medium_gelatin_silver_print",
       'medium_Illustrated book': 'medium_illustrated_book', 'medium_Marble': "medium_marble", 
    'medium_Oil on canvas': "medium_oil_on_canvas", 'medium_Oil on wood': "medium_oil_on_wood", \
                         'medium_Other Medium': "medium_other"}, inplace = True)

**"class"**

In [19]:
modern = pd.get_dummies(columns=["class"], drop_first=True, data=modern)
modern.columns

Index(['highlighted', 'obj_year', 'department_drawings_and_prints',
       'department_european_paintings', 'department_modern_and_cont_art',
       'department_photographs', 'department_robert_lehman',
       'department_libraries', 'other_obj_name', 'obj_name_painting',
       'obj_name_photo', 'obj_name_print', 'obj_name_sculpture',
       'obj_name_watercolor', 'artist_role_author', 'artist_role_other',
       'artist_role_publisher', 'artist_nationality_bgdns',
       'artist_nationality_french', 'artist_nationality_italian',
       'artist_nationality_Other Nationality', 'medium_gelatin_silver_print',
       'medium_illustrated_book', 'medium_marble', 'medium_oil_on_canvas',
       'medium_oil_on_wood', 'medium_other', 'class_Other Class',
       'class_Paintings', 'class_Photographs', 'class_Prints',
       'class_Sculpture'],
      dtype='object')

In [20]:
modern.rename(columns = {'class_Other Class': "class_other", \
       'class_Paintings': "class_paintings", 'class_Photographs': 'class_photographs', \
        'class_Prints': "class_prints", 'class_Sculpture': "class_sculpture"}, inplace = True)

In [21]:
modern.to_csv("modern_met_fe.csv", index= False)