In [80]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import pandas_profiling as pp
from nltk.corpus import stopwords
import string
import csv

# collection: all data available

In [81]:
# Load the WCMA collection export; the first CSV column is used as the index.
collection_csv_path = "collection/wcma-collection.csv"
collection = pd.read_csv(collection_csv_path, index_col=0)

In [82]:
# Columns that are not used anywhere in this notebook.
unused_columns = [
    "department", "source_name", "credit_line", "paper_support",
    "catalogue_raisonne", "portfolio", "signed", "marks", "inscription",
    "filename", "dimensions", "element_type",
    "width_cm", "height_cm", "depth_cm",
    "width_in", "height_in", "depth_in",
    "area_in", "size_s_m_l", "is_3d", "orientation_p_l_s",
    "copyright_holder", "data_date",
]
# Reassign instead of mutating in place, and ignore columns that are already
# gone, so re-running this cell in the same kernel does not raise KeyError.
collection = collection.drop(columns=unused_columns, errors="ignore")
collection.head()

Unnamed: 0_level_0,accession_number,title,maker,classification,culture,period,creation_date,creation_date_earliest,creation_date_latest,object_name,medium,description,accession_date,ULAN
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,39.1.400,GRANDE MATINÉE DE BIENFAISANCE EN L'HONNEUR DE...,Adrien Barrère,WCMA-PRINTS,,,1917,1917,1917,poster,poster,,1939-01-01,500103823
2,79.22,Men of Europe,Ronald B. Kitaj,WCMA-PRINTS,,,1972,1972,1972,screenprint,color screenprint on paper,,1979-01-01,500007852
3,76.37.5.K,Single-coil Bracelet,Macedonia,WCMA-ANCIENT,,,750-250 BC,-750,-250,bracelet,bronze,,1976-01-01,not found
4,64.29.2,"The First Cornet, Streshnev",Grigory Borisov and Nikolaj Prusakov,WCMA-PRINTS,,,1928 (year film premiered),1928,1928,poster,poster,This is a poster for a war film set in the Cau...,1964-01-01,500084307
5,81.12.1.E,"Untitled (from ""Dogs"")",Dieter Roth,WCMA-PRINTS,,,1979,1979,1979,print,color print,,1981-01-01,500023079


In [83]:
#collection.profile_report()

In [84]:
# Non-null count per column, to see which fields are sparsely populated.
collection.count()

accession_number          16243
title                     16243
maker                     16185
classification            16243
culture                    1351
period                      127
creation_date             16102
creation_date_earliest    16243
creation_date_latest      16243
object_name               16243
medium                    16095
description                3173
accession_date            15567
ULAN                      16243
dtype: int64

# Cleaned list of top 50 most common mediums: 'top_mediums'

In [85]:
#find most common mediums
#create dictionary of medium:frequency by using each word in medium descriptions

mediums = {}
stopword = set(stopwords.words('english'))
# Build the punctuation-stripping table once instead of once per word.
strip_punctuation = str.maketrans('', '', string.punctuation)
# Rows without a medium are skipped: str(NaN) would otherwise be counted as the word "nan".
for medium_text in collection['medium'].dropna():
    for word in str(medium_text).split():
        word = word.translate(strip_punctuation).lower()
        # Tokens that were pure punctuation (e.g. "&") are empty after stripping; don't count them.
        if not word or word in stopword:
            continue
        mediums[word] = mediums.get(word, 0) + 1

#create sorted list of mediums so that we can iterate through most frequently used
#(sorted() is stable, so words with equal counts stay in first-seen order)
sorted_mediums = sorted(mediums.items(), key=lambda x : x[1], reverse=True)

In [86]:
#find top 50 mediums, add to list 'top_mediums'-- this is final cleaned list
# How many medium words to keep; change here to get a longer or shorter list.
top_medium_limit = 50
# Words of this length or shorter are skipped.
# NOTE(review): this also drops genuine three-letter mediums such as "oil" and
# "ink" -- confirm that is intended.
max_ignored_word_length = 3
basic_colors = ['color', 'white', 'gray', 'black', 'red', 'green', 'blue', 'yellow', 'purple', 'bronze', 'silver', 'gold']
not_medium = ['mounted', 'opaque', 'offset']
excluded_words = set(basic_colors) | set(not_medium)

# sorted_mediums is ordered by descending frequency, so the first
# `top_medium_limit` entries that survive the filters are the most common ones.
eligible_mediums = [
    entry for entry in sorted_mediums
    if len(entry[0]) > max_ignored_word_length and entry[0] not in excluded_words
]
# The first element is the CSV header row; the (word, count) pairs follow it.
top_mediums = [("medium","count")] + eligible_mediums[:top_medium_limit]
print(top_mediums)


[('medium', 'count'), ('paper', 5241), ('print', 2038), ('wood', 1230), ('photograph', 1208), ('lithograph', 1196), ('gelatin', 1186), ('etching', 1094), ('pencil', 920), ('engraving', 686), ('watercolor', 591), ('canvas', 551), ('poster', 474), ('newsprint', 399), ('screenprint', 303), ('aquatint', 292), ('drypoint', 269), ('book', 251), ('metal', 248), ('clay', 229), ('silkscreen', 225), ('woodcut', 223), ('ceramic', 217), ('graphite', 211), ('board', 210), ('panel', 192), ('glass', 178), ('brass', 174), ('gouache', 166), ('crayon', 155), ('charcoal', 148), ('faience', 145), ('terracotta', 145), ('polaroid', 142), ('stone', 139), ('leather', 137), ('woodblock', 134), ('silk', 132), ('frame', 130), ('wash', 127), ('polacolor', 127), ('albumen', 119), ('transfer', 116), ('collage', 110), ('inkjet', 110), ('pigment', 101), ('proof', 97), ('tempera', 86), ('acrylic', 86), ('collotype', 86), ('photogravure', 86)]


Write top mediums to csv

In [87]:
# Write every row of top_mediums (header row first) to top_mediums.csv.
# newline="" is passed to open() as the csv module documentation recommends for writers.
with open("top_mediums.csv", "w", newline="") as csv_file:
    csv.writer(csv_file).writerows(top_mediums)

# Selected columns for every piece: id (index), accession number, accession date, title, maker, medium, description, classification, and average creation date (NaN values are left in place at this stage)

In [88]:
# Columns carried forward from the full collection.
selected_columns = [
    "accession_number", "accession_date", "title", "maker", "medium",
    "description", "classification",
    "creation_date_earliest", "creation_date_latest",
]
selected_info = (
    collection[selected_columns]
    # average creation date = midpoint of the earliest/latest bounds
    .assign(
        creation_date_avg=lambda frame: (
            frame['creation_date_earliest'] + frame['creation_date_latest']
        ) / 2
    )
    # the two bounds are no longer needed once the midpoint exists
    .drop(columns=['creation_date_earliest', 'creation_date_latest'])
    # remove rows that are identical in every column, keeping the first occurrence
    .drop_duplicates(subset=None, keep='first')
)

selected_info.head()

Unnamed: 0_level_0,accession_number,accession_date,title,maker,medium,description,classification,creation_date_avg
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,39.1.400,1939-01-01,GRANDE MATINÉE DE BIENFAISANCE EN L'HONNEUR DE...,Adrien Barrère,poster,,WCMA-PRINTS,1917.0
2,79.22,1979-01-01,Men of Europe,Ronald B. Kitaj,color screenprint on paper,,WCMA-PRINTS,1972.0
3,76.37.5.K,1976-01-01,Single-coil Bracelet,Macedonia,bronze,,WCMA-ANCIENT,-500.0
4,64.29.2,1964-01-01,"The First Cornet, Streshnev",Grigory Borisov and Nikolaj Prusakov,poster,This is a poster for a war film set in the Cau...,WCMA-PRINTS,1928.0
5,81.12.1.E,1981-01-01,"Untitled (from ""Dogs"")",Dieter Roth,color print,,WCMA-PRINTS,1979.0


In [89]:
# Render a profiling report for the selected columns.
# NOTE(review): profile_report() is presumably provided by the pandas_profiling import -- confirm.
selected_info.profile_report()



In [90]:
#remove NaNs
values = {"accession_number":"", "accession_date":"", "title":"", "maker":"", "medium":"", "description":"", "classification":"", "creation_date_avg":0}
cleaned = selected_info.fillna(value=values)

In [91]:
#maker: "unknown" to "" -- the code targets the literal string "unknown"
#(NOTE(review): the earlier comment said "untitled"; presumably a typo -- confirm)
# Only the maker column is changed, and only when the whole value is "unknown".
# An unanchored regex over the full frame also cut "unknown" out of the middle
# of titles, descriptions and mediums, and left fragments such as " artist"
# behind in maker.
cleaned["maker"] = cleaned["maker"].replace(to_replace=r"^unknown$", value="", regex=True)

In [92]:
# Save the cleaned frame to disk (the index is written too, since to_csv
# includes it by default), then preview the first rows.
cleaned.to_csv("cleaned_collection_data.csv")
cleaned.head()

Unnamed: 0_level_0,accession_number,accession_date,title,maker,medium,description,classification,creation_date_avg
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,39.1.400,1939-01-01,GRANDE MATINÉE DE BIENFAISANCE EN L'HONNEUR DE...,Adrien Barrère,poster,,WCMA-PRINTS,1917.0
2,79.22,1979-01-01,Men of Europe,Ronald B. Kitaj,color screenprint on paper,,WCMA-PRINTS,1972.0
3,76.37.5.K,1976-01-01,Single-coil Bracelet,Macedonia,bronze,,WCMA-ANCIENT,-500.0
4,64.29.2,1964-01-01,"The First Cornet, Streshnev",Grigory Borisov and Nikolaj Prusakov,poster,This is a poster for a war film set in the Cau...,WCMA-PRINTS,1928.0
5,81.12.1.E,1981-01-01,"Untitled (from ""Dogs"")",Dieter Roth,color print,,WCMA-PRINTS,1979.0


# New dataframe containing only the pieces with little associated info (unknown maker or unknown creation date; a creation date range is OK), for the static visualization

In [93]:
def all_unknown(maker, create_date):
    """Return True only when both fields hold their "unknown" placeholder.

    maker: maker value; the empty string is treated as unknown.
    create_date: creation date value; 0.0 is treated as unknown.

    Returns a plain bool: True when maker == '' and create_date == 0.0,
    otherwise False.
    """
    # bool() keeps the result a plain True/False even if a comparison returns
    # some other truthy/falsy object.
    return bool(maker == '' and create_date == 0.0)

In [94]:
# Boolean masks for the two "missing info" conditions.
maker_is_blank = cleaned['maker'] == ''
date_is_zero = cleaned['creation_date_avg'] == 0.0

no_maker = cleaned[maker_is_blank]
no_date = cleaned[date_is_zero]

# Stack both subsets (blank-maker rows first) with a fresh integer index, then
# drop rows that appear twice because they satisfy both conditions.
less_info_pieces = (
    pd.concat([no_maker, no_date], ignore_index=True)
    .drop_duplicates(subset=None, keep='first')
)

less_info_pieces.count()

accession_number     2838
accession_date       2838
title                2838
maker                2838
medium               2838
description          2838
classification       2838
creation_date_avg    2838
dtype: int64

In [95]:
# Save the low-information subset to disk.
# NOTE(review): index=None is falsy, so presumably no index column is written -- confirm.
less_info_pieces.to_csv("less_info_pieces.csv", index=None)

In [96]:
# Preview the first rows of the low-information subset.
less_info_pieces.head()

Unnamed: 0,accession_number,accession_date,title,maker,medium,description,classification,creation_date_avg
0,94.1.2.B,1994-01-01,illuminated manuscript border,,gold leaf and colors on parchment,on vellum,WCMA-DEC ARTS,0.0
1,95.4.51,1995-03-03,Photograph of Charles Prendergast,,photograph,,WCMA-Prendergast,1905.5
2,A.1.4,1994-04-08,"Photograph of ""Spirit of the Hunt"" by Charles ...",,photograph,,WCMA-Prendergast,1918.0
3,94.1.2.A,1994-01-01,illuminated manuscript border,,gold leaf and colors on parchment,on vellum,WCMA-DEC ARTS,0.0
4,RC.37.2,1995-05-01,Reproduction Barcelona Chair with cushions fro...,,metal and leather,,WCMA-Prendergast,1925.0
