# Set-up

In [None]:
# libraries
import re
import numpy as np
import pandas as pd
from pymongo import MongoClient

In [None]:
# open a client with the driver defaults (the local MongoDB instance)
client = MongoClient()

# handle to the "moma" database
db = client["moma"]

# handle to the "artworks" collection of that database
artworks = db["artworks"]

# show the database / collection objects we are going to work with
connection_report = """
Database
==========
{}

Collection
==========
{}
""".format(db, artworks)
print(connection_report, flush=True)

## Data

![MoMA](https://images.musement.com/cover/0001/31/museum-of-modern-art-moma_header-30520.jpeg?w=1200&h=630&q=95&fit=crop)

In [None]:
# location of the Artworks.csv export in the MuseumofModernArt/collection repository
ARTWORKS_CSV_URL = 'https://media.githubusercontent.com/media/MuseumofModernArt/collection/main/Artworks.csv'
df = pd.read_csv(ARTWORKS_CSV_URL)

# column names, dtypes and non-null counts of the raw frame
df.info()

# Loading

In [None]:
%%time 
# slow loading of data: every row is read cell by cell and sent with its own insert_one
d = {}

# id column: the Cataloged flag followed by the ObjectID rendered as text
df['_id'] = df["Cataloged"] + df["ObjectID"].astype(str)

# layout of the embedded documents: target key -> source column
artist_columns = {
    "id": "ConstituentID",
    "Name": "Artist",
    "Bio": "ArtistBio",
    "Nationality": "Nationality",
    "Birth": "BeginDate",
    "Death": "EndDate",
    "Gender": "Gender",
}
characteristic_columns = {
    "Medium": 'Medium',
    "Dimensions": 'Dimensions',
    "Circumference": 'Circumference (cm)',
    "Depth": 'Depth (cm)',
    "Diameter": 'Diameter (cm)',
    "Height": 'Height (cm)',
    "Length": 'Length (cm)',
    "Weight": 'Weight (kg)',
    "Width": 'Width (cm)',
    "Seat Height": 'Seat Height (cm)',
    "Duration": 'Duration (sec.)',
}
acquisition_columns = {
    "Date": "DateAcquired",
    "CreditLine": "CreditLine",
    "Number": "AccessionNumber",
}

# pass data
for row_label in df.index:
    # `d` is rebound on every iteration, so after the loop it holds the last
    # document that was inserted.
    d = {
        "_id": df.loc[row_label, "_id"],
        "Title": df.loc[row_label, "Title"],
        "Date": df.loc[row_label, "Date"],
        "Artist": {
            key: df.loc[row_label, column]
            for key, column in artist_columns.items()
        },
        "Characteristics": {
            key: df.loc[row_label, column]
            for key, column in characteristic_columns.items()
        },
        "Acquisition": {
            key: df.loc[row_label, column]
            for key, column in acquisition_columns.items()
        },
        "Classification": df.loc[row_label, "Classification"],
        "Cataloged": df.loc[row_label, "Cataloged"],
        "Department": df.loc[row_label, "Department"],
        "URL": df.loc[row_label, "URL"],
        "ImageURL": df.loc[row_label, "ImageURL"],
    }
    artworks.insert_one(d)

In [None]:
# Artist as an array
# Rows whose Artist cell contains a comma are candidates for holding several artists.
dt = df[df['Artist'].str.contains(',', na=False)]


def _comma_items(value):
    """Split a ", "-separated cell into its items; [] when the cell is not a string (NaN)."""
    return value.split(', ') if isinstance(value, str) else []


def _paren_items(value):
    """Return the text inside every "(...)" group of a cell; [] when the cell is not a string."""
    return re.findall(r'\(([^)]*)\)', value) if isinstance(value, str) else []


artist_source_columns = [
    '_id', 'ConstituentID', 'Artist', 'Nationality', 'BeginDate', 'EndDate', 'Gender'
]

# push array
for _, row in dt[artist_source_columns].iterrows():
    ids = _comma_items(row['ConstituentID'])
    if len(ids) < 2:
        # A single constituent whose name merely contains a comma (or a missing
        # id): keep the embedded single-artist document untouched instead of
        # replacing it with a truncated name.
        continue

    # One list per attribute, all expected to be aligned by artist position.
    per_artist = {
        'id': ids,
        'Name': _comma_items(row['Artist']),
        'Nationality': _paren_items(row['Nationality']),
        'Birth': _paren_items(row['BeginDate']),
        'Death': _paren_items(row['EndDate']),
        'Gender': _paren_items(row['Gender']),
    }

    # An attribute list that is shorter than the id list (or missing entirely)
    # simply leaves that key out for the affected artist rather than raising.
    artists = [
        {key: items[n] for key, items in per_artist.items() if n < len(items)}
        for n in range(len(ids))
    ]

    # The array is built completely before the document is touched and is
    # written with one $set, so a parsing problem can no longer leave a
    # document with its Artist field removed and only partly re-filled.
    artworks.update_one({'_id': row['_id']}, {'$set': {'Artist': artists}})

In [None]:
# for further reference https://docs.mongodb.com/manual/reference/command/collStats/
stats = db.command("collstats", "artworks")

# sizes as reported by collStats, divided by 10**6 to express them in MB
us = stats.get('size')/10**6
cs = stats.get('storageSize')/10**6

stats_report = """
Namespace: {}

Document Count: {}

Uncompressed data size: {} MB

Compressed data size: {} MB

"""
print(
    stats_report.format(stats.get('ns'), stats.get('count'), us, cs),
    flush=True,
)

## Cleaning

In [None]:
# get key names
# Flatten the layout of `d` (the last document built by the loading loop) into
# field paths: "Parent.child" for every key of an embedded document, the plain
# key for everything else.
key_list = []
for key, value in d.items():
    if isinstance(value, dict):
        # An explicit type check replaces the former bare `except:`, which
        # would also have hidden any unrelated error raised in the loop.
        key_list.extend(str(key) + '.' + str(child) for child in value.keys())
    else:
        key_list.append(key)

In [None]:
# unset NaN fields
# one update_many per field path: drop the field wherever its value is NaN
unset_report = """
    Key: {}
    Matched: {}
    Modified: {}
    ------------
    """
for field in key_list:
    nan_filter = {str(field): np.nan}
    drop_field = {"$unset": {str(field): ""}}
    update = artworks.update_many(nan_filter, drop_field)
    print(
        unset_report.format(field, update.matched_count, update.modified_count),
        flush=True,
    )

In [None]:
# for further reference https://docs.mongodb.com/manual/reference/command/collStats/
stats = db.command("collstats", "artworks")

# size after cleaning, in MB; `us` is the size measured before the NaN fields were unset
us_ = stats.get('size')/10**6
size_drop = round(us-us_, 2)

cleaned_report = """
Namespace: {}

Document Count: {}

Size: {}

Var. Size: {}

"""
print(
    cleaned_report.format(stats.get('ns'), stats.get('count'), us_, size_drop),
    flush=True,
)

## Further Cleaning

In [None]:
# change data type
# Dates made only of digits are converted to integers on the server with a
# pipeline-style update. The pattern requires at least one digit ("+"): the
# previous "*" also matched an empty string, which $toInt cannot convert and
# which would make the whole update_many fail.
update = artworks.update_many(
    {"Date": {"$regex": '^[0-9]+$'}},
    [{"$set": {"Date": {"$toInt": "$Date"}}}],
)

print("""
    Key: {}
    Matched: {}
    Modified: {}
    ------------
    """.format("Date", update.matched_count, update.modified_count), flush=True)

In [None]:
# create an array field to store ranges
def _parse_date(text):
    """Convert one textual Date into its numeric form, or None when unsupported.

    "1950-1955" / "1950–1955" -> [1950, 1955]
    "1950-55"   / "1950–55"   -> [1950, 1955]  (end year borrows the start's century)
    "1898-02"                 -> [1898, 1902]  (end before start: next century)
    "c. 1950"                 -> 1950
    """
    # hyphen or en dash between a four-digit start and a four-digit end
    full = re.fullmatch(r'([0-9]{4})[-–]([0-9]{4})', text)
    if full:
        return [int(full.group(1)), int(full.group(2))]

    # abbreviated end year
    short = re.fullmatch(r'([0-9]{4})[-–]([0-9]{2})', text)
    if short:
        start = int(short.group(1))
        end = int(short.group(1)[:2] + short.group(2))
        if end < start:
            # The range crosses a century boundary.
            end += 100
        return [start, end]

    # perform some further cleaning: "c. YYYY" keeps only the year
    circa = re.fullmatch(r'c\. ([0-9]{4})', text)
    if circa:
        return int(circa.group(1))

    return None


# Server-side pre-filter covering the same shapes _parse_date understands.
# The dot after "c" is escaped so that only a literal "c. " prefix matches.
DATE_FILTER = r'^([0-9]{4}(-|–)([0-9]{4}|[0-9]{2})|c\. [0-9]{4})$'

matched = 0
modified = 0
# The cursor is materialised first so no document is rewritten while the
# query that selected it is still being read; only the Date field is fetched.
for doc in list(artworks.find({"Date": {"$regex": DATE_FILTER}}, {"Date": 1})):
    parsed = _parse_date(doc["Date"])
    if parsed is None:
        continue
    # Filter on the stored _id as-is (no str() cast) so it matches whatever its type is.
    update = artworks.update_one({"_id": doc["_id"]}, {"$set": {"Date": parsed}})
    matched += update.matched_count
    modified += update.modified_count

# one summary line (matched, modified) instead of one line per document
print(matched, modified)

In [None]:
# remove Unknown or n.d.
# placeholder strings that carry no date information
placeholder_dates = ["n.d.", "Unknown", "unknown"]
update = artworks.update_many(
    {"Date": {"$in": placeholder_dates}},
    {"$unset": {"Date": ""}},
)
placeholder_report = """
    Matched: {}
    Modified: {}
    """
print(
    placeholder_report.format(update.matched_count, update.modified_count),
    flush=True,
)

In [None]:
# show every document whose Date is still stored as a string (only _id and Date are fetched)
remaining_string_dates = artworks.find({"Date": {"$type": "string"}}, {"Date": 1})
for doc in remaining_string_dates:
    print(doc)

# Aggregation and loading

In [None]:
# collection: second collection of the same database, used for the bulk load
artw = db["artw"]

# show the database / collection objects in use
artw_report = """
Database
==========
{}

Collection
==========
{}
""".format(db, artw)
print(artw_report, flush=True)

In [None]:
# df to dict
# Drop the trailing dot from the duration column name before export.
# NOTE(review): presumably because "." inside a MongoDB field name is read as a
# path separator — confirm.
column_fixes = {'Duration (sec.)': 'Duration (sec)'}
df.rename(columns=column_fixes, inplace=True)

# one plain dict per row, keyed by column name
dd = df.to_dict(orient='records')

# peek at the first record
dd[0]

In [None]:
%%time
# insert array: send all flat records at once
insert = artw.insert_many(dd)

# define the pipeline
# $project reshapes every flat record into the nested layout used by the
# `artworks` collection; $out then writes the result back over `artw`.
pipeline = [
    {"$project":
     {
         "_id": {"$concat": ["$Cataloged", {"$toString": "$ObjectID"}]},
         "Title": "$Title",
         "Date": "$Date",
         "Artist": {
             # ConstituentID was previously dropped here, leaving this
             # collection without the Artist.id field that the row-by-row
             # loader stores and that the key list refers to.
             "id": "$ConstituentID",
             "Name": "$Artist",
             'Bio': "$ArtistBio",
             'Nationality': "$Nationality",
             "Birth": "$BeginDate",
             "Death": "$EndDate",
             "Gender": "$Gender",
         },
        "Characteristics":{
            "Medium": '$Medium',
            "Dimensions": '$Dimensions',
            "Circumference": '$Circumference (cm)',
            "Depth": '$Depth (cm)',
            "Diameter": '$Diameter (cm)',
            "Height": '$Height (cm)',
            "Length": '$Length (cm)',
            "Weight": '$Weight (kg)',
            "Width": '$Width (cm)',
            "Seat Height": '$Seat Height (cm)',
            "Duration": '$Duration (sec)'
        },
        "Acquisition": {
            "Date": "$DateAcquired",
            "CreditLine": "$CreditLine",
            "Number": "$AccessionNumber"
        },
        "Classification": "$Classification",
        "Cataloged": "$Cataloged",
        "Department": "$Department",
        "URL": "$URL",
        "ImageURL": "$ImageURL"
     }
    },
    { "$out" : "artw" }
]

# perform the aggregation
agr = artw.aggregate(pipeline)

In [None]:
# unset field with null values
# An explicit loop replaces all(<generator>): all() stops at the first falsy
# item, so it only visited every key as long as each result happened to be
# truthy, and the cell merely displayed True.
for field in key_list:
    update = artw.update_many({str(field): np.nan}, {"$unset": {str(field): ""}})
    # field path, documents matched, documents modified
    print(field, update.matched_count, update.modified_count)