# DANAM Image Metadata Notebook
This notebook is used along with the scripts clean_json and write_csv to query and analyze DANAM's image metadata quickly.

Queries is done via Pandas Dataframe.

DataFrame can be checked using VSCode's variable viewer


In [1]:
import csv
import pandas as pd
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 1000)

#from datetime import datetime

from scripts.clean_json import clean_json
from scripts.write_csv import list_from_txt
from scripts.metadata_fix import manual_fixes


## Read DANAM json export
Always replace with the latest export

In [2]:
# read DANAM json export
danam_export = "json\DANAM\Monument_2022-04-27_04-41-59.json"
danam_images = clean_json(danam_export)
danam_df = pd.DataFrame(danam_images)

In [3]:
'''
Review metadata extracted from DANAM before uploading and replace faulty metadata using a CSV file

    df = Pandas DataFrame containing metadata of select images from DANAM
    fixes = a CSV file containing fixes for the metadata. This CSV file has the format 
            row_number,column_name,correct_value
    checked = if False, takes select columns to prepare metadata for review using variable viewer. 
              if True, metadata is fixed using prepared CSV defined in fixes.
'''
def check_metadata(df, fixes, checked):
    cols = [
        'danam_caption', 'caption', 'date', 'date3', 'agent', 'role', 'agent2', 'role2', 'source', 'notes', 'agent3', 'date_scan',
        ]
    manual_fixes(df, fixes)
    if not checked:
        df = df[cols]
        print("PLEASE CHECK USING VARIABLE VIEW")
    else: 
        print("READY TO UPLOAD")
    return df

## Preparing Metadata Uploads
### Metadata of monuments in upload_current.mon

In [4]:
# find recently updated monuments
mon_ids = list_from_txt('mon/upload_all.mon')

# Filter metadata according to current.mon and valid caption
upload_all = danam_df.loc[danam_df['mon_id'].isin(mon_ids)]
upload_all = upload_all.loc[upload_all['validCaption']]


## manual fixes start ##
# set to True if ready to upload
# set to False to review metadata before uploading
upload_all = check_metadata(upload_all,"fixes\\all.fix",True)
## manual fixes end ##


READY TO UPLOAD


### Metadata of recently updloaded maps

In [5]:
only_maps = list_from_txt("mon\\upload_only_maps.mon")

upload_map = danam_df.loc[danam_df['mon_id'].isin(only_maps)]
upload_map = upload_map.loc[upload_map['validCaption']]
upload_map = upload_map.loc[upload_map['filename'].str.contains("_D_")]

## manual fixes start ##
upload_map = check_metadata(upload_map,"fixes\\maps.fix",True)
## manual fixes end ##


READY TO UPLOAD


### Metadata of only recently uploaded images (no maps)

In [6]:
only_images = list_from_txt("mon\\upload_only_images.mon")

upload_images = danam_df.loc[danam_df['mon_id'].isin(only_images)]
upload_images = upload_images.loc[upload_images['validCaption']]

# Set this query to include everything from a monument BUT maps 
upload_images = upload_images.loc[upload_images['filename'].str.contains("_D_") == False]

## manual fixes start ##
upload_images = check_metadata(upload_images,"fixes\\images.fix",True)
## manual fixes end ##



READY TO UPLOAD


### Metadata of recently updloaded historical images

In [7]:
only_historical = list_from_txt("mon\\upload_only_historical.mon")

upload_historical = danam_df.loc[danam_df['mon_id'].isin(only_historical)]
upload_historical = upload_historical.loc[upload_historical['validCaption']]
upload_historical = upload_historical.loc[upload_historical['filename'].str.contains("_H_")]

## manual fixes start ##
upload_historical = check_metadata(upload_historical,"fixes\\historical.fix",True)
## manual fixes end ##


READY TO UPLOAD


### Finding recently updated monuments

In [8]:
from datetime import datetime

recent = danam_df.loc[danam_df['lastModified'] >= datetime(2022, 4, 15)]
print("Number of recently updated monuments: {}".format(recent.shape[0]))
recent_mon_ids = set(list(set(recent['mon_id'])))

uploaded = list_from_txt('mon\\sds.mon')
to_update_mon = [mon for mon in recent_mon_ids if mon in uploaded and mon not in mon_ids and mon not in only_maps and mon not in only_historical]
print("Number of those monuments already uploaded to HeidIcon that are not in current.mon: {}".format(len(to_update_mon)))

file = open("mon\\recently_changed.mon", 'w')
for mon_id in to_update_mon:
    file.write(mon_id+"\n")
file.close()

to_update = danam_df.loc[danam_df['mon_id'].isin(to_update_mon)]
to_update = to_update.loc[to_update['validCaption']]    
to_update = to_update.loc[to_update['lastModified'] > datetime(2022, 2, 15)]

## manual fixes start ##
to_update= check_metadata(to_update,"fixes\\update.fix",True)
## manual fixes end ##


Number of recently updated monuments: 271
Number of those monuments already uploaded to HeidIcon that are not in current.mon: 24
READY TO UPLOAD


## Exporting Results to CSV for Weekly Metadata Transfer

In [9]:
# Writing to CSV 
cols = [
        'filename', 'caption', 'date1', 'date2', 'date', 'date3', 'agent', 'role', 'agent2', 'role2',
        'copyright', 'source', 'empty_column', 'notes', 'mon_id', 'class_code', 'classification', 'agent3', 'date_scan',
        'license', 'url', 'rights_text', 'heidata', 'heidoc'
        ]

all_upload = pd.concat([upload_all, upload_map, upload_historical, upload_images, to_update])
all_upload.to_csv("csv/image_metadata_.csv", columns=cols, header=True, sep=';', index=False, quotechar = "\"", quoting=csv.QUOTE_ALL)
