In [1]:
import json, csv
import codecs
import pandas as pd
# Display configuration: never truncate cell text (captions are long) and
# show up to 100 rows when a frame is rendered.
pd.options.display.max_colwidth = None
pd.options.display.max_rows = 100

from datetime import datetime

from clean_json import read_danam_export, get_caption, metadata_from_json, metadata_from_caption, valid_caption, replace_w_json
from write_csv import list_from_txt


In [2]:
# Read the DANAM JSON export and keep only the 'resources' entry found under
# 'business_data'.
# The path uses forward slashes, which work on Windows as well as POSIX and
# match the other paths in this notebook; the former "json\DANAM\..." literal
# only worked because "\D" and "\M" are (deprecated) invalid escape sequences.
danam_export = "json/DANAM/Monument_2021-11-29_00-10-20.json"
# The with-block closes the export file once it has been parsed.
with open(danam_export, 'r', encoding='utf-8') as export_file:
    danam = json.load(export_file)
danam = danam['business_data']['resources']



In [3]:
# collect only danam images
danam_images = read_danam_export(danam_export)

# Boilerplate notice that is stripped from every caption before parsing.
to_delete = "If not otherwise stated, all images and texts in this monument folder are published under Creative Commons Attribution 4.0 License (CC BY-SA 4.0), and the copyright lies with NHDP. All visuals of this monument folder and more are (or will be) also stored in heidICON, the object and multimedia database of Heidelberg University. (Type the ID-number or key words in the first line and click the search field.) You will also find the initial report there. The latest report will always be available in DANAM (this page)"

# The caption fixes do not change between images: load them once and close the
# file, instead of re-opening json/dict/fixes.json on every iteration.
# NOTE(review): assumes replace_w_json does not modify `fixes` - confirm.
with open('json/dict/fixes.json') as fixes_file:
    fixes = json.load(fixes_file)[0]

# cleaning and getting metadata
for image in danam_images:
    # Caption without line breaks and without the boilerplate notice.
    image['danam_caption'] = get_caption(image).replace("\n", '')
    image['danam_caption'] = image['danam_caption'].replace(to_delete, "")
    image['empty_column'] = ""
    metadata_from_json(image, image)

    # One-off correction for a single file whose name carries a "_01" suffix.
    if image['filename'] == 'KIR0067_I_003_20210907_01':
        image['filename'] = image['filename'].replace("7_01", "7")
        image['filename_danam'] = image['filename']

    # Apply the known fixes, then validate the corrected caption.
    caption = replace_w_json(image['danam_caption'], fixes)
    image['validCaption'] = valid_caption(caption)

    # Caption metadata is only extracted from valid captions that have at
    # least three ';'-separated parts; everything else is marked invalid.
    parts = caption.split(';')
    if image['validCaption'] and len(parts) >= 3:
        metadata_from_caption(parts, image)
    else:
        image['validCaption'] = False

    # The raw value is divided by 1000 before conversion (presumably epoch
    # milliseconds - confirm). The result is a naive local-time datetime.
    timestamp = int(image['imagedata'][0]['lastModified'])
    image['lastModified'] = datetime.fromtimestamp(timestamp/1000)

    # Drop helper fields that must not become DataFrame columns; pop() with a
    # default tolerates a missing key without a bare `except`.
    image.pop('imagedata', None)
    image.pop('mon_ids', None)

    # Remove fields that carry no value (collected first, because a dict must
    # not change size while it is being iterated).
    none_keys = [key for key, value in image.items() if value is None]
    for key in none_keys:
        del image[key]

danam_df = pd.DataFrame(danam_images)

In [38]:
# Show which fields the first cleaned image record ended up with.
first_image = danam_images[0]
first_image.keys()


dict_keys(['4b84aa48-9eea-11e9-8b93-0242ac120006', '4b84aef8-9eea-11e9-8b93-0242ac120006', '4b84bd80-9eea-11e9-8b93-0242ac120006', 'danam_caption', 'empty_column', 'filename_danam', 'filetype', 'filename', 'mon_id', 'classification', 'notes', 'heidoc', 'heidata', 'validCaption', 'caption', 'date1', 'date2', 'date', 'date3', 'agent', 'role', 'agent2', 'role2', 'copyright', 'source', 'class_code', 'agent3', 'date_scan', 'license', 'url', 'rights_text', 'lastModified'])

In [5]:
# Share of images whose caption passed validation, relative to all images.
valid_captions = danam_df.loc[danam_df['validCaption']].shape[0]
all_images = danam_df.shape[0]

if all_images:
    valid_share = valid_captions / all_images
    # "{:.2%}" multiplies by 100 and appends "%", so the printed number really
    # is the percentage the label announces (previously a 0-1 fraction).
    print("Percentage of images with valid captions: {:.2%}".format(valid_share))
    print("Percentage of images with invalid captions: {:.2%}".format(1 - valid_share))
else:
    # An empty frame would otherwise raise ZeroDivisionError.
    print("No images loaded; cannot compute caption percentages.")


Percentage of images with valid captions: 0.722950736671561
Percentage of images with invalid captions: 0.27704926332843904


In [19]:
# Look up a single monument in the metadata table and list its images whose
# caption did not validate (to check whether the monument is complete).
mon_id = 'KAT3260'
# Match the id as typed as well as its upper-case spelling.
wanted_ids = [mon_id, mon_id.upper()]
mon = danam_df[danam_df['mon_id'].isin(wanted_ids)]
#mon = mon.loc[mon['filename'].str.contains("_D_")]
mon = mon[mon['validCaption'].eq(False)]
print(len(mon))
mon.sort_values('validCaption')[['validCaption', 'filename', 'danam_caption']]
#mon[[ 'validCaption', 'filename', 'danam_caption', 'date1', 'date2', 'date', 'date3', 'agent', 'role']].sort_values('validCaption')

4


Unnamed: 0,validCaption,filename,danam_caption
18971,False,KAT3260-012_P_20150803,"Śāntipura, mural on the east wall, view from W; photo by Ludovic Dusuzeau: 2015-08-03"
19075,False,KAT3260_D_2021_floor_plan,"Śāntipura, floor map by Ludovic Dusuzeau; 2015; updated by Thomas Schrom, 2021-07"
19097,False,KAT3260_H_2000c_Shakya_01,"Śāntipura, view from S; ca. 2000; courtesy of Shakya H. R; free access – no reuse; source: Shakya H. R, Śrī Svayaṃbhū Mahācaitya, 2004"
19098,False,KAT3260_H_2000c_Shakya_02,"Śāntipura, view from S; ca. 2000; courtesy of Shakya H. R; free access – no reuse; source: Shakya H. R, Śrī Svayaṃbhū Mahācaitya, 2004"


In [34]:
# find recently updated monuments
# Only images modified after this date count as recent.
modified_since = '2021-11-01'
recent = danam_df.loc[danam_df['lastModified'] > modified_since]
print(recent.shape[0])
# Drop missing ids: a NaN entry would make the string concatenation below
# raise a TypeError.
recent_mon_ids = set(recent['mon_id'].dropna())
# Forward slashes work on every platform; the with-block closes the file even
# if a write fails. Ids are written in sorted order so the file content is
# reproducible between runs (set iteration order is not).
with open("mon/recently_changed.mon", 'w') as mon_file:
    for mon_id in sorted(recent_mon_ids):
        mon_file.write(mon_id+"\n")
    

829


In [48]:
# Select the rows to upload: images whose monument id is listed in
# mon/current.mon and whose caption is valid.
mon_ids = list_from_txt('mon/current.mon')

listed_in_current = danam_df['mon_id'].isin(mon_ids)
to_upload = danam_df[listed_in_current]
to_upload = to_upload[to_upload['validCaption']]

In [49]:
# Export the selected rows to CSV: no header row, no index, ';' as separator
# and every field wrapped in double quotes. Because no header is written, the
# order of `cols` alone determines the column layout of the file.
cols = (
    ['filename', 'caption']
    + ['date1', 'date2', 'date', 'date3']
    + ['agent', 'role', 'agent2', 'role2']
    + ['copyright', 'source', 'empty_column', 'notes']
    + ['mon_id', 'class_code', 'classification']
    + ['agent3', 'date_scan']
    + ['license', 'url', 'rights_text']
    + ['heidata', 'heidoc']
)

to_upload.to_csv(
    "csv/image_metadata.csv",
    sep=';',
    columns=cols,
    header=False,
    index=False,
    quoting=csv.QUOTE_ALL,
    quotechar='"',
)