In [1]:
import json, csv
import codecs, re
import random
import pandas as pd
# Notebook display settings: never truncate cell text (captions are long) and show up to 100 rows.
pd.options.display.max_colwidth = None
pd.options.display.max_rows = 100

from clean_json import read_danam_export, get_caption, metadata_from_json, metadata_from_caption, valid_caption, replace_w_json

In [2]:
# read DANAM json export
# Forward slashes are accepted as path separators on every platform; a backslash-separated
# literal would depend on unrecognised escape sequences and only resolve on Windows.
danam_export = "json/DANAM/Monument_2021-10-04_09-49-06.json"

# The context manager closes the file handle as soon as the JSON has been parsed.
with open(danam_export, 'r', encoding='utf-8') as export_file:
    danam = json.load(export_file)['business_data']['resources']

# collect only danam images
danam_images = read_danam_export(danam_export)

In [3]:
# Cleaning the captions and getting the metadata for every image record.

# Licence boilerplate that appears inside captions; it is stripped before the caption is parsed.
to_delete = "If not otherwise stated, all images and texts in this monument folder are published under Creative Commons Attribution 4.0 License (CC BY-SA 4.0), and the copyright lies with NHDP. All visuals of this monument folder and more are (or will be) also stored in heidICON, the object and multimedia database of Heidelberg University. (Type the ID-number or key words in the first line and click the search field.) You will also find the initial report there. The latest report will always be available in DANAM (this page)"

# The replacement table does not change between images: read it once, as UTF-8 like the
# export itself, and let the context manager close the file.
with open('json/dict/fixes.json', encoding='utf-8') as fixes_file:
    fixes = json.load(fixes_file)[0]

for image in danam_images:
    image['danam_caption'] = get_caption(image).replace("\n", '').replace(to_delete, "")
    image['empty_column'] = ""
    metadata_from_json(image, image)

    # The fixes are applied to a working copy only; 'danam_caption' keeps the cleaned source text.
    caption = replace_w_json(image['danam_caption'], fixes)

    # A caption is usable only if it passes valid_caption() and has at least three
    # ';'-separated parts.
    parts = caption.split(';')
    image['validCaption'] = bool(valid_caption(caption)) and len(parts) >= 3
    if image['validCaption']:
        metadata_from_caption(parts, image)

    image['lastModified'] = image['imagedata'][0]['lastModified']

    # Drop the bulky raw fields. pop() with a default removes each key independently, so one
    # missing key no longer leaves the remaining ones in place.
    for raw_key in ('imagedata', 'editorials', 'mon_ids'):
        image.pop(raw_key, None)

    # Remove keys without a value; collect them first because a dict must not change size
    # while it is being iterated.
    none_keys = [key for key, value in image.items() if value is None]
    for key in none_keys:
        del image[key]

In [None]:
# check random images from DANAM
# randrange() excludes the upper bound, so the index is always within the list;
# randint(0, len(...)) could return len(...) itself and raise IndexError.
i = random.randrange(len(danam_images))
print(danam_images[i].keys())
danam_images[i]

In [4]:
# One row per image record; columns are the union of the keys found in the records.
danam_df = pd.DataFrame(data=danam_images)

In [5]:
# Share of images whose caption passed validation. The printed values are fractions
# between 0 and 1, not values scaled to 100.
all_images = len(danam_df.index)
valid_captions = len(danam_df[danam_df['validCaption']].index)
valid_share = valid_captions / all_images

print("Percentage of images with valid captions: {}".format(valid_share))
print("Percentage of images with invalid captions: {}".format(1 - valid_share))


Percentage of images with valid captions: 0.6991106988924655
Percentage of images with invalid captions: 0.30088930110753453


In [6]:
# Look up all image rows of a single monument to judge whether its image set is complete.
mon = danam_df[danam_df['mon_id'] == 'KIR0024']
# Optional: uncomment to narrow the view to rows whose caption failed validation.
#mon = mon.loc[mon['validCaption']==False]
print(len(mon.index))
mon[['filename', 'validCaption', 'danam_caption']].sort_values(by='validCaption')

17


Unnamed: 0,filename,validCaption,danam_caption
11649,KIR0024_D_2021_floor_plan,True,Kutujhvaḥ Śikharakūṭa Caitya; floor plan by Anil Basukala; 2021-09-15
11663,KIR0024_P_20210922_08,True,"Kutujhvaḥ Śikharakūṭa Caitya, view from N; photo by Yogesh Budathoki; 2021-09-22"
11662,KIR0024_P_20210922_01,True,". Kutujhvaḥ Śikharakūṭa Caitya, wide view from E; photo by Yogesh Budathoki; 2021-09-22"
11661,KIR0024-004_P_20210907_01,True,"Kutujhvaḥ Śikharakūṭa Caitya, statue of Mañjuśrī, view from N; photo by Yogesh Budathoki; 2021-09-22"
11660,KIR0024-002_P_20210907_01,True,"Kutujhvaḥ Śikharakūṭa Caitya, statue of Vajrapāṇi, view from S; photo by Yogesh Budathoki; 2021-09-22"
11659,KIR0024_P_20210922_09,True,"Kutujhvaḥ Śikharakūṭa Caitya, view from NE; photo by Yogesh Budathoki; 2021-09-22"
11658,KIR0024_P_20210922_04,True,"Kutujhvaḥ Śikharakūṭa Caitya, view from S; photo by Yogesh Budathoki; 2021-09-22"
11664,KIR0024-001_P_20210907_01,True,"Kutujhvaḥ Śikharakūṭa Caitya, staute of Maitreya, view from E; photo by Yogesh Budathoki; 2021-09-22"
11657,KIR0024_P_20210922_03,True,"Kutujhvaḥ Śikharakūṭa Caitya, view from SE; photo by Yogesh Budathoki; 2021-09-22"
11655,KIR0024_P_20210922_07,True,"Kutujhvaḥ Śikharakūṭa Caitya, view from NW; photo by Yogesh Budathoki; 2021-09-22"


In [7]:
# Select the rows to upload: images of the listed monuments that also have a valid caption.
mon_ids = [
    'BAL0002', 'MUS0200', 'KAT0010', 'THK0022',
    'BAL0010', 'BAL0011', 'DLK0091', 'DLK0122',
    'DLK0200', 'DLK0140',
    'DLK0121', 'DLK0071', 'KAT2240', 'BKT0061',
    'KIR0067', 'KIR0038',
]

# Both conditions are combined into one boolean mask instead of filtering twice.
in_selection = danam_df['mon_id'].isin(mon_ids)
to_upload = danam_df[in_selection & danam_df['validCaption']]

In [8]:
# Writing to CSV
# Column order of the export file. The file has no header row, so this order is the
# only thing that defines which field is which.
cols = [
        'filename', 'caption', 'date1', 'date2', 'date', 'date3', 'agent', 'role', 'agent2', 'role2',
        'copyright', 'source', 'empty_column', 'notes', 'mon_id', 'class_code', 'classification', 'agent3', 'date_scan',
        'license', 'url', 'rights_text', 'heidata', 'heidoc'
        ]

# reindex() selects the columns in export order and creates any that the frame lacks as
# empty columns. Passing columns=cols to to_csv() raises KeyError for a missing column,
# which can happen because None-valued keys are removed from the image records.
# When every column exists the written file is the same as before.
to_upload.reindex(columns=cols).to_csv(
    "image_metadata.csv",
    header=False,
    sep=';',
    index=False,
    quotechar="\"",
    quoting=csv.QUOTE_ALL,
)