In [None]:
import json, csv
import codecs
import random
import pandas as pd
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 100)

from datetime import datetime


from clean_json import read_danam_export, get_caption, metadata_from_json, metadata_from_caption, valid_caption, replace_w_json
from write_csv import list_from_txt


In [None]:
# Read the DANAM JSON export and keep only its 'resources' entry.
# Forward slashes keep the path portable (they also work on Windows) and avoid
# the invalid "\D" / "\M" escape sequences of the backslash spelling; this also
# matches the other relative paths used in this notebook.
danam_export = "json/DANAM/Monument_2021-10-18_21-59-44.json"
# The context manager closes the file handle as soon as the export is parsed.
with codecs.open(danam_export, 'r', 'utf-8') as export_file:
    danam = json.load(export_file)
danam = danam['business_data']['resources']



In [None]:
# Collect only the DANAM images from the export.
danam_images = read_danam_export(danam_export)

# Licence boilerplate that is stripped from every caption (loop-invariant).
to_delete = "If not otherwise stated, all images and texts in this monument folder are published under Creative Commons Attribution 4.0 License (CC BY-SA 4.0), and the copyright lies with NHDP. All visuals of this monument folder and more are (or will be) also stored in heidICON, the object and multimedia database of Heidelberg University. (Type the ID-number or key words in the first line and click the search field.) You will also find the initial report there. The latest report will always be available in DANAM (this page)"

# Load the caption replacement table once instead of re-opening the file for
# every image; the context manager closes the handle.
# NOTE(review): assumes replace_w_json does not mutate `fixes` -- confirm.
with open('json/dict/fixes.json') as fixes_file:
    fixes = json.load(fixes_file)[0]

# Cleaning and getting metadata, one image record at a time (records are
# modified in place).
for image in danam_images:
    # Caption without line breaks and without the licence boilerplate.
    image['danam_caption'] = get_caption(image).replace("\n", '').replace(to_delete, "")
    image['empty_column'] = ""
    metadata_from_json(image, image)

    # One-off correction for a single known filename: drop its "_01" suffix.
    if image['filename'] == 'KIR0067_I_003_20210907_01':
        image['filename'] = image['filename'].replace("7_01", "7")
        image['filename_danam'] = image['filename']

    # Apply the replacement table to the caption before validating it.
    caption = replace_w_json(image['danam_caption'], fixes)
    image['validCaption'] = valid_caption(caption)

    # Caption metadata is only extracted from valid captions that have at
    # least three ';'-separated parts; everything else is flagged as invalid.
    parts = caption.split(';')
    if image['validCaption'] and len(parts) >= 3:
        metadata_from_caption(parts, image)
    else:
        image['validCaption'] = False

    # The raw value is divided by 1000 before conversion, i.e. it is treated
    # as a millisecond timestamp.
    timestamp = int(image['imagedata'][0]['lastModified'])
    image['lastModified'] = datetime.fromtimestamp(timestamp / 1000)

    # Drop the nested source structures. pop() with a default removes each key
    # independently, so one missing key no longer prevents the others from
    # being removed (the former bare `except: pass` around three `del`s did).
    for key in ('imagedata', 'editorials', 'mon_ids'):
        image.pop(key, None)

    # Remove keys whose value is None. The keys are collected first so the
    # dict is not modified while it is being iterated.
    for key in [key for key, value in image.items() if value is None]:
        del image[key]

danam_df = pd.DataFrame(danam_images)

In [None]:
# Show one randomly chosen image record.
# random.choice always returns an existing element; random.randint(0, len(...))
# includes the upper bound and could index one position past the end of the list.
img = random.choice(danam_images)
img

In [None]:
# Share of images whose caption was flagged as valid.
valid_captions = danam_df.loc[danam_df['validCaption']].shape[0]
all_images = danam_df.shape[0]
valid_share = valid_captions / all_images

# The messages announce percentages, so format the fractions as percentages
# (e.g. 0.8512 is printed as 85.12%) instead of printing the raw ratio.
print("Percentage of images with valid captions: {:.2%}".format(valid_share))
print("Percentage of images with invalid captions: {:.2%}".format(1 - valid_share))


In [None]:
# Look up all metadata rows of a single monument to check whether it is complete.
mon = danam_df[danam_df['mon_id'] == 'BKT0100']
# Optional narrowing filters (uncomment as needed):
# mon = mon.loc[mon['danam_caption'].str.contains("struts")]
# mon = mon.loc[mon['validCaption']==False]
print(len(mon))

# Columns shown in the overview, in display order.
inspect_cols = [
    'validCaption', 'danam_caption', 'caption',
    'date1', 'date2', 'date', 'date3',
    'agent', 'role', 'agent2', 'role2',
    'source', 'agent3', 'date_scan',
]
mon[inspect_cols].sort_values(by='date').head(10)

In [None]:
# Display a single record, selected by its integer position in the metadata table.
row_position = 17367
danam_df.iloc[row_position]

In [None]:
# Print the filenames of the selected monument, sorted ascending, as
# space-separated CSV text without the index.
mon_overview = mon[['filename', 'validCaption', 'danam_caption']]
sorted_filenames = mon_overview.sort_values(by='filename')['filename']
print(sorted_filenames.to_csv(sep=' ', index=False))

In [None]:
# Keep only rows that belong to one of the listed monuments and whose caption
# was flagged as valid.
mon_ids = list_from_txt('log/id_monument.txt')

in_listed_monuments = danam_df['mon_id'].isin(mon_ids)
has_valid_caption = danam_df['validCaption']
to_upload = danam_df.loc[in_listed_monuments & has_valid_caption]

In [None]:
# Export the selected rows as a headerless, semicolon-separated CSV in which
# every field is quoted. The order of `cols` is the column order in the file.
cols = [
    'filename', 'caption',
    'date1', 'date2', 'date', 'date3',
    'agent', 'role', 'agent2', 'role2',
    'copyright', 'source', 'empty_column', 'notes',
    'mon_id', 'class_code', 'classification',
    'agent3', 'date_scan',
    'license', 'url', 'rights_text', 'heidata', 'heidoc',
]

to_upload.to_csv(
    "image_metadata.csv",
    sep=';',
    columns=cols,
    header=False,
    index=False,
    quotechar='"',
    quoting=csv.QUOTE_ALL,
)