# DANAM Image Metadata Notebook
This notebook is used along with the scripts clean_json and write_csv to query and analyze DANAM's image metadata quickly.

Queries are done via pandas DataFrames.


In [1]:
import csv
import pandas as pd
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 1000)

#from datetime import datetime

from scripts.clean_json import clean_json
from scripts.write_csv import list_from_txt
from scripts.metadata_fix import manual_fixes

def print_df(df):
    """Return ``df`` restricted to the metadata columns shown for review.

    Despite the name nothing is printed: the selected columns are returned
    (in the fixed order below) so that the notebook renders them as the cell
    output. All listed columns must be present in ``df``; pandas raises
    ``KeyError`` otherwise.
    """
    review_columns = [
        'danam_caption',
        'caption',
        'date1',
        'date2',
        'date',
        'date3',
        'agent',
        'role',
        'agent2',
        'role2',
        'source',
        'notes',
        'mon_id',
        'class_code',
        'classification',
        'agent3',
        'date_scan',
    ]
    return df[review_columns]


In [3]:
# Better dataframe handling with jupyter datatables
# Optional display setup: switches DataFrame rendering to jupyter_datatables.
# NOTE(review): requires the third-party packages jupyter_datatables and
# jupyter_require to be installed in the kernel environment.
from jupyter_datatables import init_datatables_mode
# IPython magics (only valid inside a notebook/IPython session): load the
# jupyter_require extension, then register d3 v5 under the name "d3".
# The d3 script is fetched from d3js.org, so this presumably needs network
# access when the notebook is opened -- TODO confirm.
%load_ext jupyter_require
%requirejs d3 https://d3js.org/d3.v5.min
# Activate datatables mode for the rest of the notebook session.
init_datatables_mode()

## Read DANAM json export
Always replace the file path in the cell below with the path of the latest export.

In [4]:
# Read the DANAM JSON export and load the cleaned records into a DataFrame.
# The path uses forward slashes: the former backslash-separated literal relied
# on the invalid escape sequences "\D" and "\M" (a warning on current Python
# versions) and only resolved on Windows. Forward slashes work on every
# platform and match the other paths in this notebook.
danam_export = "json/DANAM/Monument_2022-03-23_02-27-55.json"
danam_images = clean_json(danam_export)
danam_df = pd.DataFrame(danam_images)

## Preparing Metadata Uploads
### Metadata of monuments in upload_current.mon

In [None]:
# find recently updated monuments
mon_ids = list_from_txt('mon/upload_current.mon')

# Filter metadata according to current.mon and valid caption.
# .copy() detaches the result from danam_df so that applying in-place fixes
# to it cannot raise pandas' SettingWithCopyWarning.
to_upload = danam_df.loc[danam_df['mon_id'].isin(mon_ids)]
to_upload = to_upload.loc[to_upload['validCaption']].copy()

## manual fixes start ##
# Path of the fix file for this batch; the call below is currently disabled.
fixes = "fixes/current.fix"
#manual_fixes(to_upload, fixes)
## manual fixes end ##

# Last expression of the cell: rendered as the cell output.
print_df(to_upload)

### Metadata of recently uploaded maps

In [None]:
# Monuments for which only maps are uploaded.
# Forward slashes keep the paths portable (they also work on Windows).
only_maps = list_from_txt("mon/upload_only_maps.mon")

# Keep rows of the listed monuments that have a valid caption and whose
# filename contains the "_D_" marker.
upload_map = danam_df.loc[danam_df['mon_id'].isin(only_maps)]
upload_map = upload_map.loc[upload_map['validCaption']]
# regex=False: "_D_" is a literal marker; na=False: rows without a filename
# are excluded instead of making the boolean mask invalid.
# .copy() detaches the result from danam_df before it is handed to
# manual_fixes, avoiding SettingWithCopyWarning on in-place edits.
is_map = upload_map['filename'].str.contains("_D_", regex=False, na=False)
upload_map = upload_map.loc[is_map].copy()

## manual fixes start ##
# manual_fixes presumably edits the frame in place (its return value is not
# used) -- verify against scripts/metadata_fix.
fixes = "fixes/maps.fix"
manual_fixes(upload_map, fixes)
## manual fixes end ##

# Last expression of the cell: rendered as the cell output.
print_df(upload_map)


### Metadata of recently uploaded historical images

In [None]:
# Monuments for which only historical images are uploaded.
# Forward slashes keep the paths portable (they also work on Windows).
only_historical = list_from_txt("mon/upload_only_historical.mon")

# Keep rows of the listed monuments that have a valid caption and whose
# filename contains the "_H_" marker.
upload_historical = danam_df.loc[danam_df['mon_id'].isin(only_historical)]
upload_historical = upload_historical.loc[upload_historical['validCaption']]
# regex=False: "_H_" is a literal marker; na=False: rows without a filename
# are excluded instead of making the boolean mask invalid.
# .copy() detaches the result from danam_df before it is handed to
# manual_fixes, avoiding SettingWithCopyWarning on in-place edits.
is_historical = upload_historical['filename'].str.contains("_H_", regex=False, na=False)
upload_historical = upload_historical.loc[is_historical].copy()

## manual fixes start ##
# manual_fixes presumably edits the frame in place (its return value is not
# used) -- verify against scripts/metadata_fix.
fixes = "fixes/historical.fix"
manual_fixes(upload_historical, fixes)
## manual fixes end ##

# Last expression of the cell: rendered as the cell output.
print_df(upload_historical)

### Finding recently updated monuments

In [None]:
from datetime import datetime

# Cut-off for "recently modified"; defined once so both filters below agree
# on the date. Adjust for each transfer.
modified_since = datetime(2022, 2, 15)

# This cell relies on mon_ids, only_maps and only_historical from the cells
# above, so those cells have to be run first.
recent = danam_df.loc[danam_df['lastModified'] >= modified_since]
# NOTE(review): recent.shape[0] counts rows of danam_df, which may be more
# than the number of distinct monuments -- confirm which figure is wanted.
print("Number of recently update monuments: {}".format(recent.shape[0]))
recent_mon_ids = set(recent['mon_id'])

uploaded = list_from_txt('mon/sds.mon')
# Sets make the membership tests below constant-time.
already_uploaded = set(uploaded)
handled_elsewhere = set(mon_ids) | set(only_maps) | set(only_historical)
# sorted() gives the list (and the file written below) a stable order
# instead of the arbitrary iteration order of a set.
to_update_mon = sorted(
    mon for mon in recent_mon_ids
    if mon in already_uploaded and mon not in handled_elsewhere
)
print("Number of those monuments already uploaded to HeidIcon that are not in current.mon: {}".format(len(to_update_mon)))

# The context manager closes the file even if a write fails.
with open("mon/recently_changed.mon", 'w') as out_file:
    for mon_id in to_update_mon:
        out_file.write(mon_id + "\n")

to_update = danam_df.loc[danam_df['mon_id'].isin(to_update_mon)]
to_update = to_update.loc[to_update['validCaption']]
# NOTE(review): strict ">" here versus ">=" above is kept from the original;
# rows stamped exactly at the cut-off count as recent but are not exported.
# .copy() detaches the result from danam_df before it is handed to
# manual_fixes, avoiding SettingWithCopyWarning on in-place edits.
to_update = to_update.loc[to_update['lastModified'] > modified_since].copy()

## manual fixes start ##
# manual_fixes presumably edits the frame in place (its return value is not
# used) -- verify against scripts/metadata_fix.
fixes = "fixes/update.fix"
manual_fixes(to_update, fixes)
## manual fixes end ##

# Alternative view, sorted by modification time:
#to_update[[ 'lastModified', 'validCaption', 'filename', 'danam_caption', 'caption', 'date', 'agent']].sort_values('lastModified', ascending=False)
print_df(to_update)



### Checking for caption fixes

In [None]:
# Monuments whose captions still contain licence/heidICON boilerplate.
to_fix = list_from_txt("mon/upload_to_fix.mon")
upload_fix = danam_df.loc[danam_df['mon_id'].isin(to_fix)]
# na=False: rows without a danam_caption are excluded instead of making the
# boolean mask invalid. .copy() detaches the result from danam_df so the
# column assignments below do not raise SettingWithCopyWarning.
has_boilerplate = upload_fix['danam_caption'].str.contains("Attribution 40", regex=False, na=False)
upload_fix = upload_fix.loc[has_boilerplate].copy()

# Regular expressions (applied case-insensitively, in this order) that are
# removed from the caption. Raw strings keep "\(" and "\)" as regex escapes
# without relying on invalid Python string escapes.
fixes = [
    r"If not otherwise stated, all images and texts in this folder are published under Creative Commons",
    r"If not otherwise stated, all images and texts in this monument folder are published under Creative Commons",
    r"Attribution 4.0 License \(CC BY-SA 4.0\),",
    r"Attribution 40 License \(CC BY-SA 40\),",
    r"and the copyright lies with NHDP. All visuals of this monument folder",
    r" and more are \(or will be\) also stored in heidICON,",
    r"and more are also stored in heidICON,",
    r"the object and multimedia database of Heidelberg University",
    r"\(Type the ID-number or key words in the first line and click the search field.\)",
    r"\(type the ID-number or key words in the first line and click the search field.\)",
    r"\(type the ID-number or key words in the first line and click the search field\)",
    r"\(type the ID-number or keywords in the first line and click the search field\).",
    r"You will also find the initial report there",
    r"The latest report will always be available in DANAM \(this page\).",
    r"You will also find the initial report there. The latest report will always be available in DANAM \(this page\).",
    # Literal period. The previous unescaped "." was a regex wildcard that
    # matched every character and therefore emptied every caption.
    r"\.",
]

for fix in fixes:
    upload_fix['caption'] = upload_fix['caption'].str.replace(fix, '', regex=True, case=False)

# Drop the whitespace left at either end after removing the boilerplate.
upload_fix['caption'] = upload_fix['caption'].str.strip()

## Exporting Results to CSV for Weekly Metadata Transfer

In [None]:
# Export for the weekly metadata transfer.
# `cols` fixes which columns are written to the CSV and in what order.
cols = [
    'filename',
    'caption',
    'date1',
    'date2',
    'date',
    'date3',
    'agent',
    'role',
    'agent2',
    'role2',
    'copyright',
    'source',
    'empty_column',
    'notes',
    'mon_id',
    'class_code',
    'classification',
    'agent3',
    'date_scan',
    'license',
    'url',
    'rights_text',
    'heidata',
    'heidoc',
]

# Stack the four frames prepared in the cells above (those cells must have
# been run) and write them as a semicolon-separated file with every field
# quoted and no index column.
upload_frames = [to_upload, upload_map, upload_historical, to_update]
all_upload = pd.concat(upload_frames)
all_upload.to_csv(
    "csv/image_metadata_.csv",
    sep=';',
    columns=cols,
    header=True,
    index=False,
    quotechar="\"",
    quoting=csv.QUOTE_ALL,
)
