# DANAM Report Generation
This notebook starts an automated instance of Chrome to download PDFs from DANAM's monument pages.

This requires the Python library `selenium`, Google Chrome, and a ChromeDriver binary matching the installed Chrome version.

In [1]:
import json, csv
import codecs
import pandas as pd
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 100)

from scripts.clean_json import replace_w_json
from scripts.write_csv import list_from_txt
from scripts.create_report import get_reports, chromedriver_init

Constants and helper functions used to extract the report metadata.

In [2]:
# Lookup table handed to replace_w_json by get_editors. The JSON file holds a
# list; only its first element is used. The file is opened in a context
# manager so the handle is closed, and read as UTF-8 like the DANAM export.
with open("json/dict/heidicon_id.json", encoding="utf-8") as heidicon_file:
    heidicon_id = json.load(heidicon_file)[0]

# Maps a keyword that may occur in an editorial field name to the role label
# written into the metadata. Dict order matters: get_role returns the first
# keyword that matches.
roles = {
    "Editor": "Editor",
    "Description": "Autor",
    "Photography": "Fotograf",
    "Drawings": "Zeichner",
    "Historical": "Autor"
}

In [3]:
def get_role(key):
    """Return the ``(role_key, role_label)`` pair for an editorial field name.

    The entries of ``roles`` are checked in insertion order and the first one
    whose keyword is contained in ``key`` wins. When no keyword matches, the
    pair ``("Editor", "Editor")`` is returned.
    """
    matching_roles = (
        (keyword, label) for keyword, label in roles.items() if keyword in key
    )
    return next(matching_roles, ("Editor", "Editor"))

def get_editors(text):
    """Parse an editorial text into a ``{field: names}`` dictionary.

    The text is first passed through ``replace_w_json`` with the
    ``heidicon_id`` table (presumably substituting names by their IDs; see
    scripts.clean_json to confirm). It is then split on blank lines, and each
    paragraph is stripped of ``<p>``/``</p>`` tags and ``&nbsp;`` entities.

    A paragraph contributes an entry only if it contains the separator
    ``": "``. The part before the first separator is the field name and the
    part between the first and second separator is the value; anything after
    a second separator is ignored. A repeated field name overwrites the
    earlier entry.
    """
    cleaned = replace_w_json(text, heidicon_id, strip=True)
    editors = {}
    for paragraph in cleaned.split("\n\n"):
        entry = paragraph.replace("<p>", "").replace("</p>", "").replace("&nbsp;", " ").strip()
        parts = entry.split(": ")
        # Empty paragraphs and paragraphs without a separator yield one part.
        if len(parts) >= 2:
            editors[parts[0]] = parts[1]
    return editors

def separate_editors(editors, mon):
    """Write one ``<role>_agent_<i>`` / ``<role>_role_<i>`` pair per agent into ``mon``.

    Parameters:
        editors: dict mapping an editorial field name to a comma-separated
            string of agent names (as produced by ``get_editors``).
        mon: monument dict, modified in place.

    The role key and role label for each field come from ``get_role``.
    Several fields can resolve to the same role key (every unrecognised field
    falls back to "Editor"), so the index ``i`` keeps counting per role key
    across fields. Restarting at 0 for each field would let a later field
    overwrite the agents of an earlier one.
    """
    next_index = {}  # role key -> next free agent index
    for field, names in editors.items():
        role_key, role_label = get_role(field)
        for agent in names.split(","):
            index = next_index.get(role_key, 0)
            mon['{}_agent_{}'.format(role_key, index)] = agent.strip()
            mon['{}_role_{}'.format(role_key, index)] = role_label
            next_index[role_key] = index + 1

def fill_report_info(mon):
    """Add the fixed report metadata fields to the monument dict ``mon`` in place.

    ``filename`` and ``title`` are derived from ``mon['mon_id']``; the other
    fields are constants. Existing values under these keys are overwritten.
    Returns None.
    """
    monument_id = mon['mon_id']
    mon.update({
        'filename': "DANAM_report_{}.pdf".format(monument_id),
        'title': "Report {}".format(monument_id),
        'classification_gnd': "4177815-7",
        'rights_owner': "Nepal Heritage Documentation Project",
        'license': "CC BY-SA 4.0",
        'empty_col': "",
    })



In [4]:
def clean_for_report(danam):
    """Flatten each DANAM resource dict in ``danam`` into report metadata, in place.

    For every monument dict this:
      * copies ``resourceinstance.resourceinstanceid`` to ``danam_url``;
      * sets ``mon_id`` from the first tile that carries the monument-ID key,
        or ``"NO_ID"`` when no tile has it;
      * adds the fixed report fields via ``fill_report_info``;
      * stores all editorial texts found in the tiles under ``editorial_text``
        and expands the first one into agent/role columns;
      * removes the bulky ``tiles`` and ``resourceinstance`` entries.

    A monument without a ``tiles`` entry is treated as having no tiles.
    Returns None; the dicts in ``danam`` are modified directly.
    """
    # Keys inside tile['data'] that hold the monument ID and the editorial text.
    mon_id_key = "28294784-9323-11e9-bf23-0242ac120006"
    editorial_key = "66fd9c70-ce1b-11e9-b993-0242ac140002"

    for mon in danam:
        mon['danam_url'] = mon['resourceinstance']['resourceinstanceid']
        tiles = mon.get('tiles', [])

        mon_ids = [tile['data'][mon_id_key] for tile in tiles if mon_id_key in tile['data']]
        mon['mon_id'] = mon_ids[0] if mon_ids else "NO_ID"

        # Called once: nothing below changes mon_id, so a second call would
        # only rewrite the same values.
        fill_report_info(mon)

        editorials = [tile['data'][editorial_key] for tile in tiles if editorial_key in tile['data']]
        mon['editorial_text'] = editorials
        if editorials:
            separate_editors(get_editors(editorials[0]), mon)

        mon.pop('tiles', None)
        mon.pop('resourceinstance', None)


Start of script

In [5]:
# Read the DANAM JSON export. Forward slashes keep the path portable and avoid
# the invalid "\D" / "\M" escape sequences of a backslash-separated literal.
danam_export = "json/DANAM/Monument_2022-11-07_01-16-16.json"
with codecs.open(danam_export, 'r', 'utf-8') as export_file:
    danam = json.load(export_file)
danam = danam['business_data']['resources']

# clean_for_report modifies the resource dicts in place, so the DataFrame is
# built from the already flattened records.
clean_for_report(danam)
danam_df = pd.DataFrame(danam)

In [6]:
# Manual correction for one resource: set its monument ID and the fields
# derived from it. The filename carries the ".pdf" extension so that it matches
# the pattern fill_report_info uses for every other row.
manual_fix_mask = danam_df['danam_url'] == '83d25607-b6cf-4031-bb2a-36e1532ced6a'
manual_mon_id = 'KIR4028'
danam_df.loc[manual_fix_mask, 'mon_id'] = manual_mon_id
danam_df.loc[manual_fix_mask, 'filename'] = "DANAM_report_{}.pdf".format(manual_mon_id)
danam_df.loc[manual_fix_mask, 'title'] = "Report {}".format(manual_mon_id)

In [7]:
# Keep only the monuments listed in the upload file.
mon_ids = list_from_txt('mon/upload_report.mon')
is_selected = danam_df['mon_id'].isin(mon_ids)
# Missing values are replaced by the literal string "NaN".
to_upload = danam_df.loc[is_selected].fillna("NaN")

Start an automated Chrome instance. If this cell raises an error, the most likely cause is a mismatch between the installed Chrome version and the ChromeDriver version. Make sure the ChromeDriver binary that matches your Chrome version is in this folder.

You can download the chromedriver at https://chromedriver.chromium.org/downloads 

In [8]:
# Path of the ChromeDriver binary, relative to the notebook folder.
path_to_chromedriver = "chromedriver"
# Only the resource ID and the monument ID are handed to get_reports.
# iterrows() returns a one-shot iterator, so it is rebuilt on every run.
report_columns = ['danam_url', 'mon_id']
to_create_reports = to_upload[report_columns].iterrows()
driver = chromedriver_init(path_to_chromedriver)
get_reports(to_create_reports, driver)


NHDP - loaded.
DANAM_report_BAL0004.pdf downloaded.


In [None]:
# prepare csv for metadata upload
# Columns whose name starts with an upper-case letter are appended after the
# fixed metadata columns; this is intended to pick up the
# "<role>_agent_<i>" / "<role>_role_<i>" columns.
agents_roles = [col for col in danam_df.columns if col[0].isupper()]
cols = ['filename', 'title', 'classification_gnd', 'empty_col', 'mon_id', 'rights_owner', 'license']
cols += agents_roles

csv_str = to_upload.to_csv(columns=cols, header=False, sep=';', index=False, quotechar = "\"", quoting=csv.QUOTE_ALL)
# Remove the "NaN" placeholder fields together with their separator, so the
# rows only contain the agent/role values that are actually present.
csv_str = csv_str.replace("\"NaN\";", "").replace("\"NaN\"", "")

# The context manager closes the file even if the write fails.
with codecs.open("csv/report_metadata.csv", 'w', 'utf-8') as file:
    file.write(csv_str)