In [1]:
import json, csv
import codecs
import pandas as pd
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 100)

from clean_json import replace_w_json
from write_csv import list_from_txt
from create_report import get_reports, chromedriver_init

In [2]:
heidicon_id = json.load(open("json/dict/heidicon_id.json"))[0]
roles = {
    "Editor": "Editor",
    "Description": "Autor",
    "Photography": "Fotograf",
    "Drawings": "Zeichner",
    "Historical": "Autor"
}

In [3]:
def get_role(key):
    for role_key in roles.keys():
        if role_key in key:
            return role_key, roles[role_key]
    return ("Editor", "Editor")

def get_editors(text):
    text = replace_w_json(text, heidicon_id, strip=True)
    split = [i.replace("<p>", "").replace("</p>", "").replace("&nbsp;", " ").strip()  for i in text.split("\n\n")]
    editors = {}
    for item in split:
        if len(item) == 0:
            continue
        t = item.split(": ")
        if len(t) < 2:
            continue
        
        field = t[0]
        names = t[1]
        editors[field] = names
    return editors

def separate_editors(editors, mon):
    for role in editors.keys():
        r = get_role(role)
        agents = editors[role].split(",")
        for i in range(len(agents)):
            mon['{}_agent_{}'.format(r[0],i)] = agents[i].strip()
            mon['{}_role_{}'.format(r[0],i)] = r[1]

def fill_report_info(mon):
    mon['filename'] = "DANAM - {}.pdf".format(mon['mon_id'])
    mon['title'] = "Report {}".format(mon['mon_id'])
    mon['classification_gnd'] = "4177815-7"
    mon['rights_owner'] = "Nepal Heritage Documentation Project"
    mon['license'] = "CC BY-SA 4.0"
    mon['empty_col'] = ""



In [4]:
def clean_for_report(danam):
    for mon in danam:
        mon['danam_url'] = mon['resourceinstance']['resourceinstanceid']
        mon_ids = [tile["data"]["28294784-9323-11e9-bf23-0242ac120006"] for tile in mon['tiles'] if '28294784-9323-11e9-bf23-0242ac120006' in tile['data'].keys()]

        if len(mon_ids) > 0:
            mon['mon_id'] = mon_ids[0]
        else:
            mon['mon_id'] = "NO_ID"
        
        fill_report_info(mon)

        editorials= [tile["data"]["66fd9c70-ce1b-11e9-b993-0242ac140002"] for tile in mon['tiles'] if '66fd9c70-ce1b-11e9-b993-0242ac140002' in tile['data'].keys()]
        mon['editorial_text'] = editorials
        if len(editorials) > 0:
            editors = get_editors(editorials[0])
            separate_editors(editors, mon)
        
        fill_report_info(mon)

        if "tiles" in mon.keys():
            del mon['tiles']
        
        if "resourceinstance" in mon.keys():
            del mon["resourceinstance"]


In [5]:
# read DANAM json export
danam_export = "json\DANAM\Monument_2021-11-11_00-23-56.json"
danam = json.load(codecs.open(danam_export, 'r', 'utf-8'))
danam = danam['business_data']['resources']

clean_for_report(danam)
danam_df = pd.DataFrame(danam)

In [6]:
#manual editing, just in case
danam_df.loc[danam_df['danam_url']=='83d25607-b6cf-4031-bb2a-36e1532ced6a', 'mon_id'] = 'KIR4028'
danam_df.loc[danam_df['danam_url']=='83d25607-b6cf-4031-bb2a-36e1532ced6a', 'filename'] = 'DANAM - KIR4028'
danam_df.loc[danam_df['danam_url']=='83d25607-b6cf-4031-bb2a-36e1532ced6a', 'title'] = 'Report KIR4028'

In [7]:
danam_df.loc[danam_df['mon_id']=='KIR4028']

Unnamed: 0,danam_url,mon_id,filename,title,classification_gnd,rights_owner,license,empty_col,editorial_text,Editor_agent_0,...,Photography_agent_3,Photography_role_3,Drawings_agent_4,Drawings_role_4,Drawings_agent_5,Drawings_role_5,Historical_agent_3,Historical_role_3,Photography_agent_4,Photography_role_4
787,83d25607-b6cf-4031-bb2a-36e1532ced6a,KIR4028,DANAM - KIR4028,Report KIR4028,4177815-7,Nepal Heritage Documentation Project,CC BY-SA 4.0,,"[<p>Editor: Christiane Brosius</p>\n\n<p>Descriptions, iconography, social and religious activities: Bharat Maharjan</p>\n\n<p>Photography after 2015: Yogesh Budathoki</p>\n\n<p>Drawings and architectural data: Anil Basukala, Bibek Basukala, Bijay Basukala, Reinhard Herdick</p>\n\n<p>Historical events, inscriptions: Rajan Khatiwoda, Bharat Maharjan</p>\n]",1182530,...,,,,,,,,,,


In [8]:
# select monuments that we want to create reports for
mon_ids = list_from_txt('mon/report.mon')
to_upload = danam_df.loc[danam_df['mon_id'].isin(mon_ids)]
to_upload = to_upload.fillna("NaN")

In [11]:
to_create_reports = to_upload[['danam_url', 'mon_id']].iterrows()
driver = chromedriver_init("chromedriver")
get_reports(to_create_reports, driver)


Arches - loaded.
DANAM - DLK0190.pdf downloaded.
Arches - loaded.
DANAM - BAL4001.pdf downloaded.
Arches - loaded.
DANAM - KIR4076.pdf downloaded.
Arches - loaded.
DANAM - DLK0122.pdf downloaded.
Arches - loaded.
DANAM - KAT2130.pdf downloaded.
Arches - loaded.
DANAM - BKT0070.pdf downloaded.
Arches - loaded.
DANAM - DLK0091.pdf downloaded.
Arches - loaded.
DANAM - DLK0121.pdf downloaded.
Arches - loaded.
DANAM - DLK0140.pdf downloaded.
Arches - loaded.
DANAM - DLK0200.pdf downloaded.


In [11]:
# prepare csv for metadata upload
cols = danam_df.columns

agents_roles = [i for i in cols if i[0].isupper()]
cols = ['filename', 'title', 'classification_gnd', 'empty_col', 'mon_id', 'rights_owner', 'license']
cols += agents_roles

csv_str = to_upload.to_csv(columns=cols, header=False, sep=';', index=False, quotechar = "\"", quoting=csv.QUOTE_ALL)
csv_str = csv_str.replace("\"NaN\";", "").replace("\"NaN\"", "")

file = codecs.open("csv/report_metadata.csv", 'w', 'utf-8')
file.write(csv_str)
file.close()