In [1]:
import os,sys,glob,re,string,unicodedata,regex,gzip,shutil
import json,jsonlines
from bs4 import BeautifulSoup
import requests
import folium
from folium.plugins import MarkerCluster

# Map places in the Patristic Text Archive (PTA)

## Functions

In [2]:
def load_files(files_path):
    '''Load all XML files matching files_path into a list of dictionaries.

    Parameters
    ----------
    files_path : str
        Glob pattern for the XML files; a leading ``~`` is expanded.
        Paths containing ``__cts__`` are skipped.

    Returns
    -------
    list of dict
        One dictionary per file, in sorted path order, with the keys
        ``"urn"`` (file name up to the first ``.xml``), ``"title"`` (text of
        the first ``title`` element, ``""`` if there is none) and ``"body"``
        (the first ``text`` element as a BeautifulSoup tag, or ``None``).
    '''
    xml_paths = sorted(
        path for path in glob.glob(os.path.expanduser(files_path))
        if '__cts__' not in path
    )
    # Tags that are unwrapped (tag removed, children kept) to avoid problems
    # in later processing of the text.
    strip_tags = ['cit', 'ref', 'quote', 'said', 'gap', 'app']
    pta_dict = []
    for xml_path in xml_paths:
        # Take the URN from the file name itself rather than from fixed
        # positions in the absolute path, so the result does not depend on
        # how deep the data directory is or how its folders are named.
        urn = os.path.basename(xml_path).split(".xml")[0]
        # Explicit encoding instead of the platform default.
        # NOTE(review): assumes the corpus files are UTF-8 - confirm.
        with open(xml_path, encoding="utf-8") as file_open:
            soup = BeautifulSoup(file_open, 'lxml')
        for tag in strip_tags:
            for match in soup.find_all(tag):
                match.unwrap()
        title = soup.find('title')
        pta_dict.append({
            "urn": urn,
            # A file without a title element yields an empty title instead
            # of an AttributeError.
            "title": title.text if title is not None else "",
            "body": soup.find("text"),
        })
    return pta_dict

In [3]:
def _node_text(node):
    '''Return the text of a sibling node, whether it is a string or a tag.'''
    # Text nodes are str subclasses and are used as they are. For tags all
    # nested text is collected: ``.string`` is None as soon as a tag has
    # more than one child, which previously discarded the whole context.
    return str(node) if isinstance(node, str) else node.get_text()


def _place_context(place):
    '''Build the HTML context snippet for one place-name element.

    The snippet is the ``n`` attribute of the element's grandparent (empty
    if absent), up to five words before the name, the name in ``<b>`` tags,
    and up to five words after it. Only siblings of the element are used.
    '''
    n_words = 5
    parent = place.parent
    div = parent.parent if parent is not None else None
    chapter = div.get("n", "") if div is not None else ""
    # previous_siblings walks backwards from the element, so reverse it to
    # restore document order before taking the last words.
    before_nodes = reversed(list(place.previous_siblings))
    words_before = " ".join(_node_text(node) for node in before_nodes).split()[-n_words:]
    words_after = " ".join(_node_text(node) for node in place.next_siblings).split()[:n_words]
    return chapter+" "+" ".join(words_before)+" <b>"+place.text+"</b> "+" ".join(words_after)


def extract_places(files_path):
    '''Extract the places mentioned in all files matching files_path.

    Parameters
    ----------
    files_path : str
        Glob pattern, passed on to ``load_files``.

    Returns
    -------
    list of dict
        One dictionary per file with the keys ``"urn"``, ``"title"``,
        ``"p_mentions"`` (total number of place mentions, as a string),
        ``"p_number"`` (number of distinct places, as a string) and
        ``"places"``. Each place is a dictionary with ``"ID"`` (the ``ref``
        attribute, or ``"no ref found"``), ``"name"`` (text of the latest
        mention), ``"Context"`` (snippets of all mentions joined by
        ``<br/>``) and ``"Count"`` (number of mentions). Places are ordered
        by their most recent mention.
    '''
    results = []
    for entry in load_files(files_path):
        # The body is serialised and parsed again, as in the original
        # implementation; a missing body (None) simply yields no places.
        body = BeautifulSoup(str(entry["body"]), "lxml")
        # Keyed by ID for direct lookup. Popping and re-inserting an entry
        # moves it to the end, which keeps the "most recent mention last"
        # order of the former remove-and-append on a list.
        places_by_id = {}
        for place in body.find_all('placename'):
            refs = place.get("ref", "no ref found")
            context = _place_context(place)
            earlier = places_by_id.pop(refs, None)
            if earlier is None:
                places_by_id[refs] = {
                    "ID": refs,
                    "name": place.text,
                    "Context": context,
                    "Count": 1,
                }
            else:
                places_by_id[refs] = {
                    "ID": refs,
                    "name": place.text,
                    "Context": earlier["Context"]+"<br/>"+context,
                    "Count": earlier["Count"]+1,
                }
        places = list(places_by_id.values())
        places_count = sum(place_entry["Count"] for place_entry in places)
        results.append({
            "urn": entry["urn"],
            "title": entry["title"],
            "p_mentions": str(places_count),
            "p_number": str(len(places)),
            "places": places,
        })
    return results

In [4]:
def load_pleiades_data():
    '''Return the Pleiades places as a list of dictionaries.

    The data is read from ``pleiades-places-latest.jsonl`` in the current
    working directory. If that file does not exist yet, the dump is
    downloaded from
    http://atlantides.org/downloads/pleiades/json/pleiades-places-latest.json.gz
    and the entries of its ``@graph`` list are written to the JSON Lines
    file first.

    Returns
    -------
    list of dict
        One dictionary per non-empty line of the JSON Lines file.

    Raises
    ------
    requests.HTTPError
        If the download answers with an error status.
    '''
    jsonl_path = 'pleiades-places-latest.jsonl'
    if not os.path.isfile(jsonl_path):
        url = 'http://atlantides.org/downloads/pleiades/json/pleiades-places-latest.json.gz'
        gzip_path = 'pleiades-places-latest.json.gz'
        r = requests.get(url, allow_redirects=True)
        # Fail here rather than saving an error page as the archive.
        r.raise_for_status()
        with open(gzip_path, 'wb') as f_gzip:
            f_gzip.write(r.content)
        with gzip.open(gzip_path, 'rb') as f_in:
            places = json.load(f_in)
        # Write to a temporary name and rename only once complete, so an
        # interrupted run cannot leave a partial file that later calls
        # would treat as a valid cache.
        partial_path = jsonl_path + '.part'
        with jsonlines.open(partial_path, mode='w', sort_keys=True, compact=True) as f_out:
            for place in places['@graph']:
                f_out.write(place)
        os.replace(partial_path, jsonl_path)
    # Parse line by line with the file closed afterwards; blank lines are
    # skipped instead of crashing json.loads.
    with open(jsonl_path, "r", encoding="utf-8") as f_jsonl:
        return [json.loads(line) for line in f_jsonl if line.strip()]

In [5]:
def load_basemap():
    '''Create the base map with the Digital Atlas of the Roman Empire tiles.

    Returns
    -------
    folium.Map
        A map at location [40.4285312, 29.715356500000002], zoom level 5,
        with the scale control enabled, no default tiles, and a single tile
        layer named "DARE".
    '''
    dare_tiles = 'http://dh.gu.se/tiles/imperium/{z}/{x}/{y}.png'
    dare_attribution = '<a href="http://dh.gu.se/dare">Map (c) Johan Åhlfeldt, Centre for Digital Humanities, University of Gothenburg 2019</a> (CC-BY), <a href="https://pta.bbaw.de">Data: PTA</a>'
    basemap = folium.Map(
        location=[40.4285312, 29.715356500000002],
        zoom_start=5,
        control_scale=True,
        tiles=None,
    )
    dare_layer = folium.TileLayer(tiles=dare_tiles, name="DARE", attr=dare_attribution)
    dare_layer.add_to(basemap)
    # Alternative layer, currently disabled:
    #folium.TileLayer(tiles='https://stamen-tiles-{s}.a.ssl.fastly.net/terrain-background/{z}/{x}/{y}.png',
    #                 attr='Map tiles by <a href="http://stamen.com">Stamen Design</a>, <a href="http://creativecommons.org/licenses/by/3.0">CC BY 3.0</a> &mdash; Map data &copy; <a href="https://www.openstreetmap.org/copyright">OpenStreetMap</a> contributors',
    #                 name='Stamen Terrain').add_to(basemap)
    return basemap

In [6]:
def add_data_to_map(file_path):
    '''Add the places extracted from file_path to a new base map.

    Every file becomes one feature group (toggleable through the layer
    control) and every place with known coordinates one circle marker whose
    radius grows with the number of mentions. Places that are not found in
    the Pleiades data, or that have no usable ``reprPoint`` there, get no
    marker; their names are printed once at the end.

    Parameters
    ----------
    file_path : str
        Glob pattern, passed on to ``extract_places``.

    Returns
    -------
    folium.Map
        The base map with all feature groups and the layer control added.
    '''
    results = extract_places(file_path)
    m = load_basemap()
    # Index the Pleiades records by URI once instead of scanning the whole
    # list for every place. setdefault keeps the first record per URI, which
    # matches the former first-match lookup.
    pleiades_by_uri = {}
    for item in load_pleiades_data():
        pleiades_by_uri.setdefault(item.get("uri"), item)
    missing_locations = []
    for result in results:
        feature_group = folium.FeatureGroup(result["title"]+" ("+result["p_number"]+" places)")
        for place in result["places"]:
            location = pleiades_by_uri.get(place["ID"])
            if location is None:
                place_name = place["name"]
                coords = None
            else:
                # Fall back to the name from the text if the record has no title.
                place_name = location.get("title") or place["name"]
                coords = location.get("reprPoint")
            # Skip places without a coordinate pair. The former "0" string
            # placeholders never equalled the integer 0, so such places were
            # drawn at (0, 0); a real coordinate of 0 is now kept as well.
            if not coords or len(coords) != 2:
                if place_name not in missing_locations:
                    missing_locations.append(place_name)
                continue
            # reprPoint is unpacked as (longitude, latitude); the marker
            # location is given as [latitude, longitude].
            long, lat = coords
            folium.CircleMarker(
                radius=2*place["Count"],
                location=[lat, long],
                tooltip=place_name,
                popup=result["title"]+" "+place["Context"],
                color="darkgreen",
                fill=True,
            ).add_to(feature_group)
        feature_group.add_to(m)
    folium.LayerControl().add_to(m)
    print("Missing coordinates for: "+", ".join(missing_locations))
    return m

In [7]:
def show_places(file_path):
    '''Map the places in file_path and return the map for display.'''
    return add_data_to_map(file_path)

In [8]:
def save_map(file_path,file_name):
    '''Build the place map for file_path and write it to file_name (html).'''
    place_map = add_data_to_map(file_path)
    # Nothing is returned; the only result is the saved file.
    place_map.save(file_name)

## Map

In [9]:
# Build the place map for all XML files matched by this glob; show_places
# returns the map object, which is this cell's value.
show_places("~/Dokumente/projekte/pta_data/data/pta0022/pta010/*.xml")

Missing coordinates for: Aegyptus (Roman imperial province), Libya Superior


In [10]:
# Build the map for the same glob again and write it to Athanasius.html.
save_map("~/Dokumente/projekte/pta_data/data/pta0022/pta010/*.xml","Athanasius.html")

Missing coordinates for: Aegyptus (Roman imperial province), Libya Superior
