In [21]:
import pickle
import re
# Load the named entities of a given type and print the first 10 of them
def load_nes(type):
    """Load pickled named entities and return the cleaned ones with a given label.

    ``./data/named_entities.pickle`` is read as a series of consecutive pickle
    records until the end of the file is reached. Every record whose
    ``label()`` equals ``type`` is converted with ``str()``; the collected
    strings are then passed to ``process_nes``.

    Args:
        type: Entity label to keep, e.g. ``'GPE'`` (geopolitical entity).

    Returns:
        The result of ``process_nes`` for the matching entity strings.
    """
    matching = []
    # NOTE: unpickling can execute arbitrary code; only read a trusted file here.
    with open('./data/named_entities.pickle', 'rb') as stream:
        try:
            while True:
                entity = pickle.load(stream)
                if entity.label() == type:
                    matching.append(str(entity))
        except EOFError:
            # Raised once every record in the stream has been consumed.
            pass
    return process_nes(matching, type)

def process_nes(nes,type):
    """Strip the chunk markup from entity strings and return the plain text.

    For each string the leading ``'(<type> '`` prefix is removed, every
    ``/TAG`` suffix made of upper-case letters is dropped, and all closing
    parentheses are deleted. The first 10 cleaned values are printed as a
    preview.

    Args:
        nes: Iterable of entity strings such as ``'(GPE United/NNP States/NNPS)'``.
        type: Entity label used to build the prefix that is removed.

    Returns:
        List of cleaned strings, in the same order as ``nes``.
    """
    prefix = '(' + type + ' '
    cleaned = []
    for position, raw in enumerate(nes):
        without_tags = re.sub(r'/[A-Z]+', '', raw.replace(prefix, ''))
        text = without_tags.replace(')', '')
        # Preview only the first ten entries to keep the cell output short.
        if position < 10:
            print(text)
        cleaned.append(text)
    return cleaned


# https://towardsdatascience.com/geoparsing-with-python-c8f4c9f78940
# https://geopy.readthedocs.io/en/stable/index.html#accessing-geocoders
import geopy
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut
# Module-wide geopy defaults; they are assigned before the geocoder below is created.
# The user-agent string identifies this application to the geocoding service.
geopy.geocoders.options.default_user_agent = 'my_app/2'
# Default timeout applied to geocoder requests (presumably seconds - confirm in the geopy docs).
geopy.geocoders.options.default_timeout = 2
# Shared geocoder used by extract_locations. It is built without arguments,
# so it presumably relies on the defaults configured above.
geolocator = Nominatim()

def extract_locations(nes):
    """Geocode place names with the module-level ``geolocator``.

    Each distinct name is sent to the geocoder at most once per successful
    lookup; repeated names reuse the cached answer, so the output still
    contains one entry per resolvable input item (duplicates preserved) while
    avoiding redundant network requests.

    Args:
        nes: Iterable of place-name strings.

    Returns:
        List of geocoder results (objects exposing ``latitude`` and
        ``longitude``) for every input name that resolved, in input order.
        Names that do not resolve or whose lookup fails are skipped.
    """
    # Local import: base class of GeocoderTimedOut and of the other
    # service-side failures (unavailable, rate limited, ...).
    from geopy.exc import GeocoderServiceError

    resolved = {}  # name -> geocoder result (None when nothing was found)
    lat_lon = []
    for ne in nes:
        if ne not in resolved:
            try:
                resolved[ne] = geolocator.geocode(ne)
            except GeocoderServiceError as e:
                # Best effort: report and skip this item instead of aborting the
                # whole batch. Failures are not cached, so a later occurrence of
                # the same name is retried.
                print("Error: geocode failed on input %s with message %s"%(ne, e))
                continue
        location = resolved[ne]
        if location:
            # print(ne,location.latitude, location.longitude)
            lat_lon.append(location)
    return lat_lon

# display the coordinates on a map
import folium
from folium.plugins import MarkerCluster, HeatMap
from lat_lon_parser import parse


# mc = MarkerCluster()
# mc.add_child(folium.Marker(location=[lat,long]))
# folium.Marker(location=[lat,long]).add_to(m)
#  add the marker cluster to the map
# m.add_child(mc)

def visualise_locations(lat_lon,name, heat = False, connected = False, data_type = 'locations'):
    """Plot coordinates on a folium world map, save it as HTML and return the map.

    Args:
        lat_lon: Either an iterable of objects with ``latitude`` and
            ``longitude`` attributes (``data_type='locations'``), or a
            DataFrame with the columns ``degrees_lat``, ``minutes_lat``,
            ``directio_lat``, ``degrees_lon``, ``minutes_lon`` and
            ``directio_lon`` (``data_type='dataframe'``).
        name: Base name of the output file (without extension).
        heat: If true, draw a heat map. Takes precedence over ``connected``.
        connected: If true (and ``heat`` is false), join consecutive points
            with blue line segments instead of drawing markers.
        data_type: ``'locations'`` or ``'dataframe'``; selects how ``lat_lon``
            is converted to ``[lat, lon]`` pairs.

    Returns:
        The ``folium.Map``; it is also written to ``./plots/<prefix><name>.html``
        where the prefix is ``heat_``, ``connected_`` or empty.

    Raises:
        ValueError: If ``data_type`` is not one of the supported values.
    """
    import os

    if data_type == 'locations':
        data = [[loc.latitude, loc.longitude] for loc in lat_lon]
    elif data_type == 'dataframe':
        # Rebuild a "D° M' DIR" string per axis and let lat_lon_parser turn it
        # into signed decimal degrees.
        data = [
            [parse(str(row['degrees_lat'])+'° '+str(row['minutes_lat'])+"' "+row['directio_lat']),
             parse(str(row['degrees_lon'])+'° '+str(row['minutes_lon'])+"' "+row['directio_lon'])]
            for idx, row in lat_lon.iterrows()
        ]
    else:
        # Previously this fell through and failed later with an unbound `data`.
        raise ValueError(
            "data_type must be 'locations' or 'dataframe', got %r" % (data_type,)
        )

    # create a map
    m = folium.Map(location=[0, 0], zoom_start=2)

    if heat:
        HeatMap(data).add_to(m)
    elif connected:
        # One segment per consecutive pair of points, in input order.
        for i in range(len(data)-1):
            folium.PolyLine(locations=[data[i],data[i+1]], weight=2, color='blue').add_to(m)
    else:
        for d in data:
            folium.Marker(location=d).add_to(m)

    # save it as html
    title = name+'.html'
    if heat:
        title = 'heat_'+title
    elif connected:
        title = 'connected_'+title
    # Make sure the output directory exists so saving does not fail on a fresh checkout.
    os.makedirs('./plots', exist_ok=True)
    m.save('./plots/'+title)

    # display the map
    return m


In [12]:
# Load the 'GPE' (geopolitical entity) named entities, geocode them and render
# the resolved coordinates as a heat map (saved as ./plots/heat_gpe.html).
gpes = load_nes('GPE')
gpe_lat_lon = extract_locations(gpes)
visualise_locations(gpe_lat_lon,'gpe',heat=True)


Europe
America
Australia
Captain
Similar
Fifteen
United States
Europe
Aleutian
Kulammak


In [11]:
# Same pipeline for the 'LOCATION' named entities, drawn as individual markers
# (saved as ./plots/location.html).
locations = load_nes('LOCATION')
location_lat_lon = extract_locations(locations)
visualise_locations(location_lat_lon,'location')

In [15]:
# Load the coordinates extracted from the books.
import pandas as pd
df_concat = pd.read_csv('./data/coordinates.csv')

# Fill the missing values with the last valid observation (forward fill).
#   ffill: propagate the last valid observation forward to the next valid one.
#   bfill: use the next valid observation to fill the gap
#          (alternative: df_concat.bfill()).
# DataFrame.ffill() replaces fillna(method='ffill'); the `method` argument of
# fillna is deprecated in recent pandas releases.
df_concat_fill = df_concat.ffill()

# Keep only the whole-degree part of each coordinate (astype(int) truncates).
# TODO: carry the truncated fractional degrees over into the minutes columns.
df_concat_fill['degrees_lat'] = df_concat_fill['degrees_lat'].astype(float).astype(int)
df_concat_fill['degrees_lon'] = df_concat_fill['degrees_lon'].astype(float).astype(int)
df_concat_fill.tail(15)

# equivalent to the above code but without using dataframes
# complete incomplete coordinates with the previous one
# keys = list(book_coordinates.keys())
# for i in range(len(keys)):
#     if 'lon' not in book_coordinates[keys[i]]:
#         book_coordinates[keys[i]] = book_coordinates[keys[i]]+' lon ' +book_coordinates[keys[i-1]].split('lon')[1]
#     if 'lat' not in book_coordinates[keys[i]]:
#         book_coordinates[keys[i]] = book_coordinates[keys[i-1]].split('lon')[0]+ book_coordinates[keys[i]]

Unnamed: 0.1,Unnamed: 0,coordinates,degrees_lat,minutes_lat,directio_lat,degrees_lon,minutes_lon,directio_lon
17,262225,lat 15° S. lon 105° S.,15,18.0,S.,105,30.0,S.
18,288489,lat 12° 5' S. lon 94° 33',12,5.0,S.,94,33.0,S.
19,294966,lat 9° 4' N.,9,4.0,N.,94,33.0,S.
20,322030,lat 10° 30' N.,12,30.0,N.,94,33.0,S.
21,322038,lat 14° 30' N. lon 69° 50' E.,14,30.0,N.,69,50.0,E.
22,339216,lat 21° 30' N.,21,30.0,N.,69,50.0,E.
23,393560,lat 33° 22' lon 16° 17',33,22.0,N.,16,17.0,E.
24,422500,lat 45° N.,45,22.0,N.,16,17.0,E.
25,428050,lat 45° 37' S. lon 37° 53' W.,45,37.0,S.,37,53.0,W.
26,447777,lat 55°,55,37.0,S.,37,53.0,W.


In [22]:
# Plot every book coordinate as a marker; the map is also saved as ./plots/books.html.
visualise_locations(df_concat_fill,'books',data_type='dataframe')

In [19]:
# Same coordinates joined by line segments in row order; saved as ./plots/connected_books.html.
visualise_locations(df_concat_fill,'books',connected=True,data_type='dataframe')

In [24]:
# Display the previously saved marker map (./plots/books.html) inline by
# embedding the HTML file in an iframe.

from IPython.display import IFrame
IFrame(src='./plots/books.html', width=700, height=600)
