In [2]:
import pickle
import re
# print the named entities frist 10
def load_nes(type):
    
    with open('./data/named_entities.pickle', 'rb') as f:
        i = 0

        nes_raw = [];
        while True:
            try:
                ne = pickle.load(f)
                # print all the GPE entities, GPE: Geopolitical Entity
                if ne.label() == type :nes_raw.append(str(ne))

            except EOFError:
                break
    return process_nes(nes_raw,type)

def process_nes(nes,type):
    transformed_nes = []
    i=0
    for ne in nes:
        loc = ne.replace('('+type+' ','')
        loc = re.sub(r'/[A-Z]+','',loc)
        loc = loc.replace(')','')
        if i<10: print(loc)
        i+=1
        transformed_nes.append(loc)

    return transformed_nes


# https://towardsdatascience.com/geoparsing-with-python-c8f4c9f78940
# https://geopy.readthedocs.io/en/stable/index.html#accessing-geocoders
import geopy
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut
geopy.geocoders.options.default_user_agent = 'my_app/2'
geopy.geocoders.options.default_timeout = 2
geolocator = Nominatim()

def extract_locations(nes):
    lat_lon = []
    for ne in nes: 
        try:
            location = geolocator.geocode(ne)
            if location:
                # print(ne,location.latitude, location.longitude)
                lat_lon.append(location)
        except GeocoderTimedOut as e:  print("Error: geocode failed on input %s with message %s"%(ne, e))
    return lat_lon

# display the coordinates on a map
import folium
from folium.plugins import MarkerCluster, HeatMap
from lat_lon_parser import parse


# mc = MarkerCluster()
# mc.add_child(folium.Marker(location=[lat,long]))
# folium.Marker(location=[lat,long]).add_to(m)
#  add the marker cluster to the map
# m.add_child(mc)

def visualise_locations(lat_lon,name, heat = False, connected = False, data_type = 'locations'):
    # create a map
    m = folium.Map(location=[0, -150], zoom_start=2)

    if data_type == 'locations':    data = [[loc.latitude,loc.longitude] for loc in lat_lon]
    elif data_type == 'dataframe': 
        data = [     
            [parse(str(row['degrees_lat'])+'° '+str(row['minutes_lat'])+"' "+row['directio_lat']),
            parse(str(row['degrees_lon'])+'° '+str(row['minutes_lon'])+"' "+row['directio_lon'])]
            for idx, row in lat_lon.iterrows()
        ]


    if heat:   HeatMap(data).add_to(m)
    elif connected:
        for i in range(len(data)-1):
            folium.PolyLine(locations=[data[i],data[i+1]], weight=2, color='blue').add_to(m) 
    else:
        for d in data: folium.Marker(location=d).add_to(m)


    # save it as html
    title = name+'.html'
    if heat: title = 'heat_'+title
    elif connected: title = 'connected_'+title
    m.save('./plots/'+title)

    # display the map
    return m


In [12]:
gpes = load_nes('GPE')
gpe_lat_lon = extract_locations(gpes)
visualise_locations(gpe_lat_lon,'gpe',heat=True)


Europe
America
Australia
Captain
Similar
Fifteen
United States
Europe
Aleutian
Kulammak


In [8]:
locations = load_nes('GPE')

def read_file(file_name):
    with open(file_name) as f:
        lines = f.readlines()
    return lines

path = './data/corefed_text.txt'

coref_text = read_file(path)[0]

import nltk
tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")
coref_sentences = tokenizer.tokenize(coref_text)

# print the locations in their contexts
for i,sent in enumerate(coref_sentences):
    for loc in locations:
        subtext = ' '.join(coref_sentences[i-1:i+1])
        if loc in sent: print(loc,',',subtext)  ; print(); break
          

Europe
America
Australia
Captain
Similar
Fifteen
United States
Europe
Aleutian
Kulammak
No ,  PART ONE     CHAPTER I A SHIFTING REEF   The year 1866 was signalised by a remarkable incident, a mysterious and puzzling phenomenon, which doubtless no one has yet forgotten. Not to mention rumours which agitated the maritime population and excited the public mind, even in the interior of continents, seafaring men were particularly excited.

Europe , Not to mention rumours which agitated the maritime population and excited the public mind, even in the interior of continents, seafaring men were particularly excited. Merchants, common sailors, captains of vessels, skippers, both of Europe and America, naval officers of all countries, and the Governments of several states on the two continents, were deeply interested in the matter.

Australia , As to classing it in the list of fables, the idea was out of the question. On the 20th of July, 1866, the steamer _Governor Higginson_, of the Calcutta a

In [11]:
locations = load_nes('LOCATION')
location_lat_lon = extract_locations(locations)
visualise_locations(location_lat_lon,'location')

In [69]:
location_lat_lon

[Location(India, Zillebeke, Ieper, West-Vlaanderen, 8902, België / Belgique / Belgien, (50.848914, 2.9505882, 0.0)),
 Location(North Pacific Ocean, (30.0, -170.0, 0.0)),
 Location(North Sea / Nordsee / Noordzee / Nordsøen / Nordsjøen / Mer du Nord, (56.0026997, 2.8144672799047834, 0.0)),
 Location(North Pacific Ocean, (30.0, -170.0, 0.0)),
 Location(East River, The Bronx, City of New York, Hudson County, New York, United States, (40.7792911, -73.92771, 0.0)),
 Location(Nord, Cameroun, (8.7712794, 13.7803627, 0.0)),
 Location(Northern Pacific, East Trent Avenue, Irwin, Trentwood, Spokane Valley, Spokane County, Washington, 99216, United States, (47.6916617, -117.2335527, 0.0)),
 Location(France, (46.603354, 1.8883335, 0.0)),
 Location(North Sea / Nordsee / Noordzee / Nordsøen / Nordsjøen / Mer du Nord, (56.0026997, 2.8144672799047834, 0.0)),
 Location(Southern Seas, El Mar Drive, Silver Shores, Lauderdale-by-the-Sea, Broward County, Florida, 33308, United States, (26.193151, -80.0952056

In [68]:
locations = load_nes('LOCATION')
location_lat_lon = extract_locations(locations)
visualise_locations(location_lat_lon,'location', connected=True)

West India
North Pacific Ocean
North Sea
North Pacific Ocean
East River
North
Northern Pacific
France
North Sea
Southern Seas


In [67]:
# load the coordinates of the books
import pandas as pd
df_concat = pd.read_csv('./data/coordinates.csv')

# filling the missing values using the last valid observation
# df_concat.fillna(method='ffill', inplace=True)
#    ffill: propagate last valid observation forward to next valid.
#     backfill / bfill: use next valid observation to fill gap.

# df_concat_fill =df_concat.fillna(method='bfill')
df_concat_fill =df_concat.fillna(method='ffill')

# todo rounding down the degrees to the nearest integer and passing the rest to the minutes
df_concat_fill['degrees_lat'] = df_concat_fill['degrees_lat'].astype(float).astype(int)
df_concat_fill['degrees_lon'] = df_concat_fill['degrees_lon'].astype(float).astype(int)
print(len(df_concat_fill))
df_concat_fill.tail(15)

# equivalent to the above code but without using dataframes
# complete incomplete coordinates with the previous one
# keys = list(book_coordinates.keys())
# for i in range(len(keys)):
#     if 'lon' not in book_coordinates[keys[i]]:
#         book_coordinates[keys[i]] = book_coordinates[keys[i]]+' lon ' +book_coordinates[keys[i-1]].split('lon')[1]
#     if 'lat' not in book_coordinates[keys[i]]:
#         book_coordinates[keys[i]] = book_coordinates[keys[i-1]].split('lon')[0]+ book_coordinates[keys[i]]

43


Unnamed: 0.1,Unnamed: 0,coordinates,degrees_lat,minutes_lat,directio_lat,degrees_lon,minutes_lon,directio_lon
28,428050,lat 45° 37' S lon 37° 53' W,45,37.0,S,37,53.0,W
29,447397,lon 50°,45,37.0,S,50,53.0,W
30,447777,lat 55°,55,37.0,S,50,53.0,W
31,448844,lat 60°,60,37.0,S,50,53.0,W
32,450406,lat 50°,50,37.0,S,50,53.0,W
33,453304,lat 67° 39' S lon 51°,67,39.0,S,51,53.0,W
34,461155,lat 52°,59,39.0,S,51,53.0,W
35,461172,lat 67° 30',78,30.0,S,51,53.0,W
36,461184,lat 90°,90,30.0,S,51,53.0,W
37,465680,lat 60°,60,30.0,S,51,53.0,W


In [13]:
visualise_locations(df_concat_fill,'books',data_type='dataframe')

In [64]:
# save locations to a gpx file
import gpxpy
import gpxpy.gpx
import datetime
gpx = gpxpy.gpx.GPX()
gpx_track = gpxpy.gpx.GPXTrack()
gpx.tracks.append(gpx_track)

def translate_longitude(longitude):                                             
    if longitude < -120:                                                          
            return  longitude   +360                                              
    else:                                                                      
            return longitude 

# Create first segment in our GPX track:
gpx_segment = gpxpy.gpx.GPXTrackSegment()
gpx_track.segments.append(gpx_segment)
for idx, row in df_concat_fill.iterrows():
#    add a segment to the gpx file
    # create time
    time = datetime.datetime(2019, 10, 10, 10, 10, 10)
    gpx_segment.points.append(gpxpy.gpx.GPXTrackPoint(
                parse(str(row['degrees_lat'])+'° '+str(row['minutes_lat'])+"' "+row['directio_lat']),
                translate_longitude(parse(str(row['degrees_lon'])+'° '+str(row['minutes_lon'])+"' "+row['directio_lon'])),
                elevation=0, time=time))
    # gpx.tracks.append(gpx_segment)

# save to a file
with open("./data/books_coos.gpx", "w") as f:  

    f.write(gpx.to_xml())

from gpxplotter import read_gpx_file, create_folium_map, add_segment_to_map


# Define some properties for drawing the line:
line_options = {'color': 'red', 'weight': 8, 'opacity': 0.5}

the_map = create_folium_map(tiles='openstreetmap') # 'stamenterrain',  'openstreetmap'
for track in read_gpx_file('./data/books_coos.gpx'):
    for i, segment in enumerate(track['segments']):
        add_segment_to_map(the_map, segment, line_options=line_options)

# To store the map as a HTML paghttps://file+.vscode-resource.vscode-cdn.net/home/louis/Documents/IRTM/project/#e:
the_map.save('./plots/book_coos.html')

# To display the map in a Jupyter notebook:
the_map  

In [65]:
# visualise_locations(df_concat_fill,'books',connected=True,data_type='dataframe')
 

In [34]:
from gpxplotter import read_gpx_file, create_folium_map, add_segment_to_map

# Define some properties for drawing the line:
line_options = {'color': 'red', 'weight': 8, 'opacity': 0.5}

the_map = create_folium_map(tiles='openstreetmap') # 'stamenterrain',  'openstreetmap'
for track in read_gpx_file('./data/nautilus_route.gpx'):
    for i, segment in enumerate(track['segments']):
        add_segment_to_map(the_map, segment, line_options=line_options, add_start_end=False)

# To store the map as a HTML page:
the_map.save('./plots/ground_truth.html')

# To display the map in a Jupyter notebook:
the_map