In [1]:
import pandas as pd
import geopandas as gpd
from pprint import pprint
import matplotlib.pyplot as plt
import contextily as cx
import folium

import math

In [2]:
gaz = 'nga'
# textfile = "data/texts/UK accused of abandoning world’s poor as aid turne.txt"
textfile = "data/texts/aid_wiki.txt"


if gaz == "nga":
    gazetteerpath = "data/nga/countries_administrative.csv"
    indexOfLat = 4
    indexOfLon = 5
    n_lines =  484618 #for nga administrative
    # n_lines =  284485 #for nga administrative approved
    nameCol = "FULL_NAME_RO" # for nga

elif gaz == "geonames":
    gazetteerpath = "data/geonames/allCountries_cleaned.csv"
    indexOfLat = 5
    indexOfLon = 6
    n_lines = 6974472
    nameCol = "name"


Define function to give information about the progress of reading the csv

In [3]:
def read_csv_in_chunks(path, n_lines, **read_params):
    if 'chunksize' not in read_params or read_params['chunksize'] < 1:
        read_params['chunksize'] = 80000

    chunks = [0] * math.ceil(n_lines / read_params['chunksize'])

    for i, chunk in enumerate(pd.read_csv(path, **read_params)):
        percent = min(((i + 1) * read_params['chunksize'] / n_lines) * 100, 100.0)
        print("#" * int(percent), f"{percent:.2f}%", end='\r', flush=True)
        chunks[i] = chunk
    
    print()
    print("Now concatenating chunks...")
    df = pd.concat(chunks, axis=0)
    del chunks
    print("Finished!")
    return df

Read gazetter data (csv) and save placenames in a list

In [4]:
# read in the gazetter csv
df_gazetteer = read_csv_in_chunks(
    path = gazetteerpath,
    n_lines = n_lines,
    low_memory=False)

# save all placenames as a list
placenames = df_gazetteer[nameCol].tolist()
print(placenames[:5])

#################################################################################################### 100.00%
Now concatenating chunks...
Finished!
['Aruba', 'Land Aruba', 'Aruba', 'Country of Aruba', 'Aruba']


Read in the textfile that is to be georeferenced

In [5]:
with open(textfile, "r") as wiki:
    lines = wiki.readlines()
allWords = []
for line in lines:
    allWords.extend(line.strip().split(" "))

Match words in the textfile with placenames in the gazetteer. There is a moving window that can be set with the variable num_words.

In [6]:
# create a set to speed up the query
placenames_set = set(placenames)
# create empty list that is filled with the matched placenames
af_matches = []
# define size of moving window
num_words = 6


skip_count = -999
# loop throuth the words of the text
for i, word in enumerate(allWords):

    # the amount of words that the last placename match consisted of is skipped
    # e.g. "united states" and "states" would be two matches otherwise 
    skip_count -= 1
    if skip_count >= 0:
        continue

    try:
        nWords = [nword for nword in allWords[i : i+num_words]]

        # take the current n words (moving window) and create all possible concatenations that contain the last word
        # if there is a match, all words that where in the moving window are skipped
        for j in range(num_words-1, -1, -1):
            tmp_words = " ".join(nWords[:j+1])
            if tmp_words in placenames_set:
                af_matches.append(tmp_words)
                skip_count = j
                continue

    except IndexError as e:
        print(e)
        break

Create Dictionary with the matched placenames as keys and dictionaries with the counts as values.

In [7]:
d = {}
len_af_matches = len(af_matches)

for i, match in enumerate(af_matches):
    if match in d.keys():
        d[match]["count"] += 1
    else:
        d[match] = {"count": 1}

Fill the dictionary with the coordinates of the placenames.

In [8]:
lenKeys = len(d)
logInterval = 1 #int(round(lenKeys/20, 0))

for i, (placename, attributes) in enumerate(d.items()):

    tmp_df_values = df_gazetteer.query(f"{nameCol} == @placename").values[0]
    attributes["lat"] = tmp_df_values[indexOfLat]
    attributes["lon"] = tmp_df_values[indexOfLon]

    # give feedback to progress
    
    if (i % logInterval == 0 and i > 0) or i == lenKeys:
        print(f"{i} of {lenKeys} ({round((i/lenKeys)*100, 1)}%)", end='\r')

89 of 90 (98.9%)

Create a geodataframe with the matched placenames

In [9]:
geo = pd.DataFrame(
    [ [place] + list(attributes.values()) for place, attributes in d.items()],
    columns=["name", "count", "lat", "lon"])


In [10]:


geo = gpd.GeoDataFrame(
    geo,
    geometry=gpd.points_from_xy(geo.lon, geo.lat),
    crs=4326
)

geojsonname = textfile[textfile.find("/")+1:textfile.find(".")][6:]
geo.to_file(f"data/geodataframes/{geojsonname}.geojson", driver='GeoJSON')

In [11]:
pointMap = gpd.GeoSeries.explore(
    geo,
    tiles="Stamen Watercolor",
    marker_type='circle',
    marker_kwds={'radius': 50000,
    'fill': True})

pointMap

In [12]:
from folium import plugins

# extract coordinate of geodataframe
coordinates = [[point.xy[1][0], point.xy[0][0]] for point in geo.geometry]

# create map
heatMap = folium.Map(location = [15,30], zoom_start = 2, tiles=None)

# add tiles
folium.TileLayer(tiles='Cartodb dark_matter', name="Dark").add_to(heatMap)
folium.TileLayer(tiles='stamen watercolor', name="Watercolor").add_to(heatMap)

# add points and markercluster
points = folium.FeatureGroup(name="Points", show=False).add_to(heatMap)
cluster = plugins.MarkerCluster(name="Cluster").add_to(heatMap)
for coordinate in coordinates:
    folium.Circle(coordinate).add_to(cluster)
    folium.Circle(coordinate).add_to(points)

# add heatmap
plugins.HeatMap(
    name = 'HeatMap',
    data = coordinates,
    min_opacity = 0.3
    ).add_to(heatMap)

# add layercontrol
folium.LayerControl(collapsed=False).add_to(heatMap)


heatMap