In [41]:
import pandas as pd
import geopandas as gpd
from pprint import pprint
# import matplotlib.pyplot as plt
# import contextily as cx
import folium
import html2text
import os

import math

import inspect
import re


Define function to debug

In [42]:
def prinfo(*args, **kwargs):
    frame = inspect.currentframe().f_back
    s = inspect.getframeinfo(frame).code_context[0]
    r = re.search(r"\((.*)\)", s).group(1)
    vnames = r.split(", ")

    if 'newline' in kwargs:
        newlinestring = "\n" if kwargs['newline'] else ""
    else:
        newlinestring = ""

    for i, (var, val) in enumerate(zip(vnames, args)):
        print(f"{var} = {newlinestring}{val}")


Define function to give information about the progress of reading the csv

In [43]:
def read_csv_in_chunks(path, n_lines, **read_params):
    if 'chunksize' not in read_params or read_params['chunksize'] < 1:
        read_params['chunksize'] = 80000

    chunks = [0] * math.ceil(n_lines / read_params['chunksize'])

    for i, chunk in enumerate(pd.read_csv(path, **read_params)):
        percent = min(((i + 1) * read_params['chunksize'] / n_lines) * 100, 100.0)
        print("#" * int(percent), f"{percent:.2f}%", end='\r', flush=True)
        chunks[i] = chunk
    
    print()
    print("Now concatenating chunks...")
    df = pd.concat(chunks, axis=0)
    del chunks
    print("Finished!")
    return df

In [44]:
gaz = 'nga'
textfile = "data/texts/aid_wiki.txt"

# IMPORTANT: try the package nltk (natural language processing)

url = 'https://www.theguardian.com/global-development/2021/dec/21/uk-accused-of-abandoning-worlds-poor-as-aid-turned-into-colonial-investment'
text_save_path = f"data/texts/autosave/lastText_{url[-20:]}"
# url = "https://www.theguardian.com/world/2021/oct/21/cuts-to-overseas-aid-thwart-uk-efforts-to-fight-covid-pandemic"



if gaz == "nga":
    gazetteerpath = "data/nga/countries_administrative.csv"
    indexOfLat = 4
    indexOfLon = 5
    n_lines =  484618 #for nga administrative
    # n_lines =  284485 #for nga administrative approved
    # n_lines = 7866485 #for nga administrative populated
    nameCol = "SORT_NAME_RO" # for nga

elif gaz == "geonames":
    gazetteerpath = "data/geonames/allCountries_cleaned.csv"
    indexOfLat = 5
    indexOfLon = 6
    n_lines = 6974472
    nameCol = "name"
    


Read gazetter data (csv) and save placenames in a list

In [45]:
if gaz == "countries":
    countrynames = pd.read_csv("data/geodict_github/countrynames.csv", names=["short", "long"])
    countrypositions = pd.read_csv("data/geodict_github/countrypositions.csv", names=["short", "lat", "lon"])

    df_gazetteer = countrynames.merge(countrypositions, on="short")
    df_gazetteer.long = df_gazetteer.long.str.strip()
    indexOfLat = 2
    indexOfLon = 3
    n_lines = 240
    nameCol = "long"
    
else:
# read in the gazetter csv
    df_gazetteer = read_csv_in_chunks(
        path = gazetteerpath,
        n_lines = n_lines,
        low_memory=False)

# save all placenames as a list
placenames = [country.strip() for country in df_gazetteer[nameCol].tolist()]

prinfo(placenames[:5])

#################################################################################################### 100.00%
Now concatenating chunks...
Finished!
placenames[:5] = ['ARUBA', 'LANDARUBA', 'ARUBA', 'COUNTRYOFARUBA', 'ARUBA']


Import textfile or call the html2text function to extract the text from a given url and save it for next use

In [46]:
if textfile:
    with open(textfile, "r") as wiki:
        lines = wiki.readlines()
    allWords = []
    for line in lines:
        allWords.extend(line.strip().split(" "))

else:
    if not os.path.exists(text_save_path):
        allWords = html2text.html2text(url=url).split(" ")

        with open(text_save_path, "w") as lt:
            lt.write(" ".join(allWords))

    else:
        with open(text_save_path, "r") as lt:
            allWords = lt.read().split(" ")

    # # include only capital words in list
    # allWords = [word for word in allWords if word[0].isupper()]


Import most common words

In [47]:
# # 500
# common_set = [str(word).upper() for word in list(pd.read_csv("data/500_MostCommonWords.csv", skiprows=4, sep=";", header=None)[1].values)]

# 1000
with open("data/1000_MostCommonWords.txt", "r") as common_1000:
    common_set = {word.strip().upper() for word in common_1000.readlines()}


In [56]:
placenames_set

{'DESATONDANOUWSATU',
 'MUNICIPIODEMARUMBI',
 'EREBANGO',
 'PEIHUOSHANG',
 '北漳',
 'MUNICIPIODEPINHEIROPRETO',
 '王皮溜镇',
 'KECAMATANKOTASUMENEP',
 'HSIHANLING',
 '发窝乡',
 'LANDKREISWUNSIEDELIMFICHTELGEBIRGE',
 'TZULINGPU',
 'DAURALOCALGOVERNMENTAREA',
 '풍덕리',
 'LANGEH',
 '柘溪镇',
 'DESAWOLOMAPA',
 'PAIKUAN',
 'XATRUNGSON',
 'XEMIRXEK',
 'NKOMANGOMBE',
 'MANGNONGZHEN',
 '신효리',
 '금사리',
 'CALAKMUL',
 'DAEBANG',
 'ALTEPEXI',
 'TUNLITSUN',
 'BATANGAN',
 'DAIRADEBENIABBES',
 'MUNICIPIODEOUROFINO',
 '花丛',
 '河口街道',
 'CAGWAIT',
 '회덕b동',
 'SANLORENZORUIZUNO',
 'CHONDONGNI',
 'DESATANDOYONDO',
 'DESABALOREJO',
 'QINGTONGXIASHI',
 'BOEWEINTOBA',
 '涛圩',
 'GURVANZAGALSUM',
 'KENYAMONTA',
 '太平里乡',
 'CARRASCO',
 '花山',
 'JINNIU',
 '丈亭镇',
 'ITAPICURU',
 '东川镇',
 '방림면',
 'DEPARTEMENTDESARDENNES',
 'SORIOAN',
 '三官',
 '吉潭镇',
 '栏杆集镇',
 '荣华乡',
 'YABON',
 '전진리',
 '美里町',
 'MUNICIPIODESANTODOMINGOALBARRADAS',
 'KABUPATENTEGAL',
 'KAMPUNGKOMBAMA',
 'NORTHCADULAWAN',
 'TAPAUA',
 'HSUCHIATO',
 'あきるのし',
 '전호리',
 'WEERT',

Match words in the textfile with placenames in the gazetteer. There is a moving window that can be set with the variable num_words. Only words that don't appear in the 500 most frequent word list get added.

In [48]:
# create a set that only contains placenames that are not in the most common words set
placenames_set = {placename for placename in placenames if placename not in common_set}


# create empty list that is filled with the matched placenames
af_matches = []
# define size of moving window
num_words = 6


skip_count = -999
# loop through the words of the text
for i, word in enumerate(allWords):

    # the amount of words that the last placename match consisted of is skipped
    # e.g. "united states" and "states" would be two matches otherwise 
    skip_count -= 1
    if skip_count >= 0:
        continue

    try:
        nWords = [nword for nword in allWords[i : i+num_words]]

        # take the current n words (moving window) and create all possible concatenations that contain the last word
        # if there is a match, all words that where in the moving window are skipped
        for j in range(num_words-1, -1, -1):
            if gaz == "nga":
                tmp_words = "".join( [word.upper() for word in nWords[:j+1]] )
            if gaz in ["geonames", "countries"]:
                tmp_words = " ".join( [word.strip() for word in nWords[:j+1] ])

            # print(tmp_words)

            if tmp_words in placenames_set:
                af_matches.append(tmp_words)
                skip_count = j
                break

    except IndexError as e:
        print(e)
        break

Create Dictionary with the matched placenames as keys and dictionaries with the counts as values.

In [49]:
d = {}
len_af_matches = len(af_matches)

for i, match in enumerate(af_matches):
    if match in d.keys():
        d[match]["count"] += 1
    else:
        d[match] = {"count": 1}


Fill the dictionary with the coordinates of the placenames.

In [50]:
lenKeys = len(d)
logInterval = 1 #int(round(lenKeys/20, 0))

for i, (placename, attributes) in enumerate(d.items()):

    tmp_df_values = df_gazetteer.query(f"{nameCol} == @placename").values[0]
    # tmp_df_values = df_gazetteer[df_gazetteer["SORT_NAME_RO"] == "MAFIKENG"].values[0]
    attributes["lat"] = tmp_df_values[indexOfLat]
    attributes["lon"] = tmp_df_values[indexOfLon]

    # give feedback to progress
    
    if (i+1 % logInterval == 0 and i > 0) or i+1 == lenKeys:
        print(f"{i+1} of {lenKeys} ({round((i/lenKeys)*100, 1)}%)", end='\r')

100 of 100 (99.0%)

In [55]:
lyst = []

for word in common_set:
    # bool = df_gazetteer[nameCol].str.contains(word, case=False)
    # if bool.sum() > 1:
    if word in placenames_set:
        lyst.append(word)



Create a geodataframe with the matched placenames

In [52]:
df = pd.DataFrame(
    [ [place] + list(attributes.values()) for place, attributes in d.items()],
    columns=["name", "count", "lat", "lon"])

geo = gpd.GeoDataFrame(
    df,
    geometry=gpd.points_from_xy(df.lon, df.lat),
    crs=4326
)

# geojsonname = textfile[textfile.find("/")+1:textfile.find(".")][6:]
# geo.to_file(f"data/geodataframes/{geojsonname}.geojson", driver='GeoJSON')

geo.head()

Unnamed: 0,name,count,lat,lon,geometry
0,ADE,1,12.456154,22.240731,POINT (22.24073 12.45615)
1,ASA,21,27.7824,101.764,POINT (101.76400 27.78240)
2,PUBLIC,10,7.688139,124.774652,POINT (124.77465 7.68814)
3,READING,2,51.45,-0.966667,POINT (-0.96667 51.45000)
4,ITIS,21,60.927418,26.280911,POINT (26.28091 60.92742)


In [53]:
pointMap = folium.Figure(width='35%')
gpd.GeoSeries.explore(
    geo,
    max_bounds=True,
    tiles="Open Street Map", # "Stamen Watercolor",
    marker_type='circle',
    marker_kwds={'radius': 50000,
    'fill': True}).add_to(pointMap)

# folium.TileLayer(tiles='stamen watercolor', name="Watercolor").add_to(pointMap)

pointMap

In [54]:
from folium import plugins

# extract coordinate of geodataframe
coordinates = [[point.xy[1][0], point.xy[0][0]] for point in geo.geometry]

# create map
heatMap = folium.Figure(width='35%')
heatMap = folium.Map(location = [15,30], zoom_start = 2, tiles=None).add_to(heatMap)

# add tiles
folium.TileLayer(tiles='Cartodb dark_matter', name="Dark").add_to(heatMap)
folium.TileLayer(tiles='stamen watercolor', name="Watercolor").add_to(heatMap)

# add points and markercluster
points = folium.FeatureGroup(name="Points", show=False).add_to(heatMap)
cluster = plugins.MarkerCluster(name="Cluster").add_to(heatMap)
for coordinate in coordinates:
    folium.Circle(coordinate).add_to(cluster)
    folium.Circle(coordinate).add_to(points)

# add heatmap
plugins.HeatMap(
    name = 'HeatMap',
    data = coordinates,
    min_opacity = 0.3
    ).add_to(heatMap)

# add layercontrol
folium.LayerControl(collapsed=False).add_to(heatMap)


heatMap