## Import modules

In [1]:
import random
import pandas as pd
import geopandas as gpd

from folium import plugins

import html2text
import nltk
import folium

import os
import math

## Minor Functions

Define a debugging helper that prints each argument's source expression together with its value

In [2]:
import inspect
import re


def _split_call_arguments(argument_source: str) -> list:
    """Split the text between a call's parentheses on its top-level commas.

    Commas nested inside (), [] or {} do not split. Commas inside string
    literals are not recognised and will still split the text.
    """
    pieces, current, depth = [], [], 0
    for char in argument_source:
        if char in "([{":
            depth += 1
        elif char in ")]}":
            depth -= 1

        if char == "," and depth == 0:
            pieces.append("".join(current).strip())
            current = []
        else:
            current.append(char)

    tail = "".join(current).strip()
    if tail:
        pieces.append(tail)
    return pieces


def prinfo(*args, **kwargs):
    """Print every positional argument as ``<source expression> = <value>``.

    The expression text is recovered from the source line of the caller. When
    that line is unavailable, or it yields fewer expressions than values, the
    value is labelled ``arg<position>`` instead of being dropped.

    Args:
        *args: values to print, one line per value.
        **kwargs: only ``newline`` is read; if truthy, a line break is put
            between the ``=`` sign and the value.
    """
    frame = inspect.currentframe()
    caller = frame.f_back if frame is not None else None
    try:
        context = inspect.getframeinfo(caller).code_context if caller is not None else None
    finally:
        # drop the frame references to avoid keeping a reference cycle alive
        del frame, caller

    # code_context is None when the caller's source is not available
    call_match = re.search(r"\((.*)\)", context[0]) if context else None
    vnames = _split_call_arguments(call_match.group(1)) if call_match else []

    newlinestring = "\n" if kwargs.get('newline') else ""

    for position, val in enumerate(args):
        # keyword arguments such as newline=True come after the positional
        # ones in the call text, so the first len(args) names line up with args
        var = vnames[position] if position < len(vnames) else f"arg{position}"
        print(f"{var} = {newlinestring}{val}")

Define a function that reads a CSV file in chunks and reports the reading progress

In [3]:
def read_csv_in_chunks(path: str, n_lines: int, nameCol: str, **read_params) -> pd.DataFrame:
    """Read a CSV file chunk by chunk while printing a progress bar.

    Rows whose ``nameCol`` value is null are dropped from every chunk before
    the chunks are concatenated.

    Args:
        path: location of the CSV file.
        n_lines: expected number of rows; only used to scale the progress
            display, so an inaccurate value no longer breaks the read.
        nameCol: column that must be non-null for a row to be kept.
        **read_params: forwarded to ``pd.read_csv``. A missing, empty or
            non-positive ``chunksize`` is replaced by 80000.

    Returns:
        The concatenated rows; an empty DataFrame if no chunk was read.
    """
    chunksize = read_params.get('chunksize')
    if not chunksize or chunksize < 1:
        chunksize = 80000
    read_params['chunksize'] = chunksize

    # collect chunks dynamically: the real chunk count may differ from the
    # estimate derived from n_lines
    chunks = []

    for i, chunk in enumerate(pd.read_csv(path, **read_params)):
        if n_lines and n_lines > 0:
            percent = min(((i + 1) * chunksize / n_lines) * 100, 100.0)
            print("#" * int(percent), f"{percent:.2f}%", end='\r', flush=True)
        else:
            # total unknown: report the number of chunks instead of a percentage
            print(f"{i + 1} chunks read", end='\r', flush=True)
        chunks.append(chunk[chunk[nameCol].notnull()])

    print()
    print("Now concatenating chunks...")
    df = pd.concat(chunks, axis=0) if chunks else pd.DataFrame()
    print("Finished!")
    return df

## Import Stuff

### Gazetteer

In [4]:


def read_gazetteer(gaz: str = 'nga') -> dict:
    """Load one of the supported gazetteers and describe its layout.

    Args:
        gaz: which gazetteer to load: ``"nga"``, ``"geonames"`` or
            ``"countries"``.

    Returns:
        A dict with the keys ``'df_gazetteer'`` (the loaded DataFrame),
        ``'indexOfLat'`` and ``'indexOfLon'`` (positional indices of the
        latitude/longitude columns) and ``'nameCol'`` (name of the column
        holding the place names).

    Raises:
        ValueError: if ``gaz`` is not one of the supported names.
    """
    if gaz == "nga":
        gazetteerpath = "data/gazetteers/nga/countries_administrative.csv"
        # indexOfLat = 4
        # indexOfLon = 5
        indexOfLat = 1
        indexOfLon = 2
        n_lines = 484618  # for nga administrative
        # n_lines =  284485 #for nga administrative approved
        # n_lines = 7866485 #for nga administrative populated
        nameCol = "SORT_NAME_RO"  # for nga

        # read in the gazetteer csv
        df_gazetteer = read_csv_in_chunks(
            path=gazetteerpath,
            n_lines=n_lines,
            low_memory=False,
            nameCol=nameCol)

        # append the manually maintained places to the NGA data
        df_gazetteer = pd.concat(
            [df_gazetteer,
             pd.read_csv("data/gazetteers/own_places.csv")]
        )

    elif gaz == "geonames":
        gazetteerpath = "data/geonames/allCountries_cleaned.csv"
        indexOfLat = 2
        indexOfLon = 3
        n_lines = 6974472  # for allCountries_cleaned.csv
        # n_lines = 2079830 # for allCountries_AT.csv
        nameCol = "name"

        # read in the gazetteer csv
        df_gazetteer = read_csv_in_chunks(
            path=gazetteerpath,
            n_lines=n_lines,
            low_memory=False,
            nameCol=nameCol)

    elif gaz == "countries":
        # country names and country positions live in two files keyed by "short"
        countrynames = pd.read_csv(
            "data/geodict_github/countrynames.csv", names=["short", "long"])
        countrypositions = pd.read_csv(
            "data/geodict_github/countrypositions.csv", names=["short", "lat", "lon"])

        df_gazetteer = countrynames.merge(countrypositions, on="short")
        df_gazetteer.long = df_gazetteer.long.str.strip()
        indexOfLat = 2
        indexOfLon = 3
        nameCol = "long"

    else:
        # previously an unknown name fell through and failed later with an
        # UnboundLocalError on df_gazetteer
        raise ValueError(
            f"Unknown gazetteer {gaz!r}; expected 'nga', 'geonames' or 'countries'")

    return {
        'df_gazetteer': df_gazetteer,
        'indexOfLat': indexOfLat,
        'indexOfLon': indexOfLon,
        'nameCol': nameCol
    }

### Textfile

Read the text from a local file, or extract it from a given URL with `html2text` and cache it on disk for later runs

In [5]:
def read_textfile(textfile: str = None, url: str = None) -> str:
    """Return the text to analyse, from a local file or from a URL.

    Args:
        textfile: path of a text file to read. Takes precedence over ``url``.
        url: address handed to ``html2text``. The extracted text is cached in
            ``data/texts/autosave/`` under a name derived from the last 20
            characters of the URL, and the cache is reused on later calls.

    Returns:
        The text content.

    Raises:
        Warning: if neither ``textfile`` nor ``url`` is given.
    """
    if textfile:
        with open(textfile, "r", encoding="utf-8") as raw_text:
            return raw_text.read()

    if url:
        # the URL tail may contain path separators, which would point the
        # cache file into a non-existent sub-directory
        cache_suffix = url[-20:].replace("/", "_").replace(os.sep, "_")
        text_save_path = f"data/texts/autosave/lastText_{cache_suffix}"

        if os.path.exists(text_save_path):
            with open(text_save_path, "r", encoding="utf-8") as lt:
                return lt.read()

        # NOTE(review): the PyPI html2text package's html2text() takes HTML
        # text, not a `url` keyword - confirm which html2text module is
        # imported here.
        text = html2text.html2text(url=url)

        os.makedirs(os.path.dirname(text_save_path), exist_ok=True)
        with open(text_save_path, "w", encoding="utf-8") as lt:
            lt.write(text)
        return text

    raise Warning("No path specified")


# url = 'https://www.theguardian.com/global-development/2021/dec/21/uk-accused-of-abandoning-\
# worlds-poor-as-aid-turned-into-colonial-investment'
# url = "https://www.theguardian.com/world/2021/oct/21/cuts-to-overseas-aid-thwart-uk-efforts-\
# to-fight-covid-pandemic"

## Matching

Extract places from text using nltk

In [6]:
def create_placenames(text: str, df_gazetteer: pd.DataFrame, propDic: dict) -> tuple:
    """Extract place names from ``text`` and geocode them with the gazetteer.

    Chunks labelled ``GPE`` by nltk are lemmatized, upper-cased and stripped of
    spaces, counted, and then matched against the lemmatized gazetteer names.
    The first matching gazetteer row supplies name, latitude and longitude.
    Places without a match are dropped from the result.

    Side effect: the columns ``lemma_placenames`` and ``stem_placenames`` are
    added to ``df_gazetteer`` in place.

    Args:
        text: the text to analyse.
        df_gazetteer: gazetteer table.
        propDic: layout description with the keys ``'nameCol'``,
            ``'indexOfLat'`` and ``'indexOfLon'``. An optional
            ``'indexOfName'`` gives the position of the display-name column
            (default 7).

    Returns:
        ``(geo, d)``: a GeoDataFrame (EPSG:4326) with the columns ``count``,
        ``name``, ``lat``, ``lon``, ``stemname`` and point geometries, and the
        dict mapping each matched place to its attributes.
    """
    tokenized = nltk.word_tokenize(text)
    tree = nltk.ne_chunk(nltk.pos_tag(tokenized))

    # keep only the chunks labelled as geo-political entities
    places = [
        " ".join(leaf[0] for leaf in subtree)
        for subtree in tree
        if hasattr(subtree, "label") and subtree.label() == "GPE"
    ]

    stemmer = nltk.stem.PorterStemmer()
    lemmatizer = nltk.stem.WordNetLemmatizer()
    places = [lemmatizer.lemmatize(word).upper().replace(
        " ", "") for word in places]
    prinfo(sorted(places)[:10])
    print()

    # Create dictionary with placenames as keys and dictionaries with the counts as values.
    d = {}
    for place in places:
        if place in d:
            d[place]["count"] += 1
        else:
            d[place] = {"count": 1}

    # Add a column with lemmatized placenames to compare them with the
    # lemmatized placenames of the text.
    name_col = propDic['nameCol']
    df_gazetteer["lemma_placenames"] = df_gazetteer[name_col].apply(
        lambda place: lemmatizer.lemmatize(place))
    # NOTE(review): stem_placenames is not read anywhere in this function; it
    # is kept only because callers may rely on the column being added.
    df_gazetteer["stem_placenames"] = df_gazetteer[name_col].apply(
        lambda place: stemmer.stem(place))

    # Narrow the gazetteer once to the rows that can match at all, instead of
    # scanning the full table for every place.
    candidates = df_gazetteer[df_gazetteer["lemma_placenames"].isin(list(d))]

    name_index = propDic.get('indexOfName', 7)
    lenKeys = len(d)
    failed_places = []

    # Fill the dictionary with the coordinates of the placenames.
    for i, (placename, attributes) in enumerate(d.items()):
        matches = candidates[candidates["lemma_placenames"] == placename]

        if matches.empty:
            failed_places.append(placename)
        else:
            row = matches.values[0]
            # Gazetteers with fewer columns have no column at name_index; fall
            # back to the name column instead of discarding the match.
            if name_index < len(row):
                attributes["name"] = row[name_index]
            else:
                attributes["name"] = matches.iloc[0][name_col]
            attributes["lat"] = row[propDic['indexOfLat']]
            attributes["lon"] = row[propDic['indexOfLon']]

        # give feedback on the progress
        print(f"{i+1} of {lenKeys} ({round(((i+1)/lenKeys)*100, 1)}%)", end='\r')

    # show the first entries, including the ones that could not be matched
    for place, attributes in list(d.items())[:10]:
        print((place, attributes))

    # delete the placenames that were not found from the dictionary
    for fail in failed_places:
        del d[fail]

    df = pd.DataFrame(
        [[attributes["count"], attributes["name"], attributes["lat"], attributes["lon"], place]
         for place, attributes in d.items()],
        columns=["count", "name", "lat", "lon", "stemname"])

    geo = gpd.GeoDataFrame(
        df,
        geometry=gpd.points_from_xy(df.lon, df.lat),
        crs=4326
    )

    return (geo, d)

    # geojsonname = textfile[textfile.find("/")+1:textfile.find(".")][6:]
    # geo.to_file(f"data/geodataframes/{geojsonname}.geojson", driver='GeoJSON')

## Initialization

In [7]:
# Article analysed in this run
ARTICLE_URL = "https://www.theguardian.com/global-development/2022/jan/14/worlds-poorest-bear-brunt-of-climate-crisis-10-underreported-emergencies"

# Load the default gazetteer together with its layout description
propDic = read_gazetteer()
df_gazetteer = propDic['df_gazetteer']

# Get the text to analyse
text = read_textfile(url=ARTICLE_URL)
# text = read_textfile("data/texts/tagesanzeiger_spendensammler.txt")
# text = read_textfile("data/texts/aid_wiki.txt")

# Extract the places and look up their coordinates
geo, d = create_placenames(text=text, df_gazetteer=df_gazetteer, propDic=propDic)

#################################################################################################### 100.00%
Now concatenating chunks...
Finished!
sorted(places)[:10] = ['BANGASSOU', 'BUENAVENTURA', 'BURUNDI', 'GUARDIAN', 'GUATEMALA', 'HONDURAS', 'HONDURAS', 'KNOWNHUMANITARIAN', 'LATIN', 'MALAWI']

('GUARDIAN', {'count': 1})
('SAN', {'count': 2, 'name': 'San', 'lat': 13.178953, 'lon': -5.016175})
('KNOWNHUMANITARIAN', {'count': 1})
('ZAMBIA', {'count': 1, 'name': 'Zambia', 'lat': -14.33333, 'lon': 28.5})
('RUSSIA', {'count': 1, 'name': 'Russia', 'lat': 60.0, 'lon': 100.0})
('UKRAINE', {'count': 2, 'name': 'Ukraine', 'lat': 49.0, 'lon': 32.0})
('UKRAINIAN', {'count': 1})
('MALAWI', {'count': 1, 'name': 'Malawi', 'lat': -13.5, 'lon': 34.0})
('BANGASSOU', {'count': 1})
('GUATEMALA', {'count': 1, 'name': 'Guatemala', 'lat': 14.666667, 'lon': -90.5})


## Visualization

In [8]:
random.seed(1)

# Highest mention count that needs a colour; 0 when no place was matched
max_count = int(geo["count"].max()) if len(geo) else 0

# Reproducible random colour for every count, used as fallback for counts
# that the fixed palette below does not cover
colordic = {
    count: "#%06x" % random.randint(0, 0xFFFFFF)
    for count in range(1, max_count + 1)
}

# Fixed palette for the counts 1-5; overrides the random fallback colours
colordic.update({
    1: '#440154',
    2: '#3b528b',
    3: '#21918c',
    4: '#5ec962',
    5: '#fde725'
})
print(colordic)

color_list = [colordic[count] for count in geo["count"]]

{1: '#440154', 2: '#3b528b', 3: '#21918c', 4: '#5ec962', 5: '#fde725'}


In [9]:
# create map
heatMap = folium.Figure(width='75%')
heatMap = folium.Map(
    location=[15, 30],
    zoom_start=2,
    max_bounds=True,
    tiles=None).add_to(heatMap)

# add tiles
folium.TileLayer(tiles='Cartodb dark_matter', name="Dark").add_to(heatMap)
folium.TileLayer(tiles='stamen watercolor', name="Watercolor").add_to(heatMap)

# add points and markercluster
points = folium.FeatureGroup(name="Points", show=True).add_to(heatMap)
cluster = plugins.MarkerCluster(name="Cluster", show=False).add_to(heatMap)

for place, attributes in d.items():

    coordinates = (attributes['lat'], attributes['lon'])

    html = f'''

    <strong>Name:</strong> &emsp;&emsp;&emsp;&emsp;&emsp;{attributes['name']}<br/>
    <strong>Stemmed Name:</strong>&emsp;{place}<br/>
    <strong>Count:</strong>&emsp;&emsp;&emsp;&emsp;&emsp;{attributes['count']}
    
    '''

    iframe = folium.IFrame(
        html,
        width=300,
        height=70)

    popup = folium.Popup(iframe)

    folium.Circle(coordinates).add_to(cluster)
    folium.Circle(
        location=coordinates,
        popup=popup,
        tooltip=attributes['name'],
        radius=attributes['count'] * 50000,
        fill=True,
        color=colordic[attributes['count']]
    ).add_to(points)

# extract coordinate of geodataframe
coordinates = [[point.xy[1][0], point.xy[0][0]] for point in geo.geometry]

# add heatmap
plugins.HeatMap(
    name='HeatMap',
    data=coordinates,
    min_opacity=0.3,
    show=False
).add_to(heatMap)


new = []
for place in geo.iterrows():
    for i in range(place[1]['count']):
        new.append(place[1])

geo_multiple = pd.DataFrame(new)

geo_multiple = gpd.GeoDataFrame(
    geo_multiple,
    geometry=gpd.points_from_xy(geo_multiple.lon, geo_multiple.lat),
    crs=4326
)

# extract coordinate of geodataframe
coordinates = [[point.xy[1][0], point.xy[0][0]]
               for point in geo_multiple.geometry]

# add heatmap
plugins.HeatMap(
    name='HeatMap_multiple',
    data=coordinates,
    min_opacity=0.3,
    show=False
).add_to(heatMap)

# add layercontrol
folium.LayerControl(collapsed=False).add_to(heatMap)


heatMap

In [10]:
# figure that holds the point map
pointMap = folium.Figure(width='35%')

# options for explore(): one colour per place, drawn as filled circles
explore_options = dict(
    color=color_list,
    max_bounds=True,
    tiles="Open Street Map",  # "Stamen Watercolor",
    marker_type='circle',
    marker_kwds={'radius': 50000, 'fill': True},
)

gpd.GeoSeries.explore(geo, **explore_options).add_to(pointMap)

# folium.TileLayer(tiles='stamen watercolor', name="Watercolor").add_to(pointMap)

pointMap