## Import modules

In [1]:
import random
import inspect
import re
import os
import math


import pandas as pd

import geopandas as gpd
import folium
from folium import plugins

import nltk

from urllib.request import urlopen
from bs4 import BeautifulSoup

from mycolorpy import colorlist as mcp

## Minor Functions

Define a debug helper that prints each argument together with the expression it was passed as

In [2]:
def prinfo(*args, **kwargs) -> None:
    """Print each positional argument prefixed by the expression it was passed as.

    Example:
        x = 5
        prinfo(x)

        # prints "x = 5"

    The expression text is recovered from the caller's source line, so the
    call has to fit on a single line for the labels to be meaningful. When the
    source line is unavailable (or cannot be parsed), the values are still
    printed, labelled ``arg0``, ``arg1``, ...

    Args:
        *args: values to print.
        **kwargs: ``newline`` (bool) - if truthy, print the value on the line
            after the label instead of on the same line.
    """

    frame = inspect.currentframe().f_back
    try:
        code_context = inspect.getframeinfo(frame).code_context
    finally:
        # do not keep the caller's frame alive longer than necessary
        del frame

    names = []
    if code_context:
        # everything between the first "(" and the last ")" of the calling line
        match = re.search(r"\((.*)\)", code_context[0])
        if match:
            names = _split_top_level_args(match.group(1))

    # Keyword arguments trail the positional ones in the call text: drop them.
    names = names[:len(args)]
    # Never silently drop a value just because its label could not be recovered.
    names += [f"arg{idx}" for idx in range(len(names), len(args))]

    newlinestring = "\n" if kwargs.get('newline') else ""

    for var, val in zip(names, args):
        print(f"{var} = {newlinestring}{val}")


def _split_top_level_args(call_text: str) -> list:
    """Split the text between a call's parentheses on top-level commas only.

    Commas nested inside brackets or string literals do not split, so
    ``"f(a, b), c"`` yields ``["f(a, b)", "c"]``.
    """
    parts = []
    current = []
    depth = 0
    quote = None
    escaped = False

    for char in call_text:
        if quote:
            # inside a string literal: only look for its (unescaped) end
            if escaped:
                escaped = False
            elif char == "\\":
                escaped = True
            elif char == quote:
                quote = None
        elif char in "\"'":
            quote = char
        elif char in "([{":
            depth += 1
        elif char in ")]}":
            depth -= 1
        elif char == "," and depth == 0:
            parts.append("".join(current).strip())
            current = []
            continue
        current.append(char)

    tail = "".join(current).strip()
    if tail:
        parts.append(tail)
    return parts

Define a chunked CSV reader that reports its progress while the file is being loaded

In [3]:


def read_csv_in_chunks(path: str, n_lines: int, name_col: str, **read_params) -> pd.DataFrame:
    """Read a CSV like ``pd.read_csv``, but chunk-wise so progress can be shown.

    Rows whose ``name_col`` value is null are dropped from every chunk.

    Args:
        path (str): path of the CSV file.
        n_lines (int): approximate number of rows in the file. Only used for
            the progress display, so an inaccurate (or non-positive) value
            does not affect the returned data.
        name_col (str): column that must be non-null for a row to be kept.
        **read_params: forwarded to ``pd.read_csv``. A missing or invalid
            ``chunksize`` is replaced by 80000.

    Returns:
        pandas.DataFrame: all kept rows, concatenated in file order (an empty
        frame if the reader produced no chunk).
    """

    chunksize = read_params.get('chunksize')
    if not chunksize or chunksize < 1:
        chunksize = 80000
    read_params['chunksize'] = chunksize

    # Collect chunks dynamically: the real number of chunks may differ from
    # what n_lines suggests, so a pre-sized list would over- or underflow.
    chunks = []

    for chunk_idx, chunk in enumerate(pd.read_csv(path, **read_params)):
        if n_lines and n_lines > 0:
            percent = min(((chunk_idx + 1) * chunksize / n_lines) * 100, 100.0)
        else:
            # no usable size estimate: avoid dividing by zero
            percent = 100.0
        print("#" * int(percent), f"{percent:.2f}%", end='\r', flush=True)
        chunks.append(chunk[chunk[name_col].notnull()])

    print()
    print("Now concatenating chunks...")
    data_frame = pd.concat(chunks, axis=0) if chunks else pd.DataFrame()
    del chunks
    print("Finished concatenating chunks!")
    return data_frame

## Import Stuff

### Gazetteer

In [4]:


def get_gazetteer(gaz: str = 'gns') -> dict:
    """Read a gazetteer and return it as a dataframe inside a property dictionary.

    Args:
        gaz (str): which gazetteer to load, one of
            ["gns", "geonames", "countries"].

    Raises:
        ValueError: if ``gaz`` is not one of the supported names.

    Returns:
        dict: {'df_gazetteer', 'idx_of_lat', 'idx_of_lon', 'nameCol'} where
        'df_gazetteer' is the loaded dataframe, the two indices are the
        positional column indices of latitude / longitude and 'nameCol' is the
        name of the column holding the place names.
    """
    supported = ("gns", "geonames", "countries")
    if gaz not in supported:
        # Fail early with a clear message instead of an UnboundLocalError at
        # the return statement.
        raise ValueError(
            f"Unknown gazetteer {gaz!r}; expected one of {supported}")

    print("Started reading the gazetteer...")

    if gaz == "gns":
        gazetteerpath = "data/gazetteers/gns/countries_administrative.csv"
        # indexOfLat = 4
        # indexOfLon = 5
        idx_of_lat = 1
        idx_of_lon = 2
        n_lines = 484618  # for gns administrative
        # n_lines =  284485 #for gns administrative approved
        # n_lines = 7866485 #for gns administrative populated
        name_col = "SORT_NAME_RO"  # for gns

        # read in the gazetteer csv
        gazetteer = read_csv_in_chunks(
            path=gazetteerpath,
            n_lines=n_lines,
            low_memory=False,
            name_col=name_col)

        # append the manually maintained places to the official gazetteer
        gazetteer = pd.concat(
            [gazetteer, pd.read_csv("data/gazetteers/own_places.csv")]
        )

    elif gaz == "geonames":
        gazetteerpath = "data/geonames/allCountries_cleaned.csv"
        idx_of_lat = 2
        idx_of_lon = 3
        n_lines = 6974472  # for allCountries_cleaned.csv
        # n_lines = 2079830 # for allCountries_AT.csv
        name_col = "name"

        # read in the gazetteer csv
        gazetteer = read_csv_in_chunks(
            path=gazetteerpath,
            n_lines=n_lines,
            low_memory=False,
            name_col=name_col)

    else:  # gaz == "countries"
        # Build the gazetteer by joining country names with country positions
        # on the shared short code.
        countrynames = pd.read_csv(
            "data/geodict_github/countrynames.csv", names=["short", "long"])
        countrypositions = pd.read_csv(
            "data/geodict_github/countrypositions.csv", names=["short", "lat", "lon"])

        gazetteer = countrynames.merge(countrypositions, on="short")
        gazetteer.long = gazetteer.long.str.strip()
        idx_of_lat = 2
        idx_of_lon = 3
        name_col = "long"

    return {
        'df_gazetteer': gazetteer,
        'idx_of_lat': idx_of_lat,
        'idx_of_lon': idx_of_lon,
        'nameCol': name_col
    }


### Textfile

Read a text file from disk, or extract the visible text of a given URL with BeautifulSoup \
and cache it on disk for the next run

In [5]:


def get_textfile(textfile: str = None, url: str = None) -> str:
    """Read a text file, or extract the visible text of a website.

    For a URL, the page is parsed with BeautifulSoup and the extracted text is
    cached under ``data/texts/autosave/``; the cache file name is derived from
    the last 20 characters of the URL. If that cache file already exists, it
    is read instead of downloading the page again. ``textfile`` takes
    precedence when both arguments are given.

    Args:
        textfile (str): path to textfile
        url (str): url of website

    Raises:
        Warning: If neither a textfile path nor a url is specified

    Returns:
        str: loaded text or extracted text from website
    """

    if textfile:
        with open(textfile, "r", encoding="utf8") as raw_text:
            text_str = raw_text.read()

    elif url:
        text_save_path = f"data/texts/autosave/lastText_{url[-20:].replace('/', '')}"

        if not os.path.exists(text_save_path):

            # close the HTTP response deterministically once the body is read
            with urlopen(url) as response:
                html = response.read()
            text_str = ' '.join(BeautifulSoup(html, "html.parser").stripped_strings)

            # make sure the cache directory exists before writing into it
            os.makedirs(os.path.dirname(text_save_path), exist_ok=True)
            with open(text_save_path, "w", encoding="utf8") as raw_text:
                raw_text.write(text_str)

        else:
            with open(text_save_path, "r", encoding="utf8") as raw_text:
                text_str = raw_text.read()

    else:
        raise Warning("No path specified")

    print("Returning text...")
    return text_str


# url = 'https://www.theguardian.com/global-development/2021/dec/21/uk-accused-of-abandoning-\
# worlds-poor-as-aid-turned-into-colonial-investment'
# url = "https://www.theguardian.com/world/2021/oct/21/cuts-to-overseas-aid-thwart-uk-efforts-\
# to-fight-covid-pandemic"

## Matching

Extract places from text using nltk

In [6]:
def extract_placenames(text_str: str, gazetteer_dic: dict) -> tuple:
    """Extract place names from a text and georeference them with a gazetteer.

    Named entities labelled "GPE" by nltk are lemmatized, upper-cased and
    stripped of spaces, counted, and then looked up in the gazetteer column
    ``lemma_placenames``. Places without a gazetteer match are reported and
    dropped from the result.

    Side effect: the helper columns ``lemma_placenames`` and
    ``stem_placenames`` are added to the gazetteer dataframe (in place) the
    first time they are needed and are reused on later calls.

    Args:
        text_str (str): text that placenames should get extracted from
        gazetteer_dic (dict): Dictionary with keys
            {'df_gazetteer', 'idx_of_lat', 'idx_of_lon', 'nameCol'}

    Returns:
        tuple: ``(geo, place_dic)`` where ``geo`` is a GeoDataFrame (EPSG:4326)
        with the columns count, name, lat, lon, stemname and a point geometry,
        and ``place_dic`` maps each matched normalized placename to
        ``{"count", "name", "lat", "lon"}``.
    """
    print("Extracting placenames...")

    gazetteer = gazetteer_dic['df_gazetteer']
    name_col = gazetteer_dic['nameCol']

    tokenized = nltk.word_tokenize(text_str)
    tagged = nltk.pos_tag(tokenized)
    ne_chunks = nltk.ne_chunk(tagged)

    # keep only the chunks nltk labelled as geo-political entities
    place_words = [
        " ".join(token[0] for token in chunk)
        for chunk in ne_chunks
        if hasattr(chunk, "label") and chunk.label() == "GPE"
    ]

    stemmer = nltk.stem.PorterStemmer()
    lemmatizer = nltk.stem.WordNetLemmatizer()
    place_words = [lemmatizer.lemmatize(word).upper().replace(
        " ", "") for word in place_words]
    # NOTE: prinfo labels its output with the source text of this call, so keep
    # the expression on a single line.
    prinfo(sorted(place_words)[:10])

    # Create Dictionary with placenames as keys and dictionaries with the counts as values.
    place_dic = {}
    for place_word in place_words:
        place_dic.setdefault(place_word, {"count": 0})["count"] += 1

    # Add columns with lemmatized / stemmed gazetteer names to compare them to
    # the lemmatized placenames of the text. They depend only on the gazetteer,
    # so they are computed once per dataframe rather than on every call.
    if "lemma_placenames" not in gazetteer.columns:
        gazetteer["lemma_placenames"] = gazetteer[name_col].apply(
            lemmatizer.lemmatize)
    if "stem_placenames" not in gazetteer.columns:
        gazetteer["stem_placenames"] = gazetteer[name_col].apply(
            stemmer.stem)

    len_keys = len(place_dic)
    log_interval = 1  # int(round(lenKeys/20, 0))

    failed_places = []

    # Fill the dictionary with the coordinates of the placenames.
    print("Get coordinates of placenames...")
    for i, (placename, place_attributes) in enumerate(place_dic.items()):
        try:
            # `.values[0]` raises IndexError when no gazetteer row matches
            tmp_df_values = gazetteer.query(
                "lemma_placenames == @placename").values[0]
            # NOTE(review): the display name is read from positional column 7,
            # presumably the full-name column of the GNS gazetteer - confirm.
            # For a gazetteer with fewer columns this raises IndexError and the
            # place ends up in `failed_places`.
            place_attributes["name"] = tmp_df_values[7]
            place_attributes["lat"] = tmp_df_values[gazetteer_dic['idx_of_lat']]
            place_attributes["lon"] = tmp_df_values[gazetteer_dic['idx_of_lon']]

        except IndexError:
            failed_places.append(placename)

        # give feedback to progress
        if (i % log_interval == 0 and i > 0) or i + 1 == len_keys:
            print(
                f"{i+1} of {len_keys} ({round(((i + 1) / len_keys) * 100, 1)}%)", end='\r')

    print("First 5 places in dictionary")
    for place_word, place_attributes in list(place_dic.items())[:5]:
        print((place_word, place_attributes))

    # Catch information about nonfound placenames and delete them from the dictionary
    num_fails = len(failed_places)
    for fail in failed_places:
        del place_dic[fail]

    print(f'{num_fails} words/places haven\'t been found in the gazetteer')

    # every remaining entry carries count, name, lat and lon
    data_frame = pd.DataFrame(
        [[attributes["count"], attributes["name"], attributes["lat"],
          attributes["lon"], place]
         for place, attributes in place_dic.items()],
        columns=["count", "name", "lat", "lon", "stemname"])

    geo = gpd.GeoDataFrame(
        data_frame,
        geometry=gpd.points_from_xy(data_frame.lon, data_frame.lat),
        crs=4326
    )

    return (geo, place_dic)


## Visualization

In [7]:

def visualize_places(
        places: gpd.GeoDataFrame,
        places_dic: dict,
        title: str = 'Places in Wikipedia entry for Aid') -> folium.Map:
    """Build an interactive folium map of the extracted places.

    The map offers two base tile layers and four overlays that can be toggled
    through the layer control: "Points" (one circle per place, sized and
    coloured by its count), "Cluster", "HeatMap" (every place once) and
    "HeatMap_multiple" (every place repeated ``count`` times).

    Args:
        places (gpd.GeoDataFrame): one row per place with the columns
            count, lat, lon and a point geometry.
        places_dic (dict): maps the normalized placename to a dict with the
            keys 'count', 'name', 'lat' and 'lon'.
        title (str): heading rendered above the map. Defaults to the
            previously hard-coded title.

    Returns:
        folium.Map: the map, embedded in a figure of 75% width.
    """
    # hoisted: the maximum is needed for the colour scale and for every radius
    max_count = places["count"].max()
    color_list = mcp.gen_color(cmap="viridis", n=max_count)

    # create map
    figure = folium.Figure(width='75%')
    place_map = folium.Map(
        location=[15, 30],
        zoom_start=2,
        max_bounds=True,
        tiles=None).add_to(figure)

    # add tiles
    folium.TileLayer(tiles='Cartodb dark_matter', name="Dark").add_to(place_map)
    folium.TileLayer(tiles='stamen watercolor', name="Watercolor").add_to(place_map)

    # add points and markercluster
    points = folium.FeatureGroup(name="Points", show=True).add_to(place_map)
    cluster = plugins.MarkerCluster(
        name="Cluster", show=False).add_to(place_map)

    for place, attributes in places_dic.items():

        latlon = (attributes['lat'], attributes['lon'])

        html = f'''

        <strong>Name:</strong> &emsp;&emsp;&emsp;&emsp;&emsp;{attributes['name']}<br/>
        <strong>Stemmed Name:</strong>&emsp;{place}<br/>
        <strong>Count:</strong>&emsp;&emsp;&emsp;&emsp;&emsp;{attributes['count']}
        
        '''

        iframe = folium.IFrame(
            html,
            width=300,
            height=70)

        popup = folium.Popup(iframe)

        # Clamp so that a count above the frame's maximum (dictionary and frame
        # out of sync) falls on the last colour instead of raising IndexError.
        color_idx = min(attributes['count'], max_count) - 1

        folium.Circle(latlon).add_to(cluster)
        folium.Circle(
            location=latlon,
            popup=popup,
            tooltip=attributes['name'],
            radius=50_000 + 50_000 * (attributes['count'] / max_count),
            fill=True,
            color=color_list[color_idx]
        ).add_to(points)

    # extract coordinates (lat, lon) of geodataframe
    coordinates = [[point.xy[1][0], point.xy[0][0]] for point in places.geometry]

    # add heatmap
    plugins.HeatMap(
        name='HeatMap',
        data=coordinates,
        min_opacity=0.3,
        show=False
    ).add_to(place_map)

    # Repeat every place `count` times so that frequently mentioned places
    # weigh more in the second heatmap.
    coordinates_dupl = [
        [lat, lon]
        for lat, lon, count in zip(places.lat, places.lon, places["count"])
        for _ in range(count)
    ]

    # add heatmap
    plugins.HeatMap(
        name='HeatMap_multiple',
        data=coordinates_dupl,
        min_opacity=0.3,
        show=False
    ).add_to(place_map)

    # add layercontrol
    folium.LayerControl(collapsed=False).add_to(place_map)

    title_html = '''
             <h3 align="center" style="font-size:16px"><b>{}</b></h3>
             '''.format(title)

    place_map.get_root().html.add_child(folium.Element(title_html))

    return place_map

## Execution

Get gazetteer:

In [8]:
# Load the gazetteer once (default source: 'gns'); the extraction cells below
# pass this dictionary to every extract_placenames call.
dic = get_gazetteer()


Started reading the gazetteer...
#################################################################################################### 100.00%
Now concatenating chunks...
Finished concatenating chunks!


Get places of Weltwoche articles

In [9]:
# Extract the places of every Weltwoche article, then merge the per-article
# results into one frame (places_weltw) and one dictionary (places_weltw_dic).
weltw_frames = []
places_weltw_dic = {}

for i, article in enumerate(os.listdir("data/texts/weltwoche")):
    print(i, article)
    text = get_textfile(textfile=f'data/texts/weltwoche/{article}')
    new_places, new_places_dic = extract_placenames(text_str=text, gazetteer_dic=dic)

    # collect the frames and concatenate once after the loop
    weltw_frames.append(new_places)

    # sum up the counts of places that occur in several articles
    for new_place, new_attr in new_places_dic.items():
        if new_place in places_weltw_dic:
            places_weltw_dic[new_place]["count"] += new_attr["count"]
        else:
            places_weltw_dic[new_place] = new_attr


# one row per place: counts are summed, the remaining attributes are taken
# from the first occurrence
places_weltw = pd.concat(weltw_frames).groupby(by="stemname").agg(
    count=pd.NamedAgg(column='count', aggfunc='sum'),
    name=pd.NamedAgg(column='name', aggfunc='first'),
    lat=pd.NamedAgg(column='lat', aggfunc='first'),
    lon=pd.NamedAgg(column='lon', aggfunc='first'),
    geometry=pd.NamedAgg(column='geometry', aggfunc='first')
).reset_index()

0 weltw_rwanda.txt
Returning text...
Extracting placenames...
sorted(place_words)[:10] = ['AFRICA', 'AFRICAN', 'BRITISH', 'EASTAFRICAN', 'ENGLISH', 'FR', 'GERMANS', 'GERSONY', 'HUTU', 'HUTU']
Get coordinates of placenames...
First 5 places in dictionary
('FR', {'count': 1})
('EASTAFRICAN', {'count': 1})
('SWITZERLAND', {'count': 3, 'name': 'Switzerland', 'lat': 47.0, 'lon': 8.0})
('SWISS', {'count': 1})
('AFRICA', {'count': 1, 'name': 'Africa', 'lat': -8.7832, 'lon': 34.5085})
20 words/places haven't been found in the gazetteer
1 weltw_afgh_eu.txt
Returning text...
Extracting placenames...
sorted(place_words)[:10] = ['AUSTRIA', 'EUROPE', 'EUROPE', 'EUROPE', 'GOTHENBURG', 'HUNGARY', 'NIGERIA', 'SWEDEN', 'SWITZERLAND', 'WÜRZBURG']


In [None]:

# Disabled variant: plot without the entries AFGHANISTAN and SWITZERLAND.
# NOTE(review): `places` / `places_dic` are not defined in the visible notebook;
# presumably `places_weltw` / `places_weltw_dic` are meant here - confirm before enabling.
# places_no_afgh = places[(places.stemname != "AFGHANISTAN") & (places.stemname != "SWITZERLAND")]
# places_dic_no_afgh = {place: attr for (place, attr) in places_dic.items() if place not in ["AFGHANISTAN", "SWITZERLAND"]}

# visualize_places(places=places_no_afgh, places_dic=places_dic_no_afgh)
visualize_places(places=places_weltw, places_dic=places_weltw_dic)

Get places of Tagesanzeiger articles

In [None]:
# Extract the places of every Tagesanzeiger article, then merge the per-article
# results into one frame (places_tagi) and one dictionary (places_dic_tagi).
tagi_frames = []
places_dic_tagi = {}

for i, article in enumerate(os.listdir("data/texts/tagesanzeiger")):
    print(i, article)
    text = get_textfile(textfile=f'data/texts/tagesanzeiger/{article}')
    new_places, new_places_dic = extract_placenames(text_str=text, gazetteer_dic=dic)

    # collect the frames and concatenate once after the loop
    tagi_frames.append(new_places)

    # sum up the counts of places that occur in several articles
    for new_place, new_attr in new_places_dic.items():
        if new_place in places_dic_tagi:
            places_dic_tagi[new_place]["count"] += new_attr["count"]
        else:
            places_dic_tagi[new_place] = new_attr


# one row per place: counts are summed, the remaining attributes are taken
# from the first occurrence
places_tagi = pd.concat(tagi_frames).groupby(by="stemname").agg(
    count=pd.NamedAgg(column='count', aggfunc='sum'),
    name=pd.NamedAgg(column='name', aggfunc='first'),
    lat=pd.NamedAgg(column='lat', aggfunc='first'),
    lon=pd.NamedAgg(column='lon', aggfunc='first'),
    geometry=pd.NamedAgg(column='geometry', aggfunc='first')
).reset_index()


0 tagi_warriors.txt
Returning text...
Extracting placenames...
sorted(place_words)[:10] = ['AFGHAN', 'AFGHAN', 'AFGHAN', 'AFGHAN', 'AFGHAN', 'AFGHANISTAN', 'AFGHANISTAN', 'AFGHANISTAN', 'AFGHANISTAN', 'AFGHANISTAN']
Get coordinates of placenames...
First 5 places in dictionary
('JUMP', {'count': 1})
('AFGHANISTAN', {'count': 34, 'name': 'Afghanistan', 'lat': 33.0, 'lon': 66.0})
('WASHINGTON', {'count': 1})
('AFGHAN', {'count': 5})
('KABUL', {'count': 4, 'name': 'Kābul', 'lat': 34.5, 'lon': 69.416667})
36 words/places haven't been found in the gazetteer
1 tagi_twobrothers
Returning text...
Extracting placenames...
sorted(place_words)[:10] = ['AFRICA', 'AFRICA', 'BEIRUT', 'BERN', 'BERN', 'BERNESE', 'CORONA', 'CÔTE', 'EASTAFRICA', 'JAPAN']
Get coordinates of placenames...
First 5 places in dictionary
('SOCIAL', {'count': 1})
('SWISS', {'count': 3})
('AFRICA', {'count': 2, 'name': 'Africa', 'lat': -8.7832, 'lon': 34.5085})
('EASTAFRICA', {'count': 1})
('BERN', {'count': 2, 'name': 'Bern', 

In [None]:

# Disabled variant: plot without the entries AFGHANISTAN and SWITZERLAND.
# NOTE(review): `places` / `places_dic` are not defined in the visible notebook;
# presumably `places_tagi` / `places_dic_tagi` are meant here - confirm before enabling.
# places_no_afgh = places[(places.stemname != "AFGHANISTAN") & (places.stemname != "SWITZERLAND")]
# places_dic_no_afgh = {place: attr for (place, attr) in places_dic.items() if place not in ["AFGHANISTAN", "SWITZERLAND"]}

# visualize_places(places=places_no_afgh, places_dic=places_dic_no_afgh)
visualize_places(places=places_tagi, places_dic=places_dic_tagi)

In [None]:
# Preview the first rows of the aggregated Tagesanzeiger places.
places_tagi.head()

Unnamed: 0,stemname,count,name,lat,lon,geometry
0,AFGHANISTAN,117,Afghanistan,33.0,66.0,POINT (66.00000 33.00000)
1,AFRICA,5,Africa,-8.7832,34.5085,POINT (34.50850 -8.78320)
2,ARAB,1,Arab,17.50764,120.591256,POINT (120.59126 17.50764)
3,AUSTRALIA,1,Australia,-25.0,135.0,POINT (135.00000 -25.00000)
4,BAHRAIN,1,Bahrain,26.033333,50.55,POINT (50.55000 26.03333)


In [None]:
# Preview the first rows of the aggregated Weltwoche places.
places_weltw.head()

Unnamed: 0,stemname,count,name,lat,lon,geometry
0,ADDISABABA,3,Addis Ababa,9.0,38.75,POINT (38.75000 9.00000)
1,AFGHANISTAN,12,Afghanistan,33.0,66.0,POINT (66.00000 33.00000)
2,AFRICA,28,Africa,-8.7832,34.5085,POINT (34.50850 -8.78320)
3,ASIA,6,Asia,9.566667,122.6,POINT (122.60000 9.56667)
4,AUSTRALIA,2,Australia,-25.0,135.0,POINT (135.00000 -25.00000)
