In [1]:
import pandas as pd
import geopandas as gpd
from pprint import pprint
import matplotlib.pyplot as plt
import contextily as cx
import folium

import math

In [2]:
gazetteerpath = "data/geonames/allCountries_cleaned.csv"

textfile = "data/reformation_wiki_DE.txt"

indexOfLat = 5
indexOfLon = 6
n_lines = 6974472

In [3]:
def read_csv_in_chunks(path, n_lines, **read_params):
    if 'chunksize' not in read_params or read_params['chunksize'] < 1:
        read_params['chunksize'] = 80000

    chunks = [0] * math.ceil(n_lines / read_params['chunksize'])

    for i, chunk in enumerate(pd.read_csv(path, **read_params)):
        percent = min(((i + 1) * read_params['chunksize'] / n_lines) * 100, 100.0)
        print("#" * int(percent), f"{percent:.2f}%", end='\r', flush=True)
        chunks[i] = chunk
    
    print()
    print("Now concatenating chunks...")
    df = pd.concat(chunks, axis=0)
    del chunks
    print("Finished!")
    return df

In [4]:
df_gazetteer = read_csv_in_chunks(
    path = gazetteerpath,
    n_lines = n_lines,
    low_memory=False)

placenames = df_gazetteer.name.tolist()
print(placenames[:5])

#################################################################################################### 100.00%
Now concatenating chunks...
Finished!
['Pic de Font Blanca', 'Roc Mélé', 'Pic des Langounelles', 'Pic de les Abelletes', 'Port Vieux de la Coume d’Ose']


In [5]:
with open(textfile, "r") as wiki:
    lines = wiki.readlines()
allWords = []
for line in lines:
    allWords.extend(line.strip().split(" "))

In [6]:
placenames_set = set(placenames)
af_matches = []
wordslen = len(allWords)
num_words = 6

skip_count = -1
for i, word in enumerate(allWords):

    skip_count -= 1
    if skip_count >= 0:
        continue

    try:
        # fourWords = (word, allWords[i+1], allWords[i+2], allWords[i+3])
        nWords = [nword for nword in allWords[i : i+num_words]]
        for j in range(num_words-1, -1, -1):
            tmp_words = " ".join(nWords[:j+1])
            if tmp_words in placenames_set:
                af_matches.append(tmp_words)
                skip_count = j
                continue
    except IndexError:
        break

In [7]:
d = {}
len_af_matches = len(af_matches)

for i, match in enumerate(af_matches):
    if match in d.keys():
        d[match]["count"] += 1
    else:
        d[match] = {"count": 1}

In [14]:
import copy
dd = copy.deepcopy(d)
lenKeys = len(d)

for i, (placename, attributes) in enumerate(d.items()):

    tmp_df_values = df_gazetteer.query("name == @placename").values[0]
    attributes["lat"] = tmp_df_values[5]
    attributes["lon"] = tmp_df_values[6]

    # give feedback to progress
    if (i % 10 == 0 and i > 0) or i == lenKeys:
        print(f"{i} of {lenKeys} ({round((i/lenKeys)*100, 1)}%)", end='\r')

490 of 495 (99.0%)

In [25]:
key_set = set(af_matches)
l = [[place] + list(attributes.values()) for place, attributes in d.items()]
pprint(l[:5])

[['Die', 121, 44.7536, 5.37033],
 ['Sinn', 3, 50.65, 8.33333],
 ['Wittenberg', 11, 51.26667, 5.46667],
 ['Zürich', 9, 47.36667, 8.55],
 ['Martin Luther', 9, 56.66726, 12.8808]]


In [26]:
# l = []
# key_set = set(af_matches)
# for place in key_set:
#     l.append([place])
#     l[-1].extend(d[place].values())

# pprint(l[:5])

In [27]:
geo = pd.DataFrame(
    [[place] + list(d[place].values()) for place in key_set],
    columns=["name", "count", "lat", "lon"])
    
geo = gpd.GeoDataFrame(
    geo,
    geometry=gpd.points_from_xy(geo.lon, geo.lat),
    crs=4326
)

In [28]:
pointMap = gpd.GeoSeries.explore(
    geo,
    tiles="Stamen Watercolor",
    marker_type='circle',
    marker_kwds={'radius': 10000,
    'fill': True})

pointMap

# ax = geo.plot()
# cx.add_basemap(
#     ax,
#     crs=geo.crs,
#     # zoom=8,
#     # source=cx.providers.Stamen.Watercolor)
# plt.show()

In [29]:
from folium import plugins

# extract coordinate of geodataframe
coordinates = [[point.xy[1][0], point.xy[0][0]] for point in geo.geometry]

# create map
heatMap = folium.Map(location = [15,30], zoom_start = 2, tiles=None)

# add tiles
folium.TileLayer(tiles='Cartodb dark_matter', name="Dark").add_to(heatMap)
folium.TileLayer(tiles='stamen watercolor', name="Watercolor").add_to(heatMap)

# add points and markercluster
points = folium.FeatureGroup(name="Points", show=False).add_to(heatMap)
cluster = plugins.MarkerCluster(name="Cluster").add_to(heatMap)
for coordinate in coordinates:
    folium.Circle(coordinate).add_to(cluster)
    folium.Circle(coordinate).add_to(points)

# add heatmap
plugins.HeatMap(
    name = 'HeatMap',
    data = coordinates,
    min_opacity = 0.3
    ).add_to(heatMap)

# add layercontrol
folium.LayerControl(collapsed=False).add_to(heatMap)


heatMap