In [63]:
import pandas as pd
import numpy as np
# Load the 2020 Schengen visa extract, keep only the origin country/consulate
# labels, duplicate them into working `country` / `city` columns and keep one
# row per distinct combination.
df = (
    pd.read_csv('../data/silver/schengen-visa-2020.csv')
    .loc[:, ["origin_country", "origin_consulate"]]
    .assign(
        country=lambda frame: frame["origin_country"],
        city=lambda frame: frame["origin_consulate"],
    )
    .drop_duplicates()
)
df

Unnamed: 0,origin_country,origin_consulate,country,city
0,ALBANIA,TIRANA,ALBANIA,TIRANA
1,ALGERIA,ALGIERS,ALGERIA,ALGIERS
2,ARGENTINA,BUENOS AIRES,ARGENTINA,BUENOS AIRES
3,AUSTRALIA,CANBERRA,AUSTRALIA,CANBERRA
4,AZERBAIJAN,BAKU,AZERBAIJAN,BAKU
...,...,...,...,...
1554,SWITZERLAND,ZURICH,SWITZERLAND,ZURICH
1574,USA,"SAN JUAN, PR",USA,"SAN JUAN, PR"
1604,LIBERIA,MONROVIA,LIBERIA,MONROVIA
1607,NORTH KOREA,PYONGYANG,NORTH KOREA,PYONGYANG


In [64]:
def correct_countries(country):
    """Translate a raw country label to the replacement name listed below.

    Labels that are not listed are returned unchanged.
    """
    renames = {
        'HONG KONG S.A.R.': 'HONG KONG',
        'CONGO (DEMOCRATIC REPUBLIC)': 'DEMOCRATIC REPUBLIC OF THE CONGO',
        'CONGO (BRAZZAVILLE)': 'REPUBLIC OF THE CONGO'
    }
    if country in renames:
        return renames[country]
    return country
# Normalise the country labels column-wise: Series.map calls the function once
# per value, without building a Series for every row as apply(axis=1) does.
df['country'] = df['country'].map(correct_countries)

In [65]:
def correct_cities(city):
    """Translate a raw city label to the replacement spelling listed below.

    Labels that are not listed are returned unchanged.
    """
    respellings = {
        'YAONDE': 'YAOUNDE',
        'VITSYEBSK': 'VITEBSK'
    }
    if city in respellings:
        return respellings[city]
    return city
# Normalise the city labels column-wise: Series.map calls the function once
# per value, without building a Series for every row as apply(axis=1) does.
df['city'] = df['city'].map(correct_cities)

In [66]:
from geopy.geocoders import Nominatim
# Shared Nominatim client used by the geocoding helper below.
# geopy's default timeout is 1 second; a slow response then raises and the
# lookup is recorded as a miss. Allow 10 seconds per request instead.
geolocator = Nominatim(user_agent="mhobolth-geo", timeout=10)

def my_geocoder(country, city):
    """Geocode the query ``"<city>, <country>"`` with the module-level ``geolocator``.

    Args:
        country: Country name, used as the second part of the query string.
        city: City name, used as the first part of the query string.

    Returns:
        pd.Series with ``latitude`` and ``longitude`` entries. Both are NaN
        when the geocoder finds no match or the lookup fails, so every call
        returns a Series with the same two labels.
    """
    # Same shape on every path: a row-wise DataFrame.apply can only be assigned
    # to two columns when each row yields a like-indexed Series.
    not_found = pd.Series({'latitude': float('nan'), 'longitude': float('nan')})
    try:
        location = geolocator.geocode(city + ', ' + country)
        if location is None:
            # The geocoder found no match for this query.
            return not_found
        point = location.point
        return pd.Series({'latitude': point.latitude, 'longitude': point.longitude})
    except Exception:
        # Best effort: a failed lookup (service error, timeout, non-string
        # input) becomes a missing coordinate. Unlike a bare `except:`, this
        # lets KeyboardInterrupt / SystemExit stop a long geocoding run.
        return not_found

# Geocode every row; apply expands the per-row Series returned by my_geocoder
# into the two coordinate columns.
df[['latitude', 'longitude']] = df.apply(lambda x: my_geocoder(x['country'], x['city']), axis=1)

# Share of rows that received a latitude. Series.isna() is vectorised and also
# copes with an object-dtype column, where np.isnan raises TypeError.
print("{}% of cities were geocoded!".format(
    (1 - df["latitude"].isna().sum() / len(df)) * 100))


100.0% of cities were geocoded!


In [67]:
# Rows that still have no latitude after geocoding.
df.loc[df["latitude"].isna()]

Unnamed: 0,origin_country,origin_consulate,country,city,latitude,longitude


In [68]:
# Persist the geocoded cities. The DataFrame index is written as the first,
# unnamed column (the to_csv default, spelled out here).
df.to_csv('../data/silver/cities.csv', index=True)

In [69]:
import folium
import folium.plugins.marker_cluster as fmc

# World overview map. NOTE(review): 'Stamen Toner' was replaced because the
# Stamen tilesets are no longer hosted or built into recent folium releases;
# 'CartoDB positron' is a built-in light basemap. Confirm the styling is acceptable.
m = folium.Map(location=[54, 15], tiles='CartoDB positron', zoom_start=2)

# One green marker per geocoded city, added to a shared MarkerCluster layer.
mc = fmc.MarkerCluster()
geocoded = df.loc[df['latitude'].notna(), ['latitude', 'longitude', 'city', 'country']]
for lat, lon, city, country in geocoded.itertuples(index=False):
    mc.add_child(folium.Marker([lat, lon], popup=city + ", " + country, icon=folium.Icon(color="green")))
m.add_child(mc)

# Last expression of the cell: renders the map inline.
m