# Purpose

2023-03-07. In this notebook we'll  
- pull the data for the top ~900 cities at Reddit (by DAU)
- get the lat & long coordinates for those cities
- Create a base plot to test whether we can add city-popular subreddits to a map

The main idea is that by making the MVP ti'll be more tangible and easier to get buy-in to add a map to the discovery tab (or something like it).

# Imports & Setup

In [1]:
%load_ext autoreload
%autoreload 2

In [12]:
# Register bigquery magic (only needed for laptop/local, not colab)
%load_ext google.cloud.bigquery

In [55]:
# increase display width of cells
from IPython.display import display, HTML
display(HTML("<style>.container { width:98% !important; }</style>"))

In [121]:
from datetime import datetime
import time

import geopy
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

from tqdm import tqdm

import numpy as np
import pandas as pd
import plotly
import plotly.express as px
import subclu

# from subclu.utils import set_working_directory
from subclu.utils.eda import (
    setup_logging, counts_describe, value_counts_and_pcts,
    notebook_display_config, print_lib_versions,
    style_df_numeric
)
from subclu.utils.data_irl_style import (
    get_colormap, theme_dirl
)


setup_logging()
notebook_display_config()
print_lib_versions([geopy, np, pd, plotly, subclu])

python		v 3.7.11
===
geopy		v: 2.3.0
numpy		v: 1.19.5
pandas		v: 1.2.4
plotly		v: 5.11.0
subclu		v: 0.6.1


In [122]:
# plotting defaults
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import matplotlib.dates as mdates
plt.style.use('default')


# Test geopy basics

Note: we need to use `RateLimiter` to prevent making too many requests at once.

It looks like this format works most of the time so we'll need to reshape our text to 
- `City, Region, Country-Code`

In [8]:
%%time

geolocator = Nominatim(user_agent="test_geo")

location1 = geolocator.geocode("Chicago, IL US")
print(location1.address)
print((location1.latitude, location1.longitude))

Chicago, Cook County, Illinois, United States
(41.8755616, -87.6244212)
CPU times: user 22.7 ms, sys: 3.8 ms, total: 26.5 ms
Wall time: 2.21 s


In [290]:
# %%time
# # # san jose fails... needed to use the open maps tool to find something that worked
# geolocator = Nominatim(user_agent="test_geo")

# location0 = geolocator.geocode("San Jose, CA United States")
# print(location0.address)
# print((location0.latitude, location1.longitude))

In [289]:
# %%time
# # san jose fails... needed to use the open maps tool to find something that worked

# geolocator = Nominatim(user_agent="test_geo")

# location0 = geolocator.geocode("San Jose, California United States")
# print(location0.address)
# print((location0.latitude, location1.longitude))

In [10]:
%%time

geolocator = Nominatim(user_agent="test_geo")

location2 = geolocator.geocode("London, ON CA")
print(location2.address)
print((location2.latitude, location2.longitude))

London, Southwestern Ontario, Ontario, N6A 3N7, Canada
(42.9832406, -81.243372)
CPU times: user 12.5 ms, sys: 2.38 ms, total: 14.9 ms
Wall time: 314 ms


In [11]:
%%time

geolocator = Nominatim(user_agent="test_geo")

location2 = geolocator.geocode("Paris, 75 FR")
print(location2.address)
print((location2.latitude, location2.longitude))

Paris, Île-de-France, France métropolitaine, France
(48.8588897, 2.3200410217200766)
CPU times: user 13.6 ms, sys: 2.53 ms, total: 16.1 ms
Wall time: 360 ms


# Get the data for the top cities
This data is pre-calculated in BQ, so just pull the table from a specific partition

In [69]:
%%time
%%bigquery df_top_cities --project data-science-prod-218515 

SELECT
    CONCAT(geo_city, ", ", geo_region, " ", COALESCE(cn.country_name, tc.geo_country_code)) AS city_to_encode
    , LN(10 + city_users_l7) AS city_users_l7_ln
    , tc.*
    , cn.country_name
FROM `reddit-employee-datasets.david_bermejo.top_cities_l7` AS tc
    LEFT JOIN `reddit-employee-datasets.david_bermejo.countrycode_name_mapping` AS cn
        ON tc.geo_country_code = cn.country_code
WHERE tc.pt = "2023-03-01"

Query complete after 0.00s: 100%|███████████████████████████████████████████████████| 2/2 [00:00<00:00, 1236.16query/s]
Downloading: 100%|████████████████████████████████████████████████████████████████| 924/924 [00:01<00:00, 607.14rows/s]

CPU times: user 42.2 ms, sys: 16.4 ms, total: 58.6 ms
Wall time: 2.55 s





In [70]:
df_top_cities.shape

(924, 10)

In [71]:
df_top_cities.head()

Unnamed: 0,city_to_encode,city_users_l7_ln,pt,geo_country_code,geo_region,geo_city,city_users_l7,city_rank_country,city_rank_world,country_name
0,"London, ENG United Kingdom",15.124056,2023-03-01,GB,ENG,London,3700775,1,1,United Kingdom
1,"Los Angeles, CA United States",14.813316,2023-03-01,US,CA,Los Angeles,2712314,1,2,United States
2,"New York, NY United States",14.723372,2023-03-01,US,NY,New York,2479005,2,3,United States
3,"Chicago, IL United States",14.617279,2023-03-01,US,IL,Chicago,2229471,3,4,United States
4,"Sydney, NSW Australia",14.560106,2023-03-01,AU,NSW,Sydney,2105579,1,5,Australia


In [72]:
df_top_cities.tail()

Unnamed: 0,city_to_encode,city_users_l7_ln,pt,geo_country_code,geo_region,geo_city,city_users_l7,city_rank_country,city_rank_world,country_name
919,"Villeurbanne, 69 France",9.923339,2023-03-01,FR,69,Villeurbanne,20391,14,1605,France
920,"Viña del Mar, VS Chile",9.911902,2023-03-01,CL,VS,Viña del Mar,20159,3,1621,Chile
921,"Zabrze, 24 Poland",9.909171,2023-03-01,PL,24,Zabrze,20104,15,1625,Poland
922,"Yevpatoriya, 43 Ukraine",9.90733,2023-03-01,UA,43,Yevpatoriya,20067,6,1628,Ukraine
923,"Tuxtla Gutiérrez, CHP Mexico",9.906882,2023-03-01,MX,CHP,Tuxtla Gutiérrez,20058,31,1630,Mexico


# Get lat & long for all cities

Again, remember to include a delay so that we don't break request limits

In [73]:
%%time
geocode_limiter = RateLimiter(geolocator.geocode, min_delay_seconds=0.5)

# test on top 5 cities
df_top_cities['geopy_location'] = (
    df_top_cities['city_to_encode'].iloc[20:26]
    .apply(geocode_limiter)
)

CPU times: user 33.3 ms, sys: 9.24 ms, total: 42.6 ms
Wall time: 5.16 s


In [76]:
df_top_cities[['city_to_encode', 'geopy_location']].iloc[20:26]

Unnamed: 0,city_to_encode,geopy_location
20,"Vancouver, BC Canada","(Vancouver, Metro Vancouver Regional District, British Columbia, Canada, (49.2608724, -123.113952))"
21,"San Antonio, TX United States","(San Antonio, Bexar County, Texas, United States, (29.4246002, -98.4951405))"
22,"San Jose, CA United States",
23,"San Francisco, CA United States","(San Francisco, CAL Fire Northern Region, California, United States, (37.7790262, -122.419906))"
24,"Montreal, QC Canada","(Montréal, Agglomération de Montréal, Montréal (06), Québec, Canada, (45.5031824, -73.5698065))"
25,"Minneapolis, MN United States","(Minneapolis, Hennepin County, Minnesota, United States, (44.9772995, -93.2654692))"


In [77]:
# get lat & long from location:
(
    df_top_cities['geopy_location'].iloc[20:26]
    .apply(lambda loc: loc.latitude if loc else None)
)

20    49.260872
21    29.424600
22          NaN
23    37.779026
24    45.503182
25    44.977300
Name: geopy_location, dtype: float64

In [78]:
# get lat & long from location:
(
    df_top_cities['geopy_location'].iloc[20:26]
    .apply(lambda loc: loc.longitude if loc else None)
)

20   -123.113952
21    -98.495141
22           NaN
23   -122.419906
24    -73.569806
25    -93.265469
Name: geopy_location, dtype: float64

In [186]:
df_top_cities = df_top_cities.drop(columns=['geopy_location'])

## Get location for all cities

In [126]:
batch_size = 40
iter_chunks = range(1 + len(df_top_cities) // batch_size)
print(iter_chunks)

range(0, 24)


In [127]:
# copy original name with potential errors
# l_df_geopy_loc_old = l_df_geopy_loc.copy()

In [128]:
len(l_df_geopy_loc_old)

49

In [129]:
geocode_limiter = RateLimiter(geolocator.geocode, min_delay_seconds=0.03)
l_df_geopy_loc = list()

for i in tqdm(iter_chunks):    
    df_slice_ = (
        df_top_cities.iloc[i * batch_size:(i + 1) * batch_size]
        [['city_to_encode']]
        .copy()
        .reset_index()
    )
    df_slice_['geopy_location'] = (
        df_slice_['city_to_encode']
        .apply(geocode_limiter)
    )
    l_df_geopy_loc.append(df_slice_)
    time.sleep(0.5)

  4%|███▍                                                                               | 1/24 [00:20<07:55, 20.68s/it]RateLimiter caught an error, retrying (0/2 tries). Called with (*('Madrid, M Spain',), **{}).
Traceback (most recent call last):
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 445, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 440, in _make_request
    httplib_response = conn.getresponse()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 1373, in getresponse
    response.begin()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 319, in begin
    version, status, reason = self._read_status()
  File "/Users/david.

RateLimiter swallowed an error after 2 retries. Called with (*('Madrid, M Spain',), **{}).
Traceback (most recent call last):
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 445, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 440, in _make_request
    httplib_response = conn.getresponse()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 1373, in getresponse
    response.begin()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 319, in begin
    version, status, reason = self._read_status()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 280, in _read_status
    line = str(

RateLimiter caught an error, retrying (0/2 tries). Called with (*('Santiago, RM Chile',), **{}).
Traceback (most recent call last):
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 445, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 440, in _make_request
    httplib_response = conn.getresponse()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 1373, in getresponse
    response.begin()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 319, in begin
    version, status, reason = self._read_status()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 280, in _read_status
    line 

 12%|██████████▍                                                                        | 3/24 [01:55<13:21, 38.16s/it]RateLimiter caught an error, retrying (0/2 tries). Called with (*('Copenhagen, 84 Denmark',), **{}).
Traceback (most recent call last):
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 445, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 440, in _make_request
    httplib_response = conn.getresponse()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 1373, in getresponse
    response.begin()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 319, in begin
    version, status, reason = self._read_status()
  File "/Users

RateLimiter swallowed an error after 2 retries. Called with (*('Copenhagen, 84 Denmark',), **{}).
Traceback (most recent call last):
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 445, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 440, in _make_request
    httplib_response = conn.getresponse()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 1373, in getresponse
    response.begin()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 319, in begin
    version, status, reason = self._read_status()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 280, in _read_status
    line

RateLimiter caught an error, retrying (1/2 tries). Called with (*('Barcelona, B Spain',), **{}).
Traceback (most recent call last):
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 445, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 440, in _make_request
    httplib_response = conn.getresponse()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 1373, in getresponse
    response.begin()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 319, in begin
    version, status, reason = self._read_status()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 280, in _read_status
    line 

RateLimiter caught an error, retrying (0/2 tries). Called with (*('Milwaukee, WI United States',), **{}).
Traceback (most recent call last):
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 445, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 440, in _make_request
    httplib_response = conn.getresponse()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 1373, in getresponse
    response.begin()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 319, in begin
    version, status, reason = self._read_status()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 280, in _read_status


RateLimiter swallowed an error after 2 retries. Called with (*('Milwaukee, WI United States',), **{}).
Traceback (most recent call last):
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 445, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 440, in _make_request
    httplib_response = conn.getresponse()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 1373, in getresponse
    response.begin()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 319, in begin
    version, status, reason = self._read_status()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 280, in _read_status
   

 17%|█████████████▊                                                                     | 4/24 [03:26<19:36, 58.81s/it]RateLimiter caught an error, retrying (0/2 tries). Called with (*('Kochi, KL India',), **{}).
Traceback (most recent call last):
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 445, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 440, in _make_request
    httplib_response = conn.getresponse()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 1373, in getresponse
    response.begin()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 319, in begin
    version, status, reason = self._read_status()
  File "/Users/david.

RateLimiter caught an error, retrying (0/2 tries). Called with (*('Buenos Aires, C Argentina',), **{}).
Traceback (most recent call last):
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 445, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 440, in _make_request
    httplib_response = conn.getresponse()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 1373, in getresponse
    response.begin()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 319, in begin
    version, status, reason = self._read_status()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 280, in _read_status
  

RateLimiter swallowed an error after 2 retries. Called with (*('Buenos Aires, C Argentina',), **{}).
Traceback (most recent call last):
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 445, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 440, in _make_request
    httplib_response = conn.getresponse()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 1373, in getresponse
    response.begin()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 319, in begin
    version, status, reason = self._read_status()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 280, in _read_status
    l

RateLimiter caught an error, retrying (1/2 tries). Called with (*('Fresno, CA United States',), **{}).
Traceback (most recent call last):
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 445, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 440, in _make_request
    httplib_response = conn.getresponse()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 1373, in getresponse
    response.begin()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 319, in begin
    version, status, reason = self._read_status()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 280, in _read_status
   

RateLimiter caught an error, retrying (1/2 tries). Called with (*('Frederiksberg, 84 Denmark',), **{}).
Traceback (most recent call last):
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 445, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 440, in _make_request
    httplib_response = conn.getresponse()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 1373, in getresponse
    response.begin()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 319, in begin
    version, status, reason = self._read_status()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 280, in _read_status
  

RateLimiter caught an error, retrying (0/2 tries). Called with (*('Davao City, DAS Philippines',), **{}).
Traceback (most recent call last):
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 445, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 440, in _make_request
    httplib_response = conn.getresponse()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 1373, in getresponse
    response.begin()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 319, in begin
    version, status, reason = self._read_status()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 280, in _read_status


 25%|████████████████████▊                                                              | 6/24 [05:45<20:37, 68.72s/it]RateLimiter caught an error, retrying (0/2 tries). Called with (*('Lyon, 69 France',), **{}).
Traceback (most recent call last):
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 445, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 440, in _make_request
    httplib_response = conn.getresponse()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 1373, in getresponse
    response.begin()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 319, in begin
    version, status, reason = self._read_status()
  File "/Users/david.

RateLimiter swallowed an error after 2 retries. Called with (*('Lyon, 69 France',), **{}).
Traceback (most recent call last):
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 445, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 440, in _make_request
    httplib_response = conn.getresponse()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 1373, in getresponse
    response.begin()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 319, in begin
    version, status, reason = self._read_status()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 280, in _read_status
    line = str(

RateLimiter caught an error, retrying (1/2 tries). Called with (*('Marikina City, 00 Philippines',), **{}).
Traceback (most recent call last):
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 445, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 440, in _make_request
    httplib_response = conn.getresponse()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 1373, in getresponse
    response.begin()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 319, in begin
    version, status, reason = self._read_status()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 280, in _read_statu

 38%|███████████████████████████████▏                                                   | 9/24 [07:28<11:20, 45.36s/it]RateLimiter caught an error, retrying (0/2 tries). Called with (*('Paranaque City, 00 Philippines',), **{}).
Traceback (most recent call last):
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 445, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 440, in _make_request
    httplib_response = conn.getresponse()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 1373, in getresponse
    response.begin()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 319, in begin
    version, status, reason = self._read_status()
  File

RateLimiter swallowed an error after 2 retries. Called with (*('Paranaque City, 00 Philippines',), **{}).
Traceback (most recent call last):
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 445, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 440, in _make_request
    httplib_response = conn.getresponse()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 1373, in getresponse
    response.begin()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 319, in begin
    version, status, reason = self._read_status()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 280, in _read_status


RateLimiter caught an error, retrying (1/2 tries). Called with (*('Marseille, 13 France',), **{}).
Traceback (most recent call last):
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 445, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 440, in _make_request
    httplib_response = conn.getresponse()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 1373, in getresponse
    response.begin()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 319, in begin
    version, status, reason = self._read_status()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 280, in _read_status
    lin

 42%|██████████████████████████████████▏                                               | 10/24 [08:28<11:35, 49.69s/it]RateLimiter caught an error, retrying (0/2 tries). Called with (*('Toulouse, 31 France',), **{}).
Traceback (most recent call last):
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 445, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 440, in _make_request
    httplib_response = conn.getresponse()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 1373, in getresponse
    response.begin()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 319, in begin
    version, status, reason = self._read_status()
  File "/Users/da

RateLimiter swallowed an error after 2 retries. Called with (*('Toulouse, 31 France',), **{}).
Traceback (most recent call last):
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 445, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 440, in _make_request
    httplib_response = conn.getresponse()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 1373, in getresponse
    response.begin()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 319, in begin
    version, status, reason = self._read_status()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 280, in _read_status
    line = 

RateLimiter caught an error, retrying (1/2 tries). Called with (*('Mandaluyong City, 00 Philippines',), **{}).
Traceback (most recent call last):
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 445, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 440, in _make_request
    httplib_response = conn.getresponse()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 1373, in getresponse
    response.begin()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 319, in begin
    version, status, reason = self._read_status()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 280, in _read_st

 50%|█████████████████████████████████████████                                         | 12/24 [09:50<09:01, 45.15s/it]RateLimiter caught an error, retrying (0/2 tries). Called with (*('Lille, 59 France',), **{}).
Traceback (most recent call last):
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 445, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 440, in _make_request
    httplib_response = conn.getresponse()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 1373, in getresponse
    response.begin()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 319, in begin
    version, status, reason = self._read_status()
  File "/Users/david

RateLimiter swallowed an error after 2 retries. Called with (*('Lille, 59 France',), **{}).
Traceback (most recent call last):
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 445, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 440, in _make_request
    httplib_response = conn.getresponse()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 1373, in getresponse
    response.begin()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 319, in begin
    version, status, reason = self._read_status()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 280, in _read_status
    line = str

RateLimiter caught an error, retrying (1/2 tries). Called with (*('Baguio City, BEN Philippines',), **{}).
Traceback (most recent call last):
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 445, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 440, in _make_request
    httplib_response = conn.getresponse()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 1373, in getresponse
    response.begin()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 319, in begin
    version, status, reason = self._read_status()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 280, in _read_status

 71%|██████████████████████████████████████████████████████████                        | 17/24 [12:17<03:28, 29.75s/it]RateLimiter caught an error, retrying (0/2 tries). Called with (*('Strasbourg, 67 France',), **{}).
Traceback (most recent call last):
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 445, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 440, in _make_request
    httplib_response = conn.getresponse()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 1373, in getresponse
    response.begin()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 319, in begin
    version, status, reason = self._read_status()
  File "/Users/

RateLimiter swallowed an error after 2 retries. Called with (*('Strasbourg, 67 France',), **{}).
Traceback (most recent call last):
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 445, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 440, in _make_request
    httplib_response = conn.getresponse()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 1373, in getresponse
    response.begin()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 319, in begin
    version, status, reason = self._read_status()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 280, in _read_status
    line 

RateLimiter caught an error, retrying (1/2 tries). Called with (*('Bordeaux, 33 France',), **{}).
Traceback (most recent call last):
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 445, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 440, in _make_request
    httplib_response = conn.getresponse()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 1373, in getresponse
    response.begin()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 319, in begin
    version, status, reason = self._read_status()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 280, in _read_status
    line

RateLimiter caught an error, retrying (0/2 tries). Called with (*('Zaragoza, Z Spain',), **{}).
Traceback (most recent call last):
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 445, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 440, in _make_request
    httplib_response = conn.getresponse()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 1373, in getresponse
    response.begin()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 319, in begin
    version, status, reason = self._read_status()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 280, in _read_status
    line =

RateLimiter caught an error, retrying (1/2 tries). Called with (*('Montpellier, 34 France',), **{}).
Traceback (most recent call last):
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 445, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 440, in _make_request
    httplib_response = conn.getresponse()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 1373, in getresponse
    response.begin()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 319, in begin
    version, status, reason = self._read_status()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 280, in _read_status
    l

RateLimiter caught an error, retrying (0/2 tries). Called with (*('Nantes, 44 France',), **{}).
Traceback (most recent call last):
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 445, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 440, in _make_request
    httplib_response = conn.getresponse()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 1373, in getresponse
    response.begin()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 319, in begin
    version, status, reason = self._read_status()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 280, in _read_status
    line =

RateLimiter swallowed an error after 2 retries. Called with (*('Nantes, 44 France',), **{}).
Traceback (most recent call last):
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 445, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 440, in _make_request
    httplib_response = conn.getresponse()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 1373, in getresponse
    response.begin()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 319, in begin
    version, status, reason = self._read_status()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 280, in _read_status
    line = st

RateLimiter caught an error, retrying (1/2 tries). Called with (*('Rennes, 35 France',), **{}).
Traceback (most recent call last):
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 445, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 440, in _make_request
    httplib_response = conn.getresponse()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 1373, in getresponse
    response.begin()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 319, in begin
    version, status, reason = self._read_status()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 280, in _read_status
    line =

 92%|███████████████████████████████████████████████████████████████████████████▏      | 22/24 [15:51<01:13, 36.85s/it]RateLimiter caught an error, retrying (0/2 tries). Called with (*('Nice, 06 France',), **{}).
Traceback (most recent call last):
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 445, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 440, in _make_request
    httplib_response = conn.getresponse()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 1373, in getresponse
    response.begin()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 319, in begin
    version, status, reason = self._read_status()
  File "/Users/david.

RateLimiter swallowed an error after 2 retries. Called with (*('Nice, 06 France',), **{}).
Traceback (most recent call last):
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 445, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 440, in _make_request
    httplib_response = conn.getresponse()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 1373, in getresponse
    response.begin()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 319, in begin
    version, status, reason = self._read_status()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 280, in _read_status
    line = str(

RateLimiter caught an error, retrying (1/2 tries). Called with (*('Grenoble, 38 France',), **{}).
Traceback (most recent call last):
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 445, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 440, in _make_request
    httplib_response = conn.getresponse()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 1373, in getresponse
    response.begin()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 319, in begin
    version, status, reason = self._read_status()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 280, in _read_status
    line

RateLimiter caught an error, retrying (0/2 tries). Called with (*('Nancy, 54 France',), **{}).
Traceback (most recent call last):
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 445, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 440, in _make_request
    httplib_response = conn.getresponse()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 1373, in getresponse
    response.begin()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 319, in begin
    version, status, reason = self._read_status()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 280, in _read_status
    line = 

RateLimiter swallowed an error after 2 retries. Called with (*('Nancy, 54 France',), **{}).
Traceback (most recent call last):
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 445, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 440, in _make_request
    httplib_response = conn.getresponse()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 1373, in getresponse
    response.begin()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 319, in begin
    version, status, reason = self._read_status()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 280, in _read_status
    line = str

RateLimiter caught an error, retrying (1/2 tries). Called with (*('Villeurbanne, 69 France',), **{}).
Traceback (most recent call last):
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 445, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 440, in _make_request
    httplib_response = conn.getresponse()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 1373, in getresponse
    response.begin()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 319, in begin
    version, status, reason = self._read_status()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 280, in _read_status
    

100%|██████████████████████████████████████████████████████████████████████████████████| 24/24 [17:33<00:00, 43.88s/it]


In [137]:
len(l_df_geopy_loc)

24

In [131]:
style_df_numeric(l_df_geopy_loc[0])

Unnamed: 0,index,city_to_encode,geopy_location
0,0,"London, ENG United Kingdom","London, Greater London, England, United Kingdom"
1,1,"Los Angeles, CA United States","Los Angeles, Los Angeles County, CAL Fire Contract Counties, California, United States"
2,2,"New York, NY United States","City of New York, New York, United States"
3,3,"Chicago, IL United States","Chicago, Cook County, Illinois, United States"
4,4,"Sydney, NSW Australia","Sydney, Council of the City of Sydney, New South Wales, Australia"
5,5,"Toronto, ON Canada","Old Toronto, Toronto, Golden Horseshoe, Ontario, Canada"
6,6,"Melbourne, VIC Australia","Melbourne, City of Melbourne, Victoria, Australia"
7,7,"Dallas, TX United States","Dallas, Dallas County, Texas, United States"
8,8,"Seattle, WA United States","Seattle, King County, Washington, United States"
9,9,"Brooklyn, NY United States","Brooklyn, Kings County, City of New York, New York, United States"


In [138]:
l_df_geopy_loc[-1]

Unnamed: 0,index,city_to_encode,geopy_location
0,920,"Viña del Mar, VS Chile","(Viña del Mar, Provincia de Valparaíso, Región de Valparaíso, Chile, (-33.0244535, -71.5517636))"
1,921,"Zabrze, 24 Poland","(Zabrze, Górnośląsko-Zagłębiowska Metropolia, województwo śląskie, Polska, (50.3086154, 18.7863749))"
2,922,"Yevpatoriya, 43 Ukraine","(Евпатория, Республика Крым, Україна, (45.1907635, 33.3679049))"
3,923,"Tuxtla Gutiérrez, CHP Mexico","(Tuxtla Gutiérrez, Chiapas, 29019, México, (16.753801, -93.115959))"


# Data quality checks & manual backfill
Things to check:
- Which cities are missing?
- Which cities are not in the expected target country?

In [177]:
df_city_loc_raw = (
    pd.concat(l_df_geopy_loc, ignore_index=True, axis=0)
    .drop(columns=['index'])
)
print(df_city_loc_raw.shape)
df_city_loc_raw.head()

(924, 2)


Unnamed: 0,city_to_encode,geopy_location
0,"London, ENG United Kingdom","(London, Greater London, England, United Kingdom, (51.5073359, -0.12765))"
1,"Los Angeles, CA United States","(Los Angeles, Los Angeles County, CAL Fire Contract Counties, California, United States, (34.0536909, -118.242766))"
2,"New York, NY United States","(City of New York, New York, United States, (40.7127281, -74.0060152))"
3,"Chicago, IL United States","(Chicago, Cook County, Illinois, United States, (41.8755616, -87.6244212))"
4,"Sydney, NSW Australia","(Sydney, Council of the City of Sydney, New South Wales, Australia, (-33.8698439, 151.2082848))"


## Check Nulls

In [178]:
mask_null_loc = df_city_loc_raw['geopy_location'].isnull()
print(mask_null_loc.sum())

df_city_loc_raw[mask_null_loc].head()

30


Unnamed: 0,city_to_encode,geopy_location
22,"San Jose, CA United States",
62,"Central, HCW Hong Kong",
71,"Madrid, M Spain",
122,"Copenhagen, 84 Denmark",
123,"Barcelona, B Spain",


In [179]:
# df_city_loc_raw[mask_null_loc]['city_to_encode'].to_list()

In [291]:
df_city_loc_raw[df_city_loc_raw['city_to_encode'].str.contains('Victoria')]

Unnamed: 0,city_to_encode,geopy_location
184,"Victoria, BC Canada","(Beco Canada, Joana d'Arc, Região Administrativa IV - Maruípe, Vitória, Região Geográfica Imediata de Vitória, Região Metropolitana da Grande Vitória, Região Geográfica Intermediária de Vitória, Espírito Santo, Região Sudeste, 29048-060..."


### Fill missing cities

We can test the output from open maps using this website:

https://nominatim.openstreetmap.org/ui/search.html?q=lyon+69+france

In [357]:
%%time

geolocator = Nominatim(user_agent="test_geo2")

# create new df with new values to fix missing locations
d_missing_cities = {
    
    # victoria, BC CA is not missing, but it's wrong. It's being set in Brazil!
    'Victoria, BC Canada': 'royal athletic park Victoria, British Columbia, Canada', 
    
    'San Jose, CA United States': 'San Jose santa clara county, California',
    'Central, HCW Hong Kong': 'Central Hong Kong',
    'Madrid, M Spain': 'Madrid community Spain',
    'Copenhagen, 84 Denmark': 'Copenhagen municipality Denmark',
    'Barcelona, B Spain': 'Barcelona barcelona Spain',
    'Milwaukee, WI United States': 'Milwaukee downtown wisconsin',
    'Cebu City, CEB Philippines': 'cebu city san roque Philippines',
    'Buenos Aires, C Argentina': 'buenos aires ciudad comuna argentina',
    'Frederiksberg, 84 Denmark': 'Frederiksberg capital Denmark',
    'Lyon, 69 France': 'Métropole de Lyon France',
    
    'Marikina City, 00 Philippines': 'Marikina heights, marikina capital Philippines',
    'Paranaque City, 00 Philippines': 'Parañaque City hall, southern manila Philippines',
    'Marseille, 13 France': 'Marseille, bouches-du-rhone France',
    'Toulouse, 31 France': 'Toulouse, occitania France',
    'Lahug, CEB Philippines': 'Lahug cebu city Philippines',
    'Mandaluyong City, 00 Philippines': 'Mandaluyong City, eastern manila district, Philippines',
    'Lille, 59 France': 'lille, nord hauts-de-france, france',

    'Northampton, NTH United Kingdom': 'Northampton, west northamptonshire United Kingdom',
    'Baguio City, BEN Philippines': 'Baguio city, cordillera Philippines',
    'Strasbourg, 67 France': 'Strasbourg, bas-rhin France',
    'Bordeaux, 33 France': 'Bordeaux, gironde France',
    'Montpellier, 34 France': 'Montpellier, occitania France',
    'Nantes, 44 France': 'Nantes loire-atlantique France',
    'City of Muntinlupa, RIZ Philippines': 'Muntinlupa, southern manila Philippines',
    
    'Rennes, 35 France': 'rennes brittany france',
    'San Luis Potosí City, SLP Mexico': 'San Luis Potosi, Municipio San Luis Potosi Mexico',
    'Nice, 06 France': 'nice alpes maritimes france',
    'Grenoble, 38 France': 'grenoble, auvergne-rhone-alpes france',
    'Nancy, 54 France': 'nancy grande est france',
    'Villeurbanne, 69 France': 'Villeurbanne, lyon france',
}

# for c_old, c_new in d_missing_cities.items():
#     # Test the new city names
#     loc_fix = geolocator.geocode(c_new)
#     print(location2.address)
#     time.sleep(0.05)

# we need to keep the same index so that it's matched we we fill back nulls
df_city_nulls_fix = (
    pd.DataFrame([d_missing_cities]).T
    .reset_index()
    .rename(columns={'index': 'city_to_encode', 0: 'city_to_encode_fix_null'})
).merge(
    df_city_loc_raw.reset_index(),
    how='left',
    on=['city_to_encode']
).set_index('index')

df_city_nulls_fix['geopy_location'] = (
    df_city_nulls_fix['city_to_encode_fix_null']
    .apply(geocode_limiter)
)

CPU times: user 72.9 ms, sys: 8.68 ms, total: 81.6 ms
Wall time: 15.2 s


In [358]:
df_city_nulls_fix.head(10)

Unnamed: 0_level_0,city_to_encode,city_to_encode_fix_null,geopy_location
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
184,"Victoria, BC Canada","royal athletic park Victoria, British Columbia, Canada","(Royal Athletic Park, North Park, Victoria, Capital Regional District, British Columbia, V8T 1G1, Canada, (48.43105265, -123.35428975865334))"
22,"San Jose, CA United States","San Jose santa clara county, California","(San Jose, Santa Clara County, CAL Fire Northern Region, California, United States, (37.3361663, -121.890591))"
62,"Central, HCW Hong Kong",Central Hong Kong,"(香港 Hong Kong, 8, 金融街 Finance Street, 國際金融中心 International Finance Centre, 中環 Central, 中西區 Central and Western District, 香港島 Hong Kong Island, 香港 Hong Kong, 中国, (22.2850394, 114.1583819))"
71,"Madrid, M Spain",Madrid community Spain,"(Madrid, Área metropolitana de Madrid y Corredor del Henares, Comunidad de Madrid, España, (40.4167047, -3.7035825))"
122,"Copenhagen, 84 Denmark",Copenhagen municipality Denmark,"(Københavns Kommune, Region Hovedstaden, Danmark, (55.65554675, 12.601643209374739))"
123,"Barcelona, B Spain",Barcelona barcelona Spain,"(Barcelona, Barcelonès, Barcelona, Catalunya, 08001, España, (41.3828939, 2.1774322))"
124,"Milwaukee, WI United States",Milwaukee downtown wisconsin,"(Milwaukee Area Technical College Downtown Campus, 700, West State Street, Bronzeville, Milwaukee, Milwaukee County, Wisconsin, 53233, United States, (43.04437365, -87.92031411388143))"
179,"Cebu City, CEB Philippines",cebu city san roque Philippines,"(Cebu, P. Burgos Street, Santo Niño, Cebu City, Central Visayas, 6000, Philippines, (10.2934208, 123.9022613))"
215,"Buenos Aires, C Argentina",buenos aires ciudad comuna argentina,"(Buenos Aires, Comuna 6, Ciudad Autónoma de Buenos Aires, Argentina, (-34.6075682, -58.4370894))"
217,"Frederiksberg, 84 Denmark",Frederiksberg capital Denmark,"(Frederiksberg, Frederiksberg Kommune, Region Hovedstaden, 1861, Danmark, (55.678016, 12.5326186))"


In [359]:
df_city_nulls_fix.tail(10)

Unnamed: 0_level_0,city_to_encode,city_to_encode_fix_null,geopy_location
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
724,"Bordeaux, 33 France","Bordeaux, gironde France","(Bordeaux, Gironde, Nouvelle-Aquitaine, France métropolitaine, France, (44.841225, -0.5800364))"
763,"Montpellier, 34 France","Montpellier, occitania France","(Montpellier, Hérault, Occitanie, France métropolitaine, France, (43.6112422, 3.8767337))"
769,"Nantes, 44 France",Nantes loire-atlantique France,"(Nantes, Loire-Atlantique, Pays de la Loire, France métropolitaine, France, (47.2186371, -1.5541362))"
799,"City of Muntinlupa, RIZ Philippines","Muntinlupa, southern manila Philippines","(Muntinlupa, Southern Manila District, Metro Manila, Philippines, (14.3892634, 121.0449101))"
822,"Rennes, 35 France",rennes brittany france,"(Rennes, Ille-et-Vilaine, Bretagne, France métropolitaine, France, (48.1113387, -1.6800198))"
877,"San Luis Potosí City, SLP Mexico","San Luis Potosi, Municipio San Luis Potosi Mexico","(San Luis Potosí, Municipio de San Luis Potosí, San Luis Potosí, 78001, México, (22.1516472, -100.9763993))"
890,"Nice, 06 France",nice alpes maritimes france,"(Nice, Alpes-Maritimes, Provence-Alpes-Côte d'Azur, France métropolitaine, France, (43.7009358, 7.2683912))"
892,"Grenoble, 38 France","grenoble, auvergne-rhone-alpes france","(Grenoble, Isère, Auvergne-Rhône-Alpes, France métropolitaine, France, (45.1875602, 5.7357819))"
914,"Nancy, 54 France",nancy grande est france,"(Nancy, Meurthe-et-Moselle, Grand Est, France métropolitaine, France, (48.6937223, 6.1834097))"
919,"Villeurbanne, 69 France","Villeurbanne, lyon france","(Villeurbanne, Lyon, Métropole de Lyon, Rhône, Auvergne-Rhône-Alpes, France métropolitaine, 69100, France, (45.7733573, 4.8868454))"


In [360]:
# replace fixes
df_city_loc_raw.loc[
    df_city_loc_raw['city_to_encode'].isin(df_city_nulls_fix['city_to_encode']),
    'geopy_location'
] = df_city_nulls_fix['geopy_location']

In [361]:
mask_null_loc2 = df_city_loc_raw['geopy_location'].isnull()
print(mask_null_loc2.sum())

df_city_loc_raw[mask_null_loc2]

0


Unnamed: 0,city_to_encode,geopy_location


## Check country mismatch

In [213]:
def get_address_from_loc(
    loc,
) -> dict:
    """Get raw location based on lat & lon.
    This is the best way to check, but it's also SLOW! can take ~1 second per city... so 10+ minutes
    we'll get back a dictionary with outputs like (but not guaranteed):
    {
        'suburb': 'Rajendra Nagar',
        'city': 'Patna',
        'country': 'Patna Rural',
        'state_district': 'Patna',
        'state': 'Bihar',
        'postcode': '800001',
        'country': 'India',
        'country_code': 'in'
    }
    """
    geolocator = Nominatim(user_agent="test_loc3")
    loc_raw = geolocator.reverse(f"{loc.latitude}, {loc.longitude}")
    return loc_raw.raw.get('address')


In [362]:
df_city_and_loc = (
    (
        df_top_cities
        # .drop(columns=['geopy_location'])
        .copy()
    )
    .merge(
        df_city_loc_raw.copy(),
        how='left',
        on=['city_to_encode']
    )
)
print(df_city_and_loc.shape)

df_city_and_loc.head()

(924, 11)


Unnamed: 0,city_to_encode,city_users_l7_ln,pt,geo_country_code,geo_region,geo_city,city_users_l7,city_rank_country,city_rank_world,country_name,geopy_location
0,"London, ENG United Kingdom",15.124056,2023-03-01,GB,ENG,London,3700775,1,1,United Kingdom,"(London, Greater London, England, United Kingdom, (51.5073359, -0.12765))"
1,"Los Angeles, CA United States",14.813316,2023-03-01,US,CA,Los Angeles,2712314,1,2,United States,"(Los Angeles, Los Angeles County, CAL Fire Contract Counties, California, United States, (34.0536909, -118.242766))"
2,"New York, NY United States",14.723372,2023-03-01,US,NY,New York,2479005,2,3,United States,"(City of New York, New York, United States, (40.7127281, -74.0060152))"
3,"Chicago, IL United States",14.617279,2023-03-01,US,IL,Chicago,2229471,3,4,United States,"(Chicago, Cook County, Illinois, United States, (41.8755616, -87.6244212))"
4,"Sydney, NSW Australia",14.560106,2023-03-01,AU,NSW,Sydney,2105579,1,5,Australia,"(Sydney, Council of the City of Sydney, New South Wales, Australia, (-33.8698439, 151.2082848))"


In [197]:
df_city_and_loc['geopy_location'].apply(lambda loc: loc.address if loc else None)

0                                             London, Greater London, England, United Kingdom
1      Los Angeles, Los Angeles County, CAL Fire Contract Counties, California, United States
2                                                   City of New York, New York, United States
3                                               Chicago, Cook County, Illinois, United States
4                           Sydney, Council of the City of Sydney, New South Wales, Australia
                                                ...                                          
919                                                                                      None
920                        Viña del Mar, Provincia de Valparaíso, Región de Valparaíso, Chile
921                  Zabrze, Górnośląsko-Zagłębiowska Metropolia, województwo śląskie, Polska
922                                                       Евпатория, Республика Крым, Україна
923                                                  Tuxtla 

In [221]:
# Test how to access the location data
style_df_numeric(
    df_city_and_loc['geopy_location'].head(2).apply(lambda loc: loc.raw if loc else None).to_frame()
)

Unnamed: 0,geopy_location
0,"{'place_id': 344211958, 'licence': 'Data © OpenStreetMap contributors, ODbL 1.0. https://osm.org/copyright', 'osm_type': 'relation', 'osm_id': 65606, 'boundingbox': ['51.2867601', '51.6918741', '-0.5103751', '0.3340155'], 'lat': '51.5073359', 'lon': '-0.12765', 'display_name': 'London, Greater London, England, United Kingdom', 'class': 'place', 'type': 'city', 'importance': 1.2407827616237295, 'icon': 'https://nominatim.openstreetmap.org/ui/mapicons/poi_place_city.p.20.png'}"
1,"{'place_id': 355412804, 'licence': 'Data © OpenStreetMap contributors, ODbL 1.0. https://osm.org/copyright', 'osm_type': 'relation', 'osm_id': 207359, 'boundingbox': ['33.659541', '34.337306', '-118.6681779', '-118.1552947'], 'lat': '34.0536909', 'lon': '-118.242766', 'display_name': 'Los Angeles, Los Angeles County, CAL Fire Contract Counties, California, United States', 'class': 'boundary', 'type': 'administrative', 'importance': 1.2738053728457621, 'icon': 'https://nominatim.openstreetmap.org/ui/mapicons/poi_boundary_administrative.p.20.png'}"


In [227]:
# get only the country. Caveat, this might be in non-English script!
style_df_numeric(
    df_city_and_loc['geopy_location'].tail(5).apply(lambda loc: loc.raw['display_name'].split(',')[-1] if loc else None).to_frame()
)

Unnamed: 0,geopy_location
919,
920,Chile
921,Polska
922,Україна
923,México


In [236]:
%%time
# 1st get the plain name from the existing data, even if it's in non-English text

df_city_and_loc['geopy_country_name'] = (
    df_city_and_loc['geopy_location'].apply(lambda loc: loc.raw['display_name'].split(',')[-1].strip() if loc else None)
)

CPU times: user 2.07 ms, sys: 74 µs, total: 2.15 ms
Wall time: 2.12 ms


In [242]:
value_counts_and_pcts(df_city_and_loc['geopy_country_name'], top_n=None)

Unnamed: 0,geopy_country_name-count,geopy_country_name-percent,geopy_country_name-pct_cumulative_sum
United States,350,37.9%,37.9%
United Kingdom,69,7.5%,45.3%
Canada,69,7.5%,52.8%
Россия,35,3.8%,56.6%
Deutschland,35,3.8%,60.4%
India,34,3.7%,64.1%
Brasil,32,3.5%,67.5%
México,30,3.2%,70.8%
Philippines,29,3.1%,73.9%
Nederland,24,2.6%,76.5%


In [254]:
d_country_name_standard = {
    'Россия': 'Russia',
    "Deutschland": "Germany",
    'Brasil': 'Brazil',
    'México': "Mexico",
    'Nederland': 'Netherlands',
    'Italia': 'Italy',
    'Polska': 'Poland',
    'Türkiye': "Turkey",
    "España": 'Spain',
    "Sverige": 'Sweden',
    "New Zealand/Aotearoa": "New Zealand",
    "Suomi / Finland": 'Finland',
    'Danmark': 'Denmark',
    'Україна': 'Ukraine',
    'România': 'Romania',
    '日本': 'Japan',
    'Schweiz/Suisse/Svizzera/Svizra': 'Switzerland',
    'België / Belgique / Belgien': 'Belgium',
    'پاکستان': 'Pakistan',
    'Éire / Ireland': 'Ireland',
    'Norge': 'Norway',
    'الإمارات العربية المتحدة': 'United Arab Emirates',
    'Česko': 'Czechia',
    'ประเทศไทย': 'Thailand',
    'Österreich': 'Austria',
    'Magyarország': 'Hungary',
    'Việt Nam': 'Vietnam',
    'Ελλάς': 'Greece',
    '中国': 'China'

}

In [255]:
df_city_and_loc.head(3)

Unnamed: 0,city_to_encode,city_users_l7_ln,pt,geo_country_code,geo_region,geo_city,city_users_l7,city_rank_country,city_rank_world,country_name,geopy_location,geopy_country_name
0,"London, ENG United Kingdom",15.124056,2023-03-01,GB,ENG,London,3700775,1,1,United Kingdom,"(London, Greater London, England, United Kingdom, (51.5073359, -0.12765))",United Kingdom
1,"Los Angeles, CA United States",14.813316,2023-03-01,US,CA,Los Angeles,2712314,1,2,United States,"(Los Angeles, Los Angeles County, CAL Fire Contract Counties, California, United States, (34.0536909, -118.242766))",United States
2,"New York, NY United States",14.723372,2023-03-01,US,NY,New York,2479005,2,3,United States,"(City of New York, New York, United States, (40.7127281, -74.0060152))",United States


In [284]:
mask_match_country = (
    (df_city_and_loc['geopy_location'].notnull()) &
    (df_city_and_loc['country_name'] == df_city_and_loc['geopy_country_name'].replace(d_country_name_standard))
)
print(f"Cities not matching expected country: {(~mask_match_country).sum():,.0f}")

Cities not matching expected country: 23


In [278]:
mask_match_country.sum()

901

In [280]:
# df_city_and_loc[df_city_and_loc['geopy_location'].isnull()]

In [285]:
df_city_and_loc[(~mask_match_country) & (df_city_and_loc['geopy_location'].notnull())]

Unnamed: 0,city_to_encode,city_users_l7_ln,pt,geo_country_code,geo_region,geo_city,city_users_l7,city_rank_country,city_rank_world,country_name,geopy_location,geopy_country_name
62,"Central, HCW Hong Kong",13.110236,2023-03-01,HK,HCW,Central,493963,1,63,Hong Kong,"(香港 Hong Kong, 8, 金融街 Finance Street, 國際金融中心 International Finance Centre, 中環 Central, 中西區 Central and Western District, 香港島 Hong Kong Island, 香港 Hong Kong, 中国, (22.2850394, 114.1583819))",中国
86,"Prague, 10 Czechia",12.814177,2023-03-01,CZ,10,Prague,367379,1,87,Czechia,"(Hudební divadlo Karlín, 10, Křižíkova, Florenc, Karlín, Hlavní město Praha, Praha, 186 00, Praha, (50.0901221, 14.441241))",Praha
184,"Victoria, BC Canada",12.05507,2023-03-01,CA,BC,Victoria,171959,16,185,Canada,"(Beco Canada, Joana d'Arc, Região Administrativa IV - Maruípe, Vitória, Região Geográfica Imediata de Vitória, Região Metropolitana da Grande Vitória, Região Geográfica Intermediária de Vitória, Espírito Santo, Região Sudeste, 29048-060...",Brasil


In [230]:
# %%time
# # this might take ~900 seconds (15 mins!!?)
# df_city_and_loc['geopy_country_code'] = (
#     df_city_and_loc['geopy_location'].apply(lambda loc: get_address_from_loc(loc)['country_code'].upper() if loc else None)
# )

### Fill mismatched cities

None for now!!

# Add Lat & lon for plotting

In [286]:
# get lat & long from location
df_city_and_loc['latitude'] = (
    df_city_and_loc['geopy_location']
    .apply(lambda loc: loc.latitude if loc else None)
)

df_city_and_loc['longitude'] = (
    df_city_and_loc['geopy_location']
    .apply(lambda loc: loc.longitude if loc else None)
)

In [287]:
df_city_and_loc.head()

Unnamed: 0,city_to_encode,city_users_l7_ln,pt,geo_country_code,geo_region,geo_city,city_users_l7,city_rank_country,city_rank_world,country_name,geopy_location,geopy_country_name,latitude,longitude
0,"London, ENG United Kingdom",15.124056,2023-03-01,GB,ENG,London,3700775,1,1,United Kingdom,"(London, Greater London, England, United Kingdom, (51.5073359, -0.12765))",United Kingdom,51.507336,-0.12765
1,"Los Angeles, CA United States",14.813316,2023-03-01,US,CA,Los Angeles,2712314,1,2,United States,"(Los Angeles, Los Angeles County, CAL Fire Contract Counties, California, United States, (34.0536909, -118.242766))",United States,34.053691,-118.242766
2,"New York, NY United States",14.723372,2023-03-01,US,NY,New York,2479005,2,3,United States,"(City of New York, New York, United States, (40.7127281, -74.0060152))",United States,40.712728,-74.006015
3,"Chicago, IL United States",14.617279,2023-03-01,US,IL,Chicago,2229471,3,4,United States,"(Chicago, Cook County, Illinois, United States, (41.8755616, -87.6244212))",United States,41.875562,-87.624421
4,"Sydney, NSW Australia",14.560106,2023-03-01,AU,NSW,Sydney,2105579,1,5,Australia,"(Sydney, Council of the City of Sydney, New South Wales, Australia, (-33.8698439, 151.2082848))",Australia,-33.869844,151.208285


# Dump data to df so I can start building a map in a separate notebook

In [288]:
df_city_and_loc.to_csv(
    f"djb-df_top_cities_loc-{datetime.utcnow().strftime('%Y-%m-%d_%H%M%S')}.csv",
    index=False,
)