# Purpose

2023-03-07. In this notebook we'll  
- pull the data for the top ~900 cities at Reddit (by DAU)
- get the lat & long coordinates for those cities
- Create a base plot to test whether we can add city-popular subreddits to a map

The main idea is that by making the MVP ti'll be more tangible and easier to get buy-in to add a map to the discovery tab (or something like it).

# Imports & Setup

In [1]:
%load_ext autoreload
%autoreload 2

In [12]:
# Register bigquery magic (only needed for laptop/local, not colab)
%load_ext google.cloud.bigquery

In [55]:
# increase display width of cells
from IPython.display import display, HTML
display(HTML("<style>.container { width:98% !important; }</style>"))

In [121]:
from datetime import datetime
import time

import geopy
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

from tqdm import tqdm

import numpy as np
import pandas as pd
import plotly
import plotly.express as px
import subclu

# from subclu.utils import set_working_directory
from subclu.utils.eda import (
    setup_logging, counts_describe, value_counts_and_pcts,
    notebook_display_config, print_lib_versions,
    style_df_numeric
)
from subclu.utils.data_irl_style import (
    get_colormap, theme_dirl
)


setup_logging()
notebook_display_config()
print_lib_versions([geopy, np, pd, plotly, subclu])

python		v 3.7.11
===
geopy		v: 2.3.0
numpy		v: 1.19.5
pandas		v: 1.2.4
plotly		v: 5.11.0
subclu		v: 0.6.1


In [122]:
# plotting defaults
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import matplotlib.dates as mdates
plt.style.use('default')


# Test geopy basics

Note: we need to use `RateLimiter` to prevent making too many requests at once.

It looks like this format works most of the time so we'll need to reshape our text to 
- `City, Region, Country-Code`

In [8]:
%%time

geolocator = Nominatim(user_agent="test_geo")

location1 = geolocator.geocode("Chicago, IL US")
print(location1.address)
print((location1.latitude, location1.longitude))

Chicago, Cook County, Illinois, United States
(41.8755616, -87.6244212)
CPU times: user 22.7 ms, sys: 3.8 ms, total: 26.5 ms
Wall time: 2.21 s


In [81]:
%%time

geolocator = Nominatim(user_agent="test_geo")

location0 = geolocator.geocode("San Jose, CA United States")
print(location0.address)
print((location0.latitude, location1.longitude))

AttributeError: 'NoneType' object has no attribute 'address'

In [82]:
%%time

geolocator = Nominatim(user_agent="test_geo")

location0 = geolocator.geocode("San Jose, California United States")
print(location0.address)
print((location0.latitude, location1.longitude))

GeocoderUnavailable: HTTPSConnectionPool(host='nominatim.openstreetmap.org', port=443): Max retries exceeded with url: /search?q=San+Jose%2C+California+United+States&format=json&limit=1 (Caused by ReadTimeoutError("HTTPSConnectionPool(host='nominatim.openstreetmap.org', port=443): Read timed out. (read timeout=1)"))

In [10]:
%%time

geolocator = Nominatim(user_agent="test_geo")

location2 = geolocator.geocode("London, ON CA")
print(location2.address)
print((location2.latitude, location2.longitude))

London, Southwestern Ontario, Ontario, N6A 3N7, Canada
(42.9832406, -81.243372)
CPU times: user 12.5 ms, sys: 2.38 ms, total: 14.9 ms
Wall time: 314 ms


In [11]:
%%time

geolocator = Nominatim(user_agent="test_geo")

location2 = geolocator.geocode("Paris, 75 FR")
print(location2.address)
print((location2.latitude, location2.longitude))

Paris, Île-de-France, France métropolitaine, France
(48.8588897, 2.3200410217200766)
CPU times: user 13.6 ms, sys: 2.53 ms, total: 16.1 ms
Wall time: 360 ms


# Get the data for the top cities
This data is pre-calculated in BQ, so just pull the table from a specific partition

In [69]:
%%time
%%bigquery df_top_cities --project data-science-prod-218515 

SELECT
    CONCAT(geo_city, ", ", geo_region, " ", cn.country_name) AS city_to_encode
    , LN(10 + city_users_l7) AS city_users_l7_ln
    , tc.*
    , cn.country_name
FROM `reddit-employee-datasets.david_bermejo.top_cities_l7` AS tc
    LEFT JOIN `reddit-employee-datasets.david_bermejo.countrycode_name_mapping` AS cn
        ON tc.geo_country_code = cn.country_code
WHERE tc.pt = "2023-03-01"

Query complete after 0.00s: 100%|███████████████████████████████████████████████████| 2/2 [00:00<00:00, 1236.16query/s]
Downloading: 100%|████████████████████████████████████████████████████████████████| 924/924 [00:01<00:00, 607.14rows/s]

CPU times: user 42.2 ms, sys: 16.4 ms, total: 58.6 ms
Wall time: 2.55 s





In [70]:
df_top_cities.shape

(924, 10)

In [71]:
df_top_cities.head()

Unnamed: 0,city_to_encode,city_users_l7_ln,pt,geo_country_code,geo_region,geo_city,city_users_l7,city_rank_country,city_rank_world,country_name
0,"London, ENG United Kingdom",15.124056,2023-03-01,GB,ENG,London,3700775,1,1,United Kingdom
1,"Los Angeles, CA United States",14.813316,2023-03-01,US,CA,Los Angeles,2712314,1,2,United States
2,"New York, NY United States",14.723372,2023-03-01,US,NY,New York,2479005,2,3,United States
3,"Chicago, IL United States",14.617279,2023-03-01,US,IL,Chicago,2229471,3,4,United States
4,"Sydney, NSW Australia",14.560106,2023-03-01,AU,NSW,Sydney,2105579,1,5,Australia


In [72]:
df_top_cities.tail()

Unnamed: 0,city_to_encode,city_users_l7_ln,pt,geo_country_code,geo_region,geo_city,city_users_l7,city_rank_country,city_rank_world,country_name
919,"Villeurbanne, 69 France",9.923339,2023-03-01,FR,69,Villeurbanne,20391,14,1605,France
920,"Viña del Mar, VS Chile",9.911902,2023-03-01,CL,VS,Viña del Mar,20159,3,1621,Chile
921,"Zabrze, 24 Poland",9.909171,2023-03-01,PL,24,Zabrze,20104,15,1625,Poland
922,"Yevpatoriya, 43 Ukraine",9.90733,2023-03-01,UA,43,Yevpatoriya,20067,6,1628,Ukraine
923,"Tuxtla Gutiérrez, CHP Mexico",9.906882,2023-03-01,MX,CHP,Tuxtla Gutiérrez,20058,31,1630,Mexico


# Get lat & long for all cities

Again, remember to include a delay so that we don't break request limits

In [73]:
%%time
geocode_limiter = RateLimiter(geolocator.geocode, min_delay_seconds=0.5)

# test on top 5 cities
df_top_cities['geopy_location'] = (
    df_top_cities['city_to_encode'].iloc[20:26]
    .apply(geocode_limiter)
)

CPU times: user 33.3 ms, sys: 9.24 ms, total: 42.6 ms
Wall time: 5.16 s


In [76]:
df_top_cities[['city_to_encode', 'geopy_location']].iloc[20:26]

Unnamed: 0,city_to_encode,geopy_location
20,"Vancouver, BC Canada","(Vancouver, Metro Vancouver Regional District, British Columbia, Canada, (49.2608724, -123.113952))"
21,"San Antonio, TX United States","(San Antonio, Bexar County, Texas, United States, (29.4246002, -98.4951405))"
22,"San Jose, CA United States",
23,"San Francisco, CA United States","(San Francisco, CAL Fire Northern Region, California, United States, (37.7790262, -122.419906))"
24,"Montreal, QC Canada","(Montréal, Agglomération de Montréal, Montréal (06), Québec, Canada, (45.5031824, -73.5698065))"
25,"Minneapolis, MN United States","(Minneapolis, Hennepin County, Minnesota, United States, (44.9772995, -93.2654692))"


In [77]:
# get lat & long from location:
(
    df_top_cities['geopy_location'].iloc[20:26]
    .apply(lambda loc: loc.latitude if loc else None)
)

20    49.260872
21    29.424600
22          NaN
23    37.779026
24    45.503182
25    44.977300
Name: geopy_location, dtype: float64

In [78]:
# get lat & long from location:
(
    df_top_cities['geopy_location'].iloc[20:26]
    .apply(lambda loc: loc.longitude if loc else None)
)

20   -123.113952
21    -98.495141
22           NaN
23   -122.419906
24    -73.569806
25    -93.265469
Name: geopy_location, dtype: float64

## Get location for all cities

In [126]:
batch_size = 40
iter_chunks = range(1 + len(df_top_cities) // batch_size)
print(iter_chunks)

range(0, 24)


In [127]:
# copy original name with potential errors
# l_df_geopy_loc_old = l_df_geopy_loc.copy()

In [128]:
len(l_df_geopy_loc_old)

49

In [None]:
geocode_limiter = RateLimiter(geolocator.geocode, min_delay_seconds=0.03)
l_df_geopy_loc = list()

for i in tqdm(iter_chunks):    
    df_slice_ = (
        df_top_cities.iloc[i * batch_size:(i + 1) * batch_size]
        [['city_to_encode']]
        .copy()
        .reset_index()
    )
    df_slice_['geopy_location'] = (
        df_slice_['city_to_encode']
        .apply(geocode_limiter)
    )
    l_df_geopy_loc.append(df_slice_)
    time.sleep(0.5)

  4%|███▍                                                                               | 1/24 [00:20<07:55, 20.68s/it]RateLimiter caught an error, retrying (0/2 tries). Called with (*('Madrid, M Spain',), **{}).
Traceback (most recent call last):
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 445, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 440, in _make_request
    httplib_response = conn.getresponse()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 1373, in getresponse
    response.begin()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 319, in begin
    version, status, reason = self._read_status()
  File "/Users/david.

RateLimiter swallowed an error after 2 retries. Called with (*('Madrid, M Spain',), **{}).
Traceback (most recent call last):
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 445, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 440, in _make_request
    httplib_response = conn.getresponse()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 1373, in getresponse
    response.begin()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 319, in begin
    version, status, reason = self._read_status()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 280, in _read_status
    line = str(

RateLimiter caught an error, retrying (0/2 tries). Called with (*('Santiago, RM Chile',), **{}).
Traceback (most recent call last):
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 445, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 440, in _make_request
    httplib_response = conn.getresponse()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 1373, in getresponse
    response.begin()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 319, in begin
    version, status, reason = self._read_status()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 280, in _read_status
    line 

 12%|██████████▍                                                                        | 3/24 [01:55<13:21, 38.16s/it]RateLimiter caught an error, retrying (0/2 tries). Called with (*('Copenhagen, 84 Denmark',), **{}).
Traceback (most recent call last):
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 445, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 440, in _make_request
    httplib_response = conn.getresponse()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 1373, in getresponse
    response.begin()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 319, in begin
    version, status, reason = self._read_status()
  File "/Users

RateLimiter swallowed an error after 2 retries. Called with (*('Copenhagen, 84 Denmark',), **{}).
Traceback (most recent call last):
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 445, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 440, in _make_request
    httplib_response = conn.getresponse()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 1373, in getresponse
    response.begin()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 319, in begin
    version, status, reason = self._read_status()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 280, in _read_status
    line

RateLimiter caught an error, retrying (1/2 tries). Called with (*('Barcelona, B Spain',), **{}).
Traceback (most recent call last):
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 445, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 440, in _make_request
    httplib_response = conn.getresponse()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 1373, in getresponse
    response.begin()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 319, in begin
    version, status, reason = self._read_status()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 280, in _read_status
    line 

RateLimiter caught an error, retrying (0/2 tries). Called with (*('Milwaukee, WI United States',), **{}).
Traceback (most recent call last):
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 445, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 440, in _make_request
    httplib_response = conn.getresponse()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 1373, in getresponse
    response.begin()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 319, in begin
    version, status, reason = self._read_status()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 280, in _read_status


RateLimiter swallowed an error after 2 retries. Called with (*('Milwaukee, WI United States',), **{}).
Traceback (most recent call last):
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 445, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 440, in _make_request
    httplib_response = conn.getresponse()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 1373, in getresponse
    response.begin()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 319, in begin
    version, status, reason = self._read_status()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 280, in _read_status
   

 17%|█████████████▊                                                                     | 4/24 [03:26<19:36, 58.81s/it]RateLimiter caught an error, retrying (0/2 tries). Called with (*('Kochi, KL India',), **{}).
Traceback (most recent call last):
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 445, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/Users/david.bermejo/repos/subreddit_clustering_i18n/.venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 440, in _make_request
    httplib_response = conn.getresponse()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 1373, in getresponse
    response.begin()
  File "/Users/david.bermejo/miniconda3/envs/subreddit_clustering_i18n/lib/python3.7/http/client.py", line 319, in begin
    version, status, reason = self._read_status()
  File "/Users/david.

In [None]:
len(l_df_geopy_loc)

In [None]:
style_df_numeric(l_df_geopy_loc[0])

In [None]:
l_df_geopy_loc[-1]

In [None]:
# geocode_limiter = RateLimiter(geolocator.geocode, min_delay_seconds=1.0)


# for i in tqdm(range(11, 47)):    
#     df_slice_ = (
#         df_top_cities.iloc[i * batch_size:(i + 1) * batch_size]
#         [['city_to_encode']]
#         .copy()
#         .reset_index()
#     )
#     df_slice_['geopy_location'] = (
#         df_slice_['city_to_encode']
#         .apply(geocode_limiter)
#     )
#     l_df_geopy_loc.append(df_slice_)
#     time.sleep(3)


In [None]:
# l_df_geopy_loc[-2]

In [None]:
# l_df_geopy_loc[-1]

# Dump data to df so I can start building a map in a separate notebook

In [None]:
# (
#     pd.concat(l_df_geopy_loc_old, ignore_index=True, axis=0)
#     .drop(columns=['index'])
#     .rename(columns={'city_to_encode': 'city_to_encode_short'})
# )

In [114]:
# (
#     df_top_cities.drop(columns=['geopy_location'])
#     .assign(city_to_encode_short=lambda x: x['geo_city'] + ", " + x['geo_region'] + " " + x['geo_country_code'])
# ).head()

Unnamed: 0,city_to_encode,city_users_l7_ln,pt,geo_country_code,geo_region,geo_city,city_users_l7,city_rank_country,city_rank_world,country_name,city_to_encode_short
0,"London, ENG United Kingdom",15.124056,2023-03-01,GB,ENG,London,3700775,1,1,United Kingdom,"London, ENG GB"
1,"Los Angeles, CA United States",14.813316,2023-03-01,US,CA,Los Angeles,2712314,1,2,United States,"Los Angeles, CA US"
2,"New York, NY United States",14.723372,2023-03-01,US,NY,New York,2479005,2,3,United States,"New York, NY US"
3,"Chicago, IL United States",14.617279,2023-03-01,US,IL,Chicago,2229471,3,4,United States,"Chicago, IL US"
4,"Sydney, NSW Australia",14.560106,2023-03-01,AU,NSW,Sydney,2105579,1,5,Australia,"Sydney, NSW AU"


## Old format -- we expect these to include some errors

In [116]:
df_city_and_loc_old = (
    (
        df_top_cities.drop(columns=['geopy_location'])
        .assign(city_to_encode_short=lambda x: x['geo_city'] + ", " + x['geo_region'] + " " + x['geo_country_code'])
        .copy()
    )
    .merge(
        (
            pd.concat(l_df_geopy_loc_old, ignore_index=True, axis=0)
            .drop(columns=['index'])
            .rename(columns={'city_to_encode': 'city_to_encode_short'})
        ),
        how='left',
        on=['city_to_encode_short']
    )
)
print(df_city_and_loc_old.shape)

# get lat & long from location
df_city_and_loc_old['latitude'] = (
    df_city_and_loc_old['geopy_location']
    .apply(lambda loc: loc.latitude if loc else None)
)

df_city_and_loc_old['longitude'] = (
    df_city_and_loc_old['geopy_location']
    .apply(lambda loc: loc.longitude if loc else None)
)

(964, 12)

In [119]:
df_city_and_loc_old.head()

Unnamed: 0,city_to_encode,city_users_l7_ln,pt,geo_country_code,geo_region,geo_city,city_users_l7,city_rank_country,city_rank_world,country_name,city_to_encode_short,geopy_location,latitude,longitude
0,"London, ENG United Kingdom",15.124056,2023-03-01,GB,ENG,London,3700775,1,1,United Kingdom,"London, ENG GB","(London, Greater London, England, United Kingdom, (51.5073359, -0.12765))",51.507336,-0.12765
1,"Los Angeles, CA United States",14.813316,2023-03-01,US,CA,Los Angeles,2712314,1,2,United States,"Los Angeles, CA US","(Los Angeles, Los Angeles County, CAL Fire Contract Counties, California, United States, (34.0536909, -118.242766))",34.053691,-118.242766
2,"New York, NY United States",14.723372,2023-03-01,US,NY,New York,2479005,2,3,United States,"New York, NY US","(City of New York, New York, United States, (40.7127281, -74.0060152))",40.712728,-74.006015
3,"Chicago, IL United States",14.617279,2023-03-01,US,IL,Chicago,2229471,3,4,United States,"Chicago, IL US","(Chicago, Cook County, Illinois, United States, (41.8755616, -87.6244212))",41.875562,-87.624421
4,"Sydney, NSW Australia",14.560106,2023-03-01,AU,NSW,Sydney,2105579,1,5,Australia,"Sydney, NSW AU","(Sydney, Council of the City of Sydney, New South Wales, Australia, (-33.8698439, 151.2082848))",-33.869844,151.208285


In [125]:
# df_city_and_loc_old.to_csv(
#     f"djb-df_top_cities_loc_old-{datetime.utcnow().strftime('%Y-%m-%d_%H%M%S')}.csv",
#     index=False,
# )

## With new location data
We expect this to be better because we're including the whole country name instead of just an abbreviation

In [None]:
TODO