# Purpose

2023-03-07. In this notebook we'll  
- pull the data for the top ~900 cities at Reddit (by DAU)
- get the lat & long coordinates for those cities
- Create a base plot to test whether we can add city-popular subreddits to a map

The main idea is that building an MVP will make the concept more tangible and make it easier to get buy-in for adding a map to the discovery tab (or something like it).

# Imports & Setup

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# Register bigquery magic (only needed for laptop/local, not colab)
%load_ext google.cloud.bigquery

In [3]:
# Widen the notebook cells by injecting a CSS override for the `.container` element
from IPython.display import HTML, display

wide_container_css = "<style>.container { width:98% !important; }</style>"
display(HTML(wide_container_css))

In [4]:
import time

import geopy
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

from tqdm import tqdm

import numpy as np
import pandas as pd
import plotly
import plotly.express as px
import subclu

# from subclu.utils import set_working_directory
from subclu.utils.eda import (
    setup_logging, counts_describe, value_counts_and_pcts,
    notebook_display_config, print_lib_versions,
    style_df_numeric
)
from subclu.utils.data_irl_style import (
    get_colormap, theme_dirl
)


# Notebook-wide setup using the helpers imported from subclu.utils.eda.
# NOTE(review): presumably these configure logging and display options — confirm in subclu.
setup_logging()
notebook_display_config()
# Prints the Python version and the version of each listed library (see the saved output below).
print_lib_versions([geopy, np, pd, plotly, subclu])

python		v 3.7.11
===
geopy		v: 2.3.0
numpy		v: 1.19.5
pandas		v: 1.2.4
plotly		v: 5.11.0
subclu		v: 0.6.1


In [5]:
# plotting defaults
# NOTE(review): the visible cells only plot with plotly express; confirm these matplotlib
# imports are still needed elsewhere in the notebook.
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import matplotlib.dates as mdates
# Select matplotlib's 'default' style sheet.
plt.style.use('default')


# Load data with location info

In [82]:
# Load the saved city table that already carries the geocoding columns
# (geopy_location, latitude, longitude). The file name has no placeholders,
# so it is a plain string rather than an f-string.
df_city_and_loc = pd.read_csv("djb-df_top_cities_loc-2023-03-08_191937.csv")
df_city_and_loc.shape


(924, 14)

In [83]:
# Preview the first rows to eyeball the geocoded columns (geopy_location, latitude, longitude).
df_city_and_loc.head()

Unnamed: 0,city_to_encode,city_users_l7_ln,pt,geo_country_code,geo_region,geo_city,city_users_l7,city_rank_country,city_rank_world,country_name,geopy_location,geopy_country_name,latitude,longitude
0,"London, ENG United Kingdom",15.124056,2023-03-01,GB,ENG,London,3700775,1,1,United Kingdom,"London, Greater London, England, United Kingdom",United Kingdom,51.507336,-0.12765
1,"Los Angeles, CA United States",14.813316,2023-03-01,US,CA,Los Angeles,2712314,1,2,United States,"Los Angeles, Los Angeles County, CAL Fire Contract Counties, California, United States",United States,34.053691,-118.242766
2,"New York, NY United States",14.723372,2023-03-01,US,NY,New York,2479005,2,3,United States,"City of New York, New York, United States",United States,40.712728,-74.006015
3,"Chicago, IL United States",14.617279,2023-03-01,US,IL,Chicago,2229471,3,4,United States,"Chicago, Cook County, Illinois, United States",United States,41.875562,-87.624421
4,"Sydney, NSW Australia",14.560106,2023-03-01,AU,NSW,Sydney,2105579,1,5,Australia,"Sydney, Council of the City of Sydney, New South Wales, Australia",Australia,-33.869844,151.208285


In [84]:
# Per-column dtype, unique and null counts (counts_describe is imported from subclu.utils.eda).
counts_describe(df_city_and_loc)

Unnamed: 0,dtype,count,unique,unique-percent,null-count,null-percent
city_to_encode,object,924,924,100.00%,0,0.00%
city_users_l7_ln,float64,924,923,99.89%,0,0.00%
pt,object,924,1,0.11%,0,0.00%
geo_country_code,object,924,40,4.33%,0,0.00%
geo_region,object,923,344,37.27%,1,0.11%
geo_city,object,924,894,96.75%,0,0.00%
city_users_l7,int64,924,923,99.89%,0,0.00%
city_rank_country,int64,924,350,37.88%,0,0.00%
city_rank_world,int64,924,924,100.00%,0,0.00%
country_name,object,924,40,4.33%,0,0.00%


# Scale cities for better plots
Problem:
- Using a log scale makes all the cities look about the same size
- Using the raw values makes the small cities almost invisible

We need a new way to scale cities, in this case using approximate deciles.


---

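Changing the base of the logarithm doesn't help: since $\log_b x = \ln x / \ln b$, every base rescales all cities by the same constant, so the proportions between cities stay about the same and the transformation isn't helpful for us.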

In [85]:
# Show the natural-log user counts for the first five and the last five rows,
# to compare both ends of the table on the log scale.
ln_users = df_city_and_loc['city_users_l7_ln']
for ln_slice in (ln_users.iloc[:5], ln_users.iloc[-5:]):
    print(ln_slice)

0    15.124056
1    14.813316
2    14.723372
3    14.617279
4    14.560106
Name: city_users_l7_ln, dtype: float64
919    9.923339
920    9.911902
921    9.909171
922    9.907330
923    9.906882
Name: city_users_l7_ln, dtype: float64


In [86]:
# Ratio of the first to the last city_users_l7_ln value printed above (copied by hand).
# NOTE(review): hard-coded, so it will not refresh if the CSV changes.
15.124056 / 9.906882

1.5266211912082934

In [87]:
# convert log to make the size of larger cities easier to spot
print(np.emath.logn(1.01, df_city_and_loc['city_users_l7'].iloc[:5]))
print(np.emath.logn(1.01, df_city_and_loc['city_users_l7'].iloc[-5:]))

[1519.95476716 1488.72564356 1479.6862582  1469.02400646 1463.27808206]
[997.2380871  996.08809637 995.81352876 995.62839668 995.58331294]


In [22]:
# Ratio of the first to the last base-1.01 log value printed above (copied by hand).
# It is nearly the same as the natural-log ratio (~1.5266), i.e. the base barely matters.
# NOTE(review): hard-coded, so it will not refresh if the CSV changes.
1519.95476716 / 995.58331294

1.5266977131943973

## Convert city visitors into quintiles
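This way it'll be easier to see the difference between the largest cities. The first attempt (plain quintiles via `pd.qcut`, left commented out below) was replaced by nine uneven quantile-based buckets, each mapped to a marker size.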

In [90]:
# pd.qcut(
#     df_city_and_loc['city_users_l7'],
#     5,
#     labels=[1, 2, 3, 4, 5]
# )

In [89]:
# Summarise the distribution of city_users_l7, with extra percentiles at both tails
# to help choose the bucket boundaries.
percentile_points = [0.01, 0.05, 0.10, 0.20, 0.25, 0.5, 0.8, 0.9, 0.95, 0.98, 0.99, 0.999]
users_l7_summary = df_city_and_loc['city_users_l7'].describe(percentiles=percentile_points)
style_df_numeric(users_l7_summary.to_frame())

Unnamed: 0,city_users_l7
count,924.0
mean,157314.29
std,289374.48
min,20058.0
1%,20594.72
5%,22975.65
10%,27024.5
20%,38567.2
25%,46188.25
50%,66035.5


In [91]:
# 10th percentile of city_users_l7; it matches the "10%" row of the describe() table above.
df_city_and_loc['city_users_l7'].quantile(0.1)

27024.5

In [184]:
# Bucket cities into marker sizes according to where they fall in the city_users_l7
# distribution. The cut points are quantiles (not raw user counts), and the top buckets
# are much narrower so the very largest cities get visibly bigger markers.
size_quantiles = [0.20, 0.40, 0.55, 0.65, 0.75, 0.85, 0.97, 0.99]
# One label per bucket, i.e. len(size_quantiles) + 1 values, ordered smallest to largest
size_labels = [0.7, 0.9, 1.1, 1.2, 1.6, 2.4, 3.8, 9, 20]

# Compute every cut point in a single pass instead of one .quantile() call per edge
size_cut_points = df_city_and_loc['city_users_l7'].quantile(size_quantiles).tolist()

# pd.cut returns a categorical column whose categories are the numeric labels
df_city_and_loc['city_users_l7_scaled'] = pd.cut(
    df_city_and_loc['city_users_l7'],
    # -1 and +inf are the outer edges, so every non-negative count lands in a bucket
    bins=[-1, *size_cut_points, np.inf],
    labels=size_labels,
)
print(df_city_and_loc['city_users_l7_scaled'].describe())
value_counts_and_pcts(df_city_and_loc['city_users_l7_scaled'], sort_index=True, sort_index_ascending=True)

count     924.0
unique      9.0
top         0.7
freq      185.0
Name: city_users_l7_scaled, dtype: float64


Unnamed: 0,city_users_l7_scaled-count,city_users_l7_scaled-percent,city_users_l7_scaled-pct_cumulative_sum
0.7,185,20.0%,20.0%
0.9,185,20.0%,40.0%
1.1,138,14.9%,55.0%
1.2,92,10.0%,64.9%
1.6,93,10.1%,75.0%
2.4,92,10.0%,85.0%
3.8,111,12.0%,97.0%
9.0,18,1.9%,98.9%
20.0,10,1.1%,100.0%


# Basic map 

In [187]:
# World map with one marker per city, coloured by country and sized by the bucketed
# city_users_l7_scaled column.
# Only drop rows that are missing a value the plot needs. A blanket dropna(how='any')
# also discards cities whose only gap is an unrelated column (geo_region has a null).
map_required_cols = ["latitude", "longitude", "city_users_l7_scaled"]
fig = px.scatter_geo(
    df_city_and_loc.dropna(subset=map_required_cols),
    lat="latitude",
    lon='longitude',
    color="country_name",
    hover_name="geo_city",
    size="city_users_l7_scaled",
    # projection="natural earth"
)
fig.show()

# Make fixes to a few cities (TODO(djb))


In [None]:
# d_cities_to_fix = [
#     'victoria, BC'
# ]

In [12]:
# Same map, but with markers sized by the raw city_users_l7 counts, for comparison
# with the bucketed version.
# Only drop rows that are missing a value the plot needs. A blanket dropna(how='any')
# also discards cities whose only gap is an unrelated column (geo_region has a null).
map_required_cols = ["latitude", "longitude", "city_users_l7"]
fig = px.scatter_geo(
    df_city_and_loc.dropna(subset=map_required_cols),
    lat="latitude",
    lon='longitude',
    color="country_name",
    hover_name="geo_city",
    size="city_users_l7",
    # projection="natural earth"
)
fig.show()