# Generate home and work based racial diversity (CBG level)
output: CA_CBG_dominantRace.pkl or LA_CBG_dominantRace.pkl

These are census based racial diversity data. No need to run this notebook again.

In [1]:
%reload_ext sql
%config SqlMagic.autocommit=False
%config SqlMagic.autolimit=0
%config SqlMagic.autopandas=True
%config SqlMagic.displaylimit=200
%sql trino://localhost:9090/cuebiq/

import pandas as pd
import yaml
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import os
from pyhive import trino
import pydeck as pdk
from typing import List
import json
import copy
import itertools
# import geohash
from shapely.geometry import shape
from shapely.geometry import Polygon
from shapely.geometry import box
import geopandas as geopd
from pyquadkey2 import quadkey
from pyquadkey2.quadkey import TileAnchor, QuadKey
from h3 import h3
import seaborn as sns
import folium
from keplergl import KeplerGl
from datetime import datetime, timedelta
import math

os.environ['MAPBOX_API_KEY'] = "INSERT YOUR MAPBOX TOKEN HERE"
pd.set_option('display.max_colwidth', 0)

In [2]:
# SQL engine
from trino.dbapi import connect 
from sqlalchemy import create_engine

class TrinoEngine():
    def __init__(self):
        conn = connect(
            host="localhost",
            port=9090,
            catalog="cuebiq"
        )
        self.cur = conn.cursor()
        self.engine = create_engine("trino://localhost:9090/cuebiq/")
    
    def execute_statement(self, query:str) -> list:
        """
        Create and drop statements.
        """
        self.cur.execute(query)
        return self.cur.fetchall()
    
    def read_sql(self, query:str) -> pd.DataFrame: 
        """
        Select and insert into operations.
        """
        return pd.read_sql(query, self.engine)

sql_engine = TrinoEngine()

In [3]:
geography_table = "cuebiq.paas_cda_pe_v3.geography_registry"
# Census Block Groups typically have a population between 600 to 3000 people, this makes the data privacy safe.
hw_table = "cuebiq.paas_cda_pe_v3.device_recurring_area"
date = 20220301
date_formated = datetime.strptime(str(date), "%Y%m%d").strftime("%Y-%m-%d")

census_data_table = "cuebiq.paas_public_data.census_data"
census_taxonomy_table = "cuebiq.paas_public_data.census_taxonomy"

geog_clause = 'US.CA.037%'  #'US.CA.%'   'US.CA.037%'
# sql_engine.read_sql(f"desc {census_data_table}")

In [1]:
%%time
#select HOME locations
df_home_loc = sql_engine.read_sql(
    f"""
    select *
    from {hw_table}
    where
        provider_id = '190199'
        and country_code = 'US'
        and snapshot_event_date = {date} 
        and tag_type_code = 'HOME'
        and block_group_id like '{geog_clause}'
    """
)
# df_home_loc = df_home_loc[~df_home_loc['block_group_id'].isin(['US.CA.037.599100.2','US.CA.037.599000.2','US.CA.037.599000.1','US.CA.037.599000.4','US.CA.037.599000.3','US.CA.037.599100.1'])]
print(df_home_loc.shape[0])
df_home_loc.head()

In [5]:
# sql_engine.read_sql(
#     f"""
#     select distinct 
#         segment_group_id
#     from {census_data_table}
#     """
# )

Unnamed: 0,segment_group_id
0,YEARLY_MEDIAN_HOUSEHOLD_INCOME
1,YEARLY_MEDIAN_FAMILY_INCOME_BY_FAMILY_SIZE
2,AGGREGATE_HOUSEHOLD_INCOME_BY_QUINTILE
3,MEDIAN_AGE_BY_SEX_ETHNICITY
4,HOUSEHOLD_INCOME_BY_VALUE
5,ETHNICITY
6,TOTAL_POPULATION
7,HISPANIC_OR_LATINO_ETHNICITY
8,SEX_BY_AGE
9,MEDIAN_AGE_BY_SEX


In [7]:
# sql_engine.read_sql(
#     f"""
#     select
#         census_variable_code,
#         segment_group_id,
#         census_variable_description
#     from {census_taxonomy_table}
#     where 
#         segment_group_id = 'HISPANIC_OR_LATINO_ETHNICITY'
#     order by 1
#     """
# )

Unnamed: 0,census_variable_code,segment_group_id,census_variable_description
0,B03002_001E,HISPANIC_OR_LATINO_ETHNICITY,Estimate!!Total
1,B03002_002E,HISPANIC_OR_LATINO_ETHNICITY,Estimate!!Total!!Not Hispanic or Latino
2,B03002_003E,HISPANIC_OR_LATINO_ETHNICITY,Estimate!!Total!!Not Hispanic or Latino!!White alone
3,B03002_004E,HISPANIC_OR_LATINO_ETHNICITY,Estimate!!Total!!Not Hispanic or Latino!!Black or African American alone
4,B03002_005E,HISPANIC_OR_LATINO_ETHNICITY,Estimate!!Total!!Not Hispanic or Latino!!American Indian and Alaska Native alone
5,B03002_006E,HISPANIC_OR_LATINO_ETHNICITY,Estimate!!Total!!Not Hispanic or Latino!!Asian alone
6,B03002_007E,HISPANIC_OR_LATINO_ETHNICITY,Estimate!!Total!!Not Hispanic or Latino!!Native Hawaiian and Other Pacific Islander alone
7,B03002_008E,HISPANIC_OR_LATINO_ETHNICITY,Estimate!!Total!!Not Hispanic or Latino!!Some other race alone
8,B03002_009E,HISPANIC_OR_LATINO_ETHNICITY,Estimate!!Total!!Not Hispanic or Latino!!Two or more races
9,B03002_010E,HISPANIC_OR_LATINO_ETHNICITY,Estimate!!Total!!Not Hispanic or Latino!!Two or more races!!Two races including Some other race


In [15]:
# sql_engine.read_sql(
#     f"""
#     select
#         census_variable_code,
#         segment_group_id,
#         census_variable_description
#     from {census_taxonomy_table}
#     where 
#         segment_group_id = 'ETHNICITY'
#     order by 1
#     """
# )

Unnamed: 0,census_variable_code,segment_group_id,census_variable_description
0,B02001_001E,ETHNICITY,Estimate!!Total
1,B02001_002E,ETHNICITY,Estimate!!Total!!White alone
2,B02001_003E,ETHNICITY,Estimate!!Total!!Black or African American alone
3,B02001_004E,ETHNICITY,Estimate!!Total!!American Indian and Alaska Native alone
4,B02001_005E,ETHNICITY,Estimate!!Total!!Asian alone
5,B02001_006E,ETHNICITY,Estimate!!Total!!Native Hawaiian and Other Pacific Islander alone
6,B02001_007E,ETHNICITY,Estimate!!Total!!Some other race alone
7,B02001_008E,ETHNICITY,Estimate!!Total!!Two or more races
8,B02001_009E,ETHNICITY,Estimate!!Total!!Two or more races!!Two races including Some other race
9,B02001_010E,ETHNICITY,"Estimate!!Total!!Two or more races!!Two races excluding Some other race, and three or more races"


In [16]:
# test = sql_engine.read_sql(f"""
#         select
#             geography_id_2 as block_group_id,
#             census_variable_code as ethnicity,
#             value,
#             sum(value) over (partition by geography_id) as total_value
#         from {census_data_table}
#         where 
#             segment_group_id = 'HISPANIC_OR_LATINO_ETHNICITY'
#             and admin_level = 'block_group'
#             and geography_id_2 like 'US.CA.037.113303.2%'
#             and census_variable_code != 'B03002_001E'
#             and census_variable_code != 'B03002_002E'
#             and census_variable_code != 'B03002_013E'
#             and census_variable_code != 'B03002_014E'
#             and census_variable_code != 'B03002_015E'
#             and census_variable_code != 'B03002_016E'
#             and census_variable_code != 'B03002_017E'
#             and census_variable_code != 'B03002_018E'
#             and census_variable_code != 'B03002_019E'
#             and census_variable_code != 'B03002_020E'
#             and census_variable_code != 'B03002_021E'
#         """
# )

# test

In [6]:
%%time 

### block_group level ethnicity data
ethnicity_subquery = f"""
    ethnicity_tmp as ( 
        select
            geography_id_2 as block_group_id,
            census_variable_code as ethnicity,
            value,
            sum(value) over (partition by geography_id) as total_value
        from {census_data_table}
        where 
            segment_group_id = 'HISPANIC_OR_LATINO_ETHNICITY'
            and admin_level = 'block_group'
            and geography_id_2 like '{geog_clause}'
            and census_variable_code != 'B03002_001E'
            and census_variable_code != 'B03002_002E'
            and census_variable_code != 'B03002_013E'
            and census_variable_code != 'B03002_014E'
            and census_variable_code != 'B03002_015E'
            and census_variable_code != 'B03002_016E'
            and census_variable_code != 'B03002_017E'
            and census_variable_code != 'B03002_018E'
            and census_variable_code != 'B03002_019E'
            and census_variable_code != 'B03002_020E'
            and census_variable_code != 'B03002_021E'            
    ),
    
    ethnicity as (
        select
            block_group_id,
            total_value as population,
            map_agg(ethnicity, value/total_value) as ethnicity
        from ethnicity_tmp
        group by 1, 2
    )
    """

df_ethnicity = sql_engine.read_sql(
    f"""
    with 
    {ethnicity_subquery}
    select * from ethnicity
    """
)

def split_dict_to_columns(row, col):
    for key, value in row[col].items():
        row[key] = value
    return pd.Series(row)

# Apply the custom function to each row in the DataFrame
df_ethnicity_new = df_ethnicity.apply(lambda x: split_dict_to_columns(x,'ethnicity'), axis=1)
df_ethnicity_new['R_White'] = df_ethnicity_new['B03002_003E']
df_ethnicity_new['R_AfricanAmerican'] = df_ethnicity_new['B03002_004E']
df_ethnicity_new['R_AmericanIndianAlaska'] = df_ethnicity_new['B03002_005E']
df_ethnicity_new['R_Asian'] = df_ethnicity_new['B03002_006E'] +  df_ethnicity_new['B03002_007E'] #Asian alone + Native Hawaiian and Other Pacific Islander alone
df_ethnicity_new['R_Other'] = df_ethnicity_new['B03002_008E'] + df_ethnicity_new['B03002_009E']+df_ethnicity_new['B03002_010E']+df_ethnicity_new['B03002_011E']
df_ethnicity_new['R_HispanicLatinx'] = df_ethnicity_new['B03002_012E']
df_ethnicity_new = df_ethnicity_new[['block_group_id','population','R_White','R_HispanicLatinx','R_AfricanAmerican','R_AmericanIndianAlaska','R_Asian','R_Other']]
print(df_ethnicity_new.shape[0])
df_ethnicity_new.head()

6425
CPU times: user 25.4 s, sys: 290 µs, total: 25.4 s
Wall time: 47.4 s


Unnamed: 0,block_group_id,population,R_White,R_HispanicLatinx,R_AfricanAmerican,R_AmericanIndianAlaska,R_Asian,R_Other
0,US.CA.037.120400.1,3412.0,0.203986,0.436108,0.084115,0.0,0.250293,0.025498
1,US.CA.037.199201.1,1373.0,0.007283,0.872542,0.0,0.0,0.120175,0.0
2,US.CA.037.218120.2,794.0,0.020151,0.700252,0.028967,0.0,0.217884,0.032746
3,US.CA.037.232400.4,2214.0,0.017164,0.628726,0.320235,0.0,0.0,0.033875
4,US.CA.037.265202.1,447.0,0.550336,0.098434,0.0,0.0,0.35123,0.0


In [11]:
df_ethnicity_new['white'] = df_ethnicity_new['population']*df_ethnicity_new['R_White']
df_ethnicity_new['latino'] = df_ethnicity_new['population']*df_ethnicity_new['R_HispanicLatinx']
df_ethnicity_new['black'] = df_ethnicity_new['population']*df_ethnicity_new['R_AfricanAmerican']
df_ethnicity_new['indian'] = df_ethnicity_new['population']*df_ethnicity_new['R_AmericanIndianAlaska']
df_ethnicity_new['asian'] = df_ethnicity_new['population']*df_ethnicity_new['R_Asian']
df_ethnicity_new['other'] = df_ethnicity_new['population']*df_ethnicity_new['R_Other']
print(df_ethnicity_new['white'].sum(), df_ethnicity_new['latino'].sum(), df_ethnicity_new['black'].sum(), df_ethnicity_new['indian'].sum(), df_ethnicity_new['asian'].sum(), df_ethnicity_new['other'].sum())
print(2659052 +  4893603 +  795505 +  20307 +  1476381 +  476484)
for i  in [2659052,4893603,795505,20307,1476381,476484]:
    print(i/10321332*100)

2659052.0 4893603.0 795505.0 20307.0 1476381.0 476484.0
10321332
25.76268256849019
47.41251419874877
7.707386992299056
0.1967478616132104
14.304171205809482
4.616497173039294


In [17]:
%%time
def compute_scaled_entropy(row,col1,col2,col3,col4,col5,col6):
    sum_nominator = 0
    for item in [row[col1],row[col2],row[col3],row[col4],row[col5],row[col6]]:
        if item != 0:
            sum_nominator += item*(np.log(1/item))
    row['scaled_entropy'] = sum_nominator/np.log(6)
    return row

df_ethnicity_entropy = df_ethnicity_new.apply(lambda x:compute_scaled_entropy(x,'R_White','R_HispanicLatinx','R_AfricanAmerican','R_AmericanIndianAlaska','R_Asian','R_Other'), axis=1)

CPU times: user 3.78 s, sys: 0 ns, total: 3.78 s
Wall time: 3.78 s


In [4]:
%%time
# read LA CBG data
cbg_geom = sql_engine.read_sql(
    f"""
    select
        geography_id, geometry_wkt
    from {geography_table}
    where
        country_code = 'US'
        and geography_type_code = 'admin4'
        and geography_id like '{geog_clause}'
    """
)

cbg_geom.rename(columns={'geography_id': 'block_group_id', 'geometry_wkt': 'geom'}, inplace=True)
# eliminate Catalina island and another island in the south of Catalina island
cbg_geom = cbg_geom[~cbg_geom['block_group_id'].isin(['US.CA.037.599100.2','US.CA.037.599000.2','US.CA.037.599000.1','US.CA.037.599000.4','US.CA.037.599000.3','US.CA.037.599100.1'])]
print(cbg_geom.shape[0])

6419
CPU times: user 221 ms, sys: 3.95 ms, total: 225 ms
Wall time: 8.41 s


# census based segregation

In [19]:
def identify_max_col(row,col1,col2,col3,col4,col5,col6):
    dominant = max([row[col1],row[col2],row[col3],row[col4],row[col5],row[col6]])
    if dominant == row[col1]: row['dominant_race'] = col1
    if dominant == row[col2]: row['dominant_race'] = col2
    if dominant == row[col3]: row['dominant_race'] = col3
    if dominant == row[col4]: row['dominant_race'] = col4
    if dominant == row[col5]: row['dominant_race'] = col5
    if dominant == row[col6]: row['dominant_race'] = col6
    return row

def extract_dominant_group(df):
    df.loc[df['scaled_entropy']<=0.3707,'class'] = 'low diversity'
    df.loc[df['R_White']>=0.8,'class'] = 'low diversity'
    df.loc[df['R_HispanicLatinx']>=0.8,'class'] = 'low diversity'
    df.loc[df['R_AfricanAmerican']>=0.8,'class'] = 'low diversity'
    df.loc[df['R_AmericanIndianAlaska']>=0.8,'class'] = 'low diversity'
    df.loc[df['R_Asian']>=0.8,'class'] = 'low diversity'
    df.loc[df['R_Other']>=0.8,'class'] = 'low diversity'

    df.loc[(df['scaled_entropy']>=0.7414)&(df['R_White']<0.45)
                             &(df['R_HispanicLatinx']<0.45)&(df['R_AfricanAmerican']<0.45)
                             &(df['R_AmericanIndianAlaska']<0.45)&(df['R_Asian']<0.45)
                             &(df['R_Other']<0.45)
                             ,'class'] = 'high diversity'

    df['class'] = df['class'].fillna('moderate diversity')
    print(df['class'].value_counts())
    
    df = df.apply(lambda x:identify_max_col(x,'R_White','R_HispanicLatinx','R_AfricanAmerican','R_AmericanIndianAlaska','R_Asian','R_Other'), axis=1)
    
    df['class_dominant'] = df['dominant_race'] + '_' + df['class']
    df.loc[df['class']=='high diversity','class_dominant'] = 'high diversity'
    print(df.class_dominant.value_counts())
    return df

In [20]:
df_ethnicity_entropy = extract_dominant_group(df_ethnicity_entropy)
df_ethnicity_entropy.head()

moderate diversity    4232
low diversity         1676
high diversity        517 
Name: class, dtype: int64
R_HispanicLatinx_moderate diversity     1907
R_White_moderate diversity              1664
R_HispanicLatinx_low diversity          1342
high diversity                          517 
R_Asian_moderate diversity              449 
R_White_low diversity                   235 
R_AfricanAmerican_moderate diversity    180 
R_AfricanAmerican_low diversity         49  
R_Asian_low diversity                   48  
R_Other_moderate diversity              3   
R_AmericanIndianAlaska_low diversity    2   
Name: class_dominant, dtype: int64


Unnamed: 0,R_AfricanAmerican,R_AmericanIndianAlaska,R_Asian,R_HispanicLatinx,R_Other,R_White,block_group_id,class,dominant_race,population,scaled_entropy,class_dominant
0,0.024465,0.0,0.01682,0.928135,0.0,0.030581,US.CA.037.104320.4,low diversity,R_HispanicLatinx,654.0,0.187165,R_HispanicLatinx_low diversity
1,0.007865,0.0,0.065169,0.206742,0.144944,0.575281,US.CA.037.187300.2,moderate diversity,R_White,1780.0,0.63623,R_White_moderate diversity
2,0.592987,0.0,0.001252,0.324358,0.046337,0.035066,US.CA.037.234502.3,moderate diversity,R_AfricanAmerican,1597.0,0.526454,R_AfricanAmerican_moderate diversity
3,0.021684,0.0,0.153571,0.52602,0.046173,0.252551,US.CA.037.400603.1,moderate diversity,R_HispanicLatinx,3920.0,0.668769,R_HispanicLatinx_moderate diversity
4,0.107566,0.0,0.238037,0.116973,0.104703,0.43272,US.CA.037.463602.2,high diversity,R_White,2445.0,0.798796,high diversity


In [10]:
if geog_clause == 'US.CA.%':
    df_ethnicity_entropy.to_pickle("../output/CA_CBG_dominantRace.pkl")
elif geog_clause == 'US.CA.037%':
    df_ethnicity_entropy.to_pickle("../output/LA_CBG_dominantRace.pkl")

In [11]:
# sum_nominator = 0
# for item in [0.45,0.45,0.1,0,0,0]:
#     if item != 0:
#         sum_nominator += item*(np.log(1/item))
# sum_nominator/np.log(6) #= 0.36


## Mapping

In [5]:
if geog_clause == 'US.CA.%':
    df_ethnicity_entropy = pd.read_pickle("../output/CA_CBG_dominantRace.pkl")
elif geog_clause == 'US.CA.037%':
    df_ethnicity_entropy = pd.read_pickle("../output/LA_CBG_dominantRace.pkl")
print(df_ethnicity_entropy.shape[0]) #6425 CBGs    

In [6]:
df_ethnicity_shp = pd.merge(df_ethnicity_entropy[['block_group_id','population','class_dominant','class','dominant_race','scaled_entropy']],cbg_geom,on='block_group_id')
df_ethnicity_shp = df_ethnicity_shp[df_ethnicity_shp.scaled_entropy>=0]
print(df_ethnicity_shp.shape[0]) #6390 CBGs
df_ethnicity_shp.head(1)

Unnamed: 0,block_group_id,population,class_dominant,class,dominant_race,scaled_entropy,geom
0,US.CA.037.119340.1,4032.0,R_HispanicLatinx_low diversity,low diversity,R_HispanicLatinx,0.124208,"POLYGON ((-118.453549 34.235433, -118.453549 34.231806999999996, -118.450254 34.231784000000005, -118.450254 34.233134, -118.45025000000001 34.23541, -118.453549 34.235433))"


In [7]:
print(df_ethnicity_shp['block_group_id'].unique().shape[0])
df_ethnicity_shp['class_dominant'].value_counts()

6390


R_HispanicLatinx_moderate diversity     1906
R_White_moderate diversity              1660
R_HispanicLatinx_low diversity          1341
high diversity                          517 
R_Asian_moderate diversity              449 
R_White_low diversity                   235 
R_AfricanAmerican_moderate diversity    180 
R_AfricanAmerican_low diversity         49  
R_Asian_low diversity                   48  
R_Other_moderate diversity              3   
R_AmericanIndianAlaska_low diversity    2   
Name: class_dominant, dtype: int64

In [8]:
df_ethnicity_shp['class_dominant'].value_counts() / len(df_ethnicity_shp) * 100

R_HispanicLatinx_moderate diversity     29.827856
R_White_moderate diversity              25.978091
R_HispanicLatinx_low diversity          20.985915
high diversity                          8.090767 
R_Asian_moderate diversity              7.026604 
R_White_low diversity                   3.677621 
R_AfricanAmerican_moderate diversity    2.816901 
R_AfricanAmerican_low diversity         0.766823 
R_Asian_low diversity                   0.751174 
R_Other_moderate diversity              0.046948 
R_AmericanIndianAlaska_low diversity    0.031299 
Name: class_dominant, dtype: float64

In [1]:
gdf_ethnicity_shp = geopd.GeoDataFrame(df_ethnicity_shp, crs="EPSG:4326", geometry='geom')
gdf_ethnicity_shp.head()

In [2]:
df_ethnicity_shp.to_file('../output/LA_CBG_dominantRace.shp', driver='ESRI Shapefile')

In [20]:
%%time
df_ethnicity_shp['category'] = df_ethnicity_shp['class_dominant']

k2 = KeplerGl(data={'h':df_ethnicity_shp[df_ethnicity_shp['category']!='R_Other_moderate diversity']}, height=700) 
with open('config/home_entropy.py', 'r') as config_file:
    k2.config = json.loads(config_file.read())
k2 # k.save_to_html(file_name='test.html')

User Guide: https://docs.kepler.gl/docs/keplergl-jupyter
CPU times: user 79 ms, sys: 15.6 ms, total: 94.6 ms
Wall time: 92.6 ms


KeplerGl(config={'version': 'v1', 'config': {'visState': {'filters': [], 'layers': [{'id': '4gb7yr', 'type': '…

# cubic work location based segregation

In [77]:
df_ethnicity_new.head()

Unnamed: 0,block_group_id,population,R_White,R_HispanicLatinx,R_AfricanAmerican,R_AmericanIndianAlaska,R_Asian,R_Other
0,US.CA.037.113303.2,1901.0,0.375066,0.471331,0.02788,0.005786,0.092057,0.02788
1,US.CA.037.134201.2,914.0,0.358862,0.420131,0.047046,0.0,0.16302,0.010941
2,US.CA.037.188100.1,1880.0,0.332979,0.413298,0.023404,0.0,0.031915,0.198404
3,US.CA.037.194500.2,963.0,0.839045,0.017653,0.006231,0.0,0.058152,0.07892
4,US.CA.037.209810.2,1316.0,0.155775,0.62234,0.054711,0.0,0.12766,0.039514


In [78]:
%%time
#select work locations

dfg_work = sql_engine.read_sql(
    f"""
    select *
    from {hw_table}
    where
        provider_id = '190199'
        and country_code = 'US'
        and snapshot_event_date = {date} 
        and tag_type_code = 'WORK'
        and block_group_id like 'US.CA.037%'   --- <<< filter recurring areas in LA
    """
)

dfg_work = dfg_work[~dfg_work['block_group_id'].isin(['US.CA.037.599100.2','US.CA.037.599000.2','US.CA.037.599000.1','US.CA.037.599000.4','US.CA.037.599000.3','US.CA.037.599100.1'])]

CPU times: user 3.72 s, sys: 183 ms, total: 3.9 s
Wall time: 22.4 s


In [2]:
# dfg_work
df_home_loc_race = pd.merge(df_home_loc[['cuebiq_id','block_group_id']],df_ethnicity_new,on='block_group_id')  
df_home_loc_race.rename(columns={'block_group_id': 'home_block'}, inplace=True)
df_home_loc_race = df_home_loc_race[df_home_loc_race['R_White'].notna()]
df_home_loc_race

In [73]:
# dfg_work['confidence_level'].unique()
# test = dfg_work.groupby('cuebiq_id').size().reset_index(name='counts')
# test[test.counts>1]

In [83]:
df_work_loc_race = pd.merge(dfg_work[['cuebiq_id','block_group_id']],df_home_loc_race,on='cuebiq_id')  
print(df_work_loc_race.shape[0]) #27k
df_work_loc_race = df_work_loc_race.drop(['cuebiq_id'], inplace=False, axis=1)
df_work_loc_race['devices'] = 1
df_work_loc_race = df_work_loc_race.groupby('block_group_id').sum().reset_index()
df_work_loc_race = df_work_loc_race[df_work_loc_race['devices']>5]
df_work_loc_race.head()

271594


  df_work_loc_race = df_work_loc_race.groupby('block_group_id').sum().reset_index()


Unnamed: 0,block_group_id,population,R_White,R_HispanicLatinx,R_AfricanAmerican,R_AmericanIndianAlaska,R_Asian,R_Other,devices
0,US.CA.037.101110.1,28698.0,7.145338,3.369432,0.302998,0.002159,2.059023,1.12105,14
1,US.CA.037.101110.2,50583.0,12.286846,8.21129,0.641664,0.130364,2.766493,0.963342,25
2,US.CA.037.101110.3,282420.0,67.406997,53.846179,4.102572,0.272035,13.759681,5.612535,145
3,US.CA.037.101122.1,33281.0,9.562052,3.792788,0.447056,0.090872,2.63846,1.468773,18
4,US.CA.037.101122.2,27863.0,6.967776,4.655852,0.152566,0.014357,2.355524,0.853924,15


In [84]:
# df_work_loc_race['total'] = df_work_loc_race['B02001_002E'] + df_work_loc_race['B02001_003E']+ df_work_loc_race['B02001_004E']+ df_work_loc_race['B02001_005E']+ df_work_loc_race['B02001_006E']
# print(df_work_loc_race.shape[0])
df_work_loc_race['R_White'] = df_work_loc_race['R_White']/df_work_loc_race['devices']
df_work_loc_race['R_AfricanAmerican'] = df_work_loc_race['R_AfricanAmerican']/df_work_loc_race['devices']
df_work_loc_race['R_AmericanIndianAlaska'] = df_work_loc_race['R_AmericanIndianAlaska']/df_work_loc_race['devices']
df_work_loc_race['R_Asian'] = df_work_loc_race['R_Asian']/df_work_loc_race['devices']
df_work_loc_race['R_HispanicLatinx'] = df_work_loc_race['R_HispanicLatinx']/df_work_loc_race['devices']
df_work_loc_race['R_Other'] = df_work_loc_race['R_Other']/df_work_loc_race['devices']

df_work_loc_entropy = df_work_loc_race.apply(lambda x:compute_scaled_entropy(x,'R_White','R_AfricanAmerican','R_HispanicLatinx','R_AmericanIndianAlaska','R_Asian','R_Other'), axis=1)
df_work_loc_entropy = df_work_loc_entropy[df_work_loc_entropy['scaled_entropy'].notna()]
df_work_loc_entropy

Unnamed: 0,block_group_id,population,R_White,R_HispanicLatinx,R_AfricanAmerican,R_AmericanIndianAlaska,R_Asian,R_Other,devices,scaled_entropy
0,US.CA.037.101110.1,28698.0,0.510381,0.240674,0.021643,0.000154,0.147073,0.080075,14,0.700135
1,US.CA.037.101110.2,50583.0,0.491474,0.328452,0.025667,0.005215,0.110660,0.038534,25,0.672684
2,US.CA.037.101110.3,282420.0,0.464876,0.371353,0.028294,0.001876,0.094894,0.038707,145,0.661886
3,US.CA.037.101122.1,33281.0,0.531225,0.210710,0.024836,0.005048,0.146581,0.081598,18,0.708016
4,US.CA.037.101122.2,27863.0,0.464518,0.310390,0.010171,0.000957,0.157035,0.056928,15,0.684521
...,...,...,...,...,...,...,...,...,...,...
6386,US.CA.037.980025.1,108203.0,0.229204,0.469824,0.120232,0.002444,0.130173,0.048123,65,0.766484
6388,US.CA.037.980028.1,2401336.0,0.212360,0.462763,0.129516,0.002487,0.145143,0.047732,1233,0.776107
6389,US.CA.037.980030.1,94515.0,0.253027,0.424919,0.075433,0.003415,0.180066,0.063139,59,0.786310
6390,US.CA.037.980031.1,417304.0,0.311659,0.409415,0.078140,0.001901,0.132839,0.066045,228,0.774489


In [86]:
df_work_loc_entropy = extract_dominant_group(df_work_loc_entropy)
df_work_loc_entropy.head()

moderate diversity    4378
high diversity        1109
low diversity         298 
Name: class, dtype: int64
R_HispanicLatinx_moderate diversity     3109
high diversity                          1109
R_White_moderate diversity              1059
R_HispanicLatinx_low diversity          298 
R_Asian_moderate diversity              189 
R_AfricanAmerican_moderate diversity    21  
Name: class_dominantclass_dominant, dtype: int64


In [109]:
df_work_ethnicity_shp = pd.merge(df_work_loc_entropy[['block_group_id','population','devices','class_dominant','class','dominant_race','scaled_entropy']],cbg_geom,on='block_group_id')
df_work_ethnicity_shp = df_work_ethnicity_shp[df_work_ethnicity_shp.scaled_entropy>=0]
df_work_ethnicity_shp.head(1)

Unnamed: 0,block_group_id,population,devices,class_dominantclass_dominant,class,dominant_race,scaled_entropy,geom
0,US.CA.037.101110.1,28698.0,14,R_White_moderate diversity,moderate diversity,R_White,0.700135,"POLYGON ((-118.28592399999998 34.255894999999995, -118.28498000000002 34.255888999999996, -118.284913 34.262463, -118.28846699999998 34.26282, -118.29130900000001 34.262854, -118.29132400000002 34.259555000000006, -118.291044 34.255927, -118.287333 34.255904, -118.28592399999998 34.255894999999995))"


In [113]:
%%time
df_work_ethnicity_shp['category'] = df_work_ethnicity_shp['class_dominant']
k3 = KeplerGl(data={'h':df_work_ethnicity_shp}, height=700) 
with open('config/work_entropy.py', 'r') as config_file:
    k3.config = json.loads(config_file.read())
k3 

User Guide: https://docs.kepler.gl/docs/keplergl-jupyter
CPU times: user 101 ms, sys: 104 µs, total: 101 ms
Wall time: 99 ms


KeplerGl(config={'version': 'v1', 'config': {'visState': {'filters': [], 'layers': [{'id': '4gb7yr', 'type': '…

# Stop location based segregation (don't run below)

In [3]:
df_home_loc_race 

In [17]:
# stop_table = f"dedicated.rsu.la_stop_uplevelled"
# visit_table = f"cuebiq.paas_cda_pe_v3.visit"
# date_plus = int((datetime.strptime(str(date), "%Y%m%d") + timedelta(days=3)).strftime("%Y%m%d"))# You see that we can limit to the first three processing dates starting from the local date of interest.


In [46]:
# %%time
# df_stop_LA = sql_engine.read_sql(
#     f"""
#     select *, cast(hour(from_iso8601_timestamp(stop_zoned_datetime)) as int) as hour
#     from {stop_table}
#     where
#         substr(stop_zoned_datetime,1,10) = '{date_formated}'
#     """
# )
# print(df_stop_LA.shape[0])
# df_stop_LA.head()

In [22]:
# %%time
# df_visit_LA = sql_engine.read_sql(f"""
#     with la_visit as (
#         select
#             *, 
#             cast(hour(from_iso8601_timestamp(zoned_datetime)) as int) as hour
#         from {visit_table}
#         where 
#             country_code = 'US'
#             and provider_id = '190199'
#             and processing_date between {date} and {date_plus}
#             and event_date = {date}
#             and admin2_id = 'US.CA.037'
#         ),
        
#         la_cbg as (
#         select
#             geography_id, geometry_wkt
#         from {geography_table}
#         where
#             country_code = 'US'
#             and geography_type_code = 'admin4'
#             and geography_id like 'US.CA.037%'     --- <<< filter geometries in LA
#         )
        
#         select
#         *
#         from la_visit s
#         inner join la_cbg c
#         on st_contains(st_geometryfromtext(c.geometry_wkt), st_point(s.lng, s.lat))
#         """
# )
# print(df_visit_LA.shape[0])
# df_visit_LA.head()

264083
CPU times: user 6.55 s, sys: 285 ms, total: 6.83 s
Wall time: 43.6 s


Unnamed: 0,admin1_id,admin2_id,brand_id,country_code,cuebiq_id,dwell_time_minutes,event_date,geohash,geoset_id,lat,...,os_name,place_id,place_version,zipcode_id,zoned_datetime,processing_date,provider_id,hour,geography_id,geometry_wkt
0,US.CA,US.CA.037,1704,US,4079472660,5.25,20220310,9qh09n2rc,9478,33.873562,...,IOS,27822215,1646828334131,US.90701,2022-03-10T14:04:59-08:00,20220311,190199,14,US.CA.037.554801.2,"POLYGON ((-118.080043 33.873053999999996, -118.08006499999999 33.875319, -118.079633 33.875325, -118.07964899999999 33.876462, -118.082302 33.876456, -118.08234999999999 33.880365999999995, -118.08238399999999 33.880365999999995, -118.08780600000001 33.880338, -118.087777 33.87675, -118.091036 33.876512, -118.09099200000001 33.872980999999996, -118.082284 33.873013, -118.080043 33.873053999999996))"
1,US.CA,US.CA.037,315,US,4433486738,7.4,20220310,9q5dy0nj3,8015,34.23432,...,IOS,25759121,1646482823500,US.91324,2022-03-10T17:01:20-08:00,20220311,190199,17,US.CA.037.113301.2,"POLYGON ((-118.553566 34.232338999999996, -118.55357599999999 34.250001, -118.56180300000001 34.249997, -118.56281400000002 34.248839, -118.56354799999998 34.247324, -118.564497 34.24274, -118.56445699999999 34.235533, -118.56399100000002 34.234860999999995, -118.56230700000002 34.234317, -118.56230700000002 34.234049999999996, -118.553566 34.232338999999996))"
2,US.CA,US.CA.037,1580,US,5188616004,42.116667,20220310,9q5f2r8pc,9546,34.188079,...,ANDROID,27861995,1646828343574,US.91411,2022-03-10T15:34:15-08:00,20220311,190199,15,US.CA.037.127806.3,"POLYGON ((-118.466183 34.190289, -118.466182 34.188475, -118.46618 34.18665, -118.46399700000002 34.186651, -118.46399800000002 34.188476, -118.457449 34.188662, -118.45745000000001 34.190294, -118.464 34.19029, -118.466183 34.190289))"
3,US.CA,US.CA.037,629,US,5145469708,8.616667,20220310,9q5g0c36q,11132,34.284601,...,IOS,31106344,1646569145525,US.91340,2022-03-10T22:16:29-08:00,20220311,190199,22,US.CA.037.320300.2,"POLYGON ((-118.43894399999998 34.282337, -118.440536 34.283833, -118.44344300000002 34.286578, -118.444265 34.287347, -118.448036 34.284586, -118.445562 34.282266, -118.44271600000002 34.279581, -118.44187300000002 34.278780999999995, -118.43924699999998 34.280677, -118.439495 34.280916999999995, -118.438839 34.281344, -118.43939600000002 34.282002, -118.43894399999998 34.282337))"
4,US.CA,US.CA.037,84,US,4839293717,8.233333,20220310,9q5c4w7zj,7699,33.961351,...,IOS,25233326,1646400555575,US.90301,2022-03-10T17:23:18-08:00,20220311,190199,17,US.CA.037.601211.2,"POLYGON ((-118.35328299999999 33.959477, -118.353303 33.961774, -118.35730900000001 33.961681999999996, -118.35733900000001 33.962931999999995, -118.361724 33.962866999999996, -118.361712 33.961669, -118.36165999999999 33.958777, -118.35724900000001 33.958833999999996, -118.35459800000001 33.959455999999996, -118.35328299999999 33.959477))"


In [46]:
# df_visit_LA_hourly = df_visit_LA[df_visit_LA['hour']==15]
# print(df_visit_LA_hourly.shape[0])
# df_visit_LA_hourly.rename(columns={'geography_id': 'block_group_id'}, inplace=True)

# df_visit_LA_hourly_race = pd.merge(df_visit_LA_hourly[['cuebiq_id','block_group_id']],df_home_loc_race,on='cuebiq_id')  
# print(df_visit_LA_hourly_race.shape[0])
# df_visit_LA_hourly_race = df_visit_LA_hourly_race.drop(['cuebiq_id'], inplace=False, axis=1)
# df_visit_LA_hourly_race['devices'] = 1
# df_visit_LA_hourly_race = df_visit_LA_hourly_race.groupby('block_group_id').sum().reset_index()
# df_visit_LA_hourly_race = df_visit_LA_hourly_race[df_visit_LA_hourly_race['devices']>5]
# df_visit_LA_hourly_race.head()
# # df_visit_LA_hourly.head()

22982
20211


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_visit_LA_hourly.rename(columns={'geography_id': 'block_group_id'}, inplace=True)
  df_visit_LA_hourly_race = df_visit_LA_hourly_race.groupby('block_group_id').sum().reset_index()


Unnamed: 0,block_group_id,R_White,R_AfricanAmerican,R_AmericanIndianAlaska,R_Asian,R_Other,devices
2,US.CA.037.101300.2,8.78061,0.072423,0.011041,1.261158,1.874768,12
3,US.CA.037.101300.3,7.932072,0.0,0.0,0.680997,0.38693,9
9,US.CA.037.103102.5,5.599671,0.180411,0.00297,1.138109,2.078839,9
10,US.CA.037.103200.2,7.67202,0.441198,0.081616,1.80551,1.999656,12
17,US.CA.037.104203.1,8.164102,1.249074,0.033158,0.524419,6.029247,16


In [44]:
# df_visit_LA_hourly_race['R_White'] = df_visit_LA_hourly_race['R_White']/df_visit_LA_hourly_race['devices']
# df_visit_LA_hourly_race['R_AfricanAmerican'] = df_visit_LA_hourly_race['R_AfricanAmerican']/df_visit_LA_hourly_race['devices']
# df_visit_LA_hourly_race['R_AmericanIndianAlaska'] = df_visit_LA_hourly_race['R_AmericanIndianAlaska']/df_visit_LA_hourly_race['devices']
# df_visit_LA_hourly_race['R_Asian'] = df_visit_LA_hourly_race['R_Asian']/df_visit_LA_hourly_race['devices']
# df_visit_LA_hourly_race['R_Other'] = df_visit_LA_hourly_race['R_Other']/df_visit_LA_hourly_race['devices']
# # df_visit_LA_hourly_race.head()

# df_la_visit_loc_entropy = df_visit_LA_hourly_race.apply(lambda x:compute_scaled_entropy(x,'R_White','R_AfricanAmerican','R_AmericanIndianAlaska','R_Asian','R_Other'), axis=1)
# df_la_visit_loc_entropy = df_la_visit_loc_entropy[df_la_visit_loc_entropy['scaled_entropy'].notna()]
# # df_la_visit_loc_entropy

# df_la_visit_loc_entropy_shp = pd.merge(df_la_visit_loc_entropy[['block_group_id','scaled_entropy','devices']],cbg_geom[['block_group_id','geom']],on='block_group_id')  
# df_la_visit_loc_entropy_shp = df_la_visit_loc_entropy_shp[df_la_visit_loc_entropy_shp.scaled_entropy>=0]
# df_la_visit_loc_entropy_shp.head(1)


Unnamed: 0,block_group_id,scaled_entropy,devices,geom
0,US.CA.037.101300.2,0.002965,12,"POLYGON ((-118.272473 34.232527, -118.266803 34.231235, -118.266733 34.240795999999996, -118.269008 34.242208999999995, -118.274034 34.245334, -118.276103 34.246483999999995, -118.276295 34.245152, -118.27395599999998 34.242992, -118.273341 34.240743, -118.27047400000001 34.236937, -118.270361 34.233942, -118.27068500000001 34.232994999999995, -118.27120200000002 34.233243, -118.271936 34.233067, -118.272473 34.232527))"


In [45]:
# %%time
# df_la_visit_loc_entropy_shp['colorscale'] = df_la_visit_loc_entropy_shp['scaled_entropy']
# k4 = KeplerGl(data={'h':df_la_visit_loc_entropy_shp}, height=700)
# with open('config/home_count_map.yml', 'r', encoding='utf-8') as config_file:
#     k4.config = yaml.load(config_file, yaml.FullLoader)
# k4 

User Guide: https://docs.kepler.gl/docs/keplergl-jupyter
CPU times: user 28.4 ms, sys: 94 µs, total: 28.5 ms
Wall time: 26.6 ms


KeplerGl(config={'config': {'mapState': {'bearing': 0, 'dragRotate': False, 'isSplit': False, 'latitude': 33.9…