# This notebook infers individuals' race information from their home locations

In [1]:
%reload_ext sql
%config SqlMagic.autocommit=False
%config SqlMagic.autolimit=0
%config SqlMagic.autopandas=True
%config SqlMagic.displaylimit=200
%sql trino://localhost:9090/cuebiq/

import pandas as pd
import yaml
import numpy as np
import os
from pyhive import trino
import pydeck as pdk
from typing import List
import copy
import itertools
from pyquadkey2 import quadkey
from pyquadkey2.quadkey import TileAnchor, QuadKey
from datetime import datetime, timedelta
import math
import pickle

# Only fall back to the placeholder when no token is set: a plain assignment would
# overwrite a real MAPBOX_API_KEY that is already exported in the environment.
os.environ.setdefault('MAPBOX_API_KEY', "INSERT YOUR MAPBOX TOKEN HERE")

# Pandas display options for the rich output of the cells below.
pd.set_option('display.max_colwidth', 0)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# NOTE: this silences every warning for the rest of the session.
import warnings
warnings.filterwarnings('ignore')

In [2]:
# SQL engine
from trino.dbapi import connect 
from sqlalchemy import create_engine

class TrinoEngine():
    """
    Small helper bundling a Trino DB-API cursor and a SQLAlchemy engine.

    Parameters
    ----------
    host : str
        Trino coordinator host (default ``"localhost"``).
    port : int
        Trino coordinator port (default ``9090``).
    catalog : str
        Catalog used by both the DB-API connection and the engine URL
        (default ``"cuebiq"``).

    Attributes
    ----------
    conn : the DB-API connection returned by ``connect``.
    cur : cursor opened on ``conn``; used by ``execute_statement``.
    engine : SQLAlchemy engine; used by ``read_sql``.
    """
    def __init__(self, host: str = "localhost", port: int = 9090, catalog: str = "cuebiq"):
        # Keep the connection on the instance so it stays reachable (e.g. to close it).
        self.conn = connect(
            host=host,
            port=port,
            catalog=catalog
        )
        self.cur = self.conn.cursor()
        # Same target as the DB-API connection; the defaults give "trino://localhost:9090/cuebiq/".
        self.engine = create_engine(f"trino://{host}:{port}/{catalog}/")

    def execute_statement(self, query:str) -> list:
        """
        Create and drop statements.

        Executes ``query`` on the cursor and returns everything ``fetchall()`` yields.
        """
        self.cur.execute(query)
        return self.cur.fetchall()

    def read_sql(self, query:str) -> pd.DataFrame:
        """
        Select and insert into operations.

        Runs ``query`` through ``pd.read_sql`` on the SQLAlchemy engine and returns the DataFrame.
        """
        return pd.read_sql(query, self.engine)

sql_engine = TrinoEngine()

In [3]:
def compute_scaled_entropy(row,col1,col2,col3,col4,col5,col6):
    """Store the entropy of six shares, scaled by log(6), in ``row['scaled_entropy']``.

    Each non-zero share ``p`` read from ``row[col1] .. row[col6]`` contributes
    ``p * log(1 / p)``; zero shares contribute nothing. The sum is divided by
    ``log(6)`` (the number of columns). The row is modified in place and returned.
    """
    shares = [row[name] for name in (col1, col2, col3, col4, col5, col6)]
    # Zero shares are skipped so that 1/p is never evaluated at p == 0.
    entropy = sum(p*(np.log(1/p)) for p in shares if p != 0)
    row['scaled_entropy'] = entropy/np.log(6)
    return row

def identify_max_col(row,col1,col2,col3,col4,col5,col6):
    """Record in ``row['dominant_race']`` the name of the column holding the largest value.

    When several columns share the maximum, the one listed last wins. If no column
    compares equal to the maximum, ``'dominant_race'`` is left unset. The row is
    modified in place and returned.
    """
    candidates = (col1, col2, col3, col4, col5, col6)
    dominant = max([row[name] for name in candidates])
    # Scan from the last column backwards: the first hit is the last-listed match.
    for name in reversed(candidates):
        if dominant == row[name]:
            row['dominant_race'] = name
            break
    return row

def extract_dominant_group(df):
    """Label every row with a diversity class and its dominant race column.

    Classes written to ``df['class']`` (on the frame that was passed in):
      * 'low diversity'      - scaled_entropy <= 0.3707, or any race share >= 0.8
      * 'high diversity'     - scaled_entropy >= 0.7414 and every race share < 0.45
      * 'moderate diversity' - everything still unlabelled

    ``dominant_race`` is then filled row by row through ``identify_max_col`` and
    ``class_dominant`` is built as '<dominant_race>_<class>', except for high-diversity
    rows, which get plain 'high diversity'. The value counts of both columns are printed.

    Returns the DataFrame produced by the row-wise ``apply`` with the extra columns.
    """
    race_cols = ['R_White','R_HispanicLatinx','R_AfricanAmerican',
                 'R_AmericanIndianAlaska','R_Asian','R_Other']

    # Low diversity: very low entropy, or a single group with at least 80% of the population.
    df.loc[df['scaled_entropy']<=0.3707,'class'] = 'low diversity'
    for race in race_cols:
        df.loc[df[race]>=0.8,'class'] = 'low diversity'

    # High diversity: high entropy and no group reaching 45%.
    is_high = df['scaled_entropy']>=0.7414
    for race in race_cols:
        is_high = is_high & (df[race]<0.45)
    df.loc[is_high,'class'] = 'high diversity'

    # Whatever is left falls in between.
    df['class'] = df['class'].fillna('moderate diversity')
    print(df['class'].value_counts())

    df = df.apply(lambda record: identify_max_col(record, *race_cols), axis=1)

    df['class_dominant'] = df['dominant_race'] + '_' + df['class']
    df.loc[df['class']=='high diversity','class_dominant'] = 'high diversity'
    print(df.class_dominant.value_counts())
    return df

In [4]:
# Census Block Groups typically hold between 600 and 3000 people, which keeps block-group level data privacy safe.
hw_table = "cuebiq.paas_cda_pe_v3.device_recurring_area"
census_data_table = "cuebiq.paas_public_data.census_data"

# Date as an integer in YYYYMMDD form (alternative: 20220301). Note: 2019-03-01 is a Friday.
date = 20190301
# Same date as an ISO 'YYYY-MM-DD' string.
date_formated = datetime.strptime(str(date), "%Y%m%d").date().isoformat()
# Three days later, again as a YYYYMMDD integer: processing can be limited to the
# first three processing dates starting from the local date of interest.
date_plus = int(format(datetime.strptime(str(date), "%Y%m%d") + timedelta(days=3), "%Y%m%d"))

In [19]:
# %%time
# # census_taxonomy_table = "cuebiq.paas_public_data.census_taxonomy"#
# # sql_engine.read_sql(f"desc {census_taxonomy_table}")
# # show code book for race
# df_census_taxonomy = sql_engine.read_sql(f"select * from {census_taxonomy_table}")
# # df_census_taxonomy

In [17]:
%%time
print(date)
# #Option 1: select HOME locations in LA
# # df_home_loc = sql_engine.read_sql(
# #     f"""
# #     select *
# #     from {hw_table}
# #     where
# #         provider_id = '190199'
# #         and country_code = 'US'
# #         and snapshot_event_date = {date} 
# #         and tag_type_code = 'HOME'
# #         and block_group_id like 'US.CA.037%'   --- <<< filter recurring areas in LA
# #     """
# # )
# # df_home_loc = df_home_loc[~df_home_loc['block_group_id'].isin(['US.CA.037.599100.2','US.CA.037.599000.2','US.CA.037.599000.1','US.CA.037.599000.4','US.CA.037.599000.3','US.CA.037.599100.1'])]

#Option 2: select HOME locations in CA
# df_home_loc = sql_engine.read_sql(
#     f"""
#     select *
#     from {hw_table}
#     where
#         provider_id = '190199'
#         and country_code = 'US'
#         and snapshot_event_date = {date} 
#         and tag_type_code = 'HOME'
#         and block_group_id like 'US.CA.%' 
#     """
# )
# df_home_loc.to_pickle("../output/df_hoemloc_wholeCA_"+str(date)+".pkl")
# df_home_loc = pd.read_pickle("../output/df_hoemloc_wholeCA_"+str(date)+".pkl")

#Option 3: select HOME locations in Greater LA
# Greater LA: Ventura County 111, San Bernardino County 071, Riverside County 065, Los Angeles County 037, Orange County 059
# df_home_loc = sql_engine.read_sql(
#     f""" 
#     select *
#     from {hw_table}
#     where
#         provider_id = '190199'
#         and country_code = 'US'
#         and snapshot_event_date = {date} 
#         and tag_type_code = 'HOME'
#         and (block_group_id like 'US.CA.037%' or block_group_id like 'US.CA.111%'
#         or block_group_id like 'US.CA.071%' or block_group_id like 'US.CA.065%'
#         or block_group_id like 'US.CA.059%')
#     """
# )
# df_home_loc.to_pickle("../output/df_hoemloc_GreaterLA_"+str(date)+".pkl")
# Load the cached Greater LA HOME locations for `date` (written by the commented-out Option 3 query above).
# The "hoemloc" spelling is part of the existing file name. Only unpickle files you produced yourself.
df_home_loc = pd.read_pickle(f"../output/df_hoemloc_GreaterLA_{date}.pkl")
print(len(df_home_loc))

20190301
526144
CPU times: user 360 ms, sys: 71.9 ms, total: 432 ms
Wall time: 431 ms


In [18]:
%%time
### block_group level ethnicity census data in LA
# ethnicity_subquery = f"""
#     ethnicity_tmp as ( 
#         select
#             geography_id_2 as block_group_id,
#             census_variable_code as ethnicity,
#             value,
#             sum(value) over (partition by geography_id) as total_value
#         from {census_data_table}
#         where 
#             segment_group_id = 'HISPANIC_OR_LATINO_ETHNICITY'
#             and admin_level = 'block_group'
#             and geography_id_2 like 'US.CA.037%'
#             and census_variable_code != 'B03002_001E'
#             and census_variable_code != 'B03002_002E'
#             and census_variable_code != 'B03002_013E'
#             and census_variable_code != 'B03002_014E'
#             and census_variable_code != 'B03002_015E'
#             and census_variable_code != 'B03002_016E'
#             and census_variable_code != 'B03002_017E'
#             and census_variable_code != 'B03002_018E'
#             and census_variable_code != 'B03002_019E'
#             and census_variable_code != 'B03002_020E'
#             and census_variable_code != 'B03002_021E'            
#     ),
    
#     ethnicity as (
#         select
#             block_group_id,
#             total_value as population,
#             map_agg(ethnicity, value/total_value) as ethnicity
#         from ethnicity_tmp
#         group by 1, 2
#     )
#     """

# ### block_group level ethnicity census data in CA
# ethnicity_subquery = f"""
#     ethnicity_tmp as ( 
#         select
#             geography_id_2 as block_group_id,
#             census_variable_code as ethnicity,
#             value,
#             sum(value) over (partition by geography_id) as total_value
#         from {census_data_table}
#         where 
#             segment_group_id = 'HISPANIC_OR_LATINO_ETHNICITY'
#             and admin_level = 'block_group'
#             and geography_id_2 like 'US.CA.%' 
#             and census_variable_code != 'B03002_001E'
#             and census_variable_code != 'B03002_002E'
#             and census_variable_code != 'B03002_013E'
#             and census_variable_code != 'B03002_014E'
#             and census_variable_code != 'B03002_015E'
#             and census_variable_code != 'B03002_016E'
#             and census_variable_code != 'B03002_017E'
#             and census_variable_code != 'B03002_018E'
#             and census_variable_code != 'B03002_019E'
#             and census_variable_code != 'B03002_020E'
#             and census_variable_code != 'B03002_021E'            
#     ),
    
#     ethnicity as (
#         select
#             block_group_id,
#             total_value as population,
#             map_agg(ethnicity, value/total_value) as ethnicity
#         from ethnicity_tmp
#         group by 1, 2
#     )
#     """

### block_group level ethnicity census data in Greater LA
# Two CTEs are defined here and spliced into the final query below:
#   ethnicity_tmp - one row per (block group, census variable) holding the raw value and a
#                   windowed sum of the values that pass the filter;
#   ethnicity     - one row per block group with that sum as `population` and a map
#                   census_variable_code -> value / total_value.
# The LIKE prefixes 037 / 111 / 071 / 065 / 059 are Los Angeles, Ventura, San Bernardino,
# Riverside and Orange counties.
# The excluded B03002_* codes are presumably total / subtotal rows, so that the remaining codes
# partition the population - TODO confirm against the census taxonomy table.
# NOTE(review): the window partitions by geography_id while the selected key is geography_id_2 -
# confirm both identify the same block group.
ethnicity_subquery = f"""
    ethnicity_tmp as ( 
        select
            geography_id_2 as block_group_id,
            census_variable_code as ethnicity,
            value,
            sum(value) over (partition by geography_id) as total_value
        from {census_data_table}
        where 
            segment_group_id = 'HISPANIC_OR_LATINO_ETHNICITY'
            and admin_level = 'block_group'
            and (geography_id_2 like 'US.CA.037%' or geography_id_2 like 'US.CA.111%'
            or geography_id_2 like 'US.CA.071%' or geography_id_2 like 'US.CA.065%'
            or geography_id_2 like 'US.CA.059%')
            and census_variable_code != 'B03002_001E'
            and census_variable_code != 'B03002_002E'
            and census_variable_code != 'B03002_013E'
            and census_variable_code != 'B03002_014E'
            and census_variable_code != 'B03002_015E'
            and census_variable_code != 'B03002_016E'
            and census_variable_code != 'B03002_017E'
            and census_variable_code != 'B03002_018E'
            and census_variable_code != 'B03002_019E'
            and census_variable_code != 'B03002_020E'
            and census_variable_code != 'B03002_021E'            
    ),
    
    ethnicity as (
        select
            block_group_id,
            total_value as population,
            map_agg(ethnicity, value/total_value) as ethnicity
        from ethnicity_tmp
        group by 1, 2
    )
    """

# Run the CTEs and load the result of the `ethnicity` CTE into pandas
# (columns: block_group_id, population, ethnicity).
df_ethnicity = sql_engine.read_sql(
    f"""
    with 
    {ethnicity_subquery}
    select * from ethnicity
    """
)

print('number of CBG: ',df_ethnicity.shape[0]) #Greater LA: 10800, Whole CA:23212

def split_dict_to_columns(row, col):
    """Copy every key/value pair of the mapping stored in ``row[col]`` onto ``row`` itself.

    New keys are added to the row (existing ones are overwritten); the row is then
    returned wrapped in ``pd.Series``.
    """
    mapping = row[col]
    for name, share in mapping.items():
        row[name] = share
    return pd.Series(row)

# Expand the per-row ethnicity map into one column per census variable code, derive the six
# race share columns from those codes, and keep only the identifying and race columns.
df_ethnicity_new = (
    df_ethnicity
    .apply(split_dict_to_columns, axis=1, col='ethnicity')
    .assign(
        R_White=lambda d: d['B03002_003E'],
        R_AfricanAmerican=lambda d: d['B03002_004E'],
        R_AmericanIndianAlaska=lambda d: d['B03002_005E'],
        # Asian alone + Native Hawaiian and Other Pacific Islander alone
        R_Asian=lambda d: d['B03002_006E'] + d['B03002_007E'],
        R_Other=lambda d: d['B03002_008E'] + d['B03002_009E'] + d['B03002_010E'] + d['B03002_011E'],
        R_HispanicLatinx=lambda d: d['B03002_012E'],
    )
    [['block_group_id','population','R_White','R_HispanicLatinx','R_AfricanAmerican','R_AmericanIndianAlaska','R_Asian','R_Other']]
)
print('number of CBG: ',len(df_ethnicity_new))#23212 for the whole CA
df_ethnicity_new.head()

number of CBG:  10800
number of CBG:  10800
CPU times: user 43.6 s, sys: 27.8 ms, total: 43.6 s
Wall time: 50.4 s


Unnamed: 0,block_group_id,population,R_White,R_HispanicLatinx,R_AfricanAmerican,R_AmericanIndianAlaska,R_Asian,R_Other
0,US.CA.037.120010.1,1705.0,0.09912,0.734311,0.043402,0.0,0.123167,0.0
1,US.CA.037.135111.1,1033.0,0.496612,0.471442,0.0,0.0,0.031946,0.0
2,US.CA.037.141201.3,628.0,0.75,0.181529,0.015924,0.0,0.052548,0.0
3,US.CA.037.185204.1,1250.0,0.3712,0.204,0.0152,0.0,0.3232,0.0864
4,US.CA.037.190510.1,2790.0,0.419355,0.398925,0.036918,0.0,0.114695,0.030108


In [19]:
# print(df_ethnicity.shape[0])
# df_ethnicity.head()

## Race composition based on individual home location

In [20]:
# Pearson correlation between census block group population and the observed number of devices
from scipy.stats import pearsonr

# One device per row of df_home_loc; summing the counter gives the devices per block group.
df_home_loc['devices'] = 1
# Aggregate only the counter: summing every column would also try to "sum" identifiers and
# text columns, which concatenates or raises in recent pandas versions.
df_weight = df_home_loc.groupby('block_group_id')['devices'].sum().reset_index()
print(df_weight.shape[0],df_ethnicity_new.shape[0])
# Drop a 'devices' column left over from an earlier run of this cell; otherwise the merge
# would produce devices_x / devices_y and the lookups below would fail.
df_ethnicity_new = pd.merge(
    df_ethnicity_new.drop(columns=['devices'], errors='ignore'),
    df_weight[['block_group_id','devices']],
    on='block_group_id',
)
print(df_ethnicity_new.shape[0])

corr, pval = pearsonr(df_ethnicity_new['population'], df_ethnicity_new['devices'])
print("Correlation Coefficient:", corr)
print("P-value:", pval)

10778 10800
10778
Correlation Coefficient: 0.7536171332102021
P-value: 0.0


In [21]:
# Row count before block groups with few devices are removed.
print(len(df_ethnicity_new))
# weight = census population of the block group divided by the devices observed there
df_ethnicity_new['weight'] = df_ethnicity_new['population'] / df_ethnicity_new['devices']
# Keep only block groups observed through more than 5 devices.
df_ethnicity_new = df_ethnicity_new.loc[df_ethnicity_new['devices'] > 5]
print(len(df_ethnicity_new))
df_ethnicity_new.head()

10778
10682


Unnamed: 0,block_group_id,population,R_White,R_HispanicLatinx,R_AfricanAmerican,R_AmericanIndianAlaska,R_Asian,R_Other,devices,weight
0,US.CA.037.120010.1,1705.0,0.09912,0.734311,0.043402,0.0,0.123167,0.0,36,47.361111
1,US.CA.037.135111.1,1033.0,0.496612,0.471442,0.0,0.0,0.031946,0.0,30,34.433333
2,US.CA.037.141201.3,628.0,0.75,0.181529,0.015924,0.0,0.052548,0.0,26,24.153846
3,US.CA.037.185204.1,1250.0,0.3712,0.204,0.0152,0.0,0.3232,0.0864,26,48.076923
4,US.CA.037.190510.1,2790.0,0.419355,0.398925,0.036918,0.0,0.114695,0.030108,91,30.659341


In [22]:
# Give every device the race shares, population, device count and weight of its home block group
# (inner join on block_group_id), then rename the key to 'home_block'.
df_indi_home_loc_race = (
    df_home_loc[['cuebiq_id','block_group_id']]
    .merge(df_ethnicity_new, on='block_group_id')
    .rename(columns={'block_group_id': 'home_block'})
)
# Drop devices whose block group has no usable R_White share.
df_indi_home_loc_race = df_indi_home_loc_race[df_indi_home_loc_race['R_White'].notna()]

Considering sample weight: each race share is multiplied by the block group's weight (census population / observed devices).

In [23]:
# Scale each race share by the block-group weight, overwriting the share columns in place.
# NOTE: this cell is not idempotent - running it a second time multiplies by the weight again.
for race_col in ['R_White', 'R_HispanicLatinx', 'R_AfricanAmerican',
                 'R_AmericanIndianAlaska', 'R_Asian', 'R_Other']:
    df_indi_home_loc_race[race_col] = df_indi_home_loc_race[race_col] * df_indi_home_loc_race['weight']
df_indi_home_loc_race

Unnamed: 0,cuebiq_id,home_block,population,R_White,R_HispanicLatinx,R_AfricanAmerican,R_AmericanIndianAlaska,R_Asian,R_Other,devices,weight
0,1847634951,US.CA.065.044510.2,2497.0,10.597561,16.231707,1.853659,0.341463,0.304878,1.121951,82,30.45122
1,1465482154,US.CA.065.044510.2,2497.0,10.597561,16.231707,1.853659,0.341463,0.304878,1.121951,82,30.45122
2,1766623679,US.CA.065.044510.2,2497.0,10.597561,16.231707,1.853659,0.341463,0.304878,1.121951,82,30.45122
3,1815054636,US.CA.065.044510.2,2497.0,10.597561,16.231707,1.853659,0.341463,0.304878,1.121951,82,30.45122
4,772983654,US.CA.065.044510.2,2497.0,10.597561,16.231707,1.853659,0.341463,0.304878,1.121951,82,30.45122
...,...,...,...,...,...,...,...,...,...,...,...
525818,828791396,US.CA.037.234000.3,808.0,11.000000,3.500000,77.250000,0.000000,2.500000,6.750000,8,101.00000
525819,867883715,US.CA.037.234000.3,808.0,11.000000,3.500000,77.250000,0.000000,2.500000,6.750000,8,101.00000
525820,1732058303,US.CA.037.234000.3,808.0,11.000000,3.500000,77.250000,0.000000,2.500000,6.750000,8,101.00000
525821,1369127249,US.CA.037.234000.3,808.0,11.000000,3.500000,77.250000,0.000000,2.500000,6.750000,8,101.00000


In [24]:
# Save the weighted per-device table for `date`.
# Alternative file stems: CA_Individual_race_composition, GreaterLA_Individual_race_composition
df_indi_home_loc_race.to_pickle(f"../output/GreaterLA_Individual_race_composition_{date}.pkl")

In [25]:
# Reload the saved per-device table for `date` and show its first row.
# Alternative file stems: CA_Individual_race_composition, GreaterLA_Individual_race_composition
df_indi_home_loc_race = pd.read_pickle(f"../output/GreaterLA_Individual_race_composition_{date}.pkl")
df_indi_home_loc_race.head(1)

Unnamed: 0,cuebiq_id,home_block,population,R_White,R_HispanicLatinx,R_AfricanAmerican,R_AmericanIndianAlaska,R_Asian,R_Other,devices,weight
0,1847634951,US.CA.065.044510.2,2497.0,10.597561,16.231707,1.853659,0.341463,0.304878,1.121951,82,30.45122
