# This notebook is used to infer individual's race information based on their home locations
I tried to update the analysis using the 2021 census (most recent data as of Oct 2023). However, according to the code block under "individual home location based race composition", only 18717 records can match with the home CBG data provided by Spectus. This is because different years of census use different CBG divisions. So there should be some disparities between the 2018 (Spectus) and 2021 (most recent) census. The correlation between the number of devices and the population is only 0.535. I decided not to proceed as all datasets provided by Spectus (home CBGs, ) are based on the 2018 CBG division.

In [62]:
%reload_ext sql
%config SqlMagic.autocommit=False
%config SqlMagic.autolimit=0
%config SqlMagic.autopandas=True
%config SqlMagic.displaylimit=200
%sql trino://localhost:9090/cuebiq/

import pandas as pd
import yaml
import numpy as np
import os
from pyhive import trino
import pydeck as pdk
from typing import List
import copy
import itertools
from pyquadkey2 import quadkey
from pyquadkey2.quadkey import TileAnchor, QuadKey
from datetime import datetime, timedelta
import math
import pickle

os.environ['MAPBOX_API_KEY'] = "INSERT YOUR MAPBOX TOKEN HERE"
pd.set_option('display.max_colwidth', 0)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
import warnings
warnings.filterwarnings('ignore')

In [63]:
# SQL engine
from trino.dbapi import connect 
from sqlalchemy import create_engine

class TrinoEngine():
    def __init__(self):
        conn = connect(
            host="localhost",
            port=9090,
            catalog="cuebiq"
        )
        self.cur = conn.cursor()
        self.engine = create_engine("trino://localhost:9090/cuebiq/")
    
    def execute_statement(self, query:str) -> list:
        """
        Create and drop statements.
        """
        self.cur.execute(query)
        return self.cur.fetchall()
    
    def read_sql(self, query:str) -> pd.DataFrame: 
        """
        Select and insert into operations.
        """
        return pd.read_sql(query, self.engine)

sql_engine = TrinoEngine()

In [64]:
def compute_scaled_entropy(row,col1,col2,col3,col4,col5,col6):
    sum_nominator = 0
    for item in [row[col1],row[col2],row[col3],row[col4],row[col5],row[col6]]:
        if item != 0:
            sum_nominator += item*(np.log(1/item))
    row['scaled_entropy'] = sum_nominator/np.log(6)
    return row

def identify_max_col(row,col1,col2,col3,col4,col5,col6):
    dominant = max([row[col1],row[col2],row[col3],row[col4],row[col5],row[col6]])
    if dominant == row[col1]: row['dominant_race'] = col1
    if dominant == row[col2]: row['dominant_race'] = col2
    if dominant == row[col3]: row['dominant_race'] = col3
    if dominant == row[col4]: row['dominant_race'] = col4
    if dominant == row[col5]: row['dominant_race'] = col5
    if dominant == row[col6]: row['dominant_race'] = col6
    return row

def extract_dominant_group(df):
    df.loc[df['scaled_entropy']<=0.3707,'class'] = 'low diversity'
    df.loc[df['R_White']>=0.8,'class'] = 'low diversity'
    df.loc[df['R_HispanicLatinx']>=0.8,'class'] = 'low diversity'
    df.loc[df['R_AfricanAmerican']>=0.8,'class'] = 'low diversity'
    df.loc[df['R_AmericanIndianAlaska']>=0.8,'class'] = 'low diversity'
    df.loc[df['R_Asian']>=0.8,'class'] = 'low diversity'
    df.loc[df['R_Other']>=0.8,'class'] = 'low diversity'

    df.loc[(df['scaled_entropy']>=0.7414)&(df['R_White']<0.45)
                             &(df['R_HispanicLatinx']<0.45)&(df['R_AfricanAmerican']<0.45)
                             &(df['R_AmericanIndianAlaska']<0.45)&(df['R_Asian']<0.45)
                             &(df['R_Other']<0.45)
                             ,'class'] = 'high diversity'

    df['class'] = df['class'].fillna('moderate diversity')
    print(df['class'].value_counts())
    
    df = df.apply(lambda x:identify_max_col(x,'R_White','R_HispanicLatinx','R_AfricanAmerican','R_AmericanIndianAlaska','R_Asian','R_Other'), axis=1)
    
    df['class_dominant'] = df['dominant_race'] + '_' + df['class']
    df.loc[df['class']=='high diversity','class_dominant'] = 'high diversity'
    print(df.class_dominant.value_counts())
    return df

In [91]:
# Census Block Groups typically have a population between 600 to 3000 people, this makes the data privacy safe.
hw_table = "cuebiq.paas_cda_pe_v3.device_recurring_area"
date = 20220301 # Thursday
date_formated = datetime.strptime(str(date), "%Y%m%d").strftime("%Y-%m-%d")
date_plus = int((datetime.strptime(str(date), "%Y%m%d") + timedelta(days=3)).strftime("%Y%m%d"))# You see that we can limit to the first three processing dates starting from the local date of interest.
census_data = pd.read_csv("data/ACSDT5Y2021.B03002-Data.csv",header=0).tail(-1)

In [92]:
# %%time
# # census_taxonomy_table = "cuebiq.paas_public_data.census_taxonomy"#
# # sql_engine.read_sql(f"desc {census_taxonomy_table}")
# # show code book for race
# df_census_taxonomy = sql_engine.read_sql(f"select * from {census_taxonomy_table}")
# # df_census_taxonomy

In [93]:
census_data['block_group_id'] = 'US.CA.' + census_data['GEO_ID'].str[11:14] + '.' + census_data['GEO_ID'].str[14:20] + '.' + census_data['GEO_ID'].str[-1:]
first_column = census_data.pop('block_group_id')
census_data.insert(0, 'block_group_id', first_column)
print('total number of CBG:',census_data.shape[0])
# census_data.head(1)
pd.read_csv("data/ACSDT5Y2021.B03002-Data.csv",header=0).head(2)

total number of CBG: 25607


Unnamed: 0,GEO_ID,NAME,B03002_001E,B03002_001M,B03002_001MA,B03002_001EA,B03002_002E,B03002_002EA,B03002_002M,B03002_002MA,B03002_003E,B03002_003M,B03002_003MA,B03002_003EA,B03002_004E,B03002_004M,B03002_004MA,B03002_004EA,B03002_005E,B03002_005M,B03002_005MA,B03002_005EA,B03002_006E,B03002_006M,B03002_006MA,B03002_006EA,B03002_007E,B03002_007M,B03002_007MA,B03002_007EA,B03002_008E,B03002_008M,B03002_008MA,B03002_008EA,B03002_009E,B03002_009EA,B03002_009M,B03002_009MA,B03002_010E,B03002_010M,B03002_010MA,B03002_010EA,B03002_011E,B03002_011M,B03002_011MA,B03002_011EA,B03002_012E,B03002_012M,B03002_012MA,B03002_012EA,B03002_013E,B03002_013M,B03002_013MA,B03002_013EA,B03002_014E,B03002_014M,B03002_014MA,B03002_014EA,B03002_015E,B03002_015EA,B03002_015M,B03002_015MA,B03002_016E,B03002_016M,B03002_016MA,B03002_016EA,B03002_017E,B03002_017EA,B03002_017M,B03002_017MA,B03002_018E,B03002_018EA,B03002_018M,B03002_018MA,B03002_019E,B03002_019M,B03002_019MA,B03002_019EA,B03002_020E,B03002_020M,B03002_020MA,B03002_020EA,B03002_021E,B03002_021M,B03002_021MA,B03002_021EA,Unnamed: 86
0,Geography,Geographic Area Name,Estimate!!Total:,Margin of Error!!Total:,Annotation of Margin of Error!!Total:,Annotation of Estimate!!Total:,Estimate!!Total:!!Not Hispanic or Latino:,Annotation of Estimate!!Total:!!Not Hispanic or Latino:,Margin of Error!!Total:!!Not Hispanic or Latino:,Annotation of Margin of Error!!Total:!!Not Hispanic or Latino:,Estimate!!Total:!!Not Hispanic or Latino:!!White alone,Margin of Error!!Total:!!Not Hispanic or Latino:!!White alone,Annotation of Margin of Error!!Total:!!Not Hispanic or Latino:!!White alone,Annotation of Estimate!!Total:!!Not Hispanic or Latino:!!White alone,Estimate!!Total:!!Not Hispanic or Latino:!!Black or African American alone,Margin of Error!!Total:!!Not Hispanic or Latino:!!Black or African American alone,Annotation of Margin of Error!!Total:!!Not Hispanic or Latino:!!Black or African American alone,Annotation of Estimate!!Total:!!Not Hispanic or Latino:!!Black or African American alone,Estimate!!Total:!!Not Hispanic or Latino:!!American Indian and Alaska Native alone,Margin of Error!!Total:!!Not Hispanic or Latino:!!American Indian and Alaska Native alone,Annotation of Margin of Error!!Total:!!Not Hispanic or Latino:!!American Indian and Alaska Native alone,Annotation of Estimate!!Total:!!Not Hispanic or Latino:!!American Indian and Alaska Native alone,Estimate!!Total:!!Not Hispanic or Latino:!!Asian alone,Margin of Error!!Total:!!Not Hispanic or Latino:!!Asian alone,Annotation of Margin of Error!!Total:!!Not Hispanic or Latino:!!Asian alone,Annotation of Estimate!!Total:!!Not Hispanic or Latino:!!Asian alone,Estimate!!Total:!!Not Hispanic or Latino:!!Native Hawaiian and Other Pacific Islander alone,Margin of Error!!Total:!!Not Hispanic or Latino:!!Native Hawaiian and Other Pacific Islander alone,Annotation of Margin of Error!!Total:!!Not Hispanic or Latino:!!Native Hawaiian and Other Pacific Islander alone,Annotation of Estimate!!Total:!!Not Hispanic or Latino:!!Native Hawaiian and Other Pacific Islander alone,Estimate!!Total:!!Not Hispanic or Latino:!!Some other race alone,Margin of Error!!Total:!!Not Hispanic or Latino:!!Some other race alone,Annotation of Margin of Error!!Total:!!Not Hispanic or Latino:!!Some other race alone,Annotation of Estimate!!Total:!!Not Hispanic or Latino:!!Some other race alone,Estimate!!Total:!!Not Hispanic or Latino:!!Two or more races:,Annotation of Estimate!!Total:!!Not Hispanic or Latino:!!Two or more races:,Margin of Error!!Total:!!Not Hispanic or Latino:!!Two or more races:,Annotation of Margin of Error!!Total:!!Not Hispanic or Latino:!!Two or more races:,Estimate!!Total:!!Not Hispanic or Latino:!!Two or more races:!!Two races including Some other race,Margin of Error!!Total:!!Not Hispanic or Latino:!!Two or more races:!!Two races including Some other race,Annotation of Margin of Error!!Total:!!Not Hispanic or Latino:!!Two or more races:!!Two races including Some other race,Annotation of Estimate!!Total:!!Not Hispanic or Latino:!!Two or more races:!!Two races including Some other race,"Estimate!!Total:!!Not Hispanic or Latino:!!Two or more races:!!Two races excluding Some other race, and three or more races","Margin of Error!!Total:!!Not Hispanic or Latino:!!Two or more races:!!Two races excluding Some other race, and three or more races","Annotation of Margin of Error!!Total:!!Not Hispanic or Latino:!!Two or more races:!!Two races excluding Some other race, and three or more races","Annotation of Estimate!!Total:!!Not Hispanic or Latino:!!Two or more races:!!Two races excluding Some other race, and three or more races",Estimate!!Total:!!Hispanic or Latino:,Margin of Error!!Total:!!Hispanic or Latino:,Annotation of Margin of Error!!Total:!!Hispanic or Latino:,Annotation of Estimate!!Total:!!Hispanic or Latino:,Estimate!!Total:!!Hispanic or Latino:!!White alone,Margin of Error!!Total:!!Hispanic or Latino:!!White alone,Annotation of Margin of Error!!Total:!!Hispanic or Latino:!!White alone,Annotation of Estimate!!Total:!!Hispanic or Latino:!!White alone,Estimate!!Total:!!Hispanic or Latino:!!Black or African American alone,Margin of Error!!Total:!!Hispanic or Latino:!!Black or African American alone,Annotation of Margin of Error!!Total:!!Hispanic or Latino:!!Black or African American alone,Annotation of Estimate!!Total:!!Hispanic or Latino:!!Black or African American alone,Estimate!!Total:!!Hispanic or Latino:!!American Indian and Alaska Native alone,Annotation of Estimate!!Total:!!Hispanic or Latino:!!American Indian and Alaska Native alone,Margin of Error!!Total:!!Hispanic or Latino:!!American Indian and Alaska Native alone,Annotation of Margin of Error!!Total:!!Hispanic or Latino:!!American Indian and Alaska Native alone,Estimate!!Total:!!Hispanic or Latino:!!Asian alone,Margin of Error!!Total:!!Hispanic or Latino:!!Asian alone,Annotation of Margin of Error!!Total:!!Hispanic or Latino:!!Asian alone,Annotation of Estimate!!Total:!!Hispanic or Latino:!!Asian alone,Estimate!!Total:!!Hispanic or Latino:!!Native Hawaiian and Other Pacific Islander alone,Annotation of Estimate!!Total:!!Hispanic or Latino:!!Native Hawaiian and Other Pacific Islander alone,Margin of Error!!Total:!!Hispanic or Latino:!!Native Hawaiian and Other Pacific Islander alone,Annotation of Margin of Error!!Total:!!Hispanic or Latino:!!Native Hawaiian and Other Pacific Islander alone,Estimate!!Total:!!Hispanic or Latino:!!Some other race alone,Annotation of Estimate!!Total:!!Hispanic or Latino:!!Some other race alone,Margin of Error!!Total:!!Hispanic or Latino:!!Some other race alone,Annotation of Margin of Error!!Total:!!Hispanic or Latino:!!Some other race alone,Estimate!!Total:!!Hispanic or Latino:!!Two or more races:,Margin of Error!!Total:!!Hispanic or Latino:!!Two or more races:,Annotation of Margin of Error!!Total:!!Hispanic or Latino:!!Two or more races:,Annotation of Estimate!!Total:!!Hispanic or Latino:!!Two or more races:,Estimate!!Total:!!Hispanic or Latino:!!Two or more races:!!Two races including Some other race,Margin of Error!!Total:!!Hispanic or Latino:!!Two or more races:!!Two races including Some other race,Annotation of Margin of Error!!Total:!!Hispanic or Latino:!!Two or more races:!!Two races including Some other race,Annotation of Estimate!!Total:!!Hispanic or Latino:!!Two or more races:!!Two races including Some other race,"Estimate!!Total:!!Hispanic or Latino:!!Two or more races:!!Two races excluding Some other race, and three or more races","Margin of Error!!Total:!!Hispanic or Latino:!!Two or more races:!!Two races excluding Some other race, and three or more races","Annotation of Margin of Error!!Total:!!Hispanic or Latino:!!Two or more races:!!Two races excluding Some other race, and three or more races","Annotation of Estimate!!Total:!!Hispanic or Latino:!!Two or more races:!!Two races excluding Some other race, and three or more races",
1,1500000US060014001001,"Block Group 1, Census Tract 4001, Alameda County, California",1963,457,,,1847,,441,,1429,434,,,44,41,,,0,13,,,257,100,,,0,13,,,17,29,,,100,,92,,0,13,,,100,92,,,116,83,,,95,81,,,0,13,,,0,,13,,0,13,,,0,,13,,0,,13,,21,29,,,21,29,,,0,13,,,


In [113]:
df_ethnicity_new = census_data
df_ethnicity_new['population'] = df_ethnicity_new['B03002_001E'].astype(int)
df_ethnicity_new['R_White'] = df_ethnicity_new['B03002_003E'].astype(int)/df_ethnicity_new['population']
df_ethnicity_new['R_AfricanAmerican'] = df_ethnicity_new['B03002_004E'].astype(int)/df_ethnicity_new['population']
df_ethnicity_new['R_AmericanIndianAlaska'] = df_ethnicity_new['B03002_005E'].astype(int)/df_ethnicity_new['population']
df_ethnicity_new['R_Asian'] = (df_ethnicity_new['B03002_006E'].astype(int) +  df_ethnicity_new['B03002_007E'].astype(int))/df_ethnicity_new['population'] #Asian alone + Native Hawaiian and Other Pacific Islander alone
df_ethnicity_new['R_Other'] = (df_ethnicity_new['B03002_008E'].astype(int) + df_ethnicity_new['B03002_009E'].astype(int)+df_ethnicity_new['B03002_010E'].astype(int)+df_ethnicity_new['B03002_011E'].astype(int))/df_ethnicity_new['population']
df_ethnicity_new['R_HispanicLatinx'] = df_ethnicity_new['B03002_012E'].astype(int)/df_ethnicity_new['population']
df_ethnicity_new = df_ethnicity_new[['block_group_id','population','R_White','R_HispanicLatinx','R_AfricanAmerican','R_AmericanIndianAlaska','R_Asian','R_Other']]
print(df_ethnicity_new.shape[0])#23212 for the whole CA
df_ethnicity_new.head()

25607


Unnamed: 0,block_group_id,population,R_White,R_HispanicLatinx,R_AfricanAmerican,R_AmericanIndianAlaska,R_Asian,R_Other
1,US.CA.001.400100.1,1963,0.727967,0.059093,0.022415,0.0,0.130922,0.110545
2,US.CA.001.400100.2,1361,0.664217,0.098457,0.044085,0.0,0.162381,0.061719
3,US.CA.001.400200.1,1029,0.793975,0.068027,0.005831,0.0,0.045675,0.172983
4,US.CA.001.400200.2,1009,0.623389,0.081269,0.041625,0.0,0.148662,0.203171
5,US.CA.001.400300.1,1122,0.65508,0.062389,0.0,0.006239,0.254011,0.044563


In [111]:
# %%time
# #Option 2: select HOME locations in CA
# df_home_loc = sql_engine.read_sql(
#     f"""
#     select *
#     from {hw_table}
#     where
#         provider_id = '190199'
#         and country_code = 'US'
#         and snapshot_event_date = {date} 
#         and tag_type_code = 'HOME'
#         and block_group_id like 'US.CA.%' 
#     """
# )
# df_home_loc.to_pickle("../output/df_hoemloc_wholeCA.pkl")
df_home_loc = pd.read_pickle("../output/df_hoemloc_wholeCA.pkl")

## individual home location based race composition

In [114]:
# pearson correlation between census block group population and the observed number of devices 
df_home_loc['devices'] = 1
df_weight = df_home_loc.groupby('block_group_id').sum().reset_index()
print(df_weight.shape[0],df_ethnicity_new.shape[0])
df_ethnicity_new = pd.merge(df_ethnicity_new,df_weight[['block_group_id','devices']],on='block_group_id')  
print(df_ethnicity_new.shape[0])

from scipy.stats import pearsonr
corr, pval = pearsonr(df_ethnicity_new['population'], df_ethnicity_new['devices'])
print("Correlation Coefficient:", corr)
print("P-value:", pval)

23162 25607
18717
Correlation Coefficient: 0.535201876401364
P-value: 0.0


In [32]:
print(df_ethnicity_new.shape[0])
df_ethnicity_new['weight'] = df_ethnicity_new.population/df_ethnicity_new.devices
df_ethnicity_new = df_ethnicity_new[df_ethnicity_new['devices']>5]
print(df_ethnicity_new.shape[0])
df_ethnicity_new.head()

10781
10733


Unnamed: 0,block_group_id,population,R_White,R_HispanicLatinx,R_AfricanAmerican,R_AmericanIndianAlaska,R_Asian,R_Other,devices,weight
0,US.CA.037.104701.1,1758.0,0.0,0.995449,0.004551,0.0,0.0,0.0,35,50.228571
1,US.CA.037.111400.1,2689.0,0.383414,0.361101,0.027148,0.0,0.228338,0.0,87,30.908046
2,US.CA.037.113321.1,4143.0,0.283611,0.381849,0.089549,0.0,0.16196,0.083032,145,28.572414
3,US.CA.037.115202.1,2066.0,0.386738,0.217812,0.101162,0.0,0.188771,0.105518,84,24.595238
4,US.CA.037.117301.2,1429.0,0.395381,0.425472,0.012596,0.0,0.075577,0.090973,59,24.220339


In [33]:
df_indi_home_loc_race = pd.merge(df_home_loc[['cuebiq_id','block_group_id']],df_ethnicity_new,on='block_group_id')  
df_indi_home_loc_race.rename(columns={'block_group_id': 'home_block'}, inplace=True)
df_indi_home_loc_race = df_indi_home_loc_race[df_indi_home_loc_race['R_White'].notna()]

considering sample weight

In [2]:
df_indi_home_loc_race['R_White'] = df_indi_home_loc_race['R_White']*df_indi_home_loc_race['weight']
df_indi_home_loc_race['R_HispanicLatinx'] = df_indi_home_loc_race['R_HispanicLatinx']*df_indi_home_loc_race['weight']
df_indi_home_loc_race['R_AfricanAmerican'] = df_indi_home_loc_race['R_AfricanAmerican']*df_indi_home_loc_race['weight']
df_indi_home_loc_race['R_AmericanIndianAlaska'] = df_indi_home_loc_race['R_AmericanIndianAlaska']*df_indi_home_loc_race['weight']
df_indi_home_loc_race['R_Asian'] = df_indi_home_loc_race['R_Asian']*df_indi_home_loc_race['weight']
df_indi_home_loc_race['R_Other'] = df_indi_home_loc_race['R_Other']*df_indi_home_loc_race['weight']
df_indi_home_loc_race

In [35]:
df_indi_home_loc_race.to_pickle("./output/GreaterLA_Individual_race_composition.pkl")  

In [2]:
df_indi_home_loc_race = pd.read_pickle("../output/GreaterLA_Individual_race_composition.pkl")
df_indi_home_loc_race.head(1)

Unnamed: 0,cuebiq_id,home_block,population,R_White,R_HispanicLatinx,R_AfricanAmerican,R_AmericanIndianAlaska,R_Asian,R_Other,devices,weight
0,4374082992,US.CA.037.553802.3,2674.0,1.0,24.795455,1.772727,0.0,0.522727,2.295455,88,30.386364
