In [1]:
# Load (or reload) the `sql` IPython extension, which supplies the %sql magic
# and the SqlMagic configurable set below.
%reload_ext sql
# SqlMagic session settings: no autocommit, autolimit of 0, results handed back
# through the autopandas option, and a display limit of 200.
# NOTE(review): autolimit=0 presumably means "no row limit" — confirm against
# the extension's documentation.
%config SqlMagic.autocommit=False
%config SqlMagic.autolimit=0
%config SqlMagic.autopandas=True
%config SqlMagic.displaylimit=200
# Point the %sql magic at the Trino endpoint on localhost:9090 (path "cuebiq").
%sql trino://localhost:9090/cuebiq/

import pandas as pd
import yaml
import numpy as np
import os
from pyhive import trino
import pydeck as pdk
from typing import List
import json
import copy
import itertools
# import geohash
import geopandas as geopd
from pyquadkey2 import quadkey
from pyquadkey2.quadkey import TileAnchor, QuadKey
from h3 import h3
import seaborn as sns
from datetime import datetime, timedelta
import math
import pickle

# Mapbox token (presumably read by the mapping libraries imported above).
# setdefault only installs the placeholder when no token is configured, so a
# real MAPBOX_API_KEY already exported in the environment is not clobbered.
os.environ.setdefault('MAPBOX_API_KEY', "INSERT YOUR MAPBOX TOKEN HERE")

# pandas display options for wide query results.
pd.set_option('display.max_colwidth', 0)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from pandas/SQLAlchemy.
import warnings
warnings.filterwarnings('ignore')

In [2]:
# SQL engine
from trino.dbapi import connect 
from sqlalchemy import create_engine

class TrinoEngine():
    """Small helper bundling a Trino DB-API cursor with a SQLAlchemy engine.

    The cursor is used for statements whose rows are fetched as plain Python
    lists; the engine is used for queries loaded straight into pandas.

    The instance can be used as a context manager so that the connection,
    cursor and engine are released when the block exits.
    """

    def __init__(self, host: str = "localhost", port: int = 9090, catalog: str = "cuebiq"):
        """
        Open the DB-API connection/cursor and build the SQLAlchemy engine.

        Args:
            host: Trino coordinator host name.
            port: Trino coordinator port.
            catalog: Catalog used for the connection and in the engine URL.
        """
        # Keep a handle on the connection so it can be closed later; holding
        # only the cursor leaves the connection impossible to release.
        self.conn = connect(
            host=host,
            port=port,
            catalog=catalog
        )
        self.cur = self.conn.cursor()
        # With the default arguments this is "trino://localhost:9090/cuebiq/".
        self.engine = create_engine(f"trino://{host}:{port}/{catalog}/")

    def execute_statement(self, query:str) -> list:
        """
        Create and drop statements.

        Runs `query` on the DB-API cursor and returns everything the cursor
        yields from `fetchall()`.
        """
        self.cur.execute(query)
        return self.cur.fetchall()

    def read_sql(self, query:str) -> pd.DataFrame:
        """
        Select and insert into operations.

        Runs `query` through `pd.read_sql` on the SQLAlchemy engine and returns
        the resulting DataFrame.
        """
        return pd.read_sql(query, self.engine)

    def close(self) -> None:
        """Release the cursor, the connection and the engine's pooled connections."""
        cur = getattr(self, "cur", None)
        if cur is not None:
            cur.close()
        conn = getattr(self, "conn", None)
        if conn is not None:
            conn.close()
        engine = getattr(self, "engine", None)
        if engine is not None:
            engine.dispose()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        self.close()

# Single engine instance shared by the query cells in this notebook; creating it
# opens the Trino connection immediately.
sql_engine = TrinoEngine()

In [3]:
# Census Block Groups typically have a population of 600 to 3,000 people, which
# makes the data privacy-safe.
# Fully qualified table names interpolated into the SQL queries in this notebook.
geography_table = "cuebiq.paas_cda_pe_v3.geography_registry"
visit_table = "cuebiq.paas_cda_pe_v3.visit"

# A. Extract visit table - one day

In [6]:
# Local event date of interest, written as a yyyymmdd integer.
date = 20220310  # Thursday

# Upper bound for processing_date: the scan can be limited to the first three
# processing dates starting from the local date of interest.
date_plus = int(
    format(
        datetime.strptime(f"{date:d}", "%Y%m%d") + timedelta(days=3),
        "%Y%m%d",
    )
)

In [66]:
# sql_engine.read_sql(f"""
# SELECT from_iso8601_timestamp('2022-03-10T13:54:09-08:00') AT TIME ZONE 'UTC'
# """)

In [65]:
# sql_engine.read_sql(f"""
# SELECT from_iso8601_timestamp('2022-03-10T13:54:09-08:00') AT TIME ZONE '-08:00'
# """)

In [28]:
# test = sql_engine.read_sql(f"""
#         select
#             *, 
#             from_iso8601_timestamp(zoned_datetime) as start_time, 
#             date_add('second', cast(dwell_time_minutes * 60 as bigint), from_iso8601_timestamp(zoned_datetime)) as end_time
#         from {visit_table}
#         where 
#             country_code = 'US'
#             and provider_id = '190199'
#             and processing_date between {date} and {date_plus}
#             and event_date = {date}
#             and admin2_id = 'US.CA.037'
#             and dwell_time_minutes < 1440
#         limit 10
#         """
# )
# test.head()

In [None]:
# from_iso8601_timestamp(zoned_datetime) as start_time, 
# date_add('second', cast(dwell_time_minutes * 60 as bigint), from_iso8601_timestamp(zoned_datetime)) as end_time

In [1]:
%%time
# 264083
# CPU times: user 12.2 s, sys: 387 ms, total: 12.6 s
# Wall time: 1min 7s

# One-day extract of visits filtered to admin2 'US.CA.037', each joined to the
# admin4 geometry (block group) that contains the visit point.
#
# Only the visit columns plus the matched geography_id are selected. The
# geometry WKT is needed for the spatial join but not in the result, so it is
# no longer shipped to the client for every joined row and then discarded.
df_visit_LA = sql_engine.read_sql(f"""
    with la_visit as (
        select
            *
        from {visit_table}
        where 
            country_code = 'US'
            and provider_id = '190199'
            and processing_date between {date} and {date_plus}
            and event_date = {date}
            and admin2_id = 'US.CA.037'
            and dwell_time_minutes < 1440
        ),
        
        la_cbg as (
        select
            geography_id, geometry_wkt
        from {geography_table}
        where
            country_code = 'US'
            and geography_type_code = 'admin4'
            and geography_id like 'US.CA.037%'     --- <<< filter geometries in LA
        )
        
        select
            s.*,
            c.geography_id
        from la_visit s
        inner join la_cbg c
        on st_contains(st_geometryfromtext(c.geometry_wkt), st_point(s.lng, s.lat))
        """
)

print(df_visit_LA.shape[0])
# Drop visit columns that are constant for this extract or unused downstream.
# Rebinding (instead of inplace=True) keeps the step free of in-place mutation.
df_visit_LA = df_visit_LA.drop(
    columns=['zipcode_id', 'place_version', 'processing_date', 'country_code',
             'admin1_id', 'device_type_code', 'os_name', 'provider_id']
)
df_visit_LA.head()

In [None]:
%%time
# Add start_time, end_time, start_hour and end_hour to the one-day visit frame.
# Only the first 19 characters of zoned_datetime are parsed, so anything after
# the seconds field (e.g. a UTC offset suffix) is cut off rather than converted.
# Values that fail to parse become NaT because of errors='coerce'.
df_visit_LA['start_time'] = pd.to_datetime(
    df_visit_LA['zoned_datetime'].astype(str).str.slice(stop=19),
    errors='coerce',
)
# end_time = start_time + dwell time (dwell_time_minutes interpreted as minutes).
df_visit_LA['end_time'] = df_visit_LA['start_time'] + pd.to_timedelta(
    df_visit_LA['dwell_time_minutes'], unit='m'
)
df_visit_LA['start_hour'] = df_visit_LA['start_time'].dt.hour
df_visit_LA['end_hour'] = df_visit_LA['end_time'].dt.hour
df_visit_LA.head()

In [None]:
# df_visit_LA.to_pickle("./output/LA_visit_"+str(date)+".pkl")  
# df_visit_LA = pd.read_pickle("./output/LA_visit_"+str(date)+".pkl")

# B. Extract visit table - time range

In [6]:
#March 2022: 
#weekend days: 20220305,20220306,20220312,20220313,20220319,20220320,20220326,20220327

#20220301,20220308
#20220309,20220316
#20220317,20220323
#20220324,20220331
# Inclusive event-date range, written as yyyymmdd integers.
start_date = 20220301
end_date = 20220308

# Upper bound for processing_date: the scan can be limited to the first three
# processing dates starting from the local date of interest (here, end_date).
end_date_plus = int(
    format(
        datetime.strptime(f"{end_date:d}", "%Y%m%d") + timedelta(days=3),
        "%Y%m%d",
    )
)

In [8]:
# from_iso8601_timestamp(zoned_datetime) as start_time, 
# date_add('second', cast(dwell_time_minutes * 60 as bigint), from_iso8601_timestamp(zoned_datetime)) as end_time

In [2]:
%%time

# Date-range extract of visits filtered to admin2 'US.CA.037', each joined to
# the admin4 geometry (block group) that contains the visit point.
#
# Only the visit columns plus the matched geography_id are selected. The
# geometry WKT is needed for the spatial join but not in the result, so it is
# no longer shipped to the client for every joined row and then discarded.
df_visit_LA_range = sql_engine.read_sql(f"""
    with la_visit as (
        select
            *
        from {visit_table}
        where 
            country_code = 'US'
            and provider_id = '190199'
            and processing_date between {start_date} and {end_date_plus}
            and event_date between {start_date} and {end_date}
            and admin2_id = 'US.CA.037'     --- <<< filter geometries in LA
        ),
        
        la_cbg as (
        select
            geography_id, geometry_wkt
        from {geography_table}
        where
            country_code = 'US'
            and geography_type_code = 'admin4'
            and geography_id like 'US.CA.037%'     --- <<< filter geometries in LA
        )
        
        select
            s.*,
            c.geography_id
        from la_visit s
        inner join la_cbg c
        on st_contains(st_geometryfromtext(c.geometry_wkt), st_point(s.lng, s.lat))
        """
)

print(df_visit_LA_range.shape[0])
# Drop visit columns that are constant for this extract or unused downstream.
# Rebinding (instead of inplace=True) keeps the step free of in-place mutation.
df_visit_LA_range = df_visit_LA_range.drop(
    columns=['zipcode_id', 'place_version', 'processing_date', 'country_code',
             'admin1_id', 'device_type_code', 'os_name', 'provider_id']
)
df_visit_LA_range.head()

In [3]:
%%time 
# Add start_time, end_time, start_hour and end_hour to the date-range visit frame.
# Only the first 19 characters of zoned_datetime are parsed, so anything after
# the seconds field (e.g. a UTC offset suffix) is cut off rather than converted.
# Values that fail to parse become NaT because of errors='coerce'.
df_visit_LA_range['start_time'] = pd.to_datetime(
    df_visit_LA_range['zoned_datetime'].astype(str).str.slice(stop=19),
    errors='coerce',
)
# end_time = start_time + dwell time (dwell_time_minutes interpreted as minutes).
df_visit_LA_range['end_time'] = df_visit_LA_range['start_time'] + pd.to_timedelta(
    df_visit_LA_range['dwell_time_minutes'], unit='m'
)
df_visit_LA_range['start_hour'] = df_visit_LA_range['start_time'].dt.hour
df_visit_LA_range['end_hour'] = df_visit_LA_range['end_time'].dt.hour
df_visit_LA_range.head()

In [30]:
## Voided, not useful
# df_visit_LA_range['start_time'] = df_visit_LA_range['zoned_datetime'].dt.tz_convert('America/Los_Angeles')
# df_visit_LA_range['end_time'] = df_visit_LA_range['end_time'].dt.tz_convert('America/Los_Angeles')
# df_visit_LA_range['start_time']  = df_visit_LA_range['start_time'].dt.tz_localize(None)
# df_visit_LA_range['end_time']  = df_visit_LA_range['end_time'].dt.tz_localize(None)
# df_visit_LA_range['start_hour'] = df_visit_LA_range['start_time'].dt.hour
# df_visit_LA_range['end_hour'] = df_visit_LA_range['end_time'].dt.hour

In [23]:
# df_visit_LA_range.to_pickle("./output/LA_visit_"+str(start_date)+'_'+str(end_date)+".pkl")  
# df_visit_LA_range = pd.read_pickle("./output/LA_visit_"+str(start_date)+'_'+str(end_date)+".pkl")

# Don't run below

In [9]:
%%time
# Read admin4 (census block group) geometries from the geography registry.
# NOTE(review): the filter below is 'US.CA.083%', while the visit queries in
# this notebook use 'US.CA.037' for LA. The "filter geometries in LA" remark
# inside the SQL therefore looks stale — confirm which county is intended.
cbg_geom = sql_engine.read_sql(
    f"""
    select
        geography_id, geometry_wkt
    from {geography_table}
    where
        country_code = 'US'
        and geography_type_code = 'admin4'
        and geography_id like 'US.CA.083%'     --- <<< filter geometries in LA
    """
)

# Rename the key column to block_group_id (in place).
cbg_geom.rename(columns={'geography_id': 'block_group_id'}, inplace=True)
# Disabled: eliminate Catalina Island and another island south of it. The ids
# listed are 'US.CA.037.*', so this filter would not match the 083 rows above.
# cbg_geom = cbg_geom[~cbg_geom['block_group_id'].isin(['US.CA.037.599100.2','US.CA.037.599000.2','US.CA.037.599000.1','US.CA.037.599000.4','US.CA.037.599000.3','US.CA.037.599100.1'])]

CPU times: user 86.5 ms, sys: 617 µs, total: 87.1 ms
Wall time: 6.29 s


In [10]:
# Show the block-group geometries in a Kepler.gl widget as a dataset named
# 'geometries'.
# NOTE(review): this import sits mid-notebook rather than in the import cell at
# the top.
from keplergl import KeplerGl
KeplerGl(data={'geometries': cbg_geom})

User Guide: https://docs.kepler.gl/docs/keplergl-jupyter


KeplerGl(data={'geometries':          block_group_id  \
0    US.CA.083.001604.1   
1    US.CA.083.002702.1   
…