In [1]:
%reload_ext sql
%config SqlMagic.autocommit=False
%config SqlMagic.autolimit=0
%config SqlMagic.autopandas=True
%config SqlMagic.displaylimit=200
%sql trino://localhost:9090/cuebiq/

import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import yaml
import numpy as np
import os
from pyhive import trino
import pydeck as pdk
from typing import List
import copy
import itertools
from pyquadkey2 import quadkey
from pyquadkey2.quadkey import TileAnchor, QuadKey
from h3 import h3
from datetime import datetime, timedelta
import math
import pickle

# pandas display options for the long/wide result tables shown in this notebook.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', 0)
# Mapbox token (presumably consumed by pydeck, imported above). Prefer exporting
# MAPBOX_API_KEY in the environment before starting the kernel: setdefault keeps
# an already-exported token instead of overwriting it with this placeholder, so
# a real credential never has to be written into the notebook.
os.environ.setdefault('MAPBOX_API_KEY', "INSERT YOUR MAPBOX TOKEN HERE")

In [2]:
# SQL engine
from trino.dbapi import connect 
from sqlalchemy import create_engine

class TrinoEngine():
    """
    Small helper that holds a Trino DB-API cursor and a SQLAlchemy engine.

    `execute_statement` runs a query on the cursor and returns the raw rows;
    `read_sql` runs a query through pandas and returns a DataFrame.
    """
    def __init__(self, host: str = "localhost", port: int = 9090, catalog: str = "cuebiq"):
        """
        Open the DB-API connection/cursor and create the SQLAlchemy engine.

        The defaults reproduce the previously hard-coded target
        (trino://localhost:9090/cuebiq/), so `TrinoEngine()` behaves as before.

        Args:
            host: Trino coordinator host name.
            port: Trino coordinator port.
            catalog: Catalog used by both the cursor and the engine.
        """
        # Keep a reference to the connection so callers can close it explicitly.
        self.conn = connect(
            host=host,
            port=port,
            catalog=catalog
        )
        self.cur = self.conn.cursor()
        # Built from the same parameters so cursor and engine always point at
        # the same server and catalog.
        self.engine = create_engine(f"trino://{host}:{port}/{catalog}/")

    def execute_statement(self, query:str) -> list:
        """
        Execute `query` on the DB-API cursor and return all fetched rows.

        Intended for create and drop statements.
        """
        self.cur.execute(query)
        return self.cur.fetchall()

    def read_sql(self, query:str) -> pd.DataFrame:
        """
        Run `query` through `pd.read_sql` on the SQLAlchemy engine and return
        the result as a DataFrame.

        Intended for select and insert-into operations.
        """
        return pd.read_sql(query, self.engine)

# Shared engine instance used by the query cells below (opens the Trino connection on creation).
sql_engine = TrinoEngine()

In [4]:
# Fully qualified Trino tables queried by the cells below.
brand_table = "cuebiq.paas_cda_pe_v3.brand"
poi_table = "cuebiq.paas_cda_pe_v3.poi"
# Census Block Groups typically have a population between 600 and 3,000 people, which keeps the data privacy-safe.
# Visit date as a YYYYMMDD integer; used to build the daily visit pickle file names.
date = 20220310 # Thursday

In [4]:
# sql_engine.read_sql(f"desc {brand_table}")
# %sql show tables from cuebiq.paas_cda_pe_v3

# assign POI categories based on SIC code

In [11]:
%%time
# Pull id, brand, coordinates and name for every US POI with admin2_id 'US.CA.037' (LA).
df_POI = sql_engine.read_sql(
    f"""
    select place_id,brand_id,lat,lng,place_name
    from {poi_table}
    where 
        country_code = 'US' 
        and admin2_id = 'US.CA.037'
    """
)
print(df_POI.shape[0]) # number of POIs in LA (64991 in the recorded run; an earlier run returned 63118)
# df_POI.to_pickle("./output/LA_POI_table.pkl")

64991
CPU times: user 654 ms, sys: 18.2 ms, total: 672 ms
Wall time: 11.9 s


In [12]:
%%time
# Load the complete brand table (distinct rows); it carries the vertical and SIC columns joined onto the POIs later.
df_brand = sql_engine.read_sql(
    f"""
    select distinct *
    from {brand_table}
    """
)
# df_brand.to_pickle("./output/Brand_table.pkl")
print(df_brand.shape[0]) # number of brand rows (3463 in the recorded run; an earlier run returned 3416)
df_brand.head()

3463
CPU times: user 85.8 ms, sys: 0 ns, total: 85.8 ms
Wall time: 1.52 s


Unnamed: 0,brand_id,brand_name,vertical_id,vertical_name,sic_code,sic_name
0,2400,Ulta,769.0,Fitness,,
1,2649,Caruso,769.0,Fitness,6512.0,Operators of Nonresidential Buildings
2,1288,Chuze Fitness,769.0,Fitness,7991.0,Physical Fitness Facilities
3,1804,"IHRSA (International Health, Racquet & Sportsclub Association)",769.0,Fitness,7991.0,Physical Fitness Facilities
4,857,Janie and Jack,806.0,Children's Clothing,5641.0,Children's and Infants' Wear Stores


In [13]:
# Rename the POI coordinates to poi_lat/poi_lng (done in place on df_POI),
# then attach each POI's brand attributes through brand_id.
df_POI.rename(columns=dict(lat='poi_lat', lng='poi_lng'), inplace=True)
df_POI_type = df_POI[['place_id','brand_id','poi_lat','poi_lng','place_name']].merge(df_brand, on='brand_id')

# SIC divisions as (first code, last code, category); both bounds are inclusive.
# Source: https://en.wikipedia.org/wiki/Standard_Industrial_Classification
# Codes that fall between the listed ranges keep the default category 'NA'.
sic_divisions = [
    (0,    999,  'Agriculture, Forestry and Fishing'),
    (1000, 1499, 'Mining'),
    (1500, 1799, 'Construction'),
    (1800, 1999, 'not used'),
    (2000, 3999, 'Manufacturing'),
    (4000, 4999, 'Transportation, Communications, Electric, Gas and Sanitary service'),
    (5000, 5199, 'Wholesale Trade'),
    (5200, 5999, 'Retail Trade'),
    (6000, 6799, 'Finance, Insurance and Real Estate'),
    (7000, 8999, 'Services'),
    (9100, 9729, 'Public Administration'),
    (9900, 9999, 'Nonclassifiable'),
]

# Start from 'NA' and overwrite the rows whose SIC code falls inside a division.
df_POI_type['category'] = 'NA'
for first_code, last_code, division in sic_divisions:
    in_division = df_POI_type['sic_code'].between(first_code, last_code)
    df_POI_type.loc[in_division, 'category'] = division
# Rows without a SIC code are labelled 'NA' explicitly.
df_POI_type.loc[df_POI_type['sic_code'].isna(), 'category'] = 'NA'
df_POI_type['category'].value_counts()

Retail Trade                                                          24444
Services                                                              8469 
Finance, Insurance and Real Estate                                    8357 
Transportation, Communications, Electric, Gas and Sanitary service    8302 
Manufacturing                                                         7650 
NA                                                                    4885 
Wholesale Trade                                                       1414 
Public Administration                                                 1159 
Agriculture, Forestry and Fishing                                     308  
Construction                                                          3    
Name: category, dtype: int64

In [14]:
# Remove exact duplicate rows, then keep a single row per place_id (the last
# occurrence is kept in both passes). The row count is printed before and
# after each pass so any removals are visible.
print(len(df_POI_type))
df_POI_type = df_POI_type.drop_duplicates(keep='last')
print(len(df_POI_type))
df_POI_type = df_POI_type.drop_duplicates(subset=['place_id'], keep='last')
print(len(df_POI_type))

64991
64991
64991


In [71]:
# print(df_POI_type.shape[0])
# print(df_POI_type.place_id.unique().shape[0])
# test = df_POI_type.groupby('place_id').count().reset_index()
# dul_idlist = test[test.brand_id!=1].place_id.to_list()
# df_POI_type[df_POI_type.place_id.isin(dul_idlist)]

In [15]:
# Persist the categorised POI table so later sessions can reload it without querying Trino again.
df_POI_type.to_pickle("../output/LA_POI_category.pkl")

In [10]:
# Reload the categorised POI table from disk and report its row count.
# NOTE(review): unpickling can execute arbitrary code; only load files produced by this project.
df_POI_type = pd.read_pickle("../output/LA_POI_category.pkl")
print(len(df_POI_type))

62808


# a. Join with daily visit table (inner join: visits without a matching LA POI are dropped)

In [1]:
# Load the visits recorded on `date` and attach the POI/brand/category columns.
# The pickle is read from ../output, the directory used by the executed cells
# of this notebook (POI category pickle, date-range visit pickles); the
# previous "./output" path did not match them.
df_visit_LA = pd.read_pickle(f"../output/LA_visit_{date}.pkl")
print(df_visit_LA.shape[0])
# Inner join (the pd.merge default, spelled out here): visits whose
# (place_id, brand_id) pair is missing from df_POI_type are dropped.
df_visit_LA = pd.merge(df_visit_LA, df_POI_type, on=['place_id','brand_id'], how='inner')
print(df_visit_LA.shape[0]) # may be lower than the count above, possibly because some visits are outside LA
df_visit_LA.head()

In [None]:
# Save the category-enriched daily visits to ../output, the directory used by
# the executed cells of this notebook (the previous "./output" path did not match them).
df_visit_LA.to_pickle(f"../output/LA_visit_with_poiCategory{date}.pkl")

# b. Join with visit table of a time range (inner join: visits without a matching LA POI are dropped)

In [23]:
# Date window (YYYYMMDD integers) used to build the visit pickle file names below.
start_date = 20190324
end_date = 20190331
# Windows listed by the author (presumably the ones processed so far):
#   20190301,20190308
#   20190309,20190316
#   20190317,20190323
#   20190324,20190331

In [24]:
# Load the visits for the selected window, attach the POI/brand/category
# columns, and save the enriched table next to the input.
df_visit_LA_range = pd.read_pickle(f"../output/LA_visit_{start_date}_{end_date}.pkl")
print(len(df_visit_LA_range))
# Default (inner) merge: visits without a matching (place_id, brand_id) in df_POI_type are dropped.
df_visit_LA_range = df_visit_LA_range.merge(df_POI_type, on=['place_id','brand_id'])
print(len(df_visit_LA_range)) # may be lower than the count above, possibly because some visits are outside LA
df_visit_LA_range.to_pickle(f"../output/LA_visit_with_poiCategory{start_date}_{end_date}.pkl")

1909313
1419429


In [26]:
# Show one merged row to confirm the POI, brand and category columns were attached.
# NOTE(review): the rendered row includes a device identifier (cuebiq_id) and visit coordinates; clear this output before sharing the notebook.
df_visit_LA_range.head(1)

Unnamed: 0,admin2_id,brand_id,cuebiq_id,dwell_time_minutes,event_date,geohash,geoset_id,lat,lng,place_id,zoned_datetime,geography_id,start_time,end_time,start_hour,end_hour,poi_lat,poi_lng,place_name,brand_name,vertical_id,vertical_name,sic_code,sic_name,category
0,US.CA.037,629,1069274429,7.966667,20190327,9q5cfdwu2,11132,34.07211,-118.357421,31119715,2019-03-27T19:05:07-07:00,US.CA.037.214501.2,2019-03-27 19:05:07,2019-03-27 19:13:05.000000020,19,19,34.070747,-118.360201,US Malls,US Malls,812.0,Malls,,,


In [25]:
# Report the size of the enriched pickle: bytes divided by 1,000,000.
print(
    "File Size is :",
    os.path.getsize(f"../output/LA_visit_with_poiCategory{start_date}_{end_date}.pkl") / 1_000_000,
    "mb",
)

File Size is : 330.491063 mb


# Optional: the cells below do not need to be run

# Additional exploratory analysis: number of visits by place category

Top visited POI categories (visits per SIC code)

In [42]:
# Visits per SIC code, most visited first.
df_top_cate = (
    df_visit_LA[['sic_code']]
    .groupby('sic_code')['sic_code']
    .count()
    .reset_index(name='visit_count')
    .sort_values(['visit_count'], ascending=False)
)
# Distinct (sic_code, sic_name) pairs, used to label the counts with a readable name.
df_brand_filtered = df_brand[['sic_code','sic_name']].drop_duplicates(keep='last')
df_top_cate_brand = df_top_cate.merge(df_brand_filtered, on='sic_code')
df_top_cate_brand.head(5)

# Note kept from an earlier run (does not match the recorded output of this cell):
# 5812.0 Eating Places, 5411.0 Grocery Stores, 7991.0 Physical Fitness Facilities
# 7231.0 Beauty Shops, 6021.0 National Commercial Banks, 7011.0 Hotels and Motels 5311.0

Unnamed: 0,sic_code,visit_count,sic_name
0,5812.0,36879,Eating Places
1,4812.0,9050,Radiotelephone Communications
2,5541.0,8388,Gasoline Service Stations
3,6021.0,7122,National Commercial Banks
4,6029.0,6935,"Commercial Banks, Not Elsewhere Classified"


top POI categories by count

In [48]:
# POIs per SIC code (exact duplicate rows removed first), largest first,
# labelled with the SIC name from df_brand_filtered (built in the cell above).
df_top_poi_count = (
    df_POI_type.drop_duplicates(keep='last')
    .groupby('sic_code')['sic_code']
    .count()
    .reset_index(name='poi_count')
    .sort_values(['poi_count'], ascending=False)
)
df_top_poi_count = df_top_poi_count.merge(df_brand_filtered, on='sic_code')
df_top_poi_count.head(5)

Unnamed: 0,sic_code,poi_count,sic_name
0,5812.0,7885,Eating Places
1,4812.0,3529,Radiotelephone Communications
2,6021.0,3282,National Commercial Banks
3,5531.0,3136,Auto and Home Supply Stores
4,5541.0,2415,Gasoline Service Stations


In [49]:
# How many visits lack brand metadata? Printed in order:
#   1. visits with no vertical_name
#   2. visits with no sic_name
#   3. visits missing both sic_name and vertical_name
print(df_visit_LA['vertical_name'].isna().sum())
print(df_visit_LA['sic_name'].isna().sum())
print(df_visit_LA[['sic_name', 'vertical_name']].isna().all(axis=1).sum())
# Note kept from an earlier run: 11168/264083=0.04, i.e. about 4% are missing

11014
65049
0
