# Data Preparation

## Import Modules

In [3]:
import os
import pandas as pd
import duckdb
import numpy as np

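Download the Dewey datasets once from a shell, substituting your own API key and folder ID. The paths in **Constants** below expect the files under `data/dewey-downloads/`.

```bash
cd data
uvx --from deweypy dewey --api-key <API_KEY> speedy-download <FOLDER_ID>
```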

## Constants

In [4]:
# San Diego County zip codes; both DuckDB queries in this notebook filter
# POSTAL_CODE against this list.
zipcodes = [
    91901, 91902, 91903, 91905, 91906, 91908, 91909, 91910, 91911, 91913,
    91914, 91915, 91916, 91917, 91919, 91921, 91931, 91932, 91934, 91935,
    91941, 91942, 91943, 91944, 91945, 91946, 91947, 91948, 91950, 91951,
    91962, 91963, 91964, 91965, 91976, 91977, 91978, 91979, 91980, 91987,
    92003, 92004, 92007, 92008, 92009, 92010, 92011, 92013, 92014, 92018,
    92019, 92020, 92021, 92022, 92023, 92024, 92025, 92026, 92027, 92028,
    92029, 92033, 92036, 92037, 92038, 92039, 92040, 92046, 92049, 92051,
    92052, 92054, 92055, 92056, 92057, 92058, 92059, 92060, 92061, 92062,
    92064, 92065, 92066, 92067, 92068, 92069, 92070, 92071, 92072, 92074,
    92075, 92078, 92079, 92082, 92083, 92084, 92085, 92086, 92088, 92090,
    92091, 92092, 92093, 92096, 92099, 92101, 92102, 92103, 92104, 92105,
    92106, 92107, 92108, 92109, 92110, 92111, 92112, 92113, 92114, 92115,
    92116, 92117, 92118, 92119, 92120, 92121, 92122, 92123, 92124, 92126,
    92127, 92128, 92129, 92130, 92131, 92132, 92133, 92134, 92135, 92136,
    92137, 92138, 92139, 92140, 92142, 92143, 92144, 92145, 92147, 92149,
    92150, 92152, 92153, 92154, 92155, 92158, 92159, 92160, 92161, 92162,
    92163, 92164, 92165, 92166, 92167, 92168, 92169, 92170, 92171, 92172,
    92173, 92174, 92175, 92176, 92177, 92178, 92179, 92180, 92182, 92184,
    92186, 92187, 92190, 92191, 92192, 92193, 92194, 92195, 92196, 92197,
    92198, 92199
]

# Folders holding the downloaded parquet files for each dataset
# (relative to the notebook's working directory).
GLOBAL_PLACES_DIR = 'data/dewey-downloads/global-places-for-lat-long-ca' # adjust as needed
SPEND_PATTERNS_DIR = 'data/dewey-downloads/safegraph-spend-patterns-ca-only'

In [None]:
# Global Places: one glob pattern covers every parquet part in the folder.
GLOBAL_PLACES_PATH = os.path.join(GLOBAL_PLACES_DIR, '*.parquet')
con = duckdb.connect()

# Restrict to San Diego County zip codes. Column pruning and the CA-only
# filter were already applied in Dewey, so this is the only predicate.
zip_in_list = ','.join(str(code) for code in zipcodes)
places_query = f"""
            SELECT
                *
            FROM '{GLOBAL_PLACES_PATH}' 
            WHERE POSTAL_CODE IN ({zip_in_list})"""
df = con.execute(places_query).df()
df.head()

Unnamed: 0,CITY,LATITUDE,LONGITUDE,PARENT_PLACEKEY,PLACEKEY,POLYGON_WKT,POSTAL_CODE,REGION,SAFEGRAPH_BRAND_IDS,STREET_ADDRESS
0,San Diego,32.745078,-117.149813,,zzz-222@5z5-qdv-j35,MULTIPOLYGON (((-117.14910778399701 32.7457697...,92103.0,CA,,Cypress Canyon Os
1,La Jolla,32.884272,-117.22443,222-288@5z5-pnh-cdv,222-222@5z5-pnh-dgk,POLYGON ((-117.22560003399997 32.8858112940000...,92037.0,CA,,9894 Genesee Ave
2,San Diego,32.766985,-117.202906,,222-222@5z5-pq2-7h5,POLYGON ((-117.20293921327564 32.7671775484230...,92110.0,CA,,980 Buenos Ave
3,San Diego,32.83967,-117.126109,,222-222@5z5-pwz-wtv,"POLYGON ((-117.1261735 32.8395108693057, -117....",92123.0,CA,,9350 Waxie Way
4,San Diego,32.891073,-117.147801,,222-222@5z5-px6-5s5,POLYGON ((-117.14713154005051 32.8915420239990...,92126.0,CA,,9340 Dowdy Dr


In [6]:
# Persist the San Diego County places subset as parquet; the join cell
# further down reads this file back into DuckDB.
df.to_parquet('data/san-diego-county-places.parquet')

In [7]:
# Inspect which columns the places subset carries.
df.columns

Index(['CITY', 'LATITUDE', 'LONGITUDE', 'PARENT_PLACEKEY', 'PLACEKEY',
       'POLYGON_WKT', 'POSTAL_CODE', 'REGION', 'SAFEGRAPH_BRAND_IDS',
       'STREET_ADDRESS'],
      dtype='object')

In [10]:
# np.sort(df['TOP_CATEGORY'].unique())  # disabled: TOP_CATEGORY is not in the places subset; it is available in the spend patterns data

In [9]:
# Number of places rows per postal code (most frequent first).
df['POSTAL_CODE'].value_counts()

POSTAL_CODE
92101.0    12843
92103.0     8704
92037.0     8336
92108.0     8115
92123.0     7457
           ...  
92072.0        1
91951.0        1
92165.0        1
92174.0        1
92170.0        1
Name: count, Length: 166, dtype: int64

## Spend Patterns

In [11]:
# NOTE(review): only this single part file is read, whereas the places load
# uses a `*.parquet` glob — confirm that one file is the intended input.
SPEND_PATTERNS_PATH = os.path.join(
    SPEND_PATTERNS_DIR,
    '2025-07--data_01bffb7b-0106-c81a-0042-fa0703e58316_108_2_0.snappy.parquet',
)

# Keep San Diego County rows that have a brand attached.
spend_zip_in_list = ','.join(str(code) for code in zipcodes)
spend_query = f"""
            SELECT 
                *
            FROM '{SPEND_PATTERNS_PATH}'
            WHERE POSTAL_CODE IN ({spend_zip_in_list}) AND
            BRANDS IS NOT NULL"""
spend_df = con.execute(spend_query).df()
spend_df.head()

Unnamed: 0,BRANDS,BUCKETED_CUSTOMER_FREQUENCY,BUCKETED_CUSTOMER_INCOMES,CITY,CUSTOMER_HOME_CITY,DAY_COUNTS,LOCATION_NAME,MEAN_SPEND_PER_CUSTOMER_BY_FREQUENCY,MEAN_SPEND_PER_CUSTOMER_BY_INCOME,MEDIAN_SPEND_PER_CUSTOMER,...,SPEND_DATE_RANGE_END,SPEND_DATE_RANGE_START,SPEND_PCT_CHANGE_VS_PREV_MONTH,SPEND_PCT_CHANGE_VS_PREV_YEAR,SPEND_PER_TRANSACTION_BY_DAY,SPEND_PER_TRANSACTION_PERCENTILES,STREET_ADDRESS,SUB_CATEGORY,TOP_CATEGORY,TRANSACTION_INTERMEDIARY
0,Jack in the Box,"{""1"":4,""2"":0,""3"":0,""4"":0,""5-10"":0,"">10"":0}","{""60-75k"":2,""75-100k"":2,"">150k"":3}",National City,"{""key_value"":[{""key"":"", ND"",""value"":2},{""key"":...","{""Friday"":4,""Monday"":4,""Saturday"":4,""Sunday"":4...",Jack in the Box,"{""1"":22.04}","{""60-75k"":5.96,""75-100k"":19.87,"">150k"":31.16}",25.52,...,2025-08-01,2025-07-01,,,"[null,null,31.16,null,null,null,null,5.96,null...","{""25"":9.44,""75"":31.16}",700 Roosevelt Ave,Limited-Service Restaurants,Restaurants and Other Eating Places,"{""key_value"":[{""key"":""No intermediary"",""value""..."
1,Walmart,"{""1"":1378,""2"":288,""3"":73,""4"":34,""5-10"":48,"">10...","{""100-150k"":212,""25-45k"":340,""45-60k"":239,""60-...",Poway,"{""key_value"":[{""key"":""Richland, WA"",""value"":2}...","{""Friday"":4,""Monday"":4,""Saturday"":4,""Sunday"":4...",Walmart Supercenter,"{""1"":45.64,""2"":91.39,""3"":210.87,""4"":164.88,""5-...","{""100-150k"":63.73,""25-45k"":48.44,""45-60k"":51.1...",33.93,...,2025-08-01,2025-07-01,36.0,167.0,"[28.69,39.41,68.28,26.95,18.77,49.75,30.63,28....","{""25"":12,""75"":46.7}",13425 Community Rd,Warehouse Clubs and Supercenters,"General Merchandise Stores, including Warehous...","{""key_value"":[{""key"":""No intermediary"",""value""..."
2,Carl's Jr.,"{""1"":18,""2"":1,""3"":0,""4"":0,""5-10"":0,"">10"":0}","{""100-150k"":2,""25-45k"":2,""45-60k"":2,""60-75k"":2...",La Mesa,"{""key_value"":[{""key"":""Encinitas, CA"",""value"":2...","{""Friday"":4,""Monday"":4,""Saturday"":4,""Sunday"":4...",Carl's Jr.,"{""1"":18.13,""2"":26.11}","{""100-150k"":24.16,""25-45k"":11.38,""45-60k"":26.3...",16.14,...,2025-08-01,2025-07-01,55.0,7.0,"[24.71,null,null,16.14,null,null,24.16,null,7....","{""25"":12.56,""75"":24.57}",8110 Parkway Dr,Limited-Service Restaurants,Restaurants and Other Eating Places,"{""key_value"":[{""key"":""No intermediary"",""value""..."
3,24 Hour Fitness,"{""1"":2,""2"":1,""3"":1,""4"":0,""5-10"":0,"">10"":0}","{""25-45k"":2,"">150k"":2}",San Diego,"{""key_value"":[{""key"":""Lakeside, CA"",""value"":2}...","{""Friday"":4,""Monday"":4,""Saturday"":4,""Sunday"":4...",24 Hour Fitness,"{""1"":4.78,""2"":40,""3"":21.25}","{""25-45k"":10.27,"">150k"":40}",13.37,...,2025-08-01,2025-07-01,,,"[null,null,null,null,20,5.48,null,null,null,8....","{""25"":4.29,""75"":20}",5885 Rancho Mission Rd,Fitness and Recreational Sports Centers,Other Amusement and Recreation Industries,"{""key_value"":[{""key"":""No intermediary"",""value""..."
4,Great Clips,"{""1"":10,""2"":3,""3"":0,""4"":0,""5-10"":0,"">10"":0}","{""100-150k"":2,""25-45k"":2,""45-60k"":2,""60-75k"":2...",Oceanside,"{""key_value"":[{""key"":""Oceanside, CA"",""value"":9...","{""Friday"":4,""Monday"":4,""Saturday"":4,""Sunday"":4...",Great Clips,"{""1"":33.6,""2"":51.67}","{""100-150k"":46,""25-45k"":31.5,""45-60k"":33.66,""6...",35.0,...,2025-08-01,2025-07-01,-4.0,-6.0,"[null,null,null,null,26,null,39,24.5,28,30,nul...","{""25"":25.99,""75"":34}",3825 Mission Ave,Beauty Salons,Personal Care Services,"{""key_value"":[{""key"":""No intermediary"",""value""..."


In [14]:
# Persist the filtered spend-patterns rows as parquet; the join cell below
# reads this file back into DuckDB.
spend_df.to_parquet('data/san-diego-county-spend-patterns.parquet')

In [None]:
# Register the two exported subsets as DuckDB tables so they can be joined in SQL.
for table_name, parquet_file in (
    ('places', 'data/san-diego-county-places.parquet'),
    ('spend', 'data/san-diego-county-spend-patterns.parquet'),
):
    con.execute(f"CREATE OR REPLACE TABLE {table_name} AS SELECT * FROM '{parquet_file}'")

# Inner join on PLACEKEY. Columns that exist in both tables come back twice;
# the spend-side copy shows up in the DataFrame with a `_1` suffix.
join_query = """
    SELECT 
        pl.*,
        sp.*
    FROM places pl
    JOIN spend sp
      ON pl.PLACEKEY = sp.PLACEKEY
"""
joined_df = con.execute(join_query).df()

joined_df.head()


Unnamed: 0,CITY,LATITUDE,LONGITUDE,PARENT_PLACEKEY,PLACEKEY,POLYGON_WKT,POSTAL_CODE,REGION,SAFEGRAPH_BRAND_IDS,STREET_ADDRESS,...,SPEND_DATE_RANGE_END,SPEND_DATE_RANGE_START,SPEND_PCT_CHANGE_VS_PREV_MONTH,SPEND_PCT_CHANGE_VS_PREV_YEAR,SPEND_PER_TRANSACTION_BY_DAY,SPEND_PER_TRANSACTION_PERCENTILES,STREET_ADDRESS_1,SUB_CATEGORY,TOP_CATEGORY,TRANSACTION_INTERMEDIARY
0,Fallbrook,33.363182,-117.249713,,222-222@5z5-sq3-7h5,POLYGON ((-117.24961958701293 33.3632514508898...,92028.0,CA,,1625 S Mission Rd,...,2025-08-01,2025-07-01,-65.0,4.0,"[null,null,null,null,null,63.72,null,null,null...","{""25"":37.97,""75"":93.26}",1625 S Mission Rd,Caterers,Special Food Services,"{""key_value"":[{""key"":""No intermediary"",""value""..."
1,San Diego,32.962082,-117.189917,223-224@5z5-q36-5j9,223-223@5z5-q36-4sq,POLYGON ((-117.18968099635235 32.9621080316114...,92130.0,CA,SG_BRAND_01644e771a1e10f6,5980 Village Way,...,2025-08-01,2025-07-01,,,"[null,null,null,null,null,null,null,null,null,...","{""25"":7.99,""75"":14.99}",5980 Village Way,Snack and Nonalcoholic Beverage Bars,Restaurants and Other Eating Places,"{""key_value"":[{""key"":""Plastiq"",""value"":3}]}"
2,Poway,32.950256,-117.062976,,224-222@5z5-pwm-6tv,"POLYGON ((-117.062901490944 32.95036774128007,...",92064.0,CA,,12906 Pomerado Rd,...,2025-08-01,2025-07-01,-61.0,-75.0,"[null,27.37,null,null,null,null,20.01,null,2.7...","{""25"":3.96,""75"":23.48}",12906 Pomerado Rd,"Beer, Wine, and Liquor Stores","Beer, Wine, and Liquor Stores","{""key_value"":[{""key"":""No intermediary"",""value""..."
3,San Diego,32.979783,-117.082701,224-223@5z5-pwd-2c5,225-222@5z5-pwd-2c5,POLYGON ((-117.08280491270557 32.9798247527082...,92128.0,CA,SG_BRAND_87bd111b6dd52ef32eaf3abf6d509065,11134 Rancho Carmel Dr Ste 104,...,2025-08-01,2025-07-01,9.0,-43.0,"[17.29,13.01,11.15,37.12,23.6,15.68,13.3,25.27...","{""25"":11.8,""75"":23.6}",11134 Rancho Carmel Dr Ste 104,Limited-Service Restaurants,Restaurants and Other Eating Places,"{""key_value"":[{""key"":""No intermediary"",""value""..."
4,Fallbrook,33.372348,-117.252617,,225-222@5z5-sq2-fmk,POLYGON ((-117.25238131748273 33.3721978555450...,92028.0,CA,SG_BRAND_0c9818695d20f06b7c6cd8ccb3bf3905,1101 S Main Ave Ste B,...,2025-08-01,2025-07-01,92.0,13.0,"[null,null,null,null,null,null,null,null,null,...","{""25"":47.28,""75"":55.26}",1101 S Main Ave Ste B,Pet and Pet Supplies Stores,Other Miscellaneous Store Retailers,"{""key_value"":[{""key"":""No intermediary"",""value""..."


In [19]:
# List the columns carrying the `_1` suffix that marks the second copy of a
# column name shared by both joined tables.
end_1_cols = list(filter(lambda name: name.endswith('_1'), joined_df.columns))
print(end_1_cols)

['CITY_1', 'PLACEKEY_1', 'POSTAL_CODE_1', 'REGION_1', 'SAFEGRAPH_BRAND_IDS_1', 'STREET_ADDRESS_1']


In [20]:
# Drop only the true join duplicates: a column `X_1` is removed when its base
# column `X` is also present. Matching on the bare `_1` suffix alone would
# also discard any source column whose real name happens to end in `_1`.
# Re-running this cell is a no-op once the duplicates are gone.
existing_cols = set(joined_df.columns)
duplicate_cols = [
    col for col in joined_df.columns
    if col.endswith('_1') and col[:-len('_1')] in existing_cols
]
joined_df = joined_df.drop(columns=duplicate_cols)

In [22]:
# Persist the joined places + spend frame (duplicate `_1` columns already removed).
joined_df.to_parquet('data/san-diego-county-places-spend.parquet')
