# Data Preparation

## Import Modules

In [1]:
import os
import pandas as pd
import duckdb
import numpy as np

```bash
cd data
uvx --from deweypy dewey --api-key <API_KEY> speedy-download <FOLDER_ID>
```

## Constants

In [None]:
# San Diego Country Zip Codes 
zipcodes = [
    91901, 91902, 91903, 91905, 91906, 91908, 91909, 91910, 91911, 91913,
    91914, 91915, 91916, 91917, 91919, 91921, 91931, 91932, 91934, 91935,
    91941, 91942, 91943, 91944, 91945, 91946, 91947, 91948, 91950, 91951,
    91962, 91963, 91964, 91965, 91976, 91977, 91978, 91979, 91980, 91987,
    92003, 92004, 92007, 92008, 92009, 92010, 92011, 92013, 92014, 92018,
    92019, 92020, 92021, 92022, 92023, 92024, 92025, 92026, 92027, 92028,
    92029, 92033, 92036, 92037, 92038, 92039, 92040, 92046, 92049, 92051,
    92052, 92054, 92055, 92056, 92057, 92058, 92059, 92060, 92061, 92062,
    92064, 92065, 92066, 92067, 92068, 92069, 92070, 92071, 92072, 92074,
    92075, 92078, 92079, 92082, 92083, 92084, 92085, 92086, 92088, 92090,
    92091, 92092, 92093, 92096, 92099, 92101, 92102, 92103, 92104, 92105,
    92106, 92107, 92108, 92109, 92110, 92111, 92112, 92113, 92114, 92115,
    92116, 92117, 92118, 92119, 92120, 92121, 92122, 92123, 92124, 92126,
    92127, 92128, 92129, 92130, 92131, 92132, 92133, 92134, 92135, 92136,
    92137, 92138, 92139, 92140, 92142, 92143, 92144, 92145, 92147, 92149,
    92150, 92152, 92153, 92154, 92155, 92158, 92159, 92160, 92161, 92162,
    92163, 92164, 92165, 92166, 92167, 92168, 92169, 92170, 92171, 92172,
    92173, 92174, 92175, 92176, 92177, 92178, 92179, 92180, 92182, 92184,
    92186, 92187, 92190, 92191, 92192, 92193, 92194, 92195, 92196, 92197,
    92198, 92199
]

GLOBAL_PLACES_DIR = 'data/dewey-downloads/us-ca-places-update' # adjust as needed (redownloaded places with updated data)
SPEND_PATTERNS_DIR = 'data/dewey-downloads/safegraph-spend-patterns-ca-only'

In [27]:
# Load the Global Places dataset
GLOBAL_PLACES_PATH = os.path.join(GLOBAL_PLACES_DIR, '*.parquet')
con = duckdb.connect()

# Filter for only San Diego County zip codes (removed columns and filtered to CA in Dewey)
df = con.execute(f"""
            SELECT
                *
            FROM '{GLOBAL_PLACES_PATH}' 
            WHERE POSTAL_CODE IN ({','.join(map(str, zipcodes))})""").fetchdf()
df.head()

Unnamed: 0,BRANDS,CATEGORY_TAGS,CITY,CLOSED_ON,DOMAINS,ENCLOSED,GEOMETRY_TYPE,INCLUDES_PARKING_LOT,ISO_COUNTRY_CODE,IS_SYNTHETIC,...,POLYGON_WKT,POSTAL_CODE,REGION,STORE_ID,STREET_ADDRESS,SUB_CATEGORY,TOP_CATEGORY,TRACKING_CLOSED_SINCE,WEBSITE,WKT_AREA_SQ_METERS
0,[],"[""Bar or Pub"",""Beer Garden"",""Brewery or Brewpu...",Santa Ysabel,NaT,"[""julianstation.com""]",False,POLYGON,False,US,False,...,POLYGON ((-116.64671203999995 33.0983326280000...,92070,CA,,4468 Julian Rd,Wineries,Beverage Manufacturing,2019-07-01,http://www.julianstation.com,1684.0
1,[],[],Poway,NaT,[],False,POLYGON,,US,False,...,MULTIPOLYGON (((-117.01470399899034 32.9743729...,92064,CA,,Kent Hill Open Space,Nature Parks and Other Similar Institutions,"Museums, Historical Sites, and Similar Institu...",2019-07-01,,204969.0
2,[],[],Del Mar,NaT,[],False,POLYGON,,US,False,...,MULTIPOLYGON (((-117.25588799933166 32.9554660...,92014,CA,,Del Mar Public Parkland,Nature Parks and Other Similar Institutions,"Museums, Historical Sites, and Similar Institu...",2019-07-01,,28019.0
3,[],[],San Diego,NaT,[],False,POLYGON,True,US,False,...,POLYGON ((-117.0344262708557 32.68046736407763...,92139,CA,,7373 Tooma St,Nature Parks and Other Similar Institutions,"Museums, Historical Sites, and Similar Institu...",2019-07-01,,48711.0
4,[],[],Oceanside,NaT,"[""oceanside.ca.us""]",False,POLYGON,,US,False,...,POLYGON ((-117.26260899900609 33.2479320001617...,92057,CA,,Spring Creek Park,Nature Parks and Other Similar Institutions,"Museums, Historical Sites, and Similar Institu...",2019-07-01,,10874.0


In [15]:
df['PLACEKEY'].nunique()

247339

In [28]:
# export subset to parquet
df.to_parquet('data/san-diego-county-places.parquet')

In [None]:
df.columns

In [None]:
# np.sort(df['TOP_CATEGORY'].unique()) # remove category as its available in spend

In [None]:
df['POSTAL_CODE'].value_counts()

## Spend Patterns

In [31]:
SPEND_PATTERNS_PATH = os.path.join(SPEND_PATTERNS_DIR, '2025-07--data_01bffb7b-0106-c81a-0042-fa0703e58316_108_2_0.snappy.parquet')

spend_df = con.execute(f"""
            SELECT 
                *
            FROM '{SPEND_PATTERNS_PATH}'
            WHERE POSTAL_CODE IN ({','.join(map(str, zipcodes))}) AND
            BRANDS IS NOT NULL""").fetchdf()
spend_df.head()

Unnamed: 0,BRANDS,BUCKETED_CUSTOMER_FREQUENCY,BUCKETED_CUSTOMER_INCOMES,CITY,CUSTOMER_HOME_CITY,DAY_COUNTS,LOCATION_NAME,MEAN_SPEND_PER_CUSTOMER_BY_FREQUENCY,MEAN_SPEND_PER_CUSTOMER_BY_INCOME,MEDIAN_SPEND_PER_CUSTOMER,...,SPEND_DATE_RANGE_END,SPEND_DATE_RANGE_START,SPEND_PCT_CHANGE_VS_PREV_MONTH,SPEND_PCT_CHANGE_VS_PREV_YEAR,SPEND_PER_TRANSACTION_BY_DAY,SPEND_PER_TRANSACTION_PERCENTILES,STREET_ADDRESS,SUB_CATEGORY,TOP_CATEGORY,TRANSACTION_INTERMEDIARY
0,Jack in the Box,"{""1"":4,""2"":0,""3"":0,""4"":0,""5-10"":0,"">10"":0}","{""60-75k"":2,""75-100k"":2,"">150k"":3}",National City,"{""key_value"":[{""key"":"", ND"",""value"":2},{""key"":...","{""Friday"":4,""Monday"":4,""Saturday"":4,""Sunday"":4...",Jack in the Box,"{""1"":22.04}","{""60-75k"":5.96,""75-100k"":19.87,"">150k"":31.16}",25.52,...,2025-08-01,2025-07-01,,,"[null,null,31.16,null,null,null,null,5.96,null...","{""25"":9.44,""75"":31.16}",700 Roosevelt Ave,Limited-Service Restaurants,Restaurants and Other Eating Places,"{""key_value"":[{""key"":""No intermediary"",""value""..."
1,Walmart,"{""1"":1378,""2"":288,""3"":73,""4"":34,""5-10"":48,"">10...","{""100-150k"":212,""25-45k"":340,""45-60k"":239,""60-...",Poway,"{""key_value"":[{""key"":""Richland, WA"",""value"":2}...","{""Friday"":4,""Monday"":4,""Saturday"":4,""Sunday"":4...",Walmart Supercenter,"{""1"":45.64,""2"":91.39,""3"":210.87,""4"":164.88,""5-...","{""100-150k"":63.73,""25-45k"":48.44,""45-60k"":51.1...",33.93,...,2025-08-01,2025-07-01,36.0,167.0,"[28.69,39.41,68.28,26.95,18.77,49.75,30.63,28....","{""25"":12,""75"":46.7}",13425 Community Rd,Warehouse Clubs and Supercenters,"General Merchandise Stores, including Warehous...","{""key_value"":[{""key"":""No intermediary"",""value""..."
2,Carl's Jr.,"{""1"":18,""2"":1,""3"":0,""4"":0,""5-10"":0,"">10"":0}","{""100-150k"":2,""25-45k"":2,""45-60k"":2,""60-75k"":2...",La Mesa,"{""key_value"":[{""key"":""Encinitas, CA"",""value"":2...","{""Friday"":4,""Monday"":4,""Saturday"":4,""Sunday"":4...",Carl's Jr.,"{""1"":18.13,""2"":26.11}","{""100-150k"":24.16,""25-45k"":11.38,""45-60k"":26.3...",16.14,...,2025-08-01,2025-07-01,55.0,7.0,"[24.71,null,null,16.14,null,null,24.16,null,7....","{""25"":12.56,""75"":24.57}",8110 Parkway Dr,Limited-Service Restaurants,Restaurants and Other Eating Places,"{""key_value"":[{""key"":""No intermediary"",""value""..."
3,24 Hour Fitness,"{""1"":2,""2"":1,""3"":1,""4"":0,""5-10"":0,"">10"":0}","{""25-45k"":2,"">150k"":2}",San Diego,"{""key_value"":[{""key"":""Lakeside, CA"",""value"":2}...","{""Friday"":4,""Monday"":4,""Saturday"":4,""Sunday"":4...",24 Hour Fitness,"{""1"":4.78,""2"":40,""3"":21.25}","{""25-45k"":10.27,"">150k"":40}",13.37,...,2025-08-01,2025-07-01,,,"[null,null,null,null,20,5.48,null,null,null,8....","{""25"":4.29,""75"":20}",5885 Rancho Mission Rd,Fitness and Recreational Sports Centers,Other Amusement and Recreation Industries,"{""key_value"":[{""key"":""No intermediary"",""value""..."
4,Great Clips,"{""1"":10,""2"":3,""3"":0,""4"":0,""5-10"":0,"">10"":0}","{""100-150k"":2,""25-45k"":2,""45-60k"":2,""60-75k"":2...",Oceanside,"{""key_value"":[{""key"":""Oceanside, CA"",""value"":9...","{""Friday"":4,""Monday"":4,""Saturday"":4,""Sunday"":4...",Great Clips,"{""1"":33.6,""2"":51.67}","{""100-150k"":46,""25-45k"":31.5,""45-60k"":33.66,""6...",35.0,...,2025-08-01,2025-07-01,-4.0,-6.0,"[null,null,null,null,26,null,39,24.5,28,30,nul...","{""25"":25.99,""75"":34}",3825 Mission Ave,Beauty Salons,Personal Care Services,"{""key_value"":[{""key"":""No intermediary"",""value""..."


In [33]:
spend_df.to_parquet('data/san-diego-county-spend-patterns.parquet')

In [None]:
con.execute("CREATE OR REPLACE TABLE places AS SELECT * FROM 'data/san-diego-county-places.parquet'")
con.execute("CREATE OR REPLACE TABLE spend AS SELECT * FROM 'data/san-diego-county-spend-patterns.parquet'")

joined_df = con.execute("""
    SELECT 
        pl.*,
        sp.*
    FROM spend sp
    LEFT JOIN places pl
      ON sp.PLACEKEY = pl.PLACEKEY
""").fetchdf()

joined_df.head()

spend_count = con.execute("SELECT COUNT(*) FROM spend").fetchone()[0]
joined_count = len(joined_df)

print(f"Spend records: {spend_count}")
print(f"Joined records: {joined_count}")
print(f"All spend records kept: {spend_count == joined_count}")


Spend records: 8628
Joined records: 8628
All spend records kept: True


In [46]:
joined_df.columns

Index(['BRANDS', 'CATEGORY_TAGS', 'CITY', 'CLOSED_ON', 'DOMAINS', 'ENCLOSED',
       'GEOMETRY_TYPE', 'INCLUDES_PARKING_LOT', 'ISO_COUNTRY_CODE',
       'IS_SYNTHETIC', 'LATITUDE', 'LOCATION_NAME', 'LONGITUDE', 'NAICS_CODE',
       'OPENED_ON', 'OPEN_HOURS', 'PARENT_PLACEKEY', 'PHONE_NUMBER',
       'PLACEKEY', 'POLYGON_CLASS', 'POLYGON_WKT', 'POSTAL_CODE', 'REGION',
       'STORE_ID', 'STREET_ADDRESS', 'SUB_CATEGORY', 'TOP_CATEGORY',
       'TRACKING_CLOSED_SINCE', 'WEBSITE', 'WKT_AREA_SQ_METERS', 'BRANDS_1',
       'BUCKETED_CUSTOMER_FREQUENCY', 'BUCKETED_CUSTOMER_INCOMES', 'CITY_1',
       'CUSTOMER_HOME_CITY', 'DAY_COUNTS', 'LOCATION_NAME_1',
       'MEAN_SPEND_PER_CUSTOMER_BY_FREQUENCY',
       'MEAN_SPEND_PER_CUSTOMER_BY_INCOME', 'MEDIAN_SPEND_PER_CUSTOMER',
       'MEDIAN_SPEND_PER_TRANSACTION', 'NAICS_CODE_1', 'ONLINE_SPEND',
       'ONLINE_TRANSACTIONS', 'PLACEKEY_1', 'POSTAL_CODE_1',
       'RAW_NUM_CUSTOMERS', 'RAW_NUM_TRANSACTIONS', 'RAW_TOTAL_SPEND',
       'REGION_1', 'RE

In [None]:
# show duplicate columns
end_1_cols = [col for col in joined_df.columns if col.endswith('_1')]
print(end_1_cols)

In [None]:
# drop duplicate columns
joined_df = joined_df.loc[:, ~joined_df.columns.str.endswith('_1')]


In [None]:
joined_df.to_parquet('data/san-diego-county-places-spend.parquet')