## Imports

In [1]:
import duckdb
import networkx as nx
import json
import pandas as pd

from pyvis.network import Network
import matplotlib.pyplot as plt

import folium
from shapely import wkt

## Constants

In [2]:
X_SHOP_LOCAL_BRANDS_PCT = 'RELATED_CROSS_SHOPPING_LOCAL_BRANDS_PCT'
X_SHOP_ONLINE_BRANDS_PCT = 'RELATED_CROSS_SHOPPING_ONLINE_MERCHANTS_PCT'
X_SHOP_PHYSICAL_BRANDS_PCT = 'RELATED_CROSS_SHOPPING_PHYSICAL_BRANDS_PCT'
X_SHOP_SAME_CATEGORY_BRANDS_PCT = 'RELATED_CROSS_SHOPPING_SAME_CATEGORY_BRANDS_PCT'
LOCATION_NAME = 'LOCATION_NAME'
TOP_CATEGORY = 'TOP_CATEGORY'
RAW_NUM_CUSTOMERS = 'RAW_NUM_CUSTOMERS'
SPEND_PLACES_PATH = 'data/san-diego-county-places-spend.parquet' # added based on joined output

## Load Data

In [3]:
r1 = duckdb.read_parquet(SPEND_PLACES_PATH)
df = r1.to_df()
df.head()

Unnamed: 0,CITY,LATITUDE,LONGITUDE,PARENT_PLACEKEY,PLACEKEY,POLYGON_WKT,POSTAL_CODE,REGION,SAFEGRAPH_BRAND_IDS,STREET_ADDRESS,...,SPEND_BY_TRANSACTION_INTERMEDIARY,SPEND_DATE_RANGE_END,SPEND_DATE_RANGE_START,SPEND_PCT_CHANGE_VS_PREV_MONTH,SPEND_PCT_CHANGE_VS_PREV_YEAR,SPEND_PER_TRANSACTION_BY_DAY,SPEND_PER_TRANSACTION_PERCENTILES,SUB_CATEGORY,TOP_CATEGORY,TRANSACTION_INTERMEDIARY
0,San Diego,32.755488,-117.107622,,22j-222@5z5-qdx-6zf,POLYGON ((-117.10754558444023 32.7555074919251...,92105.0,CA,SG_BRAND_1f2a4b730dc533366115ba7f5168b074,4090 El Cajon Blvd Ste C,...,"{""key_value"":[{""key"":""No intermediary"",""value""...",2025-08-01,2025-07-01,-45.0,,"[null,null,null,null,null,null,34.77,null,null...","{""25"":16.91,""75"":47.13}",Limited-Service Restaurants,Restaurants and Other Eating Places,"{""key_value"":[{""key"":""No intermediary"",""value""..."
1,San Diego,32.762727,-117.131964,,22m-222@5z5-qdy-d9z,POLYGON ((-117.13201920422857 32.7627738709169...,92116.0,CA,,2873 Adams Ave,...,"{""key_value"":[{""key"":""Square"",""value"":203.92}]}",2025-08-01,2025-07-01,9.0,318.0,"[null,11.5,null,15.24,null,null,27.6,null,null...","{""25"":10.49,""75"":40.66}",Snack and Nonalcoholic Beverage Bars,Restaurants and Other Eating Places,"{""key_value"":[{""key"":""Square"",""value"":10}]}"
2,San Diego,32.889963,-117.17993,,22r-222@5z5-px8-v75,POLYGON ((-117.18009507918524 32.8901942559195...,92121.0,CA,,6364 Ferris Sq,...,"{""key_value"":[{""key"":""No intermediary"",""value""...",2025-08-01,2025-07-01,-55.0,-19.0,"[null,null,null,26.38,null,5,null,null,null,24...","{""25"":13.61,""75"":47.7}",,"Advertising, Public Relations, and Related Ser...","{""key_value"":[{""key"":""No intermediary"",""value""..."
3,La Jolla,32.853546,-117.254233,zzy-222@5z5-pmy-r49,22t-222@5z5-pmy-mzf,POLYGON ((-117.25400457128175 32.8536535551412...,92037.0,CA,,2261 Avenida de la Playa,...,"{""key_value"":[{""key"":""Shopify"",""value"":421.4}]}",2025-08-01,2025-07-01,235.0,30.0,"[null,null,31.25,null,null,null,53.88,null,53....","{""25"":35.53,""75"":70.99}",Sporting Goods Stores,"Sporting Goods, Hobby, and Musical Instrument ...","{""key_value"":[{""key"":""Shopify"",""value"":7}]}"
4,San Ysidro,32.545093,-117.038717,,234-222@5z5-wp2-26k,POLYGON ((-117.0387910794577 32.54518979730449...,92173.0,CA,SG_BRAND_5179b21fc1d50950b99b4eecaa48c614,4449 Camino de la Plz,...,"{""key_value"":[{""key"":""No intermediary"",""value""...",2025-08-01,2025-07-01,23.0,0.0,"[9.04,3.88,9.21,12.28,19.16,21.02,11.08,28.73,...","{""25"":5.39,""75"":18.11}",Limited-Service Restaurants,Restaurants and Other Eating Places,"{""key_value"":[{""key"":""No intermediary"",""value""..."


In [4]:
def parse_json_to_tuple(json_str: str) -> list[tuple[str, int]]:
    if not json_str:
        return None
    try:
        data_dict = json.loads(json_str)

        # Handle the nested structure
        if isinstance(data_dict, dict) and "key_value" in data_dict:
            return [(item["key"], item["value"]) for item in data_dict["key_value"]]

        # If it's already a flat dict
        elif isinstance(data_dict, dict):
            return [(k, v) for k, v in data_dict.items()]

        # If it's a list of dicts already
        elif isinstance(data_dict, list):
            return [(item["key"], item["value"]) for item in data_dict]

        return None
    except (ValueError, SyntaxError, KeyError, TypeError):
        return None

## Parse Related Cross Shopping Columns

In [5]:
df['parsed_local_brands'] = df[X_SHOP_LOCAL_BRANDS_PCT].apply(parse_json_to_tuple)
df['parsed_online_brands'] = df[X_SHOP_ONLINE_BRANDS_PCT].apply(parse_json_to_tuple)
df['parsed_physical_brands'] = df[X_SHOP_PHYSICAL_BRANDS_PCT].apply(parse_json_to_tuple)
df['parsed_same_category_brands'] = df[X_SHOP_SAME_CATEGORY_BRANDS_PCT].apply(parse_json_to_tuple)

## Construct the graph

In [6]:
graphs = {
    "local": nx.DiGraph(),
    "online": nx.DiGraph(),
    "physical": nx.DiGraph(),
    "same_category": nx.DiGraph(),
}

df["brand_sets"] = df.apply(lambda row: {
    "local":        row["parsed_local_brands"],
    "online":       row["parsed_online_brands"],
    "physical":     row["parsed_physical_brands"],
    "same_category":row["parsed_same_category_brands"],
}, axis=1)

def add_nodes_and_edges(row):
    # shared node attributes
    location = row[LOCATION_NAME]
    category = row[TOP_CATEGORY]
    raw_num_customers = row[RAW_NUM_CUSTOMERS]
    latitude = row["LATITUDE"]
    longitude = row["LONGITUDE"]

    # For each graph, add this node
    for g in graphs.values():
        g.add_node(
            location,
            label=location,
            category=category,
            num_customers=raw_num_customers,
            latitude=latitude,
            longitude=longitude,
        )

    # For each (graph_name, brand_list) pair in this row
    for graph_name, brand_list in row["brand_sets"].items():
        if not brand_list:
            continue

        for brand, pct in brand_list:
            weight = pct / 100
            if weight > 0.01:
                graphs[graph_name].add_edge(location, brand, weight=weight)

df.apply(add_nodes_and_edges, axis=1)

0       None
1       None
2       None
3       None
4       None
        ... 
8532    None
8533    None
8534    None
8535    None
8536    None
Length: 8537, dtype: object

In [None]:
# check graph nodes and edge counts
print('Local: ', graphs['local'].number_of_nodes(), graphs['local'].number_of_edges())
print('Online: ', graphs['online'].number_of_nodes(), graphs['online'].number_of_edges())
print('Physical: ', graphs['physical'].number_of_nodes(), graphs['physical'].number_of_edges())
print('Same Category: ', graphs['same_category'].number_of_nodes(), graphs['same_category'].number_of_edges())

Local Graph:  4527 45689
Online Graph:  4834 91328
Physical Graph:  5674 103858
Same Category Graph:  5342 22337


## Export Graph to GEXF

In [None]:
# export the graph to a GEXF file for visualization in Gephi
from datetime import datetime

timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_file = f"graph_{timestamp}.gexf"
nx.write_gexf(G, output_file)

In [None]:
print(len(list(G.nodes)))
print(len(list(G.edges)))
#print(len(list(nx.connected_components(G)))) # only works for undirected graphs

## Clustering

In [None]:
clusters = nx.clustering(G)
sorted(clusters.items(), key=lambda item: item[1], reverse=True)

## Isolated Nodes

In [None]:
# find isolated nodes
isolated_nodes = list(nx.isolates(G))
print(f"Number of isolated nodes: {len(isolated_nodes)}")
print("Isolated nodes:", isolated_nodes)


## Degree Analysis

In [None]:
# find most influential nodes using out_degree 
in_degrees = dict(G.in_degree())
out_degrees = dict(G.out_degree())

plt.scatter(in_degrees.values(), out_degrees.values())
plt.xlabel("In-degree")
plt.ylabel("Out-degree")
plt.title("In vs Out Degree Distribution")
plt.show()

In [None]:
# find the 10 nodes with the most out_degree
top_out_degree = sorted(out_degrees.items(), key=lambda item: item[1], reverse=True)[:10]
print("Top 10 nodes by out-degree:")
for node, degree in top_out_degree:
    print(f"{node}: {degree}")  

# find the 10 nodes with the most in_degree
top_in_degree = sorted(in_degrees.items(), key=lambda item: item[1], reverse=True)[:10]
print("\nTop 10 nodes by in-degree:")
for node, degree in top_in_degree:
    print(f"{node}: {degree}")  

## 2nd Degree Neighbors

In [None]:
G.nodes["ALDI"]

In [None]:
node = "ALDI" # node of interest

# First-degree neighbors
first_deg = set(G.neighbors(node))

# Second-degree = neighbors of neighbors, excluding self and first-degree
second_deg = set()
for n in first_deg:
    second_deg.update(G.neighbors(n))
second_deg -= first_deg
second_deg.discard(node)

print(f"Node of interest: {node}")
print("Number of first-degree neighbors:", len(first_deg))
print("First-degree:", first_deg)
print("Number of second-degree neighbors:", len(second_deg))
print("Second-degree:", second_deg)

### Include distance as a filter

We will expand this analysis by applying a function that allows filtering based on distance for a given node or location. We use geopy for its accuracy but haversine distance can also be used.

In [None]:
from geopy.distance import great_circle

In [None]:
# Look at known locations in Encinitas for testing
df[['LATITUDE', 'LONGITUDE', 'LOCATION_NAME', 'POSTAL_CODE']].loc[df['POSTAL_CODE'] == 92024].head(20)

In [None]:
place_1 = df.iloc[787]
place_2 = df.iloc[288]

print(place_1['LOCATION_NAME'])
print(place_2['LOCATION_NAME'])

In [None]:
# Get the latitude and longitude for the two locations
place_1_lat_long = (place_1['LATITUDE'], place_1['LONGITUDE'])
place_2_lat_long = (place_2['LATITUDE'], place_2['LONGITUDE'])

# Print the distance between the two locations, this is correct
print(great_circle(place_1_lat_long, place_2_lat_long).miles)

In [None]:
def get_nodes_within_radius(G, center_node, radius_mi):
    """
    Get all nodes within a given radius (in km) of a center node.
    Uses geodesic distance for accuracy.
    """
    if center_node not in G:
        return []
    
    center_lat = G.nodes[center_node].get('latitude')
    center_lon = G.nodes[center_node].get('longitude')
    
    if center_lat is None or center_lon is None:
        print(f"Warning: {center_node} does not have latitude/longitude data")
        return []
    
    center_point = (center_lat, center_lon)
    nodes_within_radius = []
    
    for node in G.nodes():
        node_lat = G.nodes[node].get('latitude')
        node_lon = G.nodes[node].get('longitude')
        
        if node_lat is None or node_lon is None:
            continue
        
        node_point = (node_lat, node_lon)
        distance = great_circle(center_point, node_point).miles
        
        if distance <= radius_mi:
            nodes_within_radius.append((node, distance))
    
    nodes_within_radius.sort(key=lambda x: x[1])
    return nodes_within_radius

In [None]:
get_nodes_within_radius(G, "Target", 1)

### Now we will map to show available locations and test distance
This evaluation shows examples of polygons available in our dataset and how down stream analysis can be used by stakeholders to make more accurate decisons with accurate visualizations 

In [None]:
df_enc = df.loc[df['POSTAL_CODE'] == 92024]

In [None]:
# Convert POLYGON_WKT to geometry
df_enc['geometry'] = df_enc['POLYGON_WKT'].apply(lambda x: wkt.loads(x) if pd.notna(x) else None)

# Create GeoDataFrame for plotting
gdf = gpd.GeoDataFrame(df_enc, geometry='geometry', crs='EPSG:4326')

In [None]:
# Create map, centered on Encinitas for my test
m = folium.Map(location=[33.036986, -117.292447], zoom_start=10)

In [None]:
# Now I will add polygons to my encinitas map
folium.GeoJson(
    gdf[['LOCATION_NAME', 'TOP_CATEGORY', 'geometry']].head(1000).to_json(), 
    style_function=lambda feature: {
        'fillColor': 'lightblue',
        'color': 'blue',
        'weight': 1,
        'fillOpacity': 0.5,
    },
    tooltip=folium.GeoJsonTooltip(fields=['LOCATION_NAME'], aliases=['Location:']),
    popup=folium.GeoJsonPopup(fields=['LOCATION_NAME', 'TOP_CATEGORY'])
).add_to(m)

# Add a marker for the two locations I used before and draw a line for distance check
folium.Marker(
    location=place_1_lat_long,
    popup=place_1['LOCATION_NAME'],
    tooltip=place_1['LOCATION_NAME']
).add_to(m)

folium.Marker(
    location=place_2_lat_long,
    popup=place_2['LOCATION_NAME'],
    tooltip=place_2['LOCATION_NAME']
).add_to(m)

folium.PolyLine(
    locations=[place_1_lat_long, place_2_lat_long],
    weight=2,
    color='red'
).add_to(m)

m.save('polygon_map.html')
m