In [1]:
import pandas as pd
from geopy.distance import geodesic
import numpy as np

In [2]:
# Load only the columns needed for the proximity analysis.
df = pd.read_parquet(
    "../data/iowa_liquor_2023_2025.parquet",
    columns=["store", "store_location", "county"],
)
# Discard rows without a location first, then keep the first remaining row
# for each store so that every store appears exactly once.
df = df.dropna(subset=["store_location"])
df = df.drop_duplicates(subset=["store"])
df.head()

Unnamed: 0,store,store_location,county
0,4829,"{'coordinates': [-93.61378, 41.60575], 'type':...",POLK
30,2190,"{'coordinates': [-93.61979, 41.60558], 'type':...",POLK
80,2666,"{'coordinates': [-93.62173, 41.70471], 'type':...",POLK
100,2699,"{'coordinates': [-93.62362, 41.70324], 'type':...",POLK
118,6245,"{'coordinates': [-93.58545, 41.73189], 'type':...",POLK


In [3]:
from math import radians

# Pull longitude (index 0) and latitude (index 1) out of each location's
# 'coordinates' pair, leaving None where the location is missing.
df["lon"] = df["store_location"].map(
    lambda loc: loc["coordinates"][0] if not pd.isna(loc) else None
)
df["lat"] = df["store_location"].map(
    lambda loc: loc["coordinates"][1] if not pd.isna(loc) else None
)

# Overwrite both columns with their radian equivalents; from here on the
# 'lat' and 'lon' columns hold radians, not degrees.
df["lat"] = df["lat"].astype(float).map(radians)
df["lon"] = df["lon"].astype(float).map(radians)

# The raw location mapping is no longer needed once the coordinates are split out.
df = df.drop(columns=["store_location"])

In [4]:
from sklearn.metrics.pairwise import haversine_distances

# Pairwise great-circle distances between every pair of stores. The columns
# are passed in [lat, lon] order and are already in radians; the result is a
# square matrix of unit-sphere distances that still needs scaling by a radius.
result = haversine_distances(df.loc[:, ["lat", "lon"]])

In [5]:
# Scale the unit-sphere distances by 3959 (roughly Earth's mean radius in
# miles) so that the matrix is expressed in miles.
pairwise_distance_miles = 3959 * result

In [6]:
# Blank out the diagonal (each store's zero distance to itself) so a store is
# never treated as its own neighbour in the counts and minima computed later.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0 and raises
# AttributeError there.
np.fill_diagonal(pairwise_distance_miles, np.nan)
pairwise_distance_miles

array([[         nan,   0.3107378 ,   6.8502037 , ..., 113.4519488 ,
        108.98183187, 106.56962077],
       [  0.3107378 ,          nan,   6.85037582, ..., 113.76120373,
        109.28296777, 106.87041282],
       [  6.8502037 ,   6.85037582,          nan, ..., 114.4741194 ,
        107.64408286, 105.20647938],
       ...,
       [113.4519488 , 113.76120373, 114.4741194 , ...,          nan,
         38.69575115,  39.10250382],
       [108.98183187, 109.28296777, 107.64408286, ...,  38.69575115,
                 nan,   2.45965014],
       [106.56962077, 106.87041282, 105.20647938, ...,  39.10250382,
          2.45965014,          nan]])

In [7]:
# Count, per store, how many other stores lie closer than 5 miles. The NaN
# diagonal compares as False, so a store never counts itself.
df["# of stores within 5 mile radius"] = np.sum(pairwise_distance_miles < 5, axis=1)


In [8]:
# Distance to the closest other store: row-wise minimum that ignores the NaN
# placed on the diagonal.
df["Nearest other store (mi)"] = np.nanmin(pairwise_distance_miles, axis=1)


In [9]:
# Preview the first five stores together with their new proximity columns.
df.head(5)

Unnamed: 0,store,county,lon,lat,# of stores within 5 mile radius,Nearest other store (mi)
0,4829,POLK,-1.633869,0.726157,87,0.310738
30,2190,POLK,-1.633974,0.726154,98,0.024184
80,2666,POLK,-1.634007,0.727885,63,0.140796
100,2699,POLK,-1.63404,0.727859,64,0.140796
118,6245,POLK,-1.633374,0.728359,46,0.043836


In [10]:
# Write the per-store proximity features to proximity.csv in the current
# working directory.
# NOTE(review): the DataFrame index is written as the first, unnamed CSV
# column, and the lat/lon columns are stored in radians - confirm that
# downstream consumers expect both.
df.to_csv(path_or_buf="proximity.csv")