In [8]:
import pandas as pd
from geopy.distance import geodesic
import numpy as np

In [10]:
# Read only the store id and its GeoJSON-style location, discard rows that have
# no location, then keep the first remaining row for each store.
df = (
    pd.read_parquet(
        "iowa_liquor_2023_2025.parquet",
        columns=["store", "store_location"],
    )
    .dropna(subset=["store_location"])
    .drop_duplicates(subset=["store"])
)
df.head()

Unnamed: 0,store,store_location
0,4829,"{'coordinates': [-93.61378, 41.60575], 'type':..."
30,2190,"{'coordinates': [-93.61979, 41.60558], 'type':..."
80,2666,"{'coordinates': [-93.62173, 41.70471], 'type':..."
100,2699,"{'coordinates': [-93.62362, 41.70324], 'type':..."
118,6245,"{'coordinates': [-93.58545, 41.73189], 'type':..."


In [11]:
from tqdm import tqdm

In [12]:
# Peek at one raw location record to see its structure before parsing it.
df["store_location"].iloc[1]

{'coordinates': array([-93.61979,  41.60558]), 'type': 'Point'}

In [13]:
from math import radians

# 'coordinates' is ordered [longitude, latitude]. Pull each component into its own
# column and convert it from degrees to radians, which is the unit the haversine
# distance computation works in.
for column, position in (("lon", 0), ("lat", 1)):
    component = df["store_location"].apply(
        lambda point, position=position: None if pd.isna(point) else point["coordinates"][position]
    )
    df[column] = component.astype(float).apply(radians)

# The raw location dicts are no longer needed once lon/lat have been extracted.
df = df.drop(columns=["store_location"])

In [14]:
# Show the parsed frame; 'lon' and 'lat' now hold radians, not degrees.
df

Unnamed: 0,store,lon,lat
0,4829,-1.633869,0.726157
30,2190,-1.633974,0.726154
80,2666,-1.634007,0.727885
100,2699,-1.634040,0.727859
118,6245,-1.633374,0.728359
...,...,...,...
6839222,010647,-1.659711,0.757868
6848633,010652,-1.635614,0.729061
6852936,010639,-1.595679,0.724074
6855968,010657,-1.598323,0.733647


In [15]:
from sklearn.metrics.pairwise import haversine_distances

# haversine_distances expects [latitude, longitude] pairs in radians and returns
# the full store-by-store matrix of great-circle distances on a unit sphere.
result = haversine_distances(df.loc[:, ["lat", "lon"]])

In [16]:
# Scale the unit-sphere distances by Earth's radius (about 3959 miles) to get miles.
EARTH_RADIUS_MILES = 3959
pairwise_distance_miles = EARTH_RADIUS_MILES * result

In [17]:
# Overwrite the zero self-distances on the diagonal with NaN (in place) so that a
# store is never treated as its own neighbour in later comparisons or minimums.
# Use np.nan: the np.NaN alias was removed in NumPy 2.0 and raises AttributeError there.
np.fill_diagonal(pairwise_distance_miles, np.nan)
pairwise_distance_miles

array([[         nan,   0.3107378 ,   6.8502037 , ..., 113.4519488 ,
        108.98183187, 106.56962077],
       [  0.3107378 ,          nan,   6.85037582, ..., 113.76120373,
        109.28296777, 106.87041282],
       [  6.8502037 ,   6.85037582,          nan, ..., 114.4741194 ,
        107.64408286, 105.20647938],
       ...,
       [113.4519488 , 113.76120373, 114.4741194 , ...,          nan,
         38.69575115,  39.10250382],
       [108.98183187, 109.28296777, 107.64408286, ...,  38.69575115,
                 nan,   2.45965014],
       [106.56962077, 106.87041282, 105.20647938, ...,  39.10250382,
          2.45965014,          nan]])

In [18]:
# Per-store count of other stores closer than 5 miles. Comparisons against NaN are
# False, so the NaN diagonal keeps each store from counting itself.
df["# of stores within 5 mile radius"] = np.sum(pairwise_distance_miles < 5, axis=1)


In [19]:
# Distance to the closest other store: row-wise minimum that ignores the NaN diagonal.
df["Nearest other store (mi)"] = np.nanmin(pairwise_distance_miles, axis=1)


In [20]:
# Review the frame now that the two proximity columns have been added.
df

Unnamed: 0,store,lon,lat,# of stores within 5 mile radius,Nearest other store (mi)
0,4829,-1.633869,0.726157,87,0.310738
30,2190,-1.633974,0.726154,98,0.024184
80,2666,-1.634007,0.727885,63,0.140796
100,2699,-1.634040,0.727859,64,0.140796
118,6245,-1.633374,0.728359,46,0.043836
...,...,...,...,...,...
6839222,010647,-1.659711,0.757868,15,0.027613
6848633,010652,-1.635614,0.729061,6,0.018401
6852936,010639,-1.595679,0.724074,1,0.258425
6855968,010657,-1.598323,0.733647,68,0.020202


In [21]:
# Write the per-store proximity table to disk.
# NOTE(review): the index is never reset, so to_csv (index=True by default) writes
# the original row labels as an unnamed first column, and 'lon'/'lat' are exported
# in radians rather than degrees — confirm this is what consumers of the file expect.
df.to_csv("proximity.csv")