In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
import folium
from sklearn.cluster import KMeans

In [None]:
# Load the raw store-openings table. The path is relative, so it resolves
# against the directory the notebook kernel is running in.
path = Path('../data/walmart_store_openings.csv')
df = pd.read_csv(path)
# Preview the first rows to confirm the file parsed; later cells read the
# LAT, LON, storenum, STRCITY, STRSTATE and type_store columns from it.
df.head()

In [None]:
# Basic cleaning: coerce the coordinates to numbers, discard rows without a
# usable position, and keep only points inside a loose box around the US.
geo = (
    df.assign(
        # Values that cannot be parsed become NaN and are dropped just below.
        LAT=lambda t: pd.to_numeric(t['LAT'], errors='coerce'),
        LON=lambda t: pd.to_numeric(t['LON'], errors='coerce'),
    )
    .dropna(subset=['LAT', 'LON'])
    # US-ish bounds
    .loc[lambda t: t['LAT'].between(10, 75) & t['LON'].between(-170, -50)]
)
geo.shape

## Candidate site selection via clustering
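We cluster store locations with k-means on their raw latitude/longitude and, for each cluster, pick the observed store nearest to the cluster centroid. Distances are measured in degrees, so they only approximate true ground distance.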

This is a simple proxy for choosing a small number of hub/ops sites to cover a network.

In [None]:
# Choose number of candidate sites
k = 8

# Feature matrix for clustering: one (LAT, LON) row per cleaned store.
coords = geo[['LAT', 'LON']].to_numpy()

# A fixed random_state keeps the cluster assignment reproducible between runs.
kmeans = KMeans(n_clusters=k, random_state=42, n_init='auto').fit(coords)
labels = kmeans.labels_
centroids = kmeans.cluster_centers_

# Tag every store with the id of the cluster it was assigned to.
geo = geo.assign(cluster=labels)

# One row per centroid, keyed by the same cluster ids used in `geo`.
centroids_df = pd.DataFrame(
    {'LAT': centroids[:, 0], 'LON': centroids[:, 1], 'cluster': range(k)}
)
centroids_df.head()

In [None]:
# Replace each raw centroid with the closest real store from its own cluster,
# since an actual store location is a more realistic candidate site.
nearest_rows = []
for cluster_id in range(k):
    lat0, lon0 = centroids[cluster_id][0], centroids[cluster_id][1]
    members = geo[geo['cluster'] == cluster_id]
    # Squared distance in degree space; the minimiser is the same as for the
    # un-squared distance, so the square root is skipped.
    sq_dist = (members['LAT'] - lat0).pow(2) + (members['LON'] - lon0).pow(2)
    nearest_rows.append(members.loc[sq_dist.idxmin()])

# One row per cluster, in cluster-id order, with a fresh 0..k-1 index.
candidates = pd.DataFrame(nearest_rows).reset_index(drop=True)
candidates[['storenum','STRCITY','STRSTATE','type_store','LAT','LON','cluster']].head(k)

In [None]:
# Map
center = [float(candidates['LAT'].mean()), float(candidates['LON'].mean())]
m = folium.Map(location=center, zoom_start=4, tiles='CartoDB positron')

# Candidate sites
for _, r in candidates.iterrows():
    folium.CircleMarker(
        location=[r['LAT'], r['LON']],
        radius=7,
        color='red',
        fill=True,
        fill_opacity=0.8,
        popup=f
