In [None]:
import pandas as pd
import geopy
from geopy.geocoders import Nominatim
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
from scipy.spatial.distance import cdist
import logging
import time

In [None]:
# Initialize the logger.
# logging.getLogger returns the same object for the same name every time, so
# re-running this cell used to stack one more StreamHandler on it and print
# each message once per run. The handler is now attached only when the logger
# has none yet.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
stream_handler = logging.StreamHandler()
stream_handler.setFormatter(formatter)
if not logger.handlers:
    logger.addHandler(stream_handler)

In [None]:
# Read the raw address records from disk into a DataFrame
input_csv = 'data.csv'
data = pd.read_csv(input_csv)

In [None]:
# Preview the first five rows to confirm the file loaded as expected
data.head(n=5)

In [None]:
# Column names as they appear in your CSV header -- edit these to match.
address_column, city_column, state_column = 'Address', 'City', 'State'
zip_column, country_column = 'Zip5', 'Country'
# Names of the coordinate columns, if the file already carries them.
latitude_column, longitude_column = 'Latitude', 'Longitude'

In [None]:
# Build one free-text address per row for the geocoder, unless the data
# already provides a 'full_address' column.
if 'full_address' not in data.columns:
    # NOTE(review): astype(str) cannot restore leading zeros if the zip column
    # was parsed as a number when the CSV was loaded -- confirm its dtype.
    data['full_address'] = (
        data[address_column] + ', ' + data[city_column] + ', '
        + data[state_column] + ', ' + data[zip_column].astype(str)
    )

# If there is no country column, default to 'US', stored under the configured
# column name.
if country_column not in data.columns:
    data[country_column] = 'US'

# Append the country exactly once per row. Rows whose address already ends
# with ", <country>" are left alone, so re-running this cell no longer yields
# "..., US, US", and a defaulted country gives the same result on the first run
# as on any later run.
country_suffix = ', ' + data[country_column]
already_suffixed = pd.Series(
    [
        isinstance(address, str) and isinstance(suffix, str) and address.endswith(suffix)
        for address, suffix in zip(data['full_address'], country_suffix)
    ],
    index=data.index,
    dtype=bool,
)
data['full_address'] = data['full_address'].where(
    already_suffixed, data['full_address'] + country_suffix
)


In [None]:
# Inspect the first five rows again, now including the derived columns
data.head(n=5)


In [None]:
# Create the shared Nominatim client that geocode_address sends its queries to
nominatim_user_agent = "my_app"
geocoder = Nominatim(user_agent=nominatim_user_agent)

def geocode_address(address):
    """Geocode one free-text address with the module-level ``geocoder``.

    Blank or non-string input (for example the NaN that string concatenation
    produces when an address component is missing) is skipped without calling
    the geocoder; previously such values were passed on as if they were a
    real query.

    The lookup is attempted up to three times. Timeouts and
    service-unavailable errors are logged and retried after a one second
    pause; any other exception propagates to the caller.

    Args:
        address: Address text to look up.

    Returns:
        The result of ``geocoder.geocode(address)``, or ``None`` when the
        input was skipped or every attempt failed.
    """
    if not isinstance(address, str) or not address.strip():
        logger.warning(f"Skipping geocoding for empty or non-string address: {address!r}")
        return None

    retries = 3
    for attempt in range(1, retries + 1):
        try:
            logger.info(f"Geocoding address: {address}")
            result = geocoder.geocode(address)
            logger.info(f"Geocoded address: {result}")
            return result
        except (geopy.exc.GeocoderTimedOut, geopy.exc.GeocoderUnavailable):
            if attempt == retries:
                logger.warning(f"Failed to geocode address: {address}")
                return None
            logger.warning(f"Geocoding attempt {attempt} failed for address: {address}")
            time.sleep(1)
    # Not reachable with retries >= 1; keeps the return value explicit.
    return None

In [None]:
# Only needs to run if there is no geocoding already.
# `Series.empty` is True only for a zero-row Series, and indexing a column
# that does not exist raises KeyError, so the check below instead asks:
# are both coordinate columns present, and does each hold at least one value?
existing_columns = data.columns
has_coordinates = (
    latitude_column in existing_columns
    and longitude_column in existing_columns
    and bool(data[latitude_column].notna().any())
    and bool(data[longitude_column].notna().any())
)

# Value stored when an address could not be geocoded.
MISSING_COORDINATE = -999


def _geocode_throttled(address):
    """Geocode one address, then pause one second before the next request."""
    location = geocode_address(address)
    # NOTE(review): the public Nominatim service asks for at most one request
    # per second -- adjust the pause if you point at a private instance.
    time.sleep(1)
    return location


if not has_coordinates:
    # Geocode the addresses
    data['geocoded_address'] = data['full_address'].apply(_geocode_throttled)

    # Extract the coordinates into the configured columns
    data[latitude_column] = data['geocoded_address'].apply(
        lambda location: location.latitude if location is not None else MISSING_COORDINATE
    )
    data[longitude_column] = data['geocoded_address'].apply(
        lambda location: location.longitude if location is not None else MISSING_COORDINATE
    )

    # Report failures so the sentinel rows are not silently treated as real
    # coordinates further down.
    failed_count = int(data['geocoded_address'].isna().sum())
    if failed_count:
        logger.warning(
            f"{failed_count} of {len(data)} addresses could not be geocoded; "
            f"their coordinates are set to {MISSING_COORDINATE}"
        )
else:
    logger.info("Latitude and longitude columns already exist. Skipping geocoding step.")

In [None]:
# Preprocess the data: min-max scale the two coordinate columns.
# The columns are looked up through the configured names rather than the
# literals 'Latitude'/'Longitude', so a CSV with differently named coordinate
# columns works once the config cell is edited.
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(data[[latitude_column, longitude_column]])

In [None]:
# Determine the optimal number of clusters using elbow analysis
def _find_elbow(k_values, distortions):
    """Return the k at the bend of the distortion curve.

    Both axes are rescaled to [0, 1]; the elbow is the point farthest from the
    straight line joining the first and last points. With fewer than three
    points, or a flat curve, there is no bend and the first k is returned.
    """
    if not k_values or len(k_values) != len(distortions):
        raise ValueError("k_values and distortions must be non-empty and equally long")

    k_span = k_values[-1] - k_values[0]
    lowest = min(distortions)
    distortion_span = max(distortions) - lowest
    if len(k_values) < 3 or k_span == 0 or distortion_span == 0:
        return k_values[0]

    xs = [(k - k_values[0]) / k_span for k in k_values]
    ys = [(value - lowest) / distortion_span for value in distortions]
    x0, y0, x1, y1 = xs[0], ys[0], xs[-1], ys[-1]
    # Numerator of the point-to-line distance; the denominator is the same for
    # every point, so it does not affect which point is farthest.
    gaps = [
        abs((y1 - y0) * x - (x1 - x0) * y + x1 * y0 - y1 * x0)
        for x, y in zip(xs, ys)
    ]
    return k_values[gaps.index(max(gaps))]


candidate_ks = list(range(1, 10))
sum_of_squared_distances = []
for k in candidate_ks:
    # random_state and n_init are pinned so the curve is reproducible and does
    # not shift with the library's default for n_init.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(scaled_data)
    centroid_distances = cdist(scaled_data, kmeans.cluster_centers_)
    # Each row holds one point's distance to every centroid; only the nearest
    # one measures how well the point is covered. Averaging the whole matrix
    # (as before) mixes in distances to unrelated clusters.
    sum_of_squared_distances.append(float(centroid_distances.min(axis=1).mean()))

# Get the elbow point. The curve's minimum is not the elbow: distortion keeps
# shrinking as k grows, so the minimum is normally just the largest k tried.
elbow_point = _find_elbow(candidate_ks, sum_of_squared_distances)

In [None]:
# Visualize the elbow point analysis
import matplotlib.pyplot as plt

# The elbow loop starts at k=1 and appends one value per k, so the x-axis is
# derived from the number of results rather than a second hard-coded range
# that has to be kept in sync by hand.
elbow_ks = range(1, len(sum_of_squared_distances) + 1)

elbow_fig, elbow_ax = plt.subplots()
elbow_ax.plot(elbow_ks, sum_of_squared_distances, 'bx-')
elbow_ax.set_xlabel('k')
elbow_ax.set_ylabel('Average distance to centroid')
elbow_ax.set_title('Elbow Method For Optimal k')
plt.show()

In [None]:
# Manual override: this cell always replaces the computed elbow with the
# cluster count below. Change the number, or skip the cell, to taste.
manual_cluster_count = 9
elbow_point = manual_cluster_count

In [None]:
# Perform k-means clustering with the chosen number of clusters.
# random_state and n_init are pinned so that a Restart & Run All assigns the
# same cluster label to the same row every time; unseeded, the labels written
# to the output CSV change from run to run.
kmeans = KMeans(n_clusters=elbow_point, n_init=10, random_state=42)
kmeans.fit(scaled_data)
labels = kmeans.labels_

# Add the cluster labels to the dataset
data['cluster'] = labels

In [None]:
# Visualize the clusters using plotly
import plotly.express as px

# Rows carrying the -999 "could not geocode" sentinel are not real coordinates;
# leave them off the map so they are not drawn as points.
mappable_rows = data[(data[latitude_column] != -999) & (data[longitude_column] != -999)]

# Coordinate columns come from the config cell rather than literal names.
fig = px.scatter_mapbox(
    mappable_rows,
    lat=latitude_column,
    lon=longitude_column,
    color="cluster",
    zoom=10,
    height=800,
)
fig.update_layout(mapbox_style="open-street-map")
fig.show()

In [None]:
# Write the enriched dataset (original columns plus the new ones) to disk,
# leaving the row index out of the file
output_csv = 'data_with_clusters.csv'
data.to_csv(output_csv, index=False)