In [9]:
import pandas as pd

In [16]:
import numpy as np
import folium

# --- Synthetic data configuration -------------------------------------------
N_CLUSTERS = 5            # number of synthetic cluster centres
POINTS_PER_CLUSTER = 40   # points sampled around each centre
CLUSTER_SPREAD = 0.05     # std-dev of the Gaussian scatter, applied to lat and lon alike

# Bounding box (min, max) from which the cluster centres are drawn.
FINLAND_LAT_RANGE = (60.0, 62.1)
FINLAND_LON_RANGE = (21.5, 27.5)

# Seed the global NumPy RNG so that re-running this cell reproduces the same points.
np.random.seed(42)
centers = pd.DataFrame({
    "cluster_id": range(N_CLUSTERS),
    "lat": np.random.uniform(*FINLAND_LAT_RANGE, N_CLUSTERS),
    "lon": np.random.uniform(*FINLAND_LON_RANGE, N_CLUSTERS),
})

# Scatter POINTS_PER_CLUSTER points around every centre.
# itertuples() is used instead of iterrows(): iterrows() packs each row into a
# float Series, which silently turns the integer cluster_id into 0.0, 1.0, ...
# The order of the random draws (lats first, then lons, per cluster) is kept so
# the generated coordinates are unchanged.
point_records = []
for center in centers.itertuples(index=False):
    lats = np.random.normal(center.lat, CLUSTER_SPREAD, POINTS_PER_CLUSTER)
    lons = np.random.normal(center.lon, CLUSTER_SPREAD, POINTS_PER_CLUSTER)
    point_records.extend(
        {"lat": lat, "lon": lon, "cluster_true": center.cluster_id}
        for lat, lon in zip(lats, lons)
    )

# One row per point: coordinates plus the integer id of the generating cluster.
points = pd.DataFrame(point_records)


print(points.head())


# Centre the map on the data itself rather than on a hard-coded coordinate, so
# the view follows the points whenever the sampling ranges are changed.
map_center = [float(points["lat"].mean()), float(points["lon"].mean())]
m = folium.Map(location=map_center, zoom_start=5)
colors = ['red', 'blue', 'green', 'purple', 'orange', 'brown', 'pink', 'gray', 'black', 'cyan']

# One marker per point, coloured by its generating cluster.
for point in points.itertuples(index=False):
    folium.CircleMarker(
        location=[point.lat, point.lon],
        radius=4,
        # int() guards against float-typed labels; the modulo wraps around the
        # palette when there are more clusters than colours.
        color=colors[int(point.cluster_true) % len(colors)],
        fill=True,
        fill_opacity=0.7,
    ).add_to(m)

# Zoom the viewport to the bounding box of the points ([south-west], [north-east]).
m.fit_bounds([
    [float(points["lat"].min()), float(points["lon"].min())],
    [float(points["lat"].max()), float(points["lon"].max())],
])

m



         lat        lon  cluster_true
0  60.763061  22.453148           0.0
1  60.813662  22.347815           0.0
2  60.763363  22.452171           0.0
3  60.763248  22.416713           0.0
4  60.798632  22.402121           0.0


In [14]:
import numpy as np
import pandas as pd
from geopy.distance import great_circle
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score

# Column order of the per-cluster summary table; also used to build an empty
# table with the right columns when the input contains no clusters.
_CLUSTER_SUMMARY_COLUMNS = (
    "cluster_id",
    "n_points",
    "centroid_lat",
    "centroid_lon",
    "avg_distance_to_centroid_km",
    "max_distance_to_centroid_km",
    "std_distance_km",
    "mean_pairwise_distance_km",
    "density_index",
)

# Keeps density_index finite when the mean pairwise distance is zero
# (single-point clusters or coincident points).
_DENSITY_EPSILON = 1e-6


def _safe_score(metric, coords, labels):
    """Return ``metric(coords, labels)``, or NaN if the metric rejects the input with ValueError."""
    try:
        return metric(coords, labels)
    except ValueError:
        # Raised e.g. when there are too few (or too many) distinct labels.
        # Anything else (NameError, TypeError, ...) is a real bug and should surface.
        return np.nan


def _summarize_cluster(cid, group):
    """Build the summary row (great-circle distances in km) for the points of one cluster."""
    # Extract the coordinates once; per-row .iloc lookups inside the O(n^2)
    # pairwise loop are by far the most expensive part otherwise.
    members = list(zip(group["lat"].tolist(), group["lon"].tolist()))
    n_points = len(members)

    # Centroid is the plain arithmetic mean of latitudes and longitudes.
    centroid = (group["lat"].mean(), group["lon"].mean())
    centroid_dists = [great_circle(centroid, member).km for member in members]

    # Every unordered pair of points exactly once.
    pairwise = [
        great_circle(first, second).km
        for idx, first in enumerate(members)
        for second in members[idx + 1:]
    ]
    mean_pairwise = np.mean(pairwise) if pairwise else 0

    return {
        "cluster_id": int(cid),
        "n_points": n_points,
        "centroid_lat": centroid[0],
        "centroid_lon": centroid[1],
        "avg_distance_to_centroid_km": np.mean(centroid_dists),
        "max_distance_to_centroid_km": np.max(centroid_dists),
        "std_distance_km": np.std(centroid_dists),
        "mean_pairwise_distance_km": mean_pairwise,
        "density_index": n_points / (mean_pairwise + _DENSITY_EPSILON),
    }


def cluster_quality_report(df, label_col="cluster_true"):
    """
    Compute geometric and operational cluster quality metrics for logistics assessment.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'lat'``, ``'lon'`` and ``label_col``.
    label_col : str, default "cluster_true"
        Name of the column holding the cluster label of each point. Labels
        must be convertible with ``int()``.

    Returns
    -------
    cluster_df : pandas.DataFrame
        One row per cluster: size, centroid, average / maximum / standard
        deviation of the great-circle distance to the centroid (km), mean
        pairwise great-circle distance (km) and ``density_index``
        (points per km of mean pairwise distance).
    global_scores : dict
        ``Silhouette``, ``Davies_Bouldin`` and ``Calinski_Harabasz`` scores
        (NaN when a score cannot be computed for the given labels),
        ``Cluster_Count``, and the standard deviation across clusters of the
        cluster size (``Load_Balance_STD``) and of the average distance to the
        centroid (``Spatial_Balance_STD``).

    Side effects
    ------------
    Prints two headings and renders both tables with ``display``.
    """
    coords = df[['lat', 'lon']].to_numpy()
    labels = df[label_col].to_numpy()

    # NOTE(review): these three scores are computed on the raw lat/lon values,
    # whereas the per-cluster statistics below use great-circle kilometres.
    # Confirm that degree-based scores are acceptable for the intended use.
    global_scores = {
        "Silhouette": _safe_score(silhouette_score, coords, labels),
        "Davies_Bouldin": _safe_score(davies_bouldin_score, coords, labels),
        "Calinski_Harabasz": _safe_score(calinski_harabasz_score, coords, labels),
    }

    summaries = [_summarize_cluster(cid, group) for cid, group in df.groupby(label_col)]
    # Passing the columns explicitly keeps the table well-formed for empty input.
    cluster_df = pd.DataFrame(summaries, columns=list(_CLUSTER_SUMMARY_COLUMNS))

    if cluster_df.empty:
        load_balance = spatial_balance = np.nan
    else:
        # Sample standard deviation across clusters (NaN for a single cluster).
        load_balance = cluster_df["n_points"].std()
        spatial_balance = cluster_df["avg_distance_to_centroid_km"].std()

    global_scores.update({
        "Cluster_Count": len(cluster_df),
        "Load_Balance_STD": load_balance,
        "Spatial_Balance_STD": spatial_balance,
    })

    print("Global Cluster Quality Metrics:")
    display(pd.DataFrame([global_scores]).round(3))

    print("Per-Cluster Logistics Summary:")
    display(cluster_df.round(3))

    return cluster_df, global_scores


In [15]:
# Score the synthetic points against the cluster labels they were generated with.
cluster_report, global_scores = cluster_quality_report(
    df=points,
    label_col="cluster_true",
)


Global Cluster Quality Metrics:


Unnamed: 0,Silhouette,Davies_Bouldin,Calinski_Harabasz,Cluster_Count,Load_Balance_STD,Spatial_Balance_STD
0,0.934,0.091,42782.527,5,0.0,0.551


Per-Cluster Logistics Summary:


Unnamed: 0,cluster_id,n_points,centroid_lat,centroid_lon,avg_distance_to_centroid_km,max_distance_to_centroid_km,std_distance_km,mean_pairwise_distance_km,density_index
0,0,40,60.769,22.437,4.707,12.948,2.872,6.855,5.835
1,1,40,61.997,21.851,4.914,13.637,3.133,7.187,5.566
2,2,40,61.537,26.702,4.856,15.098,2.855,6.991,5.722
3,3,40,61.257,25.118,5.757,18.018,3.755,8.314,4.811
4,4,40,60.325,25.751,4.235,9.048,2.229,6.098,6.56
