# Clustering

## 2.1 K-means

### Préparation

On suit la même démarche que celle utilisée en TP pour tenter un premier clustering avec k-means.

In [None]:
import pandas as pd

# Load the cleaned Flickr dataset and keep only rows that have both coordinates.
df = (
    pd.read_parquet("flickr_data_cleaned.parquet")
    .dropna(subset=["lat", "long"])
    .copy()
)

# K-means is fitted on the two coordinate columns only.
df_kmeans = df.loc[:, ["lat", "long"]].copy()

df_kmeans.head()

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the coordinate columns; the fitted scaler is kept so that
# cluster centers can later be mapped back to the original space.
scaler = StandardScaler()
scaler.fit(df_kmeans)
scaled_data = scaler.transform(df_kmeans)

# Wrap the scaled array in a DataFrame that keeps the original labels.
scaled_data_df = pd.DataFrame(
    data=scaled_data,
    index=df_kmeans.index,
    columns=df_kmeans.columns,
)
scaled_data_df.head()

### K déterminé par la méthode du coude

In [None]:
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

# Elbow method: fit K-means for every candidate k and record the inertia.
k_values = range(1, 15)
inertia_values = []

for k in k_values:
    kmeans = KMeans(n_clusters=k, init="k-means++", random_state=0).fit(scaled_data_df)
    inertia_values.append(kmeans.inertia_)

# Inertia as a function of k; the "elbow" of the curve suggests a value for k.
plt.figure()
plt.plot(list(k_values), inertia_values, marker="o")
plt.xlabel("k")
plt.ylabel("Inertia")
plt.title("Elbow Method")
plt.show()

In [None]:
def run_kmeans_clustering(df, scaled_data_df, scaler, k):
    """Fit K-means with ``k`` clusters and label ``df`` in place.

    Args:
        df: DataFrame that receives a ``cluster_kmeans`` column
            (modified in place).
        scaled_data_df: Scaled features the model is fitted on.
        scaler: Fitted scaler whose ``inverse_transform`` maps the cluster
            centers back to the original coordinate space.
        k: Number of clusters.

    Returns:
        DataFrame with columns ``lat`` and ``long``, one row per cluster
        center, expressed in the original (unscaled) space.
    """
    model = KMeans(n_clusters=k, init="k-means++", random_state=0).fit(scaled_data_df)

    df["cluster_kmeans"] = model.labels_

    # Centers live in scaled space; undo the scaling before returning them.
    return pd.DataFrame(
        scaler.inverse_transform(model.cluster_centers_),
        columns=["lat", "long"],
    )

In [None]:
# Set k explicitly to the value analysed in the text below (k = 5).
# Previously this call relied on the loop variable `k` left over from the
# elbow-method loop, which ends at 14 once `range(1, 15)` is exhausted.
k = 5
centers_df = run_kmeans_clustering(df, scaled_data_df, scaler, k)

In [None]:
import folium

def create_clustering_map(df, cluster_col, centers_df=None, sample_size=30000, zoom_start=12):
    """Build a Folium map of a random sample of points coloured by cluster.

    Args:
        df: DataFrame with ``lat`` and ``long`` columns and the cluster column.
            An optional ``url`` column is used for the popup link.
        cluster_col: Name of the column holding the cluster label.
        centers_df: Optional DataFrame with ``lat``/``long`` columns; each row
            is drawn as a star marker.
        sample_size: Maximum number of points drawn.
        zoom_start: Initial zoom level of the map.

    Returns:
        The ``folium.Map`` with all markers added.
    """
    n_points = min(sample_size, len(df))
    points = df.sample(n=n_points, random_state=0)

    # Colours are reused cyclically when there are more clusters than colours.
    colors = (
        "red", "blue", "green", "purple", "orange",
        "darkred", "lightred", "beige", "darkblue",
        "darkgreen", "cadetblue", "darkpurple",
        "pink", "lightblue", "lightgreen",
        "gray", "black", "lightgray",
    )

    # Centre the view on the median position of the full dataset.
    map_center = [df["lat"].median(), df["long"].median()]
    fmap = folium.Map(location=map_center, zoom_start=zoom_start, tiles="CartoDB positron")

    # One small circle per sampled point, with a popup linking to the photo.
    for _, point in points.iterrows():
        label = int(point[cluster_col])
        link_html = f"""<a href="{point.get('url','')}" target="_blank">Open Flickr</a>"""
        circle = folium.CircleMarker(
            location=[point["lat"], point["long"]],
            radius=2,
            color=colors[label % len(colors)],
            fill=True,
            fill_opacity=0.7,
            popup=folium.Popup(link_html, max_width=250),
        )
        circle.add_to(fmap)

    if centers_df is None:
        return fmap

    # Star marker on every cluster center.
    for idx, center in centers_df.iterrows():
        star = folium.Marker(
            location=[center["lat"], center["long"]],
            icon=folium.Icon(color="darkblue", icon="star"),
            popup=f"Center cluster {idx}",
        )
        star.add_to(fmap)

    return fmap

In [None]:
# Render the K-means result; the map is the value displayed by the cell.
m = create_clustering_map(df, cluster_col="cluster_kmeans", centers_df=centers_df)
m

Le résultat obtenu avec k = 5 n'est pas satisfaisant. La méthode du coude nous donne une valeur de k qui ne produit pas de clustering pertinent.

Dans Lyon, il n'y a pas que 5 zones d'intérêt. Il faudrait essayer avec un k plus grand.

In [None]:
# NOTE(review): this cell repeats the point-drawing loop of
# create_clustering_map, but `sample` and `palette` are local to that function
# and are not defined at notebook level before this cell; running the cells
# top to bottom presumably raises NameError here. Looks like a leftover cell
# that can be removed — confirm.
for _, r in sample.iterrows():
    c = int(r["cluster_kmeans"])
    color = palette[c % len(palette)]
    folium.CircleMarker(
        location=[r["lat"], r["long"]],
        radius=2,
        color=color,
        fill=True,
        fill_opacity=0.7,
        popup=folium.Popup(f"""<a href="{r.get('url','')}" target="_blank">Open Flickr</a>""", max_width=250),
    ).add_to(m)

### Autres valeurs de K

In [None]:
# Retry K-means with larger cluster counts and display one map per value.
for k in (10, 20, 30):
    centers_df = run_kmeans_clustering(df, scaled_data_df, scaler, k=k)
    m = create_clustering_map(df, cluster_col="cluster_kmeans", centers_df=centers_df)
    display(m)

Même avec d'autres valeurs de k, le clustering n'est pas vraiment pertinent : il permet bien de distinguer des zones, mais on ne peut pas dire que ces zones correspondent à des zones d'intérêt réelles, plutôt à de grands arrondissements.

# NOTE(review): stray code sitting between two text paragraphs, without a cell
# marker. It adds a star marker for every row of `centers_df` onto `m` and then
# displays `m`. After the preceding loop, `m` was built by
# create_clustering_map with the same `centers_df`, so these markers are
# presumably duplicates — confirm whether this leftover can be removed.
for i, row in centers_df.iterrows():
    folium.Marker(
        location=[row["lat"], row["long"]],
        icon=folium.Icon(color="darkblue", icon="star"),
        popup=f"Center cluster {i}",
    ).add_to(m)
m
Ainsi, k-means ne semble pas adapté pour ce type de données.

## 2.2 DBSCAN

Suite à l'essai de k-means, on tente un clustering avec DBSCAN.

Le principe de DBSCAN est de regrouper les points denses ensemble, et de considérer les points isolés comme du bruit.

### Préparation

In [None]:
import pandas as pd

# Reload the dataset into a fresh frame and drop rows without coordinates.
df = (
    pd.read_parquet("flickr_data_cleaned.parquet")
    .dropna(subset=["lat", "long"])
    .copy()
)

Conversion des coordonnées en radians (nécessaire pour la distance haversine utilisée par DBSCAN).

In [None]:
import numpy as np

# (lat, long) pairs as a NumPy array, then converted from degrees to radians.
coords = df.loc[:, ["lat", "long"]].to_numpy()
coords_rad = np.deg2rad(coords)

On crée une fonction DBSCAN pour pouvoir l'appliquer avec différents paramètres.

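Remarque : la conversion en radians est nécessaire car la métrique `haversine` de scikit-learn attend des coordonnées (latitude, longitude) exprimées en radians et renvoie des distances en radians. C'est aussi pour cela que `eps`, donné en mètres, est divisé par le rayon de la Terre avant d'être passé à DBSCAN.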

In [None]:
from sklearn.cluster import DBSCAN

def run_dbscan(coords_rad, eps_meters, min_samples):
    """Cluster coordinates with DBSCAN using the haversine metric.

    Args:
        coords_rad: Coordinates handed to DBSCAN; expected in radians, as the
            name indicates.
        eps_meters: Neighbourhood radius in metres. It is divided by the Earth
            radius (6 371 000 m) to obtain the radius in radians.
        min_samples: Forwarded unchanged to ``DBSCAN``.

    Returns:
        The label array returned by ``DBSCAN.fit_predict``.
    """
    earth_radius_m = 6371000  # Earth radius, ~6,371 km
    model = DBSCAN(
        eps=eps_meters / earth_radius_m,
        min_samples=min_samples,
        metric="haversine",
    )
    return model.fit_predict(coords_rad)

In [None]:
import numpy as np
from sklearn.metrics import silhouette_score

def dbscan_stats(coords_rad, labels, sample_size=None, random_state=0):
    """Summarise a DBSCAN labelling: cluster count, noise ratio, silhouette.

    The silhouette is computed with the haversine metric, i.e. the same metric
    run_dbscan uses for clustering, and only on points that are not labelled
    as noise (-1).

    Args:
        coords_rad: Array of (lat, long) coordinates in radians, one row per
            label.
        labels: Cluster labels; -1 marks noise. Any array-like is accepted.
        sample_size: Optional number of points the silhouette is estimated on
            (forwarded to ``silhouette_score``). ``None`` uses every non-noise
            point, which requires all pairwise distances.
        random_state: Seed used when ``sample_size`` is set.

    Returns:
        Dict with keys ``clusters`` (int), ``noise_ratio`` (float in [0, 1])
        and ``silhouette`` (float, or ``None`` when it is undefined).
    """
    labels = np.asarray(labels)
    n_points = len(labels)

    # Empty input: nothing to count, and a ratio would divide by zero.
    if n_points == 0:
        return {"clusters": 0, "noise_ratio": 0.0, "silhouette": None}

    is_noise = labels == -1
    n_noise = int(is_noise.sum())
    n_clusters = len(np.unique(labels[~is_noise]))

    # The silhouette needs at least two clusters and fewer clusters than
    # points; otherwise silhouette_score raises, so it is left undefined.
    sil = None
    n_kept = n_points - n_noise
    if n_clusters > 1 and n_kept > n_clusters:
        kept = ~is_noise
        sil = float(
            silhouette_score(
                np.asarray(coords_rad)[kept],
                labels[kept],
                metric="haversine",
                sample_size=sample_size,
                random_state=random_state,
            )
        )

    return {
        "clusters": n_clusters,
        "noise_ratio": n_noise / n_points,
        "silhouette": sil,
    }

In [None]:
# Parameter grid; eps is expressed in metres.
eps_values = [50, 150]
min_samples_values = [30]

# Ideas for later:
# - the elbow method could also be used to choose eps
# - run DBSCAN inside DBSCAN to get an adaptive epsilon
#   (smaller in the centre of Lyon)

results = []

for eps in eps_values:
    for ms in min_samples_values:
        labels = run_dbscan(coords_rad, eps, ms)
        stats = dbscan_stats(coords_rad, labels)

        # One record per parameter pair: parameters, statistics, raw labels.
        entry = {"eps_m": eps, "min_samples": ms}
        entry.update(stats)
        entry["labels"] = labels
        results.append(entry)

Visualisation des résultats

In [None]:
import folium

def plot_dbscan_map(df, labels, title, sample_size=20000):
    """Build a Folium map of points coloured by DBSCAN label.

    Args:
        df: DataFrame with ``lat`` and ``long`` columns.
        labels: One cluster label per row of ``df``; negative labels (-1) are
            drawn in light gray as noise.
        title: Text shown as a heading above the map. Empty or ``None``
            adds no heading.
        sample_size: Maximum number of points drawn; larger inputs are
            randomly sampled with a fixed seed.

    Returns:
        The ``folium.Map`` with one circle marker per drawn point.
    """
    import html  # local import: only needed to escape the title

    # Copy only the columns that are drawn instead of the whole frame.
    points = df[["lat", "long"]].copy()
    points["cluster"] = labels

    # Sampling keeps the map responsive on large datasets.
    if len(points) > sample_size:
        points = points.sample(sample_size, random_state=0)

    m = folium.Map(
        location=[points["lat"].median(), points["long"].median()],
        zoom_start=12,
        tiles="CartoDB positron"
    )

    # Render the title in the map's HTML so the parameters are visible on it.
    if title:
        heading = (
            '<h4 style="text-align:center;margin:6px 0">'
            f"{html.escape(str(title))}</h4>"
        )
        m.get_root().html.add_child(folium.Element(heading))

    palette = [
        "red","blue","green","purple","orange","darkred","cadetblue",
        "darkgreen","darkpurple","pink","gray","black"
    ]

    # Iterate over the columns directly: row-wise iteration may upcast the
    # label to float, which cannot be used as a list index.
    for lat, lon, cluster in zip(points["lat"], points["long"], points["cluster"]):
        cluster = int(cluster)
        if cluster < 0:
            color = "lightgray"
        else:
            color = palette[cluster % len(palette)]

        folium.CircleMarker(
            location=[lat, lon],
            radius=2,
            color=color,
            fill=True,
            fill_opacity=0.6,
        ).add_to(m)

    return m

In [None]:
# Print a one-line summary and display the map for every parameter pair.
for r in results:
    eps_m, min_samples = r["eps_m"], r["min_samples"]
    print(
        f"eps={eps_m}m | min_samples={min_samples} | "
        f"clusters={r['clusters']} | noise={r['noise_ratio']:.2f}"
    )
    dbscan_map = plot_dbscan_map(
        df,
        r["labels"],
        title=f"eps={eps_m} / min_samples={min_samples}",
    )
    display(dbscan_map)

In [None]:
# Display the DataFrame as the cell output.
df

## 2.3 HDBSCAN

In [None]:
import pandas as pd
import numpy as np

# Reload the dataset into a fresh frame and drop rows without coordinates.
df = (
    pd.read_parquet("flickr_data_cleaned.parquet")
    .dropna(subset=["lat", "long"])
    .copy()
)

# Raw (lat, long) pairs as a NumPy array.
X = df.loc[:, ["lat", "long"]].to_numpy()

In [None]:
import numpy as np
import pandas as pd

# Near-duplicate removal: keep a single row per
# (user, time bin, latitude cell, longitude cell).
# df is the cleaned dataframe; it must have: user, lat, long,
# taken_dt (datetime). The `id` column is also used below as a sort tie-breaker.
d = df.dropna(subset=["user", "lat", "long", "taken_dt"]).copy()

# ---- PARAMETERS (tune these)
TIME_BIN_MIN = 2      # minutes
GRID_M = 25           # meters (spatial snapping)

# ---- TIME BIN
# Floor every timestamp to the start of its TIME_BIN_MIN-minute bin.
d["taken_bin"] = d["taken_dt"].dt.floor(f"{TIME_BIN_MIN}min")

# Degrees per metre, using 111 320 m per degree. The longitude factor is
# scaled by cos(latitude) and is therefore computed per row.
deg_per_m_lat = 1 / 111_320
deg_per_m_lon = 1 / (111_320 * np.cos(np.deg2rad(d["lat"].to_numpy())))

# Index of the GRID_M-sized latitude cell each row rounds to.
d["lat_cell"] = np.round(d["lat"] / (GRID_M * deg_per_m_lat)).astype("int64")

# Longitude step in degrees for a GRID_M cell; one value per row, since it
# depends on that row's latitude.
lon_step = GRID_M * deg_per_m_lon
d["lon_cell"] = np.round(d["long"] / lon_step).astype("int64")

# ---- DEDUPE KEY
key_cols = ["user", "taken_bin", "lat_cell", "lon_cell"]

# Sort by key, then by id, so that keep="first" retains the row with the
# smallest id inside each key group.
d_sorted = d.sort_values(key_cols + ["id"])
df_dedup = d_sorted.drop_duplicates(subset=key_cols, keep="first").copy()

# NOTE(review): "Before" counts the rows of df, not d, so "Removed" also
# includes rows discarded by the dropna above — confirm this is intended.
print("Before:", len(df), " | After:", len(df_dedup), " | Removed:", len(df) - len(df_dedup))

In [None]:
import geopandas as gpd

# Build point geometries from (long, lat), declared as EPSG:4326, then
# reproject them to EPSG:3857.
gdf = gpd.GeoDataFrame(df_dedup, geometry=gpd.points_from_xy(df_dedup["long"], df_dedup["lat"]), crs="EPSG:4326")
# NOTE(review): EPSG:3857 (Web Mercator) coordinates are in metres, but
# distances are presumably stretched at this latitude; if true ground
# distances matter, a local projected CRS may be preferable — confirm.
gdf = gdf.to_crs("EPSG:3857")  # metres

# Two-column array (x, y) of the projected coordinates.
X_m = np.column_stack([gdf.geometry.x.to_numpy(), gdf.geometry.y.to_numpy()])

In [None]:
import hdbscan

# Fit HDBSCAN on the projected coordinates.
clusterer = hdbscan.HDBSCAN(min_cluster_size=100, min_samples=None)
clusterer.fit(X_m)
labels = clusterer.labels_

# Store the label of every deduplicated row, then show the ten most
# frequent labels with their counts.
df_dedup["cluster_hdbscan"] = labels
df_dedup["cluster_hdbscan"].value_counts().head(10)

In [None]:
import folium

# Draw at most 30 000 randomly chosen rows to keep the map responsive.
sample = df_dedup.sample(n=min(30000, len(df_dedup)), random_state=0)

# Centre the view on the median position of the deduplicated data.
m = folium.Map(
    location=[df_dedup["lat"].median(), df_dedup["long"].median()],
    zoom_start=12,
    tiles="CartoDB positron"
)

# Colour reserved for noise points (label -1).
NOISE_COLOR = "lightgray"

# Folium colours used for clusters, reused cyclically. "lightgray" is left
# out on purpose: it is the noise colour, and keeping it in the palette made
# clusters 17, 35, ... indistinguishable from noise.
palette = [
    "red", "blue", "green", "purple", "orange",
    "darkred", "lightred", "beige", "darkblue",
    "darkgreen", "cadetblue", "darkpurple",
    "pink", "lightblue", "lightgreen",
    "gray", "black"
]

for _, r in sample.iterrows():
    # int() guarantees a valid list index whatever dtype the row carries.
    cluster = int(r["cluster_hdbscan"])

    if cluster == -1:
        color = NOISE_COLOR   # noise
    else:
        color = palette[cluster % len(palette)]

    folium.CircleMarker(
        location=[r["lat"], r["long"]],
        radius=2,
        color=color,
        fill=True,
        fill_opacity=0.6,
        popup=folium.Popup(
            f"""<a href="{r["url"]}" target="_blank">Open Flickr</a>""",
            max_width=250
        )
    ).add_to(m)

m