# Clustering

## 1. K-means

On suit la même démarche que celle utilisée en TP.

In [None]:
import pandas as pd

# Load the cleaned Flickr records and drop rows without coordinates.
df = (
    pd.read_parquet("flickr_data_cleaned.parquet")
    .dropna(subset=["lat", "long"])
    .copy()
)

# K-means only sees the two coordinate columns.
df_clustering = df.loc[:, ["lat", "long"]].copy()

df_clustering.head()

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise both coordinates; the fitted scaler is kept so that cluster
# centres can later be mapped back with inverse_transform.
scaler = StandardScaler().fit(df_clustering)
scaled_data = scaler.transform(df_clustering)

# Re-wrap the array so that it keeps the original column names and row index.
scaled_data_df = pd.DataFrame(
    data=scaled_data,
    index=df_clustering.index,
    columns=df_clustering.columns,
)
scaled_data_df.head()

In [None]:
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

# Elbow method: fit K-means for k = 1..14 and record the inertia of each fit.
k_values = range(1, 15)
inertia_values = []

for k in k_values:
    kmeans = KMeans(n_clusters=k, init="k-means++", random_state=0).fit(scaled_data_df)
    inertia_values.append(kmeans.inertia_)

# Inertia as a function of k; the "elbow" of the curve suggests a value of k.
fig, ax = plt.subplots()
ax.plot(list(k_values), inertia_values, marker="o")
ax.set_title("Elbow Method")
ax.set_xlabel("k")
ax.set_ylabel("Inertia")
plt.show()

In [None]:
# k = 5 is the value chosen with the elbow method.
k = 5

kmeans = KMeans(n_clusters=k, init="k-means++", random_state=0).fit(scaled_data_df)

# labels_ follows the row order of scaled_data_df, which was built with the
# same index as df, so a positional assignment is correct here.
df["cluster_kmeans"] = kmeans.labels_
df["cluster_kmeans"].value_counts().head()

In [None]:
# Cluster centres live in standardised space; undo the scaling to get them
# back in the units of the original lat/long columns.
centers_scaled = kmeans.cluster_centers_
centers = scaler.inverse_transform(centers_scaled)

# One column per coordinate, one row per cluster.
centers_df = pd.DataFrame(dict(zip(["lat", "long"], centers.T)))
centers_df.head()

In [None]:
import folium
from html import escape

# Only a random subset is drawn so the map stays responsive in the browser.
sample = df.sample(n=min(30000, len(df)), random_state=0)

m = folium.Map(
    location=[df["lat"].median(), df["long"].median()],
    zoom_start=12,
    tiles="CartoDB positron",
)

# CircleMarker colours are passed to the browser as CSS colours, so every
# entry must be a valid CSS colour name. folium.Icon-only names such as
# "lightred" / "darkpurple" are not, and near-white shades cannot be seen on
# the light basemap.
palette = [
    "red", "blue", "green", "purple", "orange", "darkred", "salmon", "goldenrod", "darkblue", "darkgreen",
    "cadetblue", "indigo", "teal", "deeppink", "dodgerblue", "limegreen", "gray", "black", "saddlebrown",
]

# The "url" column is optional; without it the markers simply get no popup.
urls = sample["url"] if "url" in sample.columns else [None] * len(sample)

# Iterating over the columns directly is much faster than DataFrame.iterrows.
for lat, lon, cluster, url in zip(sample["lat"], sample["long"], sample["cluster_kmeans"], urls):
    popup = None
    if isinstance(url, str) and url:
        # The URL comes from external data: escape it before embedding in HTML.
        href = escape(url, quote=True)
        popup = folium.Popup(f"""<a href="{href}" target="_blank">Open Flickr</a>""", max_width=250)

    folium.CircleMarker(
        location=[lat, lon],
        radius=2,
        color=palette[int(cluster) % len(palette)],
        fill=True,
        fill_opacity=0.7,
        popup=popup,
    ).add_to(m)

# One star marker per cluster centre.
for i, row in centers_df.iterrows():
    folium.Marker(
        location=[row["lat"], row["long"]],
        icon=folium.Icon(color="darkblue", icon="star"),
        popup=f"Center cluster {i}",
    ).add_to(m)
m

En réalité, Lyon compte bien plus de 5 zones d'intérêt : le k obtenu avec la méthode du coude est donc à remettre en question.


## 2. DBSCAN

Chargement des données

In [None]:
import pandas as pd

# Start again from the cleaned file, keeping only geolocated rows.
df = (
    pd.read_parquet("flickr_data_cleaned.parquet")
    .dropna(subset=["lat", "long"])
    .copy()
)

Préparation : conversion des coordonnées (degrés) en radians, format attendu par la métrique haversine

In [None]:
import numpy as np

# (n, 2) array in [lat, long] column order, then converted to radians for
# use with the haversine metric.
coords = df.loc[:, ["lat", "long"]].to_numpy()
coords_rad = np.deg2rad(coords)

Fonction DBSCAN

In [None]:
from sklearn.cluster import DBSCAN

def run_dbscan(coords_rad, eps_meters, min_samples, earth_radius_m=6_371_000.0, n_jobs=None):
    """Cluster geographic points with DBSCAN using the haversine metric.

    Parameters
    ----------
    coords_rad : array-like of shape (n_samples, 2)
        Coordinates in radians, in [lat, long] column order.
    eps_meters : float
        Neighbourhood radius expressed in metres on the ground.
    min_samples : int
        Forwarded to ``DBSCAN(min_samples=...)``.
    earth_radius_m : float, default 6_371_000.0
        Sphere radius used to convert ``eps_meters`` into an angle
        (mean Earth radius, about 6 371 km).
    n_jobs : int or None, default None
        Forwarded to ``DBSCAN(n_jobs=...)``.

    Returns
    -------
    numpy.ndarray
        The result of ``DBSCAN.fit_predict``: one label per point.
    """
    # An arc of length d on a sphere of radius R subtends d / R radians.
    eps_rad = eps_meters / earth_radius_m

    db = DBSCAN(
        eps=eps_rad,
        min_samples=min_samples,
        metric="haversine",
        n_jobs=n_jobs,
    )
    return db.fit_predict(coords_rad)

In [None]:
import numpy as np
from sklearn.metrics import silhouette_score

def dbscan_stats(coords_rad, labels, sample_size=None, random_state=0):
    """Summarise a DBSCAN labelling: cluster count, noise ratio, silhouette.

    Parameters
    ----------
    coords_rad : array-like of shape (n_samples, 2)
        Coordinates in radians, in [lat, long] column order.
    labels : array-like of shape (n_samples,)
        Cluster label of each point; ``-1`` marks noise.
    sample_size : int or None, default None
        Forwarded to ``silhouette_score``. ``None`` scores every non-noise
        point, which costs O(n^2) distance computations; pass an integer to
        score a random subset instead.
    random_state : int, default 0
        Seed for the subsampling; unused when ``sample_size`` is None.

    Returns
    -------
    dict
        ``clusters``: number of clusters, noise excluded.
        ``noise_ratio``: share of points labelled -1 (0.0 for empty input).
        ``silhouette``: silhouette score over non-noise points, or None when
        it is undefined.
    """
    labels = np.asarray(labels)
    n_points = labels.size

    # Noise points belong to no cluster and are left out of the silhouette.
    mask = labels != -1
    n_kept = int(mask.sum())
    n_clusters = int(np.unique(labels[mask]).size)

    sil = None
    # The silhouette is only defined for 2 <= n_clusters <= n_points - 1.
    if 1 < n_clusters < n_kept:
        sil = silhouette_score(
            np.asarray(coords_rad)[mask],
            labels[mask],
            # Same metric as the clustering itself: euclidean distance on
            # radian lat/long would distort east-west distances.
            metric="haversine",
            sample_size=sample_size,
            random_state=random_state,
        )

    return {
        "clusters": n_clusters,
        "noise_ratio": (n_points - n_kept) / n_points if n_points else 0.0,
        "silhouette": sil,
    }

In [None]:
# Parameter grid. eps is expressed in metres.
# Possible refinement: pick eps with an elbow method as well.
# Idea: run a DBSCAN inside the DBSCAN clusters to get an adaptive epsilon
# (smaller in the centre of Lyon).
eps_values = [50, 150]
min_samples_values = [30]

# One dict per (eps, min_samples) pair: parameters, summary statistics and the
# label array itself, which is reused for the maps.
results = []

for eps in eps_values:
    for ms in min_samples_values:
        labels = run_dbscan(coords_rad, eps, ms)
        stats = dbscan_stats(coords_rad, labels)

        entry = {"eps_m": eps, "min_samples": ms}
        entry.update(stats)
        entry["labels"] = labels
        results.append(entry)

Visualisation des résultats

In [None]:
import folium

def plot_dbscan_map(df, labels, title, sample_size=20000):
    """Draw DBSCAN cluster assignments on an interactive folium map.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain "lat" and "long" columns, used as marker locations.
    labels : array-like
        One cluster label per row of ``df``; ``-1`` marks noise and is drawn
        in light gray.
    title : str
        Caption shown above the map (HTML-escaped). Falsy values add none.
    sample_size : int, default 20000
        Maximum number of points drawn; larger inputs are randomly
        subsampled with a fixed seed.

    Returns
    -------
    folium.Map
        The populated map.
    """
    from html import escape  # local import: keeps the function self-contained

    # Only the columns that are plotted are copied, not the whole frame.
    points = df[["lat", "long"]].copy()
    points["cluster"] = labels

    # Subsample so the map stays responsive in the browser.
    if len(points) > sample_size:
        points = points.sample(sample_size, random_state=0)

    m = folium.Map(
        location=[points["lat"].median(), points["long"].median()],
        zoom_start=12,
        tiles="CartoDB positron",
    )

    if title:
        caption = f'<h4 style="text-align:center;margin:6px 0">{escape(str(title))}</h4>'
        m.get_root().html.add_child(folium.Element(caption))

    # CircleMarker colours are passed to the browser as CSS colours, so every
    # entry must be a valid CSS colour name ("darkpurple" is a folium.Icon
    # colour, not a CSS one).
    palette = [
        "red", "blue", "green", "purple", "orange", "darkred", "cadetblue",
        "darkgreen", "indigo", "pink", "gray", "black",
    ]
    noise_color = "lightgray"

    # Iterating over the columns directly is much faster than iterrows.
    for lat, lon, cluster in zip(points["lat"], points["long"], points["cluster"]):
        cluster = int(cluster)  # list indexing needs an integer label
        color = noise_color if cluster == -1 else palette[cluster % len(palette)]

        folium.CircleMarker(
            location=[lat, lon],
            radius=2,
            color=color,
            fill=True,
            fill_opacity=0.6,
        ).add_to(m)

    return m

In [None]:
# For every parameter combination: a one-line summary followed by its map.
for r in results:
    summary = (
        f"eps={r['eps_m']}m | min_samples={r['min_samples']} | "
        f"clusters={r['clusters']} | noise={r['noise_ratio']:.2f}"
    )
    print(summary)

    map_title = f"eps={r['eps_m']} / min_samples={r['min_samples']}"
    display(plot_dbscan_map(df, r["labels"], title=map_title))