## Text Mining 

Objectif : Fonction qui renvoie, pour chaque numéro de cluster, le mot qui décrit le mieux le cluster

Input : dataframe 
Output : cluster avec le title 

In [None]:
import pandas as pd
# Load the clustered photo records; later cells read the 'title', 'tags' and
# 'cluster_hdbscan' columns of this frame.
df = pd.read_parquet("flickr_data_clustered.parquet")

On commence par supprimer les stopwords

In [None]:
#fonction
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import re


# Extra stopwords added on top of scikit-learn's English list: French function
# words plus terms too generic to describe a cluster in this dataset
# (location names, photo/travel vocabulary, upload artefacts).
# Every entry is lowercase and free of punctuation: clean_text() lowercases the
# text and replaces non-letters with spaces before the stopword filter runs, so
# capitalised or bracketed entries such as "Demeure" or "[Lyon" could never match.
french_stopwords = {
    "le", "la", "les", "de", "des", "du", "un", "une", "et", "à", "au", "aux",
    "en", "pour", "sur", "avec", "sans", "ce", "cette", "ces", "dans",
    "est", "sont", "été", "être", "avoir",
    "lyon", "lyons", "demeure", "chaos", "france", "europe", "instagram", "img",
    "uploaded", "photo", "photographie", "photographies", "image", "images",
    "paysage", "nature", "ville", "village", "monde", "voyage", "travel", "traveling",
    "tourisme", "tourist", "ddc", "art", "artistique", "culture", "historique",
    "histoire", "architecture", "batiment", "bâtiment", "monument", "ruine", "ruines",
    "iphone", "place"
}

# Combined (English + custom) stopword set used by clean_stopwords().
stopwords = ENGLISH_STOP_WORDS.union(french_stopwords)

# D'abord créer les colonnes cleaned_title et cleaned_tags
def clean_text(text):
    """
    Normalise a raw title/tag string before tokenisation.

    The text is lowercased, then every character that is not an ASCII letter,
    one of the accented letters listed in the pattern below, or whitespace is
    replaced by a single space. Digits and punctuation are therefore removed;
    runs of whitespace are left untouched.

    Parameters
    ----------
    text : any
        Value to clean. Anything that is not a ``str`` (e.g. None or NaN) is
        treated as missing.

    Returns
    -------
    str
        The cleaned text, or "" for non-string input.
    """
    if not isinstance(text, str):
        return ""
    text = text.lower()
    # The character class must hold the real accented letters: a mis-encoded
    # (mojibake) class strips "é", "à", ... and cuts French words apart.
    text = re.sub(r'[^a-zàâçéèêëîïôûùüÿñæœ\s]', ' ', text)
    return text

# Derive one cleaned_<column> text column from each raw text column
for raw_col in ("title", "tags"):
    df[f"cleaned_{raw_col}"] = df[raw_col].apply(clean_text)

# Ensuite supprimer les stopwords
def clean_stopwords(text, stop_words=None, min_length=3):
    """
    Tokenise an already-cleaned text and drop stopwords and very short tokens.

    Parameters
    ----------
    text : any
        Text to split on whitespace. Anything that is not a ``str`` is treated
        as missing.
    stop_words : collection of str, optional
        Words to discard. Defaults to the module-level ``stopwords`` set.
    min_length : int, optional
        Minimum number of characters a token needs to be kept. The default of 3
        drops tokens of one or two characters.

    Returns
    -------
    list of str
        The remaining tokens in their original order. Always a list: it is
        empty for non-string input.
    """
    if not isinstance(text, str):
        # An empty list (not "") keeps the resulting column homogeneous.
        return []
    if stop_words is None:
        stop_words = stopwords
    return [w for w in text.split() if len(w) >= min_length and w not in stop_words]
    

# Tokenise both cleaned text columns and drop stopwords / short words
# (each resulting cell holds the list of remaining words)
for cleaned_col in ("cleaned_title", "cleaned_tags"):
    df[f"{cleaned_col}_stopwords"] = df[cleaned_col].apply(clean_stopwords)

In [None]:
# Inspect the frame after the cleaned_* and *_stopwords columns were added
df

Ensuite, on cherche le mot qui correspond à chaque cluster
df : le dataframe 
cluster_hdbscan : le nom de la colonne qui contient le numéro de cluster auquel appartient la ligne
text_cols : les colonnes contenant les textes utilisés pour identifier les mots les plus fréquents 
top_k : nombre de mots à garder = 1 puisqu'on cherche un seul mot par cluster

In [None]:
# NOTE(review): same preview as the previous display cell; nothing modifies df in between.
df

In [None]:
from collections import Counter

def cluster_titles(
    df,
    cluster_hdbscan="cluster_hdbscan",
    text_cols=("cleaned_title_stopwords", "cleaned_tags_stopwords"),
    top_k=1
):
    """
    Label every cluster with its most frequent word(s).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the cluster ids and the tokenised text columns.
    cluster_hdbscan : str
        Name of the column holding the cluster id of each row. Rows whose id
        is -1 are noise and get no entry in the result.
    text_cols : iterable of str
        Columns whose cells are lists of words. Null cells are skipped; a cell
        that is a plain string is split on whitespace instead of being counted
        character by character.
    top_k : int
        Number of most frequent words that make up the label.

    Returns
    -------
    dict
        ``{cluster_id: label}``. With ``top_k=1`` the label is the single most
        frequent word; with a larger ``top_k`` it is the ``top_k`` most frequent
        words joined by ", ". Clusters without any word get "unknown".
    """
    cluster_labels = {}

    for cluster_id in sorted(df[cluster_hdbscan].unique()):
        if cluster_id == -1:
            continue  # noise points do not form a cluster

        # Rows belonging to the current cluster only
        rows = df[df[cluster_hdbscan] == cluster_id]

        # Word frequencies over every text column of the cluster
        counts = Counter()
        for col in text_cols:
            for words in rows[col].dropna():
                if isinstance(words, str):
                    words = words.split()
                counts.update(words)

        top_words = [w for w, _ in counts.most_common(top_k)]

        if not top_words:
            label = "unknown"
        elif len(top_words) == 1:
            # Single keyword: return the word itself
            label = top_words[0]
        else:
            # Honour top_k > 1 instead of silently keeping only the first word
            label = ", ".join(top_words)
        cluster_labels[cluster_id] = label

    return cluster_labels

# Look up each row's cluster id in the {cluster_id: keyword} mapping;
# ids without an entry (e.g. the noise label -1) end up as "unknown".
cluster_keywords = cluster_titles(df, top_k=1)
df['cluster_name'] = df['cluster_hdbscan'].map(cluster_keywords).fillna("unknown")

In [None]:
# Inspect the frame now that the 'cluster_name' column has been added
df 

In [None]:
# Sanity-check the labels produced by cluster_titles()
print("Premiers cluster_name:")
print(df[['cluster_hdbscan', 'cluster_name']].head(20))
print("\nTypes:")
print(df['cluster_name'].dtype)
# Output string repaired: the accented letter was mis-encoded
print("\nValeurs uniques (10 premières):")
print(df['cluster_name'].unique()[:10])

In [None]:
# Summary table: one row per cluster (noise label -1 excluded) with its keyword
# and the number of rows having a non-null 'id', largest clusters first
cluster_summary = (
    df[df['cluster_hdbscan'] != -1]
    .groupby('cluster_hdbscan')
    .agg(cluster_name=('cluster_name', 'first'), nb_photos=('id', 'count'))
    .sort_values('nb_photos', ascending=False)
)

# Output string repaired: emoji and accented letter were mis-encoded
print(f"📋 Liste complète des {len(cluster_summary)} clusters avec leurs titres:\n")
print(cluster_summary.to_string())

# Rich display of the same table
cluster_summary

In [None]:
# HDBSCAN statistics: number of clusters and share of noise points (label -1)
is_noise = df['cluster_hdbscan'] == -1
n_clusters = df.loc[~is_noise, 'cluster_hdbscan'].nunique(dropna=False)
n_bruit = int(is_noise.sum())
n_total = len(df)

# Guard the percentages: an empty dataframe would raise ZeroDivisionError
pct_bruit = n_bruit / n_total * 100 if n_total else 0.0
pct_clusters = (n_total - n_bruit) / n_total * 100 if n_total else 0.0

# Output strings repaired: emoji and bullet characters were mis-encoded
print("📊 Statistiques HDBSCAN:")
print(f"  • Nombre de clusters: {n_clusters}")
print(f"  • Points de bruit (-1): {n_bruit} ({pct_bruit:.1f}%)")
print(f"  • Points dans des clusters: {n_total - n_bruit} ({pct_clusters:.1f}%)")
print("\nTaille des 10 plus gros clusters:")
print(df.loc[~is_noise, 'cluster_hdbscan'].value_counts().head(10))

In [None]:
import html

import folium

# Plot at most 30 000 rows (fixed seed, so the same sample is drawn on re-run)
sample = df.sample(n=min(30000, len(df)), random_state=0)

m = folium.Map(
    location=[df["lat"].median(), df["long"].median()],
    zoom_start=12,
    tiles="CartoDB positron"
)

# CircleMarker colours are handed to the browser as CSS colours, so every entry
# must be a valid CSS colour name: "lightred" and "darkpurple" are folium.Icon
# names, not CSS colours, and are replaced by "lightcoral" and "indigo".
# "lightgray" is reserved for noise and is therefore not part of the palette,
# otherwise one cluster colour would be indistinguishable from noise.
NOISE_COLOR = "lightgray"
palette = ["red", "blue", "green", "purple", "orange", "darkred", "lightcoral",
           "beige", "darkblue", "darkgreen", "cadetblue", "indigo",
           "pink", "lightblue", "lightgreen", "gray", "black"]

marker_cols = ["lat", "long", "cluster_hdbscan", "cluster_name", "url"]
for row in sample[marker_cols].itertuples(index=False):
    if row.cluster_hdbscan == -1:
        color = NOISE_COLOR
    else:
        # int() so a float-typed cluster column can still index the palette
        color = palette[int(row.cluster_hdbscan) % len(palette)]

    # Escape dataset values before embedding them in the popup HTML
    keyword = html.escape(str(row.cluster_name))
    url = html.escape(str(row.url), quote=True)

    folium.CircleMarker(
        location=[row.lat, row.long],
        radius=2,
        color=color,
        fill=True,
        fill_opacity=0.6,
        popup=folium.Popup(
            f'<b>Keyword:</b> {keyword}<br/>'
            f'<a href="{url}" target="_blank">Open Flickr</a>',
            max_width=250
        )
    ).add_to(m)

m