In [None]:
# Import python packages
import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# We can also use Snowpark for our analyses!
from snowflake.snowpark import Session

# Snowflake connection settings.
# Credentials are read from the environment so that no secret is stored in the
# notebook (and therefore none ends up in version control or shared copies).
# Non-secret settings keep their previous values as defaults.
import os

_missing_env = [
    name for name in ("SNOWFLAKE_USER", "SNOWFLAKE_PASSWORD") if not os.environ.get(name)
]
if _missing_env:
    raise RuntimeError(
        "Missing Snowflake credentials; set the environment variable(s): "
        + ", ".join(_missing_env)
    )

connection_parameters = {
    "account": os.environ.get("SNOWFLAKE_ACCOUNT", "RSYIXFD-HT53341"),   # e.g. "xy12345.eu-central-1"
    "user": os.environ["SNOWFLAKE_USER"],
    "password": os.environ["SNOWFLAKE_PASSWORD"],
    "warehouse": os.environ.get("SNOWFLAKE_WAREHOUSE", "COMPUTE_WH"),
    "database": os.environ.get("SNOWFLAKE_DATABASE", "RAW_DATA"),
    "schema": os.environ.get("SNOWFLAKE_SCHEMA", "AMAZING_DATA"),
}

# Create the Snowpark session used by every following cell.
session = Session.builder.configs(connection_parameters).create()

In [None]:
# Reference the CLIENT_EVENTS_CLEAN table through the Snowpark session;
# `table` is what the next cell previews.
table = session.table("PROCESSED_DATA.AMAZING_DATA.CLIENT_EVENTS_CLEAN")

In [None]:
# Print a preview of the table to check that the connection and the table name are valid.
table.show()

In [None]:
from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import PCA

from sklearn.metrics import pairwise_distances
import numpy as np

# Fetch the sample used to fit the preprocessing: the first 100,000 rows
# returned by the query.
# NOTE(review): limit() without an ordering returns whichever rows come back
# first, which is not guaranteed to be representative - confirm this is acceptable.
sample_df = session.table("PROCESSED_DATA.AMAZING_DATA.CLIENT_EVENTS_CLEAN").limit(100000).to_pandas()

# Replace the four TOTAL_* columns by log(1 + x), in place.
log_columns = ["TOTAL_SPENT", "TOTAL_VIEWS", "TOTAL_CART", "TOTAL_PURCHASE"]
for col in log_columns:
    sample_df[col] = np.log1p(sample_df[col])

# Fit the scaler on the sample; fit_transform already returns the scaled
# sample, so it is not transformed a second time.
scaler = RobustScaler()
sample_scaled = scaler.fit_transform(sample_df)

# Fit a 3-component PCA on the scaled sample. random_state pins the result
# when a randomized SVD solver is selected, so reruns give the same projection.
pca = PCA(n_components=3, random_state=42)
pca.fit(sample_scaled)

# Sample projected with the same transform() call that later batches go through.
sample_pca = pca.transform(sample_scaled)

In [None]:
# Preview the first rows of the sample. The four TOTAL_* columns shown here
# have already been log1p-transformed by the previous cell.
sample_df.head()

In [None]:
import pandas as pd
from sklearn.cluster import Birch

# --- Parameters ---
batch_size = 200_000   # rows fetched per batch
max_rows = 5_000_000   # upper bound on the number of rows processed in this test run

# --- BIRCH model ---
birch = Birch(
    n_clusters=None,    # no global clustering step: BIRCH only builds its sub-clusters
    threshold=0.88,     # maximum sub-cluster radius (to be tuned)
    branching_factor=50
)

offset = 0
processed_rows = 0
batch_number = 0

# NOTE(review): limit/offset pagination without an explicit ordering relies on
# the table being returned in a stable order between queries - confirm, or add
# a sort on a unique key.
while processed_rows < max_rows:
    # Never request more rows than the remaining max_rows budget allows.
    rows_wanted = min(batch_size, max_rows - processed_rows)
    chunk = session.table("PROCESSED_DATA.AMAZING_DATA.CLIENT_EVENTS_CLEAN") \
                  .limit(rows_wanted, offset=offset).to_pandas()
    if chunk.empty:
        break

    # Same log(1 + x) transform as the one applied to the sample.
    for col in ("TOTAL_SPENT", "TOTAL_VIEWS", "TOTAL_CART", "TOTAL_PURCHASE"):
        chunk[col] = np.log1p(chunk[col])

    # Scale with the scaler already fitted on the sample.
    chunk_scaled = scaler.transform(chunk)

    # Project with the PCA already fitted on the sample.
    chunk_pca = pca.transform(chunk_scaled)

    # Incrementally update the BIRCH tree with this batch.
    birch.partial_fit(chunk_pca)

    processed_rows += len(chunk)
    batch_number += 1
    print(f"Batch {batch_number} traité ({processed_rows} lignes)")
    offset += len(chunk)

    # A short batch means the table is exhausted: skip the extra empty query.
    if len(chunk) < rows_wanted:
        break

print("Entraînement terminé sur", processed_rows, "lignes")

if processed_rows == 0:
    raise RuntimeError("No rows were read from the table; the BIRCH model was not fitted.")

# --- Results ---
# After partial_fit, labels_ only covers the last batch, so the number of
# clusters is taken from the sub-cluster centres, which cover every batch.
labels = birch.labels_
print("Nombre de clusters trouvés :", len(birch.subcluster_centers_))


In [None]:
from sklearn.cluster import KMeans

# Second stage: group the BIRCH sub-cluster centres into 5 final clusters.
micro_centers = birch.subcluster_centers_
km = KMeans(n_clusters=5, random_state=42, n_init="auto").fit(micro_centers)
print("Nombre de micro-clusters créés :", len(micro_centers))

# Final labels for the sample: assign each projected sample point to a KMeans cluster.
labels_sample = km.predict(sample_pca)
print("Clusters finaux présents:", np.unique(labels_sample))

In [None]:
# Final cluster label of each sampled row: KMeans assignment of its PCA coordinates.
# This is the same call that produced `labels_sample` in the previous cell,
# stored under the name the following cells use.
labels_final = km.predict(sample_pca)

In [None]:
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

# --- Step 3: quality scores of the final clusters ---
# Slice the points and labels once and reuse them for the three metrics.
# NOTE(review): silhouette compares points pairwise, so it can be slow at this
# size; silhouette_score appears to accept sample_size/random_state - confirm
# before switching to a subsample.
eval_points = sample_pca[:100000]
eval_labels = labels_final[:100000]

sil_score = silhouette_score(eval_points, eval_labels)
ch_score = calinski_harabasz_score(eval_points, eval_labels)
db_score = davies_bouldin_score(eval_points, eval_labels)

for score_name, score_value in (
    ("Silhouette Score    ", sil_score),
    ("Calinski-Harabasz   ", ch_score),
    ("Davies-Bouldin      ", db_score),
):
    print(f"{score_name}: {score_value:.3f}")

In [None]:
import pandas as pd
import numpy as np

# Sanity check: labels_final must have one label per row of sample_df.
print("Taille sample_df :", len(sample_df))
print("Taille labels_final :", len(labels_final))

# Attach the labels to a copy of the sample; the index is reset so that the
# positional labels line up with the rows.
df_with_clusters = sample_df.copy()
df_with_clusters = df_with_clusters.reset_index(drop=True)
df_with_clusters["cluster"] = labels_final

# Numeric feature columns, excluding the label column itself so that it is
# not reported as a feature. (The previous check was inverted: it never
# removed "cluster" when present and raised ValueError when it was absent.)
numeric_cols = [
    col
    for col in df_with_clusters.select_dtypes(include=np.number).columns
    if col != "cluster"
]

# Per-cluster mean of every numeric feature.
cluster_means = df_with_clusters.groupby("cluster")[numeric_cols].mean()

# Global mean over the same columns.
global_mean = df_with_clusters[numeric_cols].mean()

# Relative importance = difference between the cluster mean and the global mean.
# The difference is not normalised, so features on larger scales weigh more.
feature_importance = cluster_means - global_mean

# Number of points per cluster, computed once instead of filtering inside the loop.
cluster_sizes = df_with_clusters["cluster"].value_counts()

# Show the 10 features that deviate most (in absolute value) for each cluster.
for c in cluster_means.index:
    cluster_size = cluster_sizes[c]
    print(f"\n🔹 Cluster {c} ({cluster_size} points)")
    top_features = feature_importance.loc[c].abs().sort_values(ascending=False).head(10)
    print(top_features)


In [None]:
# df = table.limit(500000).to_pandas()

In [None]:
# from sklearn.preprocessing import StandardScaler

# df_scaled = StandardScaler().fit_transform(df) 

In [None]:
# from sklearn.decomposition import PCA

# pca = PCA(n_components=3)
# df_pca = pca.fit_transform(df_scaled)

# print(f"Variance expliquée cumulée : {pca.explained_variance_ratio_.cumsum()}")

# df_pca

In [None]:
# from sklearn.neighbors import NearestNeighbors

# min_samples = df_pca.shape[1] + 1

# neighbors = NearestNeighbors(n_neighbors=min_samples)
# neighbors_fit = neighbors.fit(df_pca)
# distances, indices = neighbors_fit.kneighbors(df_pca)

# # On prend la distance au dernier voisin (le k-ième)
# distances = np.sort(distances[:, -1])

# plt.figure(figsize=(8, 4))
# plt.plot(distances)
# plt.xlabel("Points triés")
# plt.ylabel(f"Distance au {min_samples}ème voisin")
# plt.title("k-distance plot")
# plt.show()

In [None]:
# print(f"Min distance: {distances.min()}")
# print(f"Max distance: {distances.max()}")
# print(f"Moyenne distance: {distances.mean()}")
# print(f"25% quantile: {np.percentile(distances, 25)}")
# print(f"50% quantile: {np.percentile(distances, 50)}")
# print(f"75% quantile: {np.percentile(distances, 75)}")

In [None]:
# from kneed import KneeLocator

# sensitivities = [0.5, 1]
# eps_values = []

# for S in sensitivities:
#     kneedle = KneeLocator(range(len(distances)), distances, S=S, curve='convex', direction='increasing')
#     knee_idx = kneedle.knee
#     if knee_idx is not None:
#         eps = distances[knee_idx]
#         print(f"eps détecté avec S={S} : {eps:.4f}")
#         eps_values.append(eps)
#     else:
#         print(f"Aucun coude détecté avec S={S}")
#         eps_values.append(None)

In [None]:
# from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
# from sklearn.cluster import DBSCAN

# max_sample_size = 1000

# best_score = -1
# best_eps = None
# rng = np.random.default_rng(seed=42)

# for eps in eps_values:
#     clustering = DBSCAN(eps=eps, min_samples=min_samples).fit(df_pca)
#     labels = clustering.labels_

#     n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
#     noise_ratio = list(labels).count(-1) / len(labels)
#     print(f"eps={eps:.2f} -> clusters: {n_clusters}, bruit: {noise_ratio*100:.3f}%")

#     mask = labels != -1
#     X_no_noise = df_pca[mask]
#     labels_no_noise = labels[mask]

#     # Vérifier qu'il reste au moins 2 clusters distincts après retrait du bruit
#     if len(set(labels_no_noise)) > 1:
#         sample_size = min(max_sample_size, len(X_no_noise))
#         indices = rng.choice(len(X_no_noise), sample_size, replace=False)
#         X_sample = X_no_noise[indices]
#         labels_sample = labels_no_noise[indices]

#         sil_score = silhouette_score(X_sample, labels_sample)
#         ch_score = calinski_harabasz_score(X_sample, labels_sample)
#         db_score = davies_bouldin_score(X_sample, labels_sample)

#         print(f"  Silhouette Score    : {sil_score:.3f}")
#         print(f"  Calinski-Harabasz   : {ch_score:.3f}")
#         print(f"  Davies-Bouldin      : {db_score:.3f}")

#         if sil_score > best_score:
#             best_score = sil_score
#             best_eps = eps
#     else:
#         print("Pas assez de clusters valides pour calculer les scores.")


In [None]:
# dbscan = DBSCAN(eps=best_eps, min_samples=min_samples)
# clusters = dbscan.fit_predict(df_pca)

In [None]:
# import plotly.express as px

# df_plot = pd.DataFrame({
#     'PC1': df_pca[:, 0],
#     'PC2': df_pca[:, 1],
#     'PC3': df_pca[:, 2],
#     'cluster': clusters
# })

# df_plot['color'] = df_plot['cluster'].apply(lambda x: 'grey' if x == -1 else f'cluster {x}')

# fig = px.scatter_3d(df_plot, x='PC1', y='PC2', z='PC3',
#                     color='color',
#                     title='Clusters DBSCAN',
#                     opacity=0.7,
#                     size_max=5,
#                     labels={'color': 'Cluster'})

# fig.show()

In [None]:
# import numpy as np
# import pandas as pd

# # Assume `df` is the DataFrame of features
# # and `labels` is the DBSCAN output
# # labels = clustering.labels_

# # Ajouter les labels au DataFrame
# df_clusters = df.copy()
# df_clusters['cluster'] = labels

# # On exclut le bruit (-1)
# df_clusters = df_clusters[df_clusters['cluster'] != -1]

# # Sélectionner uniquement les colonnes numériques
# numeric_cols = df_clusters.select_dtypes(include=[np.number]).columns
# numeric_cols = numeric_cols.drop('cluster')  # on exclut la colonne cluster

# # Moyenne par cluster
# cluster_means = df_clusters.groupby('cluster')[numeric_cols].mean()

# # Moyenne globale
# global_mean = df[numeric_cols].mean()

# # Importance relative : écart à la moyenne globale
# feature_importance = cluster_means - global_mean

# # Pour chaque cluster, trier les features par importance absolue
# for c in feature_importance.index:
#     print(f"\nCluster {c}")
#     print(feature_importance.loc[c].abs().sort_values(ascending=False).head(10))


In [None]:
-- Create the stage `clustering_model` unless it already exists.
-- The commented-out upload cell at the end of the notebook targets "@clustering_model".
CREATE STAGE IF NOT EXISTS clustering_model;

In [None]:
# import joblib

# # Sauvegarde du modèle
# joblib.dump(km, "birch_kmeans")

In [None]:
# session.file.put("birch_kmeans", "@clustering_model", auto_compress=False, overwrite=True)