In [0]:
%pip install scikit-learn 
%pip install pandas 
%pip install numpy

In [0]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from pyspark.sql.types import ArrayType, FloatType, StringType, StructType, StructField, LongType
from pyspark.sql.functions import pandas_udf
import matplotlib.pyplot as plt
import seaborn as sns
from databricks.vector_search.client import VectorSearchClient

In [0]:
import numpy as np
from scipy.spatial.distance import cdist

# 'calculate_dunn_index' measures how compact and well-separated the clusters are
def calculate_dunn_index(embeddings, cluster_labels):
    """Compute the Dunn index of a clustering.

    The index is the smallest distance between two points that belong to
    different clusters, divided by the largest distance between two points
    that belong to the same cluster. Distances come from ``cdist`` with its
    default (Euclidean) metric. Higher values indicate compact clusters that
    are far apart.

    Args:
        embeddings: 2-D array-like with one row per sample.
        cluster_labels: 1-D array-like with one cluster id per row of
            ``embeddings``.

    Returns:
        The Dunn index as a float. ``0.0`` is returned when the index is
        undefined: fewer than two clusters, or every cluster has zero
        diameter (e.g. all clusters are singletons).
    """
    # Accept plain lists as well as arrays; boolean masking needs ndarrays.
    embeddings = np.asarray(embeddings)
    cluster_labels = np.asarray(cluster_labels)

    clusters = [embeddings[cluster_labels == k] for k in np.unique(cluster_labels)]
    # With a single cluster there is no inter-cluster distance to measure.
    if len(clusters) < 2:
        return 0.0

    # Inter-cluster separation: closest pair of points across two clusters.
    # Distances are symmetric, so each unordered pair is visited only once.
    inter_cluster = min(
        np.min(cdist(clusters[i], clusters[j]))
        for i in range(len(clusters))
        for j in range(i + 1, len(clusters))
    )
    # Intra-cluster diameter: farthest pair of points inside one cluster.
    intra_cluster = max(
        np.max(cdist(cluster, cluster)) if len(cluster) > 1 else 0.0
        for cluster in clusters
    )
    # Avoid division by zero
    if intra_cluster == 0:
        return 0.0
    return float(inter_cluster / intra_cluster)

In [0]:
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score

def perform_clustering(embeddings, n_clusters_range=range(2, 10)):
    """Perform K-means clustering with optimal cluster selection.

    Fits one K-means model per candidate ``k`` and keeps the clustering with
    the LOWEST Davies-Bouldin score (for this index, lower means better
    separated clusters).

    Args:
        embeddings: 2-D array-like with one row per sample.
        n_clusters_range: Iterable of candidate cluster counts to evaluate.

    Returns:
        Tuple ``(best_k, final_labels, best_davies_bouldin,
        davies_bouldin_scores)`` where ``final_labels`` holds the cluster id
        of every row for ``best_k`` and ``davies_bouldin_scores`` holds one
        score per evaluated ``k``, in iteration order.

    Raises:
        ValueError: If no candidate produced a usable score (for example an
            empty ``n_clusters_range``).
    """
    best_k = None
    best_labels = None
    best_davies_bouldin = float("inf")
    davies_bouldin_scores = []

    # Find optimal nº of clusters
    for k in n_clusters_range:
        # 'n_init=10' runs 10 times with different initial centroids and picks the best result
        kmeans = KMeans(n_clusters=k,
                        random_state=42,
                        n_init=10)
        # Assigns each embeddings data point to a cluster
        cluster_labels = kmeans.fit_predict(embeddings)

        # 'davies_bouldin_score' measures cluster separation (with lower numbers being better)
        davies_bouldin = davies_bouldin_score(embeddings, cluster_labels)
        davies_bouldin_scores.append(davies_bouldin)

        # Lower is better, so keep the minimum. The labels are stored as well:
        # the fit is deterministic (fixed random_state), so re-fitting the
        # winning k afterwards would only repeat the same work.
        if davies_bouldin < best_davies_bouldin:
            best_davies_bouldin = davies_bouldin
            best_k = k
            best_labels = cluster_labels

    if best_k is None:
        raise ValueError(
            "n_clusters_range did not yield any cluster count with a valid Davies-Bouldin score"
        )

    return best_k, best_labels, best_davies_bouldin, davies_bouldin_scores

In [0]:
def _top_value_counts(series, n=3):
    """Return the ``n`` most frequent values of ``series`` as ``{value: count}``."""
    return series.value_counts().head(n).to_dict()


def analyze_clusters(df, cluster_labels, descriptions):
    """Analyze cluster characteristics.

    Args:
        df: pandas DataFrame with one row per contacto. It must contain the
            columns ``origem``, ``formulario``, ``tipo_de_pedido``,
            ``modelo``, ``agrupamento_cliente`` and ``caracterizacao``.
        cluster_labels: One cluster id per row of ``df``.
        descriptions: One text description per row of ``df``.

    Returns:
        Tuple ``(df_clustered, cluster_analysis)``. ``df_clustered`` is a copy
        of ``df`` with the extra columns ``labels`` and ``descriptions``.
        ``cluster_analysis`` maps ``'Cluster_<id>'`` to a dict holding the
        cluster size, its share of all rows (in percent), the 3 most frequent
        values of each analysed column and the first 3 descriptions.
    """
    df_clustered = df.copy()
    df_clustered['labels'] = cluster_labels
    df_clustered['descriptions'] = descriptions

    # Result key -> source column for the "top 3 most frequent values" summaries
    top_value_columns = {
        'top_origem': 'origem',
        'top_formulario': 'formulario',
        'top_tipo_de_pedido': 'tipo_de_pedido',
        'top_modelo': 'modelo',
        'top_agrupamento': 'agrupamento_cliente',
        'top_caracterizacao': 'caracterizacao',
    }

    total_rows = len(df_clustered)
    cluster_analysis = {}

    # Iterate over the ids that actually occur instead of assuming 0..k-1, so
    # non-contiguous ids (e.g. after filtering clusters out) are reported
    # under their real id and none is skipped.
    for cluster_label in sorted(df_clustered['labels'].dropna().unique()):
        cluster_data = df_clustered[df_clustered['labels'] == cluster_label]

        analysis = {
            'size': len(cluster_data),                         # Nº of rows in the cluster
            'percentage': len(cluster_data) / total_rows * 100,  # Share relative to the whole dataset
        }
        for result_key, column in top_value_columns.items():
            analysis[result_key] = _top_value_counts(cluster_data[column])
        # First 3 Contacto descriptions from this cluster
        analysis['sample_descriptions'] = cluster_data['descriptions'].head(3).tolist()

        cluster_analysis[f'Cluster_{cluster_label}'] = analysis

    return df_clustered, cluster_analysis

In [0]:
def visualize_clusters(embeddings, cluster_labels, best_k, cluster_names=None):
    """Create visualizations for cluster analysis.

    Draws two panels: a 2-D PCA projection of the embeddings coloured by
    cluster, and a bar chart with the number of rows per cluster. Tick
    labels and bar colours are derived from the cluster ids present in
    ``cluster_labels``, so the figure stays consistent for any number of
    clusters.

    Args:
        embeddings: 2-D array-like with one row per sample.
        cluster_labels: Numeric cluster id for every row of ``embeddings``.
        best_k: Unused; kept so existing calls keep working.
        cluster_names: Optional ``{cluster_id: display name}`` mapping. Ids
            missing from the mapping are shown as their numeric id. Defaults
            to the names previously hard-coded for ids 0, 1 and 2.
            NOTE(review): those default names presumably describe one
            specific 3-cluster solution - confirm before reusing them for a
            different k or dataset.
    """
    if cluster_names is None:
        cluster_names = {0: 'Site Hyundai Portugal',
                         1: 'Showroom',
                         2: 'Facebook'}

    cluster_labels = np.asarray(cluster_labels)
    unique_labels, counts = np.unique(cluster_labels, return_counts=True)
    # One display name per cluster id, shared by both panels
    label_names = [str(cluster_names.get(label, label)) for label in unique_labels.tolist()]

    # Principal Component Analysis (PCA) reduces high-dimensional data down to 2D, so it can be easier to explore it visually.
    pca = PCA(n_components=2, random_state=42)
    embeddings_2d = pca.fit_transform(embeddings)

    # Create plots
    fig, axes = plt.subplots(1, 2, figsize=(15, 6))

    # PCA plot
    scatter = axes[0].scatter(embeddings_2d[:, 0],
                              embeddings_2d[:, 1],
                              c=cluster_labels,
                              cmap='tab10',
                              alpha=0.6)
    axes[0].set_title('Contacto Clusters - PCA Visualization')

    # Colorbar with one tick (and name) per cluster id that is actually present
    cbar = fig.colorbar(scatter, ax=axes[0], ticks=unique_labels)
    cbar.set_ticklabels(label_names)

    # Cluster distribution. Bars reuse the scatter's own id -> colour mapping
    # so a cluster has the same colour in both panels.
    bar_colors = scatter.to_rgba(unique_labels)
    positions = range(len(label_names))
    axes[1].bar(positions, counts, color=bar_colors)
    axes[1].set_xticks(positions)
    axes[1].set_xticklabels(label_names)

    axes[1].set_title('Cluster Distribution')
    axes[1].set_xlabel('Cluster ID')
    axes[1].set_ylabel('Number of Contactos')

    plt.tight_layout()
    plt.show()

In [0]:
def visualize_clusters_optimized(embeddings, cluster_labels, best_k, n_outlier_clusters=2):
    """Create visualizations for cluster analysis, excluding the smallest clusters.

    The ``n_outlier_clusters`` clusters with the fewest rows are treated as
    noise and removed. The remaining clusters are renumbered 0..m-1 and shown
    as a 2-D PCA scatter plot plus a bar chart of cluster sizes.

    Args:
        embeddings: 2-D array-like with one row per sample.
        cluster_labels: Cluster id for every row of ``embeddings``.
        best_k: Unused; kept so existing calls keep working. The title
            reports the number of clusters that are actually plotted.
        n_outlier_clusters: How many of the smallest clusters to drop
            (default 2, the previously hard-coded value).

    Raises:
        ValueError: If ``n_outlier_clusters`` is negative or removing that
            many clusters would leave nothing to plot.
    """
    if n_outlier_clusters < 0:
        raise ValueError("n_outlier_clusters must be >= 0")

    embeddings = np.asarray(embeddings)
    cluster_labels = np.asarray(cluster_labels)

    unique_labels, counts = np.unique(cluster_labels, return_counts=True)
    if len(unique_labels) <= n_outlier_clusters:
        raise ValueError(
            f"Cannot drop {n_outlier_clusters} clusters: only "
            f"{len(unique_labels)} present, nothing would be left to plot"
        )

    # Identify the outliers: ids of the smallest clusters
    # (stable sort keeps ties in ascending id order, so the choice is deterministic)
    smallest_indices = np.argsort(counts, kind='stable')[:n_outlier_clusters]
    smallest_cluster_ids = unique_labels[smallest_indices]

    # Keep only the rows that do not belong to an outlier cluster
    mask = ~np.isin(cluster_labels, smallest_cluster_ids)
    filtered_embeddings = embeddings[mask]

    # 'return_inverse' renumbers the remaining ids to be contiguous (0, 1, 2, ...)
    kept_labels, mapped_labels = np.unique(cluster_labels[mask], return_inverse=True)
    n_remaining = len(kept_labels)

    pca = PCA(n_components=2, random_state=42)
    embeddings_2d = pca.fit_transform(filtered_embeddings)

    # Create plots
    fig, axes = plt.subplots(1, 2, figsize=(15, 6))

    # PCA plot
    scatter = axes[0].scatter(embeddings_2d[:, 0],
                              embeddings_2d[:, 1],
                              c=mapped_labels,
                              cmap='tab10',
                              alpha=0.6)
    # Report the number of clusters actually shown rather than an offset of best_k
    axes[0].set_title(f'Contacto - Clusters (PCA) - {n_remaining} clusters')
    fig.colorbar(scatter, ax=axes[0])

    # Cluster distribution
    filtered_unique_labels, filtered_counts = np.unique(mapped_labels, return_counts=True)
    axes[1].bar(filtered_unique_labels, filtered_counts)
    axes[1].set_title('Cluster Distribution')
    axes[1].set_xlabel('Cluster ID')
    axes[1].set_ylabel('Number of Contactos')

    plt.tight_layout()
    plt.show()

In [0]:
def load_and_preprocess_gold_contactos():
    """Load contactos data to create behavioral descriptions later on.

    Reads the Spark table ``workspace.sc_gold.contactos_pbs`` through the
    notebook's ``spark`` session and returns it as a pandas DataFrame, which
    is easier to use for text processing.
    """
    contactos_table = spark.table("workspace.sc_gold.contactos_pbs")
    return contactos_table.toPandas()

In [0]:
def create_contactos_descriptions(df):
    """Transform contactos data into natural language descriptions.

    Builds one Portuguese sentence block per row of ``df`` from the key
    behavioral columns; any missing value is rendered as 'não especificado'.

    Args:
        df: pandas DataFrame containing the columns listed in
            ``feature_columns`` below.

    Returns:
        List with one description string per row, in row order.
    """
    missing = 'não especificado'
    # Our key behavioral features, in the order they are unpacked below
    feature_columns = (
        'origem',
        'formulario',
        'tipo_de_pedido',
        'modelo',
        'consentimento',
        'email_opt_out',
        'agrupamento_cliente',
        'caracterizacao',
    )

    def value_or_missing(record, column):
        # Fall back to the placeholder when the cell is null/NaN
        value = record[column]
        return value if pd.notna(value) else missing

    def describe(record):
        (origem, formulario, tipo_de_pedido, modelo,
         consentimento, email_opt_out, agrupamento, caracterizacao) = (
            value_or_missing(record, column) for column in feature_columns
        )
        # Natural language description in Portuguese
        return (
            f"Origem do contacto: {origem}, através do formulário {formulario}. "
            f"Tipo de Pedido: {tipo_de_pedido}, solicitado para um modelo {modelo}. "
            f"Agrupado em: {agrupamento}, e caracterizado como {caracterizacao}. "
            f"Status do consentimento: {consentimento}, e a opção para receber email está em {email_opt_out}."
        )

    # 'df.iterrows()' yields (index, row) pairs; the index is not needed
    return [describe(record) for _, record in df.iterrows()]

In [0]:
import numpy as np

# Start by loading the table that stores one embedding vector per row
df = spark.table("workspace.sc_gold.contactos_pbs_embeddings_new")
# NOTE(review): only the "embedding" column is kept, so no row key travels with the vectors.
# The cluster labels derived from `embeddings` are later attached to `contacto_df` by position
# (see analyze_clusters), which assumes this table and workspace.sc_gold.contactos_pbs return
# their rows in the same order - TODO confirm, or carry a key column and join on it.
embeddings_df = df.select("embedding")
# Then, convert to a list of lists (i.e., each row is one embedding vector)
embeddings_list = embeddings_df.toPandas()["embedding"].tolist()
# Lastly, convert to a 2-D NumPy float array (rows = samples)
embeddings = np.array(embeddings_list, dtype=float)

In [0]:
'''
Davies-Bouldin index score reference:
≈ 0.0 - 0.5  = Excellent clustering
≈ 0.5 - 1.0	 = Good
≈ 1.0 - 2.0	 = Moderate
> 2.0        = Poor clustering
'''

In [0]:
# Evaluate the default range of cluster counts and keep the selected clustering in notebook globals
print("Performing clustering...")
best_k, cluster_labels, best_davies_bouldin, davies_bouldin_scores = perform_clustering(embeddings)
print(f"Optimal number of clusters: {best_k}")

# NOTE(review): the second message says "Best ... scores", but the list holds one score per
# candidate k (in evaluation order), not only the best one.
print(f"\nBest Davies-Bouldin: {best_davies_bouldin:.1f}")
print(f"Best Davies-Bouldin scores for different clusters/Ks: {davies_bouldin_scores}")

In [0]:
'''
Dunn Index score reference:
0.5 - 1.0 = Good Clustering

Observations:
•Distinct cluster separation with reasonable compactness.
•This is often the practical "good" range for real applications.
'''

In [0]:
# 'calculate_dunn_index' measures how compact and well-separated the clusters are.
# It is computed once, for the clustering selected above, so the printed value is that
# clustering's Dunn index rather than the best of several candidates.
dunn = calculate_dunn_index(embeddings, cluster_labels)
print(f"\nBest Dunn Index: {dunn:.2f}")

In [0]:
# Load the contactos table as a pandas DataFrame
print("Loading contactos data...")
contacto_df = load_and_preprocess_gold_contactos()

# Build one natural-language description per contacto row (same order as contacto_df)
print("Creating contactos behavioral descriptions...")
contacto_descriptions = create_contactos_descriptions(contacto_df)

In [0]:
# Summarize the clustering selected by perform_clustering.
# The cluster count comes from `best_k` instead of a hard-coded number, so the
# message stays correct whatever k was selected.
print(f"Analyzing {best_k} clusters...")
clustered_df, cluster_analysis = analyze_clusters(contacto_df, cluster_labels, contacto_descriptions)

# Print cluster analysis
for cluster_name, analysis in cluster_analysis.items():
    print(f"\n{cluster_name}:")
    print(f"  Size: {analysis['size']} ({analysis['percentage']:.1f}%)")
    print(f"  Top Origins: {analysis['top_origem']}")
    print(f"  Top Formulario: {analysis['top_formulario']}")
    print(f"  Tipo Pedido: {analysis['top_tipo_de_pedido']}")
    print(f"  Top Models: {analysis['top_modelo']}")
    print(f"  Top Agrupamento: {analysis['top_agrupamento']}")
    print(f"  Top Caracterização: {analysis['top_caracterizacao']}")

In [0]:
# Plot the selected clustering: PCA scatter coloured by cluster plus the cluster-size bar chart
visualize_clusters(embeddings, cluster_labels, best_k)

In [0]:
# Visualization with meaningful labels
# NOTE(review): this call has exactly the same arguments as the one in the previous cell,
# so it renders the same figure a second time - consider removing one of the two cells.
visualize_clusters(embeddings, cluster_labels, best_k)

In [0]:
# Show the contactos together with their assigned cluster label and generated description
display(clustered_df)

In [0]:
from pyspark.sql import DataFrame
from pyspark.sql.functions import col

# Read the schema of the destination table so the output can be shaped to match it
target_schema = spark.table("workspace.sc_gold.contactos_pbs_clusters").schema

# Converts clustered_df, which is a pandas DataFrame
clustered_spark_df = spark.createDataFrame(clustered_df)
# Reorder and cast DataFrame columns to match the target schema
# (columns of clustered_df that are not in the target schema are dropped by this select)
clustered_spark_df = clustered_spark_df.select(
    [col(field.name).cast(field.dataType) for field in target_schema]
)

# NOTE(review): mode("append") adds these rows on every execution, so re-running the
# notebook writes the same contactos again - confirm this is intended, otherwise
# overwrite or clear the table before writing.
clustered_spark_df.write \
                  .format("delta") \
                  .mode("append") \
                  .saveAsTable("workspace.sc_gold.contactos_pbs_clusters")

In [0]:
# Additional analysis: inspect an alternative 4-cluster solution so it can be compared
# with the clustering selected above.
ALT_K = 4

# Cluster explicitly with ALT_K. Reusing `cluster_labels` here would only repeat the
# analysis of the already selected clustering, whatever the message says.
# Separate variable names keep `cluster_labels`, `clustered_df` and `cluster_analysis`
# pointing at the selected clustering for the cells that follow.
_, cluster_labels_alt, _, _ = perform_clustering(embeddings, n_clusters_range=[ALT_K])

print(f"Analyzing {ALT_K} clusters...")
clustered_df_alt, cluster_analysis_alt = analyze_clusters(contacto_df, cluster_labels_alt, contacto_descriptions)

# Print cluster analysis for the alternative clustering
for cluster_name, analysis in cluster_analysis_alt.items():
    print(f"\n{cluster_name}:")
    print(f"  Size: {analysis['size']} ({analysis['percentage']:.1f}%)")
    print(f"  Top Origins: {analysis['top_origem']}")
    print(f"  Top Formulario: {analysis['top_formulario']}")
    print(f"  Tipo Pedido: {analysis['top_tipo_de_pedido']}")
    print(f"  Top Models: {analysis['top_modelo']}")
    print(f"  Top Agrupamento: {analysis['top_agrupamento']}")
    print(f"  Top Caracterização: {analysis['top_caracterizacao']}")

# Visualize results
visualize_clusters(embeddings, cluster_labels_alt, ALT_K)

In [0]:
# Plot the selected clustering again with the two smallest clusters removed as noise.
# NOTE(review): the original comment said "Results for 5 clusters"; the number of clusters
# shown depends on how many distinct ids `cluster_labels` holds at run time - confirm.
visualize_clusters_optimized(embeddings, cluster_labels, best_k)