In [0]:
#'-U' upgrades the package to the latest available version
%pip install -U sentence-transformers

In [0]:
%pip install scikit-learn 
%pip install pandas 
%pip install numpy

In [0]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import LabelEncoder
from pyspark.sql import functions as F
from pyspark.sql.types import ArrayType, FloatType, StringType
from pyspark.sql.functions import pandas_udf
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

In [0]:
"""Load contactos data to create behavioral descriptions later on"""
def load_and_preprocess_gold_contactos(table_name="workspace.sc_gold.contactos_pbs", limit=10000):
    """Load the contactos table through Spark and return it as a pandas DataFrame.

    Relies on a `spark` session object being available in the notebook
    environment; it is not created anywhere in this notebook.

    Args:
        table_name: Fully qualified name of the table to read. Defaults to the
            gold contactos table this notebook was written for.
        limit: Maximum number of rows to collect. Pass ``None`` to collect the
            whole table.

    Returns:
        The result of ``toPandas()`` on the (optionally limited) Spark DataFrame.
    """
    df = spark.table(table_name)

    if limit is not None:
        # NOTE(review): `limit` without an explicit ordering does not promise
        # the same rows on every run, so results may vary between executions.
        df = df.limit(limit)

    # Convert to pandas for easier text processing; this collects the rows
    # locally, which is why a row cap is applied by default.
    return df.toPandas()

In [0]:
"""Transform contactos data into natural language descriptions"""
def create_contactos_descriptions(df):
    """Turn each contacto row into a one-sentence Portuguese description.

    Null/NaN values in any of the feature columns are rendered as
    'não especificado'.

    Args:
        df: pandas DataFrame containing the columns 'origem', 'formulario',
            'tipo_de_pedido', 'modelo', 'consentimento', 'email_opt_out',
            'agrupamento_cliente' and 'caracterizacao'.

    Returns:
        A list of strings, one per row of `df`, in row order.

    Raises:
        KeyError: If any of the required columns is missing from `df`.
    """
    placeholder = 'não especificado'
    feature_columns = [
        'origem',
        'formulario',
        'tipo_de_pedido',
        'modelo',
        'consentimento',
        'email_opt_out',
        'agrupamento_cliente',
        'caracterizacao',
    ]

    # Fail early with the full list of missing columns instead of on the first one.
    missing_columns = [col for col in feature_columns if col not in df.columns]
    if missing_columns:
        raise KeyError(f"Missing required columns: {missing_columns}")

    descriptions = []
    # Walk the feature columns in lockstep rather than using df.iterrows():
    # iterrows builds a Series per row and can upcast values (e.g. int -> float),
    # which would leak into the generated text.
    for raw_values in zip(*(df[col] for col in feature_columns)):
        (origem, formulario, tipo_de_pedido, modelo,
         consentimento, email_opt_out, agrupamento, caracterizacao) = (
            value if pd.notna(value) else placeholder for value in raw_values
        )

        # NOTE(review): the label "prefere receber email" is filled with the
        # `email_opt_out` value, which reads as the opposite meaning — confirm
        # the intended wording before changing it, since it affects embeddings.
        description = f"Origem do contacto: {origem}, através do formulário {formulario}, " \
                      f"solicitando {tipo_de_pedido} para o modelo {modelo}. " \
                      f"Status do consentimento: {consentimento}, " \
                      f"prefere receber email: {email_opt_out}, agrupado em: {agrupamento}, " \
                      f"caracterizado como: {caracterizacao}"

        descriptions.append(description)

    return descriptions

In [0]:
"""Uses SentenceTransformer model from Hugging Face, a Python framework which performs comparably to OpenAI embeddings"""
def generate_embeddings(descriptions, model_name='sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'):
    """Encode the contacto descriptions into embedding vectors.

    Args:
        descriptions: Sequence of description strings to encode.
        model_name: Name of the SentenceTransformer model to load.

    Returns:
        Whatever ``SentenceTransformer.encode`` returns for `descriptions`
        (the numerical vector representations of the texts).
    """
    # The model is instantiated on every call, before any progress is reported.
    encoder = SentenceTransformer(model_name)

    print(f"Generating embeddings for {len(descriptions)} Contactos...")

    vectors = encoder.encode(descriptions, show_progress_bar=True)
    return vectors

In [0]:
"""Perform K-means clustering with optimal cluster selection"""
def perform_clustering(embeddings, n_clusters_range=range(2, 10), random_state=42, n_init=10):
    """Cluster embeddings with K-means, choosing k by the highest silhouette score.

    Args:
        embeddings: Array-like of embedding vectors, one row per sample.
        n_clusters_range: Iterable of candidate cluster counts to evaluate.
        random_state: Seed passed to every KMeans fit so runs are repeatable.
        n_init: Number of K-means initialisations per fit; the best one is kept.

    Returns:
        A tuple ``(labels, best_k, best_score, scores)`` where `labels` are the
        cluster assignments for the winning k, `best_k` is that k, `best_score`
        is its silhouette score, and `scores` lists the silhouette score of
        every candidate k that was actually evaluated, in iteration order.
    """
    n_samples = len(embeddings)
    best_score = -1
    best_k = 2
    best_labels = None
    scores = []

    # Find the optimal number of clusters using the silhouette score
    for k in n_clusters_range:
        # The silhouette score is only defined for 2 <= k <= n_samples - 1;
        # skip candidates outside that window instead of crashing on them.
        if k < 2 or k >= n_samples:
            continue

        kmeans = KMeans(n_clusters=k,
                        random_state=random_state,
                        n_init=n_init)
        # Assign each embedding to a cluster
        cluster_labels = kmeans.fit_predict(embeddings)
        # Measures how well each point fits within its own cluster
        score = silhouette_score(embeddings, cluster_labels)
        scores.append(score)

        if score > best_score:
            # Keep the winning labels so the best k does not need to be refit:
            # with a fixed seed the refit would reproduce the same assignment.
            best_score, best_k, best_labels = score, k, cluster_labels

    if best_labels is None:
        # No candidate was evaluated (or none beat the initial score):
        # fall back to a single fit with the default best_k, as before.
        best_labels = KMeans(n_clusters=best_k,
                             random_state=random_state,
                             n_init=n_init).fit_predict(embeddings)

    return best_labels, best_k, best_score, scores

In [0]:
"""Analyze cluster characteristics"""
def analyze_clusters(df, cluster_labels, descriptions):
    """Attach cluster labels and descriptions to the data and summarise each cluster.

    Args:
        df: pandas DataFrame with at least the columns 'origem', 'modelo' and
            'tipo_cliente'. It is copied, not modified.
        cluster_labels: Cluster assignment for each row of `df`.
        descriptions: Text description for each row of `df`.

    Returns:
        A tuple ``(df_clustered, cluster_analysis)``. `df_clustered` is a copy
        of `df` with added 'labels' and 'descriptions' columns.
        `cluster_analysis` maps ``'Cluster_<label>'`` to a dict with the
        cluster's size, its percentage of all rows, the three most frequent
        values of 'origem', 'modelo' and 'tipo_cliente', and its first three
        descriptions.
    """
    df_clustered = df.copy()
    df_clustered['labels'] = cluster_labels
    df_clustered['descriptions'] = descriptions

    total_rows = len(df_clustered)

    def top_three(series):
        # Three most frequent values of the column, as {value: count}
        return series.value_counts().head(3).to_dict()

    cluster_analysis = {}
    # Iterate over the labels that actually occur. Counting the distinct labels
    # and assuming they are 0..n-1 would report empty clusters and drop real
    # ones whenever the labels are not contiguous.
    for cluster_label in sorted(df_clustered['labels'].unique()):
        cluster_data = df_clustered[df_clustered['labels'] == cluster_label]

        cluster_analysis[f'Cluster_{cluster_label}'] = {
            'size': len(cluster_data),                            # Number of rows in the cluster
            'percentage': len(cluster_data) / total_rows * 100,   # Share of the whole dataset
            'top_origem': top_three(cluster_data['origem']),
            'top_modelo': top_three(cluster_data['modelo']),
            'top_tipo_cliente': top_three(cluster_data['tipo_cliente']),
            # First 3 descriptions from this cluster, in row order
            'sample_descriptions': cluster_data['descriptions'].head(3).tolist(),
        }

    return df_clustered, cluster_analysis

In [0]:
"""Create visualizations for cluster analysis"""
def visualize_clusters(embeddings, cluster_labels, best_k):
    """Show a two-panel figure: a 2-D PCA scatter of the clusters and their sizes.

    Args:
        embeddings: Embedding vectors, one row per contacto.
        cluster_labels: Cluster assignment for each row of `embeddings`.
        best_k: Number of clusters, used only in the scatter plot title.
    """
    # Reduce the embeddings to their first two principal components
    projected = PCA(n_components=2, random_state=42).fit_transform(embeddings)
    first_component = projected[:, 0]
    second_component = projected[:, 1]

    # One row, two panels: projection on the left, cluster sizes on the right
    fig, (ax_pca, ax_sizes) = plt.subplots(1, 2, figsize=(15, 6))

    # Left panel: points coloured by cluster label
    points = ax_pca.scatter(first_component,
                            second_component,
                            c=cluster_labels,
                            cmap='tab10',
                            alpha=0.6)
    ax_pca.set_title(f'Contacto Clusters (PCA) - {best_k} clusters')
    ax_pca.set_xlabel('First Principal Component')
    ax_pca.set_ylabel('Second Principal Component')
    fig.colorbar(points, ax=ax_pca)

    # Right panel: number of contactos per cluster
    cluster_ids, cluster_sizes = np.unique(cluster_labels, return_counts=True)
    ax_sizes.bar(cluster_ids, cluster_sizes)
    ax_sizes.set_title('Cluster Size Distribution')
    ax_sizes.set_xlabel('Cluster ID')
    ax_sizes.set_ylabel('Number of Contactos')

    plt.tight_layout()
    plt.show()

In [0]:
"""Main function to execute the clustering pipeline"""

# Pipeline driver: runs every stage in order at cell level (there is no
# wrapping function), so each name assigned below stays available to later
# cells in the notebook.

# Stage 1: read the contactos table into a pandas DataFrame
print("Loading contactos data...")
contacto_df = load_and_preprocess_gold_contactos()

# Stage 2: build one natural-language description per row
print("Creating contactos behavioral descriptions...")
contacto_descriptions = create_contactos_descriptions(contacto_df)

# Stage 3: encode the descriptions into embedding vectors
print("Generating embeddings...")
embeddings = generate_embeddings(contacto_descriptions)

# Stage 4: K-means over the default range of k, keeping the best silhouette score
print("Performing clustering...")
cluster_labels, best_k, best_score, scores = perform_clustering(embeddings)

print(f"Optimal number of clusters: {best_k}")
print(f"Best silhouette score: {best_score:.3f}")
print(f"Silhouette scores for different Ks: {scores}")

# Stage 5: per-cluster summary (size, share, most frequent values)
print("Analyzing clusters...")
clustered_df, cluster_analysis = analyze_clusters(contacto_df, cluster_labels, contacto_descriptions)

# Print the summary of each cluster; 'sample_descriptions' is computed but not printed here
for cluster_name, analysis in cluster_analysis.items():
    print(f"\n{cluster_name}:")
    print(f"  Size: {analysis['size']} ({analysis['percentage']:.1f}%)")
    print(f"  Top Origins: {analysis['top_origem']}")
    print(f"  Top Models: {analysis['top_modelo']}")
    print(f"  Contacto Types: {analysis['top_tipo_cliente']}")

# Stage 6: PCA scatter plot and cluster-size bar chart
visualize_clusters(embeddings, cluster_labels, best_k)

In [0]:
"""Slides"""
# TODO(slides): explain that SentenceTransformers, with models hosted on Hugging Face, is a Python framework for state-of-the-art embeddings
# TODO(slides): include slides that introduce embeddings as well
# TODO(slides): include a description of the silhouette score as well