In [0]:
%pip install scikit-learn 
%pip install pandas 
%pip install numpy

In [0]:
%pip install -U sentence-transformers #'-U' upgrades the package to the latest available version
%pip install databricks-vectorsearch

dbutils.library.restartPython()

In [0]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from pyspark.sql.types import ArrayType, FloatType, StringType, StructType, StructField, LongType
from pyspark.sql.functions import pandas_udf
import matplotlib.pyplot as plt
import seaborn as sns

In [0]:
"""Load contactos data to create behavioral descriptions later on"""
def load_and_preprocess_gold_contactos():
    df = spark.table("workspace.sc_gold.contactos_pbs")
    # Convert to Pandas for easier text processing
    df_pd = df.toPandas()
    
    return df_pd

In [0]:
"""Transform contactos data into natural language descriptions"""
def create_contactos_descriptions(df):
    descriptions = []
    
    # 'df.iterrows()' to iterate over DataFrame rows as (index, row) pairs, with this index being ignored as '_'
    for _, row in df.iterrows():
        # Our key behavioral features
        origem        = row['origem']         if pd.notna(row['origem']) \
                                              else 'não especificado'
        formulario    = row['formulario']     if pd.notna(row['formulario']) \
                                              else 'não especificado'
        tipo_de_pedido= row['tipo_de_pedido'] if pd.notna(row['tipo_de_pedido']) \
                                              else 'não especificado'
        modelo        = row['modelo']         if pd.notna(row['modelo']) \
                                              else 'não especificado'
        consentimento = row['consentimento']  if pd.notna(row['consentimento']) \
                                              else 'não especificado'
        email_opt_out = row['email_opt_out']  if pd.notna(row['email_opt_out']) \
                                              else 'não especificado'
        agrupamento   = row['agrupamento_cliente'] if pd.notna(row['agrupamento_cliente'])\
                                                   else 'não especificado'
        caracterizacao= row['caracterizacao'] if pd.notna(row['caracterizacao']) \
                                              else 'não especificado'
        
        # And then we create a natural language description in Portuguese
        description = f"Origem do contacto: {origem}, através do formulário {formulario}. " \
                      f"Tipo de Pedido: {tipo_de_pedido}, solicitado para um modelo {modelo}. " \
                      f"Agrupado em: {agrupamento}, e caracterizado como {caracterizacao}. " \
                      f"Status do consentimento: {consentimento}, e a opção para receber email está em {email_opt_out}."

        descriptions.append(description)
    
    return descriptions

In [0]:
from sentence_transformers import SentenceTransformer

"""Uses SentenceTransformer model from Hugging Face, a Python framework which performs comparably to OpenAI embeddings"""
def generate_embeddings(descriptions, model_name='sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'):
    model = SentenceTransformer(model_name)
    
    # Generate the embeddings, meaning the numerical vector representations of contactos descriptions
    print(f"Generating embeddings for {len(descriptions)} Contactos...")
    embeddings = model.encode(descriptions, show_progress_bar=True)
    
    return embeddings

In [0]:
"""Begin the execution by creating the description after loading"""

print("Loading contactos data...")
contacto_df = load_and_preprocess_gold_contactos()

print("Creating contactos behavioral descriptions...")
contacto_descriptions = create_contactos_descriptions(contacto_df)

In [0]:
print("Generating embeddings...")
spark = SparkSession.builder.getOrCreate()
embeddings = generate_embeddings(contacto_descriptions)

In [0]:
# Assemble ids, descriptions and vectors side by side in pandas, then persist them as a Delta table.
# The three inputs are paired positionally, so they must all follow contacto_df's row order.
# First, convert to Pandas DataFrame
pdf = pd.DataFrame({
    "id": contacto_df['_fivetran_id'], # It should be the source's PK to facilitate future joins
    "description": contacto_descriptions,
    "embedding": embeddings.tolist()  # Convert numpy arrays to Python lists
})
# Then, the respective schema for this Spark DataFrame, which should match the existing table definition (the one with PK/enableChangeDataFeed)
# NOTE(review): "id" is declared as a non-nullable LongType, which assumes _fivetran_id holds
# integer values with no nulls — confirm against the source table.
schema = StructType([
    StructField("id", LongType(), False), 
    StructField("description", StringType(), True),
    StructField("embedding", ArrayType(FloatType()), True)
])
df = spark.createDataFrame(pdf, schema=schema)

# Lastly, save as a Delta table
# mode("overwrite") replaces whatever the table currently holds on every run of this cell.
print("Creating contactos_pbs_embeddings table...")
df.write.format("delta") \
        .mode("overwrite") \
        .saveAsTable("workspace.sc_gold.contactos_pbs_embeddings")

In [0]:
%sql
-- Confirm the embedding dimension produced by the sentence-transformers model
SELECT size(embedding) as embedding_dimension 
FROM workspace.sc_gold.contactos_pbs_embeddings 
LIMIT 1


In [0]:
'''
Sample for assigning unique/sequential IDs
PS: Considering another generate_embeddings has been called for 1000 embeddings
'''
# Rebuilds the pandas frame `pdf` with ids 1..N (N = number of descriptions) instead of _fivetran_id.
# NOTE(review): only `pdf` is rebound here; no visible cell turns it into the Spark `df`,
# so a later write of `df` would still carry the earlier ids — confirm this is intended.
pdf = pd.DataFrame({
    "id": range(1, len(contacto_descriptions) + 1), # Way of assigning a unique, sequential ID to each description
    "description": contacto_descriptions,
    "embedding": embeddings.tolist()  # Convert numpy arrays to Python lists
})

# Renders the full frame, including the embedding lists, in the cell output.
display(pdf)

In [0]:
%sql
-- It should have been created with the PK in it; kept here (disabled) for reference.
-- SQL has no ''' comment syntax (a quote starts a string literal), so the statements
-- below are disabled with line comments instead.
-- ALTER TABLE workspace.sc_gold.contactos_pbs_embeddings_new
-- ALTER COLUMN id SET NOT NULL;
-- ALTER TABLE workspace.sc_gold.contactos_pbs_embeddings_new
-- ADD CONSTRAINT pk_id PRIMARY KEY(id);

-- Setting the new embeddings table: turn on Change Data Feed.
-- NOTE(review): the table is written by a later cell in this notebook; on a fresh run this
-- statement presumably needs the table to exist already — confirm the intended cell order.
ALTER TABLE workspace.sc_gold.contactos_pbs_embeddings_new
SET TBLPROPERTIES (delta.enableChangeDataFeed = true);


In [0]:
# Saving into a new embeddings table.
# The log message now names the table that is actually written (the "_new" one).
# NOTE(review): `df` is whatever Spark DataFrame an earlier cell bound to that name; the
# sequential-id sample only rebuilds the pandas `pdf` — confirm `df` holds the intended ids.
print("Creating contactos_pbs_embeddings_new table...")
(
    df.write.format("delta")
    .mode("overwrite")  # replaces any existing contents of the target table
    .saveAsTable("workspace.sc_gold.contactos_pbs_embeddings_new")
)

In [0]:
# Search to find the query_vector with the best score.
# PS: Considering df_pd has rows from the contactos_pbs_embeddings table.
#
# Runs one similarity search per row of df_pd and keeps the lowest score seen across all
# searches. Relies on `df_pd`, `embeddings` and `index` already being bound in the session;
# in this notebook they are assigned by cells that appear further down.
min_score = float("inf")  # neutral start: any real score replaces it, whatever its scale
for idx, row in df_pd.iterrows():
    print(f"Getting the min_score for the embedding {row['id']} (the current min_score is {min_score})")

    results = index.similarity_search(
        num_results=10000, # Max nº of results allowed
        columns=["id"], # Requests only the id/pk from the matched records
        query_vector=embeddings[idx].tolist(), # Reference for the search engine to compare against
        disable_notice=True
    )

    # Only "id" was requested, so position 1 of each returned row is read as the score.
    # A search with no hits is skipped instead of crashing min() on an empty list.
    scores = [r[1] for r in (results['result'].get('data_array') or [])]
    if not scores:
        continue

    min_score = min(min_score, min(scores))

# Stays at inf when no search returned any row.
print(f"Final min_score: {min_score}")

In [0]:
# Reload a single row (id 690, found to have a good score in another search) to use as the query.
# This rebinds `df`, `df_pd` and `embeddings`, replacing the values earlier cells stored there.
# NOTE(review): the filter compares id with the quoted literal '690' while the write cell
# declares id as LongType — presumably Spark casts it; confirm the row is actually matched.
df = spark.table("workspace.sc_gold.contactos_pbs_embeddings")  \
          .where("id = '690'")
df_pd = df.toPandas()
# From here on `embeddings` is the pandas column of this filtered frame, not the full encode() output.
embeddings = df_pd['embedding']

In [0]:
from databricks.vector_search.client import VectorSearchClient

# Client used for all Vector Search endpoint/index operations in this cell.
vsc = VectorSearchClient()

# The string below is an inert reference snippet (it is never executed).
# NOTE(review): it names the "_new" endpoint/index, while the create call below uses the
# names without "_new" — confirm which pair is the intended one.
'''If it exists, here is how to delete the index
vsc.delete_index(
    endpoint_name="contactos_pbs_embeddings_new_endpoint",
    index_name="workspace.sc_gold.contactos_pbs_embeddings_new_index"
)
'''

# Create a Vector Search¹ index on the embedding column
# ¹A type of search optimized to retrieve embeddings
# NOTE(review): presumably this call fails when the index already exists, so re-running the
# cell would need the delete step above first — confirm.
vsc.create_delta_sync_index(
    endpoint_name="contactos_pbs_embeddings_endpoint",
    index_name="workspace.sc_gold.contactos_pbs_embeddings_index", # Needed to execute the "similarity_search()" later on,
                                                                   # with Vector Search being the underlying method performing it
    source_table_name="workspace.sc_gold.contactos_pbs_embeddings",
    pipeline_type="TRIGGERED", # This index is only updated when the trigger is explicitly sync/called
    primary_key="id",
    embedding_vector_column="embedding",
    embedding_dimension=384 # Should equal size(embedding) in the source table (checked by the %sql cell)
)

In [0]:
from databricks.vector_search.client import VectorSearchClient

# Fetch a handle to the existing index and show its description as the cell output.
vsc = VectorSearchClient()
index = vsc.get_index(
    endpoint_name="contactos_pbs_embeddings_endpoint", 
    index_name="workspace.sc_gold.contactos_pbs_embeddings_index"
)
index.describe()

In [0]:
from databricks.vector_search.client import VectorSearchClient
import pandas as pd

# Re-acquire the index handle so this cell can run without the describe cell above.
vsc = VectorSearchClient()
index = vsc.get_index(
    endpoint_name="contactos_pbs_embeddings_endpoint", 
    index_name="workspace.sc_gold.contactos_pbs_embeddings_index"
)

# Query the index with the first entry of `embeddings`; the result is kept in `results`
# for the filtering cell that follows.
results = index.similarity_search(
    num_results=10000, # Max nº of results allowed
    columns=["id", "description"],
    query_vector=embeddings[0].tolist(), # Reference for the search engine to compare against
                                         # PS: Loaded in a previous cell
    disable_notice=True
)

In [0]:
'''
Here is the reference for distance-based scores (used by Databricks Vector Search) in embeddings:

< 0.01 = Excellent matches (very similar content)
0.01 - 0.05 = Good matches (related content)
0.05 - 0.15 = Fair matches (somewhat related)
> 0.15 = Poor matches (likely unrelated)
'''

In [0]:
# Keep only the results whose score is at most 0.15 (the "fair match" cut-off of the
# reference table) and show the one with the lowest score.
# Each result row is read positionally as [id, description, score], following the columns
# requested in the similarity search.
# NOTE(review): this treats lower scores as better; Databricks documents the returned score
# as a similarity where higher means closer — confirm the thresholds and the sort direction.
fair_matches = []
for result in (results['result'].get('data_array') or []):
    # Filter by fair matches only
    if result[2] <= 0.15:
        fair_matches.append({
            'id': int(result[0]), # id has the row nº in sequential order, as it was loaded from contactos_pbs to contactos_pbs_embeddings
                                  # (added to support possible joins)
            'Description': result[1],
            'score': int(result[2] * 100) / 100, # Truncates (does not round) the score to two decimal places
        })

# Explicit columns keep the frame well-formed when nothing matched; without them an empty
# list yields a frame with no 'score' column and the sort below raises KeyError.
df = pd.DataFrame(fair_matches, columns=['id', 'Description', 'score'])

#Display the best score
df_sorted = df.sort_values('score', ascending=True)
display(df_sorted.head(1))