In [None]:
!pip uninstall vertexai

In [None]:
!pip install vertexai

In [None]:
# NOTE(review): the next cell does `from utils import authenticate`. That looks like
# it expects a project-local utils.py rather than the PyPI package named "utils" —
# TODO confirm whether this install is needed at all.
!pip install utils

In [None]:

from utils import authenticate

# Obtain the credentials and the project ID that are passed to vertexai.init() below.
(credentials, PROJECT_ID) = authenticate()

In [None]:
# Import and initialize the Vertex AI Python SDK
import vertexai

# REGION was used below without being defined anywhere in this notebook, which
# raises NameError on a fresh kernel. "us-central1" is an assumed default —
# TODO confirm it matches the region of your project.
REGION = "us-central1"

vertexai.init(project=PROJECT_ID,
              location=REGION,
              credentials=credentials)

In [None]:
import numpy as np
from vertexai.language_models import TextEmbeddingModel

In [None]:
# Seven short sentences to embed: the first three mention animals, the next two
# food, and the last two programming.
input_text_lst_news = [
    "Hemos observado un ibis volando hacia la laguna",
    "Avistada una nutria en la costa",
    "Un perro parece divertirse con los delfines",
    "La ensalada de aguacate está deliciosa!",
    "Me encanta la comida japonesa!",
    "Los programadores de Python son gente genial",
    "TypeScript, C++ o Java? Todos molan!",
]

# Keep the individual names available as well, in the same order as the list.
in_1, in_2, in_3, in_4, in_5, in_6, in_7 = input_text_lst_news

In [None]:
# Load the pretrained text-embedding model identified by this name.
EMBEDDING_MODEL_NAME = "textembedding-gecko-multilingual@001"
embedding_model = TextEmbeddingModel.from_pretrained(EMBEDDING_MODEL_NAME)

- Obtenemos embeddings para todas las piezas de texto.
- Los almacenamos en un array NumPy 2D (una fila por cada embedding).

In [None]:
# One request per sentence: each call receives a single-element list, and the
# `.values` of its first result is kept as that sentence's embedding vector.
embeddings = [
    embedding_model.get_embeddings([sentence])[0].values
    for sentence in input_text_lst_news
]

# Stack the vectors into a 2D array, one row per sentence.
embeddings_array = np.array(embeddings)

In [None]:
# Show the array dimensions first, then the (abbreviated) array itself.
print(f"Shape: {embeddings_array.shape}")
print(embeddings_array)

Shape: (7, 768)
[[ 0.04559246 -0.01472285 -0.02949955 ...  0.04057328 -0.03193641
  -0.01936668]
 [-0.01995482  0.00037652  0.0116593  ...  0.02617216 -0.03978169
  -0.02036468]
 [ 0.01030084  0.02219611  0.02433357 ...  0.03538613 -0.0273955
  -0.04193578]
  
 ...
 
 [-0.0263201  -0.01767797 -0.01261324 ... -0.01372547  0.00060259
   0.01581882]
 [-0.00561961 -0.02237099 -0.03271009 ... -0.02777804 -0.03388645
  -0.01553735]
 [ 0.00867064 -0.0131854   0.04283332 ... -0.04224638  0.01800203
   0.01088098]]

#### Reducimos embeddings de 768 a 2 dimensiones para visualización
- Usamos Análisis de Componentes Principales (PCA).


In [None]:
from sklearn.decomposition import PCA

# Fit a 2-component PCA on the embeddings (fit() returns the fitted estimator),
# then project the same embeddings onto those two components for 2D plotting.
PCA_model = PCA(n_components=2).fit(embeddings_array)
new_values = PCA_model.transform(embeddings_array)

In [None]:
# Show the dimensions of the reduced array, then its values.
print(f"Shape: {new_values.shape}")
print(new_values)

Shape: (7, 2)

[[-0.40980753 -0.10084478]
 [-0.39561909 -0.18401444]
 [-0.29958523  0.07514691]
 [ 0.16077688  0.32879395]
 [ 0.1893873   0.48556638]
 [ 0.31516547 -0.23624716]
 [ 0.4396822  -0.36840086]]

In [None]:
import matplotlib.pyplot as plt
import mplcursors


def plot_2D(x_values, y_values, labels):
    """Draw a 2D scatter plot whose points show a text label on hover.

    Args:
        x_values: Horizontal coordinate of each point.
        y_values: Vertical coordinate of each point.
        labels: Sequence indexed by point position; labels[i] is the text
            shown in the annotation of point i.

    Returns:
        None. The figure is displayed with plt.show().
    """
    # Create the scatter plot
    fig, ax = plt.subplots()
    scatter = ax.scatter(x_values,
                         y_values,
                         alpha=0.5,
                         edgecolors='k',
                         s=40)

    # Create an mplcursors cursor to manage the data point interaction
    cursor = mplcursors.cursor(scatter, hover=True)

    # Title and axis labels
    ax.set_title('Visualización del embedding en 2D')
    ax.set_xlabel('X_1')
    ax.set_ylabel('X_2')

    # Define how each annotation should look
    @cursor.connect("add")
    def on_add(sel):
        # `sel.index` is the attribute documented by mplcursors for the selected
        # point's position; `sel.target.index` is a legacy spelling.
        sel.annotation.set_text(labels[sel.index])
        # White, semi-transparent background behind the annotation text
        sel.annotation.get_bbox_patch().set(facecolor='white', alpha=0.5)
        sel.annotation.set_fontsize(12)

    plt.show()


# First PCA component on the x axis, second on the y axis; hover text is the sentence.
plot_2D(
    x_values=new_values[:, 0],
    y_values=new_values[:, 1],
    labels=input_text_lst_news,
)

<img src="images/Embedding_visualization_2D.png">

#### Embeddings and Similarity
- Plot a heat map to compare the embeddings of sentences that are similar and sentences that are dissimilar.

In [None]:
in_1 = """La cebra come de la planta."""

in_2 = """En la planta surgió un llama"""

in_3 = """Las bacterias pueden aguantar periodos largos de falta de agua."""

# Built from two adjacent literals instead of a multi-line triple-quoted string:
# the latter embedded a newline plus the source indentation into the sentence,
# which then went to the embedding model and into the heatmap label.
in_4 = ("Algunos microorganismos son resistentes "
        "a la escasez de agua.")

input_text_lst_sim = [in_1, in_2, in_3, in_4]

In [None]:
# One request per sentence; keep the `.values` of the first result of each call.
embeddings = [
    embedding_model.get_embeddings([sentence])[0].values
    for sentence in input_text_lst_sim
]

# Stack the vectors into a 2D array, one row per sentence.
embeddings_array = np.array(embeddings)

In [None]:
def plot_heatmap(data, x_labels=None, y_labels=None, title=None,
                 figsize=(50, 3), show_colorbar=False):
    """Plot a 2D array as a heat map, one cell per value.

    Args:
        data: 2D array-like; rows are drawn along the y axis and columns
            along the x axis.
        x_labels: Optional sequence with one label per column.
        y_labels: Optional sequence with one label per row.
        title: Optional title for the plot.
        figsize: Figure size passed to plt.subplots. The default matches the
            previously hard-coded value.
        show_colorbar: When True, draw a color bar next to the heat map.
            Defaults to False because the original code created the color
            bar and removed it immediately, so none was ever shown.

    Returns:
        None. The figure is displayed with plt.show().
    """
    # Accept nested lists as well as arrays; `.shape` is needed below.
    data = np.asarray(data)

    fig, ax = plt.subplots(figsize=figsize)
    heatmap = ax.pcolor(data, cmap='coolwarm', edgecolors='k', linewidths=0.1)

    if show_colorbar:
        fig.colorbar(heatmap, ax=ax)

    # Explicit None/length checks: a bare truthiness test raises ValueError
    # when the labels are passed as a NumPy array.
    if x_labels is not None and len(x_labels) > 0:
        # +0.5 centers each tick on its cell
        ax.set_xticks(np.arange(data.shape[1]) + 0.5, minor=False)
        ax.set_xticklabels(x_labels, rotation=45, ha="right")
    if y_labels is not None and len(y_labels) > 0:
        ax.set_yticks(np.arange(data.shape[0]) + 0.5, minor=False)
        ax.set_yticklabels(y_labels, va="center")

    if title:
        ax.set_title(title)

    plt.tight_layout()

    # Show the plot
    plt.show()


# Label each heat map row with the sentence it represents.
y_labels = input_text_lst_sim

# Plot the heatmap
plot_heatmap(
    embeddings_array,
    y_labels=y_labels,
    title="Embeddings Heatmap",
)

#### Compute cosine similarity
La función `cosine_similarity` espera arrays 2D como entrada, por eso envolvemos cada embedding en una lista adicional.

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
def compare(embeddings, idx1, idx2):
    """Return the cosine similarity between two embeddings chosen by index.

    Args:
        embeddings: Indexable collection of embedding vectors.
        idx1: Index of the first embedding.
        idx2: Index of the second embedding.

    Returns:
        Whatever cosine_similarity returns for the two single-row inputs,
        passed through unchanged.
    """
    # Wrap each vector in a list so it is passed as a one-row 2D input.
    first = [embeddings[idx1]]
    second = [embeddings[idx2]]
    return cosine_similarity(first, second)

In [None]:
# Show both sentences, one per line, followed by their cosine similarity.
print(in_1, in_2, sep="\n")
print(compare(embeddings, 0, 1))

In [None]:
# Show both sentences, one per line, followed by their cosine similarity.
print(in_3, in_4, sep="\n")
print(compare(embeddings, 2, 3))