# Semantic Text Similarity

In [1]:
# Requisites
from gensim.models import TfidfModel
from gensim.utils import simple_preprocess
from gensim.corpora import Dictionary
import numpy as np
from scipy import spatial
from typing import Tuple, List

In [2]:
# Small hand-written set of Spanish stopwords (personal pronouns plus two
# prepositions); `preprocess` drops these tokens from every sentence.
STOPWORDS_ES = {
    "yo",
    "tú",
    "él",
    "ella",
    "nosotros",
    "vosotros",
    "ellos",
    "de",
    "a",
}

In [3]:
# Define preprocessing
def preprocess(sentence: str) -> List[str]:
    """Tokenise a sentence and remove Spanish stopwords.

    The sentence is tokenised with gensim's ``simple_preprocess`` and every
    token that belongs to ``STOPWORDS_ES`` is discarded.

    Args:
        sentence: Raw input sentence.

    Returns:
        The remaining tokens, in their original order.
    """
    return [
        word
        for word in simple_preprocess(sentence)
        if word not in STOPWORDS_ES
    ]

# Load Vectors

In [6]:
# Path to the gzipped fastText binary that holds the pre-trained word vectors.
# NOTE(review): the file name says "cc.ca" (presumably the Catalan fastText model),
# while the stopwords, the mmap cache name ('cc.es.gensim.bin') and the STS dataset
# used in this notebook are Spanish — confirm whether 'cc.es.300.bin.gz' was intended.
WORD_EMBEDDING_FILE = './models/fasttext/cc.ca.300.bin.gz'

In [7]:
# Load with gensim
# `load_facebook_vectors` reads a model stored in Facebook's native fastText format.
# The result is bound to the global `wv_model`, which the vector-mapping helpers
# (`map_tf_idf`, `map_pairs`) look words up in.
from gensim.models import fasttext
wv_model = fasttext.load_facebook_vectors(WORD_EMBEDDING_FILE)

In [8]:
# Optional: replace the in-memory vectors with a copy loaded through gensim's own
# format using mmap='r'. Disabled by default.
USE_MMAP = False
if USE_MMAP:
    from gensim.models.fasttext import FastTextKeyedVectors
    MMAP_PATH = 'cc.es.gensim.bin'
    # The file at MMAP_PATH has to exist before it can be loaded; uncomment the
    # next line once to write it from the currently loaded `wv_model`.
    # wv_model.save(MMAP_PATH)
    wv_model = FastTextKeyedVectors.load(MMAP_PATH, mmap='r')

In [9]:
# Sample data: hand-written (sentence_1, sentence_2, similarity score) triples.
# NOTE(review): no later cell visible in this notebook reads `input_pairs_example`;
# the scores are presumably on the same 0-5 scale as the STS labels — confirm.
input_pairs_example = [
    ('Me gusta el futbol', 'Disfruto viendo partidos de futbol', 4),
    ('El cielo está despejado', 'Hace un día bonito', 4.5),
    ('Me encanta viajar', 'Explorar nuevos lugares es una pasión', 3.5),
    ('Prefiero el verano', 'No me gusta el frío del invierno', 2.5),
    ('Tengo hambre', '¿Qué hay para cenar?', 2),
    ('La música me relaja', 'Escuchar música es una terapia', 3),
    ('El libro es emocionante', 'No puedo dejar de leerlo', 4),
    ('Me gusta la pizza', 'Es mi comida favorita', 4.5),
    ('Estoy cansado', 'Necesito hacer una siesta', 1.5),
    ('Hoy hace mucho calor', 'Es un día sofocante', 3.5)
]

In [10]:
# Real data
!pip install datasets
from datasets import load_dataset
dataset = load_dataset("PlanTL-GOB-ES/sts-es")



Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
     ------------------------------------- 542.0/542.0 kB 11.3 MB/s eta 0:00:00
Collecting pyarrow>=12.0.0
  Downloading pyarrow-16.1.0-cp311-cp311-win_amd64.whl (25.9 MB)
     --------------------------------------- 25.9/25.9 MB 13.1 MB/s eta 0:00:00
Collecting pyarrow-hotfix
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.9,>=0.3.0
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
     -------------------------------------- 116.3/116.3 kB 7.1 MB/s eta 0:00:00
Collecting pandas
  Using cached pandas-2.2.2-cp311-cp311-win_amd64.whl (11.6 MB)
Collecting xxhash
  Downloading xxhash-3.4.1-cp311-cp311-win_amd64.whl (29 kB)
Collecting multiprocess
  Downloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
     ---------------------------------------- 143.5/143.5 kB ? eta 0:00:00
Collecting fsspec[http]<=2024.3.1,>=2023.1.0
  Using cached fsspec-2024.3.1-py3-none-any.whl (171 kB)
C

  from .autonotebook import tqdm as notebook_tqdm
Downloading data: 100%|██████████| 343k/343k [00:00<00:00, 1.11MB/s]
Downloading data: 100%|██████████| 25.3k/25.3k [00:00<00:00, 151kB/s]
Downloading data: 100%|██████████| 46.8k/46.8k [00:00<00:00, 271kB/s]
Generating train split: 100%|██████████| 1320/1320 [00:00<00:00, 218100.50 examples/s]
Generating validation split: 100%|██████████| 77/77 [00:00<00:00, 76840.69 examples/s]
Generating test split: 100%|██████████| 155/155 [00:00<00:00, 47589.28 examples/s]


In [11]:
# Turn each dataset split into a list of (sentence1, sentence2, label) tuples.
input_pairs, input_pairs_val, input_pairs_test = (
    [
        (example["sentence1"], example["sentence2"], example["label"])
        for example in dataset[split_name].to_list()
    ]
    for split_name in ("train", "validation", "test")
)

In [12]:
all_input_pairs = input_pairs + input_pairs_val + input_pairs_test
# Tokenise the sentences of all three splits and build the dictionary from them.
# NOTE(review): this tokenises with `simple_preprocess` directly, so stopwords are
# kept here, whereas `map_pairs` tokenises with `preprocess` (stopwords removed) —
# confirm the mismatch is intended. Also note that validation and test sentences
# take part in fitting the dictionary (and the TF-IDF model built from it).
sentences_1_preproc = [simple_preprocess(sentence_1) for sentence_1, _, _ in all_input_pairs]
sentences_2_preproc = [simple_preprocess(sentence_2) for _, sentence_2, _ in all_input_pairs]
sentence_pairs = list(zip(sentences_1_preproc, sentences_2_preproc))
# Flattened version (all first sentences followed by all second sentences) used to fit the models
sentences_pairs_flattened = sentences_1_preproc + sentences_2_preproc
diccionario = Dictionary(sentences_pairs_flattened)

In [13]:
# Inspect the first tokenised sentence pair.
print(sentence_pairs[0])

(['según', 'el', 'sondeo', 'de', 'los', 'católicos', 'cree', 'que', 'francisco', 'está', 'haciendo', 'un', 'buen', 'excelente', 'trabajo'], ['el', 'de', 'los', 'católicos', 'del', 'mundo', 'aprobaron', 'el', 'trabajo', 'del', 'papa', 'reveló', 'una', 'encuesta', 'divulgada', 'el', 'domingo'])


In [14]:
# Compute the TF-IDF weights from the tokenised sentences: convert every sentence
# to its bag-of-words form and fit the TF-IDF model on that corpus.
corpus = [diccionario.doc2bow(sent) for sent in sentences_pairs_flattened]
modelo_tfidf = TfidfModel(corpus)

In [15]:
def map_tf_idf(sentence_preproc: List[str], dictionary: Dictionary, tf_idf_model: TfidfModel) -> Tuple[List[np.ndarray], List[float]]:
    """Collect the word vectors of a tokenised sentence with their TF-IDF weights.

    The sentence is converted to a bag of words with ``dictionary`` and weighted
    by ``tf_idf_model``. For every weighted term that the module-level
    ``wv_model`` contains, its word vector and its TF-IDF weight are kept.

    Args:
        sentence_preproc: Tokens of one sentence.
        dictionary: Dictionary used to build the bag of words and to map term
            ids back to words.
        tf_idf_model: Fitted TF-IDF model applied to the bag of words.

    Returns:
        Two parallel lists ``(vectors, weights)``; both are empty when no term
        of the sentence is retained.
    """
    weighted_terms = tf_idf_model[dictionary.doc2bow(sentence_preproc)]
    vectors: List[np.ndarray] = []
    weights: List[float] = []
    for term_id, term_weight in weighted_terms:
        term = dictionary.get(term_id)
        if term not in wv_model:
            # No embedding available for this term: leave it out of the average
            continue
        vectors.append(wv_model[term])
        weights.append(term_weight)
    return vectors, weights

def map_pairs(
        sentence_pairs: List[Tuple[str, str, float]],
        dictionary: Dictionary = None,
        tf_idf_model: TfidfModel = None,
) -> List[Tuple[Tuple[np.ndarray, np.ndarray], float]]:
    """Map raw sentence pairs to pairs of sentence embeddings.

    Each sentence is tokenised with ``preprocess`` and reduced to a single
    vector: the TF-IDF-weighted average of its word vectors when
    ``tf_idf_model`` is given, otherwise their plain mean. Word vectors are
    read from the module-level ``wv_model``.

    A sentence that yields no word vectors (for example, one made only of
    stopwords) is mapped to a zero vector of length ``wv_model.vector_size``.
    Without this fallback ``np.average`` raises ``ZeroDivisionError`` and
    ``np.mean`` returns a scalar NaN, which breaks the fixed embedding shape
    the later stacking step relies on.

    Args:
        sentence_pairs: ``(sentence_1, sentence_2, similarity)`` triples.
        dictionary: Dictionary matching ``tf_idf_model``; required when
            ``tf_idf_model`` is given.
        tf_idf_model: Optional fitted TF-IDF model. When ``None`` the word
            vectors are averaged without weights.

    Returns:
        One ``((vector_1, vector_2), similarity)`` entry per input triple, in
        the same order.

    Raises:
        ValueError: If ``tf_idf_model`` is given without ``dictionary``.
    """
    if tf_idf_model is not None and dictionary is None:
        raise ValueError("`dictionary` is required when `tf_idf_model` is given")

    def _embed(tokens: List[str]) -> np.ndarray:
        # Reduce the tokens of one sentence to a single embedding vector.
        if tf_idf_model is not None:
            # TF-IDF-weighted average of the word embeddings
            vectors, weights = map_tf_idf(tokens, dictionary=dictionary, tf_idf_model=tf_idf_model)
            if vectors:
                return np.average(vectors, weights=weights, axis=0)
        else:
            # Plain average of the word embeddings
            vectors = [wv_model[word] for word in tokens if word in wv_model]
            if vectors:
                return np.mean(vectors, axis=0)
        # Nothing to average: fall back to a zero vector of the right size
        return np.zeros(wv_model.vector_size, dtype=np.float32)

    pares_vectores = []
    for sentence_1, sentence_2, similitud in sentence_pairs:
        vector1 = _embed(preprocess(sentence_1))
        vector2 = _embed(preprocess(sentence_2))
        pares_vectores.append(((vector1, vector2), similitud))
    return pares_vectores

In [16]:
# Map every split to ((vector_1, vector_2), similarity) entries using
# TF-IDF-weighted sentence embeddings.
mapped_train = map_pairs(input_pairs, tf_idf_model=modelo_tfidf, dictionary=diccionario, )
mapped_val = map_pairs(input_pairs_val, tf_idf_model=modelo_tfidf, dictionary=diccionario, )
mapped_test = map_pairs(input_pairs_test, tf_idf_model=modelo_tfidf, dictionary=diccionario, )
# `mapped` is the mapping of the training pairs; reuse the result instead of
# running the same `map_pairs` call a second time.
mapped = mapped_train

In [17]:
# Show the embedding shapes and the similarity score of the first five mapped pairs.
for vectors, similitud in mapped[:5]:
    print(f"Pares de vectores: {vectors[0].shape}, {vectors[1].shape}")
    print(f"Puntuación de similitud: {similitud}")

Pares de vectores: (300,), (300,)
Puntuación de similitud: 3.75
Pares de vectores: (300,), (300,)
Puntuación de similitud: 2.799999952316284
Pares de vectores: (300,), (300,)
Puntuación de similitud: 2.4000000953674316
Pares de vectores: (300,), (300,)
Puntuación de similitud: 2.200000047683716
Pares de vectores: (300,), (300,)
Puntuación de similitud: 2.200000047683716


In [18]:
# Define the Model
import tensorflow as tf

def build_and_compile_model(embedding_size: int = 300, learning_rate: float = 1e-3) -> tf.keras.Model:
    """Build and compile a siamese model that scores a pair of sentence vectors.

    Both inputs pass through the same Dense projection (shared weights, kernel
    initialised to the identity and bias to zero). The output is the cosine
    similarity of the two projections rescaled as ``2.5 * (1 + cos)``, i.e.
    mapped from [-1, 1] to [0, 5].

    Args:
        embedding_size: Length of each input vector and of the projection.
        learning_rate: Learning rate handed to the Adamax optimizer.

    Returns:
        The model, compiled with mean squared error loss.
    """
    layers = tf.keras.layers

    # One input per sentence vector of the pair
    left_input = tf.keras.Input(shape=(embedding_size,))
    right_input = tf.keras.Input(shape=(embedding_size,))

    # Shared projection applied to both inputs
    shared_projection = layers.Dense(
        embedding_size,
        kernel_initializer=tf.keras.initializers.Identity(),
        bias_initializer=tf.keras.initializers.Zeros(),
    )
    left_projected = shared_projection(left_input)
    right_projected = shared_projection(right_input)

    # Despite its name, this returns a rescaled cosine *similarity*
    def cosine_distance(x):
        left, right = x
        left_unit = tf.keras.backend.l2_normalize(left, axis=1)
        right_unit = tf.keras.backend.l2_normalize(right, axis=1)
        cosine = tf.reduce_sum(left_unit * right_unit, axis=1)
        return 2.5 * (1.0 + cosine)

    score = layers.Lambda(cosine_distance)([left_projected, right_projected])
    model = tf.keras.Model(inputs=[left_input, right_input], outputs=score)

    model.compile(
        loss='mean_squared_error',
        optimizer=tf.keras.optimizers.Adamax(learning_rate),
    )
    return model

In [19]:
def build_and_compile_model_better(embedding_size: int = 300, learning_rate: float = 1e-3) -> tf.keras.Model:
    """Build and compile a siamese model with a small learned scoring head.

    Both inputs pass through the same Dense projection (shared weights, kernel
    initialised to the identity and bias to zero). The element-wise product of
    the L2-normalised projections is then fed through
    Dropout(0.1) -> Dense(16, relu) -> Dropout(0.2) -> Dense(1, sigmoid), and
    the sigmoid output is multiplied by 5.

    Args:
        embedding_size: Length of each input vector and of the projection.
        learning_rate: Learning rate handed to the Adam optimizer.

    Returns:
        The model, compiled with mean squared error loss.
    """
    layers = tf.keras.layers

    # One input per sentence vector of the pair
    left_input = tf.keras.Input(shape=(embedding_size,))
    right_input = tf.keras.Input(shape=(embedding_size,))

    # Shared projection applied to both inputs
    shared_projection = layers.Dense(
        embedding_size,
        kernel_initializer=tf.keras.initializers.Identity(),
        bias_initializer=tf.keras.initializers.Zeros(),
    )
    left_projected = shared_projection(left_input)
    right_projected = shared_projection(right_input)

    # Element-wise product of the unit vectors; summing it would give the
    # cosine similarity, but here the per-dimension terms are kept as features
    def normalized_product(x):
        left, right = x
        left_unit = tf.keras.backend.l2_normalize(left, axis=1)
        right_unit = tf.keras.backend.l2_normalize(right, axis=1)
        return left_unit * right_unit

    features = layers.Lambda(normalized_product)([left_projected, right_projected])
    features = layers.Dropout(0.1)(features)
    hidden = layers.Dense(16, activation="relu")(features)
    hidden = layers.Dropout(0.2)(hidden)
    unit_score = layers.Dense(1, activation="sigmoid")(hidden)

    # Scale the sigmoid output by 5
    score = layers.Lambda(lambda x: x * 5)(unit_score)

    model = tf.keras.Model(inputs=[left_input, right_input], outputs=score)

    model.compile(
        loss='mean_squared_error',
        optimizer=tf.keras.optimizers.Adam(learning_rate),
    )
    return model

In [20]:
# Define training constants
batch_size: int = 64  # batch size used for the training and validation tf.data pipelines
num_epochs: int = 64  # number of epochs passed to model.fit

In [21]:
def pair_list_to_x_y(pair_list: List[Tuple[Tuple[np.ndarray, np.ndarray], int]]) -> Tuple[Tuple[np.ndarray, np.ndarray], np.ndarray]:
    """Split mapped pairs into model inputs and targets.

    Args:
        pair_list: ``((vector_1, vector_2), label)`` entries.

    Returns:
        ``((x_1, x_2), y)`` where ``x_1`` and ``x_2`` stack the first and the
        second vectors of every entry, and ``y`` holds the labels cast to
        ``float32``.
    """
    vector_pairs, labels = zip(*pair_list)
    left_vectors, right_vectors = zip(*vector_pairs)
    features = (np.array(left_vectors), np.array(right_vectors))
    targets = np.array(labels, dtype=np.float32)
    return features, targets

# Build the input/target arrays for the training and validation splits
x_train, y_train = pair_list_to_x_y(mapped_train)
x_val, y_val = pair_list_to_x_y(mapped_val)

In [22]:
# Prepare the training and validation datasets
train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
# `x_train` is a 2-tuple of arrays, so len(x_train) is 2, not the number of
# examples; size the shuffle buffer from `y_train` so the whole set is shuffled.
train_dataset = train_dataset.shuffle(buffer_size=len(y_train)).batch(batch_size)

val_dataset = tf.data.Dataset.from_tensor_slices((x_val, y_val))
val_dataset = val_dataset.batch(batch_size)

In [23]:
# Show shapes of the two input arrays and of the target array
x_train[0].shape, x_train[1].shape, y_train.shape

((1320, 300), (1320, 300), (1320,))

In [24]:
# Build and compile the model
model = build_and_compile_model()
#tf.keras.utils.plot_model(model, show_shapes=True, show_layer_activations=True, )
# `summary()` prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
model.summary()




None


In [25]:
# Train the model; keep the returned History so the loss curves stay available
# and its repr is not dumped as the cell output.
history = model.fit(train_dataset, epochs=num_epochs, validation_data=val_dataset)

Epoch 1/64
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - loss: 5.5868 - val_loss: 3.1592
Epoch 2/64
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 2.9214 - val_loss: 2.5053
Epoch 3/64
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 2.4842 - val_loss: 2.4196
Epoch 4/64
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 2.3547 - val_loss: 2.3638
Epoch 5/64
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 2.2385 - val_loss: 2.3218
Epoch 6/64
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 2.1317 - val_loss: 2.2877
Epoch 7/64
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 2.0391 - val_loss: 2.2589
Epoch 8/64
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 1.9492 - val_loss: 2.2361
Epoch 9/64
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[

<keras.src.callbacks.history.History at 0x28bef3d4fd0>

In [26]:
from scipy.stats import pearsonr
# Build the input/target arrays for the test split
x_test, y_test = pair_list_to_x_y(mapped_test)

In [27]:
# Baseline
def compute_pearson_baseline(x_, y_):
    """Pearson correlation of the untrained cosine-similarity baseline.

    Args:
        x_: Pair ``(x_1, x_2)`` of aligned collections of vectors.
        y_: Array of reference similarity scores.

    Returns:
        The Pearson correlation coefficient between the cosine similarity of
        each vector pair and the flattened reference scores.
    """
    # scipy returns the cosine *distance*; turn it back into a similarity
    similarities = [
        1.0 - spatial.distance.cosine(left, right)
        for left, right in zip(*x_)
    ]
    coefficient, _p_value = pearsonr(similarities, y_.flatten())
    return coefficient
# Print the baseline Pearson correlation coefficient for each split
print(f"Correlación de Pearson (baseline-train): {compute_pearson_baseline(x_train, y_train)}")
print(f"Correlación de Pearson (baseline-validation): {compute_pearson_baseline(x_val, y_val)}")
print(f"Correlación de Pearson (baseline-test): {compute_pearson_baseline(x_test, y_test)}")

Correlación de Pearson (baseline-train): 0.47647824669929484
Correlación de Pearson (baseline-validation): 0.6433742067923268
Correlación de Pearson (baseline-test): 0.5237363494968837


In [28]:
def compute_pearson(x_, y_):
    """Pearson correlation between the model's predictions and reference scores.

    Predictions come from the module-level ``model`` and are computed for
    whichever split is passed in.

    Args:
        x_: Model inputs for one split.
        y_: Array of reference similarity scores for the same split.

    Returns:
        The Pearson correlation coefficient between the flattened predictions
        and the flattened reference scores.
    """
    predictions = model.predict(x_)
    coefficient, _p_value = pearsonr(predictions.flatten(), y_.flatten())
    return coefficient
# Print the trained model's Pearson correlation coefficient for each split
print(f"Correlación de Pearson (train): {compute_pearson(x_train, y_train)}")
print(f"Correlación de Pearson (validation): {compute_pearson(x_val, y_val)}")
print(f"Correlación de Pearson (test): {compute_pearson(x_test, y_test)}")

[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Correlación de Pearson (train): 0.919097165208314
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 999us/step
Correlación de Pearson (validation): 0.6045465458759391
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 878us/step
Correlación de Pearson (test): 0.49699049811932405
