## Libraries and initialization

In [13]:
# Requisites
from gensim.models import TfidfModel
from gensim.utils import simple_preprocess
from gensim.corpora import Dictionary
import numpy as np
from scipy import spatial
from typing import Tuple, List
from datasets import load_dataset
import tensorflow as tf
from gensim.models import fasttext

In [11]:
dataset = load_dataset("projecte-aina/sts-ca", trust_remote_code=True)

Downloading data: 100%|██████████| 97.0k/97.0k [00:00<00:00, 670kB/s]
Generating train split: 2073 examples [00:00, 12696.87 examples/s]
Generating validation split: 500 examples [00:00, 34212.40 examples/s]
Generating test split: 500 examples [00:00, 22037.94 examples/s]


In [9]:
WORD_EMBEDDING_FILE = "models/fasttext_100.bin"

In [None]:

wv_model = fasttext.load_facebook_vectors(WORD_EMBEDDING_FILE)

## Preprocess text

In [110]:
# Define preprocessing
def preprocess(sentence: str) -> List[str]:
    preprocessed = simple_preprocess(sentence)
    return preprocessed

In [111]:
input_pairs = [(e["sentence1"], e["sentence2"], e["label"], ) for e in dataset["train"].to_list()]
input_pairs_val = [(e["sentence1"], e["sentence2"], e["label"], ) for e in dataset["validation"].to_list()]
input_pairs_test = [(e["sentence1"], e["sentence2"], e["label"], ) for e in dataset["test"].to_list()]

In [112]:
all_input_pairs = input_pairs + input_pairs_val + input_pairs_test
# Preprocesamiento de las oraciones y creación del diccionario
sentences_1_preproc = [simple_preprocess(sentence_1) for sentence_1, _, _ in all_input_pairs]
sentences_2_preproc = [simple_preprocess(sentence_2) for _, sentence_2, _ in all_input_pairs]
sentence_pairs = list(zip(sentences_1_preproc, sentences_2_preproc))
# Versión aplanada para poder entrenar el modelo
sentences_pairs_flattened = sentences_1_preproc + sentences_2_preproc
diccionari = Dictionary(sentences_pairs_flattened)

In [113]:
# Cálculo de los pesos TF-IDF para las oraciones pre-procesadas
corpus = [diccionari.doc2bow(sent) for sent in sentences_pairs_flattened]
modelo_tfidf = TfidfModel(corpus)

In [114]:
corpus = [diccionari.doc2bow(sent) for sent in sentences_pairs_flattened]
modelo_tfidf = TfidfModel(corpus)

In [115]:
def map_tf_idf(sentence_preproc: List[str], dictionary: Dictionary, tf_idf_model: TfidfModel) -> Tuple[List[np.ndarray], List[float]]:
    bow = dictionary.doc2bow(sentence_preproc)
    tf_idf = tf_idf_model[bow]
    vectors, weights = [], []
    for word_index, weight in tf_idf:
        word = dictionary.get(word_index)
        if word in wv_model:
            vectors.append(wv_model[word])
            weights.append(weight)
    return vectors, weights

def map_pairs(
        sentence_pairs: List[Tuple[str, str, float]],
        dictionary: Dictionary = None,
        tf_idf_model: TfidfModel = None,
) -> List[Tuple[Tuple[np.ndarray, np.ndarray], float]]:
    # Mapeo de los pares de oraciones a pares de vectores
    pares_vectores = []
    for i, (sentence_1, sentence_2, similitud) in enumerate(sentence_pairs):
        sentence_1_preproc = preprocess(sentence_1)
        sentence_2_preproc = preprocess(sentence_2)
        # Si usamos TF-IDF
        if tf_idf_model is not None:
            # Cálculo del promedio ponderado por TF-IDF de los word embeddings
            vectors1, weights1 = map_tf_idf(sentence_1_preproc, dictionary=dictionary, tf_idf_model=tf_idf_model, )
            vectors2, weights2 = map_tf_idf(sentence_2_preproc, dictionary=dictionary, tf_idf_model=tf_idf_model, )
            vector1 = np.average(vectors1, weights=weights1, axis=0, )
            vector2 = np.average(vectors2, weights=weights2, axis=0, )
        else:
            # Cálculo del promedio de los word embeddings
            vectors1 = [wv_model[word] for word in sentence_1_preproc if word in wv_model]
            vectors2 = [wv_model[word] for word in sentence_2_preproc if word in wv_model]
            vector1 = np.mean(vectors1, axis=0)
            vector2 = np.mean(vectors2, axis=0)
        # Añadir a la lista
        pares_vectores.append(((vector1, vector2), similitud))
    return pares_vectores

In [129]:
# Imprimir los pares de vectores y la puntuación de similitud asociada
mapped = map_pairs(input_pairs, tf_idf_model=modelo_tfidf, dictionary=diccionari, )
# Imprimir los pares de vectores y la puntuación de similitud asociada
mapped_train = map_pairs(input_pairs,  tf_idf_model=modelo_tfidf, dictionary=diccionari, )
mapped_val = map_pairs(input_pairs_val, tf_idf_model=modelo_tfidf, dictionary=diccionari, )
mapped_test = map_pairs(input_pairs_test, tf_idf_model=modelo_tfidf, dictionary=diccionari, )

In [117]:
# Define training constants
batch_size: int = 64
num_epochs: int = 64

In [118]:
def pair_list_to_x_y(pair_list: List[Tuple[Tuple[np.ndarray, np.ndarray], int]]) -> Tuple[Tuple[np.ndarray, np.ndarray], np.ndarray]:
    _x, _y = zip(*pair_list)
    _x_1, _x_2 = zip(*_x)
    return (np.array(_x_1), np.array(_x_2)), np.array(_y, dtype=np.float32, )

# Obtener las listas de train y test
x_train, y_train = pair_list_to_x_y(mapped_train)
x_val, y_val = pair_list_to_x_y(mapped_val)

In [130]:
x_test, y_test = pair_list_to_x_y(mapped_test)

In [119]:
# Preparar los conjuntos de datos de entrenamiento y validación
train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
train_dataset = train_dataset.shuffle(buffer_size=len(x_train)).batch(batch_size)

val_dataset = tf.data.Dataset.from_tensor_slices((x_val, y_val))
val_dataset = val_dataset.batch(batch_size)

In [131]:
test_dataset= tf.data.Dataset.from_tensor_slices((x_test, y_test))
test_dataset = test_dataset.batch(batch_size)

## Baseline - cosinus distance

In [132]:
from scipy.stats import pearsonr
x_test, y_test = pair_list_to_x_y(mapped_test)
# Baseline
def compute_pearson_baseline(x_, y_):
    y_pred_baseline = []
    for v1, v2 in zip(*x_):
        d = 1.0 - spatial.distance.cosine(v1, v2)
        y_pred_baseline.append(d)
    # Calcular la correlación de Pearson entre las predicciones y los datos de prueba
    correlation, _ = pearsonr(y_pred_baseline, y_.flatten())
    return correlation
# Imprimir el coeficiente de correlación de Pearson
print(f"Correlación de Pearson (baseline-train): {compute_pearson_baseline(x_train, y_train)}")
print(f"Correlación de Pearson (baseline-validation): {compute_pearson_baseline(x_val, y_val)}")
print(f"Correlación de Pearson (baseline-test): {compute_pearson_baseline(x_test, y_test)}")

Correlación de Pearson (baseline-train): 0.48386491416305377
Correlación de Pearson (baseline-validation): 0.4834044400848323
Correlación de Pearson (baseline-test): 0.5239545933780229


## Model profe 1

In [28]:
# Define the Model
import tensorflow as tf

def build_and_compile_model(embedding_size: int = 100, learning_rate: float = 1e-3) -> tf.keras.Model:
    # Capa de entrada para los pares de vectores
    input_1 = tf.keras.Input(shape=(embedding_size,))
    input_2 = tf.keras.Input(shape=(embedding_size,))

    # Hidden layer
    first_projection = tf.keras.layers.Dense(
        embedding_size,
        kernel_initializer=tf.keras.initializers.Identity(),
        bias_initializer=tf.keras.initializers.Zeros(),
    )
    projected_1 = first_projection(input_1)
    projected_2 = first_projection(input_2)
    
    # Compute the cosine distance using a Lambda layer
    def cosine_distance(x):
        x1, x2 = x
        x1_normalized = tf.keras.backend.l2_normalize(x1, axis=1)
        x2_normalized = tf.keras.backend.l2_normalize(x2, axis=1)
        return 2.5 * (1.0 + tf.reduce_sum(x1_normalized * x2_normalized, axis=1))

    output = tf.keras.layers.Lambda(cosine_distance)([projected_1, projected_2])
    # Define output
    model = tf.keras.Model(inputs=[input_1, input_2], outputs=output)

    # Compile the model
    model.compile(loss='mean_squared_error',
                  optimizer=tf.keras.optimizers.Adamax(learning_rate))
    return model

In [32]:
# Construir y compilar el modelo
model = build_and_compile_model()
#tf.keras.utils.plot_model(model, show_shapes=True, show_layer_activations=True, )
print(model.summary())




None


In [33]:
# Train the model
model.fit(train_dataset, epochs=num_epochs, validation_data=val_dataset)

Epoch 1/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - loss: 5.0399 - val_loss: 3.9373
Epoch 2/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 4.0053 - val_loss: 3.7528
Epoch 3/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 3.8080 - val_loss: 3.6193
Epoch 4/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 3.6569 - val_loss: 3.5082
Epoch 5/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 3.5324 - val_loss: 3.4154
Epoch 6/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 3.4244 - val_loss: 3.3368
Epoch 7/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 3.3349 - val_loss: 3.2694
Epoch 8/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 3.2596 - val_loss: 3.2111
Epoch 9/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

<keras.src.callbacks.history.History at 0x1c494c20390>

In [36]:
def compute_pearson(x_, y_):
    # Obtener las predicciones del modelo para los datos de prueba. En este ejemplo vamos a utilizar el corpus de training.
    y_pred = model.predict(x_)
    # Calcular la correlación de Pearson entre las predicciones y los datos de prueba
    correlation, _ = pearsonr(y_pred.flatten(), y_.flatten())
    return correlation
# Imprimir el coeficiente de correlación de Pearson
print(f"Correlación de Pearson (train): {compute_pearson(x_train, y_train)}")
print(f"Correlación de Pearson (validation): {compute_pearson(x_val, y_val)}")
print(f"Correlación de Pearson (test): {compute_pearson(x_test, y_test)}")

[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
Correlación de Pearson (train): 0.6363557041608164
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
Correlación de Pearson (validation): 0.524443370063694
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
Correlación de Pearson (test): 0.5073036010251575


## Model baseline presentació

In [126]:
import tensorflow as tf
def build_and_compile_model(hidden_size: int = 64) -> tf.keras.Model:
  model = tf.keras.Sequential([
      tf.keras.layers.Concatenate(axis=-1, ),
      tf.keras.layers.Dense(hidden_size, activation='relu'),
      tf.keras.layers.Dense(1)
  ])
  model.compile(loss='mean_absolute_error',
                optimizer=tf.keras.optimizers.Adam(0.001))
  return model
m = build_and_compile_model()

In [127]:
# Train the model baseline
m.fit(train_dataset, epochs=num_epochs, validation_data=val_dataset)

Epoch 1/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - loss: 1.4266 - val_loss: 0.6887
Epoch 2/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.6839 - val_loss: 0.6492
Epoch 3/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.6576 - val_loss: 0.6483
Epoch 4/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.6453 - val_loss: 0.6479
Epoch 5/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.6374 - val_loss: 0.6465
Epoch 6/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.6308 - val_loss: 0.6441
Epoch 7/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.6248 - val_loss: 0.6422
Epoch 8/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.6197 - val_loss: 0.6407
Epoch 9/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[

<keras.src.callbacks.history.History at 0x1c4ad9bc790>

In [128]:
def compute_pearson(x_, y_):
    # Obtener las predicciones del modelo para los datos de prueba. En este ejemplo vamos a utilizar el corpus de training.
    y_pred = m.predict(x_)
    # Calcular la correlación de Pearson entre las predicciones y los datos de prueba
    correlation, _ = pearsonr(y_pred.flatten(), y_.flatten())
    return correlation
# Imprimir el coeficiente de correlación de Pearson
print(f"Correlación de Pearson (train): {compute_pearson(x_train, y_train)}")
print(f"Correlación de Pearson (validation): {compute_pearson(x_val, y_val)}")
print(f"Correlación de Pearson (test): {compute_pearson(x_test, y_test)}")

[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
Correlación de Pearson (train): 0.7870097195317041
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
Correlación de Pearson (validation): 0.13711057059664167


ValueError: Exception encountered when calling Sequential.call().

[1mInput 0 of layer "dense_33" is incompatible with the layer: expected axis -1 of input shape to have value 200, but received input with shape (32, 192)[0m

Arguments received by Sequential.call():
  • inputs=('tf.Tensor(shape=(32, 96), dtype=int32)', 'tf.Tensor(shape=(32, 96), dtype=int32)')
  • training=False
  • mask=('None', 'None')

## Model profe better

In [40]:
def build_and_compile_model_better(embedding_size: int = 300, learning_rate: float = 1e-3) -> tf.keras.Model:
    # Capa de entrada para los pares de vectores
    input_1 = tf.keras.Input(shape=(embedding_size,))
    input_2 = tf.keras.Input(shape=(embedding_size,))

    # Hidden layer
    first_projection = tf.keras.layers.Dense(
        embedding_size,
        kernel_initializer=tf.keras.initializers.Identity(),
        bias_initializer=tf.keras.initializers.Zeros(),
    )
    projected_1 =  first_projection(input_1)
    projected_2 = first_projection(input_2)
    
    # Compute the cosine distance using a Lambda layer
    def normalized_product(x):
        x1, x2 = x
        x1_normalized = tf.keras.backend.l2_normalize(x1, axis=1)
        x2_normalized = tf.keras.backend.l2_normalize(x2, axis=1)
        return x1_normalized * x2_normalized

    output = tf.keras.layers.Lambda(normalized_product)([projected_1, projected_2])
    output = tf.keras.layers.Dropout(0.1)(output)
    output = tf.keras.layers.Dense(
        16,
        activation="relu",
    )(output)
    output = tf.keras.layers.Dropout(0.2)(output)
    output = tf.keras.layers.Dense(
        1,
        activation="sigmoid",
    )(output)
    
    output = tf.keras.layers.Lambda(lambda x: x * 5)(output)
    
    # Define output
    model = tf.keras.Model(inputs=[input_1, input_2], outputs=output)

    # Compile the model
    model.compile(loss='mean_squared_error',
                  optimizer=tf.keras.optimizers.Adam(learning_rate))
    return model

In [41]:
model_better = build_and_compile_model()

In [42]:
# Train the model baseline
model_better.fit(train_dataset, epochs=num_epochs, validation_data=val_dataset)

Epoch 1/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - loss: 1.1022 - val_loss: 0.6819
Epoch 2/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.6824 - val_loss: 0.6530
Epoch 3/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.6595 - val_loss: 0.6471
Epoch 4/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.6477 - val_loss: 0.6419
Epoch 5/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.6394 - val_loss: 0.6384
Epoch 6/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.6309 - val_loss: 0.6358
Epoch 7/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.6228 - val_loss: 0.6353
Epoch 8/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.6151 - val_loss: 0.6327
Epoch 9/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[

<keras.src.callbacks.history.History at 0x1c49573db10>

In [45]:
def compute_pearson(x_, y_):
    # Obtener las predicciones del modelo para los datos de prueba. En este ejemplo vamos a utilizar el corpus de training.
    y_pred = model_better.predict(x_)
    # Calcular la correlación de Pearson entre las predicciones y los datos de prueba
    correlation, _ = pearsonr(y_pred.flatten(), y_.flatten())
    return correlation
# Imprimir el coeficiente de correlación de Pearson
print(f"Correlación de Pearson (train): {compute_pearson(x_train, y_train)}")
print(f"Correlación de Pearson (validation): {compute_pearson(x_val, y_val)}")
print(f"Correlación de Pearson (test): {compute_pearson(x_test, y_test)}")

[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Correlación de Pearson (train): 0.7836179282078864
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
Correlación de Pearson (validation): 0.15264317042013342
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
Correlación de Pearson (test): 0.2990447546531585


## Model propi

In [138]:
from tensorflow.keras.regularizers import l2

In [151]:
import tensorflow as tf
def build_and_compile_model(hidden_size: int = 200) -> tf.keras.Model:
  model = tf.keras.Sequential([
      tf.keras.layers.Concatenate(axis=-1, ),
      tf.keras.layers.Dense(128, activation='relu', kernel_regularizer=l2(0.01)),
      tf.keras.layers.Dense(hidden_size, activation='relu', kernel_regularizer=l2(0.01)),
      tf.keras.layers.Dense(64, activation='relu', kernel_regularizer=l2(0.01)),
      tf.keras.layers.Dense(1)
  ])
  model.compile(loss='mean_absolute_error',
                optimizer=tf.keras.optimizers.Adam(0.001))
  return model
model_propi = build_and_compile_model()

In [152]:
# Train the model baseline
model_propi.fit(train_dataset, epochs=num_epochs, validation_data=val_dataset)

Epoch 1/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - loss: 2.3523 - val_loss: 1.4296
Epoch 2/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 1.3628 - val_loss: 1.1436
Epoch 3/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 1.0974 - val_loss: 0.9779
Epoch 4/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.9475 - val_loss: 0.8892
Epoch 5/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.8578 - val_loss: 0.8403
Epoch 6/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.8110 - val_loss: 0.8059
Epoch 7/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.7610 - val_loss: 0.7756
Epoch 8/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.7241 - val_loss: 0.7598
Epoch 9/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[

<keras.src.callbacks.history.History at 0x1c4b4ab9a90>

In [153]:
def compute_pearson(x_, y_):
    # Obtener las predicciones del modelo para los datos de prueba. En este ejemplo vamos a utilizar el corpus de training.
    y_pred = model_propi.predict(x_)
    # Calcular la correlación de Pearson entre las predicciones y los datos de prueba
    correlation, _ = pearsonr(y_pred.flatten(), y_.flatten())
    return correlation
# Imprimir el coeficiente de correlación de Pearson
print(f"Correlación de Pearson (train): {compute_pearson(x_train, y_train)}")
print(f"Correlación de Pearson (validation): {compute_pearson(x_val, y_val)}")
print(f"Correlación de Pearson (test): {compute_pearson(x_test, y_test)}")

[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
Correlación de Pearson (train): 0.8818743726391146
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
Correlación de Pearson (validation): 0.19074062771705408
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
Correlación de Pearson (test): 0.30669896991460527


Els models del profe són els del fitxer single vector model

# Del fitxer sequence model 

In [49]:
from typing import Optional

In [47]:
REMAP_EMBEDDINGS: bool = True
USE_PRETRAINED: bool = True
MAX_LEN: int = 96

In [50]:
def map_word_embeddings(
        sentence: str,
        sequence_len: int = MAX_LEN,
        fixed_dictionary: Optional[Dictionary] = None
) -> np.ndarray:
    """
    Map to word-embedding indices
    :param sentence:
    :param sequence_len:
    :param fixed_dictionary:
    :return:
    """
    sentence_preproc = simple_preprocess(sentence)[:sequence_len]
    _vectors = np.zeros(sequence_len, dtype=np.int32)
    index = 0
    for word in sentence_preproc:
        if fixed_dictionary is not None:
            if word in fixed_dictionary.token2id:
                # Sumo 1 porque el valor 0 está reservado a padding
                _vectors[index] = fixed_dictionary.token2id[word] + 1
                index += 1
        else:
            if word in wv_model.key_to_index:
                _vectors[index] = wv_model.key_to_index[word] + 1
                index += 1
    return _vectors


def map_pairs(
    sentence_pairs: List[Tuple[str, str, float]],
    sequence_len: int = MAX_LEN,
    fixed_dictionary: Optional[Dictionary] = None
) -> List[Tuple[Tuple[np.ndarray, np.ndarray], float]]:
    # Mapeo de los pares de oraciones a pares de vectores
    pares_vectores = []
    for i, (sentence_1, sentence_2, similitud) in enumerate(sentence_pairs):
        vector1 = map_word_embeddings(sentence_1, sequence_len, fixed_dictionary)
        vector2 = map_word_embeddings(sentence_2, sequence_len, fixed_dictionary)
        # Añadir a la lista
        pares_vectores.append(((vector1, vector2), similitud))
    return pares_vectores

In [52]:
mapped_train = map_pairs(input_pairs, fixed_dictionary=diccionari if REMAP_EMBEDDINGS else None)
mapped_val = map_pairs(input_pairs_val, fixed_dictionary=diccionari if REMAP_EMBEDDINGS else None)
mapped_test = map_pairs(input_pairs_test, fixed_dictionary=diccionari if REMAP_EMBEDDINGS else None)

In [53]:
# MIRAR BÉ PQ FALLA PQ AIXÒ NO SERIA LO MILLOR
import tensorflow as tf
from tensorflow.keras import Layer
class MyLayer(Layer):
    def call(self, x):
        return tf.not_equal(x,0)

In [54]:
# Define model 1

def model_1(
    input_length: int = MAX_LEN,
    dictionary_size: int = 1000,
    embedding_size: int = 16,
    pretrained_weights: Optional[np.ndarray] = None,
    learning_rate: float = 1e-3,
    trainable: bool = False,
    use_cosine: bool = False,
) -> tf.keras.Model:
    # Input layers
    input_1 = tf.keras.Input(shape=(input_length,), dtype=tf.int32)
    input_2 = tf.keras.Input(shape=(input_length,), dtype=tf.int32)

    # Embedding layer
    if pretrained_weights is None:
        embedding = tf.keras.layers.Embedding(
            dictionary_size, embedding_size, input_length=input_length, mask_zero=True
        )
    else:
        dictionary_size = pretrained_weights.shape[0]
        embedding_size = pretrained_weights.shape[1]
        initializer = tf.keras.initializers.Constant(pretrained_weights)
        embedding = tf.keras.layers.Embedding(
            dictionary_size,
            embedding_size,
            input_length=input_length,
            mask_zero=True,
            embeddings_initializer=initializer,
            trainable=trainable,
        )

    # Apply embedding to input sequences
    embedded_1 = embedding(input_1)
    embedded_2 = embedding(input_2)

    # Global average pooling
    _input_mask_1, _input_mask_2 = MyLayer()(input_1), MyLayer()(input_2)
    pooled_1 = tf.keras.layers.GlobalAveragePooling1D()(embedded_1, mask=_input_mask_1)
    pooled_2 = tf.keras.layers.GlobalAveragePooling1D()(embedded_2, mask=_input_mask_2)

    # Compute similarity/distance
    if use_cosine:   
        # Compute the cosine distance using a Lambda layer
        def cosine_distance(x):
            x1, x2 = x
            x1_normalized = tf.keras.backend.l2_normalize(x1, axis=1)
            x2_normalized = tf.keras.backend.l2_normalize(x2, axis=1)
            return 2.5 * (1.0 + tf.reduce_sum(x1_normalized * x2_normalized, axis=1))
        output = tf.keras.layers.Lambda(cosine_distance)([pooled_1, pooled_2])
    else:
        # Compute the cosine distance using a Lambda layer
        def normalized_product(x):
            x1, x2 = x
            x1_normalized = tf.keras.backend.l2_normalize(x1, axis=1)
            x2_normalized = tf.keras.backend.l2_normalize(x2, axis=1)
            return x1_normalized * x2_normalized
    
        output = tf.keras.layers.Lambda(normalized_product)([pooled_1, pooled_2])
        output = tf.keras.layers.Dropout(0.1)(output)
        output = tf.keras.layers.Dense(
            16,
            activation="relu",
        )(output)
        output = tf.keras.layers.Dropout(0.2)(output)
        output = tf.keras.layers.Dense(
            1,
            activation="sigmoid",
        )(output)
        
        output = tf.keras.layers.Lambda(lambda x: x * 5)(output)

    # Define the model
    model = tf.keras.Model(inputs=[input_1, input_2], outputs=output)

    # Compile the model
    model.compile(loss='mean_squared_error', optimizer=tf.keras.optimizers.Adam(learning_rate))

    return model

In [55]:
# Definir constantes de entrenamiento
batch_size: int = 64
num_epochs: int = 128

In [65]:
# Obtener x_train e y_train
def pair_list_to_x_y(pair_list: List[Tuple[Tuple[np.ndarray, np.ndarray], int]]) -> Tuple[Tuple[np.ndarray, np.ndarray], np.ndarray]:
    _x, _y = zip(*pair_list)
    _x_1, _x_2 = zip(*_x)
    return (np.row_stack(_x_1), np.row_stack(_x_2)), np.array(_y)

# Obtener las listas de train y test
x_train, y_train = pair_list_to_x_y(mapped_train)
x_val, y_val = pair_list_to_x_y(mapped_val)

In [66]:
# Preparar los conjuntos de datos de entrenamiento y validación
train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
train_dataset = train_dataset.shuffle(buffer_size=len(x_train)).batch(batch_size)

val_dataset = tf.data.Dataset.from_tensor_slices((x_val, y_val))
val_dataset = val_dataset.batch(batch_size)

In [67]:
_pretrained_weights: Optional[np.ndarray] = None
if USE_PRETRAINED:
    if REMAP_EMBEDDINGS:
        _pretrained_weights = np.zeros(
            (len(diccionari.token2id) + 1, wv_model.vector_size),  dtype=np.float32)
        for token, _id in diccionari.token2id.items():
            if token in wv_model:
             _pretrained_weights[_id + 1] = wv_model[token]

            else:
                # In W2V, OOV will not have a representation. We will use 0.
                pass
    else:
        # Not recommended (this will consume A LOT of RAM)
        _pretrained_weights = np.zeros((wv_model.vectors.shape[0] + 1, wv_model.vector_size,),  dtype=np.float32)
        _pretrained_weights[1:, :] = wv_model.vectors

In [68]:
# Build and compile the model
model_nou = model_1(pretrained_weights=_pretrained_weights, trainable=False, use_cosine=False, )
model_nou.summary()



In [69]:
# Train the model
model_nou.fit(train_dataset, epochs=num_epochs, validation_data=val_dataset)

Epoch 1/128
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - loss: 0.7322 - val_loss: 0.7246
Epoch 2/128
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 0.7323 - val_loss: 0.7228
Epoch 3/128
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 0.7291 - val_loss: 0.7214
Epoch 4/128
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.7289 - val_loss: 0.7195
Epoch 5/128
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.7234 - val_loss: 0.7174
Epoch 6/128
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.7220 - val_loss: 0.7156
Epoch 7/128
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.7167 - val_loss: 0.7133
Epoch 8/128
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.7141 - val_loss: 0.7111
Epoch 9/128
[1m33/33[0m [32m━━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x1c4991950d0>

In [70]:
from scipy.stats import pearsonr

x_test, y_test = pair_list_to_x_y(mapped_test)
def compute_pearson(x_, y_, model):
    # Get predictions for the model
    y_pred = model_nou.predict(x_)
    # Compute pearson correlation
    correlation, _ = pearsonr(y_pred.flatten(), y_.flatten())
    return correlation

# Print results
print(f"Correlación de Pearson (train): {compute_pearson(x_train, y_train, model)}")
print(f"Correlación de Pearson (validation): {compute_pearson(x_val, y_val, model)}")
print(f"Correlación de Pearson (test): {compute_pearson(x_test, y_test, model)}")


[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step
Correlación de Pearson (train): 0.5128442363001666
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step
Correlación de Pearson (validation): 0.363246755664028
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
Correlación de Pearson (test): 0.41946570342474443


In [75]:
# Preparar los conjuntos de datos de entrenamiento y validación
train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
train_dataset = train_dataset.shuffle(buffer_size=len(x_train)).batch(batch_size)

val_dataset = tf.data.Dataset.from_tensor_slices((x_val, y_val))
val_dataset = val_dataset.batch(batch_size)

In [78]:
class Cast_layer(Layer):
    def call(self, x):
        return tf.cast(x, tf.float32)
    
class Exp_layer(Layer):
    def call(self, x):
        return tf.exp(x)
    
class Reduce_layer_keep(Layer):
    def call(self, x):
        return tf.reduce_sum(x, axis=1, keepdims=True)
    
class Reduce_layer(Layer):
    def call(self, x):
        return tf.reduce_sum(x, axis=1)

In [79]:
import tensorflow as tf

def model_2(
    input_length: int = MAX_LEN,
    dictionary_size: int = 1000,
    embedding_size: int = 16,
    learning_rate: float = 1e-3,
    pretrained_weights: Optional[np.ndarray] = None,
    trainable: bool = False,
    use_cosine: bool = False,
) -> tf.keras.Model:
    # Inputs
    input_1 = tf.keras.Input((input_length,), dtype=tf.int32)
    input_2 = tf.keras.Input((input_length,), dtype=tf.int32)

    # Embedding Layer
    if pretrained_weights is None:
        embedding = tf.keras.layers.Embedding(
            dictionary_size, embedding_size, input_length=input_length, mask_zero=True
        )
    else:
        dictionary_size = pretrained_weights.shape[0]
        embedding_size = pretrained_weights.shape[1]
        initializer = tf.keras.initializers.Constant(pretrained_weights)
        embedding = tf.keras.layers.Embedding(
            dictionary_size,
            embedding_size,
            input_length=input_length,
            mask_zero=True,
            embeddings_initializer=initializer,
            trainable=trainable,
        )

    # Embed the inputs
    embedded_1 = embedding(input_1)
    embedded_2 = embedding(input_2)
    # Pass through the embedding layer
    _input_mask_1, _input_mask_2 = input_1, input_2

    # Attention Mechanism
    attention_mlp = tf.keras.Sequential([
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(16, activation='tanh'),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(1)
    ])

    # Apply attention to each embedding
    attention_weights_1 = attention_mlp(embedded_1)  
    attention_weights_2 = attention_mlp(embedded_2) 
    # Mask the attention weights
    attention_weights_1 = Exp_layer()(attention_weights_1) * Cast_layer()(_input_mask_1[:, :, None])
    attention_weights_2 = Exp_layer()(attention_weights_2) * Cast_layer()(_input_mask_2[:, :, None])
    # Normalize attention weights
    attention_weights_1 = attention_weights_1 / Reduce_layer_keep()(attention_weights_1)
    attention_weights_2 = attention_weights_2 / Reduce_layer_keep()(attention_weights_2)
    # Compute context vectors
    projected_1 = Reduce_layer()(embedded_1 * attention_weights_1) 
    projected_2 = Reduce_layer()(embedded_2 * attention_weights_2) 
    
    
    if use_cosine:
        # Compute the cosine distance using a Lambda layer
        def cosine_distance(x):
            x1, x2 = x
            x1_normalized = tf.keras.backend.l2_normalize(x1, axis=1)
            x2_normalized = tf.keras.backend.l2_normalize(x2, axis=1)
            return 2.5 * (1.0 + tf.reduce_sum(x1_normalized * x2_normalized, axis=1))
        output = tf.keras.layers.Lambda(cosine_distance)([projected_1, projected_2])
    else:
         # Compute the cosine distance using a Lambda layer
        def normalized_product(x):
            x1, x2 = x
            x1_normalized = tf.keras.backend.l2_normalize(x1, axis=1)
            x2_normalized = tf.keras.backend.l2_normalize(x2, axis=1)
            return x1_normalized * x2_normalized
    
        output = tf.keras.layers.Lambda(normalized_product)([projected_1, projected_2])
        output = tf.keras.layers.Dropout(0.1)(output)
        output = tf.keras.layers.Dense(
            16,
            activation="relu",
        )(output)
        output = tf.keras.layers.Dropout(0.2)(output)
        output = tf.keras.layers.Dense(
            1,
            activation="sigmoid",
        )(output)
        
        output = tf.keras.layers.Lambda(lambda x: x * 5)(output)
    # Model Definition
    model = tf.keras.Model(inputs=(input_1, input_2), outputs=output)
    model.compile(
        loss="mean_squared_error", optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate)
    )
    return model

In [80]:
# Build Model 2
model2 = model_2(pretrained_weights=_pretrained_weights, trainable=False, use_cosine=False)
model2.summary()



In [81]:
# Train model 2
model2.fit(train_dataset, epochs=num_epochs, validation_data=val_dataset)

Epoch 1/128




[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 24ms/step - loss: 0.7340 - val_loss: 0.7275
Epoch 2/128
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - loss: 0.7301 - val_loss: 0.7250
Epoch 3/128
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - loss: 0.7274 - val_loss: 0.7229
Epoch 4/128
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - loss: 0.7239 - val_loss: 0.7204
Epoch 5/128
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - loss: 0.7230 - val_loss: 0.7172
Epoch 6/128
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 0.7172 - val_loss: 0.7133
Epoch 7/128
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 0.7126 - val_loss: 0.7081
Epoch 8/128
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 0.7023 - val_loss: 0.7023
Epoch 9/128
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

<keras.src.callbacks.history.History at 0x1c49856b5d0>

In [82]:
# Print results
print(f"Correlación de Pearson (train): {compute_pearson(x_train, y_train, model2)}")
print(f"Correlación de Pearson (validation): {compute_pearson(x_val, y_val, model2)}")
print(f"Correlación de Pearson (test): {compute_pearson(x_test, y_test, model2)}")

[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
Correlación de Pearson (train): 0.5128442363001666
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step
Correlación de Pearson (validation): 0.363246755664028
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step
Correlación de Pearson (test): 0.41946570342474443
