# **Pràctica 4**

# Entrenament de models de Word2Vec

In [16]:
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

Importem el dataset de la manera en la que s'indica a la pàgina web.

In [2]:
from datasets import load_dataset

dataset = load_dataset("projecte-aina/catalan_general_crawling")

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Observem que en el dataset hi ha una part de train, per tant obtenim aquesta part anomenant-la train_dataset, i observem que el contingut de text es troba a la columna 'text' de train_dataset

In [3]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 1016113
    })
})

In [4]:
train_dataset = dataset['train']

In [8]:
train_dataset

Dataset({
    features: ['text'],
    num_rows: 1016113
})

Definim una funció per preprocessar el dataset. Aquesta funció neteja i normalitza el text, convertint-lo tot a minúscules, eliminant caràcters especials, i dividint-lo en paraules abans de tornar-lo a unir en un sol string.

In [36]:
import os
import re
from nltk.tokenize import word_tokenize

def preprocess(text):
    text = text.lower()
    text = re.sub(r'\W+', ' ', text)
    tokens = word_tokenize(text)
    return ' '.join(tokens)

A continuació es defineix una funció que s'utilitza per dividir el conjunt de dades en diverses parts i realitzar el preprocessament a cada part. Pren com a parametres d'entrada el conjunt de dades a preprocessarm el directori on es guradaran els arxius dividits i una llista de mides desitjades per cada part (en bytres), i com a sortida s'obté arxius de text preprocessats, dividits segons les mides especificades i guardats al directori de sortida.

Explicació del contingut de la funció:
- Es crea un directori de sortida, en cas de que no existeixi, per assegurar que es poden guardar els arxius resultants.
- Es defineixen diverses variables per mantenir el compte de la part que s'està preprocessant, la seva mida i per afegir el text preprocessat.
- S'extreu el text de la fila, es preprocessa utilitzant la funció anterior.
- Si s'arriba a la mida dessitjada, es guarda el text preprocessat a un arxiu de text en el directori de sortida.


In [37]:
def dividir_y_preprocesar_dataset(dataset, output_dir, tamano_partes):
    
    if not os.path.exists(output_dir): 
        os.makedirs(output_dir)
        
    total_bytes = 0
    contador = 1
    current_size = 0
    current_part = []
    
    for i, row in enumerate(dataset):
        text = row['text']
        preprocessed_text = preprocess(text)
        current_size += len(preprocessed_text.encode('utf-8'))
        current_part.append(preprocessed_text)
        
        if current_size >= tamano_partes[contador - 1]:
            with open(os.path.join(output_dir, f'parte_{contador}.txt'), 'w', encoding='utf-8') as f:
                for line in current_part:
                    f.write(line + '\n')
            current_part = []
            current_size = 0
            contador += 1
            
            if contador > len(tamano_partes):
                break

    if current_part:
        with open(os.path.join(output_dir, f'parte_{contador}.txt'), 'w', encoding='utf-8') as f:
            for line in current_part:
                f.write(line + '\n')


Una vegada definida la funció 'dividir_y_preprocesar_dataset', la cridem amb les mides dessitjades (100MB, 500MB i 1GB) i amb el directori de sortida corresponent.

In [38]:
tamano_partes = [100 * 1024 * 1024, 500 * 1024 * 1024, 1 * 1024 * 1024 * 1024] # 100MB, 500MB, 1GB
output_dir = 'divided_datasets'
dividir_y_preprocesar_dataset(train_dataset, output_dir, tamano_partes)

Ara passem a entrenar un model Word2Vec per a cada part del conjunt de dades dividit i preprocessat. 

Primer, es crea una llista de rutes als arxius dividits i preprocessats, per tal de poder accedir als textos i poder entrenar el model amb ells. 

Hem decidit utilitzar LineSentence de gensim per llegir les frases, ja que d'aquesta manera es converteix cada línea en una llista de paraules per a l'entrenament del model Word2vec.

Es crea un model Word2Vec amb els següents paràmetres:
- sentences: les frases preprocesades llegides del fitxer.
- vector_size=100: la dimensió dels vectors de paraules.
- window=5: la mida de la finestra de context.
- min_count=10: només les paraules que apareixen almenys 10 vegades seran considerades.
- workers=4: el nombre de fils per al processament.
- sg=1: utilitzar el model Skip-Gram (en lloc de CBOW).
- epochs=25: nombre d'iteracions sobre el conjunt de dades.

Finalment, el model entrenat es guarda en un fitxer.

In [39]:
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

dataset_parts = [f'divided_datasets/parte_{i}.txt' for i in range(1, len(tamano_partes) + 1)]

for i, part in enumerate(dataset_parts):
    sentences = LineSentence(part)
    
    model = Word2Vec(sentences=sentences, vector_size=100, window=5, min_count=10, workers=4, sg=1, epochs=25)
    
    model.save(f'word2vec_model_part_{i+1}.model')

    print(f'Model for part {i+1} trained and saved.')

Model for part 1 trained and saved.
Model for part 2 trained and saved.
Model for part 3 trained and saved.


In [None]:
for i, row in enumerate(train_dataset):
    text = row['text']
    preprocessed_text = preprocess(text)

sentences = LineSentence(preprocessed_text)
model = Word2Vec(sentences=sentences, vector_size=100, window=5, min_count=10, workers=4, sg=1, epochs=25)
model.save(f'word2vec_model_original.model')

Després d'entrenar els models, volem comprovar que estiguin funcionant correctament, per tant afegim aquest procès de validació pel model de mida 100MB.

Comprovem les paraules més similars a 'informàtica' i la similitud entre 'informàtica' i 'digital'.

In [14]:
from gensim.models import Word2Vec

model = Word2Vec.load('word2vec_model_part_1.model')

similar_words = model.wv.most_similar('informàtica', topn=10)
print("Paraules similars a 'informàtica':")
for word, similarity in similar_words:
    print(f'{word}: {similarity:.4f}')

similarity = model.wv.similarity('informàtica', 'coordinador')
print(f"Similitud entre 'informàtica' i 'coordinador': {similarity:.4f}")


Paraules similars a 'informàtica':
enginyeria: 0.7088
instrumentació: 0.6706
tecnologia: 0.6700
sig: 0.6690
telecomunicació: 0.6620
ub: 0.6593
aplicacions: 0.6580
informàtic: 0.6461
automàtica: 0.6351
tecnologies: 0.6276
Similitud entre 'informàtica' i 'coordinador': 0.4499


# Model de Similitud de Text Semàntic 

In [None]:
# Función para calcular la correlación de Pearson
def compute_pearson(x_, y_, model):
    y_pred = model.predict(x_)
    print(f"y_pred shape: {y_pred.shape}, y_ shape: {y_.shape}")  # Agregar impresión para depuración
    correlation, _ = pearsonr(y_pred.flatten(), y_.flatten())
    return correlation


## One hot

### Preprocesamiento y Creación del Vocabulario

In [44]:
import numpy as np
from sklearn.preprocessing import OneHotEncoder

# Tokenizar las oraciones
def tokenize_sentences(data):
    return [[word for word in sentence.split()] for sentence in data]

# Obtener todas las oraciones del dataset
all_sentences = []
for s1, s2, _ in train_data + val_data + test_data:
    all_sentences.extend([s1, s2])

# Tokenizar todas las oraciones
tokenized_sentences = tokenize_sentences([' '.join(sent) for sent in all_sentences])

# Crear el vocabulario
vocab = list(set(word for sentence in tokenized_sentences for word in sentence))
word_to_index = {word: i for i, word in enumerate(vocab)}

# Convertir las oraciones tokenizadas en índices
def sentences_to_indices(sentences, word_to_index, max_length):
    indices = np.zeros((len(sentences), max_length))
    for i, sentence in enumerate(sentences):
        for j, word in enumerate(sentence.split()[:max_length]):
            indices[i, j] = word_to_index.get(word, 0)
    return indices

max_length = 50  # Define el máximo número de palabras por oración

# Convertir el dataset a índices
def pair_list_to_x_y_onehot(data, word_to_index, max_length):
    X1 = sentences_to_indices([' '.join(s1) for s1, _, _ in data], word_to_index, max_length)
    X2 = sentences_to_indices([' '.join(s2) for _, s2, _ in data], word_to_index, max_length)
    y = np.array([label for _, _, label in data])
    return (X1, X2), y

(x_train_1_onehot, x_train_2_onehot), y_train = pair_list_to_x_y_onehot(train_data, word_to_index, max_length)
(x_val_1_onehot, x_val_2_onehot), y_val = pair_list_to_x_y_onehot(val_data, word_to_index, max_length)
(x_test_1_onehot, x_test_2_onehot), y_test = pair_list_to_x_y_onehot(test_data, word_to_index, max_length)

# Verificar las formas de los datos
print(f"x_train_1_onehot shape: {x_train_1_onehot.shape}, x_train_2_onehot shape: {x_train_2_onehot.shape}, y_train shape: {y_train.shape}")


x_train_1_onehot shape: (2073, 50), x_train_2_onehot shape: (2073, 50), y_train shape: (2073,)


### Construir el Modelo con One-Hot Encoding

In [45]:
import tensorflow as tf

# Definir el modelo de regresión de similitud con One-Hot Encoding
def build_and_compile_model_onehot(vocab_size, max_length, hidden_size=64):
    input_1 = tf.keras.Input(shape=(max_length,))
    input_2 = tf.keras.Input(shape=(max_length,))
    
    # One-Hot Encoding y Embedding
    one_hot = tf.keras.layers.Embedding(vocab_size, vocab_size, input_length=max_length, trainable=False)
    
    encoded_1 = one_hot(input_1)
    encoded_2 = one_hot(input_2)
    
    # Pooling para obtener una representación fija de las oraciones
    pooling = tf.keras.layers.GlobalAveragePooling1D()
    pooled_1 = pooling(encoded_1)
    pooled_2 = pooling(encoded_2)
    
    # Concatenar las representaciones
    concatenated = tf.keras.layers.Concatenate(axis=1)([pooled_1, pooled_2])
    hidden = tf.keras.layers.Dense(hidden_size, activation='relu')(concatenated)
    output = tf.keras.layers.Dense(1)(hidden)
    
    model = tf.keras.Model(inputs=[input_1, input_2], outputs=output)
    model.compile(loss='mean_absolute_error', optimizer='adam')
    return model

# Construir y compilar el modelo
vocab_size = len(vocab)
model_onehot = build_and_compile_model_onehot(vocab_size, max_length)

# Entrenar el modelo
model_onehot.fit([x_train_1_onehot, x_train_2_onehot], y_train, epochs=10, batch_size=32)

# Evaluar el modelo
print(f"Correlación de Pearson (train): {compute_pearson([x_train_1_onehot, x_train_2_onehot], y_train, model_onehot)}")
print(f"Correlación de Pearson (validation): {compute_pearson([x_val_1_onehot, x_val_2_onehot], y_val, model_onehot)}")
print(f"Correlación de Pearson (test): {compute_pearson([x_test_1_onehot, x_test_2_onehot], y_test, model_onehot)}")




Epoch 1/10
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 46ms/step - loss: 0.9008
Epoch 2/10
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 46ms/step - loss: 0.7326
Epoch 3/10
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 43ms/step - loss: 0.7492
Epoch 4/10
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 44ms/step - loss: 0.7210
Epoch 5/10
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 43ms/step - loss: 0.6662
Epoch 6/10
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 44ms/step - loss: 0.6646
Epoch 7/10
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 41ms/step - loss: 0.7017
Epoch 8/10
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 44ms/step - loss: 0.6855
Epoch 9/10
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 42ms/step - loss: 0.6828
Epoch 10/10
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 43ms/step - loss: 0.7039

## Word2vec

### Cargar y preprocesar los datos:

In [22]:
import numpy as np
import tensorflow as tf
from gensim.models import Word2Vec
from scipy.stats import pearsonr

In [37]:
from datasets import load_dataset
import re
from nltk.tokenize import word_tokenize

# Cargar el dataset
dataset_ts = load_dataset("projecte-aina/sts-ca")
train_data = dataset_ts['train']
val_data = dataset_ts['validation']
test_data = dataset_ts['test']

# Preprocesar el texto
def preprocess(text):
    text = text.lower()
    text = re.sub(r'\W+', ' ', text)
    tokens = word_tokenize(text)
    return tokens

train_data = [(preprocess(s1), preprocess(s2), label) for s1, s2, label in zip(train_data['sentence1'], train_data['sentence2'], train_data['label'])]
val_data = [(preprocess(s1), preprocess(s2), label) for s1, s2, label in zip(val_data['sentence1'], val_data['sentence2'], val_data['label'])]
test_data = [(preprocess(s1), preprocess(s2), label) for s1, s2, label in zip(test_data['sentence1'], test_data['sentence2'], test_data['label'])]


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


### Generar embeddings (ejemplo con Word2Vec):

S'HA DE FER LES PROVES PER CADA PART DEL WORD2VEC QUE HEM PREENTRENAT I HEM DE MIRAR LO DE MEAN I MEAN PONDERADA QUE NOSE QUE ÉS

In [38]:
from gensim.models import Word2Vec
import numpy as np

# Cargar el modelo de Word2Vec preentrenado
word2vec_model = Word2Vec.load('word2vec_model_part_1.model') ## S'ha de canviar aixó per cada part que fem
vector_size = word2vec_model.vector_size
max_length = 50  # Ajusta según tus necesidades

# Función de preprocesamiento para Word2Vec
def word2vec_encode(tokens, model, max_length):
    word2vec_vector = np.zeros((max_length, model.vector_size))
    for i, token in enumerate(tokens):
        if i >= max_length:
            break
        if token in model.wv:
            word2vec_vector[i] = model.wv[token]
    return word2vec_vector

### Entrenar y evaluar el modelo:

In [20]:
!pip install tensorflow

Collecting tensorflow
  Downloading tensorflow-2.16.1-cp311-cp311-win_amd64.whl (2.1 kB)
Collecting tensorflow-intel==2.16.1
  Downloading tensorflow_intel-2.16.1-cp311-cp311-win_amd64.whl (377.0 MB)
     ---------------------------------------- 0.0/377.0 MB ? eta -:--:--
     ---------------------------------------- 0.0/377.0 MB ? eta -:--:--
     ---------------------------------------- 0.0/377.0 MB ? eta -:--:--
     -------------------------------------- 0.0/377.0 MB 281.8 kB/s eta 0:22:18
     -------------------------------------- 0.1/377.0 MB 416.7 kB/s eta 0:15:05
     -------------------------------------- 0.1/377.0 MB 568.9 kB/s eta 0:11:03
     -------------------------------------- 0.3/377.0 MB 983.9 kB/s eta 0:06:23
     ---------------------------------------- 0.4/377.0 MB 1.3 MB/s eta 0:04:55
     ---------------------------------------- 0.6/377.0 MB 1.6 MB/s eta 0:03:56
     ---------------------------------------- 0.8/377.0 MB 1.9 MB/s eta 0:03:22
     ----------------


[notice] A new release of pip is available: 23.0.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [39]:
# Convertir el dataset a vectores Word2Vec
def pair_list_to_x_y(data):
    X1 = np.array([word2vec_encode(s1, word2vec_model, max_length) for s1, _, _ in data])
    X2 = np.array([word2vec_encode(s2, word2vec_model, max_length) for _, s2, _ in data])
    y = np.array([label for _, _, label in data])
    return (X1, X2), y

In [40]:
(x_train_1, x_train_2), y_train = pair_list_to_x_y(train_data)
(x_val_1, x_val_2), y_val = pair_list_to_x_y(val_data)
(x_test_1, x_test_2), y_test = pair_list_to_x_y(test_data)

In [41]:
# Verificar las formas de los datos
print(f"x_train_1 shape: {x_train_1.shape}, x_train_2 shape: {x_train_2.shape}, y_train shape: {y_train.shape}")
print(f"x_val_1 shape: {x_val_1.shape}, x_val_2 shape: {x_val_2.shape}, y_val shape: {y_val.shape}")
print(f"x_test_1 shape: {x_test_1.shape}, x_test_2 shape: {x_test_2.shape}, y_test shape: {y_test.shape}")


x_train_1 shape: (2073, 50, 100), x_train_2 shape: (2073, 50, 100), y_train shape: (2073,)
x_val_1 shape: (500, 50, 100), x_val_2 shape: (500, 50, 100), y_val shape: (500,)
x_test_1 shape: (500, 50, 100), x_test_2 shape: (500, 50, 100), y_test shape: (500,)


In [42]:
import tensorflow as tf

# Definir el modelo de regresión de similitud
def build_and_compile_model(input_length, vector_size, hidden_size=64):
    input_1 = tf.keras.Input(shape=(input_length, vector_size))
    input_2 = tf.keras.Input(shape=(input_length, vector_size))
    
    concatenated = tf.keras.layers.Concatenate(axis=1)([input_1, input_2])
    flatten = tf.keras.layers.Flatten()(concatenated)  # Aplanar la entrada concatenada
    hidden = tf.keras.layers.Dense(hidden_size, activation='relu')(flatten)
    output = tf.keras.layers.Dense(1)(hidden)
    
    model = tf.keras.Model(inputs=[input_1, input_2], outputs=output)
    model.compile(loss='mean_absolute_error', optimizer='adam')
    return model

# Construir y compilar el modelo
model = build_and_compile_model(max_length, vector_size)

# Entrenar el modelo
model.fit([X_train_1, X_train_2], y_train, epochs=10, batch_size=32)

Epoch 1/10
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - loss: 1.0027
Epoch 2/10
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 0.5579
Epoch 3/10
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 0.4785
Epoch 4/10
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 0.4013
Epoch 5/10
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 0.3628
Epoch 6/10
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 0.3026
Epoch 7/10
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 0.2867
Epoch 8/10
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 0.2475
Epoch 9/10
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 0.2350
Epoch 10/10
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 0.2211


<keras.src.callbacks.history.History at 0x271fe4c2010>

In [43]:
# Evaluar el modelo
print(f"Correlación de Pearson (train): {compute_pearson([x_train_1, x_train_2], y_train, model)}")
print(f"Correlación de Pearson (validation): {compute_pearson([x_val_1, x_val_2], y_val, model)}")
print(f"Correlación de Pearson (test): {compute_pearson([x_test_1, x_test_2], y_test, model)}")

[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
y_pred shape: (2073, 1), y_ shape: (2073,)
Correlación de Pearson (train): 0.9454760079822447
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
y_pred shape: (500, 1), y_ shape: (500,)
Correlación de Pearson (validation): 0.11742938462054067
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
y_pred shape: (500, 1), y_ shape: (500,)
Correlación de Pearson (test): 0.19640451822516342


## One Hot

In [None]:
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# Crear un vocabulario
vocab = list(set([word for sentence in dataset['train']['sentence1'] + dataset['train']['sentence2'] for word in word_tokenize(sentence)]))
vocab_dict = {word: i for i, word in enumerate(vocab)}

# Función de preprocesamiento para One-Hot
def one_hot_encode(sentence, vocab_dict, max_length):
    tokens = word_tokenize(sentence)
    one_hot_vector = np.zeros((max_length, len(vocab_dict)))
    for i, token in enumerate(tokens):
        if i >= max_length:
            break
        if token in vocab_dict:
            one_hot_vector[i, vocab_dict[token]] = 1
    return one_hot_vector

max_length = 50  # Longitud máxima de las oraciones

# Ejemplo de uso:
sentence1 = dataset['train']['sentence1'][0]
sentence2 = dataset['train']['sentence2'][0]

one_hot_vector1 = one_hot_encode(sentence1, vocab_dict, max_length)
one_hot_vector2 = one_hot_encode(sentence2, vocab_dict, max_length)


## Word2Vec

In [None]:
from gensim.models import Word2Vec

# Cargar el modelo de Word2Vec entrenado
word2vec_model = Word2Vec.load('word2vec_model_part_1.model') ## s'ha de canviar per la part del model que volem comprovar

# Función de preprocesamiento para Word2Vec
def word2vec_encode(sentence, model, max_length):
    tokens = word_tokenize(sentence)
    vector_size = model.vector_size
    word2vec_vector = np.zeros((max_length, vector_size))
    for i, token in enumerate(tokens):
        if i >= max_length:
            break
        if token in model.wv:
            word2vec_vector[i] = model.wv[token]
    return word2vec_vector

# Ejemplo de uso:
word2vec_vector1 = word2vec_encode(sentence1, word2vec_model, max_length)
word2vec_vector2 = word2vec_encode(sentence2, word2vec_model, max_length)


## SpaCy

In [None]:
import spacy

# Cargar el modelo de spaCy
nlp = spacy.load('ca_core_news_md')

# Función de preprocesamiento para spaCy
def spacy_encode(sentence, nlp, max_length):
    doc = nlp(sentence)
    vector_size = len(doc.vector)
    spacy_vector = np.zeros((max_length, vector_size))
    for i, token in enumerate(doc):
        if i >= max_length:
            break
        spacy_vector[i] = token.vector
    return spacy_vector

# Ejemplo de uso:
spacy_vector1 = spacy_encode(sentence1, nlp, max_length)
spacy_vector2 = spacy_encode(sentence2, nlp, max_length)


## Uso de los Embeddings en el Modelo de Similitud:


In [None]:
import tensorflow as tf

# Definir el modelo de regresión de similitud
def build_and_compile_model(input_length, vector_size, hidden_size=64):
    input_1 = tf.keras.Input(shape=(input_length, vector_size))
    input_2 = tf.keras.Input(shape=(input_length, vector_size))
    
    concatenated = tf.keras.layers.Concatenate(axis=1)([input_1, input_2])
    hidden = tf.keras.layers.Dense(hidden_size, activation='relu')(concatenated)
    output = tf.keras.layers.Dense(1)(hidden)
    
    model = tf.keras.Model(inputs=[input_1, input_2], outputs=output)
    model.compile(loss='mean_absolute_error', optimizer='adam')
    return model

# Construir y compilar el modelo
vector_size = word2vec_model.vector_size  # Cambia esto según el modelo de embeddings que estés usando
model = build_and_compile_model(max_length, vector_size)

# Ejemplo de entrenamiento
sentence_pairs = [(s1, s2) for s1, s2 in zip(dataset['train']['sentence1'], dataset['train']['sentence2'])]
labels = dataset['train']['label']

# Convertir las oraciones a vectores Word2Vec (cambia esta función según el método de embeddings)
X1 = np.array([word2vec_encode(s1, word2vec_model, max_length) for s1, s2 in sentence_pairs])
X2 = np.array([word2vec_encode(s2, word2vec_model, max_length) for s1, s2 in sentence_pairs])
y = np.array(labels)

# Entrenar el modelo
model.fit([X1, X2], y, epochs=10, batch_size=32)


per cada frase un unic vector
TF-IDF per descartar paraules uq no aporten info

In [None]:
import tensorflow as tf
def build_and_compile_model(hidden_size: int = 64) -> tf.keras.Model:
  model = tf.keras.Sequential([
      tf.keras.layers.Concatenate(axis=-1, ),
      tf.keras.layers.Dense(hidden_size, activation='relu'),
      tf.keras.layers.Dense(1)
  ])
  model.compile(loss='mean_absolute_error',
                optimizer=tf.keras.optimizers.Adam(0.001))
  return model
m = build_and_compile_model()
# E.g.
import numpy as np
y = m((np.ones((1, 100)), np.ones((1,100)), ), )

el primer 10 s'ha de canviar per la long maxima del vector d'entrada

In [None]:
import tensorflow as tf
def build_and_compile_model(
        input_length: int = 10, hidden_size: int = 64, dictionary_size: int = 1000, embedding_size: int = 16,
) -> tf.keras.Model:
    input_1, input_2 = tf.keras.Input((input_length, ), dtype=tf.int32, ), tf.keras.Input((input_length, ), dtype=tf.int32, )
    # Define Layers
    embedding = tf.keras.layers.Embedding(
        dictionary_size, embedding_size, input_length=input_length, mask_zero=True, )
    pooling = tf.keras.layers.GlobalAveragePooling1D()
    concatenate = tf.keras.layers.Concatenate(axis=-1, )
    hidden = tf.keras.layers.Dense(hidden_size, activation='relu')
    output = tf.keras.layers.Dense(1)
    # Pass through the layers
    _input_mask_1, _input_mask_2 = tf.not_equal(input_1, 0), tf.not_equal(input_2, 0)
    _embedded_1, _embedded_2 = embedding(input_1, ), embedding(input_2, )
    _pooled_1, _pooled_2 = pooling(_embedded_1, mask=_input_mask_1), pooling(_embedded_2, mask=_input_mask_2)
    _concatenated = concatenate((_pooled_1, _pooled_2, ))
    _hidden_output = hidden(_concatenated)
    _output = output(_hidden_output)
    # Define the model
    model = tf.keras.Model(inputs=(input_1, input_2, ), outputs=_output, )
    model.compile(loss='mean_absolute_error',
                optimizer=tf.keras.optimizers.Adam(0.001))
    return model