In [None]:
# Similarity regression model: implementation
# Text preprocessing and dataset loading
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
from datasets import load_dataset
import re
import os
from nltk.tokenize import word_tokenize

# Función de preprocesamiento del texto
def preprocess(text):
    """Normalize a raw text into a single space-separated token string.

    The text is lowercased, every run of non-word characters is replaced by
    one space, the result is tokenized with NLTK's ``word_tokenize`` and the
    tokens are joined back together with single spaces.
    """
    normalized = re.sub(r'\W+', ' ', text.lower())
    return ' '.join(word_tokenize(normalized))

# Load the dataset
dataset = load_dataset("projecte-aina/catalan_general_crawling")

# Take the split used for training
train_dataset = dataset['train']

# Preprocess the 'text' field of every row
# NOTE(review): this materializes the whole preprocessed split as a Python
# list before anything is written to disk -- confirm it fits in memory.
preprocessed_texts = [preprocess(row['text']) for row in train_dataset]

# Write the preprocessed texts, one per line, to a file used for Word2Vec training
with open('preprocessed_texts.txt', 'w', encoding='utf-8') as f:
    for text in preprocessed_texts:
        f.write(text + '\n')

# Train the Word2Vec model on the line-per-text file and save it to disk
sentences = LineSentence('preprocessed_texts.txt')
word2vec_model = Word2Vec(sentences, vector_size=100, window=5, min_count=10, workers=4, epochs=25)
word2vec_model.save('word2vec_model.model')

# Crear el modelo de regresión de similitud
import tensorflow as tf

def build_similarity_model(input_dim):
    """Build and compile a two-input regression network for similarity scores.

    Both inputs are vectors of length ``input_dim``. They are concatenated,
    passed through a 64-unit ReLU layer and reduced to a single linear output.
    The returned model is compiled with the Adam optimizer and MSE loss.
    """
    left_input = tf.keras.Input(shape=(input_dim,))
    right_input = tf.keras.Input(shape=(input_dim,))

    merged = tf.keras.layers.Concatenate()([left_input, right_input])
    hidden = tf.keras.layers.Dense(64, activation='relu')(merged)
    score = tf.keras.layers.Dense(1, activation='linear')(hidden)

    regressor = tf.keras.Model(inputs=[left_input, right_input], outputs=score)
    regressor.compile(optimizer='adam', loss='mean_squared_error')
    return regressor

# Embedding dimensionality, taken from the trained Word2Vec model
input_dim = word2vec_model.vector_size

# Build the similarity model for that input size
similarity_model = build_similarity_model(input_dim)

# Print the model summary
similarity_model.summary()


In [None]:
# COMPARACIÓ MODELS
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import OneHotEncoder
from collections import Counter

# Función para construir un modelo de similitud
def build_similarity_model(input_dim):
    """Build and compile a two-input regression network for similarity scores.

    Both inputs are vectors of length ``input_dim``. They are concatenated,
    passed through a 64-unit ReLU layer and reduced to a single linear output.
    The returned model is compiled with the Adam optimizer and MSE loss.
    """
    left_input = tf.keras.Input(shape=(input_dim,))
    right_input = tf.keras.Input(shape=(input_dim,))

    merged = tf.keras.layers.Concatenate()([left_input, right_input])
    hidden = tf.keras.layers.Dense(64, activation='relu')(merged)
    score = tf.keras.layers.Dense(1, activation='linear')(hidden)

    regressor = tf.keras.Model(inputs=[left_input, right_input], outputs=score)
    regressor.compile(optimizer='adam', loss='mean_squared_error')
    return regressor

# Example data: pairs of texts and one target similarity score per pair
text_pairs = [
    ("Aquest és un text d'exemple.", "Aquest és un altre text d'exemple."),
    ("Aquest és un text d'exemple.", "Aquest és un text diferent."),
    ("Aquest és un text molt semblant.", "Aquest és un text molt semblant."),
    ("Aquest és un text d'exemple.", "Aquest és un text completament diferent.")
]
similarity_scores = [0.9, 0.1, 1.0, 0.0]

# Preprocess both texts of every pair
processed_pairs = [(preprocess(pair[0]), preprocess(pair[1])) for pair in text_pairs]

# Función para obtener one-hot encoding
def get_one_hot_encodings(pairs, vocab_size=1000):
    """Encode both sides of each text pair as binary bag-of-words vectors.

    The vocabulary is the ``vocab_size`` most frequent whitespace-separated
    tokens across all texts in ``pairs``. Despite the name, each vector marks
    the presence (1) of every in-vocabulary token of the text, so several
    positions may be set at once.

    Args:
        pairs: Sequence of (text_a, text_b) pairs of strings.
        vocab_size: Length of each vector and maximum vocabulary size.

    Returns:
        Tuple ``(X1, X2)`` of arrays holding the vectors for the first and
        second text of every pair, in input order.
    """
    token_counts = Counter(
        token for pair in pairs for text in pair for token in text.split()
    )
    slot_of = {
        word: slot
        for slot, (word, _) in enumerate(token_counts.most_common(vocab_size))
    }

    def encode(text):
        # Tokens outside the vocabulary are ignored.
        row = np.zeros(vocab_size)
        for token in text.split():
            slot = slot_of.get(token)
            if slot is not None:
                row[slot] = 1
        return row

    first_side = [encode(pair[0]) for pair in pairs]
    second_side = [encode(pair[1]) for pair in pairs]
    return np.array(first_side), np.array(second_side)

# Compute the bag-of-words encodings for both sides of every pair
X1_one_hot, X2_one_hot = get_one_hot_encodings(processed_pairs)

# Función para obtener embeddings de Word2Vec y GloVe
def get_embeddings(model, text, use_weights=False):
    """Embed a text as the (optionally count-weighted) average of its word vectors.

    Args:
        model: Object exposing ``vector_size`` and a ``wv`` attribute that
            supports ``token in wv``, ``wv[token]`` and
            ``wv.get_vecattr(token, 'count')`` (e.g. a gensim 4 ``Word2Vec``).
        text: String of whitespace-separated tokens.
        use_weights: When True, each word vector is weighted by the count
            stored for its token in the model's vocabulary.

    Returns:
        A 1-D array. If no token of ``text`` is in the vocabulary, a zero
        vector of length ``model.vector_size`` is returned.
    """
    # Filter once so vectors and weights are guaranteed to stay aligned.
    known_tokens = [token for token in text.split() if token in model.wv]
    if not known_tokens:
        return np.zeros(model.vector_size)

    vectors = [model.wv[token] for token in known_tokens]
    if use_weights:
        # gensim 4 removed `wv.vocab`; per-token counts are read with get_vecattr.
        weights = np.array([model.wv.get_vecattr(token, 'count') for token in known_tokens])
        return np.average(vectors, axis=0, weights=weights)
    return np.mean(vectors, axis=0)

# Load the previously saved models
word2vec_model = Word2Vec.load('word2vec_model.model')
glove_model = None  # Placeholder: load the GloVe model here if needed

# Compute Word2Vec sentence embeddings (plain mean and weighted mean) for both sides of every pair
X1_word2vec_mean = np.array([get_embeddings(word2vec_model, pair[0]) for pair in processed_pairs])
X2_word2vec_mean = np.array([get_embeddings(word2vec_model, pair[1]) for pair in processed_pairs])
X1_word2vec_weighted = np.array([get_embeddings(word2vec_model, pair[0], use_weights=True) for pair in processed_pairs])
X2_word2vec_weighted = np.array([get_embeddings(word2vec_model, pair[1], use_weights=True) for pair in processed_pairs])

# Modelo de regresión para cada tipo de embedding
def train_and_evaluate(X1, X2, y, model_name):
    """Fit a fresh similarity model on the given pairs and print its predictions.

    Args:
        X1: 2-D array with one row per pair (first text of each pair).
        X2: 2-D array with one row per pair (second text of each pair).
        y: Target similarity scores, one per pair (list or array).
        model_name: Label used in the printed report.

    Note:
        Predictions are computed on the same inputs the model was trained on,
        so the printed values show the fit, not held-out performance.
    """
    # Keras cannot pair a plain Python list of floats with NumPy inputs in
    # Model.fit, so convert the targets to an array first.
    targets = np.asarray(y, dtype='float32')

    model = build_similarity_model(X1.shape[1])
    model.fit([X1, X2], targets, epochs=10)
    predictions = model.predict([X1, X2])
    print(f"Predicciones de similitud para {model_name}: {predictions}")

# Train and evaluate one similarity model per text representation
train_and_evaluate(X1_one_hot, X2_one_hot, similarity_scores, "One-Hot Encoding")
train_and_evaluate(X1_word2vec_mean, X2_word2vec_mean, similarity_scores, "Word2Vec Mean")
train_and_evaluate(X1_word2vec_weighted, X2_word2vec_weighted, similarity_scores, "Word2Vec Weighted Mean")


In [1]:
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import OneHotEncoder
from collections import Counter
import spacy
from transformers import RobertaModel, RobertaTokenizer
import torch

# Función para construir un modelo de similitud
def build_similarity_model(input_dim):
    """Build and compile a two-input regression network for similarity scores.

    Both inputs are vectors of length ``input_dim``. They are concatenated,
    passed through a 64-unit ReLU layer and reduced to a single linear output.
    The returned model is compiled with the Adam optimizer and MSE loss.
    """
    left_input = tf.keras.Input(shape=(input_dim,))
    right_input = tf.keras.Input(shape=(input_dim,))

    merged = tf.keras.layers.Concatenate()([left_input, right_input])
    hidden = tf.keras.layers.Dense(64, activation='relu')(merged)
    score = tf.keras.layers.Dense(1, activation='linear')(hidden)

    regressor = tf.keras.Model(inputs=[left_input, right_input], outputs=score)
    regressor.compile(optimizer='adam', loss='mean_squared_error')
    return regressor

# Example data: pairs of texts and one target similarity score per pair
text_pairs = [
    ("Aquest és un text d'exemple.", "Aquest és un altre text d'exemple."),
    ("Aquest és un text d'exemple.", "Aquest és un text diferent."),
    ("Aquest és un text molt semblant.", "Aquest és un text molt semblant."),
    ("Aquest és un text d'exemple.", "Aquest és un text completament diferent.")
]
similarity_scores = [0.9, 0.1, 1.0, 0.0]

# Preprocess both texts of every pair
# NOTE(review): `preprocess` is not defined in this cell; it must already
# exist in the kernel from an earlier cell.
processed_pairs = [(preprocess(pair[0]), preprocess(pair[1])) for pair in text_pairs]

# Función para obtener one-hot encoding
def get_one_hot_encodings(pairs, vocab_size=1000):
    """Encode both sides of each text pair as binary bag-of-words vectors.

    The vocabulary is the ``vocab_size`` most frequent whitespace-separated
    tokens across all texts in ``pairs``. Despite the name, each vector marks
    the presence (1) of every in-vocabulary token of the text, so several
    positions may be set at once.

    Args:
        pairs: Sequence of (text_a, text_b) pairs of strings.
        vocab_size: Length of each vector and maximum vocabulary size.

    Returns:
        Tuple ``(X1, X2)`` of arrays holding the vectors for the first and
        second text of every pair, in input order.
    """
    token_counts = Counter(
        token for pair in pairs for text in pair for token in text.split()
    )
    slot_of = {
        word: slot
        for slot, (word, _) in enumerate(token_counts.most_common(vocab_size))
    }

    def encode(text):
        # Tokens outside the vocabulary are ignored.
        row = np.zeros(vocab_size)
        for token in text.split():
            slot = slot_of.get(token)
            if slot is not None:
                row[slot] = 1
        return row

    first_side = [encode(pair[0]) for pair in pairs]
    second_side = [encode(pair[1]) for pair in pairs]
    return np.array(first_side), np.array(second_side)

# Compute the bag-of-words encodings for both sides of every pair
X1_one_hot, X2_one_hot = get_one_hot_encodings(processed_pairs)

# Función para obtener embeddings de Word2Vec y GloVe
def get_embeddings(model, text, use_weights=False):
    """Embed a text as the (optionally count-weighted) average of its word vectors.

    Args:
        model: Object exposing ``vector_size`` and a ``wv`` attribute that
            supports ``token in wv``, ``wv[token]`` and
            ``wv.get_vecattr(token, 'count')`` (e.g. a gensim 4 ``Word2Vec``).
        text: String of whitespace-separated tokens.
        use_weights: When True, each word vector is weighted by the count
            stored for its token in the model's vocabulary.

    Returns:
        A 1-D array. If no token of ``text`` is in the vocabulary, a zero
        vector of length ``model.vector_size`` is returned.
    """
    # Filter once so vectors and weights are guaranteed to stay aligned.
    known_tokens = [token for token in text.split() if token in model.wv]
    if not known_tokens:
        return np.zeros(model.vector_size)

    vectors = [model.wv[token] for token in known_tokens]
    if use_weights:
        # gensim 4 removed `wv.vocab`; per-token counts are read with get_vecattr.
        weights = np.array([model.wv.get_vecattr(token, 'count') for token in known_tokens])
        return np.average(vectors, axis=0, weights=weights)
    return np.mean(vectors, axis=0)

# Load the previously saved models
# NOTE(review): `Word2Vec` is not imported in this cell; it must already be
# in the kernel from an earlier cell.
word2vec_model = Word2Vec.load('word2vec_model.model')
glove_model = None  # Placeholder: load the GloVe model here if needed

# Compute Word2Vec sentence embeddings (plain mean and weighted mean) for both sides of every pair
X1_word2vec_mean = np.array([get_embeddings(word2vec_model, pair[0]) for pair in processed_pairs])
X2_word2vec_mean = np.array([get_embeddings(word2vec_model, pair[1]) for pair in processed_pairs])
X1_word2vec_weighted = np.array([get_embeddings(word2vec_model, pair[0], use_weights=True) for pair in processed_pairs])
X2_word2vec_weighted = np.array([get_embeddings(word2vec_model, pair[1], use_weights=True) for pair in processed_pairs])

# Load the spaCy pipeline used to produce document vectors
# NOTE(review): 'en_core_web_md' looks like an English pipeline while the
# sample texts are Catalan -- confirm whether a Catalan pipeline was intended.
nlp = spacy.load('en_core_web_md')

def get_spacy_embeddings(pairs):
    """Return spaCy document vectors for both sides of each text pair.

    Each text is run through the module-level ``nlp`` pipeline and its
    ``.vector`` attribute is collected.

    Returns:
        Tuple ``(X1, X2)`` of arrays holding the vectors of the first and
        second text of every pair, in input order.
    """
    vector_pairs = [(nlp(pair[0]).vector, nlp(pair[1]).vector) for pair in pairs]
    first_vectors = [first for first, _ in vector_pairs]
    second_vectors = [second for _, second in vector_pairs]
    return np.array(first_vectors), np.array(second_vectors)

# Compute spaCy document vectors for both sides of every pair
X1_spacy, X2_spacy = get_spacy_embeddings(processed_pairs)

# Load the RoBERTa tokenizer and model used to produce sentence embeddings
# NOTE(review): 'roberta-base' is presumably an English-pretrained checkpoint
# while the sample texts are Catalan -- confirm this is the intended model.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
roberta_model = RobertaModel.from_pretrained('roberta-base')

def get_roberta_embeddings(pairs, strategy='CLS'):
    """Return RoBERTa sentence embeddings for both sides of each text pair.

    Args:
        pairs: Sequence of (text_a, text_b) pairs of strings.
        strategy: Pooling applied to the last hidden state:
            'CLS' takes the vector at the first token position,
            'MEAN' averages the vectors over all token positions.

    Returns:
        Tuple ``(X1, X2)`` of arrays holding one flattened embedding per text
        for the first and second side of every pair, in input order.

    Raises:
        ValueError: If ``strategy`` is neither 'CLS' nor 'MEAN'. Previously an
            unknown strategy silently produced empty arrays.
    """
    if strategy not in ('CLS', 'MEAN'):
        raise ValueError(f"Unknown strategy {strategy!r}; expected 'CLS' or 'MEAN'")

    def _embed(text):
        # Encode one text and pool its last hidden state into a 1-D vector.
        inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
        hidden = roberta_model(**inputs).last_hidden_state
        pooled = hidden[:, 0, :] if strategy == 'CLS' else hidden.mean(dim=1)
        return pooled.detach().numpy().flatten()

    X1, X2 = [], []
    # Inference only: skip autograd bookkeeping for the forward passes.
    with torch.no_grad():
        for pair in pairs:
            X1.append(_embed(pair[0]))
            X2.append(_embed(pair[1]))

    return np.array(X1), np.array(X2)

# Compute RoBERTa embeddings with both pooling strategies (CLS and MEAN)
X1_roberta_cls, X2_roberta_cls = get_roberta_embeddings(processed_pairs, strategy='CLS')
X1_roberta_mean, X2_roberta_mean = get_roberta_embeddings(processed_pairs, strategy='MEAN')

# Función para entrenar y evaluar el modelo
def train_and_evaluate(X1, X2, y, model_name):
    """Fit a fresh similarity model on the given pairs and print its predictions.

    Args:
        X1: 2-D array with one row per pair (first text of each pair).
        X2: 2-D array with one row per pair (second text of each pair).
        y: Target similarity scores, one per pair (list or array).
        model_name: Label used in the printed report.

    Note:
        Predictions are computed on the same inputs the model was trained on,
        so the printed values show the fit, not held-out performance.
    """
    # Keras cannot pair a plain Python list of floats with NumPy inputs in
    # Model.fit, so convert the targets to an array first.
    targets = np.asarray(y, dtype='float32')

    model = build_similarity_model(X1.shape[1])
    model.fit([X1, X2], targets, epochs=10)
    predictions = model.predict([X1, X2])
    print(f"Predicciones de similitud para {model_name}: {predictions}")

# Train and evaluate one similarity model per text representation
train_and_evaluate(X1_one_hot, X2_one_hot, similarity_scores, "One-Hot Encoding")
train_and_evaluate(X1_word2vec_mean, X2_word2vec_mean, similarity_scores, "Word2Vec Mean")
train_and_evaluate(X1_word2vec_weighted, X2_word2vec_weighted, similarity_scores, "Word2Vec Weighted Mean")
train_and_evaluate(X1_spacy, X2_spacy, similarity_scores, "SpaCy Embeddings")
train_and_evaluate(X1_roberta_cls, X2_roberta_cls, similarity_scores, "RoBERTa CLS")
train_and_evaluate(X1_roberta_mean, X2_roberta_mean, similarity_scores, "RoBERTa MEAN")


ModuleNotFoundError: No module named 'tensorflow'