In [1]:
from text_similarity_class import TextSimilarity
import numpy as np
from scipy import spatial
from typing import Tuple, List, Optional
from datasets import load_dataset
import tensorflow as tf
from gensim.models import TfidfModel, fasttext
from gensim.utils import simple_preprocess
from gensim.corpora import Dictionary
from scipy.stats import pearsonr
import spacy

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the STS-ca dataset from the Hugging Face Hub.
# NOTE(review): trust_remote_code=True allows the dataset repository's own
# loading script to execute locally — keep it only for sources you trust.
dataset = load_dataset("projecte-aina/sts-ca", trust_remote_code=True)

# Load the pre-trained FastText word vectors.
# The path is relative, so it is resolved against the notebook's working directory.
WORD_EMBEDDING_FILE = "models/fasttext_100.bin"
wv_model = fasttext.load_facebook_vectors(WORD_EMBEDDING_FILE)

# Load the spaCy transformer pipeline used for the RoBERTa embeddings
spacy_roberta = spacy.load("ca_core_news_trf")

# Reproducibility: fix the global NumPy / TensorFlow seeds so that a fresh
# "Restart & Run All" yields repeatable results for the models trained in the
# following cells. Seeding happens after the heavy loaders so the RNG state at
# the start of the experiments does not depend on anything they consumed.
# NOTE(review): TextSimilarity's internals are not visible from this notebook;
# if it already seeds itself this is harmless. GPU kernels may still introduce
# small run-to-run differences.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Tokenisation helper
def preprocess(sentence: str) -> List[str]:
    """Tokenise a single sentence with gensim's ``simple_preprocess``.

    Args:
        sentence: Raw text of one sentence.

    Returns:
        The list of tokens returned by ``simple_preprocess`` when called with
        its default settings.
    """
    tokens = simple_preprocess(sentence)
    return tokens

# Pull (sentence1, sentence2, label) triples out of each dataset split
def _split_to_triples(split_name):
    """Return the examples of ``dataset[split_name]`` as a list of
    ``(sentence1, sentence2, label)`` tuples, in dataset order."""
    triples = []
    for example in dataset[split_name]:
        triples.append((example["sentence1"], example["sentence2"], example["label"]))
    return triples


input_pairs = _split_to_triples("train")
input_pairs_val = _split_to_triples("validation")
input_pairs_test = _split_to_triples("test")

In [None]:

# --- Experiment 1: TF-IDF weighted FastText embeddings -----------------------

# Hyperparameters (also reused by the experiments in the next cell)
batch_size = 64
num_epochs = 64

# Build the similarity helper around the FastText vectors and the spaCy pipeline
text_similarity = TextSimilarity(wv_model, spacy_model=spacy_roberta)

# Hand every split (train + validation + test) to prepare_data.
# NOTE(review): this feeds held-out sentences into the preparation step —
# confirm that fitting on validation/test text is intended.
all_pairs = input_pairs + input_pairs_val + input_pairs_test
text_similarity.prepare_data(all_pairs)

# Map each split with use_tfidf=True (train, validation, test — in that order)
mapped_train_tfidf, mapped_val_tfidf, mapped_test_tfidf = (
    text_similarity.map_pairs(split, use_tfidf=True)
    for split in (input_pairs, input_pairs_val, input_pairs_test)
)

# Turn the mapped pairs into the three datasets consumed by training
train_dataset_tfidf, val_dataset_tfidf, test_dataset_tfidf = text_similarity.prepare_datasets(
    mapped_train_tfidf,
    mapped_val_tfidf,
    mapped_test_tfidf,
    batch_size,
)

# Build the model with the builder's default arguments
model_tfidf = text_similarity.build_and_compile_model()

# train_model receives the training and validation datasets plus the epoch count
text_similarity.train_model(model_tfidf, train_dataset_tfidf, val_dataset_tfidf, num_epochs)

# Score the test split with Pearson correlation
x_test_tfidf, y_test_tfidf = text_similarity.pair_list_to_x_y(mapped_test_tfidf)
pearson_tfidf = text_similarity.compute_pearson(model_tfidf, x_test_tfidf, y_test_tfidf)
print(f"Pearson Correlation (TF-IDF weighted FastText): {pearson_tfidf}")



In [None]:

# --- Experiment 2: RoBERTa embeddings ----------------------------------------

# Map each split with use_roberta=True (train, validation, test — in that order)
mapped_train_roberta, mapped_val_roberta, mapped_test_roberta = (
    text_similarity.map_pairs(split, use_roberta=True)
    for split in (input_pairs, input_pairs_val, input_pairs_test)
)

# Turn the mapped pairs into datasets; batch_size comes from an earlier cell
train_dataset_roberta, val_dataset_roberta, test_dataset_roberta = text_similarity.prepare_datasets(
    mapped_train_roberta,
    mapped_val_roberta,
    mapped_test_roberta,
    batch_size,
)

# RoBERTa has 768 dimensions, so the model is built with that embedding size
model_roberta = text_similarity.build_and_compile_model(embedding_size=768)

# train_model receives the training and validation datasets; num_epochs comes from an earlier cell
text_similarity.train_model(model_roberta, train_dataset_roberta, val_dataset_roberta, num_epochs)

# Score the test split with Pearson correlation
x_test_roberta, y_test_roberta = text_similarity.pair_list_to_x_y(mapped_test_roberta)
pearson_roberta = text_similarity.compute_pearson(model_roberta, x_test_roberta, y_test_roberta)
print(f"Pearson Correlation (RoBERTa): {pearson_roberta}")

# --- Experiment 3: trainable embeddings --------------------------------------

# Model dimensions
max_len = 100  # max length of input sentences
embedding_size = 100  # embedding size for trainable embeddings
dictionary_size = len(text_similarity.dictionary) + 1  # +1 for padding

# Map each split with map_pairs' default arguments (no use_* flag is passed).
# NOTE(review): max_len is only given to the model builder below, not to
# map_pairs / prepare_datasets — confirm the sequences are padded or truncated
# to max_len somewhere inside TextSimilarity.
mapped_train_ids, mapped_val_ids, mapped_test_ids = (
    text_similarity.map_pairs(split)
    for split in (input_pairs, input_pairs_val, input_pairs_test)
)

# Turn the mapped pairs into datasets; batch_size comes from an earlier cell
train_dataset_ids, val_dataset_ids, test_dataset_ids = text_similarity.prepare_datasets(
    mapped_train_ids,
    mapped_val_ids,
    mapped_test_ids,
    batch_size,
)

# Build the model whose embedding layer is learned during training
model_trainable = text_similarity.build_and_compile_trainable_model(
    input_length=max_len,
    dictionary_size=dictionary_size,
    embedding_size=embedding_size,
)

# train_model receives the training and validation datasets; num_epochs comes from an earlier cell
text_similarity.train_model(model_trainable, train_dataset_ids, val_dataset_ids, num_epochs)

# Score the test split with Pearson correlation
x_test_ids, y_test_ids = text_similarity.pair_list_to_x_y(mapped_test_ids)
pearson_trainable = text_similarity.compute_pearson(model_trainable, x_test_ids, y_test_ids)
print(f"Pearson Correlation (Trainable embeddings): {pearson_trainable}")
