In [1]:
from scipy.spatial.distance import cdist
from src.Common import get_pickle
from tqdm import tqdm

import pandas as pd
import numpy as np
import nvgpu
import os

gpu = np.argmin([g["mem_used_percent"] for g in nvgpu.gpu_info()])
os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu)

from transformers import AutoTokenizer, TFAutoModel
import tensorflow as tf

# Load model from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
model = TFAutoModel.from_pretrained('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

  from .autonotebook import tqdm as notebook_tqdm
2023-07-04 13:05:14.388445: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE3 SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
All model checkpoint layers were used when initializing TFBertModel.

All the layers of TFBertModel were initialized from the model checkpoint at sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Crear un tf.data a partir de train_dev o test

In [2]:
def pickle_to_tfdata(base_path, test=False, padding=None):
    data_path = f'{base_path}/ALL_DATA'

    all_data = pd.read_pickle(data_path)
    if test: all_data = all_data[all_data["test"]==1]
    else: all_data = all_data[all_data["test"]!=1]

    text_data = all_data["text"]
    text_seqs = tokenizer.batch_encode_plus(text_data.tolist(), max_length=padding, padding="max_length", truncation=True, return_tensors='tf')

    text_enco = text_seqs["input_ids"].numpy()
    text_mask = text_seqs["attention_mask"].numpy()

    text_enco_ds = tf.data.Dataset.from_tensor_slices(text_enco)
    text_mask_ds = tf.data.Dataset.from_tensor_slices(text_mask)

    text_dataset = tf.data.Dataset.zip((text_enco_ds, text_mask_ds))
    text_dataset = text_dataset.map(lambda x, y: ((x, y),))

    return all_data, text_dataset, text_enco.shape[-1]

Obtener los embeddings de todos los textos que hay en train_dev

In [3]:
base_path = "data/RestaurantDataset/c82f2182f12c05a57df709ccac06cf14/" # Gijón sin lematización
base_path = "data/RestaurantDataset/573535f43b572904548c57e8661d9e3a/" # Barcelona sin lematización

# padding = get_pickle(base_path, "MAX_LEN_PADDING") *2
train_dev_data, train_dev_dataset, padding = pickle_to_tfdata(base_path)

# Modelo
text_in = tf.keras.Input(shape=(padding,), dtype='int32', name="in_text")
text_mask_in = tf.keras.Input(shape=(padding,), dtype='int32', name="in_mask")

# Un embedding por palabra
token_embeddings = model((text_in, text_mask_in))[0]

# Repetir la máscara para el tamaño del embedding
input_mask_expanded = tf.expand_dims(text_mask_in, axis=-1)
input_mask_expanded = tf.cast(input_mask_expanded, dtype=tf.float32)
input_mask_expanded = tf.tile(input_mask_expanded, [1, 1, token_embeddings.shape[-1]])

# Multiplicar cada embedding por la máscara (anular embeddings que sean 0)
masked_token_embeddings = token_embeddings * input_mask_expanded
# Sumar todos los embeddings de las palabras restantes en un solo vector
summed_token_embeddings = tf.reduce_sum(masked_token_embeddings, axis=1)
mask_sum = tf.reduce_sum(input_mask_expanded, axis=1)
mask_sum = tf.maximum(mask_sum, 1e-9)

mean_pooled_embeddings = summed_token_embeddings / mask_sum

keras_model = tf.keras.models.Model(inputs=[text_in, text_mask_in], outputs=tf.reduce_sum(token_embeddings, axis=1), name="my_model")

batch_size = 300
train_dev_dataset = train_dev_dataset.batch(batch_size).cache().prefetch(tf.data.AUTOTUNE)
pred = keras_model.predict(train_dev_dataset, verbose=1)



In [4]:
test_data, test_dataset, _= pickle_to_tfdata(base_path, test=True, padding=padding)
test_dataset = test_dataset.batch(batch_size).cache().prefetch(tf.data.AUTOTUNE)
test_preds = keras_model.predict(test_dataset, verbose=1)



In [5]:
from sklearn.metrics import pairwise_distances_chunked, ndcg_score

# Calcular distancias de forma paralela
distances = pairwise_distances_chunked(test_preds, pred, metric='cosine', n_jobs=-1, working_memory=500)

true_targets_list = []
pred_item_ids_list = []

tst_id = 0
with tqdm(total=len(test_preds)) as pbar:
    for dist_chunk in distances:
        for dist_item in range(len(dist_chunk)):
            distances = dist_chunk[dist_item]       
            train_dev_data["cosine_prox"] = 1-distances # El NDCG ordena de mayor a menor
            pred_item_dist = train_dev_data.groupby("id_item")["cosine_prox"].mean().values

            tst_item_id = test_data.iloc[tst_id]["id_item"]
            true_target = np.zeros(len(pred_item_dist)) 
            true_target[tst_item_id]=1
            true_targets_list.append(true_target)
            pred_item_ids_list.append(pred_item_dist)
        
            tst_id+=1
            pbar.update(1)

true_targets = np.array(true_targets_list)
pred_item_ids = np.array(pred_item_ids_list)

ndcg = ndcg_score(true_targets, pred_item_ids, k=10)
print(ndcg)

100%|██████████| 31291/31291 [02:29<00:00, 209.88it/s]


0.07987630767533206


In [8]:
def ndcg_at_k(y_true, y_pred_ranked, k):
    # Obtener los índices de los k elementos principales
    top_k_indices = y_pred_ranked[:k]
    # Obtener las relevancias correspondientes a los índices
    relevances = y_true[top_k_indices]
    # Calcular los descuentos y la posición ideal de relevancia
    discounts = np.log2(np.arange(2, len(relevances) + 2))
    ideal_relevances = np.sort(y_true)[::-1]
    ideal_dcg = np.sum(ideal_relevances[:k] / discounts)
    # Calcular el nDCG
    dcg = np.sum(relevances / discounts)
    ndcg = dcg / ideal_dcg if ideal_dcg > 0 else 0.0
    return ndcg

In [30]:
from sklearn.metrics import ndcg_score

k = 10

true_targets_list = []
pred_item_ids_list = []

for tst_id in tqdm(range(len(test_preds))):
    text_encoded = test_preds[tst_id]

    distances = cdist([text_encoded], pred, metric='cosine')
    # sorted_ids = np.argsort(distances)[0][::-1] # Ordenamos de peor a mejor distancia
    # pred_item_id = train_dev_data.iloc[sorted_ids]["id_item"].drop_duplicates().values
    train_dev_data["cosine_prox"] = 1-distances[0] # El NDCG ordena de mayor a menor
    pred_item_dist = train_dev_data.groupby("id_item")["cosine_prox"].mean().values

    tst_item_id = test_data.iloc[tst_id]["id_item"]

    true_target = np.zeros(len(pred_item_dist)) 
    true_target[tst_item_id]=1
    true_targets_list.append(true_target)
    # true_targets_list.append(pred_item_id == tst_item_id)
    pred_item_ids_list.append(pred_item_dist)

true_targets = np.array(true_targets_list)
pred_item_ids = np.array(pred_item_ids_list)

# Calcular nDCG at 10 utilizando la función ndcg_score de scikit-learn
ndcg_scores = ndcg_score(true_targets, pred_item_ids, k=k)
ndcg_scores

100%|██████████| 3934/3934 [01:22<00:00, 47.87it/s]


0.17459540526516643

In [13]:
import numpy as np

# Texto de ejemplo a codificar y buscar
input_text = "quiero comer un cachopo con cecina y queso de cabra"

text_seq = tokenizer.batch_encode_plus([input_text],  max_length=padding, truncation=True, padding='max_length', return_tensors='tf')
text_enco = text_seq["input_ids"].numpy()
text_mask = text_seq["attention_mask"].numpy()
text_encoded = keras_model.predict([text_enco, text_mask], verbose=0)

distances = cdist(text_encoded, pred, metric='cosine')
min_distance_indices = np.argsort(distances)[0][:5]  # Obtiene los índices de los 5 elementos más cercanos

for idx in min_distance_indices:
    distance = round(distances[0][idx], 2)  # Redondea la distancia a 2 dígitos
    print(f'[{distances[0][idx]:0.2f}][{train_dev_data["name"][idx]}] -> {train_dev_data.iloc[idx]["text"]} (Distancia: {distance})')


[0.06][Living Barcelona 1925] -> solomillo de ternera con queso de cabra exelente sabor y presentacion sitio en la rambla de barelona para no perdelseo (Distancia: 0.06)
[0.07][La Pizza del Born] -> les recomiendo la de perejil y queso de cabra o pesto la comida esta riquisima al igual que ka sangria gracias (Distancia: 0.07)
[0.07][Bruc33Tapas] -> deliciosas tapas las croquetas de rabo exquisitas muy buena ensalada de queso de cabra y los dados de ternera suaves y jugosos buen lugar para repetir acogedor y agradable (Distancia: 0.07)
[0.08][Bar Bodega l'Electricitat] -> pedimos ensalada de queso de cabra carpaccio de ternera y espinacas todo resulto estar riquisimo recomiendo el lugar (Distancia: 0.08)
[0.08][Restaurant Vegetalia (Plaza Fossar de les Moreres)] -> la comida estupenda la fajita de espinacas con queso de cabra y el librito de seitan muy buenos buena atencion y trato un sitio para repetir sin duda (Distancia: 0.08)


In [2]:
# Sentences we want sentence embeddings for
sentences = ['this is a sample phrase with eighty words']

# Tokenize sentences
encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='tf')

# Compute token embeddings
model_output = model(encoded_input)

model_output[0][0].shape
encoded_input[0].tokens

['<s>',
 '▁this',
 '▁is',
 '▁a',
 '▁sample',
 '▁phrase',
 '▁with',
 '▁eight',
 'y',
 '▁words',
 '</s>']

In [3]:
import tensorflow as tf

model = tf.keras.models.Sequential()

model.add(tf.keras.layers.Conv2D(32, 3, activation='relu', padding='same', input_shape=(32, 32,3)))
model.add(tf.keras.layers.Conv2D(32, 3, activation='relu', padding='same'))
model.add(tf.keras.layers.AveragePooling2D())

model.add(tf.keras.layers.Conv2D(64, 3, activation='relu', padding='same'))
model.add(tf.keras.layers.Conv2D(64, 3, activation='relu', padding='same'))
model.add(tf.keras.layers.AveragePooling2D())

model.add(tf.keras.layers.Dense(2024))

model.add(tf.keras.layers.Dense(1024, activation='relu'))
model.add(tf.keras.layers.Dense(512, activation='relu'))
model.add(tf.keras.layers.Dense(1))

model.compile(loss='mean_squared_error',
              optimizer=tf.keras.optimizers.RMSprop(learning_rate=0.001))

model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_8 (Conv2D)           (None, 32, 32, 32)        896       
                                                                 
 conv2d_9 (Conv2D)           (None, 32, 32, 32)        9248      
                                                                 
 average_pooling2d_4 (Averag  (None, 16, 16, 32)       0         
 ePooling2D)                                                     
                                                                 
 conv2d_10 (Conv2D)          (None, 16, 16, 64)        18496     
                                                                 
 conv2d_11 (Conv2D)          (None, 16, 16, 64)        36928     
                                                                 
 average_pooling2d_5 (Averag  (None, 8, 8, 64)         0         
 ePooling2D)                                          