In [1]:
import pandas as pd
import numpy as np
import nvgpu
import os

gpu = np.argmin([g["mem_used_percent"] for g in nvgpu.gpu_info()])
os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu)

from transformers import AutoTokenizer, TFAutoModel
import tensorflow as tf

In [None]:
# Load model from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
model = TFAutoModel.from_pretrained('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

In [92]:
data_path = 'data/RestaurantDataset/417c135d6d69672cd3dccf9f3d7bc369/ALL_DATA'
all_data = pd.read_pickle(data_path)
text_data = all_data["text"]
text_seqs = tokenizer.batch_encode_plus(text_data.tolist(),  padding=True, truncation=True, return_tensors='tf')

text_enco = text_seqs["input_ids"].numpy()
text_mask = text_seqs["attention_mask"].numpy()

text_enco_ds = tf.data.Dataset.from_tensor_slices(text_enco)
text_mask_ds = tf.data.Dataset.from_tensor_slices(text_mask)

text_dataset = tf.data.Dataset.zip((text_enco_ds, text_mask_ds))
text_dataset = text_dataset.map(lambda x, y: ((x, y),))

# text_dataset = text_enco_ds

# Modelo
text_in = tf.keras.Input(shape=(text_enco.shape[-1],), dtype='int32', name="in_text")
text_mask_in = tf.keras.Input(shape=(text_mask.shape[-1],), dtype='int32', name="in_mask")

# Un embedding por palabra
token_embeddings = model((text_in, text_mask_in))[0]

# Repetir la máscara para el tamaño del embedding
input_mask_expanded = tf.expand_dims(text_mask_in, axis=-1)
input_mask_expanded = tf.cast(input_mask_expanded, dtype=tf.float32)
input_mask_expanded = tf.tile(input_mask_expanded, [1, 1, token_embeddings.shape[-1]])

# Multiplicar cada embedding por la máscara (anular embeddings que sean 0)
masked_token_embeddings = token_embeddings * input_mask_expanded
# Sumar todos los embeddings de las palabras restantes en un solo vector
summed_token_embeddings = tf.reduce_sum(masked_token_embeddings, axis=1)
mask_sum = tf.reduce_sum(input_mask_expanded, axis=1)
mask_sum = tf.maximum(mask_sum, 1e-9)

mean_pooled_embeddings = summed_token_embeddings / mask_sum

keras_model = tf.keras.models.Model(inputs=[text_in, text_mask_in], outputs=tf.reduce_sum(token_embeddings, axis=1), name="my_model")

batch_size = 300
text_dataset = text_dataset.batch(batch_size).cache().prefetch(tf.data.AUTOTUNE)
pred = keras_model.predict(text_dataset, verbose=1)



In [95]:
from scipy.spatial.distance import cdist

# Texto de ejemplo a codificar y buscar
input_text = "quiero comer un cachopo barato"

text_seq = tokenizer.batch_encode_plus([input_text],  max_length=text_enco.shape[-1], truncation=True, padding='max_length', return_tensors='tf')
text_enco = text_seq["input_ids"].numpy()
text_mask = text_seq["attention_mask"].numpy()
text_encoded = keras_model.predict([text_enco, text_mask], verbose=0)

distances = cdist(text_encoded, pred, metric='cosine')
min_distance_idx = np.argmin(distances)
print(f'[{all_data["name"][min_distance_idx]}] -> {text_data[min_distance_idx]} ')


[SIDRERIA CANDASU] -> uno ambiente agradable buen precio plato abundante el sidra mucho buen y el pincho gratis 


In [2]:
# Sentences we want sentence embeddings for
sentences = ['this is a sample phrase with eighty words']

# Tokenize sentences
encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='tf')

# Compute token embeddings
model_output = model(encoded_input)

model_output[0][0].shape
encoded_input[0].tokens

['<s>',
 '▁this',
 '▁is',
 '▁a',
 '▁sample',
 '▁phrase',
 '▁with',
 '▁eight',
 'y',
 '▁words',
 '</s>']

In [3]:
import tensorflow as tf

model = tf.keras.models.Sequential()

model.add(tf.keras.layers.Conv2D(32, 3, activation='relu', padding='same', input_shape=(32, 32,3)))
model.add(tf.keras.layers.Conv2D(32, 3, activation='relu', padding='same'))
model.add(tf.keras.layers.AveragePooling2D())

model.add(tf.keras.layers.Conv2D(64, 3, activation='relu', padding='same'))
model.add(tf.keras.layers.Conv2D(64, 3, activation='relu', padding='same'))
model.add(tf.keras.layers.AveragePooling2D())

model.add(tf.keras.layers.Dense(2024))

model.add(tf.keras.layers.Dense(1024, activation='relu'))
model.add(tf.keras.layers.Dense(512, activation='relu'))
model.add(tf.keras.layers.Dense(1))

model.compile(loss='mean_squared_error',
              optimizer=tf.keras.optimizers.RMSprop(learning_rate=0.001))

model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_8 (Conv2D)           (None, 32, 32, 32)        896       
                                                                 
 conv2d_9 (Conv2D)           (None, 32, 32, 32)        9248      
                                                                 
 average_pooling2d_4 (Averag  (None, 16, 16, 32)       0         
 ePooling2D)                                                     
                                                                 
 conv2d_10 (Conv2D)          (None, 16, 16, 64)        18496     
                                                                 
 conv2d_11 (Conv2D)          (None, 16, 16, 64)        36928     
                                                                 
 average_pooling2d_5 (Averag  (None, 8, 8, 64)         0         
 ePooling2D)                                          