In [54]:
from src.datasets.text_datasets.RestaurantDataset import RestaurantDataset
from src.datasets.text_datasets.AmazonDataset import AmazonDataset
from src.datasets.text_datasets.POIDataset import POIDataset

# EXPERIMENT CONFIG ####################################################################################################
# Everything a reader might want to tune lives in this cell. It ends by
# building `text_dataset`, the dataset object used by the following cells.

# Dataset family and subset, normalised to lower case without spaces.
dataset = "restaurants".lower().replace(" ", "")
subset = "gijon".lower().replace(" ", "")

# Training hyper-parameters (not read anywhere else in this cell).
seed = 100
l_rate = 1e-4
n_epochs = 1000
b_size = 256
early_stop_patience = 10

# Filtering thresholds and bag-of-words size forwarded to the dataset config.
min_reviews_rst = 100
min_reviews_usr = 1
bow_pct_words = 10

# Review language inferred from the subset: Spanish and French cities are
# listed explicitly, everything else falls back to English.
if subset in ["gijon", "madrid", "barcelona"]:
    language = "es"
elif subset in ["paris"]:
    language = "fr"
else:
    language = "en"

# Text preprocessing switches.
remove_stopwords = 2  # 0, 1 or 2 (keep them, remove manually, remove automatically)
lemmatization = True
remove_accents = True
remove_numbers = True
truncate_padding = True

# Resolve the data location and the dataset class in one place. An unsupported
# name now fails here with a descriptive ValueError; previously `base_path`
# was left undefined and building `dts_cfg` raised a NameError before the
# final `raise ValueError` could be reached.
if dataset == "restaurants":
    base_path = "/media/nas/datasets/tripadvisor/restaurants/"
    dataset_cls = RestaurantDataset
elif dataset == "pois":
    base_path = "/media/nas/datasets/tripadvisor/pois/"
    language = "es"  # All of them are in Spanish
    dataset_cls = POIDataset
elif dataset == "amazon":
    base_path = "/media/nas/datasets/amazon/"
    dataset_cls = AmazonDataset
else:
    raise ValueError(f"Unknown dataset {dataset!r}; expected 'restaurants', 'pois' or 'amazon'.")

# DATASET CONFIG #######################################################################################################

dts_cfg = {
    "dataset": dataset, "subset": subset, "language": language, "seed": seed,
    "data_path": base_path, "save_path": "data/",  # alternative: base_path + "Datasets/"
    "remove_stopwords": remove_stopwords, "remove_accents": remove_accents, "remove_numbers": remove_numbers,
    "lemmatization": lemmatization,
    "min_reviews_rst": min_reviews_rst, "min_reviews_usr": min_reviews_usr,
    "min_df": 5, "bow_pct_words": bow_pct_words, "presencia": False, "text_column": "text",  # BOW
    "n_max_words": -50, "test_dev_split": .1, "truncate_padding": truncate_padding,
}

text_dataset = dataset_cls(dts_cfg)

In [92]:
import pandas as pd
import numpy as np
import re

# Tokenizer stored with the dataset, plus the review texts loaded from the
# "ALL_DATA" pickle located under the dataset path.
tokenizer = text_dataset.DATA["TEXT_TOKENIZER"]
all_data = pd.read_pickle(text_dataset.DATASET_PATH + "ALL_DATA")[["reviewId", "text"]]
review_texts = all_data.text.values

# Integer-encoded reviews, using the tokenizer's own word indices.
sequences = tokenizer.texts_to_sequences(review_texts)

# Token lists for Word2Vec: every whitespace character, "|" or "_" is a
# separator, so consecutive separators produce empty-string tokens.
# NOTE(review): "|" is literal inside a character class - confirm that
# splitting on it is intended.
# Alternative: [sentence.split() for sentence in tokenizer.sequences_to_texts(sequences)]
token_separator = re.compile(r'[\s|_]')
text_sequences = [token_separator.split(sentence) for sentence in review_texts]

In [None]:
# Preview only the first few tokenised reviews; displaying the whole corpus
# floods the cell output and bloats the saved notebook.
text_sequences[:5]

In [None]:
from gensim.models.callbacks import CallbackAny2Vec
from gensim.models import Word2Vec

class TrainingCallback(CallbackAny2Vec):
    """Print the training loss at the end of every epoch.

    `model.get_latest_training_loss()` reports the loss accumulated so far
    rather than the loss of the last epoch alone, so this callback remembers
    the previous reading and prints the difference, followed by the
    cumulative value for reference.
    """

    def __init__(self):
        self.epoch = 0  # number of epochs seen so far
        self.previous_loss = 0.0  # cumulative loss at the end of the previous epoch

    def on_epoch_end(self, model):
        """Print this epoch's loss and advance the epoch counter.

        Args:
            model: the model being trained; must expose `get_latest_training_loss()`.
        """
        cumulative_loss = model.get_latest_training_loss()
        # A reading below the previous one means the running loss was reset
        # (e.g. a new training call), so it already is this epoch's loss.
        if cumulative_loss >= self.previous_loss:
            epoch_loss = cumulative_loss - self.previous_loss
        else:
            epoch_loss = cumulative_loss
        print(f"Epoch {self.epoch:03d} => Loss:", epoch_loss, "| cumulative:", cumulative_loss)
        self.previous_loss = cumulative_loss
        self.epoch += 1

# Model hyper-parameters
embedding_dim = 128  # dimensionality of the vector space
window_size = 10     # size of the context window
epochs = 10          # number of training epochs

# Train the Word2Vec model on the tokenised reviews. `min_count=0` applies no
# frequency cut-off; `compute_loss=True` is required for the callback to be
# able to read the training loss.
# NOTE(review): with `workers=20` the run is presumably not fully
# reproducible despite `seed` - confirm against the gensim documentation.
w2v_model = Word2Vec(
    sentences=text_sequences,
    vector_size=embedding_dim,
    window=window_size,
    min_count=0,
    epochs=epochs,
    seed=seed,
    workers=20,
    compute_loss=True,
    callbacks=[TrainingCallback()],
)

# Size of the learned vocabulary.
vocabulary_size = len(w2v_model.wv.key_to_index)
print(vocabulary_size)

In [146]:
# Tokenizer vocabulary (iterating the word -> index mapping yields its words).
list(tokenizer.word_index)

['el',
 'de',
 'y',
 'que',
 'uno',
 'ser',
 'mucho',
 'en',
 'a',
 'con',
 'yo',
 'no',
 'para',
 'por',
 'buen',
 'este',
 'haber',
 'todo',
 'pero',
 'comida',
 'estar',
 'ir',
 'del',
 'precio',
 'bien',
 'tener',
 'mas',
 'comer',
 'al',
 'calidad',
 'tu',
 'plato',
 'si',
 'como',
 'su',
 'sitio',
 'restaurante',
 'servicio',
 'sin',
 'pedir',
 'gijon',
 'poder',
 'mejor',
 'mi',
 'menu',
 'postre',
 'carta',
 'hacer',
 'vez',
 'trato',
 'mesa',
 'poco',
 'rico',
 'otro',
 'camarero',
 'local',
 'ya',
 'o',
 'decir',
 'dos',
 'tanto',
 'ese',
 'atencion',
 'dia',
 'excelente',
 'volver',
 'bueno',
 'sidra',
 'siempre',
 'nuestro',
 'racion',
 'recomendable',
 'tambien',
 'cachopo',
 'probar',
 'dar',
 'algo',
 'porque',
 'carne',
 'lugar',
 'bastante',
 'cenar',
 'nada',
 'primero',
 'persona',
 'agradable',
 'personal',
 'gustar',
 'tomar',
 'ni',
 'aunque',
 'cuando',
 'cocina',
 'arroz',
 'esperar',
 'vino',
 'poner',
 'queso',
 'ver',
 'asi',
 'repetir',
 'sabor',
 'solo',
 '

In [119]:
# The word2vec indices differ from those of the Keras tokenizer.
# Build a matrix with the w2v embeddings in Keras order, with a padding row on top.

# Tokenizer words sorted by their tokenizer index. Defined here so the cell
# does not depend on a `tokenizer_wordlist` left over from another cell.
tokenizer_wordlist = sorted(tokenizer.word_index, key=tokenizer.word_index.get)

# Row k of the final matrix is looked up with the tokenizer index k, which is
# only valid when the indices are exactly 1..N.
assert [tokenizer.word_index[wrd] for wrd in tokenizer_wordlist] == list(range(1, len(tokenizer_wordlist) + 1)), \
    "tokenizer indices are expected to be contiguous and to start at 1"

# Fail with an explicit message when the tokenizer knows words that word2vec never saw.
missing_words = [wrd for wrd in tokenizer_wordlist if wrd not in w2v_model.wv.key_to_index]
if missing_words:
    raise KeyError(f"{len(missing_words)} tokenizer words are missing from the word2vec vocabulary, e.g. {missing_words[:5]}")

# word2vec row of every tokenizer word, in tokenizer order.
new_order = [w2v_model.wv.key_to_index[wrd] for wrd in tokenizer_wordlist]

# Reorder the embedding rows, then prepend an all-zero row for index 0 (padding).
w2v_embeddings = w2v_model.wv.vectors[new_order, :]
w2v_embeddings = np.vstack((np.zeros((1, w2v_embeddings.shape[1])), w2v_embeddings))

In [143]:
# Sanity check: the embedding of a word read straight from word2vec must match
# the row selected with the tokenizer index in the reordered matrix.

# Original word2vec
word = "vegetariano"
w2v_wpos = w2v_model.wv.key_to_index[word]
w2v_emb = w2v_model.wv.vectors[w2v_wpos]

# Reordered matrix, addressed with the tokenizer index
keras_wpos = tokenizer.word_index[word]
kers_emb = w2v_embeddings[keras_wpos]

print(w2v_wpos, keras_wpos)
# Largest absolute difference: 0.0 only when both vectors are identical. A
# plain mean of the signed differences can cancel out to 0 for different vectors.
print(np.max(np.abs(w2v_emb - kers_emb)))



2805 2815
0.0


In [71]:
# Ten nearest neighbours of "arroz" in the embedding space, as a qualitative check.
w2v_model.wv.most_similar(positive=["arroz"], topn=10)

[('caleya', 0.6448435187339783),
 ('echabn', 0.6083150506019592),
 ('beilis', 0.5839464664459229),
 ('salpicon', 0.5799888372421265),
 ('fabser', 0.5489625930786133),
 ('fab', 0.546390950679779),
 ('arror', 0.5432391166687012),
 ('alubia', 0.5405559539794922),
 ('capoa', 0.5381730198860168),
 ('fideaua', 0.530558705329895)]