In [1]:
from locale import setlocale, LC_TIME
import tensorflow.keras.backend as K
from tqdm.auto import trange
from cornac import models
import tensorflow as tf
import pandas as pd
import numpy as np
import cornac
import os

city = "gijon"
# The date format below relies on %B matching Spanish month names, which only
# works once LC_TIME is switched to a Spanish locale.
setlocale(LC_TIME, 'es_ES.UTF-8')

data_path = f"/media/nas/datasets/tripadvisor/restaurants/{city}/reviews.pkl"
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
data = pd.read_pickle(data_path)
data["date"] = pd.to_datetime(data["date"], format='%d de %B de %Y')
# Seconds since the Unix epoch (datetime64[ns] -> int64 nanoseconds -> seconds).
data["timestamp"] = data["date"].values.astype(np.int64) // 10 ** 9
# Sort oldest -> newest and keep only the most recent review of each
# (user, restaurant) pair. The sort is stable so that ties on the same date are
# resolved deterministically (the row that came last in the file wins), and the
# index is reset *after* de-duplication so it stays contiguous.
data = (
    data.sort_values("date", kind="mergesort")
    .drop_duplicates(subset=["userId", "restaurantId"], keep="last")
    .reset_index(drop=True)
)

In [None]:
from cornac.datasets import amazon_digital_music
from cornac.eval_methods import RatioSplit
from cornac.data.text import BaseTokenizer
from cornac.data import ReviewModality
# (user, item, rating) triples; the stored rating is divided by 10 before use.
scaled_ratings = data["rating"] / 10
feedback = list(zip(data["userId"], data["restaurantId"], scaled_ratings))

# (user, item, review text) triples, in the same row order as `feedback`.
review_texts = data["text"].values.tolist()
reviews = list(zip(data["userId"], data["restaurantId"], review_texts))

# With cold_start disabled, users/items unseen in training are excluded from
# the validation and test splits.
cold_start = False
ratio_split = RatioSplit(
    data=feedback,
    test_size=0.2,
    val_size=0.2,
    exclude_unknowns=not cold_start,
    verbose=True,
    seed=123,
)

In [3]:
tokenizer = BaseTokenizer()
max_vocab = 3000
max_doc_freq = 0.5


def build_review_modality(split_set, vocab=None):
    """Build a ReviewModality restricted to the (user, item) pairs of one split.

    Parameters
    ----------
    split_set
        One of ``ratio_split.train_set`` / ``val_set`` / ``test_set``; its
        ``uid_map``, ``iid_map`` and ``dok_matrix`` select the reviews to keep.
    vocab
        Vocabulary to reuse. Leave as ``None`` only for the training split, so
        that a new vocabulary is built from the training reviews.

    Returns
    -------
    The built ReviewModality.
    """
    modality = ReviewModality(
        data=reviews,
        tokenizer=tokenizer,
        vocab=vocab,
        max_vocab=max_vocab,
        max_doc_freq=max_doc_freq,
    )
    modality.build(split_set.uid_map, split_set.iid_map, split_set.dok_matrix)
    return modality


train_text = build_review_modality(ratio_split.train_set)
ratio_split.add_modalities(item_text=train_text)

# Validation and test reviews must be encoded with the *training* vocabulary:
# a vocabulary built independently per split would assign different integer ids
# to the same word, and the model's word embeddings would be looked up wrongly.
val_text = build_review_modality(ratio_split.val_set, vocab=train_text.vocab)
test_text = build_review_modality(ratio_split.test_set, vocab=train_text.vocab)
test_text

<cornac.data.text.ReviewModality at 0x7fb54a90f430>

### Create Model

In [None]:
from cornac.metrics import MAE, RMSE, Precision, Recall, NDCG, AUC, MAP, FMeasure
from cornac.hyperopt import GridSearch, RandomSearch, Discrete, Continuous

from cornac.models import Recommender


class ATTREX(Recommender):
    """Rating predictor that scores a review's words against the reviewed item.

    Every word of a (padded) review and the reviewed item are projected into a
    shared 128-dimensional space. The predicted rating is the sum, over the
    non-padding words, of ``tanh(word . item)``.

    Parameters
    ----------
    name : str
        Model name passed to ``Recommender`` and used as the Keras model name.
    learning_rate : float
        Learning rate of the Adam optimizer.
    max_iter : int
        Number of training epochs (one Keras ``fit`` call per epoch).
    batch_size : int
        Batch size used for both training and prediction.
    vocab_size : int
        Input dimension of the word embedding.
        NOTE(review): token ids produced by the text modality must be
        < vocab_size; confirm this still holds if the vocabulary adds special
        tokens on top of ``max_vocab``.
    max_text_size : int
        Reviews are truncated / zero-padded to this many tokens.
    seed : int or None
        Seed applied to TensorFlow before the network is created. ``None``
        leaves the TensorFlow seed untouched.
    embedding_dim : int
        Stored on the instance but not consulted when the network is built
        (the embedding width is fixed inside ``__get_model__``).
    train_text, val_text, test_text
        Review modalities for each split. They must expose ``user_review``
        (``{user: {item: review_id}}``) and ``batch_seq``. ``train_text`` is
        required by ``fit``; the other two are optional and determine which
        (user, item) pairs ``score`` can answer.
    trainable, verbose : bool
        Forwarded to ``Recommender``.
    """

    def __init__(self, name="ATTREX", learning_rate=5e-4, max_iter=50, batch_size=512, vocab_size=3000, max_text_size=200, seed=2032, embedding_dim=16, train_text=None, val_text=None, test_text=None, trainable=True, verbose=False):
        Recommender.__init__(self, name=name, trainable=trainable, verbose=verbose)
        self.embedding_dim = embedding_dim
        self.learning_rate = learning_rate
        self.max_text_size = max_text_size
        self.vocab_size = vocab_size
        self.batch_size = batch_size
        self.train_text, self.val_text, self.test_text = (train_text, val_text, test_text)
        self.max_iter = max_iter
        self.seed = seed
        # Index of the GPU exposed through CUDA_VISIBLE_DEVICES.
        self.gpu = 0

        # Predictions for the test reviews, ordered by review id (filled by fit).
        self.test_predictions = None
        # (user, item) -> prediction row, for every validation/test review.
        self._pair_scores = {}

    def __config_session__(self, mixed_precision=True):
        """Select the GPU, optionally enable mixed precision, and allow memory growth.

        Parameters
        ----------
        mixed_precision : bool
            When true, sets the global Keras policy to ``mixed_float16``.
        """
        # Expose only one of the available GPUs.
        os.environ["CUDA_VISIBLE_DEVICES"] = str(self.gpu)

        if mixed_precision:
            tf.keras.mixed_precision.set_global_policy('mixed_float16')

        for gpu in tf.config.experimental.list_physical_devices("GPU"):
            try:
                tf.config.experimental.set_memory_growth(gpu, True)
            except RuntimeError as error:
                # Devices cannot be reconfigured once the TF runtime has
                # initialised them; training can still proceed without it.
                if self.verbose:
                    print(f"Could not enable memory growth on {gpu.name}: {error}")

    def __get_model__(self):
        """Build and compile the Keras network.

        Inputs are ``input_text`` (token ids, length ``max_text_size``) and
        ``input_item`` (one item index). The model is compiled with Adam and an
        RMSE loss.

        Returns
        -------
        tf.keras.Model
        """

        def root_mean_squared_error(y_true, y_pred):
            return K.sqrt(K.mean(K.square(y_pred - y_true)))

        n_items = self.train_set.total_items
        # Width of the shared word/item space (`embedding_dim` is not used here).
        emb_size = 128
        use_bias = True

        text_in = tf.keras.Input(shape=(self.max_text_size,), dtype='int32', name="input_text")
        rest_in = tf.keras.Input(shape=(1,), dtype='int32', name="input_item")

        # Word branch: embedding -> two linear projections down to emb_size.
        word_embedding = tf.keras.layers.Embedding(self.vocab_size, emb_size*3, mask_zero=True)
        # 1.0 for real tokens, 0.0 for padding (token id 0); trailing axis added
        # so it broadcasts against the per-word scores.
        pad_mask = word_embedding.compute_mask(text_in)
        pad_mask = tf.expand_dims(tf.cast(pad_mask, dtype=tf.float32), -1)

        ht_emb = word_embedding(text_in)
        ht_emb = tf.keras.layers.Dense(emb_size*2, use_bias=use_bias)(ht_emb)
        ht_emb = tf.keras.layers.Dense(emb_size, use_bias=use_bias)(ht_emb)
        # Identity layer: only gives the word representation a retrievable name.
        ht_emb = tf.keras.layers.Lambda(lambda x: x, name="word_emb")(ht_emb)

        # Item branch: same structure as the word branch.
        item_embedding = tf.keras.layers.Embedding(n_items, emb_size*3, name="in_rsts")
        hr_emb = item_embedding(rest_in)
        hr_emb = tf.keras.layers.Dense(emb_size*2, use_bias=use_bias)(hr_emb)
        hr_emb = tf.keras.layers.Dense(emb_size, use_bias=use_bias)(hr_emb)
        hr_emb = tf.keras.layers.Lambda(lambda x: x, name="itm_emb")(hr_emb)

        # Dot product of every word with the item.
        scores = tf.keras.layers.Lambda(lambda x: tf.matmul(x[0], x[1], transpose_b=True))([ht_emb, hr_emb])
        # Zero the padding positions before squashing (tanh(0) == 0).
        scores = tf.keras.layers.Lambda(lambda x: x[0]*x[1])([scores, pad_mask])
        scores = tf.keras.layers.Activation("tanh", name="dotprod")(scores)
        scores = tf.keras.layers.Dropout(.4)(scores)

        # Sum the per-word contributions; the final layer forces float32 output
        # even when the mixed-precision policy is active.
        summed = tf.keras.layers.Lambda(lambda x: tf.math.reduce_sum(x, 1),  name="sum")(scores)
        model_out = tf.keras.layers.Activation("linear", name="out", dtype='float32')(summed)

        model = tf.keras.models.Model(inputs=[text_in, rest_in], outputs=[model_out], name=self.name)

        optimizer = tf.keras.optimizers.Adam(self.learning_rate)
        model.compile(loss=root_mean_squared_error, optimizer=optimizer)

        return model

    @staticmethod
    def _review_frame(text_modality):
        """Flatten ``text_modality.user_review`` into one (user, item, review) row per review."""
        rows = [
            (user, item, review_id)
            for user, item_reviews in text_modality.user_review.items()
            for item, review_id in item_reviews.items()
        ]
        return pd.DataFrame(rows, columns=["user", "item", "review"])

    def _encode_reviews(self, text_modality, review_ids):
        """Return the token-id matrix of the given reviews, padded to ``max_text_size``."""
        sequences = text_modality.batch_seq(review_ids, max_length=self.max_text_size)
        return tf.keras.utils.pad_sequences(sequences, maxlen=self.max_text_size, padding="post")

    def _predict_reviews(self, text_modality):
        """Run the trained network on every review held by ``text_modality``.

        Returns
        -------
        (pairs, predictions)
            ``pairs`` is a list of (user, item) tuples and ``predictions`` the
            matching model outputs, both ordered by ascending review id.
        """
        frame = self._review_frame(text_modality)
        if frame.empty:
            return [], np.empty((0, 1), dtype="float32")
        frame = frame.sort_values("review").reset_index(drop=True)

        tokens = self._encode_reviews(text_modality, frame["review"])
        inputs = tf.data.Dataset.from_tensor_slices(
            {"input_text": tokens, "input_item": frame["item"].to_numpy()}
        )
        inputs = inputs.batch(self.batch_size).prefetch(tf.data.AUTOTUNE)
        predictions = self.model.predict(inputs, verbose=0)
        return list(zip(frame["user"], frame["item"])), predictions

    def fit(self, train_set, val_set=None):
        """Train the network on the training reviews and cache predictions.

        After training, the model is run once over every review in
        ``val_text`` and ``test_text`` so that ``score`` is a dictionary lookup.

        Parameters
        ----------
        train_set
            Cornac training set; its ``uir_tuple`` supplies the ratings.
        val_set
            Forwarded to ``Recommender.fit``. It is not used as Keras
            validation data yet (see the TODO below).

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``train_text`` is missing or holds no review.
        """
        Recommender.fit(self, train_set, val_set)
        if self.train_text is None:
            raise ValueError("ATTREX needs `train_text` (the review modality of the training split) to be fitted.")

        # Config session
        self.__config_session__()
        # Seed TensorFlow before the layers are created so that weight
        # initialisation and dropout are reproducible.
        if self.seed is not None:
            tf.random.set_seed(self.seed)

        # Create Model
        self.model = self.__get_model__()

        # Text data: one row per training review, with its rating attached by
        # an explicit (user, item) join rather than by positional alignment.
        review_frame = self._review_frame(self.train_text)
        if review_frame.empty:
            raise ValueError("`train_text` holds no review for the training split.")
        rating_frame = pd.DataFrame({
            "user": train_set.uir_tuple[0],
            "item": train_set.uir_tuple[1],
            "rating": train_set.uir_tuple[2],
        })
        train_frame = (
            review_frame.merge(rating_frame, on=["user", "item"], how="inner")
            .sort_values(["user", "item"])
            .reset_index(drop=True)
        )
        train_tokens = self._encode_reviews(self.train_text, train_frame["review"])

        train_tfset = tf.data.Dataset.from_tensor_slices(
            ((train_tokens, train_frame["item"].to_numpy()), train_frame["rating"].to_numpy())
        )
        train_tfset = train_tfset.batch(self.batch_size).cache().prefetch(tf.data.AUTOTUNE)

        # TODO: build a validation dataset from `val_set` / `val_text`.
        val_tfset = None

        # Training loop: one Keras epoch per iteration so the tqdm bar can show
        # the latest loss.
        loop = trange(self.max_iter)
        for n_epoch in loop:
            epoch_hist = self.model.fit(train_tfset, epochs=1, validation_data=val_tfset, verbose=1)
            log_line = f', train_loss: {epoch_hist.history["loss"][-1]:0.4f}'
            if val_tfset is not None: log_line+= f', val_loss: {epoch_hist.history["val_loss"][-1]:0.4f} '
            loop.set_postfix_str(log_line,refresh=True)

        loop.close()

        # Pre-compute the predictions for every validation and test review so
        # that evaluation does not run the network pair by pair.
        self._pair_scores = {}
        self.test_predictions = None
        if self.val_text is not None:
            val_pairs, val_predictions = self._predict_reviews(self.val_text)
            self._pair_scores.update(zip(val_pairs, val_predictions))
        if self.test_text is not None:
            test_pairs, self.test_predictions = self._predict_reviews(self.test_text)
            self._pair_scores.update(zip(test_pairs, self.test_predictions))

        return self

    def score(self, user_id, item_id=None):
        """Return the cached prediction for a (user, item) pair.

        Parameters
        ----------
        user_id : int
            User index.
        item_id : int, optional
            Item index. Scoring all items for a user is not implemented: when
            omitted, the placeholder ``[0]`` is returned.

        Returns
        -------
        The model output for the review written by ``user_id`` about
        ``item_id`` (one row of the prediction matrix).

        Raises
        ------
        cornac.exception.ScoreException
            If the pair has no review in ``val_text`` / ``test_text``.
        """
        if item_id is None:
            # Placeholder: the model needs a review text, so it cannot rank
            # arbitrary items for a user.
            return [0]

        from cornac.exception import ScoreException

        try:
            return self._pair_scores[(user_id, item_id)]
        except KeyError:
            raise ScoreException(
                f"No validation/test review available for (user={user_id}, item={item_id})"
            )


# Expose the class as `cornac.models.ATTREX` so it is instantiated the same way
# as the built-in models.
setattr(models, "ATTREX", ATTREX)
from cornac.models import ATTREX

K.clear_session()

# Evaluation metrics. Only rating metrics are enabled; `metrics_k` is the cut-off
# that ranking metrics (Precision@k, Recall@k, NDCG@k, ...) would use.
metrics_k = 100
metrics = [MAE(), RMSE()]

# Keyword arguments shared by every ATTREX instance created in this cell.
attrex_kwargs = dict(
    vocab_size=max_vocab,
    learning_rate=1e-4,
    batch_size=512,
    seed=10,
    train_text=train_text,
    val_text=val_text,
    test_text=test_text,
)

# Stand-alone instances (not part of the experiment below).
m_attrex = cornac.models.ATTREX(max_iter=1, **attrex_kwargs)
m_cdl = cornac.models.CDL(max_iter=5, vocab_size=max_vocab)

# Compare ATTREX against biased matrix factorisation on the test split.
compared_models = [
    cornac.models.ATTREX(max_iter=10, **attrex_kwargs),
    cornac.models.MF(k=10, max_iter=25, learning_rate=0.01, lambda_reg=0.02, use_bias=True, seed=123),
]
cornac.Experiment(
    eval_method=ratio_split,
    models=compared_models,
    metrics=metrics,
    user_based=False,
    show_validation=False,
).run()

In [18]:
# Random search over the ATTREX learning rate and batch size, selecting the
# configuration by RMSE.
search_space = [
    Continuous("learning_rate", low=1e-5, high=1e-3),
    Discrete("batch_size", [32, 64, 128, 256, 512]),
]
base_attrex = cornac.models.ATTREX(
    max_iter=1,
    vocab_size=max_vocab,
    learning_rate=1e-4,
    batch_size=512,
    seed=10,
    train_text=train_text,
    val_text=val_text,
    test_text=test_text,
)
rs_attrex = RandomSearch(
    model=base_attrex,
    space=search_space,
    metric=RMSE(),
    eval_method=ratio_split,
    n_trails=20,  # keyword spelled exactly as in the original call
)

cornac.Experiment(
    eval_method=ratio_split,
    models=[rs_attrex],
    metrics=[MAE(), RMSE()],
    user_based=False,
    show_validation=False,
).run()


[RandomSearch_ATTREX] Training started!


In [None]:
# Report the hyper-parameters selected by the random search run above
# (`rs_attrex`). Only the searched parameters are listed, each under its real
# name; general formatting keeps small learning rates readable.
for param_name, param_value in rs_attrex.best_params.items():
    print(f"Random search: {param_name} = {float(param_value):.6g}")

In [None]:
# Models to compare: ATTREX (default hyper-parameters) against biased MF.
# The list is deliberately NOT called `models`, which would shadow the
# `cornac.models` module imported at the top of the notebook and break the
# model-registration cell on re-run.
comparison_models = [
    # ATTREX cannot be fitted without its review modalities.
    ATTREX(vocab_size=max_vocab, train_text=train_text, val_text=val_text, test_text=test_text),
    cornac.models.MF(k=10, max_iter=25, learning_rate=0.01, lambda_reg=0.02, use_bias=True, seed=123),
    # Other baselines (PMF, NeuMF, BPR, HFT, BiVAECF) can be added here through
    # `cornac.models.<Name>(...)`.
]

# Put it together in an experiment.
cornac.Experiment(eval_method=ratio_split, models=comparison_models, metrics=metrics, user_based=True).run()