In [1]:
import tensorflow_hub as hub
import tensorflow as tf
from tqdm import tqdm
import pandas as pd
import numpy as np
import cv2
import os

2023-06-06 13:09:22.717331: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-06-06 13:09:23.980709: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-06-06 13:09:26.908342: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64
2023-06-06 13:09:26.909094: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot o

In [None]:
reviews_p_path = f"out/gijon/restaurants/reviews.pkl"
reviews = pd.read_pickle(reviews_p_path)
reviews.groupby("itemId")["itemId"].count().sort_values()

In [None]:
# Barcelona (RST) -> Paris (POI)
# NewYork (RST) -> Paris (POI)

### Obtener los ejemplos para entrenar el modelo

In [None]:
def get_data(city_r, city_p):
    # Obtener el conjunto de datos para entrenar (comieron en r y fueron a p)
    top_n_pois = 50

    city_r = city_r.lower().replace(" ", "")
    city_p = city_p.lower().replace(" ", "")

    reviews_r_path = f"out/{city_r}/restaurants/reviews.pkl"
    reviews_p_path = f"out/{city_p}/pois/reviews.pkl"

    reviews_r = pd.read_pickle(reviews_r_path)
    reviews_p = pd.read_pickle(reviews_p_path)

    reviews_r = reviews_r[reviews_r["userId"]!=-1]
    reviews_p = reviews_p[reviews_p["userId"]!=-1]

    # Quedarse con los POIs más populares
    reviews_p_popular = reviews_p.groupby("itemId")["itemId"].count().sort_values(ascending=False).head(top_n_pois).index.values
    reviews_p = reviews_p[reviews_p["itemId"].isin(reviews_p_popular)]

    r_data = set(reviews_r["userId"].unique())
    p_data = set(reviews_p["userId"].unique())

    common_users = r_data.intersection(p_data)

    # ToDo: ¿Por que hay menos usuarios en el conjunto users.pkl que en reviews.pkl?
    # ToDo: En R, tienen que ser solo los comunes???
    reviews_r = reviews_r.loc[reviews_r["userId"].isin(common_users)]
    reviews_p = reviews_p.loc[reviews_p["userId"].isin(common_users)]

    out_data = []

    for rst_id, rst_data in reviews_r.groupby("itemId"):
        rst_users = rst_data["userId"].unique()
        poi_revws = reviews_p.loc[reviews_p["userId"].isin(rst_users)]
        poi_revws = poi_revws.groupby("userId")["itemId"].unique().reset_index()
        # ToDo: Ojo, que los usuarios de r van a más de un POI en p
        poiId, times = np.unique(np.concatenate(poi_revws["itemId"].values), return_counts=True)

        print(rst_id, len(rst_users), poiId )

    return reviews_r, reviews_p

dataset = get_data("barcelona", "paris")

### Formas de codificar un restaurante por sus fotos

In [17]:
def read_image(path):
    try:
        # Carga la imagen desde el path.
        img = cv2.imread(path)
        # BGR a RGB
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        # Normaliza la imagen entre -1 y 1.
        img = (img / 127.5) - 1
        # Redimensiona la imagen a 150x150 píxeles.
        img = cv2.resize(img, (150, 150))
        # Agrega una dimensión adicional para el batch.
        img = np.expand_dims(img, axis=0)
    except Exception as e:
        print(path)
        print(e)
    return img

def encode_items(city, method="ImageNet", category="restaurants", min_images=4):
    """Codificar cada item haciendo una media de sus imágenes"""
    out_file = f"{city}_itm_emb.pkl"

    if not os.path.exists(out_file):
        # Cargar datos
        city = city.lower().replace(" ", "")
        city_path = f"out/{city}/{category}"
        items_path = f"{city_path}/items.pkl"
        reviews_path = f"{city_path}/reviews.pkl" # OJO QUE HAY DUPLICADOS
        items = pd.read_pickle(items_path)
        reviews = pd.read_pickle(reviews_path)

        # Combinar reviews e items
        reviews = reviews.merge(items[["itemId", "name"]], on="itemId", how="left")
        reviews["n_images"] = reviews["images"].apply(lambda x: len(x))

        # Solo items con imágenes
        reviews = reviews.loc[reviews["n_images"]>0]

        # Seleccionar el encoder
        encoder = None
        if method == "ImageNet":
            encoder = tf.keras.Sequential([hub.KerasLayer("https://tfhub.dev/google/imagenet/efficientnet_v2_imagenet1k_m/feature_vector/2", trainable=False)])
            encoder.build([None, 150, 150, 3]) # Batch input shape.
        elif method == "CLIP": 
            encoder = None
        else: 
            raise ValueError # Añadir SemPic??

        # Crear, para cada item, los vectores a partir de sus imágenes
        res = []
        for iid, idata in tqdm(reviews.groupby("itemId")):
            iname = idata["name"].unique()[0]

            if idata["n_images"].sum()<min_images: continue

            # Quedarse solo con reviews con imágenes y explotar los vectores de imágenes
            idata_images = idata.explode("images").drop_duplicates("reviewId")
            idata_images["image_id"] = idata_images.groupby(["itemId", "reviewId"]).cumcount()
            idata_images = idata_images.rename(columns={"images": "image_url", "image": "image"})

            idata_image_paths = idata_images.apply(lambda x: f'{city_path}/images/sd/{iid}/{x["reviewId"]}/{x["image_id"]:04d}.jpg',1).values

            img_mtx = []
            for path in idata_image_paths:
                img_data = read_image(path)
                img_mtx.append(img_data)
            img_mtx = np.concatenate(img_mtx)
            encodings = encoder.predict(img_mtx, verbose=0)
            ienc = np.mean(encodings, 0)

            res.append((iid, iname,  idata["n_images"].sum(), len(idata), ienc))
        res = pd.DataFrame(res, columns=["itemId", "name", "n_images", "n_reviews", "item_encoding"])
        res.to_pickle(f"{city}_itm_emb.pkl")
    else:
        res = pd.read_pickle(out_file)
    return res

dataset = encode_items("barcelona")
print(dataset)

        itemId                        name  n_images  n_reviews   
0       693801                   Bar Celta       203        107  \
1       693822          Restaurante Manolo        18         10   
2       693855                   Can Lluis        80         35   
3       693967                   Elisabets       316        143   
4       697396                    7 Portes      2676       1172   
...        ...                         ...       ...        ...   
6509  25338856             Velvet Room BCN         4          1   
6510  25361641       MiMi Tapas Restaurant         6          4   
6511  25362471             Mimi Restaurant         7          3   
6512  25363246           Fonda Can Portell         4          1   
6513  25386551  McDonalds - Som Multiespai         4          1   

                                          item_encoding  
0     [0.0011015198, -0.045368917, 0.06882931, -0.06...  
1     [-0.08390628, 0.0139746815, 0.18736298, -0.102...  
2     [-0.049717955, 

#### t-SNE de restaurantes

In [19]:
from sklearn.manifold import TSNE

tsne = TSNE(n_components=2)
data_2d = tsne.fit_transform(np.array(dataset['item_encoding'].tolist()))

In [12]:
from bokeh.models import LinearColorMapper
from bokeh.models import ColumnDataSource
#from bokeh.plotting import figure, show
#from bokeh.resources import INLINE

#output_notebook(INLINE)


In [13]:
source = ColumnDataSource(data=dict(
    x=data_2d[:,0],
    y=data_2d[:,1],
    name=dataset['name'],
    color=dataset['n_images']
))

TOOLTIPS = [("Name", "@name"),("n_images", "@color")]
p = figure(width=1000, height=1000, tooltips=TOOLTIPS)

lc = LinearColorMapper(palette="Greys256", low=dataset['n_images'].max(), high=dataset['n_images'].min())
p.circle('x', 'y', source=source, size=10, fill_color={"field": "color", "transform": lc})
show(p)

NameError: name 'data_2d' is not defined