In [23]:
import tensorflow_hub as hub
import tensorflow as tf
from tqdm import tqdm
import pandas as pd
import numpy as np
import cv2

In [None]:
# Barcelona (RST) -> Paris (POI)
# NewYork (RST) -> Paris (POI)

#### Obtener los ejemplos para entrenar el modelo

In [None]:
def get_data(city_r, city_p):
    # Obtener el conjunto de datos para entrenar (comieron en r y fueron a p)
    top_n_pois = 50

    city_r = city_r.lower().replace(" ", "")
    city_p = city_p.lower().replace(" ", "")

    reviews_r_path = f"out/{city_r}/restaurants/reviews.pkl"
    reviews_p_path = f"out/{city_p}/pois/reviews.pkl"

    reviews_r = pd.read_pickle(reviews_r_path)
    reviews_p = pd.read_pickle(reviews_p_path)

    reviews_r = reviews_r[reviews_r["userId"]!=-1]
    reviews_p = reviews_p[reviews_p["userId"]!=-1]

    # Quedarse con los POIs más populares
    reviews_p_popular = reviews_p.groupby("itemId")["itemId"].count().sort_values(ascending=False).head(top_n_pois).index.values
    reviews_p = reviews_p[reviews_p["itemId"].isin(reviews_p_popular)]

    r_data = set(reviews_r["userId"].unique())
    p_data = set(reviews_p["userId"].unique())

    common_users = r_data.intersection(p_data)

    # ToDo: ¿Por que hay menos usuarios en el conjunto users.pkl que en reviews.pkl?
    # ToDo: En R, tienen que ser solo los comunes???
    reviews_r = reviews_r.loc[reviews_r["userId"].isin(common_users)]
    reviews_p = reviews_p.loc[reviews_p["userId"].isin(common_users)]

    out_data = []

    for rst_id, rst_data in reviews_r.groupby("itemId"):
        rst_users = rst_data["userId"].unique()
        poi_revws = reviews_p.loc[reviews_p["userId"].isin(rst_users)]
        poi_revws = poi_revws.groupby("userId")["itemId"].unique().reset_index()
        # ToDo: Ojo, que los usuarios de r van a más de un POI en p
        poiId, times = np.unique(np.concatenate(poi_revws["itemId"].values), return_counts=True)

        print(rst_id, len(rst_users), poiId )

    return reviews_r, reviews_p

dataset = get_data("barcelona", "paris")

#### Formas de codificar un restaurante por sus fotos

In [30]:
def get_image_encoding(path):
    try:
        # Carga la imagen desde el path.
        img = cv2.imread(path)
        # BGR a RGB
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        # Normaliza la imagen entre -1 y 1.
        img = (img / 127.5) - 1
        # Redimensiona la imagen a 150x150 píxeles.
        img = cv2.resize(img, (150, 150))
        # Agrega una dimensión adicional para el batch.
        img = np.expand_dims(img, axis=0)
    except Exception as e:
        print(path)
        print(e)
    return img

def encode_items(city, method="ImageNet", category="restaurants"):
    """Codificar cada item haciendo una media de sus imágenes"""
    # Cargar datos
    city = city.lower().replace(" ", "")
    city_path = f"out/{city}/{category}"
    items_path = f"{city_path}/items.pkl"
    reviews_path = f"{city_path}/reviews.pkl"
    items = pd.read_pickle(items_path)
    reviews = pd.read_pickle(reviews_path)

    # Combinar reviews e items
    reviews = reviews.merge(items[["itemId", "name"]], on="itemId", how="left")
    reviews["n_images"] = reviews["images"].apply(lambda x: len(x))

    # Solo items con imágenes
    reviews = reviews.loc[reviews["n_images"]>0]

    # Seleccionar el encoder
    encoder = None
    if method == "ImageNet":
        encoder = tf.keras.Sequential([hub.KerasLayer("https://tfhub.dev/google/imagenet/efficientnet_v2_imagenet1k_m/feature_vector/2", trainable=False)])
        encoder.build([None, 150, 150, 3]) # Batch input shape.
    elif method == "CLIP": 
        encoder = None
    else: 
        raise ValueError # Añadir SemPic??

    # Crear, para cada item, los vectores a partir de sus imágenes
    res = []
    for iid, idata in tqdm(reviews.groupby("itemId")):
        iname = idata["name"].unique()[0]

        # Quedarse solo con reviews con imágenes y explotar los vectores de imágenes
        idata_images = idata.explode("images")
        idata_images["image_id"] = idata_images.groupby(["itemId", "reviewId"]).cumcount()
        idata_images = idata_images.rename(columns={"images": "image_url", "image": "image"})

        idata_image_paths = idata_images.apply(lambda x: f'{city_path}/images/sd/{iid}/{x["reviewId"]}/{x["image_id"]:04d}.jpg',1).values

        img_mtx = []
        for path in idata_image_paths:
            img_enc = get_image_encoding(path)
            img_mtx.append(img_enc)
        img_mtx = np.concatenate(img_mtx)
        encodings = model.predict(img_mtx, verbose=0)
        ienc = np.mean(encodings, 0)

        res.append((iid, iname,  idata["n_images"].sum(), len(idata), ienc))
    res = pd.DataFrame(res, columns=["itemId", "name", "n_images", "n_reviews", "item_encoding"])
    return res

dataset = encode_items("gijon")
print(dataset)

  6%|▋         | 43/671 [00:10<02:40,  3.91it/s]

out/gijon/restaurants/images/sd/1640774/543354041/0000.jpg
OpenCV(4.5.5) /io/opencv/modules/imgproc/src/color.cpp:182: error: (-215:Assertion failed) !_src.empty() in function 'cvtColor'

out/gijon/restaurants/images/sd/1640774/505639455/0000.jpg
OpenCV(4.5.5) /io/opencv/modules/imgproc/src/color.cpp:182: error: (-215:Assertion failed) !_src.empty() in function 'cvtColor'






ValueError: zero-dimensional arrays cannot be concatenated

: 