# Set up

In [5]:
# Reviews CSV (updated)
%%capture
!wget https://www.dropbox.com/scl/fi/6u1yfcnnf4jqmhedx519u/Reviews.csv?rlkey=xqmvvohkq0i0k7hho79fs43b6&st=mexudbu2&dl=0
!mv Reviews.csv?rlkey=xqmvvohkq0i0k7hho79fs43b6 reviews.csv
# Metadata for each restaurant
!wget https://www.dropbox.com/scl/fi/cxckzuj81gsnlsvclqnza/metadata.json.gz?rlkey=d4xerrcwbeyt09oi01f9f4wru&st=sv6cnpzh&dl=0
!mv metadata.json.gz?rlkey=d4xerrcwbeyt09oi01f9f4wru metadata.json.gz
# LLaVa Image Descriptions
!wget https://www.dropbox.com/scl/fi/50pmwvytozpz0cl1p054f/tiny_LLaVa_images_descriptions.json.gz?rlkey=7vreygmtd16lohs3bx6yvmwdk&st=9568qz84&dl=0
!mv tiny_LLaVa_images_descriptions.json.gz?rlkey=7vreygmtd16lohs3bx6yvmwdk tiny_LLaVa_images_descriptions.json.gz

# Libraries

In [6]:
# HuggingFace requirements
!pip install transformers datasets evaluate accelerate



In [7]:
!pip install nltk



In [8]:
# HuggingFace
from transformers import T5ForConditionalGeneration, T5Tokenizer
import torch
# Data visualization and manipulation
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [9]:
# Metadata
import gzip
import json

In [10]:
# Evaluation
import copy
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk import word_tokenize
from nltk.translate import meteor

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Data Preprocessing

In [11]:
df = pd.read_csv('reviews.csv')
print(f"Tamaño dataset: {df.shape}")

# Eliminamos datos nulos ...
df = df.dropna()
df.info()

Tamaño dataset: (8334, 9)
<class 'pandas.core.frame.DataFrame'>
Index: 8038 entries, 0 to 8333
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Unnamed: 0    8038 non-null   int64 
 1   user_id       8038 non-null   object
 2   gmap_id       8038 non-null   object
 3   rating        8038 non-null   int64 
 4   text          8038 non-null   object
 5   img_url       8038 non-null   object
 6   img_filename  8038 non-null   object
 7   state         8038 non-null   object
 8   rest_id       8038 non-null   int64 
dtypes: int64(3), object(6)
memory usage: 628.0+ KB


In [12]:
# Obtenemos el json de la metadata
def parse(path):
  g = gzip.open(path, 'r')
  for l in g:
    yield json.loads(l)

metadata = list(parse('metadata.json.gz'))
metadata = metadata[0]
# Obtenemos json de descripciones de imagenes
descriptions = list(parse('tiny_LLaVa_images_descriptions.json.gz'))
descriptions = descriptions[0]

In [13]:
# indice metadata
new_metadata = []
for value in metadata.values():
    new_metadata.extend(value)
idx2metadata = {}
for data in new_metadata:
    gmap_id = data['gmap_id']
    del data['gmap_id']
    idx2metadata[gmap_id] = data
# indice descripcion
idx2description = {}
for key, data in descriptions.items():
    key = key.split("/")[-1].split(".")[0]
    idx2description[key] = data

## Review Generation


#### Flan T5

In [14]:
def generate_review(user_id, item_id, image, previous_reviews, model, tokenizer, print_prompt):
    """
    Genera una review a partir de uid, itid, y una concatenación de previous reviews. Retorna la review generada.
    """
    service_options = "N/A"
    if 'MISC' in idx2metadata[item_id] and 'Service options' in idx2metadata[item_id]['MISC']:
        service_options = ", ".join(idx2metadata[item_id]['MISC']['Service options'])

    prompt = f"""Provide a detailed and unique recommendation for the following restaurant. Use the provided details and avoid repeating information unnecessarily. Highlight the restaurant's features, customer experience, and what makes it stand out. Tailor the recommendation based on the context and information available.

### Restaurant Details ###
Name: {idx2metadata[item_id]['name']}
Average Rating: {idx2metadata[item_id]['avg_rating']}
Description: {idx2metadata[item_id]['description']}
Categories: {", ".join(idx2metadata[item_id]['category'])}
Service Options: {service_options}
Image Description: {image}

### Previous Reviews ###
{previous_reviews if previous_reviews else 'No reviews available'}

### Guideline for Writing the Recommendation ###
- Highlight the unique features of the restaurant (e.g., menu items, ambiance, service quality).
- Tailor the response based on available service options (e.g., delivery, takeaway, or dine-in).
- If relevant, mention scenarios or audiences for whom the restaurant is ideal (e.g., families, couples, groups).
- Avoid using the exact same phrasing repeatedly, and do not copy this example directly.

Now, write a tailored recommendation based on the above information, keep it concise."""
    if (print_prompt):
      print("Prompt:\n----------------------\n" + prompt + "\n----------------------\n")
    inputs = tokenizer(prompt, return_tensors="pt").input_ids
    outputs = model.generate(
        inputs,
        max_new_tokens=250,
        no_repeat_ngram_size=3,
        temperature=0.7,
        top_k=50,
        top_p=0.9
    )
    generated_review = tokenizer.decode(outputs[0], skip_special_tokens=True)
    #print("Tokenized Input:", tokenizer.decode(inputs[0]))
    #print("Model Raw Output:", tokenizer.decode(outputs[0]))
    return generated_review.strip()

def generate_review_for_user_item_pair(user_id, item_id, df, model, tokenizer, print_prompt):
    """
    Recolecta hasta 3 reviews previas del usuario y genera una review a partir de uid y itid. Retorna la review generada.
    """
    # Par usuario-restaurante específico
    row = df[(df['user_id'] == user_id) & (df['gmap_id'] == item_id)]
    row = row.iloc[0]

    rating = row.get('rating')
    if rating is None or np.isnan(rating):  # Si rating no existe, utilizamos promedio
        rating = idx2metadata[item_id]['avg_rating']
    image_description = idx2description[item_id]
    # Obtenemos reviews pasadas del restaurante con ratings similares
    restaurant_reviews = df[(df['gmap_id'] == item_id) & (df['rating'].isin([rating - 1, rating, rating + 1]))].head(3)
    # Eliminamos la review del usuario objetivo
    restaurant_reviews = restaurant_reviews[restaurant_reviews['user_id'] != user_id]
    previous_reviews = "\n* ".join(restaurant_reviews['text'])
    context = previous_reviews

    return generate_review(user_id, item_id, image_description, context, model, tokenizer, print_prompt)


In [15]:
# Cargamos modelo flat T5 de huggingface
model_name = "google/flan-t5-large"
model_flan_t5 = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer_flan_t5 = T5Tokenizer.from_pretrained(model_name)
#tokenizer.pad_token = tokenizer.eos_token

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


### Generar Explicación Con Flan-T5

### Elegir par usuario, restaurante

In [16]:
# Obtenemos row aleatoriamente
random_row = df.sample(1) # Para elegir usuario manualmente cambiar esta linea
user_id = random_row['user_id'].values[0]
item_id = random_row['gmap_id'].values[0]

In [17]:
print(idx2metadata[item_id]) # Imprimimos metadata del restaurante para saber que estamos recomendando

{'name': 'Dairy Queen', 'address': 'Dairy Queen, 1804 N 13th St, Bismarck, ND 58501', 'description': 'Soft-serve ice cream & signature shakes top the menu at this classic burger & fries fast-food chain.', 'latitude': 46.8252063, 'longitude': -100.77257279999999, 'category': ['Fast food restaurant', 'Ice cream shop'], 'avg_rating': 4, 'num_of_reviews': 333, 'price': '$', 'hours': [['Monday', '10AM–10PM'], ['Tuesday', '10AM–10PM'], ['Wednesday', '10AM–10PM'], ['Thursday', '10AM–10PM'], ['Friday', '10AM–10PM'], ['Saturday', '10AM–10PM'], ['Sunday', '10AM–10PM']], 'MISC': {'Service options': ['Outdoor seating', 'Delivery', 'Takeout'], 'Highlights': ['Fast service'], 'Popular for': ['Lunch', 'Solo dining'], 'Accessibility': ['Wheelchair accessible entrance', 'Wheelchair accessible parking lot', 'Wheelchair accessible restroom', 'Wheelchair accessible seating'], 'Offerings': ['Coffee', "Kids' menu", 'Quick bite'], 'Dining options': ['Dessert', 'Seating'], 'Amenities': ['Good for kids', 'High

In [18]:
# Generamos Explicación
generated_review = generate_review_for_user_item_pair(user_id, item_id, df, model_flan_t5, tokenizer_flan_t5)
print(f"\n\nExplicación generada: \n{generated_review}")
print(f"\n\nExplicación real: \n{df[(df['user_id'] == user_id) & (df['gmap_id'] == item_id)]['text'].values[0]}")

TypeError: generate_review_for_user_item_pair() missing 1 required positional argument: 'print_prompt'



1.   Raffel, C., Shazeer, N., Roberts, A., Lee, K., Narang, S., Matena, M., ... & Liu, P. J. (2020). Exploring the limits of transfer learning with a unified text-to-text transformer. J. Mach. Learn. Res., 21(140), 1-67.
https://huggingface.co/google-t5/t5-base#uses




## HuggingFace SMOL

In [None]:
def generate_review_hugging_face(user_id, item_id, image, previous_reviews, model, tokenizer, print_prompt):
    """
    Genera una review a partir de uid, itid, y una concatenación de previous reviews. Retorna la review generada.
    """
    service_options = ""
    if 'MISC' in idx2metadata[item_id] and 'Service options' in idx2metadata[item_id]['MISC']:
        service_options = ", ".join(idx2metadata[item_id]['MISC']['Service options'])

    prompt = f"""Provide a detailed and unique recommendation for the following restaurant. Use the provided details and avoid repeating information unnecessarily. Highlight the restaurant's features, customer experience, and what makes it stand out. Tailor the recommendation based on the context and information available.

### Restaurant Details ###
Name: {idx2metadata[item_id]['name']}
Average Rating: {idx2metadata[item_id]['avg_rating']}
Description: {idx2metadata[item_id]['description']}
Categories: {", ".join(idx2metadata[item_id]['category'])}
Service Options: {service_options}
Image Description: {image}

### Previous Reviews ###
{previous_reviews if previous_reviews else 'No reviews available'}

### Guideline for Writing the Recommendation ###
- Highlight the unique features of the restaurant (e.g., menu items, ambiance, service quality).
- Tailor the response based on available service options (e.g., delivery, takeaway, or dine-in).
- If relevant, mention scenarios or audiences for whom the restaurant is ideal (e.g., families, couples, groups).
- Avoid using the exact same phrasing repeatedly, and do not copy this example directly.

Now, write a tailored recommendation based on the above information, keep it concise."""
    if (print_prompt):
      print("Prompt:\n----------------------\n" + prompt + "\n----------------------\n")
    messages = [{"role": "user", "content": prompt}]
    input_text= hugging_face_tokenizer.apply_chat_template(messages, tokenize=False)
    inputs = hugging_face_tokenizer.encode(input_text, return_tensors="pt").to(device)
    outputs = hugging_face_model.generate(inputs, max_new_tokens=250, temperature=0.2, top_p=0.9, do_sample=True)
    result = tokenizer.decode(outputs[0][len(inputs[0]):], skip_special_tokens=True)
    return result.strip()



def generate_review_for_user_item_pair_hugging_face(user_id, item_id, df, model, tokenizer, print_prompt):
    """
    Recolecta hasta 3 reviews previas del usuario y genera una review a partir de uid y itid. Retorna la review generada.
    """
    # Par usuario-restaurante específico
    row = df[(df['user_id'] == user_id) & (df['gmap_id'] == item_id)]
    row = row.iloc[0]

    rating = row.get('rating')
    if rating is None or np.isnan(rating):  # Si rating no existe, utilizamos promedio
        rating = idx2metadata[item_id]['avg_rating']
    image_url = row['img_url']
    image_description = descriptions.get(image_url, "NA")
    # Obtenemos reviews pasadas del usuario con ratings similares
    user_reviews = df[(df['user_id'] == user_id) & (df['rating'].isin([rating - 1, rating, rating + 1]))].head(3)
    # Eliminamos la review del restaurant objetivo
    user_reviews = user_reviews[user_reviews['gmap_id'] != item_id]
    previous_reviews = "\n* ".join(user_reviews['text'])
    context = previous_reviews

    return generate_review_hugging_face(user_id, item_id, image_description, context, model, tokenizer, print_prompt)


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
checkpoint = "HuggingFaceTB/SmolLM2-1.7B-Instruct"

device = "cpu" # for GPU usage or "cpu" for CPU usage
hugging_face_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
hugging_face_model = AutoModelForCausalLM.from_pretrained(checkpoint).to(device)


### Generar Explicación con SmolLM

### Definir par usuario, restaurante

In [None]:
# Obtenemos row aleatoriamente
random_row = df.sample(1) # Para elegir usuario manualmente cambiar esta linea
user_id = random_row['user_id'].values[0]
item_id = random_row['gmap_id'].values[0]

In [None]:
print(idx2metadata[item_id]) # Imprimimos metadata del restaurante para saber que estamos recomendando

In [None]:
# Generamos review
generated_review = generate_review_for_user_item_pair_hugging_face(user_id, item_id, df, hugging_face_model, hugging_face_tokenizer)
print(f"\n\nReview generada: \n{generated_review}")
print(f"\n\nReview real: \n{df[(df['user_id'] == user_id) & (df['gmap_id'] == item_id)]['text'].values[0]}")

1. Allal, L. B., Lozhkov, A., Bakouch, E., Blázquez, G. M., Tunstall, L., Piqueres, A., Marafioti, A., Zakka, C., von Werra, L., & Wolf, T. (2024). SmolLM2 - with great data, comes great performance.


# Evaluation


In [34]:
# https://www.digitalocean.com/community/tutorials/automated-metrics-for-evaluating-generated-text
def calculate_bleu(candidate, reference):
    '''
    candidate, reference: generated and ground-truth sentences
    '''
    reference = word_tokenize(reference)
    candidate = word_tokenize(candidate)
    smoothing = SmoothingFunction().method1
    score = sentence_bleu([reference], candidate, smoothing_function=smoothing)
    return round(score, 4)

def calculate_meteor(candidate, reference):
  '''
  candidate, reference: tokenized list of words in the sentence
  '''
  reference = word_tokenize(reference)
  candidate = word_tokenize(candidate)
  meteor_score = round(meteor([candidate],reference), 4)
  return meteor_score

### Evaluation of single case

In [35]:
# Buscamos review del user a ese item
user_reviews = df[(df['user_id'] == user_id) & (df['gmap_id'] == item_id)]
print(f"BLEU: {calculate_bleu(generated_review, user_reviews['text'].values[0])}")
print(f"METEOR: {calculate_meteor(generated_review, user_reviews['text'].values[0])}")

BLEU: 0.0036
METEOR: 0.1153


### Flan

In [36]:
# Generamos reviews para 20 pares de user id e item id de manera aleatoria y evaluamos
num_pairs = 20
bleu_scores = []
meteor_scores = []
for i in range(num_pairs):
    print(f"Generando {i}")
    random_row = df.sample(1)
    user_id = random_row['user_id'].values[0]
    item_id = random_row['gmap_id'].values[0]
    generated_review = generate_review_for_user_item_pair(user_id, item_id, df, model_flan_t5, tokenizer_flan_t5, False)
    user_reviews = df[(df['user_id'] == user_id) & (df['gmap_id'] == item_id)]
    bleu_scores.append(calculate_bleu(generated_review, user_reviews['text'].values[0]))
    meteor_scores.append(calculate_meteor(generated_review, user_reviews['text'].values[0]))
    if i % 10 == 0:
      print(f"BLEU promedio: {np.mean(bleu_scores)}")
      print(f"METEOR promedio: {np.mean(meteor_scores)}")
print(f"BLEU promedio FINAL: {np.mean(bleu_scores)}")
print(f"METEOR promedio FINAL: {np.mean(meteor_scores)}")


Generando 0
Prompt:
----------------------
Provide a detailed and unique recommendation for the following restaurant. Use the provided details and avoid repeating information unnecessarily. Highlight the restaurant's features, customer experience, and what makes it stand out. Tailor the recommendation based on the context and information available.

### Restaurant Details ###
Name: Espresso Portuguese American Grill
Average Rating: 4.5
Description: Portuguese-style seafood, steak & sandwiches typify the menu at this rustic European eatery.
Categories: Portuguese restaurant, American restaurant, Bar & grill, Beer hall, Catering food and drink supplier, Caterer, Liquor wholesaler, Mediterranean restaurant, Party planner, Wine bar
Service Options: Curbside pickup, Delivery, Takeout, Dine-in
Image Description: The image features a dining table with a plate of food on it, including a serving of chicken and a serving of broccoli. There are two people sitting at the table, one on the left sid

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-36-9cef8a00df41>", line 10, in <cell line: 5>
    generated_review = generate_review_for_user_item_pair(user_id, item_id, df, model_flan_t5, tokenizer_flan_t5)
  File "<ipython-input-24-2c42533d08b8>", line 63, in generate_review_for_user_item_pair
    return generate_review(user_id, item_id, image_description, context, model, tokenizer)
  File "<ipython-input-24-2c42533d08b8>", line 31, in generate_review
    outputs = model.generate(
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 116, in decorate_context
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/transformers/generation/utils.py", line 2027, in generate
    model_kwargs = self._prepare_encoder_decoder_kwargs_for_generation(
  File "/usr/local/lib/p

TypeError: object of type 'NoneType' has no len()

### SmolLM

In [None]:
# Generamos reviews para 20 pares de user id e item id de manera aleatoria y evaluamos
num_pairs = 20
bleu_scores = []
meteor_scores = []
for i in range(num_pairs):
    print(f"Generando {i}")
    random_row = df.sample(1)
    user_id = random_row['user_id'].values[0]
    item_id = random_row['gmap_id'].values[0]
    generated_review = generate_review_for_user_item_pair_hugging_face(user_id, item_id, df, hugging_face_model, hugging_face_tokenizer)
    user_reviews = df[(df['user_id'] == user_id) & (df['gmap_id'] == item_id)]
    bleu_scores.append(calculate_bleu(generated_review, user_reviews['text'].values[0]))
    meteor_scores.append(calculate_meteor(generated_review, user_reviews['text'].values[0]))
    if i % 10 == 0:
      print(f"BLEU promedio: {np.mean(bleu_scores)}")
      print(f"METEOR promedio: {np.mean(meteor_scores)}")
print(f"BLEU promedio FINAL: {np.mean(bleu_scores)}")
print(f"METEOR promedio FINAL: {np.mean(meteor_scores)}")


Generando 0
BLEU promedio: 0.0026
METEOR promedio: 0.04
Generando 1
Generando 2
Generando 3
Generando 4
Generando 5
Generando 6
Generando 7
Generando 8
Generando 9
Generando 10
BLEU promedio: 0.0035181818181818187
METEOR promedio: 0.09578181818181818
Generando 11
Generando 12
Generando 13
Generando 14
Generando 15
Generando 16
Generando 17
Generando 18
Generando 19
Generando 20
BLEU promedio: 0.0033571428571428567
METEOR promedio: 0.08410952380952381
Generando 21
Generando 22
Generando 23
Generando 24
Generando 25
Generando 26
Generando 27
Generando 28
Generando 29
BLEU promedio FINAL: 0.003263333333333333
METEOR promedio FINAL: 0.08295666666666665
