# Validación por usuarios

### Librerías

In [2]:
import pandas as pd
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')
import ast

### Lectura datasets

In [3]:
yelp_business_text = pd.read_csv("yelp_business.csv")
yelp_business_text.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,attributes,categories
0,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","['Bubble Tea', 'Coffee & Tea', 'Bakeries']"
1,CF33F8-E6oudUQ46HnavjQ,Sonic Drive-In,615 S Main St,Ashland City,TN,37015,36.269593,-87.058943,2.0,6,"{'BusinessParking': 'None', 'BusinessAcceptsCr...","['Burgers', 'Fast Food', 'Sandwiches', 'Ice Cr..."
2,k0hlBqXX-Bt0vf1op7Jr1w,Tsevi's Pub And Grill,8025 Mackenzie Rd,Affton,MO,63123,38.565165,-90.321087,3.0,19,"{'Caters': 'True', 'Alcohol': 'Full_bar', 'Res...","['Pubs', 'Italian', 'Bars', 'American (Traditi..."
3,bBDDEgkFA1Otx9Lfe7BZUQ,Sonic Drive-In,2312 Dickerson Pike,Nashville,TN,37207,36.208102,-86.76817,1.5,10,"{'RestaurantsAttire': 'Casual', 'RestaurantsGo...","['Ice Cream & Frozen Yogurt', 'Fast Food', 'Bu..."
4,eEOYSgkmpB90uNA7lDOMRA,Vietnamese Food Truck,,Tampa Bay,FL,33602,27.955269,-82.45632,4.0,10,"{'Alcohol': 'None', 'OutdoorSeating': 'None', ...","['Vietnamese', 'Food Trucks']"


In [4]:
yelp_reviews = pd.read_csv("yelp_reviews_for_validation.csv")
yelp_reviews.head()

Unnamed: 0,review_id,user_id,business_id,stars,text
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3.0,Rated 3.0 stars out of 5. If you decide to eat...
1,saUsX_uimxRlCVr67Z4Jig,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,3.0,Rated 3.0 stars out of 5. Family diner. Had th...
2,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5.0,"Rated 5.0 stars out of 5. Wow! Yummy, differe..."
3,Sx8TMOWLNuJBWer-0pcmoA,bcjbaE6dDog4jkNY91ncLQ,e4Vwtrqf-wpJfwesgvdgxQ,4.0,Rated 4.0 stars out of 5. Cute interior and ow...
4,JrIxlS1TzJ-iCu79ul40cQ,eUta8W_HdHMXPzLBBZhL1A,04UD14gamNjLY0IDYVhHJg,1.0,Rated 1.0 stars out of 5. I am a long term fre...


In [5]:
yelp_queries = pd.read_csv("yelp_validation.csv")
yelp_queries["text"] = yelp_queries["text"].apply(ast.literal_eval)
yelp_queries.head()

Unnamed: 0,user_id,business_id,text
0,7RU_xK1tEGlUvXfe0GvtEg,hAmuto6UndVroyd_DaD-TA,[Rated 5.0 stars out of 5. Love Marcos Pizza. ...
1,mqBWACmaHflW4eh_Ofp16Q,kxX2SOes4o-D3ZQBkiMRfA,[Rated 5.0 stars out of 5. That bitter sweet d...
2,jG-t2tqFuZLdqRHNn_y9bQ,7UxNDF6ZpabC3O0-Qbg3Xw,[Rated 5.0 stars out of 5. Atmosphere was plea...
3,sHfY5a4-HPa9dhSSBvQK6Q,jQBPO3rYkNwIaOdQS5ktgQ,[Rated 5.0 stars out of 5. I can't get enough ...
4,JPhPZcdUXSSeplvPfW7Auw,Nd_3fSvYDCjM8YJdBx4Y9w,"[Rated 4.0 stars out of 5. Cozy atmosphere, fr..."


### Embeddings restaurantes

In [5]:
def review_embeddings_training(business_dataset, reviews_dataset, merge_id = "business_id", text_column = "text", return_individual_embeddings = False):
    from sentence_transformers import SentenceTransformer

    # 1. Unir datasets por ID
    print("Merging datasets...")
    reviews_df_gouped = reviews_dataset.groupby(merge_id)[text_column].apply(list).reset_index()
    df = pd.merge(business_dataset, reviews_df_gouped, on=merge_id, how="left")
    print("Merge complete")

    # 2. Obtención de los embeddings individuales
    model = SentenceTransformer('all-MiniLM-L6-v2')

    all_ind_reviews = [review for sublist in df.text for review in sublist]
    print("Total reviews to embed:", len(all_ind_reviews))

    print("Embed in process:")
    ind_reviews_emb = model.encode(all_ind_reviews,
                               show_progress_bar = True)
    print("Embedding completed")
    print("Embeddings shape:", ind_reviews_emb.shape)

    # 2. Pooling medio por restaurante
    business_pooled_emb = []
    current_emb_idx = 0

    print("Pooling embeddings...")
    for index, row in df.iterrows():
        num_rev_business = len(row[text_column])

        start_idx = current_emb_idx
        end_idx = current_emb_idx + num_rev_business

        emb_current_business = ind_reviews_emb[start_idx:end_idx]
        pooled_emb = np.mean(emb_current_business, axis = 0)

        business_pooled_emb.append(pooled_emb)

        current_emb_idx = end_idx
    print("Pooling completed")
    
    # Salida
    if return_individual_embeddings:
        return ind_reviews_emb, business_pooled_emb
    
    return business_pooled_emb

In [6]:
restaurant_pooled_emb = review_embeddings_training(yelp_business_text, yelp_reviews)


Merging datasets...
Merge complete
Total reviews to embed: 4279039
Embed in process:


Batches: 100%|██████████| 133720/133720 [1:17:32<00:00, 28.74it/s] 


Embedding completed
Embeddings shape: (4279039, 384)
Pooling embeddings...
Pooling completed


In [7]:
df_embeddings = pd.DataFrame(restaurant_pooled_emb)
df_embeddings.head()
filename = "validation_reviews_emb.csv"
df_embeddings.to_csv(filename, index = False)
print("Embeddings guardados en", filename)

Embeddings guardados en validation_reviews_emb.csv


### Embeddings queries

In [6]:
from sentence_transformers import SentenceTransformer

df = yelp_queries.copy()

# 2. Obtención de los embeddings individuales
model = SentenceTransformer('all-MiniLM-L6-v2')

all_ind_reviews_q = [review for sublist in df.text for review in sublist]
print("Total reviews to embed:", len(all_ind_reviews_q))

print("Embed in process:")
ind_reviews_emb = model.encode(all_ind_reviews_q,
                            show_progress_bar = True)
print("Embedding completed")
print("Embeddings shape:", ind_reviews_emb.shape)

queries_emb = []
current_emb_idx = 0

print("Pooling embeddings...")
for index, row in df.iterrows():
    num_rev_business = len(row["text"])

    start_idx = current_emb_idx
    end_idx = current_emb_idx + num_rev_business

    # Esta es una porción del array
    business_embs = ind_reviews_emb[start_idx:end_idx]
    pooled_emb = np.mean(business_embs, axis=0)

    # Ahora sí usamos .append() sobre la lista
    queries_emb.append(pooled_emb)

    current_emb_idx = end_idx

print("Pooling completed")

# Convertimos la lista de vectores a array
queries_emb = np.array(queries_emb)
queries_emb.shape

Total reviews to embed: 241611
Embed in process:


Batches: 100%|██████████| 7551/7551 [03:08<00:00, 40.12it/s] 


Embedding completed
Embeddings shape: (241611, 384)
Pooling embeddings...
Pooling completed


(10000, 384)

In [8]:
queries_emb = pd.DataFrame(queries_emb)
queries_emb.head()
filename = "validation_queries_emb.csv"
queries_emb.to_csv(filename, index = False)
print("Embeddings guardados en", filename)

Embeddings guardados en validation_queries_emb.csv


In [24]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from tqdm import tqdm

# Inicializar columnas
for k in [1, 5, 10, 50]:
    yelp_queries[f"top{k}"] = False

for i in tqdm(range(len(queries_emb))):
    
    # ID y ciudad del restaurante real
    target_business_id = yelp_queries.loc[i, "business_id"]
    target_row = yelp_business_text[yelp_business_text["business_id"] == target_business_id]
    
    if target_row.empty:
        continue  # salta si no hay match (por seguridad)

    target_city = target_row["city"].values[0]
    target_cats_str = target_row["categories"].values[0]
    target_cats = set([cat.strip().lower() for cat in target_cats_str.split(",")])

    # Filtrar restaurantes por ciudad
    city_mask = (yelp_business_text["city"] == target_city)

    # Filtrar por categoría (usamos apply)
    def has_overlap(cats):
        if isinstance(cats, str):
            cat_set = set([c.strip().lower() for c in cats.split(",")])
            return not target_cats.isdisjoint(cat_set)
        return False

    cat_mask = yelp_business_text["categories"].apply(has_overlap)

    # Combinar ambas condiciones
    combined_mask = (city_mask & cat_mask).values
    filtered_restaurants = yelp_business_text[combined_mask].reset_index(drop=True)
    filtered_embeddings = restaurant_pooled_emb[combined_mask]

    if filtered_embeddings.shape[0] == 0:
        continue  # saltar si no hay restaurantes filtrados

    # Calcular similitudes
    cosine_similarities = cosine_similarity(
        queries_emb[i].reshape(1, -1),
        filtered_embeddings
    )[0]

    sorted_indices = np.argsort(cosine_similarities)[::-1]

    for k in [1, 5, 10, 50]:
        top_k_indices = sorted_indices[:k]
        top_k_business_ids = filtered_restaurants.iloc[top_k_indices]["business_id"].values

        if target_business_id in top_k_business_ids:
            yelp_queries.loc[i, f"top{k}"] = True

100%|██████████| 10000/10000 [09:04<00:00, 18.36it/s]


In [25]:
accuracy_at_1 = yelp_queries["top1"].mean()
recall_at_5 = yelp_queries["top5"].mean()
recall_at_10 = yelp_queries["top10"].mean()
recall_at_50 = yelp_queries["top50"].mean()

print(f"Top-1 Accuracy: {accuracy_at_1:.2%}")
print(f"Top-5 Recall: {recall_at_5:.2%}")
print(f"Top-10 Recall: {recall_at_10:.2%}")
print(f"Top-50 Recall: {recall_at_50:.2%}")

Top-1 Accuracy: 14.05%
Top-5 Recall: 33.50%
Top-10 Recall: 43.08%
Top-50 Recall: 68.98%
