# Oliv.ia testing by user reviews

## Libraries

In [18]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from tqdm import tqdm

## Data loading

In [None]:
# Business metadata and the validation queries stay as DataFrames.
yelp_business = pd.read_parquet(
    "../datasets/yelp_business.parquet"
)
yelp_queries = pd.read_csv(
    "../datasets/yelp_validation.csv"
)

# Embedding tables are converted to plain NumPy arrays right after loading.
reviews_emb = (
    pd.read_csv("../datasets/validation_reviews_emb.csv")
    .to_numpy()
)
queries_emb = (
    pd.read_csv("../datasets/validation_queries_emb.csv")
    .to_numpy()
)
features_emb = (
    pd.read_parquet("../datasets/features_embeddings.parquet")
    .to_numpy()
)

In [3]:
# Empty results table: one row per experiment is filled in later; the
# recall_<k> columns are created up front, each initialised with False.
metrics = pd.DataFrame(columns = ["comparison", "filters"]).assign(
    **{f"recall_{k}": False for k in (1, 5, 10, 50)}
)

## Functions

In [24]:
def save_recalls(queries_data, i, comparison, filters):
    """Write one experiment's labels and recall values into row ``i`` of ``metrics``.

    ``comparison`` and ``filters`` are stored as given. For each k in
    1, 5, 10, 50 the mean of ``queries_data["top<k>"]`` is stored in the
    ``recall_<k>`` column. The module-level ``metrics`` frame is modified in
    place and nothing is returned.
    """
    def row_cells():
        # Generated lazily so every value is computed right before it is
        # written, in the same order as the columns are filled.
        yield "comparison", comparison
        yield "filters", filters
        for cutoff in (1, 5, 10, 50):
            yield f"recall_{cutoff}", queries_data[f"top{cutoff}"].mean()

    for column, value in row_cells():
        metrics.loc[i, column] = value

In [19]:
def top_similarities(query_embedding, comparison_embeddings, top_n):
    """Return the indices of the ``top_n`` rows most similar to the query.

    Cosine similarity is computed between ``query_embedding`` and every row of
    ``comparison_embeddings``; only the similarities of the first query row are
    used. The returned indices are ordered from most to least similar. When
    ``top_n`` exceeds the number of rows, all indices are returned.

    A ``top_n`` of zero or less yields an empty result. (Slicing with
    ``[-top_n:]`` would return *every* index for ``top_n == 0`` because
    ``-0`` is ``0``.)
    """
    cosine_similarities = cosine_similarity(query_embedding, comparison_embeddings)[0]
    # Ascending argsort reversed gives best-first order; slicing from the front
    # keeps the same elements as taking the tail and reversing it.
    best_first = cosine_similarities.argsort()[::-1]
    return best_first[:max(top_n, 0)]

In [None]:
def _parse_categories(raw):
    """Return the set of stripped, lower-cased category names in ``raw``.

    ``raw`` is a comma-separated string; any non-string value gives ``None``.
    """
    if not isinstance(raw, str):
        return None
    return {cat.strip().lower() for cat in raw.split(",")}


def ur_test_model(model: tuple, queries_emb, features_emb, reviews_emb, business_data, queries_data, filters: tuple = None, first_top_n = 100):
    """Mark, for every query, whether its target business is retrieved in the top k.

    For each query ``i`` the candidate businesses are optionally restricted to
    those sharing the target business's city and/or at least one category,
    ranked by cosine similarity against the first-stage embeddings, optionally
    re-ranked with the other embedding matrix, and the target's presence in the
    top 1, 5, 10 and 50 results is recorded.

    Args:
        model: Tuple whose first element is ``"features"`` or ``"reviews"`` and
            selects the first-stage embedding matrix. When the tuple has more
            than one element, the ``first_top_n`` shortlist is re-ranked with
            the *other* matrix and the best 50 are kept (the second element's
            value itself is not inspected).
        queries_emb: Query embeddings, one row per query.
        features_emb: Feature embeddings; rows are indexed positionally in
            step with ``business_data`` rows.
        reviews_emb: Review embeddings; indexed the same way.
        business_data: DataFrame with a ``business_id`` column, plus ``city``
            and ``categories`` columns when the matching filter is requested.
        queries_data: DataFrame with a ``business_id`` column. Rows are
            addressed with ``.loc[i]`` for ``i`` in ``range(len(queries_emb))``,
            so its index labels are expected to be 0..n-1.
        filters: Optional collection that may contain ``"city"`` and/or
            ``"categories"``.
        first_top_n: Size of the first-stage shortlist.

    Returns:
        ``queries_data`` itself, modified in place: the boolean columns
        ``top1``, ``top5``, ``top10`` and ``top50`` are reset and filled.

    Raises:
        ValueError: If ``model[0]`` is neither ``"features"`` nor ``"reviews"``.
        IndexError: If a filter is active and a query's target business is
            missing from ``business_data``.

    A query whose filters leave no candidate (for example a target without a
    usable category string) is counted as a miss rather than raising.
    """
    if not model or model[0] not in ("features", "reviews"):
        raise ValueError(f'model[0] must be "features" or "reviews", got {model!r}')

    cutoffs = (1, 5, 10, 50)
    for k in cutoffs:
        queries_data[f"top{k}"] = False

    if model[0] == "features":
        first_emb, second_emb = features_emb, reviews_emb
    else:
        first_emb, second_emb = reviews_emb, features_emb

    use_city = bool(filters) and "city" in filters
    use_categories = bool(filters) and "categories" in filters

    # Loop-invariant lookups: extracted and parsed once instead of per query.
    id_series = business_data["business_id"]
    business_ids = id_series.to_numpy()
    city_series = business_data["city"] if use_city else None
    category_sets = None
    if use_categories:
        category_sets = [_parse_categories(raw) for raw in business_data["categories"]]

    for i in tqdm(range(len(queries_emb))):

        target_business_id = queries_data.loc[i, "business_id"]

        if use_city or use_categories:
            matches = np.flatnonzero((id_series == target_business_id).to_numpy())
            if len(matches) == 0:
                raise IndexError(
                    f"business_id {target_business_id!r} of query {i} not found in business_data"
                )
            # With duplicated ids the first matching row supplies city/categories.
            target_pos = matches[0]

            if use_city:
                target_city = city_series.iloc[target_pos]
                candidate_idx = np.flatnonzero((city_series == target_city).to_numpy())
            else:
                candidate_idx = np.arange(len(business_ids))

            if use_categories:
                target_cats = category_sets[target_pos] or set()
                kept = []
                for pos in candidate_idx:
                    cats = category_sets[pos]
                    if cats is not None and not target_cats.isdisjoint(cats):
                        kept.append(pos)
                candidate_idx = np.array(kept, dtype=np.intp)

            if len(candidate_idx) == 0:
                # Nothing survives the filters, so the target cannot be retrieved.
                continue

            candidate_emb = first_emb[candidate_idx]
            candidate_ids = business_ids[candidate_idx]
        else:
            # No usable filter: rank against everything without copying it.
            candidate_idx = None
            candidate_emb = first_emb
            candidate_ids = business_ids

        query = queries_emb[i].reshape(1, -1)
        top_n = top_similarities(query, candidate_emb, top_n = first_top_n)

        if len(model) > 1:
            # Map shortlist positions back to rows of the full second-stage matrix.
            shortlist = top_n if candidate_idx is None else candidate_idx[top_n]
            pseudo_top_n = top_similarities(query, second_emb[shortlist], top_n = 50)
            top_n = top_n[pseudo_top_n]

        ranked_ids = candidate_ids[top_n]
        for k in cutoffs:
            if target_business_id in ranked_ids[:k]:
                queries_data.loc[i, f"top{k}"] = True

    return queries_data

In [None]:
# Experiment 0: rank by review embeddings only, no candidate filtering.
ur_test_model(
    ("reviews",),
    queries_emb,
    features_emb,
    reviews_emb,
    yelp_business,
    yelp_queries,
    filters = None,
)
save_recalls(yelp_queries, 0, "Reviews", "No")
metrics.head(12)

Unnamed: 0,comparison,filters,recall_1,recall_5,recall_10,recall_50
0,Reviews,No,0.001,0.0036,0.0049,0.0177


In [28]:
# Experiment 1: review embeddings, candidates restricted to the target's city.
ur_test_model(
    ("reviews",),
    queries_emb,
    features_emb,
    reviews_emb,
    yelp_business,
    yelp_queries,
    filters = ("city",),
)
save_recalls(yelp_queries, 1, "Reviews", "City")
metrics.head(12)

100%|██████████| 10000/10000 [03:44<00:00, 44.59it/s]


Unnamed: 0,comparison,filters,recall_1,recall_5,recall_10,recall_50
0,Reviews,No,0.001,0.0036,0.0049,0.0177
1,Reviews,City,0.0238,0.0787,0.1262,0.3081


In [31]:
# Experiment 2: review embeddings with both the city and categories filters.
ur_test_model(
    ("reviews",),
    queries_emb,
    features_emb,
    reviews_emb,
    yelp_business,
    yelp_queries,
    filters = ("city", "categories"),
)
save_recalls(yelp_queries, 2, "Reviews", "City & Categories")
metrics.head(12)

100%|██████████| 10000/10000 [12:50<00:00, 12.98it/s]


Unnamed: 0,comparison,filters,recall_1,recall_5,recall_10,recall_50
0,Reviews,No,0.001,0.0036,0.0049,0.0177
1,Reviews,City,0.0238,0.0787,0.1262,0.3081
2,Reviews,City & Categories,0.1405,0.335,0.4308,0.6898


In [32]:
# Experiment 3: rank by feature embeddings only, no candidate filtering.
ur_test_model(
    ("features",),
    queries_emb,
    features_emb,
    reviews_emb,
    yelp_business,
    yelp_queries,
    filters = None,
)
save_recalls(yelp_queries, 3, "Features", "No")
metrics.head(12)

100%|██████████| 10000/10000 [37:00<00:00,  4.50it/s] 


Unnamed: 0,comparison,filters,recall_1,recall_5,recall_10,recall_50
0,Reviews,No,0.001,0.0036,0.0049,0.0177
1,Reviews,City,0.0238,0.0787,0.1262,0.3081
2,Reviews,City & Categories,0.1405,0.335,0.4308,0.6898
3,Features,No,0.0,0.0002,0.0003,0.0018


In [33]:
# Experiment 4: feature embeddings, candidates restricted to the target's city.
ur_test_model(
    ("features",),
    queries_emb,
    features_emb,
    reviews_emb,
    yelp_business,
    yelp_queries,
    filters = ("city",),
)
save_recalls(yelp_queries, 4, "Features", "City")
metrics.head(12)

100%|██████████| 10000/10000 [03:17<00:00, 50.73it/s]


Unnamed: 0,comparison,filters,recall_1,recall_5,recall_10,recall_50
0,Reviews,No,0.001,0.0036,0.0049,0.0177
1,Reviews,City,0.0238,0.0787,0.1262,0.3081
2,Reviews,City & Categories,0.1405,0.335,0.4308,0.6898
3,Features,No,0.0,0.0002,0.0003,0.0018
4,Features,City,0.0101,0.037,0.0655,0.2041


In [34]:
# Experiment 5: feature embeddings with both the city and categories filters.
ur_test_model(
    ("features",),
    queries_emb,
    features_emb,
    reviews_emb,
    yelp_business,
    yelp_queries,
    filters = ("city", "categories"),
)
save_recalls(yelp_queries, 5, "Features", "City & Categories")
metrics.head(12)

100%|██████████| 10000/10000 [10:02<00:00, 16.61it/s]


Unnamed: 0,comparison,filters,recall_1,recall_5,recall_10,recall_50
0,Reviews,No,0.001,0.0036,0.0049,0.0177
1,Reviews,City,0.0238,0.0787,0.1262,0.3081
2,Reviews,City & Categories,0.1405,0.335,0.4308,0.6898
3,Features,No,0.0,0.0002,0.0003,0.0018
4,Features,City,0.0101,0.037,0.0655,0.2041
5,Features,City & Categories,0.0964,0.2643,0.357,0.5902


In [35]:
# Experiment 6: features shortlist re-ranked with reviews, no filtering.
ur_test_model(
    ("features", "reviews"),
    queries_emb,
    features_emb,
    reviews_emb,
    yelp_business,
    yelp_queries,
    filters = None,
)
save_recalls(yelp_queries, 6, "Features to Reviews", "No")
metrics.head(12)

100%|██████████| 10000/10000 [39:06<00:00,  4.26it/s]


Unnamed: 0,comparison,filters,recall_1,recall_5,recall_10,recall_50
0,Reviews,No,0.001,0.0036,0.0049,0.0177
1,Reviews,City,0.0238,0.0787,0.1262,0.3081
2,Reviews,City & Categories,0.1405,0.335,0.4308,0.6898
3,Features,No,0.0,0.0002,0.0003,0.0018
4,Features,City,0.0101,0.037,0.0655,0.2041
5,Features,City & Categories,0.0964,0.2643,0.357,0.5902
6,Features to Reviews,No,0.0002,0.0004,0.001,0.0028


In [36]:
# Experiment 7: features shortlist re-ranked with reviews, city filter.
ur_test_model(
    ("features", "reviews"),
    queries_emb,
    features_emb,
    reviews_emb,
    yelp_business,
    yelp_queries,
    filters = ("city",),
)
save_recalls(yelp_queries, 7, "Features to Reviews", "City")
metrics.head(12)

100%|██████████| 10000/10000 [04:29<00:00, 37.11it/s]


Unnamed: 0,comparison,filters,recall_1,recall_5,recall_10,recall_50
0,Reviews,No,0.001,0.0036,0.0049,0.0177
1,Reviews,City,0.0238,0.0787,0.1262,0.3081
2,Reviews,City & Categories,0.1405,0.335,0.4308,0.6898
3,Features,No,0.0,0.0002,0.0003,0.0018
4,Features,City,0.0101,0.037,0.0655,0.2041
5,Features,City & Categories,0.0964,0.2643,0.357,0.5902
6,Features to Reviews,No,0.0002,0.0004,0.001,0.0028
7,Features to Reviews,City,0.0219,0.0699,0.1121,0.249


In [37]:
# Experiment 8: features shortlist re-ranked with reviews, city and
# categories filters.
ur_test_model(
    ("features", "reviews"),
    queries_emb,
    features_emb,
    reviews_emb,
    yelp_business,
    yelp_queries,
    filters = ("city", "categories"),
)
save_recalls(yelp_queries, 8, "Features to Reviews", "City & Categories")
metrics.head(12)

100%|██████████| 10000/10000 [10:46<00:00, 15.46it/s]


Unnamed: 0,comparison,filters,recall_1,recall_5,recall_10,recall_50
0,Reviews,No,0.001,0.0036,0.0049,0.0177
1,Reviews,City,0.0238,0.0787,0.1262,0.3081
2,Reviews,City & Categories,0.1405,0.335,0.4308,0.6898
3,Features,No,0.0,0.0002,0.0003,0.0018
4,Features,City,0.0101,0.037,0.0655,0.2041
5,Features,City & Categories,0.0964,0.2643,0.357,0.5902
6,Features to Reviews,No,0.0002,0.0004,0.001,0.0028
7,Features to Reviews,City,0.0219,0.0699,0.1121,0.249
8,Features to Reviews,City & Categories,0.1407,0.3325,0.4239,0.6428


In [38]:
# Experiment 9: reviews shortlist re-ranked with features, no filtering.
ur_test_model(
    ("reviews", "features"),
    queries_emb,
    features_emb,
    reviews_emb,
    yelp_business,
    yelp_queries,
    filters = None,
)
save_recalls(yelp_queries, 9, "Reviews to Features", "No")
metrics.head(12)

100%|██████████| 10000/10000 [35:20<00:00,  4.72it/s]


Unnamed: 0,comparison,filters,recall_1,recall_5,recall_10,recall_50
0,Reviews,No,0.001,0.0036,0.0049,0.0177
1,Reviews,City,0.0238,0.0787,0.1262,0.3081
2,Reviews,City & Categories,0.1405,0.335,0.4308,0.6898
3,Features,No,0.0,0.0002,0.0003,0.0018
4,Features,City,0.0101,0.037,0.0655,0.2041
5,Features,City & Categories,0.0964,0.2643,0.357,0.5902
6,Features to Reviews,No,0.0002,0.0004,0.001,0.0028
7,Features to Reviews,City,0.0219,0.0699,0.1121,0.249
8,Features to Reviews,City & Categories,0.1407,0.3325,0.4239,0.6428
9,Reviews to Features,No,0.0004,0.0015,0.0031,0.0153


In [39]:
# Experiment 10: reviews shortlist re-ranked with features, city filter.
ur_test_model(
    ("reviews", "features"),
    queries_emb,
    features_emb,
    reviews_emb,
    yelp_business,
    yelp_queries,
    filters = ("city",),
)
save_recalls(yelp_queries, 10, "Reviews to Features", "City")
metrics.head(12)

100%|██████████| 10000/10000 [03:31<00:00, 47.29it/s]


Unnamed: 0,comparison,filters,recall_1,recall_5,recall_10,recall_50
0,Reviews,No,0.001,0.0036,0.0049,0.0177
1,Reviews,City,0.0238,0.0787,0.1262,0.3081
2,Reviews,City & Categories,0.1405,0.335,0.4308,0.6898
3,Features,No,0.0,0.0002,0.0003,0.0018
4,Features,City,0.0101,0.037,0.0655,0.2041
5,Features,City & Categories,0.0964,0.2643,0.357,0.5902
6,Features to Reviews,No,0.0002,0.0004,0.001,0.0028
7,Features to Reviews,City,0.0219,0.0699,0.1121,0.249
8,Features to Reviews,City & Categories,0.1407,0.3325,0.4239,0.6428
9,Reviews to Features,No,0.0004,0.0015,0.0031,0.0153


In [40]:
# Experiment 11: reviews shortlist re-ranked with features, city and
# categories filters.
ur_test_model(
    ("reviews", "features"),
    queries_emb,
    features_emb,
    reviews_emb,
    yelp_business,
    yelp_queries,
    filters = ("city", "categories"),
)
save_recalls(yelp_queries, 11, "Reviews to Features", "City & Categories")
metrics.head(12)

100%|██████████| 10000/10000 [10:47<00:00, 15.44it/s]


Unnamed: 0,comparison,filters,recall_1,recall_5,recall_10,recall_50
0,Reviews,No,0.001,0.0036,0.0049,0.0177
1,Reviews,City,0.0238,0.0787,0.1262,0.3081
2,Reviews,City & Categories,0.1405,0.335,0.4308,0.6898
3,Features,No,0.0,0.0002,0.0003,0.0018
4,Features,City,0.0101,0.037,0.0655,0.2041
5,Features,City & Categories,0.0964,0.2643,0.357,0.5902
6,Features to Reviews,No,0.0002,0.0004,0.001,0.0028
7,Features to Reviews,City,0.0219,0.0699,0.1121,0.249
8,Features to Reviews,City & Categories,0.1407,0.3325,0.4239,0.6428
9,Reviews to Features,No,0.0004,0.0015,0.0031,0.0153


In [41]:
# Persist the collected recall table; the row index is not written.
metrics.to_csv(
    "users_validation_metrics.csv",
    index = False,
)