In [None]:
!pip install -q pandas beautifulsoup4 sentence-transformers scipy

### Data Preparation + Labelling

In [1]:
import pandas as pd
from bs4 import BeautifulSoup
pd.set_option('display.max_colwidth', None) 

hotels_dataframe = pd.read_csv('data/hotels.csv', encoding='ISO-8859-1')

In [2]:
hotels_dataframe.columns = hotels_dataframe.columns.str.strip()

relevant_cities = ['Tokyo', 'Rome', 'Frankfurt', 'Los Angeles', 'Barcelona', 'Incheon', 'Singapore']
relevant_hotels = hotels_dataframe[hotels_dataframe['cityName'].isin(relevant_cities)].copy()

relevant_hotels['Description'] = (
    relevant_hotels['Description']
    .fillna('')
    .astype(str)
    .apply(lambda x: BeautifulSoup(x, 'html.parser').get_text())
)

possible_amenities = [
    "pool", "gym", "spa", "bar", "restaurant", "wifi", "parking",
    "beach", "downtown", "airport", "breakfast", "family", "pet"
]

relevant_hotels['Amenities'] = relevant_hotels['Description'].apply(
    lambda d: [a for a in possible_amenities if a in d.lower()]
)
relevant_hotels = relevant_hotels[relevant_hotels['Amenities'].apply(lambda x: len(x) > 0)]
relevant_hotels = relevant_hotels.iloc[:1000]

In [3]:
import random
amenity_vocab = sorted({a for lst in relevant_hotels['Amenities'] for a in lst})

def generate_queries(num_queries=50):
    query_templates = [
        "hotel with {a1} in {city}",
        "{a1} and {a2} hotel near {city}",
        "family friendly {a1} hotel in {city}",
        "cheap {a1} hotel close to {city} center",
        "romantic hotel with {a1} and {a2} in {city}",
    ]
    queries = []
    for _ in range(num_queries):
        tpl = random.choice(query_templates)
        city = random.choice(relevant_cities)
        a1, a2 = random.sample(amenity_vocab, 2)
        q = tpl.format(a1=a1, a2=a2, city=city)
        queries.append(q)
    return queries

def relevant_hotel_indices(query, docs):
    q = query.lower()
    city_in_q = next((c for c in relevant_cities if c.lower() in q), None)
    amenities_in_q = [a for a in amenity_vocab if a in q]

    relevant_indices = []
    for i, description in enumerate(docs):
        description = description.lower()

        city_match = (city_in_q is None) or (city_in_q.lower() in description)
        amenity_match = any(a in description for a in amenities_in_q) if amenities_in_q else True

        if city_match and amenity_match:
            relevant_indices.append(i)

    return relevant_indices



### Semantic Retrieval

In [None]:
import numpy as np
from sentence_transformers import SentenceTransformer

# --- helpers: retrieval ---
def encode_texts(model, texts, batch_size=64, normalize=True):
    emb = model.encode(texts, convert_to_numpy=True, batch_size=batch_size)
    if normalize:
        norms = np.linalg.norm(emb, axis=1, keepdims=True)
        emb = emb / np.maximum(norms, 1e-12)
    return emb


from scipy.spatial.distance import cdist
def retrieve_topk(query_embedding, doc_embeddings, k, metric):
    if metric == 'cosine':
        similarity_scores = doc_embeddings @ query_embedding.reshape(-1)
        topk_idices = np.argsort(-similarity_scores)[:k]
        scores = similarity_scores[topk_idices]
    else:
        if metric == 'euclidean':
            d = cdist(query_embedding.reshape(1, -1), doc_embeddings, metric='euclidean')[0]
            topk_idices = np.argsort(d)[:k]
            scores = -d[topk_idices]
        else:
            raise ValueError("unsupported metric")
    return topk_idices, scores

# --- metrics ---
def precision(relevant_indices, retrieved_indices):
    if len(relevant_indices) == 0:
        return 0.0
    intersection = len(set(relevant_indices) & set(retrieved_indices))
    return intersection / len(retrieved_indices)

# Mean Reciprocal Rank -> How early we get something that is relevant
def mrr(relevant_indices, retrieved_indices):
    for rank, idx in enumerate(retrieved_indices, start=1):
        if idx in relevant_indices:
            return 1.0 / rank
    return 0.0

# Normalized Discounted Cumulative Gain -> How highly ranked are the relevant hits
def ndcg(relevant_set, retrieved_indices):
    def dcg(relevances):
        return sum((2**r - 1)/np.log2(i+2) for i, r in enumerate(relevances))
    gains = [1 if idx in relevant_set else 0 for idx in retrieved_indices]
    ideal_gains = sorted(gains, reverse=True)
    idcg = dcg(ideal_gains)
    if idcg == 0:
        return 0.0
    return dcg(gains) / idcg

def evaluate_batch(model_name, queries, docs, k, distance_metric):
    model = SentenceTransformer(model_name)
    doc_embeddings = encode_texts(model, docs, normalize=True)
    results = {}
    for index, query in enumerate(queries):
        relevant_indices = relevant_hotel_indices(query, docs)
        query_embeddings = encode_texts(model, [query], normalize=True)[0]
        retrieved_indices, _ = retrieve_topk(query_embeddings, doc_embeddings, k, distance_metric)
        results[index] = {
            'precision': precision(relevant_indices, retrieved_indices),
            'mrr': mrr(relevant_indices, retrieved_indices),
            'ndcg': ndcg(relevant_indices, retrieved_indices)
        }
        
    agg = {metric: np.mean([r[metric] for r in results.values()]) for metric in ['precision','mrr','ndcg']}
    return agg


  from .autonotebook import tqdm as notebook_tqdm


### Ablation

In [5]:
queries = generate_queries(100)
relevant_query_indices = [relevant_hotel_indices(query, relevant_hotels) for query in queries]

results = []
distance_metrics = ["cosine", "euclidean"]
models = ["all-MiniLM-L6-v2", "multi-qa-MPNet-base-dot-v1", "sentence-t5-base"]

for model_name in models:
    for distance_metric in distance_metrics:
        print("model_name: ", model_name, "distance_metric: ", distance_metric)
        agg_metrics = evaluate_batch(
            model_name=model_name,
            queries=queries,
            docs=relevant_hotels["Description"].tolist(),
            k=10,
            distance_metric=distance_metric
        )
        row = {"model_name": model_name, "distance_metric": distance_metric}
        print(agg_metrics)
        row.update(agg_metrics)
        results.append(row)

df_results = pd.DataFrame(results)
print(df_results)


all-MiniLM-L6-v2 cosine
{'precision': np.float64(0.099), 'mrr': np.float64(0.13953968253968255), 'ndcg': np.float64(0.15358864815483458)}
all-MiniLM-L6-v2 euclidean
{'precision': np.float64(0.099), 'mrr': np.float64(0.13953968253968255), 'ndcg': np.float64(0.15358864815483458)}
multi-qa-MPNet-base-dot-v1 cosine
{'precision': np.float64(0.11600000000000002), 'mrr': np.float64(0.165), 'ndcg': np.float64(0.1982086905815973)}
multi-qa-MPNet-base-dot-v1 euclidean
{'precision': np.float64(0.11600000000000002), 'mrr': np.float64(0.165), 'ndcg': np.float64(0.1982086905815973)}
sentence-t5-base cosine
{'precision': np.float64(0.11900000000000001), 'mrr': np.float64(0.22392857142857142), 'ndcg': np.float64(0.2270322642342289)}
sentence-t5-base euclidean
{'precision': np.float64(0.11900000000000001), 'mrr': np.float64(0.22392857142857142), 'ndcg': np.float64(0.2270322642342289)}
                   model_name distance_metric  precision       mrr      ndcg
0            all-MiniLM-L6-v2          cos