In [None]:
#%pip install chromadb 
#%pip install pandas
#%pip install sentence-transformers

In [None]:
import pandas as pd
from tqdm import tqdm
import chromadb 
from chromadb.utils import embedding_functions
import math

  from .autonotebook import tqdm as notebook_tqdm


### Prep hotel data for embedding and storage

In [2]:
# Load the raw hotel listing from the project-relative data folder.
# The file is decoded as ISO-8859-1 rather than pandas' default encoding.
hotels_df = pd.read_csv(
    filepath_or_buffer=r'data/hotels.csv',
    encoding='ISO-8859-1',
)

In [19]:
# Countries whose hotels are kept for embedding; every other country is
# dropped by the filter applied to hotels_df.
relevant_countries = [
    'Singapore',
    'France',
    'South Korea',
    'Switzerland',
    'Japan',
    'United States',
    'Spain',
]

In [20]:
# Keep only the rows whose country is in relevant_countries.
# Note the column header really is ' countyName' (leading space, "county").
hotels_df = hotels_df.loc[hotels_df[' countyName'].isin(relevant_countries)]

In [22]:
# Number of hotel rows currently held in hotels_df.
hotels_df.shape[0]

308646

In [23]:
# put all relevant hotel information into a paragraph
def format_info(attractions, description, facilities):
    """Combine a hotel's free-text fields into one document string.

    Each argument is interpolated with its default string conversion, so the
    caller may pass any value.

    Args:
        attractions: Text describing attractions near the hotel.
        description: General description of the hotel.
        facilities: Text listing the hotel's facilities.

    Returns:
        A multi-line string with three labelled sections separated by blank
        lines. The leading newline, the four-space indentation and the
        trailing indentation of the original template are preserved.

    NOTE(review): a missing value that reaches this function as float NaN is
    rendered as the literal text "nan" -- confirm that is acceptable for the
    embedded documents.
    NOTE(review): the third label used to be misspelled "faciltiies"; any
    collection populated with the old spelling still contains that text.
    """
    return (
        "\n"
        f"    Attractions near hotel: {attractions}\n\n"
        f"    Description of hotel: {description}\n\n"
        # Spelling fixed: this label is part of the text that gets embedded.
        f"    Hotel facilities: {facilities}\n"
        "    "
    )

In [24]:
# Build two parallel lists in one pass over the filtered frame:
#   hotel_info[k] - the formatted text document for the k-th hotel
#   metadata[k]   - the country / city / hotel code for the same hotel
hotel_info = []
metadata = []

progress = tqdm(hotels_df.iterrows(), total = hotels_df.shape[0], desc = 'Processing')
for _, hotel in progress:
    hotel_info.append(
        format_info(
            hotel[' Attractions'],
            hotel[' Description'],
            hotel[' HotelFacilities'],
        )
    )
    metadata.append({
        'country': hotel[' countyName'],
        'city': hotel[' cityName'],
        'hotel code': hotel[' HotelCode'],  # unique ID
    })

Processing: 100%|██████████| 308646/308646 [00:06<00:00, 48217.14it/s]


### Embeddings

In [None]:
# On-disk Chroma client; data is stored under ./chromadb_storage.
chromadb_client = chromadb.PersistentClient(path=r'./chromadb_storage')

# Embedding model used for the collection's documents and query texts.
ef = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="all-MiniLM-L6-v2",
)

In [None]:
# Open the hotel collection, creating it on first use, and bind the
# sentence-transformer embedding function to it.
collection = chromadb_client.get_or_create_collection(
    embedding_function=ef,
    name="hotel_information",  # please set this as collection name
)

In [28]:
# Ids are positional: 'id1' .. 'id<max_id>', in the same order as hotel_info.
max_id = len(hotel_info)
ids = [f'id{n}' for n in range(1, max_id + 1)]

# Send the documents to Chroma in fixed-size batches; the last batch may be
# shorter because slicing past the end of a list simply truncates.
batch_size = 500
total_batches = math.ceil(max_id / batch_size)

for batch_no in tqdm(range(total_batches), desc="Adding to Chroma", unit="batch"):
    window = slice(batch_no * batch_size, (batch_no + 1) * batch_size)
    collection.add(
        ids=ids[window],
        documents=hotel_info[window],
        metadatas=metadata[window],
    )

Adding to Chroma: 100%|██████████| 618/618 [1:48:50<00:00, 10.57s/batch]


In [30]:
# Example lookup: the two closest documents to the text 'gym', restricted by
# metadata to hotels whose country is Japan and whose city is Tokyo.
res = collection.query(
    query_texts=['gym'],
    n_results=2,
    where={"$and": [{"country": "Japan"}, {"city": "Tokyo"}]},
    include=['documents', 'metadatas'],
)

In [31]:
# Show the raw query response as the cell output.
res

{'ids': [['id65253', 'id65232']],
 'embeddings': None,
 'documents': [['\n    Attractions near hotel: Distances are displayed to the nearest 0.1 mile and kilometer. <br /> <p>Yebisu Garden Place - 0.1 km / 0.1 mi <br /> Happoen Garden - 1.6 km / 1 mi <br /> Cerulean Tower - 2.6 km / 1.6 mi <br /> Roppongi Hills - 2.8 km / 1.7 mi <br /> Shibuya Crossing - 2.8 km / 1.8 mi <br /> Tokyo Anime Center - 3 km / 1.9 mi <br /> Keio University - 3.1 km / 1.9 mi <br /> Love Hotel Hill - 3.2 km / 2 mi <br /> Omotesando Hills - 3.4 km / 2.1 mi <br /> Tokyo Midtown - 3.6 km / 2.2 mi <br /> Yoyogi National Gymnasium - 3.7 km / 2.3 mi <br /> Yoyogi Park - 3.9 km / 2.4 mi <br /> Shiba Park - 3.9 km / 2.4 mi <br /> Meiji Jingu Stadium - 4.1 km / 2.5 mi <br /> NHK Hall - 4.2 km / 2.6 mi <br /> </p><p>The nearest airports are:<br />Narita Intl. Airport (NRT) - 77.3 km / 48 mi<br /> Haneda Airport (HND) - 23.7 km / 14.7 mi<br /> </p><p>The preferred airport for The Westin Tokyo is Narita Intl. Airport (NRT