In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sentence_transformers import SentenceTransformer
import joblib
import os

# --- Install Libraries ---
!pip install pandas gcsfs pyarrow numpy scikit-learn xgboost sentence-transformers joblib -q

# --- Define Paths and Download Assets ---
print("Downloading all model assets from GCS...")

bucketName = 'wanderlust-recommender-system'
localDir = 'inference_assets'

# Create a local directory to store assets
if not os.path.exists(localDir):
    os.makedirs(localDir)

# Define all the assets we need to download
assets_to_download = {
    'xgb_ranker.joblib': f'gs://{bucketName}/processed/xgbScorer.joblib',
    'hotel_content_embeddings.npy': f'gs://{bucketName}/processed/newEmbedding.npy',
    'user_factors.npy': f'gs://{bucketName}/processed/userFactor.npy',
    'hotel_factors.npy': f'gs://{bucketName}/processed/hotelFactor.npy',
    'combined_hotel_reviews.parquet': f'gs://{bucketName}/processed/combined_hotel_reviews.parquet'
}

# Download all asset files from GCS to the local directory
for filename, gcs_path in assets_to_download.items():
    local_path = os.path.join(localDir, filename)
    !gsutil cp {gcs_path} {local_path}

print('Downloading fine-tuned model from GCS ...')
# Handling the sentence transformer model folder
local_finetunedModel_path = os.path.join(localDir, 'hotel_recommender_finetuned/')
if not os.path.exists(local_finetunedModel_path):
    os.makedirs(local_finetunedModel_path)
gcs_finetunedModel_path = f'gs://{bucketName}/processed/hotel_recommender_finetuned/'
!gsutil -m cp -r {gcs_finetunedModel_path}* {local_finetunedModel_path}

print("All assets downloaded successfully.")

# --- Load All Assets into Memory ---
print("\nLoading all assets into memory...")

# Load the trained XGBoost scorer
xgb_ranker = joblib.load(os.path.join(localDir, 'xgb_ranker.joblib'))

# Load the fine-tuned sentence transformer model
finetunedModel = SentenceTransformer(local_finetunedModel_path)

# Load the hotel content embeddings dictionary
hotel_embeddings = np.load(os.path.join(localDir, 'hotel_content_embeddings.npy'), allow_pickle=True).item()

# Load the SVD factor matrices
user_factors = np.load(os.path.join(localDir, 'user_factors.npy'))
item_factors = np.load(os.path.join(localDir, 'hotel_factors.npy'))

# Load the main hotel data DataFrame (for metadata lookup)
hotel_df = pd.read_parquet(os.path.join(localDir, 'combined_hotel_reviews.parquet'))

print("--- All models and data are loaded and ready. ---")

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def get_recommendations_final(query, user_id=None, city=None, country=None, top_n=5, n_candidates=100):
    """
    Recommend hotels for a free-text query, optionally personalized for a user.

    The function relies on the notebook-level assets loaded beforehand
    (`hotel_df`, `hotel_embeddings`, `finetunedModel`, `user_factors`,
    `item_factors`, `xgb_ranker`). It works in three stages:

    1. Keep hotels that have a rating and, if given, match `city` / `country`
       (case-insensitive exact match).
    2. Rank those hotels by cosine similarity between the encoded query and
       each hotel's content embedding, keeping the best `n_candidates`.
    3. If `user_id` is a valid row index into `user_factors`, re-score the
       candidates with `xgb_ranker`; otherwise keep the similarity ranking.

    Args:
        query (str): Free-text description of what the user is looking for.
        user_id (int, optional): Row index of the user in `user_factors`.
            `None`, negative, or out-of-range ids are treated as a guest.
        city (str, optional): Only consider hotels in this city.
        country (str, optional): Only consider hotels in this country.
        top_n (int): Number of hotels to return. Defaults to 5.
        n_candidates (int): Size of the similarity shortlist handed to the
            re-ranker. Defaults to 100.

    Returns:
        pandas.DataFrame: Columns `name`, `city`, `country`, one row per
        recommended hotel, best first. Empty (same columns) when no hotel
        matches the location or none has a content embedding.
    """
    result_columns = ['name', 'city', 'country']

    # --- Candidate Selection (Location Filtering) ---
    # Start with all hotels that have a rating.
    candidate_df = hotel_df[hotel_df['reviews.rating'].notnull()]

    if city:
        candidate_df = candidate_df[candidate_df['city'].str.lower() == city.lower()]
    if country:
        candidate_df = candidate_df[candidate_df['country'].str.lower() == country.lower()]

    if candidate_df.empty:
        print("No hotels found for the specified location.")
        return pd.DataFrame(columns=result_columns)

    # --- Candidate Generation (Content-Based Ranker) ---
    print("Finding semantically relevant candidates...")

    # Only hotels that actually have a content embedding can be scored.
    candidate_embeddings = {
        hid: hotel_embeddings[hid]
        for hid in candidate_df['hotel_id'].unique()
        if hid in hotel_embeddings
    }

    if not candidate_embeddings:
        print("No valid candidates with content embeddings found.")
        return pd.DataFrame(columns=result_columns)

    # Encode the query only once we know there is something to compare it to.
    query_embedding = finetunedModel.encode(query)

    hotel_ids = list(candidate_embeddings.keys())
    hotel_matrix = np.array(list(candidate_embeddings.values()))
    similarities = cosine_similarity(query_embedding.reshape(1, -1), hotel_matrix)[0]

    # Shortlist of the most similar hotels, best first.
    l1_candidates_df = (
        pd.DataFrame({'hotel_id': hotel_ids, 'similarity_score': similarities})
        .sort_values(by='similarity_score', ascending=False)
        .head(n_candidates)
    )

    # --- Personalized Re-ranking (Hybrid Ranker) ---
    # A negative id would silently index from the end of user_factors, so it
    # is rejected together with None and out-of-range ids.
    is_known_user = user_id is not None and 0 <= user_id < len(user_factors)

    if not is_known_user:
        print("Anonymous user. Returning top content-based results.")
        top_recommendations = l1_candidates_df.head(top_n)
    else:
        # hotel_id is used as a row index into item_factors; skip candidates
        # that fall outside the matrix instead of raising IndexError.
        n_items = len(item_factors)
        rerank_ids = [
            hid for hid in l1_candidates_df['hotel_id'].tolist() if 0 <= hid < n_items
        ]

        if not rerank_ids:
            print("No candidates have collaborative factors. Returning top content-based results.")
            top_recommendations = l1_candidates_df.head(top_n)
        else:
            print(f"Logged-in user ({user_id}). Re-ranking candidates for personalization...")
            user_svd_vector = user_factors[user_id]

            # One feature row per candidate: [user factors | hotel factors | content embedding]
            X_reco = np.vstack([
                np.hstack([user_svd_vector, item_factors[hid], hotel_embeddings[hid]])
                for hid in rerank_ids
            ])

            reco_df = pd.DataFrame({
                'hotel_id': rerank_ids,
                'score': xgb_ranker.predict(X_reco),
            })
            top_recommendations = reco_df.sort_values(by='score', ascending=False).head(top_n)

    # --- Merge and Return Final Results ---
    # One metadata row per hotel_id, taken from the rows that passed the
    # location filter: de-duplicating on all columns would let a hotel with
    # several name/city spellings appear multiple times and exceed top_n.
    hotel_info = (
        candidate_df[['hotel_id', 'name', 'city', 'country']]
        .drop_duplicates(subset='hotel_id')
    )
    # An inner merge keeps the order of the left frame, i.e. the ranking.
    final_results = pd.merge(top_recommendations, hotel_info, on='hotel_id')

    return final_results[result_columns]

In [None]:
# --- Example Usage ---
# Request with a user id and a city pre-filter. The re-ranking branch is only
# taken if 3000 is a valid row of user_factors; otherwise the call falls back
# to the content-based ranking.
# NOTE(review): the query text mentions New York while the city filter is
# Los Angeles — confirm this mismatch is intentional.
final_recos = get_recommendations_final(
    "a modern hotel near the city centre in newyork",
    user_id=3000,
    city="los angeles",
)
display(final_recos)

# Guest request: no user id and no location filter, so every rated hotel
# with a content embedding is a candidate.
final_recos_anon = get_recommendations_final(
    query="a cheap and cheerful place with good reviews in san francisco",
    user_id=None,
)
display(final_recos_anon)