In [None]:
# you will be prompted with a window asking to grant permissions
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
# fill in the path in your Google Drive in the string below. Note: do not escape slashes or spaces
import os
datadir = "/content/drive/MyDrive/CS 410 - Project/CS 410 - Codebase"
if not os.path.exists(datadir):
  !ln -s "/content/drive/MyDrive/CS 410 - Project/CS 410 - Codebase" $datadir
os.chdir(datadir)
!pwd

/content/drive/.shortcut-targets-by-id/1D6k6NaYmoa4Jo28jhmhNMJlL0fFZ-fvO/CS 410 - Project/CS 410 - Codebase


## Data Preprocessing

In [None]:
import pandas as pd
# Load datasets
df = pd.read_csv("Data/Hotel_Reviews.csv") # Use first row as headers
df = df[[ "Hotel_Name", "Positive_Review", "Negative_Review", "Tags", "Review_Date", "Reviewer_Score", "Average_Score", "Hotel_Address", "lat", "lng" ]] # Select only relevant columns
df["Tags"] = df["Tags"].apply(lambda x: eval(x)) # Convert into 'list' type
df["Review_Date"] = pd.to_datetime(df["Review_Date"]) # Convert into 'Timestamp' type
df.head()

Unnamed: 0,Hotel_Name,Positive_Review,Negative_Review,Tags,Review_Date,Reviewer_Score,Average_Score,Hotel_Address,lat,lng
0,Hotel Arena,Only the park outside of the hotel was beauti...,I am so angry that i made this post available...,"[ Leisure trip , Couple , Duplex Double Room...",2017-08-03,2.9,7.7,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,52.360576,4.915968
1,Hotel Arena,No real complaints the hotel was great great ...,No Negative,"[ Leisure trip , Couple , Duplex Double Room...",2017-08-03,7.5,7.7,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,52.360576,4.915968
2,Hotel Arena,Location was good and staff were ok It is cut...,Rooms are nice but for elderly a bit difficul...,"[ Leisure trip , Family with young children ,...",2017-07-31,7.1,7.7,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,52.360576,4.915968
3,Hotel Arena,Great location in nice surroundings the bar a...,My room was dirty and I was afraid to walk ba...,"[ Leisure trip , Solo traveler , Duplex Doub...",2017-07-31,3.8,7.7,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,52.360576,4.915968
4,Hotel Arena,Amazing location and building Romantic setting,You When I booked with your company on line y...,"[ Leisure trip , Couple , Suite , Stayed 2 ...",2017-07-24,6.7,7.7,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,52.360576,4.915968


In [None]:
# Hyperparameters setting
tag_weight = 1

In [None]:
# Add tags to positive review (Each tag: added {tag_weight} times)
def add_tags(row):
  tags=[t.strip() for t in row["Tags"]]
  return row["Positive_Review"] + (' '.join(tags) * tag_weight)

df["Positive_Review_and_Tags"] = df.apply(add_tags, axis=1)
df.head(5)

Unnamed: 0,Hotel_Name,Positive_Review,Negative_Review,Tags,Review_Date,Reviewer_Score,Average_Score,Hotel_Address,lat,lng,Positive_Review_and_Tags
0,Hotel Arena,Only the park outside of the hotel was beauti...,I am so angry that i made this post available...,"[ Leisure trip , Couple , Duplex Double Room...",2017-08-03,2.9,7.7,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,52.360576,4.915968,Only the park outside of the hotel was beauti...
1,Hotel Arena,No real complaints the hotel was great great ...,No Negative,"[ Leisure trip , Couple , Duplex Double Room...",2017-08-03,7.5,7.7,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,52.360576,4.915968,No real complaints the hotel was great great ...
2,Hotel Arena,Location was good and staff were ok It is cut...,Rooms are nice but for elderly a bit difficul...,"[ Leisure trip , Family with young children ,...",2017-07-31,7.1,7.7,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,52.360576,4.915968,Location was good and staff were ok It is cut...
3,Hotel Arena,Great location in nice surroundings the bar a...,My room was dirty and I was afraid to walk ba...,"[ Leisure trip , Solo traveler , Duplex Doub...",2017-07-31,3.8,7.7,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,52.360576,4.915968,Great location in nice surroundings the bar a...
4,Hotel Arena,Amazing location and building Romantic setting,You When I booked with your company on line y...,"[ Leisure trip , Couple , Suite , Stayed 2 ...",2017-07-24,6.7,7.7,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,52.360576,4.915968,Amazing location and building Romantic settin...


In [None]:
import re
import nltk
nltk.download('stopwords',quiet=True)
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer

punctuations = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def clean_sentence(line, include_stopwords=False, return_as_list=False, no_stemming=False):
  line = line.strip() # Remove leading and trailing whitespaces
  line = re.sub(r'[^\w\s]', ' ', line) # Remove character that are not letters, digits, underscores, and whitespaces
  line = line.lower() # Convert to lowercase
  for x in line:
    if (x in punctuations) or (x.isdigit()):
      line = line.replace(x, " ") # Replace punctuation and numbers with whitespaces

  cleaned_words = []
  for word in line.split(): # Seperate description into words
    if no_stemming == False:
      word = stemmer.stem(word) # Stemming
    if include_stopwords: # Include stopwords
      cleaned_words.append(word)
    else:
      if word not in stop_words: # Remove stopwords
        cleaned_words.append(word)
  if return_as_list:
    return cleaned_words
  return ' '.join(cleaned_words)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
train_df = df.copy()
# Data preprocessing
cleaned_data_df_path = "cleaned_data_df"
try:
  train_df = pd.read_pickle(cleaned_data_df_path) # Load the cleaned train_df
except:
  # If cleaned trained_df is not available, perform data preprocessing
  train_df["Cleaned_Positive"] = train_df["Positive_Review_and_Tags"].apply(lambda x: clean_sentence(x, return_as_list=True))
  train_df["Cleaned_Negative"] = train_df["Negative_Review"].apply(lambda x: clean_sentence(x, return_as_list=True))
  train_df.to_pickle(cleaned_data_df_path) # Save cleaned train_df

## Train Word2Vec model

In [None]:
from gensim.models import Word2Vec
w2v_model = None
word2vec_model_path = 'trained_w2v.model'

try:
  w2v_model = Word2Vec.load(word2vec_model_path) # Load the saved Word2Vec model if available
except:
  w2v_train_data = [doc for doc in train_df["Cleaned_Positive"]] + [doc for doc in train_df["Cleaned_Negative"]]
  w2v_model = Word2Vec(w2v_train_data, sg=1) # Train the Word2Vec model using our data
  w2v_model.save(word2vec_model_path) # Save the trained Word2Vec model

In [None]:
def find_most_similar(query, w2v_model=w2v_model):
  root_query = stemmer.stem(query) # Vocabulary of Word2Vec are in root form (stemmed), so we must convert query into its root form first
  return w2v_model.wv.most_similar(root_query)

find_most_similar("massage")

[('spa', 0.7122982740402222),
 ('therapist', 0.7063989639282227),
 ('sauna', 0.6773682236671448),
 ('treatment', 0.6594771146774292),
 ('hamam', 0.6448525190353394),
 ('pedicur', 0.6363069415092468),
 ('therapeut', 0.6254048347473145),
 ('swim', 0.6246352791786194),
 ('suana', 0.6212950944900513),
 ('steam', 0.6114707589149475)]

In [None]:
# Step 1:
# Reduce each review: [swimming pool, pool, swimming, breakfast] => {swimming pool: 2.5, breakfast: 1, ...}

# Hyperparameter: Threshold for similarities between words that will be considered as "same words"
  # You don't have to reduce redundant words to 1 time like in the example above
  # You should apply BM-25 on it, for example, 1 time = 1, 2 times = 2, 3 times= 2.5 (reduced by BM-25), 4 times= 2.9 (reduced by BM-25), etc.

# Save the results (see above), potentially in a separate column like "Weighted_Postiive_Reviews" and "Weighted_Negative_Reviews" (so we don't have to run it everytime)

# Step 2:
# Use weighted pos/neg review to compute similarity with query

# Step 3: Compute score for each review
# Total_score = Positive_score - Negative_score

# Step 4: Combine score for each hotel
# total_score_each_review = Positive_score - Negative_score
# total_score_hotel = SUM( (weights be date) * total_score_each_review )
# Return the ranking

In [None]:
# TO-DO: Need to train the full data later
train_df = train_df.head(200000)

In [None]:
from tqdm import tqdm
tqdm.pandas()

# Hyperparameters setting
similarities_threshold = 0.5

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def compute_cosine_similarity(word1, word2, word2vec_model):
    if word1 in w2v_model.wv.key_to_index.keys() and word2 in w2v_model.wv.key_to_index.keys():
        vec1 = word2vec_model.wv[word1].reshape(1, -1)
        vec2 = word2vec_model.wv[word2].reshape(1, -1)
    else:
        return None  # Skip if word not in vocabulary

    similarity = cosine_similarity(vec1, vec2) # Compute cosine similarity
    return similarity[0][0]

def reduce_and_weight_reviews(review):
    reduced_reviews = {}

    for word in review:
        if word not in w2v_model.wv: # Skip words that are not in Word2Vec model (cannot contribute to similarity computation)
          continue

        if len(reduced_reviews) == 0: # First word of the review
          reduced_reviews[word] = 1
          continue

        if word in reduced_reviews: # Found exact match in reduced dictionary
            reduced_reviews[word] += 1
            continue

        # Compute similarity with existing word in reduced dictionary
        max_similarity_score = -np.inf
        max_similarity_word = None
        for k_word in reduced_reviews.keys():
            current_similarity_score = compute_cosine_similarity(word, k_word, w2v_model)
            if (current_similarity_score is not None) and (current_similarity_score > max_similarity_score):
                max_similarity_score = current_similarity_score
                max_similarity_word = k_word

        # Assign word to reduced review dictionary
        if max_similarity_score > similarities_threshold: # If the word is similar to some existing words (cutoff by a threshold)
          reduced_reviews[max_similarity_word] += 1
        else: # If the word is new (not enough similarity with existing words)
          reduced_reviews[word] = 1

    return reduced_reviews


In [None]:
# Weights calculation based on days and quater use (whichever you want to use)
def calculate_weight_daily(review_dates):
    oldest_date = min(review_dates)
    newest_date = max(review_dates)
    total_days = (newest_date - oldest_date).days
    weights = [(date - oldest_date).days / total_days for date in review_dates]
    return weights

def calculate_weight_quarterly(review_dates):
    oldest_date = min(review_dates)
    newest_date = max(review_dates)

    # Calculate the number of quarters between the oldest and newest dates
    num_quarters = (newest_date.year - oldest_date.year) * 4 + (newest_date.quarter - oldest_date.quarter)

    # Calculate the weight for each review date
    weights = []
    for date in review_dates:
        quarter_diff = (date.year - oldest_date.year) * 4 + (date.quarter - oldest_date.quarter)
        weight = quarter_diff / num_quarters
        weights.append(weight)

    return weights

In [None]:
weighted_reviews_data_df_path = "weighted_reviews_data_df"
try:
  train_df = pd.read_pickle(weighted_reviews_data_df_path)
except:
  train_df["Weighted_Positive_Reviews"] = train_df["Cleaned_Positive"].progress_apply(reduce_and_weight_reviews)
  train_df["Weighted_Negative_Reviews"] = train_df["Cleaned_Negative"].progress_apply(reduce_and_weight_reviews)
  train_df['Date_Weight'] = calculate_weight_daily(train_df['Review_Date'])
  train_df.to_pickle(weighted_reviews_data_df_path)

In [None]:
train_df.head()

Unnamed: 0,Hotel_Name,Positive_Review,Negative_Review,Tags,Review_Date,Reviewer_Score,Average_Score,Hotel_Address,lat,lng,Positive_Review_and_Tags,Cleaned_Positive,Cleaned_Negative,Weighted_Positive_Reviews,Weighted_Negative_Reviews,Date_Weight
0,Hotel Arena,Only the park outside of the hotel was beauti...,I am so angry that i made this post available...,"[ Leisure trip , Couple , Duplex Double Room...",2017-08-03,2.9,7.7,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,52.360576,4.915968,Only the park outside of the hotel was beauti...,"[onli, park, outsid, hotel, wa, beauti, leisur...","[angri, made, thi, post, avail, via, possibl, ...","{'onli': 1, 'park': 1, 'outsid': 1, 'hotel': 1...","{'angri': 6, 'made': 5, 'post': 3, 'avail': 6,...",1.0
1,Hotel Arena,No real complaints the hotel was great great ...,No Negative,"[ Leisure trip , Couple , Duplex Double Room...",2017-08-03,7.5,7.7,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,52.360576,4.915968,No real complaints the hotel was great great ...,"[real, complaint, hotel, wa, great, great, loc...",[neg],"{'real': 2, 'complaint': 2, 'hotel': 1, 'wa': ...",{'neg': 1},1.0
2,Hotel Arena,Location was good and staff were ok It is cut...,Rooms are nice but for elderly a bit difficul...,"[ Leisure trip , Family with young children ,...",2017-07-31,7.1,7.7,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,52.360576,4.915968,Location was good and staff were ok It is cut...,"[locat, wa, good, staff, ok, cute, hotel, brea...","[room, nice, elderli, bit, difficult, room, tw...","{'locat': 1, 'wa': 2, 'good': 3, 'staff': 1, '...","{'room': 3, 'nice': 1, 'elderli': 2, 'bit': 3,...",0.99589
3,Hotel Arena,Great location in nice surroundings the bar a...,My room was dirty and I was afraid to walk ba...,"[ Leisure trip , Solo traveler , Duplex Doub...",2017-07-31,3.8,7.7,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,52.360576,4.915968,Great location in nice surroundings the bar a...,"[great, locat, nice, surround, bar, restaur, n...","[room, wa, dirti, wa, afraid, walk, barefoot, ...","{'great': 4, 'locat': 1, 'surround': 3, 'bar':...","{'room': 14, 'dirti': 13, 'afraid': 5, 'walk':...",0.99589
4,Hotel Arena,Amazing location and building Romantic setting,You When I booked with your company on line y...,"[ Leisure trip , Couple , Suite , Stayed 2 ...",2017-07-24,6.7,7.7,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,52.360576,4.915968,Amazing location and building Romantic settin...,"[amaz, locat, build, romant, set, leisur, trip...","[book, compani, line, show, pictur, room, thou...","{'amaz': 1, 'locat': 1, 'build': 1, 'romant': ...","{'book': 7, 'compani': 1, 'line': 2, 'show': 5...",0.986301


## GEOCODING API to get bounding box

In [None]:
!pip install geopy



In [None]:
from geopy.geocoders import Nominatim
import math

def get_location_by_name(hotel_name, area_km):
    geolocator = Nominatim(user_agent="CS410-Project-2")
    location = geolocator.geocode(hotel_name, timeout=None)
    latitude,longitude =  location.latitude, location.longitude

    half_side_in_km = math.sqrt(area_km) / 2
    half_side_in_radians = half_side_in_km / 6371.01
    latitude_in_radians = math.radians(latitude)

    min_latitude = latitude - math.degrees(half_side_in_radians)
    max_latitude = latitude + math.degrees(half_side_in_radians)

    min_longitude = longitude - math.degrees(half_side_in_radians / math.cos(latitude_in_radians))
    max_longitude = longitude + math.degrees(half_side_in_radians / math.cos(latitude_in_radians))

    # Return the coordinates of the four corners
    return [(min_latitude, min_longitude), (min_latitude, max_longitude), (max_latitude, min_longitude), (max_latitude, max_longitude)]

print(get_location_by_name("Tokyo Tower", 100))

[(35.61348309028327, 139.69019361642174), (35.61348309028327, 139.80087829367753), (35.70341510971673, 139.69019361642174), (35.70341510971673, 139.80087829367753)]


## Method 1: Rank fetched hotels based on the similarities score and location

In [None]:
import numpy as np
def create_document_term_matrix_bm(texts, vocabulary):
    dt_matrix = np.zeros((len(texts), len(vocabulary)), dtype=int)
    for i, text in enumerate(texts):
      for word in text:
        if word in vocabulary:
          dt_matrix[i, vocabulary.index(word)] = text[word] # Get count of the word in that text
    return dt_matrix

def bm25_score_with_doc_len_norm(query, documents, vocabulary, k=1.5, b=0.75):
    dt_matrix = create_document_term_matrix_bm(documents, vocabulary)

    # Compute IDF and avdl
    word_df = np.sum(dt_matrix > 0, axis=0)
    idf_vals = np.log((len(documents) + 2) / (word_df + 1)) # Avoid division by zero

    dl = np.array([sum(doc.values()) for doc in documents])
    avgdl = np.mean(dl)
    doc_length_norm_vals = (1 - b) + b * (dl / avgdl)

    # Compute scores with BM25, TF-IDF, and doc length normalization
    query_words = clean_sentence(query, return_as_list=True)

    scores = []
    for i, doc in enumerate(documents):
        score = 0
        for word in query_words:
            if (word in doc) and (word in vocabulary):
                tf = doc[word]
                idf = idf_vals[vocabulary.index(word)]
                score += ((tf * (k + 1)) / ((tf + k) * doc_length_norm_vals[i])) * idf
        scores.append(score)

    return scores

def compute_review_score(query, location, area):
    tmp_df = train_df.copy()

    corners = get_location_by_name(hotel_name=location, area_km=area)
    min_lat, min_long = corners[0]
    max_lat, max_long = corners[3]

    mask = (
        (tmp_df['lat'] >= min_lat) &
        (tmp_df['lat'] <= max_lat) &
        (tmp_df['lng'] >= min_long) &
        (tmp_df['lng'] <= max_long)
    )

    filtered_df = tmp_df.loc[mask]

    filtered_df["positive_score"] = bm25_score_with_doc_len_norm(query, filtered_df["Weighted_Positive_Reviews"], list(w2v_model.wv.key_to_index.keys()))
    filtered_df["negative_score"] = bm25_score_with_doc_len_norm(query, filtered_df["Weighted_Negative_Reviews"], list(w2v_model.wv.key_to_index.keys()))
    filtered_df["total_score"] = filtered_df["positive_score"] - filtered_df["negative_score"]

    result = filtered_df.groupby('Hotel_Name').apply(lambda x: (x['Date_Weight'] * x['total_score']).mean())
    return result.sort_values(ascending=False)

In [None]:
query = "hotel with swimming pool and car parking"

compute_review_score(query,"Paris", 30)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df["positive_score"] = bm25_score_with_doc_len_norm(query, filtered_df["Weighted_Positive_Reviews"], list(w2v_model.wv.key_to_index.keys()))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df["negative_score"] = bm25_score_with_doc_len_norm(query, filtered_df["Weighted_Negative_Reviews"], list(w2v_model.wv.key_to_index.keys()))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://

Hotel_Name
H tel Saint Marc                                   3.600386
Hotel La Lanterne                                  2.625398
H tel Paris Bastille Boutet MGallery by Sofitel    2.269638
La Villa Haussmann                                 1.487016
Le Roch Hotel Spa                                  1.084434
                                                     ...   
Renaissance Paris Republique Hotel Spa            -0.184317
Maxim Op ra                                       -0.211736
H tel De Castiglione                              -0.230972
H tel De Buci by MH                               -0.243862
Hotel Le Pera                                     -0.425560
Length: 184, dtype: float64

In [None]:
query = "spa and pet friendly"

compute_review_score(query,"Paris", 30)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df["positive_score"] = bm25_score_with_doc_len_norm(query, filtered_df["Weighted_Positive_Reviews"], list(w2v_model.wv.key_to_index.keys()))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df["negative_score"] = bm25_score_with_doc_len_norm(query, filtered_df["Weighted_Negative_Reviews"], list(w2v_model.wv.key_to_index.keys()))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://

Hotel_Name
Renaissance Paris Republique Hotel Spa    2.764848
Maison Albar Hotel Paris C line           2.466207
Hotel Scribe Paris Opera by Sofitel       1.647974
Saint James Albany Paris Hotel Spa        1.571585
H tel Da Vinci Spa                        1.071007
                                            ...   
Hotel Astor Saint Honor                   0.000000
Splendide Royal Paris                     0.000000
Drawing Hotel                             0.000000
Hotel Dupond Smith                        0.000000
Hotel Astra Opera Astotel                -0.175443
Length: 184, dtype: float64

## Method 2: Custom Word2Vec + Word2Vec log-likelihood (1 vector per word)

In [None]:
from collections import Counter
from sklearn.metrics.pairwise import cosine_similarity

# Build vocabulary based on most frequent words
def get_top_words(reviews, n):
    words = [word for text in reviews for word in text]
    word_freq = Counter(words)
    top_words = word_freq.most_common(n) # Select the top n most frequent words
    return [word[0] for word in top_words]

# Pre-compute the similarity of (each word in) vocabulary and (each word in) Word2Vec model, return as dictionary
def vocabulary_similarity(w2v_model, vocabulary):
  w2v_keys = w2v_model.wv.key_to_index.keys()
  w2v_keys_vectors = [w2v_model.wv[word] for word in w2v_keys]

  vocab_vectors = [w2v_model.wv[word] for word in vocabulary]

  similarity_matrix = cosine_similarity(vocab_vectors, w2v_keys_vectors) # Compute the similarity scores

  similarity_dict = {} # Build precomputed results as dictionary
  for i, word in enumerate(vocabulary):
    for j, word2 in enumerate(w2v_keys):
      similarity_dict[word + "_" + word2] = similarity_matrix[i][j]

  return similarity_dict

# Find similarity score by using each word in query and each word in document (1 vector per word)
def compute_average_log_likelihood(doc, query, model, similarity_dict, vocabulary):
    query_tokens = clean_sentence(query, return_as_list=True)

    final_score = 0
    for query_token in query_tokens: # Loop through each word in query
      if query_token in vocabulary:
        for doc_token in doc: # Loop through each word in the doc
          final_score += (similarity_dict.get(query_token + "_" + doc_token, 0))

    return final_score / len(query_tokens)

In [None]:
def compute_review_score_w2v(query, tmp_w2v_df, num_most_freq_words, w2v_model, location, area):

  # Build vocabulary based on most frequent words
  pos_vocabulary = get_top_words(tmp_w2v_df['Cleaned_Positive'], num_most_freq_words)
  neg_vocabulary = get_top_words(tmp_w2v_df['Cleaned_Negative'], num_most_freq_words)

  tmp_df = tmp_w2v_df.copy()

  #Creating a filter to get hotels in a specific geolocation
  corners = get_location_by_name(hotel_name=location, area_km=area)
  min_lat, min_long = corners[0]
  max_lat, max_long = corners[3]

  mask = (
      (tmp_df['lat'] >= min_lat) &
      (tmp_df['lat'] <= max_lat) &
      (tmp_df['lng'] >= min_long) &
      (tmp_df['lng'] <= max_long)
  )

  filtered_df = tmp_df.loc[mask]

  # Pre-compute similarity scores
  pos_similarity_dict = vocabulary_similarity(w2v_model, pos_vocabulary)
  neg_similarity_dict = vocabulary_similarity(w2v_model, neg_vocabulary)

  # Calculate score for each document
  filtered_df["pos_w2v_score"] = filtered_df['Cleaned_Positive'].apply(compute_average_log_likelihood, query = query, model = w2v_model, similarity_dict = pos_similarity_dict, vocabulary= pos_vocabulary)
  filtered_df["neg_w2v_score"] = filtered_df['Cleaned_Negative'].apply(compute_average_log_likelihood, query = query, model = w2v_model, similarity_dict = neg_similarity_dict, vocabulary= neg_vocabulary)
  filtered_df["total_score"] = filtered_df["pos_w2v_score"] - filtered_df["neg_w2v_score"]
  filtered_df['Date_Weight'] = calculate_weight_daily(filtered_df['Review_Date']) # Calculate weight for each date

  # Combine scores within the same hotel, using date (recentness) to weight the scores
  result = filtered_df.groupby('Hotel_Name').apply(lambda x: (x['Date_Weight'] * x['total_score']).mean())

  return result.sort_values(ascending=False)

In [None]:
tmp_df = train_df.copy()
num_most_freq_words = 1500 # Hyperparameters setting

query = "hotel with swimming pool and car parking"

result = compute_review_score_w2v(query, tmp_df, num_most_freq_words, w2v_model, 'Paris', 30)
result

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df["pos_w2v_score"] = filtered_df['Cleaned_Positive'].apply(compute_average_log_likelihood, query = query, model = w2v_model, similarity_dict = pos_similarity_dict, vocabulary= pos_vocabulary)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df["neg_w2v_score"] = filtered_df['Cleaned_Negative'].apply(compute_average_log_likelihood, query = query, model = w2v_model, similarity_dict = neg_similarity_dict, vocabulary= neg_vocabulary)
A value is trying to be set on a copy of a slice from a DataFrame.

Hotel_Name
Hotel Villa Lafayette Paris IX       4.766613
Splendide Royal Paris                4.206512
H tel Saint Marc                     4.123807
BoB Hotel by Elegancia               3.905112
Drawing Hotel                        3.806386
                                       ...   
Pullman Paris Montparnasse           0.311036
Paris Marriott Rive Gauche Hotel     0.191866
Suites H tel Helzear Montparnasse    0.100910
Maxim Op ra                         -0.325337
Le Grand H tel de Normandie         -0.644398
Length: 184, dtype: float64

In [None]:
tmp_df = train_df.copy()
num_most_freq_words = 1500 # Hyperparameters setting

query = "spa and pet friendly"

result = compute_review_score_w2v(query, tmp_df, num_most_freq_words, w2v_model,'Paris', 30)
result

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df["pos_w2v_score"] = filtered_df['Cleaned_Positive'].apply(compute_average_log_likelihood, query = query, model = w2v_model, similarity_dict = pos_similarity_dict, vocabulary= pos_vocabulary)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df["neg_w2v_score"] = filtered_df['Cleaned_Negative'].apply(compute_average_log_likelihood, query = query, model = w2v_model, similarity_dict = neg_similarity_dict, vocabulary= neg_vocabulary)
A value is trying to be set on a copy of a slice from a DataFrame.

Hotel_Name
Hotel Whistler                            5.306610
Hotel Villa Lafayette Paris IX            5.011353
Splendide Royal Paris                     4.380073
Drawing Hotel                             4.109327
H tel Saint Marc                          4.109270
                                            ...   
Garden Elys e                             0.808009
Pullman Paris Montparnasse                0.746574
Hyatt Regency Paris Etoile                0.700274
Maxim Op ra                               0.622549
Holiday Inn Paris Montparnasse Pasteur    0.196910
Length: 267, dtype: float64

## Method 3: Pre-trained Word2Vec + Word2Vec log-likelihood (1 vector per word)

In [None]:
w2v_train_df = df.copy()
import pandas as pd

# Data preprocessing
w2v_cleaned_data_df_path = "W2v_cleaned_data_df"

try:
  w2v_train_df = pd.read_pickle(w2v_cleaned_data_df_path) # Load the cleaned w2v_train_df
except:
  # If cleaned trained_df is not available, perform data preprocessing
  w2v_train_df["Cleaned_Positive"] = w2v_train_df["Positive_Review_and_Tags"].apply(lambda x: clean_sentence(x, return_as_list=True, no_stemming=True))
  w2v_train_df["Cleaned_Negative"] = w2v_train_df["Negative_Review"].apply(lambda x: clean_sentence(x, return_as_list=True, no_stemming=True))
  w2v_train_df.to_pickle(w2v_cleaned_data_df_path) # Save cleaned w2v_train_df

In [None]:
w2v_train_df = w2v_train_df.head(200000)

In [None]:
import gensim.downloader
from gensim.models import KeyedVectors

# Initialize Word2Vec model - using a smaller-dimension model, as larger-dimension model take too long
w2v_model = None
word2vec_model_name = 'glove-twitter-100'
word2vec_model_path = word2vec_model_name + '.kvmodel'
try:
  w2v_model = KeyedVectors.load(word2vec_model_path) # Load the saved Word2Vec model if available
except:
  w2v_model = gensim.downloader.load(word2vec_model_name) # Download pre-trained Word2Vec model (with vector of size 100)
  w2v_model.save(word2vec_model_path) # Save the Word2Vec model

In [None]:
from collections import Counter
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Get set of unique words, and most frequent words
def get_top_words(reviews, n):
    words = [word for text in reviews for word in text]

    all_words = set(words) # Get the set of unique words present

    word_freq = Counter(words)
    top_words = word_freq.most_common(n) # Select the top n most frequent words
    freq_words= [word[0] for word in top_words]

    return all_words, freq_words

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

# Pre-compute similarity score of (each word in) vocabulary and all words ever presented in the docs
def vocabulary_similarity(model, vocabulary, all_words):
  # Get word embeddings for all words
  all_word_vectors = []
  for word in all_words:
    if word in model:
      all_word_vectors.append(model[word])
    else:
        all_word_vectors.append(np.zeros_like(model['hello']))

  # Get word embeddings for all words in vocabulary
  vocab_words_vectors = []
  for word in vocabulary:
    if word in model:
      vocab_words_vectors.append(model[word])
    else:
      vocab_words_vectors.append(np.zeros_like(model['hello']))

  # Compute similaity, return as array
  all_sims = cosine_similarity(vocab_words_vectors,all_word_vectors)

  # Built pre-computed results as dictionary
  similarity_dict = {}
  for i, word in enumerate(vocabulary):
    for j, word2 in enumerate(all_words):
      similarity_dict[word + "_" + word2] = all_sims[i][j]

  del all_word_vectors, vocab_words_vectors, all_sims

  return similarity_dict

# Find similarity score of query with each documents
def compute_average_log_likelihood(doc, query_tokens, model, similarity_dict, vocabulary):


    final_score = 0
    for query_token in query_tokens:
      if query_token in vocabulary:
        for doc_token in doc:
          final_score += (similarity_dict.get(query_token + "_" + doc_token, 0))

    # final_score=sigmoid(final_score/len(query_tokens))

    return final_score/len(query_tokens)


In [None]:
def compute_review_score_w2v(query, tmp_w2v_df, num_most_freq_words, w2v_model, location, area):
  # Get vocabulary by most frequent words
  all_pos_words, pos_vocabulary = get_top_words(tmp_w2v_df['Cleaned_Positive'], num_most_freq_words)
  all_neg_words, neg_vocabulary = get_top_words(tmp_w2v_df['Cleaned_Negative'], num_most_freq_words)

  tmp_df = tmp_w2v_df.copy()

  #Creating a filter to get hotels in a specific geolocation
  corners = get_location_by_name(hotel_name=location, area_km=area)
  min_lat, min_long = corners[0]
  max_lat, max_long = corners[3]

  mask = (
      (tmp_df['lat'] >= min_lat) &
      (tmp_df['lat'] <= max_lat) &
      (tmp_df['lng'] >= min_long) &
      (tmp_df['lng'] <= max_long)
  )

  filtered_df = tmp_df.loc[mask]

  # Preprocess query
  query_tokens = clean_sentence(query, return_as_list=True, no_stemming=True)

  # Precompute similarity score
  pos_similarity_dict = vocabulary_similarity(w2v_model,pos_vocabulary,all_pos_words)
  neg_similarity_dict = vocabulary_similarity(w2v_model,neg_vocabulary,all_neg_words)

  del all_pos_words, all_neg_words

  # Compute similarity score of the query with each review
  filtered_df["pos_w2v_score"] = filtered_df['Cleaned_Positive'].apply(compute_average_log_likelihood, query_tokens = query_tokens, model = w2v_model, similarity_dict = pos_similarity_dict, vocabulary = pos_vocabulary)
  filtered_df["neg_w2v_score"] = filtered_df['Cleaned_Negative'].apply(compute_average_log_likelihood, query_tokens = query_tokens, model = w2v_model, similarity_dict = neg_similarity_dict, vocabulary = neg_vocabulary)
  filtered_df["total_score"] = filtered_df["pos_w2v_score"] - filtered_df["neg_w2v_score"]

  # Weight the scores by date (recentness)
  filtered_df['Date_Weight'] = calculate_weight_daily(filtered_df['Review_Date'])

  result = filtered_df.groupby('Hotel_Name').apply(lambda x: (x['Date_Weight'] * x['total_score']).mean())

  return result.sort_values(ascending=False)

In [None]:
tmp_df = w2v_train_df.copy()
num_most_freq_words = 1000 # Hyperparameters setting

query = "hotel with swimming pool and car parking"

result = compute_review_score_w2v(query, tmp_df, num_most_freq_words, w2v_model, 'Paris', 30)
result

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df["pos_w2v_score"] = filtered_df['Cleaned_Positive'].apply(compute_average_log_likelihood, query_tokens = query_tokens, model = w2v_model, similarity_dict = pos_similarity_dict, vocabulary = pos_vocabulary)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df["neg_w2v_score"] = filtered_df['Cleaned_Negative'].apply(compute_average_log_likelihood, query_tokens = query_tokens, model = w2v_model, similarity_dict = neg_similarity_dict, vocabulary = neg_vocabulary)
A value is trying to be set on a cop

Hotel_Name
Hotel Whistler                            7.819165
Hotel Villa Lafayette Paris IX            6.934027
Splendide Royal Paris                     6.289290
Le Tsuba Hotel                            5.747486
H tel Amastan Paris                       5.733768
                                            ...   
Le Lavoisier                              0.670669
Suites H tel Helzear Montparnasse         0.539522
Maxim Op ra                               0.052158
Le Grand H tel de Normandie              -0.611855
Holiday Inn Paris Montparnasse Pasteur   -0.934603
Length: 267, dtype: float64

In [None]:
tmp_df = w2v_train_df.copy()
num_most_freq_words = 1000 # Hyperparameters setting

query = "spa and pet friendly"

result = compute_review_score_w2v(query, tmp_df, num_most_freq_words, w2v_model, 'Paris', 30)
result

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df["pos_w2v_score"] = filtered_df['Cleaned_Positive'].apply(compute_average_log_likelihood, query_tokens = query_tokens, model = w2v_model, similarity_dict = pos_similarity_dict, vocabulary = pos_vocabulary)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df["neg_w2v_score"] = filtered_df['Cleaned_Negative'].apply(compute_average_log_likelihood, query_tokens = query_tokens, model = w2v_model, similarity_dict = neg_similarity_dict, vocabulary = neg_vocabulary)
A value is trying to be set on a cop

Hotel_Name
Hotel Whistler                            7.280972
Hotel Villa Lafayette Paris IX            6.556177
Splendide Royal Paris                     5.769967
Drawing Hotel                             5.700566
Le Tsuba Hotel                            5.361393
                                            ...   
Garden Elys e                             1.090707
Maxim Op ra                               1.029501
Le Lavoisier                              1.023006
Hyatt Regency Paris Etoile                1.018291
Holiday Inn Paris Montparnasse Pasteur    0.516399
Length: 267, dtype: float64

## Method 4: Pre-trained Word2Vec + Word2Vec log-likelihood (1 vector per doc)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pickle

# Load pre-trained Word2Vec model
def load_glove_model(glove_file):
    print("Loading GloVe model...")

    with open(glove_file, 'r') as f:
        word_embeddings = {}
        for line in f:
            values = line.split(" ")
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            word_embeddings[word] = coefs

    print("GloVe model loaded successfully.")

    return word_embeddings

# Save word_embeddings
def save_word_embeddings(word_embeddings, file_path):
    with open(file_path, 'wb') as f:
        pickle.dump(word_embeddings, f)
    print("Word embeddings saved successfully.")

# Load word_embeddings
def load_word_embeddings(file_path):
    with open(file_path, 'rb') as f:
        word_embeddings = pickle.load(f)
    print("Word embeddings loaded successfully.")
    return word_embeddings

# Get word embeddings from pre-trained Word2Vec model
save_path = 'word_embeddings.pkl'
try:
  word_embeddings = load_word_embeddings(save_path)
except:
  glove_file = 'glove.840B.300d.txt'
  word_embeddings = load_glove_model(glove_file)
  save_word_embeddings(word_embeddings, save_path)


Word embeddings loaded successfully.


In [None]:

from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Compute cosine similarity between 2 vectors
def compute_cosine_similarity(vec1, vec2):
    vec1 = vec1.reshape(1, -1)
    vec2 = vec2.reshape(1, -1)
    similarity = cosine_similarity(vec1, vec2)
    return similarity[0][0]

# Get a vector representing each document
def calculate_document_vector(doc, word_embeddings):
    doc_vector = []
    for word in doc:
        if word in word_embeddings:
            doc_vector.append(word_embeddings[word]) # Get vector for each word

    # Get average of all word vectors
    if doc_vector:
        return np.mean(doc_vector, axis=0)
    else:
        return np.zeros_like(word_embeddings['hello'])  # Return zero vector if no words are found

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

# Compute similarity of query and doc
def average_log_likelihood_similarity(query_vector, doc_vector, word_embeddings):
    similarity = compute_cosine_similarity(query_vector, doc_vector)
    return similarity


In [None]:
def compute_review_score_w2v(query, tmp_w2v_df, word_embeddings, location, area):
  tmp_df = tmp_w2v_df.copy()

  #Creating a filter to get hotels in a specific geolocation
  corners = get_location_by_name(hotel_name=location, area_km=area)
  min_lat, min_long = corners[0]
  max_lat, max_long = corners[3]

  mask = (
      (tmp_df['lat'] >= min_lat) &
      (tmp_df['lat'] <= max_lat) &
      (tmp_df['lng'] >= min_long) &
      (tmp_df['lng'] <= max_long)
  )

  filtered_df = tmp_df.loc[mask]

  # Get vector representing the query (average of word vectors)
  query_words = clean_sentence(query, return_as_list=True, no_stemming=True) # Preprocess queries
  query_vector = np.mean([word_embeddings[word] for word in query if word in word_embeddings], axis=0)


  pos_reviews = filtered_df['Cleaned_Positive']
  neg_reviews = filtered_df['Cleaned_Negative']

  # Calculate vectors of positive reviews
  pos_document_vectors = [calculate_document_vector(doc, word_embeddings) for doc in pos_reviews]
  pos_relevance_scores = []
  for doc_vector in pos_document_vectors:
      score = average_log_likelihood_similarity(query_vector, doc_vector, word_embeddings)
      pos_relevance_scores.append(score)
  filtered_df["pos_w2v_score"] = pos_relevance_scores

  # Calculate vectors of negative reviews
  neg_document_vectors = [calculate_document_vector(doc, word_embeddings) for doc in neg_reviews]
  neg_relevance_scores = []
  for doc_vector in neg_document_vectors:
      score = average_log_likelihood_similarity(query_vector, doc_vector, word_embeddings)
      neg_relevance_scores.append(score)
  filtered_df["neg_w2v_score"] = neg_relevance_scores

  # Calculate final score for each review
  filtered_df["total_score"] = filtered_df["pos_w2v_score"] - filtered_df["neg_w2v_score"]

  # Calculate weight if each review based on date (recentness)
  filtered_df['Date_Weight'] = calculate_weight_daily(filtered_df['Review_Date'])

  result = filtered_df.groupby('Hotel_Name').apply(lambda x: (x['Date_Weight'] * x['total_score']).mean())

  return result.sort_values(ascending=False)

In [None]:
tmp_w2v_df = w2v_train_df.copy()

query = "hotel with swimming pool and car parking"

result = compute_review_score_w2v(query, tmp_w2v_df, word_embeddings, 'Paris', 30)
result

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df["pos_w2v_score"] = pos_relevance_scores
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df["neg_w2v_score"] = neg_relevance_scores
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df["total_score"] = filtered_df["pos_w2v_score"] - filtered_df["neg_w2v_score"]
A value is t

Hotel_Name
Hotel Whistler                          0.072000
Splendide Royal Paris                   0.070994
XO Hotel                                0.062282
Hotel Le Saint Gregoire                 0.058852
Hotel Astra Opera Astotel               0.058064
                                          ...   
Hotel Astor Saint Honor                 0.008578
H tel Champs lys es Plaza               0.008357
Garden Elys e                           0.004536
Mercure Paris Arc de Triomphe Etoile    0.003417
Pershing Hall                          -0.014231
Length: 267, dtype: float64

In [None]:
tmp_w2v_df = w2v_train_df.copy()

query = "spa and pet friendly"

result = compute_review_score_w2v(query, tmp_w2v_df, word_embeddings, 'Paris', 30)
result

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df["pos_w2v_score"] = pos_relevance_scores
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df["neg_w2v_score"] = neg_relevance_scores
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df["total_score"] = filtered_df["pos_w2v_score"] - filtered_df["neg_w2v_score"]
A value is t

Hotel_Name
Splendide Royal Paris                   0.075962
Hotel Whistler                          0.068165
Drawing Hotel                           0.060383
Hotel Le Saint Gregoire                 0.058623
XO Hotel                                0.058260
                                          ...   
Hotel Astor Saint Honor                 0.008605
H tel Champs lys es Plaza               0.006365
Garden Elys e                           0.005406
Mercure Paris Arc de Triomphe Etoile    0.005093
Pershing Hall                          -0.010576
Length: 267, dtype: float64