# Save Processed Room Names

In this notebook we save the processed room names of the reference database and the Expedia database so that they can be used in our API.

In [1]:
# Imports
import pandas as pd
import os
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans, DBSCAN
# from fuzzywuzzy import fuzz
from rapidfuzz import process, fuzz

In [2]:
# Go one folder back, so that the relative "Data/..." and "data/..." paths
# used below resolve from the parent of the kernel's starting directory.
# A bare os.chdir('..') climbs one more level every time this cell is re-run;
# resolving the target only once per kernel session keeps the cell idempotent.
if "PROJECT_ROOT" not in globals():
    PROJECT_ROOT = os.path.abspath('..')
os.chdir(PROJECT_ROOT)

In [3]:
# Raw reference rooms file; the path is relative to the current working directory.
file_url_reference = "Data/referance_rooms-1737378184366.csv"
df_reference_initial = pd.read_csv(filepath_or_buffer=file_url_reference)

# Show the loaded frame as the cell output.
df_reference_initial

Unnamed: 0,hotel_id,lp_id,room_id,room_name
0,13484077,lp23e8ef,1142730702,Double or Twin Room
1,13487663,lp6554de34,1141927122,House
2,13462809,lp6556c3dc,1142722063,Room
3,13530116,lp6555450b,1141968275,Triple Room
4,13530071,lp6557a92c,1142513784,Apartment
...,...,...,...,...
99995,21684,lp6561b025,2168409,Two-Bedroom Suite
99996,21684,lp6561b025,2168411,Deluxe Triple Room
99997,21684,lp6561b025,2168412,Deluxe Queen Room with Two Queen Beds
99998,21684,lp6561b025,2168413,Classic Quadruple Room


In [4]:
# Raw Expedia rooms file; the path is relative to the current working directory.
file_url_expedia = "Data/updated_core_rooms.csv"
df_expedia_initial = pd.read_csv(filepath_or_buffer=file_url_expedia)

# Show the loaded frame as the cell output.
df_expedia_initial

Unnamed: 0,core_room_id,core_hotel_id,lp_id,supplier_room_id,supplier_name,supplier_room_name
0,1,506732,lp7bb6c,200979491,Expedia,Superior Double Room
1,2,509236,lp7c534,200998017,Expedia,"Deluxe Room, Balcony"
2,3,516326,lp7e0e6,201144757,Expedia,Female Dormitory- 3 Beds
3,4,495330,lp78ee2,201028863,Expedia,"Standard Apartment, 2 Bedrooms (6 people)"
4,5,970167,lpecdb7,218116045,Expedia,"Traditional Cottage, 2 Bedrooms, Harbor View"
...,...,...,...,...,...,...
2869051,2912439,193359,lp2f34f,323872346,Expedia,"Deluxe Room, 1 King Bed with Sofa bed"
2869052,2912440,143473,lp23071,230770971,Expedia,Ocean Bay Pool Room
2869053,2912441,1701692958,lp656dc61e,322166812,Expedia,8 Berth Luxury Caravan
2869054,2912442,143473,lp23071,315521742,Expedia,Beach Room


### Data cleaning
Check how many ids are equal in both datasets:

`df_reference_initial['lp_id']` and `df_expedia_initial['lp_id']`

In [5]:
# Distinct lp_id values of each dataset
set_1 = set(df_reference_initial['lp_id'])
set_2 = set(df_expedia_initial['lp_id'])

# IDs that occur in both datasets, and how many there are
equal_ids = set_1 & set_2
count_equal_ids = len(equal_ids)

# Report the size of the overlap
print("Number of equal IDs:", count_equal_ids)

Number of equal IDs: 28638


We will keep only the rooms whose hotel (`lp_id`) is found in both datasets, because those are the only rooms that will need to be matched. The same filter is applied to the reference data and to the Expedia data.

In [6]:
# Boolean mask: reference rows whose lp_id is one of the shared IDs.
reference_in_both = df_reference_initial['lp_id'].isin(equal_ids)
df_reference = df_reference_initial.loc[reference_in_both]
df_reference

Unnamed: 0,hotel_id,lp_id,room_id,room_name
0,13484077,lp23e8ef,1142730702,Double or Twin Room
1,13487663,lp6554de34,1141927122,House
2,13462809,lp6556c3dc,1142722063,Room
3,13530116,lp6555450b,1141968275,Triple Room
4,13530071,lp6557a92c,1142513784,Apartment
...,...,...,...,...
99988,482128,lp65563022,48212802,Single Room
99989,482128,lp65563022,48212803,Superior Double or Twin Room with Lake View
99990,482128,lp65563022,48212808,Twin Room with Garden View
99991,482128,lp65563022,48212809,Deluxe Double or Twin Room with Lake View


In [7]:
# Boolean mask: Expedia rows whose lp_id is one of the shared IDs.
expedia_in_both = df_expedia_initial['lp_id'].isin(equal_ids)
df_expedia = df_expedia_initial.loc[expedia_in_both]
df_expedia

Unnamed: 0,core_room_id,core_hotel_id,lp_id,supplier_room_id,supplier_name,supplier_room_name
1,2,509236,lp7c534,200998017,Expedia,"Deluxe Room, Balcony"
4,5,970167,lpecdb7,218116045,Expedia,"Traditional Cottage, 2 Bedrooms, Harbor View"
6,7,626491,lp98f3b,201681924,Expedia,"Comfort House, 6 Bedrooms, Ocean View"
9,10,627565,lp9936d,201691684,Expedia,"Family House, 4 Bedrooms"
47,48,544769,lp85001,201249503,Expedia,Family Cabin
...,...,...,...,...,...,...
2868987,2912375,1701692924,lp656dc5fc,322141874,Expedia,"Basic Shared Dormitory (10 bunk beds, N2)"
2869025,2912413,840974,lpcd50e,323988812,Expedia,Presidential Suite
2869026,2912414,840974,lpcd50e,323988824,Expedia,Presidential Villa
2869033,2912421,840974,lpcd50e,323988797,Expedia,Superior Twin


### Room name preprocessing
Cleaning, feature extraction, and normalization.

In [8]:
# Load SpaCy NLP model
nlp = spacy.load("en_core_web_sm")


# Use nlp.pipe() to process multiple room names at once
def preprocess_batch(texts, batch_size=1000, n_process=8):
    """Efficiently process a batch of text using nlp.pipe().

    Every text is run through the module-level ``nlp`` pipeline. Tokens flagged
    as stop words or punctuation are dropped, and the lemmas of the remaining
    tokens are joined with single spaces.

    Args:
        texts: Iterable of strings to process.
        batch_size: Forwarded to ``nlp.pipe()``; number of texts per batch.
        n_process: Forwarded to ``nlp.pipe()``; number of worker processes.
            Tune it to the cores available on the machine running the notebook.

    Returns:
        A list with one processed string per input text.
    """
    docs = nlp.pipe(texts, batch_size=batch_size, n_process=n_process)
    return [
        " ".join(
            token.lemma_
            for token in doc
            if not (token.is_stop or token.is_punct)
        )
        for doc in docs
    ]


In [9]:
# Apply batch processing in df_reference.
# Missing room names are mapped to "" before processing: a plain .astype(str)
# turns NaN into the literal text "nan", which would then be lemmatized as if
# it were a real room name.
reference_room_names = [
    "" if pd.isna(name) else str(name) for name in df_reference["room_name"]
]
# .assign() returns a new frame, so the filtered view is never modified in place.
df_reference = df_reference.assign(
    processed_room_name=preprocess_batch(reference_room_names)
)

In [10]:
# Apply batch processing in df_expedia.
# Missing room names are mapped to "" before processing: a plain .astype(str)
# turns NaN into the literal text "nan", which would then be lemmatized as if
# it were a real room name.
expedia_room_names = [
    "" if pd.isna(name) else str(name) for name in df_expedia["supplier_room_name"]
]
# .assign() returns a new frame, so the filtered view is never modified in place.
df_expedia = df_expedia.assign(
    processed_room_name=preprocess_batch(expedia_room_names)
)

Save both datasets in the folder data/processed.

In [11]:
# Write the processed reference rooms, creating the output folder first so the
# cell also works on a fresh checkout where data/processed does not exist yet.
# NOTE(review): the inputs are read from "Data/..." but this writes to
# "data/..."; on a case-sensitive filesystem those are different folders. Confirm.
reference_output_path = "data/processed/reference_rooms.csv"
os.makedirs(os.path.dirname(reference_output_path), exist_ok=True)
df_reference.to_csv(reference_output_path)

In [12]:
# Write the processed Expedia rooms, creating the output folder first so the
# cell also works on a fresh checkout where data/processed does not exist yet.
# NOTE(review): the inputs are read from "Data/..." but this writes to
# "data/..."; on a case-sensitive filesystem those are different folders. Confirm.
expedia_output_path = "data/processed/expedia_rooms.csv"
os.makedirs(os.path.dirname(expedia_output_path), exist_ok=True)
df_expedia.to_csv(expedia_output_path)