In [8]:
import pandas as pd
import numpy as np

# Load the CSV file
# The relative path is resolved against the notebook's working directory.
data = pd.read_csv('dubai_derlenmis_reviews.csv')

# Display the first few rows of the dataset
# (the recorded output shows the columns hotel_name, hotel_city, review_date and hotel_review)
print(data.head())

                              hotel_name hotel_city  review_date  \
0  are_dubai_abc_almanar_hotel_apartment      dubai  Oct 26 2009   
1  are_dubai_abc_almanar_hotel_apartment      dubai  Apr 29 2009   
2  are_dubai_abc_almanar_hotel_apartment      dubai  Oct 26 2009   
3          are_dubai_admiral_plaza_hotel      dubai   Nov 2 2009   
4          are_dubai_admiral_plaza_hotel      dubai   Oct 7 2009   

                                        hotel_review  
0  Just came back after a week at this hotel. The...  
1  Room was nice and modern. Had reasonable size ...  
2                 May 11 2008 \tEher 3* als 5*-Hotel  
3  It was a good experience as the Hotel was situ...  
4  Good hotel offering value for money. Breakfast...  


In [9]:

import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

# Download necessary NLTK data files.
# Every resource used anywhere in this notebook is fetched here so that a
# fresh environment survives "Restart Kernel -> Run All":
#   - stopwords     : stopword list used when filtering tokens
#   - punkt         : tokenizer models used by word_tokenize (older NLTK releases)
#   - punkt_tab     : tokenizer tables used by word_tokenize (newer NLTK releases)
#   - vader_lexicon : lexicon loaded by SentimentIntensityAnalyzer
# nltk.download() is a no-op ("already up-to-date") when a package is present.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('vader_lexicon')


# Missing reviews become empty strings so the text preprocessing can call
# string methods on every cell without hitting NaN.
data['hotel_review'] = data['hotel_review'].fillna('')

# Function to preprocess text
# The patterns are compiled once. re.escape() is required because
# string.punctuation contains characters that are special inside a character
# class (notably the backslash and closing bracket); without escaping, the
# backslash is consumed as an escape and never stripped from the text.
_PUNCTUATION_RE = re.compile(f"[{re.escape(string.punctuation)}]")
_DIGITS_RE = re.compile(r"\d+")
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, built once and reused."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def preprocess_text(text, stop_words=None, tokenizer=None):
    """Normalise a review string and split it into filtered tokens.

    Steps, in order: lowercase, strip every ASCII punctuation character,
    strip runs of digits, tokenize, drop stopwords.

    Args:
        text: The raw review text (a ``str``).
        stop_words: Optional container of words to discard. Defaults to
            NLTK's English stopword list (loaded once and cached).
        tokenizer: Optional callable mapping a string to a list of tokens.
            Defaults to ``nltk.tokenize.word_tokenize``.

    Returns:
        A list of the remaining tokens, in their original order.
    """
    if stop_words is None:
        # Resolved lazily so the stopword corpus is read once, not per token.
        stop_words = _english_stopwords()
    if tokenizer is None:
        tokenizer = word_tokenize

    text = text.lower()  # Lowercase the text
    text = _PUNCTUATION_RE.sub("", text)  # Remove punctuation
    text = _DIGITS_RE.sub("", text)  # Remove numbers
    tokens = tokenizer(text)  # Tokenize the text
    return [word for word in tokens if word not in stop_words]  # Remove stopwords

# Tokenise each review; the cleaned token lists are stored in a new 'tokens' column
data['tokens'] = data['hotel_review'].map(preprocess_text)




[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\megeb\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\megeb\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [10]:
import numpy as np

# Read a GloVe-style text file into a {word: vector} dictionary
def load_glove_embeddings(glove_file):
    """Parse a whitespace-separated embeddings file.

    Each line is split on whitespace; the first field is used as the word and
    the remaining fields are converted to a float32 NumPy array. If a word
    occurs on several lines, the last occurrence wins.

    Args:
        glove_file: Path of the UTF-8 encoded text file to read.

    Returns:
        dict mapping each word to its float32 vector.
    """
    with open(glove_file, 'r', encoding='utf8') as handle:
        rows = (raw_line.split() for raw_line in handle)
        return {
            fields[0]: np.asarray(fields[1:], dtype='float32')
            for fields in rows
        }

# Location of the embeddings file, relative to the notebook's working directory.
# NOTE(review): the file name suggests the 100-dimensional "6B" GloVe vectors — confirm.
glove_file ='glovetxt/glove.6B.100d.txt'
# Parse the whole file into an in-memory {word: vector} dict.
embeddings_index = load_glove_embeddings(glove_file)

# Report how many words were loaded (the recorded run printed 400000).
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 400000 word vectors.


In [11]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.metrics.pairwise import cosine_similarity

# Initialize VADER sentiment analyzer
# One module-level instance; the scoring functions in this notebook read it as the global `sid`.
# NOTE(review): presumably requires the NLTK 'vader_lexicon' resource to be
# downloaded beforehand — confirm.
sid = SentimentIntensityAnalyzer()

# Review-level sentiment: mean VADER compound score of the tokens that exist in the embedding vocabulary
def sentiment_score(tokens, embeddings_index):
    """Average the per-word VADER compound scores of a tokenised review.

    The embeddings are used only as a vocabulary filter: a token contributes
    to the average when it is a key of ``embeddings_index``; the vectors
    themselves are never read. Each kept token is scored on its own with the
    module-level analyzer ``sid``.

    Args:
        tokens: Iterable of word strings.
        embeddings_index: Container supporting ``in`` for vocabulary lookup.

    Returns:
        The mean compound score of the kept tokens, or 0 when none is kept.
    """
    compound_scores = [
        sid.polarity_scores(token)['compound']
        for token in tokens
        if token in embeddings_index
    ]
    if not compound_scores:
        return 0
    return sum(compound_scores) / len(compound_scores)

# Score every review's token list and store the result in 'sentiment_score'
data['sentiment_score'] = data['tokens'].apply(sentiment_score, args=(embeddings_index,))


In [12]:
# Example keyword mapping for each attribute (this can be expanded).
# Keys are the attribute names; values are the words that signal the attribute.
attribute_keywords = {
    'cleanliness': ['clean', 'dirty', 'hygiene'],
    'room': ['room', 'suite', 'bed', 'space'],
    'service': ['service', 'staff', 'helpful', 'rude'],
    'location': ['location', 'area', 'near', 'far'],
    'value': ['value', 'price', 'cost', 'expensive', 'cheap'],
    'safety': ['safety', 'secure', 'dangerous'],
    'comfort': ['comfort', 'comfortable', 'uncomfortable'],
    'transportation': ['transportation', 'bus', 'train', 'subway'],
    'noise': ['noise', 'quiet', 'loud'],
}

# Attribute names in the order they are declared in the mapping above
attributes = list(attribute_keywords)

# Attribute-level sentiment: mean VADER compound score of the tokens that are keywords of one attribute
def attribute_sentiment_score(tokens, attribute_keywords, embeddings_index):
    """Average the VADER compound scores of the keyword tokens in a review.

    Only tokens that are members of ``attribute_keywords`` are scored, and
    each is scored on its own with the module-level analyzer ``sid``; the
    surrounding words of the review do not influence the result.
    NOTE(review): keywords that VADER treats as neutral would always score
    0 under this scheme — confirm this is intended.

    Args:
        tokens: Iterable of word strings.
        attribute_keywords: Container of keywords for a single attribute
            (the call site passes one value of the keyword mapping, not the
            whole mapping).
        embeddings_index: Accepted but not used by this function.

    Returns:
        The mean compound score of the matching tokens, or 0 when no token
        matches.
    """
    keyword_scores = [
        sid.polarity_scores(token)['compound']
        for token in tokens
        if token in attribute_keywords
    ]
    if not keyword_scores:
        return 0
    return sum(keyword_scores) / len(keyword_scores)

# Add one '<attribute>_sentiment_score' column per attribute
for attribute in attributes:
    data[f'{attribute}_sentiment_score'] = data['tokens'].apply(
        attribute_sentiment_score,
        args=(attribute_keywords[attribute], embeddings_index),
    )


In [13]:
def recommend_hotels(city, preferences, data):
    """Rank the rows of one city by a preference-weighted sentiment score.

    Args:
        city: City name; compared case-insensitively with ``hotel_city``.
        preferences: Mapping of attribute name -> numeric weight. For every
            key ``a`` the frame must contain a column ``'<a>_sentiment_score'``.
        data: DataFrame with a ``hotel_city`` column and the per-attribute
            score columns. It is not modified.

    Returns:
        A new DataFrame holding the rows of ``data`` for ``city`` plus a
        float ``weighted_score`` column, sorted by that column in descending
        order. Rows are not aggregated: there is one output row per matching
        input row, and the original index labels are kept.

    Raises:
        KeyError: If a preference has no matching score column.
    """
    # Filter by city. The explicit .copy() makes the result an independent
    # frame, so adding a column below is not a chained assignment on a slice
    # of `data` (which triggers pandas' SettingWithCopyWarning).
    in_city = data['hotel_city'].str.lower() == city.lower()
    city_data = data.loc[in_city].copy()

    # Calculate a weighted score based on user preferences
    city_data['weighted_score'] = 0.0
    for attribute, weight in preferences.items():
        city_data['weighted_score'] += city_data[f'{attribute}_sentiment_score'] * weight

    # Sort by weighted score and return top recommendations
    return city_data.sort_values(by='weighted_score', ascending=False)

# Example usage: rank Dubai rows with every attribute weighted by importance
city = 'dubai'
preferences = {
    'cleanliness': 1.0, 'room': 0.8, 'service': 0.9,
    'location': 0.7, 'value': 0.6, 'safety': 0.5,
    'comfort': 0.4, 'transportation': 0.3, 'noise': 0.2,
}

recommendations = recommend_hotels(city, preferences, data)

# Show the name and score of the five best-scoring rows
print(recommendations.loc[:, ['hotel_name', 'weighted_score']].head())


                                   hotel_name  weighted_score
1142                    are_dubai_ascot_hotel        1.189490
9321             are_dubai_rimal_rotana_dubai        1.087490
8595                 are_dubai_panorama_deira        1.087490
2765              are_dubai_flora_grand_hotel        0.999815
3183  are_dubai_golden_sands_hotel_apartments        0.999815
