## EDA and Modelling Based on Reviews

In this section, we will investigate and dive deeper into the data based on the hotel reviews and geolocation.

In [1]:
# import required libraries
import pandas as pd
import numpy as np
from langdetect import detect
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import folium
from folium import plugins
import ipywidgets
import geocoder
import geopy
import os
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import reverse_geocode
from sklearn.feature_extraction.text import CountVectorizer
import cufflinks as cf
cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)
#installing stylecloud library
#!pip install stylecloud 
import stylecloud



In [2]:
# Load the hotel review data stored as a pickle file.
# NOTE(review): unpickling can execute arbitrary code, so only load pickle
# files that come from a trusted source.
new_df_rev=pd.read_pickle('../data/review.pkl')

In [3]:
# Preview the first rows of the loaded data
new_df_rev.head()

Unnamed: 0,hotel_name,negative_review,positive_review,lat_x,lng_x,hotel_address,tags,lat_y,lng_y,location,country,city
0,11 Cadogan Gardens,Thought the prise of drinks at the bar a litt...,We were particularly impressed by the very wa...,51.493616,-0.159235,11 Cadogan Gardens Sloane Square Kensington an...,"[' Leisure trip ', ' Couple ', ' Superior Quee...",51.493616,-0.159235,"[{'country_code': 'GB', 'city': 'Chelsea', 'co...",United Kingdom,Chelsea
1,1K Hotel,Air conditioning in room didn t work and desp...,Location good close to le Marais and 3e arron...,48.863932,2.365874,13 Boulevard Du Temple 3rd arr 75003 Paris France,"[' Leisure trip ', ' Couple ', ' Superior M Do...",48.863932,2.365874,"[{'country_code': 'FR', 'city': 'Paris', 'coun...",France,Paris
2,25hours Hotel beim MuseumsQuartier,Breakfast not included and buffet really expe...,Cool vintage style in the middle of the museu...,48.206474,16.35463,Lerchenfelder Stra e 1 3 07 Neubau 1070 Vienna...,"[' Leisure trip ', ' Solo traveler ', ' Standa...",48.206474,16.35463,"[{'country_code': 'AT', 'city': 'Vienna', 'cou...",Austria,Vienna
3,41,"There wasn t a thing that we didn t like , No...",Its central proximity close to all services a...,51.498147,-0.143649,41 Buckingham Palace Road Westminster Borough ...,"[' Leisure trip ', ' Couple ', ' Executive Kin...",51.498147,-0.143649,"[{'country_code': 'GB', 'city': 'West End of L...",United Kingdom,West End of London
4,45 Park Lane Dorchester Collection,More kinds of fruit juice will make the mini ...,Everything here are almost perfect the staffs...,51.506371,-0.151536,45 Park Lane Westminster Borough London W1K 1P...,"[' Leisure trip ', ' Solo traveler ', ' Execut...",51.506371,-0.151536,"[{'country_code': 'GB', 'city': 'West End of L...",United Kingdom,West End of London


In [4]:
# Keep only the columns needed for the review analysis.
# .copy() makes hotel_review an independent frame, so the column assignments
# in the following cells do not raise SettingWithCopyWarning on a slice of
# new_df_rev.
hotel_review = new_df_rev[
    ['hotel_name', 'positive_review', 'negative_review', 'city', 'lat_x', 'lng_x']
].copy()

In [5]:
# Concatenate each hotel's positive and negative review text into one column
hotel_review['review_text'] = (
    hotel_review['positive_review']
    .astype(str)
    .str.cat(hotel_review['negative_review'].astype(str))
)


In [6]:
# Detect the language of every combined review text.
# NOTE(review): langdetect is documented as non-deterministic unless
# DetectorFactory.seed is set - consider seeding it for reproducible runs.
hotel_review['lang'] = hotel_review['review_text'].apply(detect)

In [7]:
# Count rows per detected language to see which languages are present
hotel_review['lang'].value_counts()

lang
en    1474
Name: count, dtype: int64

In [8]:
# Store scikit-learn's built-in English stop word list (ENGLISH_STOP_WORDS) as a set
stops_rev = set(ENGLISH_STOP_WORDS)


## Feature Engineering and Text Visualisation

In the following section, we are interested in understanding the frequency of the top words and n-grams, and how those rankings change once stop words are removed. We have plotted these visualisations to understand the data set.

In [9]:
def get_top_n_words(corpus, n=None,y=None):
    """Return the most frequent n-grams in ``corpus`` (stop words are kept).

    Parameters
    ----------
    corpus : iterable of str
        Documents to count n-grams in.
    n : int or None
        Number of entries to return; ``None`` returns every n-gram.
    y : tuple of (min_n, max_n) or None
        ``ngram_range`` handed to ``CountVectorizer``. ``None`` falls back to
        unigrams ``(1, 1)``, because ``CountVectorizer`` does not accept
        ``None`` for this parameter.

    Returns
    -------
    list of (str, int) tuples
        ``(ngram, total_count)`` pairs sorted by count, highest first.
    """
    ngram_range = (1, 1) if y is None else y
    vec = CountVectorizer(ngram_range=ngram_range).fit(corpus)
    bag_of_words = vec.transform(corpus)
    # Column sums give the total number of occurrences of each n-gram.
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]


In [10]:
def get_top_n_words_with_stop_words(corpus, n=None,y=None):
    """Return the most frequent n-grams in ``corpus`` after removing stop words.

    Despite its name, this function *filters out* the stop words held in the
    module-level ``stops_rev`` before counting.

    Parameters
    ----------
    corpus : iterable of str
        Documents to count n-grams in.
    n : int or None
        Number of entries to return; ``None`` returns every n-gram.
    y : tuple of (min_n, max_n) or None
        ``ngram_range`` handed to ``CountVectorizer``. ``None`` falls back to
        unigrams ``(1, 1)``, because ``CountVectorizer`` does not accept
        ``None`` for this parameter.

    Returns
    -------
    list of (str, int) tuples
        ``(ngram, total_count)`` pairs sorted by count, highest first.
    """
    ngram_range = (1, 1) if y is None else y
    # CountVectorizer's parameter validation accepts 'english', a list or
    # None for stop_words, so the set is converted to a list.
    vec = CountVectorizer(ngram_range=ngram_range, stop_words=list(stops_rev)).fit(corpus)
    bag_of_words = vec.transform(corpus)
    # Column sums give the total number of occurrences of each n-gram.
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]


In [11]:
# Number of whitespace-separated tokens in each combined review text
hotel_review['word_count'] = hotel_review['review_text'].map(
    lambda review: len(str(review).split())
)
desc_lengths = hotel_review['word_count'].tolist()

# Summary statistics of the word counts
print(
    f"Number of descriptions: {len(desc_lengths)} \n"
    f"Average word count {np.average(desc_lengths)} \n"
    f"Minimum word count {min(desc_lengths)} \n"
    f"Maximum word count {max(desc_lengths)}"
)


Number of descriptions: 1474 
Average word count 12143.495251017639 
Minimum word count 183 
Maximum word count 172330


In [12]:
# Histogram (100 bins) of the per-row word counts computed above
hotel_review['word_count'].iplot(
    kind='hist',
    bins = 100,
    linecolor='black',
    xTitle='Word Count',
    yTitle='Count',
    title='Word Count Distribution in Hotel Description')

## Unigram Text Visualisation 

### Before Removing Stopwords

In [13]:
# Top 20 unigrams with stop words still included, drawn as a horizontal bar chart
common_words = get_top_n_words(hotel_review['review_text'], n=20, y=(1, 1))
df3 = pd.DataFrame(common_words, columns=['review_text', 'count'])
(
    df3.groupby('review_text')['count']
    .sum()
    .sort_values()
    .iplot(
        kind='barh',
        yTitle='Count',
        linecolor='black',
        title='Top 20 words in hotel description before removing stop words',
    )
)


### After Removing Stopwords

In [14]:
def get_top_n_words_with_stop_words(corpus, n=None, y=None):
    """Return the most frequent n-grams in ``corpus`` after removing stop words.

    This definition replaces the earlier function of the same name in this
    notebook; it uses scikit-learn's built-in ``'english'`` stop word list.
    Despite its name, the stop words are *filtered out* before counting.

    Parameters
    ----------
    corpus : iterable of str
        Documents to count n-grams in.
    n : int or None
        Number of entries to return; ``None`` returns every n-gram.
    y : tuple of (min_n, max_n) or None
        ``ngram_range`` handed to ``CountVectorizer``. ``None`` falls back to
        unigrams ``(1, 1)``, because ``CountVectorizer`` does not accept
        ``None`` for this parameter.

    Returns
    -------
    list of (str, int) tuples
        ``(ngram, total_count)`` pairs sorted by count, highest first.
    """
    ngram_range = (1, 1) if y is None else y
    vec = CountVectorizer(ngram_range=ngram_range, stop_words='english').fit(corpus)
    bag_of_words = vec.transform(corpus)
    # Column sums give the total number of occurrences of each n-gram.
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]

# Top 20 unigrams once stop words are removed, drawn as a horizontal bar chart
common_words = get_top_n_words_with_stop_words(hotel_review['review_text'], n=20, y=(1, 1))
df4 = pd.DataFrame(common_words, columns=['review_text', 'count'])
(
    df4.groupby('review_text')['count']
    .sum()
    .sort_values()
    .iplot(
        kind='barh',
        yTitle='Count',
        linecolor='black',
        title='Top 20 words in hotel description after removing stop words',
    )
)


## Bi-gram Text Visualisation
### Before Removing Stopwords

In [15]:
# Top 20 bigrams with stop words still included
common_words = get_top_n_words(hotel_review['review_text'], n=20, y=(2, 2))
df5 = pd.DataFrame(common_words, columns=['review_text', 'count'])
(
    df5.groupby('review_text')['count']
    .sum()
    .sort_values(ascending=False)
    .iplot(
        kind='bar',
        yTitle='Count',
        linecolor='black',
        title='Top 20 bigrams in hotel description before removing stop words',
    )
)


### After Removing Stopwords

In [16]:
# Top 20 bigrams once stop words are removed
common_words = get_top_n_words_with_stop_words(hotel_review['review_text'], n=20, y=(2, 2))
df6 = pd.DataFrame(common_words, columns=['review_text', 'count'])
(
    df6.groupby('review_text')['count']
    .sum()
    .sort_values(ascending=False)
    .iplot(
        kind='bar',
        yTitle='Count',
        linecolor='black',
        title='Top 20 bigrams in hotel description After removing stop words',
    )
)

## Tri-gram Text Visualisation

### Before removing stopwords

In [17]:
# Top 20 trigrams with stop words still included
common_words = get_top_n_words(hotel_review['review_text'], n=20, y=(3, 3))
df7 = pd.DataFrame(common_words, columns=['review_text', 'count'])
(
    df7.groupby('review_text')['count']
    .sum()
    .sort_values(ascending=False)
    .iplot(
        kind='bar',
        yTitle='Count',
        linecolor='black',
        title='Top 20 trigrams in hotel description before removing stop words',
    )
)


### After removing stopwords

In [18]:

# Top 20 trigrams once stop words are removed
common_words = get_top_n_words_with_stop_words(hotel_review['review_text'], n=20, y=(3, 3))
df8 = pd.DataFrame(common_words, columns=['review_text', 'count'])
(
    df8.groupby('review_text')['count']
    .sum()
    .sort_values(ascending=False)
    .iplot(
        kind='bar',
        yTitle='Count',
        linecolor='black',
        title='Top 20 trigrams in hotel description after removing stop words',
    )
)


# Modelling

Based on the visualisations, bi-grams and tri-grams carry more information for the modelling process than single words. They give us more context about the reviews, so they make more sense as features for the recommender model.

In [19]:
# Raw strings keep the regex escapes (\[, \], \|) away from Python's own
# string-escape processing, which otherwise reports an invalid escape sequence.
replace_space = re.compile(r'[/(){}\[\]\|@,;]')  # punctuation that is turned into a space
symbol = re.compile(r'[^0-9a-z #+_]')            # every other character that is not kept
stopwordset = stops_rev

def clean_text(text):
    """Normalise a review string for vectorisation.

    Steps, in order:
    1. lowercase the text;
    2. replace each character matched by ``replace_space`` with a space;
    3. delete each character matched by ``symbol`` (anything other than
       digits, lowercase a-z, space, ``#``, ``+`` and ``_``);
    4. drop every whitespace-separated token found in ``stopwordset``.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    text = text.lower()
    text = replace_space.sub(' ', text)
    text = symbol.sub('', text)
    return ' '.join(word for word in text.split() if word not in stopwordset)
    


In [32]:
# Apply clean_text to every combined review and store the result in a new column,
# then display the raw positive-review text at index label 0
hotel_review['review_text_clean'] = hotel_review['review_text'].apply(clean_text)
hotel_review['positive_review'][0]

' We were particularly impressed by the very warm welcome we received a lovely hotel in a superb location good size room with excellent bathroom We will come back and already have recommended it to friends and relatives ,  The atmosphere and staff were excellent just what you would expect from a small luxury hotel Breakfast and restaurant food delicious and reasonably priced ,  Bed was amazingly comfortable The building is Full of character and class The staff where excellent and most helpful I really enjoyed staying there and would stay again Didn t get the opportunity to enjoy the breakfast maybe another time ,  Lovely hotel,  Concierge service excellent Bed very comfortable and lux Lounges and bar very comfortable and elegantly furnished Location perfect ,  Customer service was above and beyond from all staff Richie Long is attentive and nothing was too much trouble in fact all the staff were amazing shout out to Richie Emerson and Christian,  Everything Most comfortable bed Extreme

In [33]:
# Display the frame, now including the derived review_text, lang, word_count and review_text_clean columns
hotel_review

Unnamed: 0,hotel_name,positive_review,negative_review,city,lat_x,lng_x,review_text,lang,word_count,review_text_clean
0,11 Cadogan Gardens,We were particularly impressed by the very wa...,Thought the prise of drinks at the bar a litt...,Chelsea,51.493616,-0.159235,We were particularly impressed by the very wa...,en,5437,particularly impressed warm welcome received l...
1,1K Hotel,Location good close to le Marais and 3e arron...,Air conditioning in room didn t work and desp...,Paris,48.863932,2.365874,Location good close to le Marais and 3e arron...,en,5777,location good close le marais 3e arrondissemen...
2,25hours Hotel beim MuseumsQuartier,Cool vintage style in the middle of the museu...,Breakfast not included and buffet really expe...,Vienna,48.206474,16.354630,Cool vintage style in the middle of the museu...,en,25370,cool vintage style middle museum quarter metro...
3,41,Its central proximity close to all services a...,"There wasn t a thing that we didn t like , No...",West End of London,51.498147,-0.143649,Its central proximity close to all services a...,en,3411,central proximity close services restaurants s...
4,45 Park Lane Dorchester Collection,Everything here are almost perfect the staffs...,More kinds of fruit juice will make the mini ...,West End of London,51.506371,-0.151536,Everything here are almost perfect the staffs...,en,476,perfect staffs friendly room comfortable way s...
...,...,...,...,...,...,...,...,...,...,...
1469,citizenM London Bankside,"No, What an amazing and unique hotel A short ...",This was our third stay at this hotel and it ...,City of London,51.505151,-0.100472,"No, What an amazing and unique hotel A short ...",en,50772,amazing unique hotel short cab ride 15 minute ...
1470,citizenM London Shoreditch,Swift auto check in Good bar and lounge area ...,Lifts need reprogramming exasperating journey...,Barbican,51.524137,-0.078698,Swift auto check in Good bar and lounge area ...,en,29152,swift auto check good bar lounge area decent i...
1471,citizenM Tower of London,The use of technology was impressive UBS outl...,"Rooms are small but well designed, breakfast...",City of London,51.510237,-0.076443,The use of technology was impressive UBS outl...,en,65052,use technology impressive ubs outlets using ta...
1472,every hotel Piccadilly,The location was the only great aspect of thi...,The hotel overall requires an update furnitur...,London,51.510146,-0.131506,The location was the only great aspect of thi...,en,15926,location great aspect hotel central size rooms...


In [48]:
# Show All reviews

def get_individual_reviews_for_city(city, hotel_data):
    """Return all positive review phrases of the top hotel in ``city``.

    The "top" hotel is the row in ``city`` whose ``positive_review`` text is
    the longest (text length is used as a proxy for the number of reviews).
    The reviews are read from that same row, so a hotel with the same name in
    another city can never be picked up by mistake.

    Parameters
    ----------
    city : str
        Value of the ``city`` column to filter on.
    hotel_data : pandas.DataFrame
        Needs the columns ``city``, ``hotel_name`` and ``positive_review``.
        NOTE(review): assumes unique index labels - confirm for other inputs.

    Returns
    -------
    tuple
        ``(hotel_name, phrases)``. ``hotel_name`` is ``None`` when no hotel is
        located in ``city``; ``phrases`` is always a list and holds a single
        explanatory message when there is nothing to show.
    """
    city_hotels = hotel_data[hotel_data['city'] == city]

    if city_hotels.empty:
        return None, [f"No hotels found in {city}."]

    # Missing review text counts as empty so the length ranking cannot fail.
    positive_text = city_hotels['positive_review'].fillna('').astype(str)

    # Row with the longest positive-review text within the city
    best_index = positive_text.str.len().idxmax()
    hotel_name = city_hotels.loc[best_index, 'hotel_name']

    # Individual reviews are separated by ', '; blank fragments are dropped so
    # the "no reviews" branch below is reachable.
    stripped = (phrase.strip() for phrase in positive_text.loc[best_index].split(', '))
    reviews_phrases = [phrase for phrase in stripped if phrase]

    if not reviews_phrases:
        return hotel_name, [f"No reviews found for {hotel_name}."]

    return hotel_name, reviews_phrases

# Example usage
city_to_check = 'Paris'
hotel_name_most, reviews_for_city = get_individual_reviews_for_city(city_to_check, hotel_review)

# The function signals "no hotels in this city" with a None hotel name; the
# second element is always a list, so the name is what has to be tested.
if hotel_name_most is None or not reviews_for_city:
    print(f"No reviews found for hotels in {city_to_check}.")
else:
    print(f"Reviews for the hotel with the most positive reviews in {city_to_check} - {hotel_name_most}:\n")

    for index, review_text in enumerate(reviews_for_city, start=1):
        print(f"Review {index}: {review_text}\n")


Reviews for the hotel with the most positive reviews in Paris - Hotel Regina:

Review 1: Building is very old but nicely maintained Bed quality was excellent and quality of toilettes are very good We booked family suite and there are three single beds in children room but we have a teenage daughter so they gave us two connecting rooms later which were much better The rooms are facing back of building which is better than facing road because it is more quiet and you can keep window open

Review 2: Beautiful hotel in a great location We booked the Eiffel tower junior suite which was lovely with a great view The bed nice and big but very hard I know some prefer hard to soft Good and friendly staff on reception

Review 3: THE CITY

Review 4: This is my favorite hotel I have stayed here three times and never been disappointed It is located right next to the Louvre and the Tuileries Gardens near a stop on metro line 1 and within easy walking distance of the Paris Opera and the Musee d Orsay 

In [59]:
# Show Random Reviews 

import random

def get_individual_reviews_for_city(city, hotel_data, num_reviews=10, seed=None):
    """Return a random sample of positive review phrases of the top hotel in ``city``.

    The "top" hotel is the row in ``city`` whose ``positive_review`` text is
    the longest (text length is used as a proxy for the number of reviews).
    The reviews are read from that same row, so a hotel with the same name in
    another city can never be picked up by mistake.

    Parameters
    ----------
    city : str
        Value of the ``city`` column to filter on.
    hotel_data : pandas.DataFrame
        Needs the columns ``city``, ``hotel_name`` and ``positive_review``.
        NOTE(review): assumes unique index labels - confirm for other inputs.
    num_reviews : int
        Maximum number of phrases to return.
    seed : int or None
        When given, the sample is drawn from a private ``random.Random(seed)``
        so the selection is reproducible; ``None`` keeps using the module-level
        random generator.

    Returns
    -------
    tuple
        ``(hotel_name, phrases)``. ``hotel_name`` is ``None`` when no hotel is
        located in ``city``; ``phrases`` is always a list and holds a single
        explanatory message when there is nothing to show.
    """
    city_hotels = hotel_data[hotel_data['city'] == city]

    if city_hotels.empty:
        return None, [f"No hotels found in {city}."]

    # Missing review text counts as empty so the length ranking cannot fail.
    positive_text = city_hotels['positive_review'].fillna('').astype(str)

    # Row with the longest positive-review text within the city
    best_index = positive_text.str.len().idxmax()
    hotel_name = city_hotels.loc[best_index, 'hotel_name']

    # Individual reviews are separated by ', '; blank fragments are dropped so
    # the "no reviews" branch below is reachable.
    stripped = (phrase.strip() for phrase in positive_text.loc[best_index].split(', '))
    reviews_phrases = [phrase for phrase in stripped if phrase]

    if not reviews_phrases:
        return hotel_name, [f"No reviews found for {hotel_name}."]

    # Display only a random subset of reviews
    sampler = random if seed is None else random.Random(seed)
    selected_reviews = sampler.sample(reviews_phrases, min(num_reviews, len(reviews_phrases)))

    return hotel_name, selected_reviews

# Example usage
city_to_check = 'Paris'
hotel_name, reviews_for_city = get_individual_reviews_for_city(city_to_check, hotel_review, num_reviews=10)

# The function signals "no hotels in this city" with a None hotel name; the
# second element is always a list, so the name is what has to be tested.
if hotel_name is None or not reviews_for_city:
    print(f"No reviews found for hotels in {city_to_check}.")
else:
    print(f"Reviews for the hotel with the most positive reviews in {city_to_check} - {hotel_name}:\n")

    for index, review_text in enumerate(reviews_for_city, start=1):
        print(f"Review {index}: {review_text}\n")


Reviews for the hotel with the most positive reviews in Paris - Hotel Regina:

Review 1: The staff were very helpful which along with the setting made a very special trip a fantastic one

Review 2: Location and friendly staff

Review 3: The hotel is just a couple of minutes walk from La Rambla and is in a very central location without being disturbed by traffic noise Reception staff were extremely helpful the room was large comfortable and cleaned beautifully each day Can t comment on the food here as we ate out

Review 4: Great very large corner room with a wonderful view

Review 5: The location is excellent for exploring the historical centre of Milan and around The tram is right out the front and a metro station not far away The hotel is comfortable

Review 6: Staff where very helpful especially the concierge

Review 7: The staff was very helpfull and smiling Great service

Review 8: Good breakfast The WiFi was reasonably fast good for work Chocolate on the pillow Small cake at entr

In [58]:
# Show First 10 reviews

def get_individual_reviews_for_city(city, hotel_data, num_reviews=10):
    """Return the first ``num_reviews`` positive review phrases of the top hotel in ``city``.

    The "top" hotel is the row in ``city`` whose ``positive_review`` text is
    the longest (text length is used as a proxy for the number of reviews).
    The reviews are read from that same row, so a hotel with the same name in
    another city can never be picked up by mistake.

    Parameters
    ----------
    city : str
        Value of the ``city`` column to filter on.
    hotel_data : pandas.DataFrame
        Needs the columns ``city``, ``hotel_name`` and ``positive_review``.
        NOTE(review): assumes unique index labels - confirm for other inputs.
    num_reviews : int
        Maximum number of phrases to return, taken from the start of the text.

    Returns
    -------
    tuple
        ``(hotel_name, phrases)``. ``hotel_name`` is ``None`` when no hotel is
        located in ``city``; ``phrases`` is always a list and holds a single
        explanatory message when there is nothing to show.
    """
    city_hotels = hotel_data[hotel_data['city'] == city]

    if city_hotels.empty:
        return None, [f"No hotels found in {city}."]

    # Missing review text counts as empty so the length ranking cannot fail.
    positive_text = city_hotels['positive_review'].fillna('').astype(str)

    # Row with the longest positive-review text within the city
    best_index = positive_text.str.len().idxmax()
    hotel_name = city_hotels.loc[best_index, 'hotel_name']

    # Individual reviews are separated by ', '; blank fragments are dropped so
    # the "no reviews" branch below is reachable.
    stripped = (phrase.strip() for phrase in positive_text.loc[best_index].split(', '))
    reviews_phrases = [phrase for phrase in stripped if phrase]

    if not reviews_phrases:
        return hotel_name, [f"No reviews found for {hotel_name}."]

    return hotel_name, reviews_phrases[:num_reviews]

# Example usage
city_to_check = 'Paris'
hotel_name, reviews_for_city = get_individual_reviews_for_city(city_to_check, hotel_review, num_reviews=10)

# The function signals "no hotels in this city" with a None hotel name; the
# second element is always a list, so the name is what has to be tested.
if hotel_name is None or not reviews_for_city:
    print(f"No reviews found for hotels in {city_to_check}.")
else:
    print(f"Reviews for the hotel with the most positive reviews in {city_to_check} - {hotel_name}:\n")

    for index, review_text in enumerate(reviews_for_city, start=1):
        print(f"Review {index}: {review_text}\n")


Reviews for the hotel with the most positive reviews in Paris - Hotel Regina:

Review 1: Building is very old but nicely maintained Bed quality was excellent and quality of toilettes are very good We booked family suite and there are three single beds in children room but we have a teenage daughter so they gave us two connecting rooms later which were much better The rooms are facing back of building which is better than facing road because it is more quiet and you can keep window open

Review 2: Beautiful hotel in a great location We booked the Eiffel tower junior suite which was lovely with a great view The bed nice and big but very hard I know some prefer hard to soft Good and friendly staff on reception

Review 3: THE CITY

Review 4: This is my favorite hotel I have stayed here three times and never been disappointed It is located right next to the Louvre and the Tuileries Gardens near a stop on metro line 1 and within easy walking distance of the Paris Opera and the Musee d Orsay 

The sklearn library offers several ways to compute the cosine similarity between documents. Here we use `TfidfVectorizer` from `sklearn.feature_extraction.text` to vectorize the words; it computes the TF-IDF weights and normalizes each document vector. We then take the dot product of the vectors with `linear_kernel`. Because the TF-IDF vectors are L2-normalized by default, the linear kernel gives the same result as the cosine similarity, but is faster to compute.

In [21]:
# TF-IDF features over bigrams and trigrams of the cleaned review text.
# min_df=1 (the library default) keeps every n-gram that occurs in at least one
# document - the same vocabulary the former min_df=0 selected - and is accepted
# by scikit-learn releases that require an integer min_df to be >= 1.
tf = TfidfVectorizer(analyzer='word', ngram_range=(2, 3), min_df=1, stop_words='english')
tfidf_matrix = tf.fit_transform(hotel_review['review_text_clean'])


In [22]:
# Pairwise dot products between the TF-IDF rows of tfidf_matrix (one row and one column per input row).
# NOTE(review): this equals cosine similarity only while the rows are L2-normalised, which is
# TfidfVectorizer's documented default (norm='l2') - confirm if the vectoriser settings change.

cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)


In [23]:
# Recommend hotels in a target city whose reviews are most similar to a reference hotel
def new_recommendations(name,city, cosine_similarities):
    """Rank the hotels located in ``city`` by review similarity to the hotel ``name``.

    Reads the module-level ``hotel_review`` frame.

    Parameters
    ----------
    name : str
        ``hotel_name`` of the reference hotel; the first matching row is used.
    city : str
        Value of the ``city`` column the recommendations are restricted to.
    cosine_similarities : 2-D array-like
        Similarity matrix; the row of the reference hotel is read.
        NOTE(review): rows are addressed with ``hotel_review`` index labels, so
        this assumes the default 0..n-1 index in the same row order that was
        used to build the matrix - confirm.

    Returns
    -------
    dict or None
        ``{1: top, 2: top, ..., k: top}`` where ``top`` maps up to ten hotel
        names (most similar first) to ``[lat_x, lng_x]`` and ``k == len(top)``.
        Every key refers to the same ``top`` dict; this is the layout
        ``get_hotel_fn_pin`` reads. ``None`` is returned, after printing a
        message, when no hotel is located in ``city``. The reference hotel
        itself is not excluded when it lies in ``city``.

    Raises
    ------
    IndexError
        If no hotel is called ``name``.
    """
    # index label of the hotel that matches the name
    matches = hotel_review.index[hotel_review.hotel_name == name]
    if len(matches) == 0:
        raise IndexError(f"Hotel {name!r} not found in hotel_review")
    idx = matches[0]

    # index labels of the hotels in the requested city; a set gives O(1) lookups
    city_index = set(hotel_review.index[hotel_review.city == city])

    # similarity scores of every hotel against the reference hotel, best first
    score_series = pd.Series(cosine_similarities[idx]).sort_values(ascending = False)

    # names of the hotels in the city, ordered by decreasing similarity
    recommended_hotels = [
        hotel_review.at[i, 'hotel_name'] for i in score_series.index if i in city_index
    ]

    if not recommended_hotels:
        print("There are no hotels of similar hotel")
        return None

    # hotel name -> [latitude, longitude]
    records = hotel_review[['hotel_name','lat_x','lng_x']].to_dict(orient='records')
    coordinates = {row['hotel_name']: [row['lat_x'], row['lng_x']] for row in records}

    output = {hotel: coordinates[hotel] for hotel in recommended_hotels[:10]}
    return {rank: output for rank in range(1, len(output) + 1)}

In [24]:
# Raw positive-review text at index label 1, displayed for reference
positivismodoone = hotel_review['positive_review'][1]
positivismodoone

' Location good close to le Marais and 3e arrondissement Fitness room not great but adequate,  Comfortable beds welcoming staff cleanliness was very good,  The hotel is good The personal was nice and the installation very good The breakfast time interval excellent ,  Great location almost next to the metro station a ten minute metro journey to the centre of Paris,  Nice staff clean rooms with a decent service Close to metro and bus to reach the main tourist areas of the city but not much else nearby Well equipped gym that almost nobody uses ,  The position for public transport was excellent the metro is right there So we could get around Paris super easy the room was very small but had everything we could need the staff where very helpful and even arranged to change our room as the first one was on the road and we like to open our windows at night while we sleep There where quick to change and the new room was perfect they had an awesome restaurant on the premises and a funky little ni

In [25]:
# Hotels in Paris ranked by similarity to the reviews of 'The Belgrave Hotel'
new_recommendations('The Belgrave Hotel','Paris',cosine_similarities)

{1: {'Hotel Regina': [48.8637503, 2.3320406],
  'Holiday Inn Paris Gare de l Est': [48.8758981, 2.3590504],
  'Hotel Saint Petersbourg Opera': [48.872174, 2.328075],
  'Best Western Premier Op ra Faubourg Ex Hotel Jules ': [48.8753359,
   2.3414617],
  'K K Hotel Cayr Saint Germain des Pr s': [48.8553117, 2.3254628],
  'Saint James Albany Paris Hotel Spa': [48.8642689, 2.3308179],
  'Little Palace Hotel': [48.8675674, 2.3539896],
  'Acad mie Hotel Saint Germain': [48.855263, 2.3305901],
  'Hotel Horset Op ra Best Western Premier Collection': [48.8691686,
   2.3337818],
  'Novotel Paris Les Halles': [48.8607299, 2.3465326]},
 2: {'Hotel Regina': [48.8637503, 2.3320406],
  'Holiday Inn Paris Gare de l Est': [48.8758981, 2.3590504],
  'Hotel Saint Petersbourg Opera': [48.872174, 2.328075],
  'Best Western Premier Op ra Faubourg Ex Hotel Jules ': [48.8753359,
   2.3414617],
  'K K Hotel Cayr Saint Germain des Pr s': [48.8553117, 2.3254628],
  'Saint James Albany Paris Hotel Spa': [48.86426

In [26]:
# Build a folium map of a city with one numbered pin per recommended hotel
def get_hotel_fn_pin(mydict,city):
    """Return a folium map centred on ``city`` with the recommended hotels pinned.

    Parameters
    ----------
    mydict : dict or None
        Result of ``new_recommendations``: ``{rank: {hotel_name: [lat, lng]}}``
        with ranks ``1..k``. The pin for rank ``i`` uses the ``i``-th entry of
        ``mydict[i]``. ``None`` or an empty dict produces a map without pins.
    city : str
        Place name looked up with ``geocoder.osm`` to centre the map.

    Returns
    -------
    folium.Map

    Raises
    ------
    ValueError
        If ``city`` cannot be geocoded.
    """
    loc2 = geocoder.osm(city)
    if not loc2.ok:
        raise ValueError(f"Could not geocode city {city!r}")

    # map
    main_map = folium.Map(location=[loc2.lat, loc2.lng], zoom_start=13)
    # 'OpenStreetMap' is the built-in tile name; other spellings are treated as
    # a custom tile URL by current folium and rejected for lacking attribution.
    folium.raster_layers.TileLayer('OpenStreetMap').add_to(main_map)

    # one marker per rank; `mydict or {}` tolerates the "no recommendations" case
    for i in range(1, len(mydict or {}) + 1):
        hotel, coordinates = list(mydict[i].items())[i - 1]
        folium.Marker(
            location=coordinates,
            tooltip=hotel,
            popup=hotel,
            icon=plugins.BeautifyIcon(number=i,
                                      icon='bus',
                                      border_color='blue',
                                      border_width=0.5,
                                      text_color='red',
                                      inner_icon_style='margin-top:0px;'),
        ).add_to(main_map)

    return main_map


In [27]:
# Save the Paris recommendation map as an HTML file in the image folder.
# The folder is created first because saving the map does not create missing directories.
os.makedirs('../image', exist_ok=True)
get_hotel_fn_pin(new_recommendations('The Belgrave Hotel','Paris',cosine_similarities),'Paris').save(os.path.join('../image', 'reviews.html'))


In [28]:
# Display a map of Vienna with pins for the hotels recommended from 'The Belgrave Hotel'
get_hotel_fn_pin(new_recommendations('The Belgrave Hotel','Vienna',cosine_similarities),'Vienna')