## EDA and Modelling Based on Hotel Attributes


In this section, we will investigate and dive deeper into the data based on the hotel reviews and geolocation.

In [1]:
#import required libraires
import pandas as pd
import numpy as np
from langdetect import detect
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import folium
from folium import plugins
import ipywidgets
import geocoder
import geopy
import os
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import reverse_geocode
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import cufflinks as cf
cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)
#installing stylecloud library
#!pip install stylecloud 
import stylecloud
import string
from math import radians, sin, cos, sqrt, atan2
import folium
from folium.plugins import MarkerCluster

In [2]:
#importing data 
hoteltags=pd.read_pickle('../data/hoteltag.pkl')
geocode=pd.read_pickle('../data/geocode.pkl')

In [3]:
# merging geocode into dataset
hoteltags_geo= pd.merge(hoteltags,geocode,on='hotel_name',how='outer')

In [4]:
hoteltags_geo.head()

Unnamed: 0,hotel_name,tags,lat_x,lng_x,lat_y,lng_y,location,country,city
0,11 Cadogan Gardens,"[' Leisure trip ', ' Couple ', ' Superior Quee...",51.493616,-0.159235,51.493616,-0.159235,"[{'country_code': 'GB', 'city': 'Chelsea', 'co...",United Kingdom,Chelsea
1,1K Hotel,"[' Leisure trip ', ' Couple ', ' Superior M Do...",48.863932,2.365874,48.863932,2.365874,"[{'country_code': 'FR', 'city': 'Paris', 'coun...",France,Paris
2,25hours Hotel beim MuseumsQuartier,"[' Leisure trip ', ' Solo traveler ', ' Standa...",48.206474,16.35463,48.206474,16.35463,"[{'country_code': 'AT', 'city': 'Vienna', 'cou...",Austria,Vienna
3,41,"[' Leisure trip ', ' Couple ', ' Executive Kin...",51.498147,-0.143649,51.498147,-0.143649,"[{'country_code': 'GB', 'city': 'West End of L...",United Kingdom,West End of London
4,45 Park Lane Dorchester Collection,"[' Leisure trip ', ' Solo traveler ', ' Execut...",51.506371,-0.151536,51.506371,-0.151536,"[{'country_code': 'GB', 'city': 'West End of L...",United Kingdom,West End of London


In [5]:
#subsetting dataset
hoteltags_geo=hoteltags_geo[['hotel_name','tags','lat_x','lng_x','city']]

The hotel tags are saved as string. We will need to write a function to extract each attribute and store in tags.

In [6]:
exclude = set(string.punctuation)
def clean(x):
    return set([''.join(ch for ch in i.lower() if ch not in exclude).strip() for i in x[2:][:-2].split(',')])

In [7]:
# applying to clean the tags and assigning them to a new column
hoteltags_geo['new_tags'] = hoteltags_geo['tags'].map(clean)


## Feature Engineering

This encoding is needed for feeding categorical data. We store every different attributes in columns that contain either 1 or 0. 1 indicates that a hotel has that attribute and 0 shows that it doesn’t. 

In order to understand what are the key attributes in the dataframe, set was used to store respective hotel's attributes. This allows us to not keep duplicating attributes.

In [8]:

# global variable
tag_sum_list = []

def get_tag_sum_elems(tag_sum_string):
    global tag_sum_list # use the global variable
    # extend the global variable with this_list
    tag_sum_list.extend(tag_sum_string)
    return True

for i in hoteltags_geo['new_tags']:
    get_tag_sum_elems(i)

tag_sum_set = set(tag_sum_list)
print(tag_sum_set)
len(tag_sum_set)

{'family room with sofa bed', 'junior suite triple', 'grand deluxe contemporary room', 'standard double room with bridge view', '2 double rooms next to each other', 'royal double room', 'superior double room with massage chair', 'standard triple room', 'one bedroom suite with sofa bed', 'stayed 21 nights', 'business double room with wifi and breakfast', 'one bedroom suite with air conditioning', 'king room with eiffel tower view', 'classic double room', 'superior twin room with park view', 'queen room mobility access', 'junior king suite', 'junior suite with free bottle of champagne', '2 twin beds guest room', 'one bedroom suite 1 2 adults', 'executive room with garden view', 'small double room without window', 'deluxe triple suite', 'triple room with spa bath', 'superior double room with terrace', 'standard room with 1 single bed', 'queen superior room', 'executive studio suite', 'one bedroom queen suite', 'one bedroom suite with kitchenette', 'superior room with queen bed sofa', 'twi

2400

In [9]:
# removing 'submitted from a mobile device' since this attribute has no contributing factor to our investigation
hoteltags_geo['new_tags'].apply(lambda x: x.remove('submitted from a mobile device'))

0       None
1       None
2       None
3       None
4       None
        ... 
1469    None
1470    None
1471    None
1472    None
1473    None
Name: new_tags, Length: 1474, dtype: object

In [10]:
#function to output the list of hotel with the respective attribute
def get_special(s):
    spike_cols = [col for col in tag_sum_set if s in col]
    hotel_list = set()
    for i in range(0, len(hoteltags_geo.tags)):
        for j in range(0,len(spike_cols)):
            if spike_cols[j] in hoteltags_geo.new_tags[i]:
                hotel_list.add(hoteltags_geo[hoteltags_geo.index==i]['hotel_name'][i])
                
    return hoteltags_geo[hoteltags_geo.hotel_name.isin(list(hotel_list))][['hotel_name','lat_x','lng_x','city']]


In [11]:
#plotting function to map out the location
def get_map(df,imagepath):
    if df.shape[0]<15:
            map2 = folium.Map(location=[df.iloc[0].lat_x,df.iloc[0].lng_x], zoom_start=12)
    elif df.shape[0]>15:
        map2 = folium.Map(location=[df.iloc[0].lat_x,df.iloc[0].lng_x], zoom_start=2)

    #generate folium
    map2 = folium.Map(location=[df.iloc[0].lat_x,df.iloc[0].lng_x], zoom_start=12)
    folium.raster_layers.TileLayer('Open Street Map').add_to(map2)
    for i in range(0,len(df)):
        folium.Marker(
        location=[df.iloc[i].lat_x,df.iloc[i].lng_x],
            tooltip=df.iloc[i].hotel_name,
        icon=folium.Icon(icon_color='white')
    ).add_to(map2)
    map2.save(imagepath)
    return map2


In the following section, will be refactoring the attributes. 
1. collating the attributes into a array. 
2. Introduce relabelled attribute to indicate its presence in each hotel. 
3. Remove the grouped attributes and we will achieve the refactored attributes.

In [12]:
grp_col=['single room','river view','private pool','breakfast','spa bath','twin room','double room'
       ,'superior room','king room','executive room','city view','sea view','stayed '
       ,'eiffel twin','eiffel tower view','suite','triple room','penthouse','standard room','wheelchair accessible',
      'family room ','deluxe room','apartment','terrace']
newarray=[]

for i in grp_col:
    newarray.extend([col for col in tag_sum_set if i in col])

In [13]:
grp_col=['single room','river view','private pool','breakfast','spa bath','twin room','double room'
       ,'superior room','king room','executive room','city view','sea view','stayed '
       ,'eiffel twin','eiffel tower view','suite','triple room','penthouse','standard room','wheelchair accessible',
      'family room ','deluxe room','guest room','apartment','terrace']

In [14]:
#function to refactor the attributes
def new_fn(a,b):
    efv= list(get_special(a).index)
    for i in efv:
        hoteltags_geo.iloc[i].new_tags.add(b)

In [15]:
hoteltags_geo.iloc[323].new_tags

{'2 rooms',
 '4 rooms',
 'business trip',
 'classic double room',
 'classic single room',
 'classic twin room',
 'comfort double room',
 'couple',
 'deluxe double room with eiffel tower view',
 'deluxe eiffel twin',
 'deluxe twin room with eiffel tower view',
 'family with older children',
 'family with young children',
 'group',
 'leisure trip',
 'solo traveler',
 'stayed 1 night',
 'stayed 2 nights',
 'stayed 3 nights',
 'stayed 4 nights',
 'stayed 5 nights',
 'stayed 6 nights',
 'suite',
 'suite with eiffel tower view',
 'superior double room',
 'travelers with friends'}

In [16]:
# to group/refactor hotel attributes to new attributes
new_fn("single room","single_room")
new_fn("river view","river_view")
new_fn("private pool","private_pool")
new_fn("breakfast","break_fast")
new_fn("spa bath","spa_bath")
new_fn("twin room","twin_room")
new_fn("double room","double_room")
new_fn("superior room","superior_room")
new_fn("king room","king_room")
new_fn("executive room","executive_room")
new_fn("city view","city_view")
new_fn("sea view","sea_view")
new_fn("eiffel tower view","eiffel_tower_view")
new_fn("suite",'_suite_')
new_fn("triple room",'triple_room')
new_fn("penthouse",'_penthouse_')
new_fn("standard room",'standard_room')
new_fn("wheelchair accessible",'wheelchair_accessible')
new_fn("family room ",'family_room')
new_fn("deluxe room",'deluxe_room')
new_fn("guest room",'guest_room')
new_fn("apartment",'_apartment_')
new_fn("terrace",'_terrace')


In [17]:
# to remove attributes which are not used after replacing with generic attribute
for i in range(0, len(hoteltags_geo.new_tags)):
    for j in range(0,len(newarray)):
        if newarray[j] in hoteltags_geo.new_tags[i]:
            hoteltags_geo.new_tags[i].remove(newarray[j])

In [18]:
# global variable
tag_sum_list = []

def get_tag_sum_elems(tag_sum_string):
    global tag_sum_list # use the global variable
    # extend the global variable with this_list
    tag_sum_list.extend(tag_sum_string)
    return True

for i in hoteltags_geo['new_tags']:
    get_tag_sum_elems(i)

tag_sum_set = set(tag_sum_list)
len(tag_sum_set)

514

In [19]:
for i in tag_sum_list:
    hoteltags_geo[i] = 0


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented fr

In [20]:
hoteltags_geo.head()

Unnamed: 0,hotel_name,tags,lat_x,lng_x,city,new_tags,family with young children,superior queen room,2 rooms,single_room,...,king grand premier with canal view,king superior plus room,twin premier room,king grand premier room,twin grand premier room,luxury quadruple room,large room,art room with iconic view,art room xl with iconic view,penta plus room
0,11 Cadogan Gardens,"[' Leisure trip ', ' Couple ', ' Superior Quee...",51.493616,-0.159235,Chelsea,"{family with young children, superior queen ro...",0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1K Hotel,"[' Leisure trip ', ' Couple ', ' Superior M Do...",48.863932,2.365874,Paris,"{family with young children, 2 rooms, single_r...",0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,25hours Hotel beim MuseumsQuartier,"[' Leisure trip ', ' Solo traveler ', ' Standa...",48.206474,16.35463,Vienna,"{family with young children, with a pet, 2 roo...",0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,41,"[' Leisure trip ', ' Couple ', ' Executive Kin...",51.498147,-0.143649,West End of London,"{family with young children, couple, family wi...",0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,45 Park Lane Dorchester Collection,"[' Leisure trip ', ' Solo traveler ', ' Execut...",51.506371,-0.151536,West End of London,"{executive queen room, family with young child...",0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
#iterating through the columns comparing if attribtues in is presence in new_tags, assign 1 
only_new_tags=pd.DataFrame(hoteltags_geo.new_tags.loc[:])

for index, row in only_new_tags.iterrows():
    cs_list = set(row['new_tags'])
    
    for j in cs_list:
        #set corresponding column to 1
        if j in tag_sum_set:
            hoteltags_geo.loc[index,j]=1

In [22]:
hoteltags_geo

Unnamed: 0,hotel_name,tags,lat_x,lng_x,city,new_tags,family with young children,superior queen room,2 rooms,single_room,...,king grand premier with canal view,king superior plus room,twin premier room,king grand premier room,twin grand premier room,luxury quadruple room,large room,art room with iconic view,art room xl with iconic view,penta plus room
0,11 Cadogan Gardens,"[' Leisure trip ', ' Couple ', ' Superior Quee...",51.493616,-0.159235,Chelsea,"{family with young children, superior queen ro...",1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,1K Hotel,"[' Leisure trip ', ' Couple ', ' Superior M Do...",48.863932,2.365874,Paris,"{family with young children, 2 rooms, single_r...",1,0,1,1,...,0,0,0,0,0,0,0,0,0,0
2,25hours Hotel beim MuseumsQuartier,"[' Leisure trip ', ' Solo traveler ', ' Standa...",48.206474,16.354630,Vienna,"{family with young children, with a pet, 2 roo...",1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,41,"[' Leisure trip ', ' Couple ', ' Executive Kin...",51.498147,-0.143649,West End of London,"{family with young children, couple, family wi...",1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,45 Park Lane Dorchester Collection,"[' Leisure trip ', ' Solo traveler ', ' Execut...",51.506371,-0.151536,West End of London,"{executive queen room, family with young child...",1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1469,citizenM London Bankside,"[' Leisure trip ', ' Couple ', ' Double Room '...",51.505151,-0.100472,City of London,"{family with young children, with a pet, 2 roo...",1,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1470,citizenM London Shoreditch,"[' Leisure trip ', ' Solo traveler ', ' Double...",51.524137,-0.078698,Barbican,"{family with young children, with a pet, 2 roo...",1,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1471,citizenM Tower of London,"[' Leisure trip ', ' Solo traveler ', ' Double...",51.510237,-0.076443,City of London,"{family with young children, with a pet, 2 roo...",1,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1472,every hotel Piccadilly,"[' Leisure trip ', ' Couple ', ' Standard Doub...",51.510146,-0.131506,London,"{family with young children, 2 rooms, couple, ...",1,0,1,0,...,0,0,0,0,0,0,0,0,0,0


#### Visualisation of type of rooms in the dataset

In [23]:
index_room = ['single_room','twin_room','double_room','superior_room'
        ,'king_room','executive_room','_suite_','triple_room',
        '_penthouse_','standard_room','family_room','deluxe_room','guest_room','_apartment_',
        '_terrace_']
room_count=[]

for i in index_room:
    room_count.append(get_special(i).shape[0])


In [24]:
import plotly.io as pio


In [25]:
df1 = pd.DataFrame({'No.of Hotel': room_count}, index=index_room)
ax = df1.iplot(kind='barh', yTitle='Room Types', linecolor='black', title='Number of hotels by Room Types')
ax


![Drag Racing](../image/room_charactertistic.png)


#### Visualisation of  types of hotel characteristics

In [26]:
index_view = ['river_view','private_pool','break_fast','spa_bath','city_view','sea_view','eiffel_tower_view'
              ,'wheelchair_accessible']
view_count=[]

for i in index_view:
    view_count.append(get_special(i).shape[0])

In [27]:
df2 = pd.DataFrame({'No.of Hotel': view_count}, index=index_view)
ax2 = df2.iplot(kind='barh', yTitle='Characteristic',colors='blue', linecolor='black', title='Number of hotels with special characteristics')
ax2

![Drag Racing](../image/hotelcharacteristic.png)


In [28]:
hoteltags_geo.isnull().sum()

hotel_name                      0
tags                            0
lat_x                           0
lng_x                           0
city                            0
                               ..
luxury quadruple room           0
large room                      0
art room with iconic view       0
art room xl with iconic view    0
penta plus room                 0
Length: 520, dtype: int64

In [29]:
hoteltags_geo.new_tags.iloc[323]

{'2 rooms',
 '4 rooms',
 '_suite_',
 'business trip',
 'couple',
 'double_room',
 'eiffel_tower_view',
 'family with older children',
 'family with young children',
 'group',
 'leisure trip',
 'single_room',
 'solo traveler',
 'travelers with friends',
 'twin_room'}

In [30]:
hoteltags_geo['double_room'].value_counts()

double_room
1    1046
0     428
Name: count, dtype: int64

### Mapinha

In [31]:
# to generate map and visualise
get_map(get_special('eiffel_tower_view'),'../image/eiffel.html')

![Drag Racing](../image/eiffel.png)

## Modelling

### Create landmarks DF

In [32]:
import pandas as pd

# Data for famous landmarks in Paris, London, Vienna, Barcelona, Amsterdam, and Milan
landmarks_data = {
    'landmark_name': ['Eiffel Tower', 'Louvre Museum', 'Notre-Dame Cathedral', 'Montmartre', 'Arc de Triomphe',
                      'Big Ben', 'The British Museum', 'St. Paul\'s Cathedral', 'Tate Modern', 'Tower Bridge',
                      'St. Stephen\'s Cathedral', 'Belvedere Palace', 'Schönbrunn Palace', 'Vienna State Opera', 'Hundertwasser House',
                      'Sagrada Familia', 'Park Güell', 'Casa Batlló', 'La Rambla', 'Barri Gòtic',
                      'Rijksmuseum', 'Van Gogh Museum', 'Anne Frank House', 'Rembrandt House Museum', 'Canal Ring',
                      'Duomo di Milano', 'Galleria Vittorio Emanuele II', 'Sforza Castle', 'Navigli District', 'Brera Art Gallery',
                      'Musée d\'Orsay', 'Sainte-Chapelle', 'Panthéon', 'Musée Picasso', 'Champs-Élysées', 'Tuileries Garden',
                      'Westminster Abbey', 'Tower of London', 'British Library', 'Covent Garden', 'London Eye',
                      'Belvedere 21', 'Albertina', 'Hofburg Palace', 'Kunsthistorisches Museum', 'Prater Park',
                      'Magic Fountain of Montjuïc', 'Gothic Quarter', 'Palau de la Música Catalana', 'Camp Nou', 'Barcelona Cathedral',
                      'Anne Frank Statue', 'Dam Square', 'Rembrandt Square', 'Jordaan District', 'Vondelpark',
                      'Sforza Palace', 'Leonardo da Vinci\'s Last Supper', 'Pinacoteca di Brera', 'Navigli Canals', 'Bosco Verticale'],
    'land_lat_x': [48.858844, 48.860611, 48.8530, 48.8867, 48.8738,
                   51.5008, 51.5194, 51.5138, 51.5076, 51.5055,
                   48.2082, 48.1912, 48.1783, 48.2024, 48.2082,
                   41.4036, 41.4141, 41.3919, 41.3809, 41.3816,
                   52.3599, 52.3584, 52.3752, 52.3670, 52.3676,
                   45.4642, 45.4655, 45.4668, 45.4602, 45.4719,
                   48.8599, 48.8558, 48.8462, 48.8610, 48.8738, 48.8635,
                   51.4991, 51.5081, 51.5194, 51.5112, 51.5034,
                   48.1934, 48.2008, 48.2082, 48.2047, 48.2181,
                   41.3706, 41.3838, 41.3816, 41.3802, 41.3847,
                   52.3775, 52.3702, 52.5168, 52.5323, 52.5212,
                   41.3768, 41.3847, 41.3879, 41.3804, 45.4780],
    'land_lng_x': [2.294350, 2.337643, 2.3498, 2.3431, 2.2950,
                   -0.1246, -0.1270, -0.0982, -0.0993, -0.0754,
                   16.3738, 16.3808, 16.3134, 16.3699, 16.3731,
                   2.1744, 2.1520, 2.1665, 2.1720, 2.1760,
                   4.8851, 4.8810, 4.8837, 4.9018, 4.8955,
                   9.1900, 9.1900, 9.1925, 9.1859, 9.1879,
                   2.3315, 2.3451, 2.3431, 2.3580, 2.2944, 2.3322,
                   -0.1270, -0.0757, -0.1280, -0.1240, -0.1216,
                   16.3963, 16.3945, 16.3636, 16.3716, 16.3987,
                   2.1694, 2.2075, 2.2489, 2.2405, 2.2193,
                   2.1500, 2.3500, 2.3280, 2.2560, 2.2900,
                   1.9052, 2.1774, 2.1699, 2.1327, 9.1885],
    'city': ['Paris', 'Paris', 'Paris', 'Paris', 'Paris',
             'London', 'London', 'London', 'London', 'London',
             'Vienna', 'Vienna', 'Vienna', 'Vienna', 'Vienna',
             'Barcelona', 'Barcelona', 'Barcelona', 'Barcelona', 'Barcelona',
             'Amsterdam', 'Amsterdam', 'Amsterdam', 'Amsterdam', 'Amsterdam',
             'Milan', 'Milan', 'Milan', 'Milan', 'Milan',
             'Paris', 'Paris', 'Paris', 'Paris', 'Paris', 'Paris',
             'London', 'London', 'London', 'London', 'London',
             'Vienna', 'Vienna', 'Vienna', 'Vienna', 'Vienna',
             'Barcelona', 'Barcelona', 'Barcelona', 'Barcelona', 'Barcelona',
             'Amsterdam', 'Amsterdam', 'Amsterdam', 'Amsterdam', 'Amsterdam',
             'Milan', 'Milan', 'Milan', 'Milan', 'Milan']
}

# Creating an updated DataFrame
landmarks_df = pd.DataFrame(landmarks_data)

# Display the updated DataFrame
landmarks_df.head


<bound method NDFrame.head of                       landmark_name  land_lat_x  land_lng_x   city
0                      Eiffel Tower   48.858844    2.294350  Paris
1                     Louvre Museum   48.860611    2.337643  Paris
2              Notre-Dame Cathedral   48.853000    2.349800  Paris
3                        Montmartre   48.886700    2.343100  Paris
4                   Arc de Triomphe   48.873800    2.295000  Paris
..                              ...         ...         ...    ...
56                    Sforza Palace   41.376800    1.905200  Milan
57  Leonardo da Vinci's Last Supper   41.384700    2.177400  Milan
58              Pinacoteca di Brera   41.387900    2.169900  Milan
59                   Navigli Canals   41.380400    2.132700  Milan
60                  Bosco Verticale   45.478000    9.188500  Milan

[61 rows x 4 columns]>

In [33]:
landmarks_df

Unnamed: 0,landmark_name,land_lat_x,land_lng_x,city
0,Eiffel Tower,48.858844,2.294350,Paris
1,Louvre Museum,48.860611,2.337643,Paris
2,Notre-Dame Cathedral,48.853000,2.349800,Paris
3,Montmartre,48.886700,2.343100,Paris
4,Arc de Triomphe,48.873800,2.295000,Paris
...,...,...,...,...
56,Sforza Palace,41.376800,1.905200,Milan
57,Leonardo da Vinci's Last Supper,41.384700,2.177400,Milan
58,Pinacoteca di Brera,41.387900,2.169900,Milan
59,Navigli Canals,41.380400,2.132700,Milan


### Additional landmarks 

In [34]:
# Additional data for landmarks in Paris, London, Vienna, Barcelona, Amsterdam, and Milan
additional_landmarks_data = {
    'landmark_name': ['Louvre Pyramid', 'Musée de l\'Orangerie', 'Centre Pompidou', 'Le Marais', 'Luxembourg Gardens',
                      'Westminster Palace', 'London Bridge', 'Victoria and Albert Museum', 'National Gallery', 'The Shard',
                      'Belvedere 21', 'KunstHausWien', 'Imperial Crypt', 'Haus der Musik', 'Museum of Natural History',
                      'Parc Güell', 'Magic Fountain of Montjuïc', 'Casa Milà', 'Plaça de Catalunya', 'Camp Nou',
                      'Dam Square', 'Royal Palace of Amsterdam', 'NEMO Science Museum', 'Hermitage Amsterdam', 'Rembrandtplein',
                      'Galleria d\'Arte Moderna', 'Basilica of Sant\'Ambrogio', 'Navigli Canals', 'Leonardo da Vinci\'s Vineyard', 'Porta Nuova',
                      'Eiffel Tower Park', 'Luxembourg Palace', 'Place des Vosges', 'Musée Rodin', 'Seine River Cruise', 'Parc des Buttes-Chaumont',
                      'Buckingham Palace', 'Hyde Park', 'St. James\'s Park', 'The O2', 'Kensington Palace',
                      'Belvedere Garden', 'Augarten', 'Prater Park', 'Leopold Museum', 'Haus des Meeres',
                      'Barri Gòtic Streets', 'Montjuïc Castle', 'Hospital de Sant Pau', 'Catalonia History Museum', 'Magic Fountain of Montjuïc',
                      'Vondelpark', 'Rijksmuseum Gardens', 'Red Light District', 'Jordaan Canals', 'Amsterdam Museum',
                      'Leonardo da Vinci\'s Last Supper Replica', 'Biblioteca Ambrosiana', 'GAM Milano', 'Brera Pinacoteca', 'Leonardo da Vinci Museum'],
    'land_lat_x': [48.8608, 48.8637, 48.8606, 48.8589, 48.8462,
                   51.4994, 51.5081, 51.4965, 51.5089, 51.5045,
                   48.1934, 48.2092, 48.2021, 48.2031, 48.2082,
                   41.4145, 41.3778, 41.3954, 41.3876, 41.3803,
                   52.3622, 52.3730, 52.3663, 52.3744, 52.3676,
                   45.4675, 45.4637, 45.4749, 45.4675, 45.4838,
                   48.8588, 48.8462, 48.8570, 48.8551, 48.8858, 48.8782,
                   51.5014, 51.5073, 51.5009, 51.5045, 51.5074,
                   48.1923, 48.2285, 48.2174, 48.2034, 48.1984,
                   41.3651, 41.3819, 41.3865, 41.3843, 41.3896,
                   52.3751, 52.3781, 52.3697, 52.3765, 52.3784,
                   45.4641, 45.4786, 45.4395, 45.4236, 45.4672],
    'land_lng_x': [2.3375, 2.3224, 2.3522, 2.3582, 2.3303,
               -0.1269, -0.0877, -0.1722, -0.1283, -0.0755,
               16.3738, 16.3820, 16.3695, 16.3803, 16.3731,
               2.1778, 2.1346, 2.1500, 2.1697, 2.1701,
               4.9027, 4.8922, 4.9022, 4.8924, 4.8990,
               9.1910, 9.1848, 9.1946, 9.1868, 9.1905,
               2.2920, 2.3191, 2.3615, 2.3202, 2.2944, 2.3825,
               -0.1419, -0.1650, -0.1394, -0.0043, -0.1821,
               16.3585, 16.3687, 16.3913, 16.3689, 16.3705,
               9.1824, 16.3767, 16.3509, 16.3782, 16.4170,
               2.1769, 2.1589, 2.1844, 2.1640, 2.1370,
               2.1669, 2.1836, 2.1480, 2.1563, 2.1789],

    'city': ['Paris', 'Paris', 'Paris', 'Paris', 'Paris',
             'London', 'London', 'London', 'London', 'London',
             'Vienna', 'Vienna', 'Vienna', 'Vienna', 'Vienna',
             'Barcelona', 'Barcelona', 'Barcelona', 'Barcelona', 'Barcelona',
             'Amsterdam', 'Amsterdam', 'Amsterdam', 'Amsterdam', 'Amsterdam',
             'Milan', 'Milan', 'Milan', 'Milan', 'Milan',
             'Paris', 'Paris', 'Paris', 'Paris', 'Paris', 'Paris',
             'London', 'London', 'London', 'London', 'London',
             'Vienna', 'Vienna', 'Vienna', 'Vienna', 'Vienna',
             'Barcelona', 'Barcelona', 'Barcelona', 'Barcelona', 'Barcelona',
             'Amsterdam', 'Amsterdam', 'Amsterdam', 'Amsterdam', 'Amsterdam',
             'Milan', 'Milan', 'Milan', 'Milan', 'Milan']
}

# Adding additional landmarks to the DataFrame
additional_landmarks_df = pd.DataFrame(additional_landmarks_data)
extended_landmarks_df = pd.concat([landmarks_df, additional_landmarks_df], ignore_index=True)

# Display the extended DataFrame
extended_landmarks_df

Unnamed: 0,landmark_name,land_lat_x,land_lng_x,city
0,Eiffel Tower,48.858844,2.294350,Paris
1,Louvre Museum,48.860611,2.337643,Paris
2,Notre-Dame Cathedral,48.853000,2.349800,Paris
3,Montmartre,48.886700,2.343100,Paris
4,Arc de Triomphe,48.873800,2.295000,Paris
...,...,...,...,...
117,Leonardo da Vinci's Last Supper Replica,45.464100,2.166900,Milan
118,Biblioteca Ambrosiana,45.478600,2.183600,Milan
119,GAM Milano,45.439500,2.148000,Milan
120,Brera Pinacoteca,45.423600,2.156300,Milan


### Find closest landmark

In [35]:
from math import radians, sin, cos, sqrt, atan2

def haversine_distance(lat1, lon1, lat2, lon2):
    # Convert latitude and longitude from degrees to radians
    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])

    # Haversine formula
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
    c = 2 * atan2(sqrt(a), sqrt(1 - a))
    radius_of_earth = 6371  # Radius of Earth in kilometers
    distance = radius_of_earth * c

    return distance

def find_closest_landmark(hotel_df, landmark_df):
    for hotel_index, hotel_row in hotel_df.iterrows():
        if hotel_row['city'] == 'Paris':
            closest_landmark = None
            min_distance = float('inf')

            for landmark_index, landmark_row in landmark_df.iterrows():
                distance = haversine_distance(
                    hotel_row['lat_x'], hotel_row['lng_x'],
                    landmark_row['land_lat_x'], landmark_row['land_lng_x']
                )

                if distance < min_distance:
                    min_distance = distance
                    closest_landmark = landmark_row['landmark_name']

            print(f"{hotel_row['hotel_name']} is {min_distance:.2f} km away from {closest_landmark}.")


find_closest_landmark(hoteltags_geo, landmarks_df)


1K Hotel is 0.66 km away from Musée Picasso.
9Hotel Republique is 1.11 km away from Musée Picasso.
A La Villa Madame is 0.90 km away from Panthéon.
Acad mie Hotel Saint Germain is 0.52 km away from Musée d'Orsay.
Amarante Beau Manoir is 1.05 km away from Tuileries Garden.
Artus Hotel by MH is 0.71 km away from Sainte-Chapelle.
Au Manoir Saint Germain is 0.69 km away from Musée d'Orsay.
Banke Hotel is 1.15 km away from Tuileries Garden.
Best Western Aulivia Op ra is 1.37 km away from Musée Picasso.
Best Western Ducs de Bourgogne is 0.47 km away from Louvre Museum.
Best Western Le Jardin de Cluny is 0.42 km away from Notre-Dame Cathedral.
Best Western Premier Faubourg 88 is 1.10 km away from Montmartre.
Best Western Premier Louvre Saint Honor  is 0.25 km away from Louvre Museum.
Best Western Premier Marais Grands Boulevards is 0.91 km away from Musée Picasso.
Best Western Premier Op ra Faubourg Ex Hotel Jules  is 1.27 km away from Montmartre.
Best Western Premier Op ra Opal is 1.08 km aw

#### More precise 

In [36]:
from math import radians, sin, cos, sqrt, atan2

def haversine_distance(lat1, lon1, lat2, lon2):
    # Convert latitude and longitude from degrees to radians
    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])

    # Haversine formula
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
    c = 2 * atan2(sqrt(a), sqrt(1 - a))
    radius_of_earth = 6371  # Radius of Earth in kilometers
    distance = radius_of_earth * c

    return distance

def find_closest_landmark_for_hotel(hotel_df, landmark_df, hotel_name):
    # Filter hotel_df for the given hotel name
    hotel_row = hotel_df[hotel_df['hotel_name'] == hotel_name].iloc[0]

    closest_landmark = None
    min_distance = float('inf')

    for landmark_index, landmark_row in landmark_df.iterrows():
        distance = haversine_distance(
            hotel_row['lat_x'], hotel_row['lng_x'],
            landmark_row['land_lat_x'], landmark_row['land_lng_x']
        )

        if distance < min_distance:
            min_distance = distance
            closest_landmark = landmark_row['landmark_name']

    print(f"{hotel_name} is {min_distance:.2f} km away from {closest_landmark}.")


hotel_name_to_check = '1K Hotel'  
find_closest_landmark_for_hotel(hoteltags_geo, landmarks_df, hotel_name_to_check)


1K Hotel is 0.66 km away from Musée Picasso.


In [37]:
def find_closest_landmarks_for_hotel_5(hotel_df, landmark_df, hotel_name):
    # Filter the hotel_df for the given hotel_name
    hotel_row = hotel_df[hotel_df['hotel_name'] == hotel_name].iloc[0]

    # Initialize variables to store the three closest landmarks
    closest_landmarks = []
    closest_distances = [float('inf')] * 5

    for landmark_index, landmark_row in landmark_df.iterrows():
        distance = haversine_distance(
            hotel_row['lat_x'], hotel_row['lng_x'],
            landmark_row['land_lat_x'], landmark_row['land_lng_x']
        )

        # Check if the current distance is smaller than any of the three closest distances
        for i, min_distance in enumerate(closest_distances):
            if distance < min_distance:
                closest_distances.insert(i, distance)
                closest_landmarks.insert(i, landmark_row['landmark_name'])
                closest_distances = closest_distances[:5]
                closest_landmarks = closest_landmarks[:5]
                break

    return closest_landmarks, closest_distances


hotel_name_to_check = '1K Hotel'  # Replace with the hotel name you want to check
closest_landmarks, closest_distances = find_closest_landmarks_for_hotel_5(hoteltags_geo, landmarks_df, hotel_name_to_check)

if not closest_landmarks:
    print(f"No information found for {hotel_name_to_check}.")
else:
    print(f"The three closest landmarks to {hotel_name_to_check} are:")
    for landmark, distance in zip(closest_landmarks, closest_distances):
        print(f"{landmark} - {distance:.2f} km away.")

The three closest landmarks to 1K Hotel are:
Musée Picasso - 0.66 km away.
Notre-Dame Cathedral - 1.69 km away.
Sainte-Chapelle - 1.77 km away.
Louvre Museum - 2.10 km away.
Tuileries Garden - 2.46 km away.


In [38]:
def distance_between_hotel_and_landmark(hotel_df, landmark_df, hotel_name, landmark_name):
    # Filter the hotel_df for the given hotel_name
    hotel_row = hotel_df[hotel_df['hotel_name'] == hotel_name].iloc[0]

     # Convert landmark_name to lowercase
    landmark_name_lower = landmark_name.lower()

    # Filter the landmark_df for the given landmark_name (converted to lowercase)
    landmark_row = landmark_df[landmark_df['landmark_name'].str.lower() == landmark_name_lower].iloc[0]

    # Calculate the distance between the hotel and landmark
    distance = haversine_distance(
        hotel_row['lat_x'], hotel_row['lng_x'],
        landmark_row['land_lat_x'], landmark_row['land_lng_x']
    )

    return distance


hotel_name_to_check = '1K Hotel'  # Replace with the hotel name you want to check
landmark_name_to_check = 'louvre Museum'  # Replace with the landmark name you want to check

distance_to_landmark = distance_between_hotel_and_landmark(hoteltags_geo, landmarks_df, hotel_name_to_check, landmark_name_to_check)

if not distance_to_landmark:
    print(f"No information found for the given hotel or landmark.")
else:
    print(f"The distance from {hotel_name_to_check} to {landmark_name_to_check} is: {distance_to_landmark:.2f} km.")


The distance from 1K Hotel to louvre Museum is: 2.10 km.


### TRY Making a map with the hotels + landmarks

In [39]:
# to generate map and visualise
get_map(get_special('eiffel_tower_view'),'../image/eiffel.html')

![Drag Racing](../image/eiffel.png)

## Modelling

### Similirity DF

In [40]:
similarityDF=cosine_similarity(hoteltags_geo.iloc[:, 6:],hoteltags_geo.iloc[:, 6:])


In [41]:
np.save("../data/tagcosine.npy",similarityDF)

In [42]:
hoteltags_geo.to_pickle("../data/clean_hoteltag.pkl")

### Recommendation by TAGS + map

In [43]:
def new_recommendations_tags(name,city, cosine_similarities):
    
    recommended_hotels = []
    
    #get input city index
    city_index= list(hoteltags_geo[hoteltags_geo.city==city].index)
    
    # gettin the index of the hotel that matches the name
    idx = hoteltags_geo[(hoteltags_geo.hotel_name == name)].index[0]
    
    # creating a Series with the similarity scores in descending order
    score_series = pd.Series(cosine_similarities[idx]).sort_values(ascending = False)

    # getting the indexes of  similar hotels
    top_10_indexes = list(score_series.index)
    
    # populating the list with the names of the matching hotels
    for i in range(len(top_10_indexes)):
        if top_10_indexes[i] not in city_index:
            pass
        else:
            recommended_hotels.append(hoteltags_geo[hoteltags_geo.index==top_10_indexes[i]]['hotel_name'].values[0])

    # populating a dictionary of size 10 containing hotel name and lat and longitude 
    h = hoteltags_geo[['hotel_name','lat_x','lng_x']].to_dict(orient='records')
    l = {k['hotel_name']: [k['lat_x'], k['lng_x']] for k in h}
    if {hotel: l[hotel] for hotel in recommended_hotels }=={}:
        print("There are no hotels of similar hotel")
    else:
        output= {hotel: l[hotel] for hotel in recommended_hotels[:10]}
        newoutput={i:output for i in range(1,len(output)+1)}
        return newoutput
    
     

#### Gets map of recommendation by TAGS

In [44]:
def get_hotel_fn(mydict,city):
    loc2 = geocoder.osm(city)

    # map
    main_map = folium.Map(location=[loc2.lat, loc2.lng], zoom_start=13)
    folium.raster_layers.TileLayer('Open Street Map').add_to(main_map)

    # loop through dict
    for i in range (1,len(mydict)+1):
        folium.Marker(location=list(mydict[i].values())[i-1],tooltip=list(mydict[i].keys())[i-1]
                      ,popup=list(mydict[i].keys())[i-1],
                     icon=plugins.BeautifyIcon(number=i,
                                               icon='bus',
                                            border_color='blue',
                                            border_width=0.5,
                                            text_color='red',
                                            inner_icon_style='margin-top:0px;')).add_to(main_map)
     
    return main_map


In [45]:
new_recommendations_tags('Hilton Diagonal Mar Barcelona','Vienna',similarityDF)

{1: {'Courtyard by Marriott Vienna Prater Messe': [48.2130607, 16.4133973],
  'Hotel Imperial A Luxury Collection Hotel': [48.2012505, 16.3731255],
  'Hilton Vienna Plaza': [48.2155236, 16.3646858],
  'Vienna Marriott Hotel': [48.2050967, 16.3770928],
  'Sofitel Vienna Stephansdom': [48.212857, 16.37986],
  'Grand Hotel Wien': [48.2021105, 16.3720841],
  'InterContinental Wien': [48.2019865, 16.3789934],
  'Austria Trend Hotel Savoyen Vienna': [48.1950444, 16.3844751],
  'Le Meridien Vienna': [48.2027296, 16.3661211],
  'Hilton Vienna': [48.2062268, 16.3833767]},
 2: {'Courtyard by Marriott Vienna Prater Messe': [48.2130607, 16.4133973],
  'Hotel Imperial A Luxury Collection Hotel': [48.2012505, 16.3731255],
  'Hilton Vienna Plaza': [48.2155236, 16.3646858],
  'Vienna Marriott Hotel': [48.2050967, 16.3770928],
  'Sofitel Vienna Stephansdom': [48.212857, 16.37986],
  'Grand Hotel Wien': [48.2021105, 16.3720841],
  'InterContinental Wien': [48.2019865, 16.3789934],
  'Austria Trend Hotel

In [46]:
# to populate and pin recommended list of hotels
get_hotel_fn(new_recommendations_tags('Hilton Diagonal Mar Barcelona','Vienna',similarityDF),'Vienna')

### [SUCESSO] Devolve o top 10 hoteis dada landmark e if NO working com 10 hoteis e distâncias a landmarks 

In [47]:
from math import radians, sin, cos, sqrt, atan2
import pandas as pd

def haversine_distance(lat1, lon1, lat2, lon2):
    # Convert latitude and longitude from degrees to radians
    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])

    # Haversine formula
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
    c = 2 * atan2(sqrt(a), sqrt(1 - a))
    radius_of_earth = 6371  # Radius of Earth in kilometers
    distance = radius_of_earth * c

    return distance

def find_closest_hotels(city, landmark_df, hotel_df):
    # Filter hotels for the specified city
    city_hotels = hotel_df[hotel_df['city'] == city]

    if city_hotels.empty:
        print(f"No hotels found for {city}.")
        return

    # Ask the user if they have a specific landmark in mind
    user_input = input("Do you have a specific landmark in mind to stay closest to? (Yes/No): ").lower()

    if user_input == 'yes':
        user_landmark = input("Please enter the landmark name: ")
        closest_landmark = landmark_df[
            (landmark_df['city'] == city) & (landmark_df['landmark_name'].str.lower() == user_landmark.lower())]

        if closest_landmark.empty:
            print(f"No information found for the specified landmark in {city}.")
            return

        closest_landmark = closest_landmark.iloc[0]

        # Calculate distances and sort hotels by distance
        city_hotels['distance_to_landmark'] = city_hotels.apply(
            lambda row: haversine_distance(
                row['lat_x'], row['lng_x'],
                closest_landmark['land_lat_x'], closest_landmark['land_lng_x']
            ), axis=1)

        closest_hotels = city_hotels.sort_values(by='distance_to_landmark').head(10)

        # Display the top 10 closest hotels to the specified landmark
        for index, hotel_row in closest_hotels.iterrows():
            print(f"{hotel_row['hotel_name']} is {hotel_row['distance_to_landmark']:.2f} km away from {closest_landmark['landmark_name']}.")

    elif user_input == 'no':
        # Display hotels generally close to a landmark (within 1.5 km), limit to 10 hotels
        for landmark_index, landmark_row in landmark_df[landmark_df['city'] == city].iterrows():
            # Calculate distances and sort hotels by distance
            city_hotels['distance_to_landmark'] = city_hotels.apply(
                lambda row: haversine_distance(
                    row['lat_x'], row['lng_x'],
                    landmark_row['land_lat_x'], landmark_row['land_lng_x']
                ), axis=1)

            closest_hotels = city_hotels.sort_values(by='distance_to_landmark').head(10)

            # Display the top 10 closest hotels to the default landmark
            for index, hotel_row in closest_hotels.iterrows():
                print(f"{hotel_row['hotel_name']} is {hotel_row['distance_to_landmark']:.2f} km away from {landmark_row['landmark_name']}.")
                break  # Only display the closest landmark for each hotel
    else:
        print("Invalid input. Please enter 'Yes' or 'No'.")

find_closest_hotels('Paris', landmarks_df, hoteltags_geo)



Invalid input. Please enter 'Yes' or 'No'.


### FILTER DE HOTEIS

In [48]:
# Filter hotels in Paris
paris_hotels = hoteltags_geo.loc[hoteltags_geo['city'] == 'Paris', 'hotel_name']

# Print the hotel names
print("Hotels in Paris:")
print(paris_hotels)

Hotels in Paris:
1                           1K Hotel
6                  9Hotel Republique
7                  A La Villa Madame
23      Acad mie Hotel Saint Germain
40              Amarante Beau Manoir
                    ...             
1440              Villa Opera Drouot
1441                  Villa Panth on
1442                 Villa d Estr es
1450                   W Paris Op ra
1462                   Windsor Opera
Name: hotel_name, Length: 224, dtype: object


In [49]:
hotel_to_search = 'Best Western Le Jardin de Cluny'

if hotel_to_search in paris_hotels.values:
    print(f"{hotel_to_search} is in the list of hotels in Paris.")
else:
    print(f"{hotel_to_search} is not in the list of hotels in Paris.")


Best Western Le Jardin de Cluny is in the list of hotels in Paris.


### DESCRIPTION GET TAGS DE HOTEIS

In [50]:
def get_attributes_description_for_hotel(hotel_name, hotel_data):
    try:
        hotel_row = hotel_data.loc[hotel_data['hotel_name'] == hotel_name].iloc[0]
    except IndexError:
        return f"Hotel {hotel_name} not found."

    attributes = [col.replace('_', ' ').capitalize() for col in hotel_data.columns if hotel_row[col] == 1]

    if not attributes:
        return f"No specific conditions or additions found for {hotel_name}."

    return f"The hotel {hotel_name} has the following conditions & additions such as {', '.join(attributes)}."

hotel_name_to_check = 'Best Western Le Jardin de Cluny'
description = get_attributes_description_for_hotel(hotel_name_to_check, hoteltags_geo)
print(description)


The hotel Best Western Le Jardin de Cluny has the following conditions & additions such as Family with young children, Couple, Family with older children, Business trip, Solo traveler, King room, Group,  suite , Leisure trip, Double room, Twin room, Confort queen room.


# Reviews

### REVIEWS LOAD

In [51]:
#loading the pickle file
new_df_rev=pd.read_pickle('../data/review.pkl')

In [52]:
new_df_rev.head()

Unnamed: 0,hotel_name,negative_review,positive_review,lat_x,lng_x,hotel_address,tags,lat_y,lng_y,location,country,city
0,11 Cadogan Gardens,Thought the prise of drinks at the bar a litt...,We were particularly impressed by the very wa...,51.493616,-0.159235,11 Cadogan Gardens Sloane Square Kensington an...,"[' Leisure trip ', ' Couple ', ' Superior Quee...",51.493616,-0.159235,"[{'country_code': 'GB', 'city': 'Chelsea', 'co...",United Kingdom,Chelsea
1,1K Hotel,Air conditioning in room didn t work and desp...,Location good close to le Marais and 3e arron...,48.863932,2.365874,13 Boulevard Du Temple 3rd arr 75003 Paris France,"[' Leisure trip ', ' Couple ', ' Superior M Do...",48.863932,2.365874,"[{'country_code': 'FR', 'city': 'Paris', 'coun...",France,Paris
2,25hours Hotel beim MuseumsQuartier,Breakfast not included and buffet really expe...,Cool vintage style in the middle of the museu...,48.206474,16.35463,Lerchenfelder Stra e 1 3 07 Neubau 1070 Vienna...,"[' Leisure trip ', ' Solo traveler ', ' Standa...",48.206474,16.35463,"[{'country_code': 'AT', 'city': 'Vienna', 'cou...",Austria,Vienna
3,41,"There wasn t a thing that we didn t like , No...",Its central proximity close to all services a...,51.498147,-0.143649,41 Buckingham Palace Road Westminster Borough ...,"[' Leisure trip ', ' Couple ', ' Executive Kin...",51.498147,-0.143649,"[{'country_code': 'GB', 'city': 'West End of L...",United Kingdom,West End of London
4,45 Park Lane Dorchester Collection,More kinds of fruit juice will make the mini ...,Everything here are almost perfect the staffs...,51.506371,-0.151536,45 Park Lane Westminster Borough London W1K 1P...,"[' Leisure trip ', ' Solo traveler ', ' Execut...",51.506371,-0.151536,"[{'country_code': 'GB', 'city': 'West End of L...",United Kingdom,West End of London


In [53]:
#subsetting the dataset for investigation
hotel_review=new_df_rev[['hotel_name','positive_review','negative_review','city','lat_x','lng_x']]

In [54]:
#join positive and negative review
hotel_review['review_text'] = hotel_review['positive_review'].astype(str) + hotel_review['negative_review'].astype(str)


In [55]:
#detect language
hotel_review['lang']=hotel_review['review_text'].apply(lambda x: detect(x))

In [56]:
#english review
hotel_review['lang'].value_counts()

lang
en    1474
Name: count, dtype: int64

In [57]:
#assigning the stopwords from nltk to variable
stops_rev = set(ENGLISH_STOP_WORDS)


## Feature Engineering and Text Visualisation

In the following section, we are intersted to understand the frequency of top words and its occurance with respect to stop words. We have plotted these visualisation for understanding the data set.

In [58]:
def get_top_n_words(corpus, n=None,y=None):
    vec = CountVectorizer(ngram_range=y).fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:n]


In [59]:
def get_top_n_words_with_stop_words(corpus, n=None,y=None):
    vec = CountVectorizer(ngram_range=y,stop_words=stops_rev).fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:n]


In [60]:
hotel_review['word_count'] = hotel_review['review_text'].apply(lambda x: len(str(x).split()))
desc_lengths = list(hotel_review['word_count'])
print("Number of descriptions:",len(desc_lengths),
      "\nAverage word count", np.average(desc_lengths),
      "\nMinimum word count", min(desc_lengths),
      "\nMaximum word count", max(desc_lengths))


Number of descriptions: 1474 
Average word count 12143.495251017639 
Minimum word count 183 
Maximum word count 172330


In [114]:
hotel_review['word_count'].iplot(
    kind='hist',
    bins = 100,
    linecolor='black',
    xTitle='Word Count',
    yTitle='Count',
    title='Word Count Distribution in Hotel Description',
    )

## Unigram Text Visualisation 

### Without Stopwords

In [116]:
common_words = get_top_n_words(hotel_review['review_text'], 15,y=(1,1))
df3 = pd.DataFrame(common_words, columns = ['review_text' , 'count'])
df3.groupby('review_text').sum()['count'].sort_values().iplot(kind='barh', yTitle='Count', linecolor='black', title='Top 15 words in hotel description before removing stop words')


### With Stopwords

In [119]:
def get_top_n_words_with_stop_words(corpus, n=None, y=None):
    vec = CountVectorizer(ngram_range=y, stop_words='english').fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:n]

common_words = get_top_n_words_with_stop_words(hotel_review['review_text'], 15, y=(1, 1))
df4 = pd.DataFrame(common_words, columns=['review_text', 'count'])
df4.groupby('review_text').sum()['count'].sort_values().iplot(kind='barh', yTitle='Count', linecolor='black', title='Top 15 words in hotel description after removing stop words')


## Bi-gram Text Visualisation
### Without Stopwords

In [121]:
# bi-gram without stop words

common_words = get_top_n_words(hotel_review['review_text'], 15,y=(2,2))
df5 = pd.DataFrame(common_words, columns = ['review_text' , 'count'])
df5.groupby('review_text').sum()['count'].sort_values(ascending=False).iplot(kind='bar', yTitle='Count', linecolor='black', title='Top 15 bigrams in hotel description before removing stop words')


### With Stopwords

In [122]:
common_words = get_top_n_words_with_stop_words(hotel_review['review_text'], 15,y=(2,2))
df6 = pd.DataFrame(common_words, columns = ['review_text' , 'count'])
df6.groupby('review_text').sum()['count'].sort_values(ascending=False).iplot(kind='bar', yTitle='Count', linecolor='black', title='Top 15 bigrams in hotel description After removing stop words')

## Tri-gram Text Visualisation

### Without stopwords

In [123]:
#tri-gram before stop words


common_words = get_top_n_words(hotel_review['review_text'], 15,y=(3,3))
df7 = pd.DataFrame(common_words, columns = ['review_text' , 'count'])
df7.groupby('review_text').sum()['count'].sort_values(ascending=False).iplot(kind='bar', yTitle='Count', linecolor='black', title='Top 15 trigrams in hotel description before removing stop words')


### With stopwords

In [124]:

common_words = get_top_n_words_with_stop_words(hotel_review['review_text'], 15,y=(3,3))
df8 = pd.DataFrame(common_words, columns = ['review_text' , 'count'])
df8.groupby('review_text').sum()['count'].sort_values(ascending=False).iplot(kind='bar', yTitle='Count', linecolor='black', title='Top 15 trigrams in hotel description after removing stop words')


# Modelling REV

Based on the visualisation, it is evident that bi-gram and tri-gram has more impact on the modelling process. Bi-gram and Tri-gram gives us more context of the reviews and it makes more sense for modelling recommender.

In [68]:
replace_space = re.compile('[/(){}\[\]\|@,;]')
symbol = re.compile('[^0-9a-z #+_]')
stopwordset = stops_rev

def clean_text(text):
    # lowercase text
    text = text.lower() 
    # replace replace_space symbols by space in text. substitute the matched string in replace_space with space.
    text = replace_space.sub(' ', text) 
    # remove symbols which are in symbol from text. substitute the matched string in symbol with Nothing.
    text = symbol.sub('', text)  
    # remove stopwords from text
    text = ' '.join(word for word in text.split() if word not in stopwordset) 
    return text
    


In [69]:
#applying function to clean_text
hotel_review['review_text_clean'] = hotel_review['review_text'].apply(clean_text)
hotel_review['positive_review'][0]

' We were particularly impressed by the very warm welcome we received a lovely hotel in a superb location good size room with excellent bathroom We will come back and already have recommended it to friends and relatives ,  The atmosphere and staff were excellent just what you would expect from a small luxury hotel Breakfast and restaurant food delicious and reasonably priced ,  Bed was amazingly comfortable The building is Full of character and class The staff where excellent and most helpful I really enjoyed staying there and would stay again Didn t get the opportunity to enjoy the breakfast maybe another time ,  Lovely hotel,  Concierge service excellent Bed very comfortable and lux Lounges and bar very comfortable and elegantly furnished Location perfect ,  Customer service was above and beyond from all staff Richie Long is attentive and nothing was too much trouble in fact all the staff were amazing shout out to Richie Emerson and Christian,  Everything Most comfortable bed Extreme

In [70]:
hotel_review

Unnamed: 0,hotel_name,positive_review,negative_review,city,lat_x,lng_x,review_text,lang,word_count,review_text_clean
0,11 Cadogan Gardens,We were particularly impressed by the very wa...,Thought the prise of drinks at the bar a litt...,Chelsea,51.493616,-0.159235,We were particularly impressed by the very wa...,en,5437,particularly impressed warm welcome received l...
1,1K Hotel,Location good close to le Marais and 3e arron...,Air conditioning in room didn t work and desp...,Paris,48.863932,2.365874,Location good close to le Marais and 3e arron...,en,5777,location good close le marais 3e arrondissemen...
2,25hours Hotel beim MuseumsQuartier,Cool vintage style in the middle of the museu...,Breakfast not included and buffet really expe...,Vienna,48.206474,16.354630,Cool vintage style in the middle of the museu...,en,25370,cool vintage style middle museum quarter metro...
3,41,Its central proximity close to all services a...,"There wasn t a thing that we didn t like , No...",West End of London,51.498147,-0.143649,Its central proximity close to all services a...,en,3411,central proximity close services restaurants s...
4,45 Park Lane Dorchester Collection,Everything here are almost perfect the staffs...,More kinds of fruit juice will make the mini ...,West End of London,51.506371,-0.151536,Everything here are almost perfect the staffs...,en,476,perfect staffs friendly room comfortable way s...
...,...,...,...,...,...,...,...,...,...,...
1469,citizenM London Bankside,"No, What an amazing and unique hotel A short ...",This was our third stay at this hotel and it ...,City of London,51.505151,-0.100472,"No, What an amazing and unique hotel A short ...",en,50772,amazing unique hotel short cab ride 15 minute ...
1470,citizenM London Shoreditch,Swift auto check in Good bar and lounge area ...,Lifts need reprogramming exasperating journey...,Barbican,51.524137,-0.078698,Swift auto check in Good bar and lounge area ...,en,29152,swift auto check good bar lounge area decent i...
1471,citizenM Tower of London,The use of technology was impressive UBS outl...,"Rooms are small but well designed, breakfast...",City of London,51.510237,-0.076443,The use of technology was impressive UBS outl...,en,65052,use technology impressive ubs outlets using ta...
1472,every hotel Piccadilly,The location was the only great aspect of thi...,The hotel overall requires an update furnitur...,London,51.510146,-0.131506,The location was the only great aspect of thi...,en,15926,location great aspect hotel central size rooms...


### Get REVIEWS

In [71]:
# Show All reviews

def get_individual_reviews_for_city(city, hotel_data):
    city_hotels = hotel_data[hotel_data['city'] == city]

    if city_hotels.empty:
        return None, [f"No hotels found in {city}."]

    # Find the hotel with the most positive reviews
    max_positive_reviews_index = city_hotels['positive_review'].apply(lambda x: len(x)).idxmax()
    hotel_name = city_hotels.loc[max_positive_reviews_index, 'hotel_name']

    try:
        hotel_row = hotel_data.loc[hotel_data['hotel_name'] == hotel_name].iloc[0]
    except IndexError:
        return None, [f"Hotel {hotel_name} not found."]

    reviews_text = hotel_row['positive_review']
    reviews_phrases = [phrase.strip() for phrase in reviews_text.split(', ')]

    if not reviews_phrases:
        return hotel_name, [f"No reviews found for {hotel_name}."]

    return hotel_name, reviews_phrases

city_to_check = 'Paris'
hotel_name_most, reviews_for_city = get_individual_reviews_for_city(city_to_check, hotel_review)

if reviews_for_city is None:
    print(f"No reviews found for hotels in {city_to_check}.")
else:
    print(f"Reviews for the hotel with the most positive reviews in {city_to_check} - {hotel_name_most}:\n")

    for index, review_text in enumerate(reviews_for_city, start=1):
        print(f"Review {index}: {review_text}\n")


Reviews for the hotel with the most positive reviews in Paris - Hotel Regina:

Review 1: Building is very old but nicely maintained Bed quality was excellent and quality of toilettes are very good We booked family suite and there are three single beds in children room but we have a teenage daughter so they gave us two connecting rooms later which were much better The rooms are facing back of building which is better than facing road because it is more quiet and you can keep window open

Review 2: Beautiful hotel in a great location We booked the Eiffel tower junior suite which was lovely with a great view The bed nice and big but very hard I know some prefer hard to soft Good and friendly staff on reception

Review 3: THE CITY

Review 4: This is my favorite hotel I have stayed here three times and never been disappointed It is located right next to the Louvre and the Tuileries Gardens near a stop on metro line 1 and within easy walking distance of the Paris Opera and the Musee d Orsay 

In [72]:
# Show Random Reviews 

import random

def get_individual_reviews_for_city(city, hotel_data, num_reviews=10):
    city_hotels = hotel_data[hotel_data['city'] == city]

    if city_hotels.empty:
        return None, [f"No hotels found in {city}."]

    # Find the hotel with the most positive reviews
    max_positive_reviews_index = city_hotels['positive_review'].apply(lambda x: len(x)).idxmax()
    hotel_name = city_hotels.loc[max_positive_reviews_index, 'hotel_name']

    try:
        hotel_row = hotel_data.loc[hotel_data['hotel_name'] == hotel_name].iloc[0]
    except IndexError:
        return None, [f"Hotel {hotel_name} not found."]

    reviews_text = hotel_row['positive_review']
    reviews_phrases = [phrase.strip() for phrase in reviews_text.split(', ')]

    if not reviews_phrases:
        return hotel_name, [f"No reviews found for {hotel_name}."]

    # Display only a random subset of reviews
    selected_reviews = random.sample(reviews_phrases, min(num_reviews, len(reviews_phrases)))

    return hotel_name, selected_reviews

city_to_check = 'Paris'
hotel_name, reviews_for_city = get_individual_reviews_for_city(city_to_check, hotel_review, num_reviews=10)

if reviews_for_city is None:
    print(f"No reviews found for hotels in {city_to_check}.")
else:
    print(f"Reviews for the hotel with the most positive reviews in {city_to_check} - {hotel_name}:\n")

    for index, review_text in enumerate(reviews_for_city, start=1):
        print(f"Review {index}: {review_text}\n")


Reviews for the hotel with the most positive reviews in Paris - Hotel Regina:

Review 1: Great location

Review 2: location

Review 3: The location is fantastic

Review 4: The personnel very nice

Review 5: This was a hotel with all an excellent complete breakfast buffe a nice restaurant high ceiling comfort beds minibar etc Close to public transports U2 38 1 etc The silent surroundings with the park and the church was nice Almost direct access to the garage

Review 6: Great staff always went one step ahead to provide the best service Even allowed us to stay for 2 hours more without charge

Review 7: The bed was extremely comfortsble

Review 8: Beds super comfy

Review 9: BED COMFORT

Review 10: Location



In [73]:
import random

def get_individual_reviews_for_city_random_10(city, hotel_data, num_reviews=10):
    # Convert the city name to lowercase
    city_lower = city.lower()

    city_hotels = hotel_data[hotel_data['city'].str.lower() == city_lower]

    if city_hotels.empty:
        return None, [f"No hotels found in {city}."]

    # Find the hotel with the most positive reviews
    max_positive_reviews_index = city_hotels['positive_review'].apply(lambda x: len(x)).idxmax()
    hotel_name = city_hotels.loc[max_positive_reviews_index, 'hotel_name']

    try:
        hotel_row = hotel_data.loc[hotel_data['hotel_name'] == hotel_name].iloc[0]
    except IndexError:
        return None, [f"Hotel {hotel_name} not found."]

    reviews_text = hotel_row['positive_review']
    reviews_phrases = [phrase.strip() for phrase in reviews_text.split(', ')]

    if not reviews_phrases:
        return hotel_name, [f"No reviews found for {hotel_name}."]

    # Display only a random subset of reviews
    selected_reviews = random.sample(reviews_phrases, min(num_reviews, len(reviews_phrases)))

    return hotel_name, selected_reviews

city_to_check = 'Paris'
hotel_name, reviews_for_city = get_individual_reviews_for_city_random_10(city_to_check, hotel_review, num_reviews=10)

if reviews_for_city is None:
    print(f"No reviews found for hotels in {city_to_check}.")
else:
    print(f"Reviews for the hotel with the most positive reviews in {city_to_check} - {hotel_name}:\n")

    for index, review_text in enumerate(reviews_for_city, start=1):
        print(f"Review {index}: {review_text}\n")

Reviews for the hotel with the most positive reviews in Paris - Hotel Regina:

Review 1: All the staff were very helpful and happy to help you with anything the hotel was very clean and tidy which I thought was exceptional due to them having a renervation work The location was perfect You are walking distance to las rambles sight seeing buses and shops

Review 2: The conseirge and reception were pleasant and friendly and helpful the room was lovely

Review 3: We had a wonderful stay at Hotal Regina The location was excellent with easy access to various sights and transport options The staff was friendly and attentive from greeting the kids with balloon animals on arrival to providing excellent restaurant recommendations the breakfast was great best bacon we ve ever had Would certainly stay here again if visiting Barcelona

Review 4: everything is great next time we ll stay at the same hotel

Review 5: Staff were helpful

Review 6: The location is excellent Just beside Catalunya station

In [74]:
def get_reviews_for_hotel_in_city(hotel_name, city, hotel_data, num_reviews=10):
    # Convert the hotel name and city to lowercase for case-insensitive matching
    hotel_name_lower = hotel_name.lower()
    city_lower = city.lower()

    # Filter hotels based on the provided city and hotel name
    selected_hotels = hotel_data[(hotel_data['city'].str.lower() == city_lower) & (hotel_data['hotel_name'].str.lower() == hotel_name_lower)]

    if selected_hotels.empty:
        return None, [f"No information found for {hotel_name} in {city}."]

    # Retrieve positive reviews for the specified hotel
    reviews_text = selected_hotels.iloc[0]['positive_review']
    reviews_phrases = [phrase.strip() for phrase in reviews_text.split(', ')]

    if not reviews_phrases:
        return hotel_name, [f"No positive reviews found for {hotel_name} in {city}."]

    # Show exactly 10 reviews or as many as available
    selected_reviews = reviews_phrases[:num_reviews]

    return hotel_name, selected_reviews

hotel_name_to_check = 'Hotel Regina'
city_to_check = 'Paris'
num_reviews_to_show = 10

hotel_name, reviews_for_hotel = get_reviews_for_hotel_in_city(hotel_name_to_check, city_to_check, hotel_review, num_reviews_to_show)

if reviews_for_hotel is None:
    print(f"No reviews found for {hotel_name_to_check} in {city_to_check}.")
else:
    print(f"Reviews for {hotel_name_to_check} in {city_to_check}:\n")

    for index, review_text in enumerate(reviews_for_hotel, start=1):
        print(f"Review {index}: {review_text}\n")


Reviews for Hotel Regina in Paris:

Review 1: Building is very old but nicely maintained Bed quality was excellent and quality of toilettes are very good We booked family suite and there are three single beds in children room but we have a teenage daughter so they gave us two connecting rooms later which were much better The rooms are facing back of building which is better than facing road because it is more quiet and you can keep window open

Review 2: Beautiful hotel in a great location We booked the Eiffel tower junior suite which was lovely with a great view The bed nice and big but very hard I know some prefer hard to soft Good and friendly staff on reception

Review 3: THE CITY

Review 4: This is my favorite hotel I have stayed here three times and never been disappointed It is located right next to the Louvre and the Tuileries Gardens near a stop on metro line 1 and within easy walking distance of the Paris Opera and the Musee d Orsay The hotel is in an original Haussmann build

In [75]:
def get_reviews_for_hotel_in_city(hotel_name, city, hotel_data, num_reviews=10):
    # Convert the hotel name and city to lowercase for case-insensitive matching
    hotel_name_lower = hotel_name.lower()
    city_lower = city.lower()

    # Filter hotels based on the provided city and hotel name
    selected_hotels = hotel_data[(hotel_data['city'].str.lower() == city_lower) & (hotel_data['hotel_name'].str.lower() == hotel_name_lower)]

    if selected_hotels.empty:
        return None, None, [f"No information found for {hotel_name} in {city}."]

    # Retrieve positive reviews for the specified hotel
    reviews_text = selected_hotels.iloc[0]['positive_review']
    reviews_phrases = [phrase.strip() for phrase in reviews_text.split(', ')]

    if not reviews_phrases:
        return hotel_name, 0, [f"No reviews found for {hotel_name} in {city}."]

    # Count the total number of reviews
    num_total_reviews = len(reviews_phrases)

    # Show exactly 10 reviews or as many as available
    selected_reviews = reviews_phrases[:num_reviews]

    return hotel_name, num_total_reviews, selected_reviews

# Example usage
hotel_name_to_check = 'Hotel Regina'
city_to_check = 'Paris'
num_reviews_to_show = 10

hotel_name, num_total_reviews, reviews_for_hotel = get_reviews_for_hotel_in_city(hotel_name_to_check, city_to_check, hotel_review, num_reviews_to_show)

if reviews_for_hotel is None:
    print(f"No reviews found for {hotel_name_to_check} in {city_to_check}.")
else:
    print(f"{hotel_name_to_check} in {city_to_check} has {num_total_reviews} reviews.")
    print(f"Reviews for {hotel_name_to_check} in {city_to_check}:\n")

    for index, review_text in enumerate(reviews_for_hotel, start=1):
        print(f"Review {index}: {review_text}\n")


Hotel Regina in Paris has 1306 reviews.
Reviews for Hotel Regina in Paris:

Review 1: Building is very old but nicely maintained Bed quality was excellent and quality of toilettes are very good We booked family suite and there are three single beds in children room but we have a teenage daughter so they gave us two connecting rooms later which were much better The rooms are facing back of building which is better than facing road because it is more quiet and you can keep window open

Review 2: Beautiful hotel in a great location We booked the Eiffel tower junior suite which was lovely with a great view The bed nice and big but very hard I know some prefer hard to soft Good and friendly staff on reception

Review 3: THE CITY

Review 4: This is my favorite hotel I have stayed here three times and never been disappointed It is located right next to the Louvre and the Tuileries Gardens near a stop on metro line 1 and within easy walking distance of the Paris Opera and the Musee d Orsay The

In [76]:
# Show First 10 reviews

def get_individual_reviews_for_city(city, hotel_data, num_reviews=10):
    city_hotels = hotel_data[hotel_data['city'] == city]

    if city_hotels.empty:
        return None, [f"No hotels found in {city}."]

    # Find the hotel with the most positive reviews
    max_positive_reviews_index = city_hotels['positive_review'].apply(lambda x: len(x)).idxmax()
    hotel_name = city_hotels.loc[max_positive_reviews_index, 'hotel_name']

    try:
        hotel_row = hotel_data.loc[hotel_data['hotel_name'] == hotel_name].iloc[0]
    except IndexError:
        return None, [f"Hotel {hotel_name} not found."]

    reviews_text = hotel_row['positive_review']
    reviews_phrases = [phrase.strip() for phrase in reviews_text.split(', ')]

    if not reviews_phrases:
        return hotel_name, [f"No reviews found for {hotel_name}."]

    return hotel_name, reviews_phrases[:num_reviews]

# Example usage
city_to_check = 'Paris'
hotel_name, reviews_for_city = get_individual_reviews_for_city(city_to_check, hotel_review, num_reviews=10)

if reviews_for_city is None:
    print(f"No reviews found for hotels in {city_to_check}.")
else:
    print(f"Reviews for the hotel with the most positive reviews in {city_to_check} - {hotel_name}:\n")

    for index, review_text in enumerate(reviews_for_city, start=1):
        print(f"Review {index}: {review_text}\n")


Reviews for the hotel with the most positive reviews in Paris - Hotel Regina:

Review 1: Building is very old but nicely maintained Bed quality was excellent and quality of toilettes are very good We booked family suite and there are three single beds in children room but we have a teenage daughter so they gave us two connecting rooms later which were much better The rooms are facing back of building which is better than facing road because it is more quiet and you can keep window open

Review 2: Beautiful hotel in a great location We booked the Eiffel tower junior suite which was lovely with a great view The bed nice and big but very hard I know some prefer hard to soft Good and friendly staff on reception

Review 3: THE CITY

Review 4: This is my favorite hotel I have stayed here three times and never been disappointed It is located right next to the Louvre and the Tuileries Gardens near a stop on metro line 1 and within easy walking distance of the Paris Opera and the Musee d Orsay 

In [77]:
def get_hotel_with_most_positive_reviews(city, hotel_data):
    # Convert the input city to lowercase for case-insensitive comparison
    city_lower = city.lower()

    # Convert the 'city' column in hotel_data to lowercase
    hotel_data['city_lower'] = hotel_data['city'].str.lower()

    city_hotels = hotel_data[hotel_data['city_lower'] == city_lower]

    if city_hotels.empty:
        return None, f"No hotels found in {city}."

    # Find the hotel with the most positive reviews
    max_positive_reviews_index = city_hotels['positive_review'].apply(lambda x: len(x)).idxmax()
    hotel_name_most = city_hotels.loc[max_positive_reviews_index, 'hotel_name']

    # Drop the 'city_lower' column as it was created for comparison
    hotel_data = hotel_data.drop(columns=['city_lower'])

    return hotel_name_most


#city_to_check = 'Paris'
#hotel_name_most = get_hotel_with_most_positive_reviews(city_to_check, hotel_review)

if hotel_name_most is None:
    print(f"No hotels found in {city_to_check}.")
else:
    print(f"Hotel with the most positive reviews in {city_to_check}: {hotel_name_most}")


Hotel with the most positive reviews in Paris: Hotel Regina


In [78]:
#### FINAL E A UTLIZAR 
import random

def get_random_positive_reviews_for_hotel_(hotel_name, city, hotel_data, num_reviews=10):
    # Convert the inputs to lowercase for case-insensitive comparison
    hotel_name_lower = hotel_name.lower()
    city_lower = city.lower()

    # Filter hotels by city and hotel name
    try:
        hotel_row = hotel_data[(hotel_data['hotel_name'].str.lower() == hotel_name_lower) & (hotel_data['city'].str.lower() == city_lower)].iloc[0]
    except IndexError:
        return None, [f"Hotel {hotel_name} in {city} not found."]

    reviews_text = hotel_row['positive_review']
    reviews_phrases = [phrase.strip() for phrase in reviews_text.split(', ')]

    if not reviews_phrases:
        return hotel_name, [f"No reviews found for {hotel_name} in {city}."]

    # Display only a random subset of reviews
    selected_reviews = random.sample(reviews_phrases, min(num_reviews, len(reviews_phrases)))

    # Check if any review is "No" and replace it
    for i, review in enumerate(selected_reviews):
        if review.lower() == "no":
            remaining_reviews = [r for r in reviews_phrases if r.lower() != "no"]
            if remaining_reviews:
                new_review = random.choice(remaining_reviews)
                selected_reviews[i] = new_review

    return hotel_name, selected_reviews

# Example usage
#hotel_to_check = '1K Hotel'
#city_to_check = 'Paris'
#hotel_name, reviews_for_hotel = get_random_positive_reviews_for_hotel_(hotel_to_check, city_to_check, hotel_review, num_reviews=10)

#if reviews_for_hotel is None:
 #   print(f"No reviews found for {hotel_to_check} in {city_to_check}.")
#else:
 #   print(f"Reviews for {hotel_to_check} in {city_to_check}:\n")

  #  for index, review_text in enumerate(reviews_for_hotel, start=1):
   #     print(f"Review {index}: {review_text}\n")



# cosine similarities

In the sklearn library, there are many other functions you can use, to find cosine similarities between documents. You can directly use TfidfVectorizer in the sklearn’s feature_extraction.text class to Vectorize the words. It will calculate TF_IDF normalization.In sklearn, we can perform the dot product of the vector by using a linear kernel.Here the linear kernel is the same as the cosine similarity, but faster

In [79]:
tf_rev = TfidfVectorizer(analyzer='word', ngram_range=(2, 3), min_df=0.0, stop_words='english')
tfidf_matrix_rev = tf_rev.fit_transform(hotel_review['review_text_clean'])


In [80]:
# function that will find us the top n similar papers based on cosine similarity:

cosine_similarities_rev = linear_kernel(tfidf_matrix_rev, tfidf_matrix_rev)


In [81]:
# function to generate maps
def new_recommendations(name,city, cosine_similarities):
    
    recommended_hotels = []
    
    #get input city index
    city_index= list(hotel_review[hotel_review.city==city].index)
    
    # gettin the index of the hotel that matches the name
    idx = hotel_review[(hotel_review.hotel_name == name)].index[0]
    
    # creating a Series with the similarity scores in descending order
    score_series = pd.Series(cosine_similarities[idx]).sort_values(ascending = False)

    # getting the indexes of  similar hotels list
    top_10_indexes = list(score_series.index)
    
    # populating the list with the names of hotels
    for i in range(len(top_10_indexes)):
        if top_10_indexes[i] not in city_index:
            pass
        else:
            recommended_hotels.append(hotel_review[hotel_review.index==top_10_indexes[i]]['hotel_name'].values[0])
    #getting the list of hotels based on the lat and long
    h = hotel_review[['hotel_name','lat_x','lng_x']].to_dict(orient='records')
    l = {k['hotel_name']: [k['lat_x'], k['lng_x']] for k in h}
    if {hotel: l[hotel] for hotel in recommended_hotels }=={}:
        print("There are no hotels of similar hotel")
    else:
        output= {hotel: l[hotel] for hotel in recommended_hotels[:10]}
        newoutput={i:output for i in range(1,len(output)+1)}
        return newoutput

In [82]:
positivismodoone = hotel_review['positive_review'][1]
positivismodoone

' Location good close to le Marais and 3e arrondissement Fitness room not great but adequate,  Comfortable beds welcoming staff cleanliness was very good,  The hotel is good The personal was nice and the installation very good The breakfast time interval excellent ,  Great location almost next to the metro station a ten minute metro journey to the centre of Paris,  Nice staff clean rooms with a decent service Close to metro and bus to reach the main tourist areas of the city but not much else nearby Well equipped gym that almost nobody uses ,  The position for public transport was excellent the metro is right there So we could get around Paris super easy the room was very small but had everything we could need the staff where very helpful and even arranged to change our room as the first one was on the road and we like to open our windows at night while we sleep There where quick to change and the new room was perfect they had an awesome restaurant on the premises and a funky little ni

In [83]:
new_recommendations('The Belgrave Hotel','Paris',cosine_similarities_rev)

{1: {'Hotel Regina': [48.8637503, 2.3320406],
  'Holiday Inn Paris Gare de l Est': [48.8758981, 2.3590504],
  'Hotel Saint Petersbourg Opera': [48.872174, 2.328075],
  'Best Western Premier Op ra Faubourg Ex Hotel Jules ': [48.8753359,
   2.3414617],
  'K K Hotel Cayr Saint Germain des Pr s': [48.8553117, 2.3254628],
  'Saint James Albany Paris Hotel Spa': [48.8642689, 2.3308179],
  'Little Palace Hotel': [48.8675674, 2.3539896],
  'Acad mie Hotel Saint Germain': [48.855263, 2.3305901],
  'Hotel Horset Op ra Best Western Premier Collection': [48.8691686,
   2.3337818],
  'Novotel Paris Les Halles': [48.8607299, 2.3465326]},
 2: {'Hotel Regina': [48.8637503, 2.3320406],
  'Holiday Inn Paris Gare de l Est': [48.8758981, 2.3590504],
  'Hotel Saint Petersbourg Opera': [48.872174, 2.328075],
  'Best Western Premier Op ra Faubourg Ex Hotel Jules ': [48.8753359,
   2.3414617],
  'K K Hotel Cayr Saint Germain des Pr s': [48.8553117, 2.3254628],
  'Saint James Albany Paris Hotel Spa': [48.86426

In [84]:
landmarks_df

Unnamed: 0,landmark_name,land_lat_x,land_lng_x,city
0,Eiffel Tower,48.858844,2.294350,Paris
1,Louvre Museum,48.860611,2.337643,Paris
2,Notre-Dame Cathedral,48.853000,2.349800,Paris
3,Montmartre,48.886700,2.343100,Paris
4,Arc de Triomphe,48.873800,2.295000,Paris
...,...,...,...,...
56,Sforza Palace,41.376800,1.905200,Milan
57,Leonardo da Vinci's Last Supper,41.384700,2.177400,Milan
58,Pinacoteca di Brera,41.387900,2.169900,Milan
59,Navigli Canals,41.380400,2.132700,Milan


In [85]:
additional_landmarks_df

Unnamed: 0,landmark_name,land_lat_x,land_lng_x,city
0,Louvre Pyramid,48.8608,2.3375,Paris
1,Musée de l'Orangerie,48.8637,2.3224,Paris
2,Centre Pompidou,48.8606,2.3522,Paris
3,Le Marais,48.8589,2.3582,Paris
4,Luxembourg Gardens,48.8462,2.3303,Paris
...,...,...,...,...
56,Leonardo da Vinci's Last Supper Replica,45.4641,2.1669,Milan
57,Biblioteca Ambrosiana,45.4786,2.1836,Milan
58,GAM Milano,45.4395,2.1480,Milan
59,Brera Pinacoteca,45.4236,2.1563,Milan


# FINAL INPUT & OUPUT

In [86]:
def new_recommendations_rev_output(name, city, cosine_similarities):
    recommended_hotels = []

    # Convert the city name to lowercase
    city_lower = city.lower()

    # Get input city index
    city_index = list(hotel_review[hotel_review['city'].str.lower() == city_lower].index)

    # Getting the index of the hotel that matches the name
    idx = hotel_review[(hotel_review.hotel_name == name)].index[0]

    # Creating a Series with the similarity scores in descending order
    score_series = pd.Series(cosine_similarities[idx]).sort_values(ascending=False)

    # Getting the indexes of similar hotels list
    top_10_indexes = list(score_series.index)

    # Populating the list with the names of hotels
    for i in range(len(top_10_indexes)):
        if top_10_indexes[i] not in city_index:
            pass
        else:
            recommended_hotels.append(hotel_review.loc[top_10_indexes[i], 'hotel_name'])

    # Getting the list of hotels based on the lat and long
    h = hotel_review[['hotel_name', 'lat_x', 'lng_x']].to_dict(orient='records')
    l = {k['hotel_name']: [k['lat_x'], k['lng_x']] for k in h}

    if not recommended_hotels:
        print("There are no hotels of similar hotel.")
    else:
        output = {hotel: l[hotel] for hotel in recommended_hotels[:10]}
        newoutput = {i: output for i in range(1, len(output) + 1)}
        return recommended_hotels[:30], newoutput

In [87]:
recommended_hotels_by_rev, output_rev = new_recommendations_rev_output('Hotel Regina','PARIS', cosine_similarities_rev)
recommended_hotels_by_rev

['Hotel Regina',
 'Holiday Inn Paris Notre Dame',
 'Saint James Albany Paris Hotel Spa',
 'Hotel Brighton Esprit de France',
 'Holiday Inn Paris Gare de l Est',
 'Hotel Saint Petersbourg Opera',
 'Hotel Trianon Rive Gauche',
 'K K Hotel Cayr Saint Germain des Pr s',
 'Novotel Paris Les Halles',
 'Hotel Bedford',
 'Little Palace Hotel',
 'Hotel d Orsay Esprit de France',
 'Best Western Premier Op ra Faubourg Ex Hotel Jules ',
 'Royal Saint Michel',
 'Hotel Westminster',
 'Hotel Op ra Richepanse',
 'Newhotel Roblin',
 'Hotel du Louvre in the Unbound Collection by Hyatt',
 'Villa Panth on',
 'Hotel Horset Op ra Best Western Premier Collection',
 'Mercure Paris Terminus Nord',
 'Acad mie Hotel Saint Germain',
 'Edouard 7 Paris Op ra',
 'Select Hotel',
 'Holiday Inn Paris Saint Germain des Pr s',
 'Hotel D Aubusson',
 'Madeleine Plaza',
 'Grand Hotel Du Palais Royal',
 'Golden Tulip Opera de Noailles',
 'Hotel Duo']

In [88]:
def new_recommendations_tags_output(name,city, cosine_similarities):
    
    recommended_hotels = []

    city_lower = city.lower()
    
    #get input city index
    city_index = list(hotel_review[hotel_review['city'].str.lower() == city_lower].index)

    
    # gettin the index of the hotel that matches the name
    idx = hoteltags_geo[(hoteltags_geo.hotel_name == name)].index[0]
    
    # creating a Series with the similarity scores in descending order
    score_series = pd.Series(cosine_similarities[idx]).sort_values(ascending = False)

    # getting the indexes of  similar hotels
    top_10_indexes = list(score_series.index)
    
    # populating the list with the names of the matching hotels
    for i in range(len(top_10_indexes)):
        if top_10_indexes[i] not in city_index:
            pass
        else:
            recommended_hotels.append(hoteltags_geo[hoteltags_geo.index==top_10_indexes[i]]['hotel_name'].values[0])

    # populating a dictionary of size 10 containing hotel name and lat and longitude 
    h = hoteltags_geo[['hotel_name','lat_x','lng_x']].to_dict(orient='records')
    l = {k['hotel_name']: [k['lat_x'], k['lng_x']] for k in h}
    if {hotel: l[hotel] for hotel in recommended_hotels }=={}:
        print("There are no hotels of similar hotel")
    else:
        output= {hotel: l[hotel] for hotel in recommended_hotels[:10]}
        newoutput={i:output for i in range(1,len(output)+1)}
        return recommended_hotels[:30], newoutput
    

recomended_by_tag, output_tag = new_recommendations_tags_output('Hotel Regina','PARIS',similarityDF)
recomended_by_tag

['Hotel Regina',
 'Hotel Duo',
 'Villa Beaumarchais',
 'Hotel L Antoine',
 'Hotel France d Antin Op ra',
 'Golden Tulip Opera de Noailles',
 'Gardette Park Hotel',
 'Amarante Beau Manoir',
 'Mercure Paris Terminus Nord',
 'Hotel Mayfair Paris',
 'Le 123 S bastopol Astotel',
 'Hotel Hor',
 'Hotel Duc De St Simon',
 'L Empire Paris',
 'Melia Paris Vendome',
 'Hotel Cambon',
 'Maison Albar Hotel Paris C line',
 'Holiday Inn Paris Gare de Lyon Bastille',
 'Hotel Eug ne en Ville',
 'Castille Paris Starhotels Collezione',
 'Royal Saint Michel',
 'Hotel Des Saints Peres Esprit de France',
 'Hotel Duminy Vendome',
 'Hotel De Buci by MH',
 'Hotel Arvor Saint Georges',
 'Melia Paris Notre Dame',
 'Royal Saint Honore',
 'Hotel de Jos phine BONAPARTE',
 'Banke Hotel',
 'Le Pavillon de la Reine Spa']

In [89]:
common_hotels = list(set(recommended_hotels_by_rev).intersection(recomended_by_tag))
print(common_hotels)

['Hotel Duo', 'Golden Tulip Opera de Noailles', 'Mercure Paris Terminus Nord', 'Hotel Regina', 'Royal Saint Michel']


In [90]:
def distance_between_landmark_and_hotel_7(landmarks_df, hoteltags_geo, landmark_name, hotel_name):
    # Convert landmark and hotel names to lowercase for case-insensitive matching
    landmark_name_lower = landmark_name.lower()
    print(landmark_name_lower)
    hotel_name_lower = hotel_name.lower()

    # Filter landmark_df for the given landmark_name and hotel_df for the given hotel_name
    filtered_landmark = landmarks_df[landmarks_df['landmark_name'].str.lower() == landmark_name_lower]

    filtered_hotel = hoteltags_geo[hoteltags_geo['hotel_name'].str.lower() == hotel_name_lower]
    print(filtered_landmark)
    # Check if the filtered DataFrames are empty
    if filtered_landmark.empty:
        print(f"No information found for landmark {landmark_name}.")
        return None

    if filtered_hotel.empty:
        print(f"No information found for hotel {hotel_name}.")
        return None

    # Retrieve the first row of each filtered DataFrame
    landmark_row = filtered_landmark.iloc[0]
    hotel_row = filtered_hotel.iloc[0]

    # Calculate the distance between the hotel and landmark
    distance = haversine_distance(
        hotel_row['lat_x'], hotel_row['lng_x'],
        landmark_row['land_lat_x'], landmark_row['land_lng_x']
    )

    return distance


distance_between_landmark_and_hotel_7(landmarks_df, hoteltags_geo,'LOUVRE MUSEUM' ,'HOTEL REGINA')

louvre museum
   landmark_name  land_lat_x  land_lng_x   city
1  Louvre Museum   48.860611    2.337643  Paris


0.5383411456616147

In [130]:

# new_recommendations é das reviews 
# new_recommendations_tags é das tags 

import webbrowser
## A tentar fazer o output final na função 

def  template_recomendtion_list(city_input,extended_landmarks_df,hotel_review, hoteltags_geo, land_input,cosine_similarities_rev,similarityDF):
    city_to_checkar = city_input
    hotel_name_most_top = get_hotel_with_most_positive_reviews(city_to_checkar, hotel_review)
    description = get_attributes_description_for_hotel(hotel_name_most_top, hoteltags_geo)
    print("-------------------------------- Ideal Hotel --------------------------------")
    print(f"According to our users the best hotel in {city_to_checkar} is {hotel_name_most_top}" )

    print("\n",description, "\n\n")
    if land_input == 'NO':
        closest_landmarks, closest_distances = find_closest_landmarks_for_hotel_5(hoteltags_geo, landmarks_df, hotel_name_most_top)

        if not closest_landmarks:
            print(f"No information about landmarks close to {hotel_name_most_top}.")
        else:
            print(f"The closest landmarks to {hotel_name_most_top} are:")
            for landmark, distance in zip(closest_landmarks, closest_distances):
                print(f"{landmark} - {distance:.2f} km away.")
    else:   
        distance_to_landmark_input_7 = distance_between_landmark_and_hotel_7(extended_landmarks_df, hoteltags_geo, land_input, hotel_name_most_top)
        if distance_to_landmark_input_7 is not None:
            print(f"\nThe distance from {hotel_name_most_top} to {land_input} is: {distance_to_landmark_input_7:.2f} km.")
        else:
            print(f"No information found for the given hotel or landmark.")
            
    print(f"\n\nReviews from our users about {hotel_name_most_top}:\n")
    hotel_name, reviews_for_hotel = get_random_positive_reviews_for_hotel_(hotel_name_most_top, city_to_checkar, hotel_review, num_reviews=10)

    if reviews_for_hotel is None:
        print(f"\n\nNo reviews found for {hotel_name_most_top}.")
    else:

        for index, review_text in enumerate(reviews_for_hotel, start=1):
            print(f"Review {index}: {review_text}\n")

    recommended_hotels_by_rev, output_rev = new_recommendations_rev_output(hotel_name_most_top,city_to_checkar, cosine_similarities_rev)
    recomended_by_tag, output_tag = new_recommendations_tags_output(hotel_name_most_top,city_to_checkar,similarityDF)

    # For viz
    list_map_rev = recommended_hotels_by_rev[:5]
    lis_map_tag = recomended_by_tag[:5]

    # Remove the hotel_name_most_top from the recommendation lists if it's present
    if hotel_name_most_top in recommended_hotels_by_rev:
        recommended_hotels_by_rev.remove(hotel_name_most_top)

    if hotel_name_most_top in recomended_by_tag:
        recomended_by_tag.remove(hotel_name_most_top)

    
    if recommended_hotels_by_rev:
        for index, hotel_recommended in enumerate(recommended_hotels_by_rev[:5], start=1):
                print(list_map_rev)
                print(f"-------------------------------- {index}. Recommended Hotel: {hotel_recommended} --------------------------------")
                print(f"According to our users another similar hotel in {city_input} to {hotel_name_most_top} is {hotel_recommended}")
                print("\n", description, "\n\n")
                if land_input.lower() == 'no':
                    closest_landmarks, closest_distances = find_closest_landmarks_for_hotel_5(hoteltags_geo, extended_landmarks_df, hotel_recommended)
                    if not closest_landmarks:
                        print(f"No information about landmarks close to {hotel_recommended}.")
                    else:
                        print(f"The closest landmarks to {hotel_recommended} are:")
                        for landmark, distance in zip(closest_landmarks, closest_distances):
                            print(f"{landmark} - {distance:.2f} km away.")
                else:
                    distance_to_landmark_input_7 = distance_between_landmark_and_hotel_7(extended_landmarks_df,hoteltags_geo, land_input, hotel_recommended)
                    if not distance_to_landmark_input_7:
                        print(f"No information found for the given hotel or landmark.")
                    else:
                        print(f"\nThe distance from {hotel_recommended} to {land_input} is: {distance_to_landmark_input_7:.2f} km.")

                print(f"\n\nReviews from our users about {hotel_recommended}:\n")
                hotel_name, reviews_for_hotel = get_random_positive_reviews_for_hotel_(hotel_recommended, city_to_checkar,hotel_review, num_reviews=10)

                if reviews_for_hotel is None:
                    print(f"\n\nNo reviews found for {hotel_recommended}.")
                else:
                    for index, review_text in enumerate(reviews_for_hotel, start=1):
                        print(f"Review {index}: {review_text}\n")

        #mapinha -
        hotel_info_df = hoteltags_geo[hoteltags_geo['hotel_name'].isin(list_map_rev)]
        city_landmarks_df = extended_landmarks_df[extended_landmarks_df['city'].str.lower() == city_input.lower()]

        # Separate the first hotel
        first_hotel_name = list_map_rev[0]
        # Find the first hotel row
        first_hotel = hotel_info_df[hotel_info_df['hotel_name'] == first_hotel_name].iloc[0]

        hotel_info_df = hotel_info_df[hotel_info_df['hotel_name'] != first_hotel_name]

        # Calculate the center of the map
        center_latitude = (hotel_info_df['lat_x'].mean() + city_landmarks_df['land_lat_x'].mean()) / 2
        center_longitude = (hotel_info_df['lng_x'].mean() + city_landmarks_df['land_lng_x'].mean()) / 2

        # Create the base map
        my_map1 = folium.Map(location=[center_latitude, center_longitude], zoom_start=14)


        folium.Marker(
            location=[first_hotel['lat_x'], first_hotel['lng_x']],
            popup=first_hotel['hotel_name'],
            icon=folium.Icon(color='red', icon='bed')
            ).add_to(my_map1)

        # hotel markers
        hotel_cluster = MarkerCluster().add_to(my_map1)
        for index, row in hotel_info_df.iterrows():
            folium.Marker(
                location=[row['lat_x'], row['lng_x']],
                popup=row['hotel_name'],
                icon=folium.Icon(color='blue', icon='bed')
            ).add_to(hotel_cluster)

        # landmark markers
        landmark_cluster = MarkerCluster().add_to(my_map1)
        for index, row in city_landmarks_df.iterrows():
            folium.Marker(
                location=[row['land_lat_x'], row['land_lng_x']],
                popup=row['landmark_name'],
                icon=folium.Icon(color='green', icon='info-sign')
            ).add_to(landmark_cluster)

        my_map1
        # Save the map as an HTML file
        map_filename = 'folium_map.html'
        my_map1.save(map_filename)
        webbrowser.open('file://' + os.getcwd() + '/' + map_filename, new=2)    


In [103]:
row

new_tags    {family with young children, penta plus room, ...
Name: 1473, dtype: object

In [134]:
# EM PRINCIIPIO ESTÁ É A VERSÃO FINAL DO OUTPUT TEMPLATE, FALTA RESOLVER O ERRO PARA LANDMARK ESPECIFICA, SE ESCREVERES NO, ELE DEVOLVE AS MAIS PERTO
# E DEPOIS SÓ FALTARIA FAZER O MAPA

print("Welcome to MORIANA the Best Hotel Recomendation System")
print("[MORIANA] - The cities we have data available")
cities = landmarks_df['city'].unique()
display(pd.DataFrame(cities, columns=['City']))


while True: #INPUT LOOP 
    # CITY SELECTION
    print("Which city would you like to search hotels?")
    city_input = input("City: ")
    city_input = city_input.upper()  # Convert input to uppercase
    print(city_input)
    
    if city_input == 'EXIT': #EXIT
        print("Exiting the program.")
        break  # Exit the loop if the user wants to exit

    if city_input not in map(str.upper, cities):
        print(f"\nSorry, we don't have data available regarding {city_input}\n\nHere are the cities we have data available:")
        display(pd.DataFrame(cities, columns=['City']))
    else:
        print(f"\nGreat! Searching for hotels in {city_input}...")

        # LANDMARKS PREFERENCE
        print("\nDo you prefer to stay close to any landmark? Type 'no' if you don't have a specific landmark preference.")
        land_input = input("Landmark: ")
        land_input = land_input.upper() # Convert input to uppercase
        print(land_input)
        
        # Filtering landmarks_df based on the specified city
        landmarks_city_input = landmarks_df[landmarks_df['city'].str.upper() == city_input]

        # Extracting unique landmark names for the specified city
        unique_landmarks_for_city = landmarks_city_input['landmark_name'].unique()
        
        if land_input =='EXIT': #EXIT
            print("Exiting the program.")
            break
        elif land_input == 'NO': # NO -> Continue
            print("\nYou don't have a specific landmark preference. Continuing...\n")
        else:
            while land_input not in map(str.upper, unique_landmarks_for_city):
                print(f"\nSorry, we don't have data regarding the landmark {land_input} in {city_input}. We have limited data about some main landmarks of {city_input}.\n")
                print(f"\nFrom the available data, the landmarks in {city_input} we have are:")
                for landmark in unique_landmarks_for_city:
                    print(landmark)
                land_input = input("\nLandmark: ")
                land_input = land_input.upper() # Convert input to upper
                if land_input =='EXIT': #EXIT
                    print("Exiting the program.")
                    break
            print(f"\nGreat! You prefer to stay close to {land_input}. Continuing search...\n")
        
        break  # Exit the loop if a valid city is provided and landmark preferences are obtained



template_recomendtion_list(city_input,landmarks_df,hotel_review, hoteltags_geo, land_input,cosine_similarities_rev,similarityDF)

Welcome to MORIANA the Best Hotel Recomendation System
[MORIANA] - The cities we have data available


Unnamed: 0,City
0,Paris
1,London
2,Vienna
3,Barcelona
4,Amsterdam
5,Milan


Which city would you like to search hotels?
BARCELONA

Great! Searching for hotels in BARCELONA...

Do you prefer to stay close to any landmark? Type 'no' if you don't have a specific landmark preference.
HYDE PARK

Sorry, we don't have data regarding the landmark HYDE PARK in BARCELONA. We have limited data about some main landmarks of BARCELONA.


From the available data, the landmarks in BARCELONA we have are:
Sagrada Familia
Park Güell
Casa Batlló
La Rambla
Barri Gòtic
Magic Fountain of Montjuïc
Gothic Quarter
Palau de la Música Catalana
Camp Nou
Barcelona Cathedral

Sorry, we don't have data regarding the landmark  in BARCELONA. We have limited data about some main landmarks of BARCELONA.


From the available data, the landmarks in BARCELONA we have are:
Sagrada Familia
Park Güell
Casa Batlló
La Rambla
Barri Gòtic
Magic Fountain of Montjuïc
Gothic Quarter
Palau de la Música Catalana
Camp Nou
Barcelona Cathedral

Sorry, we don't have data regarding the landmark  in BARCELONA. We ha

print("Welcome to MORIANA the Best Hotel Recomendation System")
print("[MORIANA] - The cities we have data available")
cities = landmarks_df['city'].unique()
display(pd.DataFrame(cities, columns=['City']))



while True: #INPUT LOOP 
    # CITY SELECTION
    print("Which city would you like to search hotels?")
    city_input = input("City: ")
    city_input = city_input.upper()  # Convert input to uppercase
    print(city_input)
    
    if city_input == 'EXIT': #EXIT
        print("Exiting the program.")
        break  # Exit the loop if the user wants to exit

    if city_input not in map(str.upper, cities):
        print(f"\nSorry, we don't have data available regarding {city_input}\n\nHere are the cities we have data available:")
        display(pd.DataFrame(cities, columns=['City']))
    else:
        print(f"\nGreat! Searching for hotels in {city_input}...")

        # LANDMARKS PREFERENCE
        print("\nDo you prefer to stay close to any landmark? Type 'no' if you don't have a specific landmark preference.")
        land_input = input("Landmark: ")
        land_input = land_input.upper() # Convert input to uppercase
        print(land_input)
        
        # Filtering landmarks_df based on the specified city
        landmarks_city_input = landmarks_df[landmarks_df['city'].str.upper() == city_input]

        # Extracting unique landmark names for the specified city
        unique_landmarks_for_city = landmarks_city_input['landmark_name'].unique()
        
        if land_input =='EXIT': #EXIT
            print("Exiting the program.")
            break
        elif land_input == 'NO': # NO -> Continue
            print("\nYou don't have a specific landmark preference. Continuing...\n")
        else:
            while land_input not in map(str.upper, unique_landmarks_for_city):
                print(f"\nSorry, we don't have data regarding the landmark {land_input} in {city_input}. We have limited data about some main landmarks of {city_input}.\n")
                print(f"\nFrom the available data, the landmarks in {city_input} we have are:")
                for landmark in unique_landmarks_for_city:
                    print(landmark)
                land_input = input("\nLandmark: ")
                land_input = land_input.upper() # Convert input to upper
                if land_input =='EXIT': #EXIT
                    print("Exiting the program.")
                    break
            print(f"\nGreat! You prefer to stay close to {land_input}. Continuing search...\n")
        
        break  # Exit the loop if a valid city is provided and landmark preferences are obtained




city_to_checkar = city_input
hotel_name_most_top = get_hotel_with_most_positive_reviews(city_to_checkar, hotel_review)
description = get_attributes_description_for_hotel(hotel_name_most_top, hoteltags_geo)
print("-------------------------------- Ideal Hotel --------------------------------")
print(f"According to our users the best hotel in {city_to_checkar} is {hotel_name_most_top}" )

print("\n",description, "\n\n")
if land_input == 'NO':
    closest_landmarks, closest_distances = find_closest_landmarks_for_hotel_5(hoteltags_geo, landmarks_df, hotel_name_most_top)

    if not closest_landmarks:
        print(f"No information about landmarks close to {hotel_name_most_top}.")
    else:
        print(f"The closest landmarks to {hotel_name_most_top} are:")
        for landmark, distance in zip(closest_landmarks, closest_distances):
            print(f"{landmark} - {distance:.2f} km away.")
else:   
    distance_to_landmark_input = distance_between_hotel_and_landmark(hoteltags_geo, landmarks_df, hotel_name_most_top, land_input)
    if not distance_to_landmark:
        print(f"No information found for the given hotel or landmark.")
    else:
        print(f"\nThe distance from {hotel_name_most_top} to {land_input} is: {distance_to_landmark_input:.2f} km.")
        
print(f"\n\nReviews from our users about {hotel_name_most_top}:\n")
hotel_name, reviews_for_hotel = get_random_positive_reviews_for_hotel_(hotel_name_most_top, hotel_review, num_reviews=10)

if reviews_for_hotel is None:
    print(f"\n\nNo reviews found for {hotel_name_most_top}.")
else:

    for index, review_text in enumerate(reviews_for_hotel, start=1):
        print(f"Review {index}: {review_text}\n")
print("-------------------------------- Similar Hotels --------------------------------")
# by review
recommend_by_rev, output_rev = new_recommendations_rev_output(hotel_name_most_top, city_input, cosine_similarities_rev)

## tava a pensar em fazer get list com os hoteis recommendados por review e get list dos hoteis recomendados por tag
## E fazer um check se os hoteis estão em ambas as listas passam poara o output 



