## Get data

In [1]:
import pandas as pd
import requests
from io import BytesIO
import numpy as np

from nltk.corpus import stopwords
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

import re
import random
import plotly.graph_objs as go
import plotly.plotly as py
import cufflinks

pd.options.display.max_columns = 30

from IPython.core.interactiveshell import InteractiveShell
import plotly.figure_factory as ff

InteractiveShell.ast_node_interactivity = 'all'

from plotly.offline import iplot
cufflinks.go_offline()
cufflinks.set_config_file(world_readable=True, theme='solar')

In [2]:
# Shareable link of the Google Sheet (without any export suffix);
# split across two literals only to keep the line short.
data = (
    'https://docs.google.com/spreadsheets/d/'
    '1HE6vUktQd7p0sYXmk87l6Rc4MQyUkJhEHBmsHJ9GyTY'
)

In [3]:
def get_spreadsheet(link, timeout=30):
    """Download a Google Sheet as CSV and load it into a DataFrame.

    Parameters
    ----------
    link : str
        Base spreadsheet URL; the CSV export path is appended to it.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    pandas.DataFrame
        The parsed CSV payload of the response.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    """
    # Let requests build the query string; the original's trailing '&id'
    # carried no value and is dropped.
    r = requests.get(link.rstrip('/') + '/export',
                     params={'format': 'csv'},
                     timeout=timeout)
    # Fail loudly instead of handing an error page to the CSV parser.
    r.raise_for_status()
    return pd.read_csv(BytesIO(r.content))

In [4]:
# `data` starts out as the sheet URL and is rebound to the loaded frame, so
# only fetch while it is still a string; re-running this cell would otherwise
# pass the DataFrame back into get_spreadsheet.
if isinstance(data, str):
    data = get_spreadsheet(data)
data.head()

Unnamed: 0,name,address,desc
0,Hilton Garden Seattle Downtown,"1821 Boren Avenue, Seattle Washington 98101 USA","Located on the southern tip of Lake Union, the..."
1,Sheraton Grand Seattle,"1400 6th Avenue, Seattle, Washington 98101 USA","Located in the city's vibrant core, the Sherat..."
2,Crowne Plaza Seattle Downtown,"1113 6th Ave, Seattle, WA 98101","Located in the heart of downtown Seattle, the ..."
3,Kimpton Hotel Monaco Seattle,"1101 4th Ave, Seattle, WA98101",What?s near our hotel downtown Seattle locatio...
4,The Westin Seattle,"1900 5th Avenue, Seattle, Washington 98101 USA",Situated amid incredible shopping and iconic a...


In [6]:
# Work on a copy: plain assignment would only alias `data`, so the columns
# added to `df` and the in-place re-indexing done later would also change
# the raw frame.
df = data.copy()

In [5]:
def print_description(index, frame=None):
    """Print the description and name of the row whose index label is `index`.

    Parameters
    ----------
    index : hashable
        Index label of the row to show.
    frame : pandas.DataFrame, optional
        Frame with 'desc' and 'name' columns; defaults to the notebook-level
        `df`.

    Prints nothing when no row carries that label.
    """
    if frame is None:
        frame = df
    rows = frame.loc[frame.index == index, ['desc', 'name']]
    # Test for a match *before* taking the first row: the original indexed
    # `.values[0]` first, so a missing label raised IndexError, and its length
    # check ran on the 2-element row and could never be false.
    if rows.empty:
        return
    desc, name = rows.values[0]
    print(desc)
    print('Name:', name)

In [7]:
# Show the description stored under index label 10.
print_description(index=10)

Soak up the vibrant scene in the Living Room Bar and get in the mix with our live music and DJ series before heading to a memorable dinner at TRACE. Offering inspired seasonal fare in an award-winning atmosphere, it's a not-to-be-missed culinary experience in downtown Seattle. Work it all off the next morning at FIT®, our state-of-the-art fitness center before wandering out to explore many of the area's nearby attractions, including Pike Place Market, Pioneer Square and the Seattle Art Museum. As always, we've got you covered during your time at W Seattle with our signature Whatever/Whenever® service - your wish is truly our command.
Name: W Seattle


In [8]:
# Show the description stored under index label 100.
print_description(index=100)

On a budget in Seattle or looking for something different? The historic charm and "home away from home" atmosphere of The Baroness will be sure to make you feel like one of the family. Conveniently located on First Hill, we are proud to be part of the Virginia Mason Hospital campus and only minutes from Harborview Medical Center and Swedish Hospital. The Baroness Hotel is a great option for short or long term medical, patient or family stays. Whether you are visiting the area's world-class medical facilities or on a budget vacation, our goal is to ensure a wonderful stay. Guest Amenities: Complimentary Internet access, Two twin, one or two queen studios with mini fridge and microwave, Two twin or one queen suites with full kitchens, Laundry facilities available, Flat screen cable television with HBO, Complimentary local calls, Ice and vending machines located in the lobby, Coffee maker and hairdryers in all guestrooms, Room service available seven days a week from the Rhododendron Cafe

In [10]:
# EDA
# Token (vocabulary) Frequency Distribution Before Removing Stop Words

def get_top_n_words(corpus, n=None, stop_words=None):
    """Return the `n` most frequent single-word terms in `corpus`.

    Parameters
    ----------
    corpus : iterable of str
        Documents whose tokens are counted.
    n : int, optional
        Number of (term, count) pairs to keep; None keeps every term.
    stop_words : optional
        Forwarded to ``CountVectorizer(stop_words=...)``. The default None
        keeps all tokens, which is what this definition always did.

    Returns
    -------
    list of (term, count) tuples, sorted by descending count.
    """
    vec = CountVectorizer(stop_words=stop_words)
    # fit_transform learns the vocabulary and builds the count matrix in a
    # single call instead of fit() followed by a separate transform().
    bag_of_words = vec.fit_transform(corpus)
    # Column totals flattened to 1-D: total occurrences per vocabulary index.
    totals = np.asarray(bag_of_words.sum(axis=0)).ravel()
    words_freq = [(word, totals[idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]

# 20 most frequent single words, shown as a term -> count table in ascending order.
common_words = get_top_n_words(df['desc'], n=20)
df1 = pd.DataFrame(common_words, columns=['desc', 'count'])
df1.groupby('desc')['count'].sum().sort_values()

desc
on           129
downtown     133
are          136
center       151
or           161
your         186
for          216
from         224
at           231
is           271
with         280
hotel        295
you          304
our          359
in           449
to           471
seattle      533
of           536
and         1062
the         1258
Name: count, dtype: int64

In [11]:
# Token (vocabulary) Frequency Distribution After Removing Stop Words

def get_top_n_words(corpus, n=None, stop_words='english'):
    """Return the `n` most frequent single-word terms in `corpus`, stop words removed.

    Parameters
    ----------
    corpus : iterable of str
        Documents whose tokens are counted.
    n : int, optional
        Number of (term, count) pairs to keep; None keeps every term.
    stop_words : optional
        Forwarded to ``CountVectorizer(stop_words=...)``. Defaults to
        'english', the value this definition previously hard-coded.

    Returns
    -------
    list of (term, count) tuples, sorted by descending count.
    """
    vec = CountVectorizer(stop_words=stop_words)
    # fit_transform learns the vocabulary and builds the count matrix in a
    # single call instead of fit() followed by a separate transform().
    bag_of_words = vec.fit_transform(corpus)
    # Column totals flattened to 1-D: total occurrences per vocabulary index.
    totals = np.asarray(bag_of_words.sum(axis=0)).ravel()
    words_freq = [(word, totals[idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]

# 20 most frequent single words after stop-word removal, in ascending order of count.
common_words = get_top_n_words(df['desc'], n=20)
df2 = pd.DataFrame(common_words, columns=['desc', 'count'])
df2.groupby('desc')['count'].sum().sort_values()

desc
breakfast     68
room          77
city          79
just          82
business      87
inn           89
pike          90
enjoy         93
market        97
space         97
airport       99
place        102
stay         105
rooms        106
located      108
free         123
downtown     133
center       151
hotel        295
seattle      533
Name: count, dtype: int64

In [12]:
# Bigrams Frequency Distribution Before Removing Stop Words

def get_top_n_bigram(corpus, n=None, stop_words=None):
    """Return the `n` most frequent two-word sequences in `corpus`.

    Parameters
    ----------
    corpus : iterable of str
        Documents whose bigrams are counted.
    n : int, optional
        Number of (bigram, count) pairs to keep; None keeps every bigram.
    stop_words : optional
        Forwarded to ``CountVectorizer(stop_words=...)``. The default None
        keeps all tokens, which is what this definition always did.

    Returns
    -------
    list of (bigram, count) tuples, sorted by descending count.
    """
    vec = CountVectorizer(ngram_range=(2, 2), stop_words=stop_words)
    # fit_transform learns the vocabulary and builds the count matrix in a
    # single call instead of fit() followed by a separate transform().
    bag_of_words = vec.fit_transform(corpus)
    # Column totals flattened to 1-D: total occurrences per vocabulary index.
    totals = np.asarray(bag_of_words.sum(axis=0)).ravel()
    words_freq = [(word, totals[idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]

# 20 most frequent bigrams as a term -> count table, most frequent first.
common_words = get_top_n_bigram(df['desc'], n=20)
df3 = pd.DataFrame(common_words, columns=['desc', 'count'])
df3.groupby('desc')['count'].sum().sort_values(ascending=False)

desc
in the              147
of the              133
pike place           86
place market         85
to the               81
downtown seattle     79
from the             79
and the              72
space needle         68
at the               67
in seattle           60
the seattle          57
our hotel            54
hotel is             49
of our               45
the city             45
of seattle           44
one of               42
you ll               41
the space            40
Name: count, dtype: int64

In [13]:
# Bigrams Frequency Distribution After Removing Stop Words

def get_top_n_bigram(corpus, n=None, stop_words='english'):
    """Return the `n` most frequent two-word sequences in `corpus`, stop words removed.

    Parameters
    ----------
    corpus : iterable of str
        Documents whose bigrams are counted.
    n : int, optional
        Number of (bigram, count) pairs to keep; None keeps every bigram.
    stop_words : optional
        Forwarded to ``CountVectorizer(stop_words=...)``. Defaults to
        'english', the value this definition previously hard-coded.

    Returns
    -------
    list of (bigram, count) tuples, sorted by descending count.
    """
    vec = CountVectorizer(ngram_range=(2, 2), stop_words=stop_words)
    # fit_transform learns the vocabulary and builds the count matrix in a
    # single call instead of fit() followed by a separate transform().
    bag_of_words = vec.fit_transform(corpus)
    # Column totals flattened to 1-D: total occurrences per vocabulary index.
    totals = np.asarray(bag_of_words.sum(axis=0)).ravel()
    words_freq = [(word, totals[idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]

# 20 most frequent bigrams after stop-word removal, most frequent first.
common_words = get_top_n_bigram(df['desc'], n=20)
df4 = pd.DataFrame(common_words, columns=['desc', 'count'])
df4.groupby('desc')['count'].sum().sort_values(ascending=False)

desc
pike place               86
place market             85
downtown seattle         80
space needle             68
wi fi                    37
guest rooms              34
seattle hotel            33
pacific northwest        33
fitness center           32
hotel seattle            30
lake union               30
24 hour                  28
international airport    27
business center          27
seattle airport          26
high speed               25
seattle tacoma           25
university washington    24
seattle center           24
convention center        21
Name: count, dtype: int64

In [14]:
# Trigrams Frequency Distribution Before Removing Stop Words

def get_top_n_trigram(corpus, n=None, stop_words=None):
    """Return the `n` most frequent three-word sequences in `corpus`.

    Parameters
    ----------
    corpus : iterable of str
        Documents whose trigrams are counted.
    n : int, optional
        Number of (trigram, count) pairs to keep; None keeps every trigram.
    stop_words : optional
        Forwarded to ``CountVectorizer(stop_words=...)``. The default None
        keeps all tokens, which is what this definition always did.

    Returns
    -------
    list of (trigram, count) tuples, sorted by descending count.
    """
    vec = CountVectorizer(ngram_range=(3, 3), stop_words=stop_words)
    # fit_transform learns the vocabulary and builds the count matrix in a
    # single call instead of fit() followed by a separate transform().
    bag_of_words = vec.fit_transform(corpus)
    # Column totals flattened to 1-D: total occurrences per vocabulary index.
    totals = np.asarray(bag_of_words.sum(axis=0)).ravel()
    words_freq = [(word, totals[idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]

# 20 most frequent trigrams as a term -> count table, most frequent first.
common_words = get_top_n_trigram(df['desc'], n=20)
df5 = pd.DataFrame(common_words, columns=['desc', 'count'])
df5.groupby('desc')['count'].sum().sort_values(ascending=False)

desc
pike place market               85
the space needle                39
the heart of                    33
in the heart                    28
located in the                  26
place market and                24
the pacific northwest           23
university of washington        23
one of the                      22
tacoma international airport    21
seattle tacoma international    21
easy access to                  20
free wi fi                      19
of the city                     17
washington state convention     17
our hotel is                    16
of downtown seattle             16
seattle art museum              16
state convention center         15
hotel in seattle                15
Name: count, dtype: int64

In [16]:
# Trigrams Frequency Distribution After Removing Stop Words

def get_top_n_trigram(corpus, n=None, stop_words='english'):
    """Return the `n` most frequent three-word sequences in `corpus`, stop words removed.

    Parameters
    ----------
    corpus : iterable of str
        Documents whose trigrams are counted.
    n : int, optional
        Number of (trigram, count) pairs to keep; None keeps every trigram.
    stop_words : optional
        Forwarded to ``CountVectorizer(stop_words=...)``. Defaults to
        'english', the value this definition previously hard-coded.

    Returns
    -------
    list of (trigram, count) tuples, sorted by descending count.
    """
    vec = CountVectorizer(ngram_range=(3, 3), stop_words=stop_words)
    # fit_transform learns the vocabulary and builds the count matrix in a
    # single call instead of fit() followed by a separate transform().
    bag_of_words = vec.fit_transform(corpus)
    # Column totals flattened to 1-D: total occurrences per vocabulary index.
    totals = np.asarray(bag_of_words.sum(axis=0)).ravel()
    words_freq = [(word, totals[idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]

# 20 most frequent trigrams after stop-word removal, most frequent first.
common_words = get_top_n_trigram(df['desc'], n=20)
df6 = pd.DataFrame(common_words, columns=['desc', 'count'])
df6.groupby('desc')['count'].sum().sort_values(ascending=False)

desc
pike place market               85
tacoma international airport    21
seattle tacoma international    21
free wi fi                      19
washington state convention     17
seattle art museum              16
place market seattle            16
state convention center         15
high speed internet             14
space needle pike               12
needle pike place               11
south lake union                11
sea tac airport                 10
downtown seattle hotel          10
home away home                   9
link light rail                  8
just minutes away                8
heart downtown seattle           8
free high speed                  8
24 hour fitness                  7
Name: count, dtype: int64

In [17]:
# Whitespace-delimited token count of every description (non-strings are
# stringified first), stored as a new column.
df['word_count'] = df['desc'].apply(lambda text: len(str(text).split()))
desc_lengths = df['word_count'].tolist()

# One print call with comma-separated parts, so the default single-space
# separator lands between every label and value.
print(
    "Number of descriptions:", len(desc_lengths),
    "\nAverage word count", np.average(desc_lengths),
    "\nMinimum word count", min(desc_lengths),
    "\nMaximum word count", max(desc_lengths),
)

Number of descriptions: 152 
Average word count 156.94736842105263 
Minimum word count 16 
Maximum word count 494


In [18]:
# Display the per-description word counts computed above.
df.loc[:, 'word_count']

0      184
1      152
2      147
3      150
4      151
5      136
6       70
7      117
8      106
9      118
10     105
11      80
12     133
13      50
14      80
15     155
16     140
17     163
18     100
19     260
20     163
21      39
22     171
23     134
24     168
25     199
26     161
27     173
28     220
29     169
      ... 
122    144
123     38
124    129
125    157
126    213
127    227
128    166
129    116
130    160
131    285
132    143
133    459
134    244
135     73
136    274
137    126
138    143
139    192
140    265
141    494
142    225
143     53
144     36
145     16
146    178
147    198
148     98
149    143
150     57
151    250
Name: word_count, Length: 152, dtype: int64

In [20]:
import nltk

# Fetch the stop-word corpus only when it is not already installed, so a
# re-run of the notebook does not hit the network again.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mragu\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [21]:
# Text preprocessing

# Raw strings: the regex escapes (\[ \] \|) are not valid Python string
# escapes, and newer interpreters warn about them in ordinary literals.
# Separator punctuation that clean_text replaces with a space.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Anything except digits, lowercase letters, space, '#', '+' and '_';
# clean_text deletes these characters.
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
# English stop words held in a set for fast membership tests in clean_text.
STOPWORDS = {*stopwords.words('english')}

def clean_text(text):
    """
        Normalise one description: lowercase it, turn separator punctuation
        into spaces, delete the remaining disallowed symbols and drop stop
        words.

        text: a string (None/NaN is treated as an empty description)

        return: the cleaned string
    """
    if not isinstance(text, str):
        # A missing cell is not a str; the original crashed on .lower() here.
        if pd.isna(text):
            return ''
        text = str(text)
    text = text.lower()  # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text)  # separator symbols -> space
    text = BAD_SYMBOLS_RE.sub('', text)  # delete every other disallowed symbol
    # remove stopwords; split() also collapses the runs of spaces made above
    return ' '.join(word for word in text.split() if word not in STOPWORDS)
    
# Cleaned copy of every description, kept beside the raw text.
df['desc_clean'] = df['desc'].map(clean_text)

In [23]:
# Modelling

# Index the frame by hotel name. Guarded and not in-place, so re-running this
# cell does not raise KeyError once 'name' has already become the index.
if 'name' in df.columns:
    df = df.set_index('name')

# min_df=1 (the library default) keeps every term, exactly as 0 was meant to;
# an integer min_df of 0 is rejected by recent scikit-learn parameter checks.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=1, stop_words='english')
tfidf_matrix = tf.fit_transform(df['desc_clean'])
# TF-IDF rows are L2-normalised by default, so the plain dot product from
# linear_kernel is the cosine similarity between descriptions.
cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

# Row position -> hotel name lookup used by recommendations().
indices = pd.Series(df.index)

def recommendations(name, cosine_similarities = cosine_similarities, top_n = 10):
    """Return the names of the hotels whose descriptions are most similar to `name`.

    Parameters
    ----------
    name : str
        Hotel name exactly as it appears in `indices`.
    cosine_similarities : 2-D array-like, optional
        Pairwise similarity matrix whose rows/columns follow the order of
        `indices`; defaults to the matrix computed at definition time.
    top_n : int, optional
        Number of recommendations to return (default 10).

    Returns
    -------
    list of str
        Up to `top_n` hotel names, most similar first, never including
        `name` itself.

    Raises
    ------
    IndexError
        If `name` is not present in `indices`.
    """
    # getting the position of the hotel that matches the name
    matches = indices[indices == name]
    if matches.empty:
        raise IndexError('no hotel named {!r} in the similarity index'.format(name))
    idx = matches.index[0]

    # Drop the query hotel explicitly rather than assuming it sorts first:
    # a tie (identical description) or an all-zero vector can displace it.
    score_series = pd.Series(cosine_similarities[idx]).drop(idx)
    ranked = score_series.sort_values(ascending = False)

    # map the best-scoring row positions back to hotel names
    return [indices.iloc[i] for i in ranked.index[:top_n]]

In [24]:
# Hotels whose descriptions are closest to this airport hotel's.
recommendations(name='Hilton Seattle Airport & Conference Center')

['Embassy Suites by Hilton Seattle Tacoma International Airport',
 'DoubleTree by Hilton Hotel Seattle Airport',
 'Seattle Airport Marriott',
 'Motel 6 Seattle Sea-Tac Airport South',
 'Econo Lodge SeaTac Airport North',
 'Four Points by Sheraton Downtown Seattle Center',
 'Knights Inn Tukwila',
 'Econo Lodge Renton-Bellevue',
 'Hampton Inn Seattle/Southcenter',
 'Radisson Hotel Seattle Airport']