# Scraping Hotel Ratings on Tripadvisor

In this homework we will practice web scraping. Let's get some basic information for each hotel in Boston.

On each hotel page, scrape the Traveler ratings. **(10 pts)**

![Information to be scraped](traveler_ratings.png)

Save the data in "traverler_ratings.csv" in the following format:

hotel_name, rating, count

In [2]:
# Accumulator for scraped review rows; get_reviews_data() appends one list per review.
review_result_list=[]

In [3]:
### All cells need to be run sequentially for the later cells to work.


from bs4 import BeautifulSoup
import sys
import time
import os
import logging
import argparse
import requests
import codecs
import json
import locale

# Select US conventions for the whole process so that locale.atoi()
# (used in get_hotel_data) parses the scraped counts with that locale.
locale.setlocale( locale.LC_ALL, 'en_US.UTF-8' ) 

# Site root; prepended to the site-relative links found in scraped pages.
base_url = "http://www.tripadvisor.com"
# Absolute hotel page URLs, appended by parse_hotellist_page().
all_hotel_url=[]
# Browser-like User-Agent string sent with the tourism and hotel-list requests.
user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.76 Safari/537.36"

# One tuple per hotel (name followed by five review counts), appended by get_hotel_data().
hotel_rating_result=[]
""" STEP 1  """
def get_tourism_page(city, state):
    """
        Return the URL of the tourism page.

        The city and state arguments are accepted but not used: the
        Boston, Massachusetts tourism page is always returned.
    """
    boston_tourism_url = 'https://www.tripadvisor.com/Tourism-g60745-Boston_Massachusetts-Vacations.html'
    return boston_tourism_url


"""STEP 2  """
def get_city_page(tourism_url):
    """
        Get the site-relative URL of the city's hotel listing.

        Downloads tourism_url (the URL returned by get_tourism_page())
        and returns the href of the first link found inside the
        <li class="hotels twoLines"> element of that page.

        Raises requests.RequestException on a network or HTTP failure
        and ValueError when the hotels link is not present in the page.
    """
    headers = {'User-Agent': user_agent}
    # A timeout keeps a stalled connection from hanging the scrape forever.
    response = requests.get(tourism_url, headers=headers, timeout=30)
    response.raise_for_status()

    # response.text is already decoded; parsing it directly avoids a second
    # round of encoding detection on re-encoded bytes.
    soup = BeautifulSoup(response.text, "html.parser")

    li = soup.find("li", {"class": "hotels twoLines"})
    city_url = li.find('a', href=True) if li is not None else None
    if city_url is None:
        raise ValueError("no hotels link found on " + str(tourism_url))

    return city_url['href']

""" STEP 3 """
def get_hotellist_page(city_url, count):
    """ Download one page of the hotel list.

        city_url is a site-relative URL (as returned by get_city_page()
        or get_next_page()); it is appended to base_url. count is the
        page counter passed by the caller; it is accepted for
        compatibility but not used. Nothing is written to disk.

        Sleeps two seconds before the request. Returns the page HTML as
        UTF-8 encoded bytes. Raises requests.RequestException on a
        network or HTTP failure.
    """
    url = base_url + city_url

    # Throttle so consecutive list pages are not requested back to back.
    time.sleep(2)

    headers = {'User-Agent': user_agent}
    # A timeout keeps a stalled connection from hanging the scrape forever.
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()

    return response.text.encode('utf-8')


""" STEP 4 """
def parse_hotellist_page(html):
    """ Parse a hotel list page returned by get_hotellist_page().

        For every element that carries dir="ltr" and an href, scrape the
        hotel's rating counts via get_hotel_data() and append the hotel's
        absolute URL to all_hotel_url. Returns None; the next list page
        is located separately by get_next_page().
    """

    soup = BeautifulSoup(html, "html.parser")
    # href=True skips dir="ltr" elements that are not links; indexing them
    # with ['href'] would raise KeyError and abort the whole page.
    hotel_links = soup.find_all(dir='ltr', href=True)

    for hotel in hotel_links:
        href = hotel['href']
        get_hotel_data(hotel.get_text(), href)
        all_hotel_url.append(base_url + href)
        # Short pause between hotel pages.
        time.sleep(0.5)
        
             
def get_hotel_data(name, link):
    """ Scrape the traveler-rating counts of one hotel.

        name is the hotel name and link its site-relative URL. The hotel
        page is downloaded and a count is read from the label of each of
        the five rating filters, from filterRating_5 down to
        filterRating_1. The tuple (name, count_5, count_4, count_3,
        count_2, count_1) is printed and appended to hotel_rating_result.

        Raises requests.RequestException on a network or HTTP failure and
        IndexError when a rating label or its count is missing.
    """
    url = base_url + link

    # A timeout keeps a stalled connection from hanging the scrape forever.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    label_prefix = 'taplc_prodp13n_hr_sur_review_filter_controls_0_filterRating_'
    vote = []
    for stars in range(5, 0, -1):
        label = soup.find('label', {'for': label_prefix + str(stars)})
        if label is None:
            # Same exception type as indexing an empty result list, but
            # with a message that says which page and label failed.
            raise IndexError("rating label %d not found on %s" % (stars, url))
        # Split the label text into non-empty lines; the second line is
        # parsed as the count.
        data = [i for i in label.get_text().split('\n') if i != '']
        vote.append(locale.atoi(data[1]))

    result = (name,) + tuple(vote)
    print(result)
    hotel_rating_result.append(result)
    

def get_next_page(link):
    """ Find the URL of the next hotel list page.

        Despite its name, link is the HTML of the current list page (as
        returned by get_hotellist_page()). Returns the href of the direct
        child of the pagination block whose text is "Next", or None when
        the next button is disabled (last page), the pagination block is
        missing, or no such child exists.
    """
    soup = BeautifulSoup(link, "html.parser")
    if soup.find('span', {'class': 'nav next ui_button disabled'}):
        print("we reached last page")
        return None

    div = soup.find("div", {"class": "unified pagination standard_pagination"})
    if div is None:
        # No pagination block on this page: there is nothing to follow.
        return None

    # Only direct children are inspected, as before; text nodes and
    # elements without an href are skipped instead of raising.
    for child in div.find_all(href=True, recursive=False):
        if child.get_text() == "Next":
            return child['href']
    return None

        
        
"""
Running:
    give the initial link
    run the while loop if the next page is available

"""
    

def run():
    """ Scrape the rating counts of every hotel on the Boston list pages.

        Starts from the city's hotel list and follows the "Next" links
        until get_next_page() returns None. parse_hotellist_page() fills
        the module-level all_hotel_url and hotel_rating_result lists;
        both are reset at the start and de-duplicated at the end.
    """
    # Rebind the module-level lists. Without this declaration the
    # assignments below only created locals, so neither the reset nor the
    # final de-duplication touched the lists the helpers append to.
    global all_hotel_url, hotel_rating_result
    all_hotel_url = []
    hotel_rating_result = []

    # Initial city link.
    cityLink = get_city_page(get_tourism_page('Boston', 'Massachusetts'))
    c = 0
    while True:
        c += 1
        # Get the html of the current list page.
        html = get_hotellist_page(cityLink, c)
        # Extract the data from the html.
        parse_hotellist_page(html)
        # Move on to the next list page, if any.
        cityLink = get_next_page(html)
        if cityLink is None:
            print()
            print("finished!")
            break

    # Remove the duplicated hotel links (order is not preserved).
    all_hotel_url = list(set(all_hotel_url))
    # Remove the duplicated hotel ratings (order is not preserved).
    hotel_rating_result = list(set(hotel_rating_result))
        
        




In [4]:

# running cell

import pandas as pd

# Scrape every hotel list page; the helpers fill all_hotel_url and
# hotel_rating_result as a side effect.
run()

# Drop duplicates collected across list pages (order is not preserved).
all_hotel_url = list(set(all_hotel_url))
hotel_rating_result = list(set(hotel_rating_result))

# One row per hotel: the name followed by the five review counts.
rating_columns = ['hotel_name', 'Excellent', 'Very good', 'Average', 'Poor', 'Terrible']
rating_df = pd.DataFrame(hotel_rating_result, columns=rating_columns)
rating_df.to_csv('traverler_ratings.csv')

# Keep the hotel URLs on disk as well.
url_list = pd.DataFrame(all_hotel_url, columns=['hotel_url'])
url_list.to_csv('all_hotel_url.csv')

('The Inn at Longwood Medical', 480, 247, 133, 65, 51)
('Marriott Vacation Club Pulse at Custom House, Boston', 509, 176, 13, 13, 12)
('Boston Harbor Hotel', 1284, 230, 55, 21, 16)
('Seaport Boston Hotel', 3094, 733, 132, 50, 37)
('Four Seasons Hotel Boston', 1139, 198, 53, 27, 17)
('Lenox Hotel', 2191, 552, 120, 45, 20)
('InterContinental Boston', 2130, 713, 174, 69, 61)
('Hotel Commonwealth', 3121, 576, 127, 50, 22)
('Courtyard Boston Copley Square', 1060, 377, 62, 16, 15)
('Mandarin Oriental, Boston', 436, 82, 34, 9, 8)
('Residence Inn Boston Back Bay/Fenway', 891, 289, 55, 15, 7)
('Eliot Hotel', 675, 254, 71, 36, 24)
('XV Beacon', 763, 170, 37, 30, 14)
('Residence Inn Boston Downtown/Seaport', 280, 100, 13, 9, 3)
('Hilton Garden Inn Boston Logan Airport', 476, 123, 24, 8, 3)
('Kimpton Nine Zero Hotel', 1494, 587, 127, 31, 27)
('The Verb Hotel', 929, 322, 75, 20, 14)
('The Langham, Boston', 1134, 490, 141, 50, 34)
('The Godfrey Hotel Boston', 1002, 328, 94, 34, 12)
('Colonnade Hotel

Next, scrape all the reviews of each hotel for the star ratings of the following attributes: Value, Location, Sleep Quality, Rooms, Cleanliness, Service. Note that some reviews may not have attribute ratings and some may only have some of the attributes. **(25 pts)**

![Information to be scraped](attribute_ratings.png)

Save the data in "attribute_ratings.csv" in the following format:

hotel_name, review_id, attribute, star_value

In [9]:

#print(len(all_hotel_url))

# Review attributes to collect. get_reviews_data() appends one value per entry,
# in this order, to every row it builds.
attributes=['Value','Rooms','Location','Cleanliness','Sleep Quality','Service']

# passed in hotel_url, and generate the list of review links 
# pass the list to get the data
def get_review_page(hotel_url,rating):
    """ Scrape one page of a hotel's reviews and locate the next page.

        Downloads hotel_url, collects the links of the individual reviews
        on it (hrefs starting with '/ShowUser' on the parent of each
        <span class="noQuotes">) and passes them, together with rating,
        to get_reviews_data().

        Returns the absolute URL of the next review page, or None when
        the page has no "next" link. Raises requests.RequestException on
        a network or HTTP failure.
    """
    # A timeout keeps a stalled connection from hanging the scrape forever.
    response = requests.get(hotel_url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # Collect the links of all the reviews shown on this page.
    review_link_list = []
    for title in soup.find_all('span', {"class": 'noQuotes'}):
        # .get() skips titles whose parent has no href instead of raising KeyError.
        href = str(title.parent.get('href', ''))
        if href.startswith('/ShowUser'):
            review_link_list.append(base_url + href)

    # Extract the data of every review and store it.
    get_reviews_data(review_link_list, rating)

    # Look the "next" link up explicitly; the earlier bare except around an
    # indexed lookup also swallowed unrelated errors and KeyboardInterrupt.
    next_link = soup.find('a', {"class": "nav next rndBtn ui_button primary taLnk"})
    if next_link is None or not next_link.has_attr('href'):
        return None
    return base_url + next_link['href']

def _parse_attribute_ratings(soup):
    """ Return {attribute name: star value} for one parsed review page.

        Returns an empty dict when the review has no rating list or the
        list cannot be parsed.
    """
    try:
        rating_list = soup.find_all('div', {'class': 'rating-list'})[0]
        pair = {}
        for attribute in rating_list.find_all('li', {'class': "recommend-answer"}):
            label = attribute.find('div', {'class': 'recommend-description'}).find(text=True)
            # The star value is the first character of the image's alt text.
            pair[label] = attribute.find('img')['alt'][0]
        return pair
    except (IndexError, AttributeError, TypeError, KeyError):
        # Missing list, description, image or alt text: treat the review
        # as having no attribute ratings, as before, but let unrelated
        # errors (and KeyboardInterrupt) propagate.
        return {}


def get_reviews_data(url_list,hotel_rating):
    """ Scrape the attribute ratings of every review in url_list.

        For each review URL one row is appended to review_result_list:
        [hotel_name, review_id, one value per entry of attributes,
        hotel_rating]. An attribute the review does not rate is stored
        as None.

        Raises requests.RequestException on a network or HTTP failure and
        IndexError when a page lacks the hotel name or the review body.
    """
    # All reviews on one page.
    for url in url_list:
        # A timeout keeps a stalled connection from hanging the scrape forever.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        hotel_name = soup.find_all('span', {'class': 'altHeadInline'})[0].get_text().strip('\n')
        review_id = soup.find_all('p', {"property": "reviewBody"})[0]['id']

        pair = _parse_attribute_ratings(soup)

        # Final row: fixed attribute order, None where the review has no value.
        data_to_write = [hotel_name, review_id]
        data_to_write.extend(pair.get(x) for x in attributes)
        data_to_write.append(hotel_rating)

        review_result_list.append(data_to_write)

def get_rating(url):
    """ Return the overall rating shown on the hotel page at url.

        The value is the 'content' attribute of the first <span> inside
        the <div class="prw_rup prw_common_bubble_rating bubble_rating">
        element, returned exactly as it appears in the page.

        Raises requests.RequestException on a network or HTTP failure and
        ValueError when the rating element is not in the page.
    """
    # A timeout keeps a stalled connection from hanging the scrape forever.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    bubble = soup.find('div', {'class': 'prw_rup prw_common_bubble_rating bubble_rating'})
    span = bubble.find('span') if bubble is not None else None
    if span is None or not span.has_attr('content'):
        # The previous fallback indexed the result of find() with [0],
        # which can never succeed; fail with a clear message instead.
        raise ValueError("no overall rating found on " + str(url))

    return span['content']

def findLaststop():
    """ Locate the hotel of the last entry in final_list.

        Takes the first field of the last entry, drops its first two
        space-separated words and any '-' tokens, joins the rest with
        underscores and prints both the entry and the joined string.
        Returns the index of the first URL in all_hotel_url containing
        that string, or None when no URL matches.
    """
    newest = final_list[len(final_list) - 1]
    print(newest)

    words = newest[0].split(' ')[2:]
    s = '_'.join(word for word in words if word != '-')
    print(s)

    for position, candidate in enumerate(all_hotel_url):
        if s in candidate:
            return position
    return None
    
def run(url_index):
    """ Scrape every review page of the hotel at all_hotel_url[url_index].

        Prints the hotel URL, reads the hotel's overall rating once, then
        follows the review pages until get_review_page() returns None.
        Afterwards all rows collected so far in review_result_list are
        written to 'final_attribute_ratings_part2.csv'.
    """
    page_url = all_hotel_url[url_index]
    print(page_url)
    overall_rating = get_rating(page_url)

    # get_review_page() returns the next page's URL, or None after the last page.
    while page_url is not None:
        page_url = get_review_page(page_url, overall_rating)

    print('We are done for this hotel!!! Move on!',url_index)
    column_names = ['hotel_name', 'review_id', 'Value','Rooms','Location','Cleanliness','Sleep Quality','Service','star_rating']
    review_list_df = pd.DataFrame(review_result_list, columns=column_names)
    review_list_df.to_csv('final_attribute_ratings_part2.csv')
    print()
        

# hotel_url='http://www.tripadvisor.com/Hotel_Review-g60745-d8145466-Reviews-Aloft_Boston_Seaport-Boston_Massachusetts.html'
# count=0
# while(True):
#     new_url = get_review_page(hotel_url,'4.5')
#     hotel_url=new_url
#     count+=1
#     print(hotel_url)
#     if(count==18 or hotel_url==None):
#         print()
#         print('we are done for this hotel!!!!!')
#         break


####################### running section ######################
# I only got the code fully working 18 hours ago, and I don't have enough time to
# mine all the reviews, so my attribute_rating.csv is partial. You can run any hotel by
# changing the loop range used to call the run function (the range is over the indexes of all hotel URLs).



In [10]:
# when I run the above code, my laptop went to sleep, that cost the connection lost..



# Resume point and exclusive end of the hotel indexes to scrape. The end is
# capped at the list length: an index past it would raise IndexError on
# every attempt and the loop would retry forever.
loopindex=40
last_index = min(82, len(all_hotel_url))
while loopindex < last_index:
    # Remember how many rows exist so a failed attempt can be rolled back.
    checkpoint = len(review_result_list)
    try:
        print('start hotelurl[',loopindex,']')
        run(loopindex)
        loopindex+=1
    except Exception as exc:
        # run() restarts the hotel from its first review page, so drop the
        # partial rows of the failed attempt to avoid duplicates.
        del review_result_list[checkpoint:]
        # Catching Exception (not a bare except) lets Ctrl-C stop the loop.
        print('hotelurl[', loopindex, '] failed:', repr(exc), '- retrying in 100 s')
        time.sleep(100)
        


start hotelurl[ 40 ]
http://www.tripadvisor.com/Hotel_Review-g60745-d268205-Reviews-Embassy_Suites_by_Hilton_Boston_at_Logan_Airport-Boston_Massachusetts.html
We are done for this hotel!!! Move on! 40

start hotelurl[ 41 ]
http://www.tripadvisor.com/Hotel_Review-g60745-d217546-Reviews-Wyndham_Boston_Beacon_Hill-Boston_Massachusetts.html
We are done for this hotel!!! Move on! 41

start hotelurl[ 42 ]
http://www.tripadvisor.com/Hotel_Review-g60745-d94330-Reviews-Seaport_Boston_Hotel-Boston_Massachusetts.html
We are done for this hotel!!! Move on! 42

start hotelurl[ 43 ]
http://www.tripadvisor.com/Hotel_Review-g60745-d287606-Reviews-Kimpton_Onyx_Hotel-Boston_Massachusetts.html
start hotelurl[ 43 ]
http://www.tripadvisor.com/Hotel_Review-g60745-d287606-Reviews-Kimpton_Onyx_Hotel-Boston_Massachusetts.html
We are done for this hotel!!! Move on! 43

start hotelurl[ 44 ]
http://www.tripadvisor.com/Hotel_Review-g60745-d222957-Reviews-BEST_WESTERN_PLUS_Roundhouse_Suites-Boston_Massachusetts.htm