# Webscraping demo
## Tripadvisor
### Changes may be required due to Tripadvisor's continuous updates

(c) Nuno António 2020 - Rev. 1.00

### Load packages and do the initializations

In [45]:
# Load libraries
import numpy as np
import pandas as pd
from urllib.request import urlopen
from bs4 import BeautifulSoup
import ssl
from selenium import webdriver
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.options import Options

In [46]:
# Allow unverified SSL (Secure Socket Layer) certificates to be opened by
# replacing the ssl module's default HTTPS context factory with the unverified one.
# NOTE(review): this disables certificate checks for everything in this kernel that
# relies on the default HTTPS context. `urlopen` is imported above but is not called
# in the visible cells, so confirm this override is still needed.
ssl._create_default_https_context = ssl._create_unverified_context

In [47]:
# Create the Firefox options (configuration) object with its default settings.
# It is read as a global by openPageReadHTML when the browser is started.
options = Options()

In [48]:
# Load the list of hotels whose reviews will be scraped, from sheet "Sheet1" of
# HotelsToScrap.xlsx. The "ID" column becomes the index and is used as the hotel
# identifier in the main loop; the main loop also reads each row's "URL" column.
hotelsToScrap = pd.read_excel("HotelsToScrap.xlsx", sheet_name="Sheet1", index_col="ID")

In [49]:
# Create an empty dataframe for the results: one typed, zero-length column per field
hotelReviews = pd.DataFrame({
    columnName: pd.Series([], dtype=columnType)
    for columnName, columnType in (
        ('hotelID', 'string'),
        ('user', 'string'),
        ('rating', 'float'),
        ('text', 'string'),
    )
})

### Functions to use in the Main Loop

In [50]:
# Open page and read HTML
def openPageReadHTML(url):
    """Open `url` in Firefox through Selenium and return the rendered page as a BeautifulSoup object.

    A new Firefox instance is started on every call (using the module-level
    `options` object) and is always shut down before returning, even if
    loading or interacting with the page raises.

    Parameters
    ----------
    url : str
        Address of the page to load.

    Returns
    -------
    bs4.BeautifulSoup
        Parsed HTML of the page as it was after the optional button click.
    """

    # Open Firefox with Selenium
    # NOTE(review): `firefox_binary=` and `find_elements_by_class_name` are older
    # Selenium APIs; confirm they are still available in the installed version.
    binary = FirefoxBinary('/Applications/Firefox.app/Contents/MacOS/firefox-bin')  # replace the Firefox path with the one on your computer
    browser = webdriver.Firefox(firefox_binary=binary, options=options)

    try:
        browser.get(url)

        # Move to the first matching button and click it
        # (presumably the "read more" control that expands review text; the class
        # name is site-generated and may change, in which case nothing is clicked)
        read_more_buttons = browser.find_elements_by_class_name("_3maEfNCR")
        if read_more_buttons:
            browser.execute_script("arguments[0].scrollIntoView(true);", read_more_buttons[0])
            browser.execute_script("arguments[0].click()", read_more_buttons[0])

        # Read the content
        html_source = browser.page_source
    finally:
        # Always close the browser so a failure does not leave a Firefox process running
        browser.quit()

    # Transform the HTML into a BeautifulSoup object; the parser is named explicitly
    # so the result does not depend on which optional parsers happen to be installed
    soupObj = BeautifulSoup(html_source, "html.parser")

    return soupObj

In [51]:
# Process each page
def processPage(soupObj, hotelID, extractedDF):
    """Extract the reviews found in one parsed page and add them to `extractedDF`.

    Parameters
    ----------
    soupObj : bs4.BeautifulSoup
        Parsed page, as returned by openPageReadHTML.
    hotelID : hashable
        Identifier stored in the 'hotelID' column of every extracted row.
    extractedDF : pandas.DataFrame
        Reviews collected so far; it is not modified in place.

    Returns
    -------
    pandas.DataFrame
        `extractedDF` followed by one row per review on the page (columns
        'hotelID', 'user', 'rating', 'text'), re-indexed from 0. When the page
        has no reviews container or no reviews, `extractedDF` itself is returned.

    Raises
    ------
    IndexError
        If a review is missing its rating, user, or text element.
    """

    # Read reviews container
    reviews = soupObj.find_all("div", {"data-test-target": "reviews-tab"})
    if not reviews:
        # Nothing to extract (e.g. page layout changed or paging ran past the last page)
        return extractedDF

    # Get the list of reviews
    reviewsList = reviews[0].select("div[class*=_3hFEdNs8]")

    # Collect rows first and concatenate once: DataFrame.append no longer exists in
    # pandas 2.x, and growing a dataframe row by row copies it on every iteration
    records = []
    for review in reviewsList:

        # Get rating: the second CSS class of the bubble span carries digits
        # (e.g. "...45") that are divided by 10 to obtain the rating value
        ratingClass = review.select("span[class*=ui_bubble_rating]")[0]["class"][1]
        reviewRating = int(''.join(filter(str.isdigit, ratingClass)))/10

        # Get user
        user = review.select("a[class*=ui_header_link]")[0].string

        # Get review text; `.string` is None when the span has more than one child
        # (for example text split by <br> tags), so fall back to the combined text
        textSpan = review.select("q[class*=IRsGHoPm]")[0].find_all("span")[0]
        reviewText = textSpan.string
        if reviewText is None:
            reviewText = textSpan.get_text()

        records.append({'hotelID': hotelID,
                        'user': user,
                        'rating': reviewRating,
                        'text': reviewText,
                        })

    if not records:
        return extractedDF

    # Return the resulting dataframe
    newRows = pd.DataFrame(records, columns=['hotelID', 'user', 'rating', 'text'])
    return pd.concat([extractedDF, newRows], ignore_index=True)

### Main loop

In [43]:
# Because this is a demo, let's define the number of reviews to obtain per hotel.
# The main loop keeps requesting further review pages for a hotel while its
# per-hotel counter is below this value.
reviewsToGet = 15

In [53]:
# Number of reviews shown per page; it is also the step of the "-orN-" offset
# that is inserted into the URL to request the following pages
REVIEWS_PER_PAGE = 5

# Loop for all hotels
for index, row in hotelsToScrap.iterrows():

    # Present feedback on which hotel is being processed
    print("Processing hotel", index)

    # Reset per-hotel state: the URL offset of the next page and the number of
    # reviews actually added to the dataframe (kept separate so the progress
    # output reflects real data even when a page yields fewer reviews than expected)
    pageOffset = 0
    reviewsExtracted = 0

    # Loop until it extracts the pre-defined number of reviews
    while reviewsExtracted<reviewsToGet:

        # Define URL to use based on the offset of the page to request
        urlToUse = row['URL']
        if pageOffset>0:
            repText = "-Reviews-or"+str(pageOffset)+"-"
            urlToUse = urlToUse.replace("-Reviews-",repText)

        # Open and read the web page content
        soup = openPageReadHTML(urlToUse)

        # Process web page and measure how many reviews it really contributed
        rowsBefore = len(hotelReviews)
        hotelReviews = processPage(soup, index, hotelReviews)
        newReviews = len(hotelReviews) - rowsBefore

        # Stop paging this hotel when a page adds nothing, instead of counting
        # reviews that were never extracted
        if newReviews == 0:
            print("No more reviews found for hotel", index)
            break

        # Update counters
        reviewsExtracted = reviewsExtracted + newReviews
        pageOffset = pageOffset + REVIEWS_PER_PAGE

        # Present feedback on the number of extracted reviews
        print("Extracted ",reviewsExtracted,"/",reviewsToGet)
     

Processing hotel Sheraton
Extracted  5 / 15
Extracted  10 / 15
Extracted  15 / 15
Processing hotel CorpoSanto
Extracted  5 / 15
Extracted  10 / 15
Extracted  15 / 15
Processing hotel Myriad
Extracted  5 / 15
Extracted  10 / 15
Extracted  15 / 15
Processing hotel AvenidaPalace
Extracted  5 / 15
Extracted  10 / 15
Extracted  15 / 15
Processing hotel Corinthia
Extracted  5 / 15
Extracted  10 / 15
Extracted  15 / 15


In [54]:
# Save the extracted reviews data frame to the Excel file "ExtractedReviews.xlsx"
# (relative path, so it is written to the notebook's working directory)
hotelReviews.to_excel("ExtractedReviews.xlsx")