# Data Mining: Tripadvisor
by Xiang-Yi, Huang (xiangyi.huang0213@gmail.com)

## 1. Required Packages

In [None]:
# Required Packages:
!pip install selenium
!pip install beautifulsoup4
!pip install fake_useragent

# Standard-library modules also used: re, datetime, time, json, warnings
# If you crawl the hotels one at a time, you will also need: os, glob

## 2. Code

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from datetime import datetime
import re
import time
import json
import warnings 

# --- Crawler setup: start a headless Chrome session and load the hotel listing page ---

warnings.filterwarnings('ignore')  # ignore warnings
start = datetime.now()  # start timestamp, used to report total execution time

# Pick a random User-Agent string.
# NOTE(review): `headers` is never handed to Selenium in this cell, so the
# browser still sends Chrome's own User-Agent. To actually apply it, it would
# have to be passed through ChromeOptions (e.g. a `--user-agent=...` argument).
ua = UserAgent()
user_agent = ua.random
headers = {'user-agent': user_agent}

options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')

# Pass the options by keyword only. The original call supplied the driver path
# as the first positional argument, which newer Selenium 4 releases no longer
# accept; older releases default to the 'chromedriver' executable found on
# PATH, so the behaviour there is unchanged.
driver = webdriver.Chrome(options=options)
driver.maximize_window()
driver.implicitly_wait(10)  # wait up to 10 s for elements looked up with find_element

# TripAdvisor TOP 10 Popular Taipei Hotels in 2022
url = 'https://www.tripadvisor.com.tw/Hotels-g293913-Taipei-Hotels.html'
driver.get(url)
soup = BeautifulSoup(driver.page_source, 'html.parser')

# Hotel-name links on the listing page; indexed by `hotel` in the crawl loop
title = soup.find_all('a', {'class': 'property_title prominent'})

jsonlist = []  # one dict per successfully crawled hotel
current_year = datetime.now().year  # prefixed to review timestamps that do not mention a year

# Index into `title` of the next hotel to crawl. Index 0 might be an
# ad-recommended hotel that overlaps with the next one, so it is skipped.
# Modify the number here if fetching one hotel at a time.
hotel = 1
# Number of hotels whose data has been fetched successfully so far.
hotel_num = 0

# --- Main crawl loop ---
# For each hotel link on the listing page: open its detail page (a new tab),
# read the hotel-level fields, walk through every review page, store the
# result in `jsonlist`, then return to the listing tab.

# Handle of the listing tab; used to get back to it after every hotel,
# whether that hotel succeeded or failed.
main_window = driver.window_handles[0]

while True:
    # Stop cleanly when the listing page has no more hotel links, instead of
    # crashing with an IndexError on title[hotel].
    if hotel >= len(title):
        print('No more hotels on the listing page.')
        break

    hotel_name = title[hotel].text.strip()

    # Sometimes the hotel information cannot be captured. If so, report it and
    # move on to the next hotel. `except Exception` (rather than a bare
    # `except`) lets Ctrl-C / SystemExit still stop the crawl.
    try:
        if hotel >= 7:
            # From the eighth hotel on there might be a "View All" button that
            # has to be clicked first. It is optional, so failure is ignored.
            try:
                driver.find_element(By.XPATH, '//*[@id="component_6"]/div/button').click()
            except Exception:
                pass

        hotel_page = driver.find_element(By.LINK_TEXT, hotel_name)
        hotel_page.click()  # Click on the hotel name to go to the hotel's details page
        print('start to crawl: ', hotel_page.text)
        time.sleep(2)

        # The details page opens in a new tab, so switch to that tab
        driver.switch_to.window(driver.window_handles[1])
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # Hotel-level data. TripAdvisor does not expose the phone number, so it
        # is omitted; the hotel's feature list is captured instead.
        # Any missing element raises AttributeError and skips this hotel.
        name = soup.find('h1', {'class': 'QdLfr b d Pn'}).text  # Hotel name
        grade = soup.find('span', {'class': 'uwJeR P'}).text  # Overall rating, out of five
        star = soup.find('svg', {'class': 'JXZuC d H0'}).get('aria-label')[0:4]  # Star rating, up to five stars
        introduction = soup.find('div', {'class': 'fIrGe _T'}).text  # Hotel introduction
        address = soup.find('span', {'class': 'fHvkI PTrfg'}).text  # Address
        feature_list = [item.text for item in soup.find_all('div', {'class': 'yplav f ME H3 _c'})]  # All features

        comment_jsonlist = []
        id_num = 1  # The nth comment

        print('start to crawl all comments!')
        while True:
            # All review cards on the current page. Image locations are not
            # present in the HTML, so they are omitted.
            comment = soup.find_all('div', {'class': 'YibKl MC R2 Gi z Z BB pBbQr'})

            # Each field is extracted separately with its own fallback, because
            # any of them may be missing from a given review.
            for review in comment:

                # Usernames may disappear, appearing as "Tripadvisor Member"
                try:
                    cmt_name = review.find('a', {'class': 'ui_header_link uyyBf'}).text
                except AttributeError:
                    cmt_name = 'Tripadvisor Member'

                # Star rating: the last two characters of the span's second CSS
                # class are read as tenths of a star. May not be available.
                try:
                    star_class = review.find('div', {'class': 'Hlmiy F1'}).span.get('class')[1]
                    cmt_star = str(int(star_class[-2:]) / 10.0)
                except (AttributeError, TypeError, IndexError, ValueError):
                    cmt_star = 'None'

                # Comment timestamp; the current year is prepended when the
                # text does not contain 'year'.
                try:
                    cmt_time = review.find('div', {'class': 'cRVSd'}).text
                    cmt_time = re.search('(?<=Posted a review on ).+', cmt_time).group(0)
                    if cmt_time.find('year') == -1:
                        cmt_time = str(current_year) + ' ' + cmt_time
                except AttributeError:
                    cmt_time = 'None'

                # Stay time may not always be available
                try:
                    sty_time = review.find('span', {'class': 'teHYY _R Me S4 H3'}).text
                    sty_time = re.search('(?<=Date of stay: ).+', sty_time).group(0)
                except AttributeError:
                    sty_time = 'None'

                # Comment title
                try:
                    cmt_content_title = review.find('div', {'class': 'KgQgP MC _S b S6 H5 _a'}).text
                except AttributeError:
                    cmt_content_title = 'None'

                # Full comment text
                try:
                    cmt_content = review.find('q', {'class': 'QewHA H4 _a'}).text
                except AttributeError:
                    cmt_content = 'None'

                # Likes on comments may not always be available
                try:
                    cmt_likes_num = review.find_all('span', {'class': 'yRNgz'})[1].text
                except IndexError:
                    cmt_likes_num = '0'

                # Store the captured data in JSON format:
                comment_jsonlist.append({
                    'id': id_num,  # The nth comment
                    'comment_name': cmt_name,  # Comment username
                    'comment_star': cmt_star,  # Comment star rating
                    'comment_time': cmt_time,  # Comment timestamp
                    'stay_time': sty_time,  # Stay time
                    'comment_content_title': cmt_content_title,  # Comment title
                    'comment_content': cmt_content,  # Full comment text
                    'comment_likes_num': cmt_likes_num,  # Comment likes
                })

                id_num += 1

                # Progress indicator
                if id_num % 100 == 0:
                    print('here is id: ', id_num)

            # Go to the next review page. Failing to do so is taken to mean the
            # last page has been reached.
            # The original also regenerated a fake User-Agent header dict here,
            # but that dict is never passed to Selenium, so it had no effect on
            # the browser; and because it sat inside this try, any error it
            # raised was mistaken for "last page". It has been removed.
            try:
                driver.find_element(By.LINK_TEXT, 'Next').click()
                time.sleep(2)
                soup = BeautifulSoup(driver.page_source, 'html.parser')
            except Exception:
                break

        # Store the captured hotel in JSON format:
        dict_temp = {}
        dict_temp['id'] = hotel_num + 1  # If fetching one at a time, this id needs to be modified
        dict_temp['name'] = name
        dict_temp['grade'] = grade
        dict_temp['star'] = star
        dict_temp['introduction'] = introduction
        dict_temp['address'] = address
        dict_temp['feature'] = feature_list
        dict_temp['comment'] = comment_jsonlist
        jsonlist.append(dict_temp)

        hotel_num += 1
        print(name, ' is done!')  # Indicates successful data retrieval for a hotel

    except Exception as err:
        # Unable to fetch data for the current hotel: report it and move on
        print('skip hotel: ', hotel_name, repr(err))

    # Close every tab except the listing tab and switch back to it. This runs
    # after failures too; otherwise a half-crawled hotel would leave the driver
    # on its detail tab and every later hotel lookup would fail as well.
    for handle in driver.window_handles:
        if handle != main_window:
            driver.switch_to.window(handle)
            driver.close()
    driver.switch_to.window(main_window)

    hotel += 1

    if hotel == 20:  # At most, there might be one or two hotels that couldn't be fetched; too many indicates another issue
        break

    if hotel_num == 10:  # Ten hotels fetched successfully. If fetching one at a time, it would be hotel_num == 1
        print('Everything is done!')
        break
    
# --- Shut down the browser and write the crawl results to disk ---

# quit() ends the whole WebDriver session (all windows plus the driver
# process); close() would only close the current window.
driver.quit()

# Save all data and generate a JSON file.
# The list built by the crawl loop is named `jsonlist`; the original wrote
# `json_list`, which is undefined and raised NameError before anything was saved.
with open('crawler_homework.json', 'w', encoding='utf-8') as file:  # If fetching one at a time, the filename needs to be changed
    file.write(json.dumps(jsonlist, indent=2, ensure_ascii=False))

end = datetime.now()
print('Total execution time:', end - start)


In [33]:
# Merge step for the "one hotel at a time" workflow.
# Each single-hotel run is expected to be saved as crawler_homework_X.json
# (X = 0 ~ 9); this cell concatenates every JSON file in the current directory
# into crawler_homework.json.

import glob
import json  # imported here as well so this cell also runs on its own
import os

output_name = 'crawler_homework.json'
data = []

# sorted() gives a stable, name-based merge order (glob order is arbitrary).
for path in sorted(glob.glob(os.path.join('./', '*.json'))):  # Fetch each JSON file
    # Skip the merged output itself: if it already exists from an earlier run,
    # reading it back in would duplicate every hotel it contains.
    if os.path.basename(path) == output_name:
        continue
    with open(path, 'r', encoding='utf-8') as f:
        data.extend(json.load(f))  # Each file holds a list of hotel records

with open(output_name, 'w', encoding='utf-8') as out_file:  # Write the merged data back to a JSON file
    out_file.write(json.dumps(data, indent=2, ensure_ascii=False))

print('Everything is done!')


Everything is done!


## 3. Brief Discussion
1. I haven't fully grasped the logic of TripAdvisor, so when trying to fetch ten hotels at once, it sometimes fails -> I speculate it might be influenced by advertisements.
2. There are many inconsistent formats within TripAdvisor, so I had to rely on exception handling to address the issue.
3. Fetching ten hotels at once takes too long, so it's better to fetch them in batches, and each batch is more likely to succeed. (The test run above was interrupted, and it can be observed that the "Mandarin Oriental" hotel was captured twice.)
4. The above code is designed to fetch ten hotels at once. If you want to change it to fetching one at a time, you can make the following modifications:
    * 4-1. Variable "hotel" needs to be changed, and you can set it from 1 to 20 without any issues.
    * 4-2. "dict_temp['id']" also needs to be changed, such as setting "dict_temp['id'] = 10".
    * 4-3. Change "if hotel_num == 10:" to "if hotel_num == 1:".
    * 4-4. The file name needs to be changed, for example, 'crawler_homework_1.json'.
