In [1]:
import os
import re
import time
from datetime import date, timedelta

import pandas as pd
import requests
import telegram
import ujson
from bs4 import BeautifulSoup
from selenium import webdriver
from tqdm import tqdm_notebook

In [2]:
# Calendar date captured once, when this cell runs. ExtractStayDate reads this
# module-level value when it has to turn a relative date into an absolute one.
today = date.today()

In [None]:
def CollectReviewUrls(output_csv="seoul_hotel_url_list_2.csv"):
    """Collect every hotel review URL and save the list as a CSV file.

    Args:
        output_csv: Path of the CSV file to write. The default is the file
            name this notebook has always written to.

    Returns:
        list: The URLs returned by ``GetAllUrlList()``.
    """
    review_url_list = GetAllUrlList()

    # One URL per row; pandas' default index column and header are kept.
    pd.DataFrame(review_url_list).to_csv(output_csv)

    return review_url_list

# Run the URL harvest and keep the result for the scraping step.
# CollectReviewUrls calls GetAllUrlList, which is defined in a later cell of
# this notebook, so that cell has to be executed before this one.
review_url_list = CollectReviewUrls()

Collecting review urls in page#1 ...
Collecting review urls in page#2 ...
Collecting review urls in page#3 ...
Collecting review urls in page#4 ...
Collecting review urls in page#5 ...
Collecting review urls in page#6 ...
Collecting review urls in page#7 ...


In [5]:
# Debug switch. NOTE(review): no code visible in this notebook reads it.
DEBUG=False

def GetSoup(url, timeout=30):
    """Download ``url`` and parse the response body with BeautifulSoup.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds ``requests`` waits for the server before giving up.
            Without a timeout a stalled connection would block the whole
            scrape indefinitely.

    Returns:
        BeautifulSoup: The response text parsed with the ``lxml`` parser.
        The HTTP status code is not checked, so an error page is parsed like
        any other page.

    Raises:
        requests.exceptions.RequestException: If the request fails or times
            out.
    """
    resp = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(resp.text, "lxml")

    # Pause after every request so consecutive fetches are spaced out.
    time.sleep(1.0)

    return soup

def GetAllUrlList(max_pages=104, page_size=30):
    """Walk the Seoul hotel listing pages and gather every review URL.

    Args:
        max_pages: Number of listing pages to visit. The default of 104
            matches the page range this notebook originally hard-coded
            (page indices 0 through 103).
        page_size: Step of the ``oa`` offset in the listing URL from one
            page to the next. The default of 30 is the original value.

    Returns:
        list: All URLs returned by ``GetReviewUrls`` for the visited pages,
        in page order.
    """
    hotel_page_urls = []

    for page_num in range(max_pages):
        print("Collecting review urls in page#{} ...".format(page_num+1))
        page_url = "https://www.tripadvisor.com/Hotels-g294197-oa{}-Seoul-Hotels.html".format(page_num*page_size)
        hotel_page_urls.extend(GetReviewUrls(page_url))

    print("Done! - {} urls collected".format(len(hotel_page_urls)))

    return hotel_page_urls
    
    
def GetReviewUrls(page_url):
    """Get hotel review urls from the given listing page.

    Args:
        page_url: URL of one hotel listing page.

    Returns:
        list: Absolute review URLs built from the ``href`` of every
        ``a.review_count`` link. When at least one link has no ``href``,
        links whose text is "0 reviews" and links without an ``href`` are
        left out.
    """
    base_url = "https://www.tripadvisor.com"

    page_soup = GetSoup(page_url)
    review_url_elems = page_soup.find_all("a", class_ = "review_count")

    try:
        # Common case: every link carries an href.
        return [base_url + each_elem["href"] for each_elem in review_url_elems]
    except KeyError:
        pass

    # At least one link has no href, so filter the links one by one.
    review_urls = []
    for each_elem in review_url_elems:

        if each_elem.text == "0 reviews":
            continue

        href = each_elem.get("href")
        if href is None:
            # Indexing ["href"] here would raise the very KeyError this
            # fallback exists to handle, so skip the link instead.
            continue

        review_urls.append(base_url + href)

    return review_urls

In [6]:
def GetAllBodies(page_soup):
    """Return the review cards found in ``page_soup``.

    The cards are the ``div`` elements looked up with the community-tab card
    class string below. The original author's note says a page yields a list
    of five of them.
    """
    card_class = "hotels-community-tab-common-Card__card--ihfZB hotels-community-tab-common-Card__section--4r93H"
    return page_soup.find_all("div", class_ = card_class)

def ExtractSocialPower(review_body):
    """Read the reviewer's contribution and helpful-vote figures.

    Args:
        review_body: Parsed element of a single review card.

    Returns:
        tuple: ``(contribution, helpful_review)``. Each value is the text of
        the corresponding stat ``span`` when it exists and the integer ``0``
        when it does not. Only the first two spans are used; any further
        ones are ignored instead of leaving the results undefined.
    """
    stat_texts = [
        elem.text
        for elem in review_body.find_all("span", class_ = "social-member-MemberHeaderStats__bold--3z3qh")
    ]

    # The first span is the contribution count, the second the helpful votes.
    contribution = stat_texts[0] if len(stat_texts) >= 1 else 0
    helpful_review = stat_texts[1] if len(stat_texts) >= 2 else 0

    return contribution, helpful_review

def ExtractReview(review_body):
    """Return the review text of one review card, stripped of outer whitespace.

    Raises:
        AttributeError: If the card has no review-text ``q`` element.
    """
    text_class = "hotels-review-list-parts-ExpandableReview__reviewText--3oMkH"
    return review_body.find("q", class_ = text_class).text.strip()

def ExtractRating(review_body):
    """Return the bubble rating (1-5) of one review card.

    The rating is encoded in the class of a ``span``:
    ``ui_bubble_rating bubble_50`` means 5, ``bubble_40`` means 4, and so on
    down to ``bubble_10``. The classes are probed from 5 down to 1 and the
    first one present wins.

    Args:
        review_body: Parsed element of a single review card.

    Returns:
        int or None: The rating, or ``None`` when none of the five classes is
        present (previously this case raised ``UnboundLocalError``).
    """
    for rating in (5, 4, 3, 2, 1):
        bubble_class = "ui_bubble_rating bubble_{}0".format(rating)
        if review_body.find("span", class_ = bubble_class):
            return rating

    return None

def ExtractStayDate(review_body):
    """Return the stay date shown on one review card.

    The date is whatever follows the first ``:`` in the event-date ``span``.
    It is returned unmodified, so it keeps any whitespace that followed the
    colon.

    Args:
        review_body: Parsed element of a single review card.

    Returns:
        str: The stay date text; ``" "`` when the span is missing or its text
        contains no ``:``; or, when the text after the colon is "yesterday"
        (ignoring case and surrounding whitespace), the day before the
        module-level ``today`` formatted as ``dd/mm/YYYY``.
    """
    try:
        stay_date_elem = review_body.find("span", class_ = "hotels-review-list-parts-EventDate__event_date--CRXs4")
        stay_date = stay_date_elem.text.split(":")[1]

    except (AttributeError, IndexError):
        # No stay-date element, or a label without the expected separator.
        return " "

    # The split leaves the whitespace that followed the colon in place, so
    # normalise before comparing; an exact match could never succeed.
    if stay_date.strip().lower() == "yesterday":
        return (today - timedelta(days=1)).strftime("%d/%m/%Y")

    return stay_date

def ExtractPostDate(review_body):
    """Return the post date shown in the header of one review card.

    The header text is split on ``"review "`` and the piece right after the
    first occurrence is returned.

    Raises:
        AttributeError: If the header element is missing.
        IndexError: If the header text does not contain ``"review "``.
    """
    header_elem = review_body.find("div", class_ = "social-member-event-MemberEventOnObjectBlock__event_type--3njyv")
    pieces = header_elem.text.split("review ")
    return pieces[1]


def ExtractAvgPrice(review_body):
    """Return the offer prices found in ``review_body`` as stripped strings.

    The detail-chevron offer elements are looked up first. Only when there
    are none are the dominant-offer elements used instead. The result is an
    empty list when neither lookup finds anything.
    """
    offers = review_body.find_all("div", class_ = "hotels-hotel-offers-DetailChevronOffer__price--py2LH")

    if not offers:
        offers = review_body.find_all("div", class_ = "hotels-hotel-offers-DominantOffer__price--D-ycN")

    return [offer.text.strip() for offer in offers]

In [7]:
def GetPageUrlFO(page_soup, page_url) :
    """Get the URL of every review page of one hotel.

    The label of the last ``a.pageNum`` link is taken as the number of review
    pages. Page ``n`` (counting from 0) is addressed by inserting
    ``or<n*5>-`` right after the first ``Reviews-`` in ``page_url``.

    Args:
        page_soup: Parsed first review page of the hotel.
        page_url: URL of that first review page.

    Returns:
        list: One URL per review page, or ``[page_url]`` when the page has no
        ``pageNum`` links.

    Raises:
        ValueError: If the last ``pageNum`` label is not an integer.
    """
    page_num_elems = page_soup.find_all("a", class_ = "pageNum")

    if not page_num_elems:
        # No pager on the page: the hotel has a single page of reviews.
        return [page_url]

    total_page = int(page_num_elems[-1].text.strip())

    # Replace only the first "Reviews-"; a hotel slug that happens to contain
    # the same text must be left untouched.
    return [
        page_url.replace("Reviews-", "Reviews-or" + str(page_num*5) + "-", 1)
        for page_num in range(total_page)
    ]

In [8]:
def WriteJsonDoc(output_file, name, address, price, review, rating, contribution, helpful_review, post_date, stay_date):
    """Write one review as a single JSON line to ``output_file``.

    Args:
        output_file: Writable text file object that receives the line.
        name: Hotel name.
        address: Hotel address.
        price: Price information for the hotel.
        review: Review text.
        rating: Review rating.
        contribution: Reviewer's contribution figure.
        helpful_review: Reviewer's helpful-vote figure.
        post_date: Date the review was posted.
        stay_date: Date of the stay.

    The record is serialised with ``ensure_ascii=False``, so non-ASCII text
    is written as-is rather than as ``\\u`` escapes.
    """
    # Each field appears exactly once; the key order is the order in the file.
    post = {"name" : name,
            "address" : address,
            "price" : price,
            "review": review,
            "rating": rating,
            "contribution" : contribution,
            "helpful_review" : helpful_review,
            "post_date": post_date,
            "stay_date": stay_date}

    print(ujson.dumps(post, ensure_ascii=False), file=output_file)

def CollectReviews(output_file_name, review_url_list, start_idx=102):
    """Scrape every review of every hotel and write them as JSON lines.

    For each hotel URL the first review page is fetched to read the hotel
    name, address, prices and the list of review pages. Every review page is
    then fetched and each review card on it is written with ``WriteJsonDoc``.
    A Telegram message is sent for every hotel whose index is a multiple of
    100, and once more if anything fails.

    Args:
        output_file_name: Path of the output file. It is opened in ``"w"``
            mode, so an existing file is truncated.
        review_url_list: Hotel review URLs to process.
        start_idx: Index of the first hotel to process; earlier entries are
            skipped. The default of 102 is the offset this notebook
            previously hard-coded.

    Returns:
        None. Any exception stops the scrape and is reported through the
        Telegram bot instead of being propagated.
    """
    # Bind the names used by the error report up front, so the handler cannot
    # itself fail with NameError when the failure happens before they are set.
    idx = None
    name = None

    try:
        with open(output_file_name, "w", encoding="utf-8") as output_file:
            for idx, hotel_url in tqdm_notebook(enumerate(review_url_list), desc = "Scrapping the contents...",
                                                total = len(review_url_list)):

                if idx < start_idx:
                    continue

                # Forget the previous hotel so an early failure here is not
                # reported under the wrong name.
                name = None

                page_soup = GetSoup(hotel_url)
                name = page_soup.find("h1", class_ = "hotels-hotel-review-atf-info-parts-Heading__heading--2ZOcD").text.strip()
                address = page_soup.find("span", class_ = "public-business-listing-ContactInfo__ui_link--1_7Zp public-business-listing-ContactInfo__level_4--3JgmI").text.strip()
                price_list = ExtractAvgPrice(page_soup)
                page_url_list = GetPageUrlFO(page_soup, hotel_url)
                print("Collecting the data of : ", name)

                review_count = 0
                for review_page_url in page_url_list:

                    each_page_soup = GetSoup(review_page_url)

                    for review_body in GetAllBodies(each_page_soup):

                        review = ExtractReview(review_body)
                        rating = ExtractRating(review_body)
                        contribution, helpful_review = ExtractSocialPower(review_body)
                        stay_date = ExtractStayDate(review_body)
                        post_date = ExtractPostDate(review_body)

                        WriteJsonDoc(output_file, name, address, price_list, review, rating, contribution, helpful_review, post_date, stay_date)
                        review_count += 1

                # Report the reviews actually written, not the page count.
                print("Collected Reviews : ", review_count)
                print("\n")

                if idx % 100 == 0:
                    text = "Progress done : " + str(idx)+"/"+str(len(review_url_list))
                    my_bot.sendMessage(chat_id = chat_id, text = text)

    except Exception as ex:
        # Best-effort notification: report where the scrape stopped.
        text = "Error : " + str(ex) + " / Stopped hotel : " + str(idx) + " " + str(name)
        my_bot.sendMessage(chat_id = chat_id, text = text)

In [None]:
# Telegram credentials come from the environment so that no secret is stored
# in the notebook. Set TELEGRAM_BOT_TOKEN and TELEGRAM_CHAT_ID before running
# this cell; a missing variable raises KeyError here rather than failing later
# in the middle of a scrape.
# NOTE(review): a bot token that was ever committed in plain text should be
# revoked and re-issued.
my_token = os.environ["TELEGRAM_BOT_TOKEN"]
chat_id = os.environ["TELEGRAM_CHAT_ID"]
my_bot = telegram.Bot(token = my_token)

def Main(review_url_list):
    """Scrape the reviews of ``review_url_list`` into ./data/hotel_review_3.txt."""
    CollectReviews("./data/hotel_review_3.txt", review_url_list)
    
# Start the scrape. review_url_list is the value bound by the earlier cell
# that calls CollectReviewUrls().
Main(review_url_list)