# In [18]:
from datetime import datetime
from datetime import date
import time
from lxml import html,etree
import requests,re
import os,sys
import unicodecsv as csv
import argparse
import urllib3



class TAHotelList:
    """Scrape TripAdvisor hotel listings for a locality and append them to a CSV.

    None of the helpers reads instance state, so they are all static methods:
    they can be called on the class or on an instance interchangeably.
    """

    BASE_URL = 'https://www.tripadvisor.com'
    # Seconds to wait for a response before requests raises a Timeout.
    REQUEST_TIMEOUT = 30
    CSV_PATH = r'tripadvisor_data.csv'
    # Column order of the CSV; every row dict must carry exactly these keys.
    CSV_FIELDNAMES = ['hotel_name', 'url', 'locality', 'reviews',
                      'tripadvisor_rating', 'checkIn', 'checkOut',
                      'price_per_night', 'discounted_price',
                      'booking_provider', 'no_of_deals', 'hotel_features']

    def __init__(self, user_agents=None, proxy=None):
        # Stored on the instance only; no helper in this class reads them.
        self.user_agents = user_agents
        self.proxy = proxy

    @staticmethod
    def _ensure_ok(response):
        """Return *response*, raising ``requests.HTTPError`` on a 4xx/5xx status."""
        try:
            response.raise_for_status()
        except requests.HTTPError as err:
            raise requests.HTTPError(
                'Received non 200 status code from TripAdvisor',
                response=err.response)
        return response

    @staticmethod
    def __request_taapi(city):
        """Query the TripAdvisor type-ahead endpoint for *city*.

        Returns the decoded JSON payload. Raises ``requests.HTTPError`` on an
        error status and lets any other ``requests.RequestException`` (timeout,
        connection failure, ...) propagate with its original details.
        """
        url = TAHotelList.BASE_URL + '/TypeAheadJson?action=API&startTime=' + str(int(time.time())) + '&uiOrigin=GEOSCOPE&source=GEOSCOPE&interleaved=true&types=geo,theme_park&neighborhood_geos=true&link_type=hotel&details=true&max=12&injectNeighborhoods=true'
        # Passing the query through ``params`` lets requests URL-encode it, so
        # names with spaces or non-ASCII characters produce a valid URL.
        # NOTE(review): verify=False disables TLS certificate verification.
        response = requests.get(url, params={'query': city}, verify=False,
                                timeout=TAHotelList.REQUEST_TIMEOUT)
        return TAHotelList._ensure_ok(response).json()

    @staticmethod
    def __auto_taurl(json):
        """Return the absolute hotel-search URL (without dates) of the first result.

        Raises ``KeyError``/``IndexError`` when the payload has no results.
        """
        return TAHotelList.BASE_URL + json['results'][0]['url']

    @staticmethod
    def __geo_taurl(json):
        """Return the ``value`` field (the city geo) of the first result.

        Raises ``KeyError``/``IndexError`` when the payload has no results.
        """
        return json['results'][0]['value']

    @staticmethod
    def dateta(checkin_date, checkout_date, sort):
        """Build the POST form data for a stay between the two dates, sorted by *sort*."""
        stay = checkin_date.strftime("%Y_%m_%d") + "_" + checkout_date.strftime("%Y_%m_%d")
        return {
            'changeSet': 'TRAVEL_INFO',
            'showSnippets': 'false',
            'staydates': stay,
            'uguests': '2',
            'sortOrder': sort,
        }

    @staticmethod
    def headerta(url):
        """Build the request headers, using *url* as the Referer.

        The Referer is needed to get the correct response from TripAdvisor;
        without it the site redirects to the home page.
        """
        return {
            'Accept': 'text/javascript, text/html, application/xml, text/xml, */*',
            'Accept-Encoding': 'gzip,deflate',
            'Accept-Language': 'en-US,en;q=0.5',
            'Cache-Control': 'no-cache',
            'Connection': 'keep-alive',
            'Content-Type': 'application/x-www-form-urlencoded; charset=utf-8',
            'Host': 'www.tripadvisor.com',
            'Pragma': 'no-cache',
            'Referer': url,
            'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:28.0) Gecko/20100101 Firefox/28.0',
            'X-Requested-With': 'XMLHttpRequest',
        }

    @staticmethod
    def parse_tapage(page_num, url, form_data, headers):
        """Fetch one listing page and return it as a parsed lxml tree.

        Page 1 is requested with a POST of *form_data* to the absolute *url*;
        any other page is a GET of the site-relative *url*. Raises
        ``requests.HTTPError`` on an error status; other
        ``requests.RequestException`` errors propagate unchanged.
        """
        if page_num == 1:
            page_response = requests.post(url=url, data=form_data, headers=headers,
                                          cookies={"SetCurrency": "USD"}, verify=False,
                                          timeout=TAHotelList.REQUEST_TIMEOUT)
        else:
            page_response = requests.get(TAHotelList.BASE_URL + url, verify=False,
                                         timeout=TAHotelList.REQUEST_TIMEOUT)
        TAHotelList._ensure_ok(page_response)
        return html.fromstring(page_response.text)

    @staticmethod
    def get_tahotel(parser, locality=None, check_in=None, check_out=None):
        """Extract one dict per hotel from a parsed listing page.

        *locality*, *check_in* and *check_out* are copied into every row. When
        one is omitted, the module-level global ``locality`` / ``checkIn`` /
        ``checkOut`` is used instead (``None`` if that global does not exist).
        Parsing is purely local, so no delay is inserted between hotels.
        """
        module_globals = globals()
        if locality is None:
            locality = module_globals.get('locality')
        if check_in is None:
            check_in = module_globals.get('checkIn')
        if check_out is None:
            check_out = module_globals.get('checkOut')

        listing_root = '//div[contains(@class,"prw_rup prw_meta_hsx_responsive_listing ui_section listItem")]'
        hotel_lists = parser.xpath(listing_root + '//div[contains(@class,"listing collapsed")]')
        if not hotel_lists:
            hotel_lists = parser.xpath(listing_root + '//div[@class="listing "]')

        xpath_hotel_link = './/a[contains(@class,"property_title prominent")]/@href'
        xpath_reviews = './/a[@class="review_count"]//text()'
        xpath_rating = './/a[contains(@data-clicksource,"BubbleRating")]/@alt'
        xpath_hotel_name = './/a[contains(@class,"property_title prominent")]//text()'
        xpath_hotel_features = './/div[contains(@class,"common_hotel_icons_list")]//li//text()'
        xpath_hotel_price = './/div[contains(@data-sizegroup,"mini-meta-xthrough")]/text()'
        xpath_discounted_price = './/div[contains(@data-sizegroup,"mini-meta-price")]/text()'
        xpath_view_deals = './/div[contains(@data-ajax-preserve,"viewDeals")]//text()'
        xpath_booking_provider = './/span[contains(@class,"provider_text")]//text()'
        deals_pattern = re.compile(r"all\s+?(\d+)\s+?")

        hotel_data = []
        for hotel in hotel_lists:
            raw_hotel_link = hotel.xpath(xpath_hotel_link)
            raw_no_of_reviews = hotel.xpath(xpath_reviews)
            raw_rating = hotel.xpath(xpath_rating)
            raw_hotel_name = hotel.xpath(xpath_hotel_name)
            raw_price = hotel.xpath(xpath_hotel_price)
            raw_discounted_price = hotel.xpath(xpath_discounted_price)
            raw_booking_provider = hotel.xpath(xpath_booking_provider)
            deals_found = deals_pattern.findall(''.join(hotel.xpath(xpath_view_deals)))

            # Prices stay text: the CSV writer does the encoding, and mixing
            # encoded bytes with str.replace fails on Python 3.
            hotel_data.append({
                'hotel_name': ''.join(raw_hotel_name).strip() if raw_hotel_name else None,
                'url': TAHotelList.BASE_URL + raw_hotel_link[0] if raw_hotel_link else None,
                'locality': locality,
                'reviews': ''.join(raw_no_of_reviews).replace("reviews", "").replace(",", "") if raw_no_of_reviews else 0,
                'tripadvisor_rating': ''.join(raw_rating).replace('of 5 bubbles', '').strip() if raw_rating else None,
                'checkOut': check_out,
                'checkIn': check_in,
                'hotel_features': ','.join(hotel.xpath(xpath_hotel_features)),
                'price_per_night': ''.join(raw_price).replace('\n', '') if raw_price else None,
                'discounted_price': ''.join(raw_discounted_price).replace('\n', '') if raw_discounted_price else None,
                'no_of_deals': deals_found[0] if deals_found else 0,
                'booking_provider': ''.join(raw_booking_provider).strip() if raw_booking_provider else None,
            })
        return hotel_data

    @staticmethod
    def next_tapage(page, parser):
        """Return the href of the first link for page number *page*, or ``None``.

        ``None`` means the parsed page has no link for that page number.
        """
        # NOTE(review): contains() is a substring match, so "2" also matches
        # "12" or "20"; only the first match in document order is used.
        links = parser.xpath('.//a[contains(@data-page-number,"' + str(page) + '")]/@href')
        return links[0] if links else None

    @staticmethod
    def write_tocsv(data):
        """Append the hotel dicts in *data* to the TripAdvisor CSV (no header row)."""
        # unicodecsv encodes the rows itself, so the file is opened in binary mode.
        with open(TAHotelList.CSV_PATH, mode='ab') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=TAHotelList.CSV_FIELDNAMES)
            writer.writerows(data)

    @staticmethod
    def check_exists_by_xpath(xpath):
        """Return whether an element matching *xpath* is on the page (avoids mis-clicks).

        NOTE(review): relies on module-level ``driver`` and
        ``NoSuchElementException`` names that the visible file never defines.
        """
        try:
            driver.find_element_by_xpath(xpath)
        except NoSuchElementException:
            return False
        return True


# In [19]:
# Driver cell: look up a locality on TripAdvisor and append the hotels of the
# first MAX_PAGES listing pages to tripadvisor_data.csv.
# Requests are made with verify=False, so silence the resulting TLS warnings.
urllib3.disable_warnings()

h = TAHotelList()

locality = 'Paris'
checkin_date = date(2020, 1, 1)
checkout_date = date(2020, 1, 31)

#sort values (Price (low to high) = priceLow, Traveler Ranked = popularity, Best Value = recommended, Distance to city center = distLow)
#default sort value is Traveler Ranked = popularity
sort = 'recommended'

# get_tahotel reads these module-level names when building each row.
checkIn = checkin_date.strftime("%Y/%m/%d")
checkOut = checkout_date.strftime("%Y/%m/%d")

# The helpers take no ``self``, so they are called on the class. The
# double-underscore ones are name-mangled and are only reachable from outside
# the class under their ``_TAHotelList__`` names.
json_api = TAHotelList._TAHotelList__request_taapi(locality)
new_link = TAHotelList._TAHotelList__auto_taurl(json_api)
geo = TAHotelList._TAHotelList__geo_taurl(json_api)
form_data = TAHotelList.dateta(checkin_date, checkout_date, sort)
header = TAHotelList.headerta(new_link)
# Pause between consecutive requests to TripAdvisor.
time.sleep(60)

#Going through the hotel list (top MAX_PAGES pages)
MAX_PAGES = 2
for page in range(1, MAX_PAGES + 1):
    parsed_html = TAHotelList.parse_tapage(page, new_link, form_data, header)
    #get hotel list
    hotel_list = TAHotelList.get_tahotel(parsed_html)
    #write to csv file
    TAHotelList.write_tocsv(hotel_list)
    if page == MAX_PAGES:
        # Last page wanted: no need to look up a further link.
        break
    #passing the next page number to get to the next page link
    try:
        new_link = TAHotelList.next_tapage(page + 1, parsed_html)
    except IndexError:
        new_link = None
    if not new_link:
        # The listing has no further page.
        break
    time.sleep(60)




# Output: reading complete


# In [111]:
#class TA reviews
from random import choice
import json
import requests
from bs4 import BeautifulSoup
import urllib2
import ssl
from datetime import datetime
import re
import pandas as pd
from lxml import html,etree
import time
import unicodecsv as csv
from urlparse import urlparse
import urllib3

class TAHotelReviews:
    """Scrape hotel details and reviews from TripAdvisor hotel pages into CSV files."""

    BASE_URL = 'https://www.tripadvisor.com'
    # Seconds to wait for a response before requests raises a Timeout.
    REQUEST_TIMEOUT = 30
    # Review columns that follow ``hotel_id`` in the frames built by get_review.
    REVIEW_COLUMNS = ['username', 'review_date', 'user_rating',
                      'review_title', 'date_of_stay', 'review_body']

    def __init__(self, user_agents=None, proxy=None):
        # Stored on the instance only; no method in this class reads them.
        self.user_agents = user_agents
        self.proxy = proxy

    def __parse_url(self, url):
        """Fetch *url* and return it as a parsed lxml tree.

        Raises ``requests.HTTPError`` on a 4xx/5xx status; any other
        ``requests.RequestException`` propagates with its original details.
        """
        # NOTE(review): verify=False disables TLS certificate verification.
        response = requests.get(url, verify=False, timeout=self.REQUEST_TIMEOUT)
        try:
            response.raise_for_status()
        except requests.HTTPError as err:
            raise requests.HTTPError(
                'Received non 200 status code from TripAdvisor',
                response=err.response)
        return html.fromstring(response.text)

    def get_hotelid(self, url):
        """Return the third ``-``-separated segment of the URL path.

        For ``/Hotel_Review-g187147-d228694-Reviews-X.html`` this is
        ``'d228694'``. Raises ``ValueError`` when the path has fewer than
        three segments.
        """
        segments = urlparse(url).path.split('-')
        if len(segments) < 3:
            raise ValueError('Cannot extract a hotel id from URL: %r' % (url,))
        return segments[2]

    def get_hotel(self, hotel_id, url, locality='Paris'):
        """Return a one-row DataFrame describing the hotel page at *url*.

        Name, address and rank are the first XPath match for each field, or
        ``None`` when the page has no match.
        """
        parser = self.__parse_url(url)

        def first_match(xpath):
            found = parser.xpath(xpath)
            return found[0] if found else None

        data = {
            'hotel_id': [hotel_id],
            'hotel_name': [first_match('.//h1[contains(@class,"hotels-hotel-review-atf-info-parts-Heading__heading--2ZOcD")]//text()')],
            'url': [url],
            'locality': [locality],
            'address': [first_match('.//span[contains(@class,"public-business-listing-ContactInfo__ui_link--1_7Zp public-business-listing-ContactInfo__level_4--3JgmI")]//text()')],
            'rank': [first_match('.//b[contains(@class,"rank")]//text()')],
        }
        return pd.DataFrame(data)

    #browsing next page by the "next" button url - first item in the list [0]
    def next_reviewpage(self, url):
        """Return the absolute URL of the next review page, or ``None`` if there is none."""
        parser = self.__parse_url(url)
        next_page = parser.xpath('.//a[contains(@class,"ui_button nav next primary ")]/@href')
        if not next_page:
            return None
        return self.BASE_URL + str(next_page[0]) + '#REVIEWS'

    def get_review(self, h_id, url):
        """Return a DataFrame of the reviews found on the page at *url*.

        Columns are ``hotel_id`` followed by ``REVIEW_COLUMNS``. The frame is
        empty when the page has no review container or a field has no matches.
        """
        parser = self.__parse_url(url)
        hotel_reviews = parser.xpath('//div[contains(@id,"taplc_hr_community_content_ssronly_0")]')

        # One XPath per entry of REVIEW_COLUMNS, in the same order.
        field_xpaths = [
            './/a[contains(@class,"ui_header_link social-member-event-MemberEventOnObjectBlock__member--35-jC")]//text()',
            './/div[contains(@class,"social-member-event-MemberEventOnObjectBlock__event_type--3njyv")]//text()[contains(.,"wrote")]',
            './/div[contains(@data-test-target,"review-rating")]//span/@class',
            './/a[contains(@class, "location-review-review-list-parts-ReviewTitle__reviewTitleText--2tFRT")]//span//span//text()',
            './/span[contains(@class, "location-review-review-list-parts-EventDate__event_date--1epHa")]//text()[not(contains(.,"Date"))]',
            './/div[contains(@class, "_2f_ruteS _1bona3Pu")]//div[contains(@class,"cPQsENeY")]//q//span//text()',
        ]

        frames = []
        for review in hotel_reviews:
            columns = [review.xpath(xpath) for xpath in field_xpaths]
            # NOTE(review): zip pairs the per-field lists by position and stops
            # at the shortest, so a review missing one field shifts that
            # column for the reviews after it.
            frame = pd.DataFrame(list(zip(*columns)), columns=self.REVIEW_COLUMNS)
            frame.insert(0, 'hotel_id', h_id)
            frames.append(frame)

        if not frames:
            return pd.DataFrame(columns=['hotel_id'] + self.REVIEW_COLUMNS)
        return pd.concat(frames, ignore_index=True)

    def loop_hotel(self, hotel_id, url, max_pages=2):
        """Scrape up to *max_pages* review pages starting at *url* into the review CSV."""
        for page in range(max_pages):
            reviews = self.get_review(hotel_id, url)
            #write to csv file
            self.write_review_tocsv(reviews)
            if page + 1 >= max_pages:
                # Last page wanted: skip the extra next-page request.
                break
            # Pause between consecutive requests to TripAdvisor.
            time.sleep(10)
            url = self.next_reviewpage(url)
            if url is None:
                # The hotel has no further review page.
                break
            time.sleep(10)

    def write_review_tocsv(self, data):
        """Clean the review columns of *data* in place and append it to the review CSV."""
        data['review_date'] = data['review_date'].str.replace(' wrote a review ','')
        data['user_rating'] = data['user_rating'].str.replace('ui_bubble_rating bubble_','')
        data['date_of_stay'] = data['date_of_stay'].str.strip()
        data.to_csv('tripadvisor_review.csv', mode='a', index = False, sep=',', encoding='utf-8', header=False)

    def write_hotel_tocsv(self, data):
        """Strip ``#`` from the rank column of *data* in place and append it to the hotel CSV."""
        data['rank'] = data['rank'].str.replace('#','')
        data.to_csv('tripadvisor_hotel.csv', mode='a', index = False, sep=',', encoding='utf-8', header=False)


# In [112]:


# Driver cell: for every hotel listed in tripadvisor_data.csv, save the hotel
# details and its first review pages to CSV.
# Requests are made with verify=False, so silence the resulting TLS warnings.
urllib3.disable_warnings()

r = TAHotelReviews()

# tripadvisor_data.csv is written without a header row, so the column names
# are supplied here in the order the rows were written.
hotel_columns = ['hotel_name', 'url', 'locality', 'reviews', 'tripadvisor_rating',
                 'checkIn', 'checkOut', 'price_per_night', 'discounted_price',
                 'booking_provider', 'no_of_deals', 'hotel_features']

#Read TA Hotel list file
df = pd.read_csv("tripadvisor_data.csv", delimiter=",", header=None, names=hotel_columns)

#Looping through the list of links and saving information into a csv file
for _, hotel_link in df.iterrows():
    url = hotel_link['url']
    # Skip rows without a usable link (empty url cells, stray header lines).
    if not str(url).startswith('http'):
        continue
    hotel_id = r.get_hotelid(url)
    results = r.get_hotel(hotel_id, url)
    r.write_hotel_tocsv(results)
    r.loop_hotel(hotel_id, url)