In [1]:
!pip install selenium==3.141.0
!pip install beautifulsoup4
from _ast import operator

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
from bs4 import BeautifulSoup
import pandas as pd
import openpyxl
import random

!apt-get update # to update ubuntu to correctly run apt install
!apt install chromium-chromedriver
!cp /usr/lib/chromium-browser/chromedriver /usr/bin

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting selenium==3.141.0
  Downloading selenium-3.141.0-py2.py3-none-any.whl (904 kB)
[K     |████████████████████████████████| 904 kB 7.4 MB/s 
Installing collected packages: selenium
Successfully installed selenium-3.141.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Ign:1 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Hit:3 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Get:4 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Hit:5 http://archive.ubuntu.com/ubuntu bionic InRelease
Get:6 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
Hit:7 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic In

In [2]:
from pathlib import Path
from copy import copy
from typing import Union, Optional
import numpy as np
import pandas as pd
import openpyxl
from openpyxl import load_workbook
from openpyxl.utils import get_column_letter


def copy_excel_cell_range(
        src_ws: openpyxl.worksheet.worksheet.Worksheet,
        min_row: int = None,
        max_row: int = None,
        min_col: int = None,
        max_col: int = None,
        tgt_ws: openpyxl.worksheet.worksheet.Worksheet = None,
        tgt_min_row: int = 1,
        tgt_min_col: int = 1,
        with_style: bool = True
) -> openpyxl.worksheet.worksheet.Worksheet:
    """
    Copy a rectangular cell range from [src_ws] into [tgt_ws].

    The source rectangle spans rows [min_row]..[max_row] and columns
    [min_col]..[max_col]. Its top-left cell is written to
    ([tgt_min_row], [tgt_min_col]) in the target worksheet and every other
    cell keeps its position relative to that corner.

    @param src_ws:  source worksheet
    @param min_row: smallest row index in the source worksheet (1-based index).
                    None means "start at row 1".
    @param max_row: largest row index in the source worksheet (1-based index)
    @param min_col: smallest column index in the source worksheet (1-based index).
                    None means "start at column 1".
    @param max_col: largest column index in the source worksheet (1-based index)
    @param tgt_ws:  target worksheet.
                    If None, then the copy will be done to the same (source) worksheet.
    @param tgt_min_row: target row index (1-based index)
    @param tgt_min_col: target column index (1-based index)
    @param with_style:  whether to copy cell style (font, border, fill,
                        number format, protection, alignment). Default: True

    @return: target worksheet object
    """
    if tgt_ws is None:
        tgt_ws = src_ws

    # Offsets are measured from the top-left corner of the *source range*,
    # not from cell A1, so that a sub-range really lands on
    # (tgt_min_row, tgt_min_col). A missing lower bound counts as 1, which
    # keeps the result identical to the previous behaviour for whole-sheet copies.
    row_shift = tgt_min_row - (min_row or 1)
    col_shift = tgt_min_col - (min_col or 1)

    # https://stackoverflow.com/a/34838233/5741205
    for row in src_ws.iter_rows(min_row=min_row, max_row=max_row,
                                min_col=min_col, max_col=max_col):
        for cell in row:
            tgt_cell = tgt_ws.cell(
                row=cell.row + row_shift,
                column=cell.col_idx + col_shift,
                value=cell.value
            )
            if with_style and cell.has_style:
                # Style objects are copied one by one so the target cell does
                # not share mutable style instances with the source cell.
                tgt_cell.font = copy(cell.font)
                tgt_cell.border = copy(cell.border)
                tgt_cell.fill = copy(cell.fill)
                tgt_cell.number_format = copy(cell.number_format)
                tgt_cell.protection = copy(cell.protection)
                tgt_cell.alignment = copy(cell.alignment)
    return tgt_ws


def append_df_to_excel(
        filename: Union[str, Path],
        df: pd.DataFrame,
        sheet_name: str = 'Sheet1',
        startrow: Optional[int] = None,
        max_col_width: int = 30,
        autofilter: bool = False,
        fmt_int: str = "#,##0",
        fmt_float: str = "#,##0.00",
        fmt_date: str = "yyyy-mm-dd",
        fmt_datetime: str = "yyyy-mm-dd hh:mm",
        truncate_sheet: bool = False,
        storage_options: Optional[dict] = None,
        **to_excel_kwargs
) -> None:
    """
    Append a DataFrame [df] to existing Excel file [filename]
    into [sheet_name] Sheet.
    If [filename] doesn't exist, then this function will create it.

    How it works: the DataFrame is first written by pandas into a freshly
    generated worksheet; if [sheet_name] already existed, the generated rows
    are then copied below the existing data and the generated worksheet is
    removed again.

    @param filename: File path (Example: '/path/to/file.xlsx').
                     The suffix is always normalised to ".xlsx", and that
                     normalised path is used for the existence check, for
                     loading and for saving.
    @param df: DataFrame to save to workbook
    @param sheet_name: Name of sheet which will contain DataFrame.
                       (default: 'Sheet1')
    @param startrow: 0-based row offset in the existing sheet after which the
                     new rows are placed.
                     Per default (startrow=None) calculate the last row
                     in the existing sheet and write to the next row.
                     Only used when both the file and [sheet_name] already exist.
    @param max_col_width: maximum column width in Excel. Default: 30
    @param autofilter: boolean - whether add Excel autofilter or not. Default: False
    @param fmt_int: Excel format for integer numbers
    @param fmt_float: Excel format for float numbers
    @param fmt_date: Excel format for dates
    @param fmt_datetime: Excel format for datetime's
    @param truncate_sheet: truncate (remove and recreate) [sheet_name]
                           before writing DataFrame to Excel file.
                           With startrow=None the data is then written from
                           the first row of the emptied sheet.
    @param storage_options: dict, optional
        Extra options that make sense for a particular storage connection, e.g. host, port,
        username, password, etc., if using a URL that will be parsed by fsspec, e.g.,
        starting “s3://”, “gcs://”.
    @param to_excel_kwargs: arguments which will be passed to `DataFrame.to_excel()`
                            [can be a dictionary]. An `engine` entry is ignored.
    @return: None

    Usage examples:

    >>> append_df_to_excel('/tmp/test.xlsx', df, autofilter=True,
                           freeze_panes=(1,0))

    >>> append_df_to_excel('/tmp/test.xlsx', df, header=None, index=False)

    >>> append_df_to_excel('/tmp/test.xlsx', df, sheet_name='Sheet2',
                           index=False)

    >>> append_df_to_excel('/tmp/test.xlsx', df, sheet_name='Sheet2',
                           index=False, startrow=25)

    >>> append_df_to_excel('/tmp/test.xlsx', df, index=False,
                           fmt_datetime="dd.mm.yyyy hh:mm")

    (c) [MaxU](https://stackoverflow.com/users/5741205/maxu?tab=profile)
    """
    def set_column_format(ws, column_letter, fmt):
        # Apply one number format to every cell of a worksheet column.
        for cell in ws[column_letter]:
            cell.number_format = fmt

    # Normalise the path once so that the existence check, load_workbook(),
    # the ExcelWriter and the final save all refer to the very same file.
    filename = Path(filename).with_suffix(".xlsx")
    file_exists = filename.is_file()
    # process parameters
    # calculate first column number
    # if the DF will be written using `index=True`, then `first_col = 2`, else `first_col = 1`
    first_col = int(to_excel_kwargs.get("index", True)) + 1
    # ignore [engine] parameter if it was passed
    to_excel_kwargs.pop('engine', None)
    # remember whether the caller asked us to work out the start row
    auto_startrow = startrow is None
    sheet_exists = False
    # save content of existing sheets
    if file_exists:
        wb = load_workbook(filename)
        sheet_names = wb.sheetnames
        sheet_exists = sheet_name in sheet_names
        sheets = {ws.title: ws for ws in wb.worksheets}

    with pd.ExcelWriter(
        filename,
        engine="openpyxl",
        mode="a" if file_exists else "w",
        # "new" makes pandas write into a generated sheet instead of failing
        # or replacing the existing one; the rows are merged back below.
        if_sheet_exists="new" if file_exists else None,
        date_format=fmt_date,
        datetime_format=fmt_datetime,
        storage_options=storage_options
    ) as writer:
        if file_exists:
            # try to open an existing workbook
            # NOTE(review): assigning `writer.book` / `writer.sheets` relies on
            # these being settable attributes; newer pandas releases appear to
            # expose them read-only - confirm against the installed version.
            writer.book = wb
            # get the last row in the existing Excel sheet
            # if it was not specified explicitly
            if startrow is None and sheet_name in writer.book.sheetnames:
                startrow = writer.book[sheet_name].max_row
            # truncate sheet
            if truncate_sheet and sheet_name in writer.book.sheetnames:
                # index of [sheet_name] sheet
                idx = writer.book.sheetnames.index(sheet_name)
                # remove [sheet_name]
                writer.book.remove(writer.book.worksheets[idx])
                # create an empty sheet [sheet_name] using old index
                writer.book.create_sheet(sheet_name, idx)
                # the sheet is empty now, so an automatically derived offset
                # (taken from the old content) would leave a blank gap
                if auto_startrow:
                    startrow = 0
            # copy existing sheets
            writer.sheets = sheets
        else:
            # file doesn't exist, we are creating a new one
            startrow = 0

        # write out the DataFrame to an ExcelWriter
        df.to_excel(writer, sheet_name=sheet_name, **to_excel_kwargs)
        worksheet = writer.sheets[sheet_name]

        if autofilter:
            worksheet.auto_filter.ref = worksheet.dimensions

        for xl_col_no, dtyp in enumerate(df.dtypes, first_col):
            col_no = xl_col_no - first_col
            # column labels are not necessarily strings (e.g. integer labels),
            # so measure their textual representation
            width = max(df.iloc[:, col_no].astype(str).str.len().max(),
                        len(str(df.columns[col_no])) + 6)
            width = min(max_col_width, width)
            column_letter = get_column_letter(xl_col_no)
            worksheet.column_dimensions[column_letter].width = width
            if np.issubdtype(dtyp, np.integer):
                set_column_format(worksheet, column_letter, fmt_int)
            if np.issubdtype(dtyp, np.floating):
                set_column_format(worksheet, column_letter, fmt_float)

    if file_exists and sheet_exists:
        # move (append) rows from new worksheet to the `sheet_name` worksheet
        wb = load_workbook(filename)
        # retrieve generated worksheet name
        new_sheet_name = set(wb.sheetnames) - set(sheet_names)
        if new_sheet_name:
            new_sheet_name = list(new_sheet_name)[0]
        # copy rows written by `df.to_excel(...)` to the existing sheet,
        # starting right after the `startrow` offset
        copy_excel_cell_range(
            src_ws=wb[new_sheet_name],
            tgt_ws=wb[sheet_name],
            tgt_min_row=startrow + 1,
            with_style=True
        )
        # remove new (generated by Pandas) worksheet
        del wb[new_sheet_name]
        wb.save(filename)
        wb.close()

In [20]:
import sys
import os
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
def get_driver_object():
    """
    Build and return a headless Chrome WebDriver.

    Chrome is started with the `--headless`, `--no-sandbox` and
    `--disable-dev-shm-usage` switches (presumably required by the hosted
    notebook environment this was written for - confirm before reusing
    elsewhere). The driver executable is looked up under the name
    'chromedriver'.

    @return: a new `webdriver.Chrome` instance; the caller owns it and is
             responsible for calling `quit()` on it.
    """
    chromedriver_path = '/usr/lib/chromium-browser/chromedriver'
    # Kept from the original setup recipe. Guarded so that calling this
    # function repeatedly does not keep growing sys.path with duplicates.
    if chromedriver_path not in sys.path:
        sys.path.insert(0, chromedriver_path)

    # Creating the ChromeOptions object to pass the additional arguments to webdriver
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')

    # Creating the Webdriver object of type Chrome by passing the driver name and options
    driver_object = webdriver.Chrome('chromedriver', options=chrome_options)

    return driver_object


def get_id_from_url(URL):
    """
    Extract the numeric identifier that follows the first '-g' in [URL].

    The identifier is the text between the first '-g' and the next '-'.

    @param URL: URL (or URL path) containing a '-g<digits>-' segment
    @return: the identifier as an int
    @raise ValueError: if the '-g' marker or the closing '-' is missing,
                       or if the text in between is not an integer
    """
    # Everything after the first '-g' marker ...
    _, after_marker = URL.split('-g', maxsplit=1)
    # ... up to (not including) the next dash is the identifier.
    digits, _ = after_marker.split('-', maxsplit=1)
    return int(digits)


def get_id_from_url_hotel(URL):
    """
    Extract the numeric identifier that follows the first '-d' in [URL].

    The identifier is the text between the first '-d' and the next '-'.

    @param URL: URL (or URL path) containing a '-d<digits>-' segment
    @return: the identifier as an int
    @raise ValueError: if the '-d' marker or the closing '-' is missing,
                       or if the text in between is not an integer
    """
    # Everything after the first '-d' marker ...
    _, after_marker = URL.split('-d', maxsplit=1)
    # ... up to (not including) the next dash is the identifier.
    digits, _ = after_marker.split('-', maxsplit=1)
    return int(digits)


def get_listing_url(page, base_url, per_page):
    """
    Build the URL of listing page number [page] (0-based).

    Page 0 is [base_url] itself. For any later page an 'oa<offset>' segment,
    with offset = page * per_page, is inserted right after the '-g<id>-'
    segment of [base_url].

    @param page: 0-based page number, must not be negative
    @param base_url: URL of the first listing page (must contain '-g<id>-')
    @param per_page: number of entries per listing page
    @return: URL of the requested page
    """
    assert page >= 0
    # Parsed up-front, so a malformed base_url is rejected even for page 0.
    geo_id = get_id_from_url(base_url)
    if page != 0:
        offset = page * per_page
        return base_url.replace(f'-g{geo_id}-', f'-g{geo_id}-oa{offset}-')

    return base_url


def get_listing_url_hotel(page, base_url, per_page):
    """
    Build the URL of review page number [page] (0-based) of one hotel.

    Page 0 is [base_url] itself. For any later page a 'Reviews-or<offset>'
    segment, with offset = page * per_page, is inserted right after the
    '-d<id>-' segment of [base_url].

    NOTE(review): if [base_url] already contains 'Reviews-' after the id, the
    result repeats that word - confirm the expected base_url shape.

    @param page: 0-based page number, must not be negative
    @param base_url: URL of the hotel page (must contain '-d<id>-')
    @param per_page: number of reviews per page
    @return: URL of the requested page
    """
    assert page >= 0
    # Parsed up-front, so a malformed base_url is rejected even for page 0.
    hotel_id = get_id_from_url_hotel(base_url)
    if page != 0:
        offset = page * per_page
        return base_url.replace(f'-d{hotel_id}-', f'-d{hotel_id}-Reviews-or{offset}-')

    return base_url


def get_website_driver(driver=None, url=""):
    """
    Open [url] in a WebDriver and return that driver.

    @param driver: WebDriver to use. If None, one shared driver is created on
                   first use via `get_driver_object()` and reused by every
                   later call that also omits [driver]. Creating it lazily
                   avoids launching a browser merely by defining this function.
    @param url: address to load
    @return: the driver, positioned on [url]
    """
    if driver is None:
        # Lazily build the shared default driver and cache it on the function
        # object, so all default calls keep using one single browser.
        driver = getattr(get_website_driver, "_shared_driver", None)
        if driver is None:
            driver = get_driver_object()
            get_website_driver._shared_driver = driver

    # Opening the URL with the driver object
    print("The webdriver is created")
    driver.get(url)
    print(f"The URL '{url}' is opened")
    return driver


def parse_hotels_perpage(driver):
    """
    Collect (hotel link, review count) pairs from the page currently loaded
    in [driver].

    Every `div` whose `data-prwidget-name` is "meta_hsx_responsive_listing"
    is treated as one hotel entry. Entries without a title link (or without
    an `href` on it) are skipped instead of aborting the whole page.

    @param driver: object exposing the page HTML as `page_source`
    @return: list of `(href, review_count)` tuples, `review_count` being an int.
             The count is 0 when the entry carries an "unclickable" review
             link or no review link at all.
    @raise ValueError: if a review link is present but its first word is not
                       a number
    """
    # Creating the BeautifulSoup object from the HTML page source
    soup = BeautifulSoup(driver.page_source, "html.parser")
    # Finding all the Hotel Div's in the BeautifulSoup object
    hotel_tags = soup.find_all("div", {"data-prwidget-name": "meta_hsx_responsive_listing"})
    hotels_list_url = []
    for tag in hotel_tags:
        title_link = tag.find("a", {"class": "property_title prominent "})
        if title_link is None or not title_link.get('href'):
            # Nothing to navigate to for this entry.
            continue

        review_count = 0
        if tag.find("a", {"class": "review_count unclickable"}) is None:
            count_link = tag.find("a", {"class": "review_count"})
            if count_link is not None:
                # Text starts with the count, e.g. "1,234 reviews" -> 1234
                review_count = int(count_link.text.split(" ")[0].replace(",", ""))

        hotels_list_url.append((title_link['href'], review_count))

    return hotels_list_url


def navigate_inside_page_hotel(hotel_url):
    """
    Walk through the review pages of one hotel and collect its reviews.

    The hotel page is opened with the `filterLang=ES` query parameter. On
    every page the first "expand review" control (if any) is clicked, every
    element carrying a `data-reviewid` attribute is read, and then the "next"
    button is clicked. The walk stops when no enabled "next" button is found
    or when a page shows no reviews.

    @param hotel_url: sequence whose first item is the hotel's site-relative
                      link (the remaining items are not used here)
    @return: list of `(title, review_text, rating, date)` tuples, where
             `rating` is the number taken from the rating element's CSS class
             divided by 10, and `date` is the last two space-separated words
             of the matching date element's text
    """
    site_base = "https://www.tripadvisor.com"
    review_xpath = "//div[@data-reviewid]"
    expand_xpath = ".//div[contains(@data-test-target, 'expand-review')]"
    next_xpath = './/a[@class="ui_button nav next primary "]'

    is_halt = False
    reviews_by_hotel = []
    driver = get_website_driver(url=site_base + hotel_url[0] + "?filterLang=ES")
    while not is_halt and len(driver.find_elements(By.XPATH, review_xpath)) > 0:
        # Random pause between page loads.
        time.sleep(random.randint(2, 8))

        # Click the first "expand" control, if the page has one.
        expand_buttons = driver.find_elements(By.XPATH, expand_xpath)
        if expand_buttons:
            expand_buttons[0].click()

        container = driver.find_elements(By.XPATH, review_xpath)
        # Date elements are matched to reviews purely by position.
        dates = driver.find_elements(By.XPATH, ".//div[@class='cRVSd']")
        time.sleep(2)
        for j, review_node in enumerate(container):  # one pass per review on the page
            time.sleep(0.5)
            # e.g. class "ui_bubble_rating bubble_50" -> "50"
            rating = review_node.find_element(By.XPATH, ".//span[contains(@class, 'ui_bubble_rating bubble_')]").get_attribute("class").split("_")[3]
            title = review_node.find_element(By.XPATH, ".//div[contains(@data-test-target,'review-title')]").text
            review = review_node.find_element(By.XPATH, ".//q[@class='QewHA H4 _a']").text.replace("\n", "  ")
            date = " ".join(dates[j].text.split(" ")[-2:])
            reviews_by_hotel.append((title, review, int(rating) / 10, date))

        # change the page
        next_buttons = driver.find_elements(By.XPATH, next_xpath)
        if len(next_buttons) != 0 and next_buttons[0].get_attribute('class') != 'ui_button nav next primary disabled':
            next_buttons[0].click()
        else:
            is_halt = True

    return reviews_by_hotel

def save_hotel_list_as_txt(hotels_list):
    """
    Write [hotels_list] to 'hotels_list.txt' in the working directory,
    replacing any previous content, then print 'Done'.

    One line is written per entry, made of the entry's first two items
    separated by a comma.

    @param hotels_list: iterable of indexable entries with at least two items
    """
    with open(r'hotels_list.txt', 'w') as out_file:
        out_file.writelines(
            "{},{}\n".format(entry[0], entry[1]) for entry in hotels_list
        )
        print('Done')

def write_xlsx_file(reviews):
    """
    Append [reviews] to the workbook 'cuba_hotels_sentiment.xlsx'.

    The rows are stored under the columns Title, Opinion, Class and Date.
    The header row is written only when the workbook does not exist yet.

    @param reviews: rows of four values each (title, opinion, class, date)
    """
    workbook_path = 'cuba_hotels_sentiment.xlsx'
    frame = pd.DataFrame(reviews, columns=['Title', 'Opinion', 'Class', 'Date'])

    # Header only for a brand-new workbook; later calls add data rows only.
    header = None if os.path.exists(workbook_path) else True
    append_df_to_excel(workbook_path, frame, header=header, index=False, fmt_int='#')

In [None]:
# --- Step 1: collect the hotel links from the listing pages -----------------
# First listing page, number of hotels per listing page, and number of
# listing pages to visit.
BASE_URL = "https://www.tripadvisor.com/Hotels-g147270-zft21371-Cuba-Hotels.html"
PER_PAGE = 30
NUM_PAGE = 30

hotels_list = []
for page in range(NUM_PAGE):
    driver = get_website_driver(
        url=get_listing_url(page, BASE_URL, PER_PAGE)
    )
    # Random pause before reading the loaded page.
    time.sleep(random.randint(2, 8))
    hotels_list.extend(parse_hotels_perpage(driver))

reviews = []

# Persist the collected links so the review crawl can run from the file.
save_hotel_list_as_txt(hotels_list)

In [None]:
# --- Step 2: crawl the reviews of every hotel listed in hotels_list.txt ------
# Each line of the file is "<hotel link>,<review count>".
hotels_list = []
with open('hotels_list.txt') as f:
    for line in f:
        line = line.rstrip("\n")
        if not line:
            # tolerate blank lines (e.g. a trailing empty line)
            continue
        # Split on the LAST comma: the count is always the final field, even
        # if the link itself happens to contain a comma.
        hotel_link, review_count = line.rsplit(",", 1)
        hotels_list.append((hotel_link, int(review_count)))

# Position in hotels_list to start from (presumably the resume point after an
# earlier interrupted run - set to 0 to crawl everything).
index_position = 236
for hotel in hotels_list[index_position:]:
    write_xlsx_file(navigate_inside_page_hotel(hotel))
    index_position += 1
    # progress marker: number of hotels processed so far
    print(index_position)

The webdriver is created
The URL 'https://www.tripadvisor.com/Hotel_Review-g616288-d15750328-Reviews-Casa_Entre_Montanas-Vinales_Pinar_del_Rio_Province_Cuba.html?filterLang=ES' is opened
237
The webdriver is created
The URL 'https://www.tripadvisor.com/Hotel_Review-g609122-d13294811-Reviews-Casa_Colonial_Ural-Cienfuegos_Cienfuegos_Province_Cuba.html?filterLang=ES' is opened
238
The webdriver is created
The URL 'https://www.tripadvisor.com/Hotel_Review-g679576-d15806850-Reviews-Hostal_Los_Balcones-Cardenas_Matanzas_Province_Cuba.html?filterLang=ES' is opened
239
The webdriver is created
The URL 'https://www.tripadvisor.com/Hotel_Review-g2053536-d584858-Reviews-Hotel_Guama-Playa_Larga_Matanzas_Province_Cuba.html?filterLang=ES' is opened
240
The webdriver is created
The URL 'https://www.tripadvisor.com/Hotel_Review-g616288-d16812121-Reviews-Casa_Campo_Gladys_Suarez-Vinales_Pinar_del_Rio_Province_Cuba.html?filterLang=ES' is opened
241
The webdriver is created
The URL 'https://www.tripadvis