In [None]:
%%shell
# Ubuntu no longer distributes chromium-browser outside of snap, so Chromium
# and its driver are installed from the Debian buster repositories instead.
#
# Proposed solution: https://askubuntu.com/questions/1204571/how-to-install-chromium-without-snap

# Add debian buster
cat > /etc/apt/sources.list.d/debian.list <<'EOF'
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-buster.gpg] http://deb.debian.org/debian buster main
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-buster-updates.gpg] http://deb.debian.org/debian buster-updates main
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-security-buster.gpg] http://deb.debian.org/debian-security buster/updates main
EOF

# Add keys
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys DCC9EFBF77E11517
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 648ACFD622F3D138
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 112695A0E562B32A

# Export each key into the keyring file referenced by `signed-by` above.
# --yes lets gpg overwrite an existing keyring so the cell can be re-run.
apt-key export 77E11517 | gpg --yes --dearmour -o /usr/share/keyrings/debian-buster.gpg
apt-key export 22F3D138 | gpg --yes --dearmour -o /usr/share/keyrings/debian-buster-updates.gpg
apt-key export E562B32A | gpg --yes --dearmour -o /usr/share/keyrings/debian-security-buster.gpg

# Prefer debian repo for chromium* packages only
# Note the double-blank lines between entries
cat > /etc/apt/preferences.d/chromium.pref << 'EOF'
Package: *
Pin: release a=eoan
Pin-Priority: 500


Package: *
Pin: origin "deb.debian.org"
Pin-Priority: 300


Package: chromium*
Pin: origin "deb.debian.org"
Pin-Priority: 700
EOF

# Install chromium and chromium-driver
# -y answers the confirmation prompt so the install cannot block in a notebook.
apt-get update
apt-get install -y chromium chromium-driver

# Install selenium
pip install selenium

Executing: /tmp/apt-key-gpghome.dmfxPLZeXn/gpg.1.sh --keyserver keyserver.ubuntu.com --recv-keys DCC9EFBF77E11517
gpg: key DCC9EFBF77E11517: public key "Debian Stable Release Key (10/buster) <debian-release@lists.debian.org>" imported
gpg: Total number processed: 1
gpg:               imported: 1
Executing: /tmp/apt-key-gpghome.cro9O2JUlp/gpg.1.sh --keyserver keyserver.ubuntu.com --recv-keys 648ACFD622F3D138
gpg: key DC30D7C23CBBABEE: public key "Debian Archive Automatic Signing Key (10/buster) <ftpmaster@debian.org>" imported
gpg: Total number processed: 1
gpg:               imported: 1
Executing: /tmp/apt-key-gpghome.VMmMruQ6Is/gpg.1.sh --keyserver keyserver.ubuntu.com --recv-keys 112695A0E562B32A
gpg: key 4DFAB270CAA96DFA: public key "Debian Security Archive Automatic Signing Key (10/buster) <ftpmaster@debian.org>" imported
gpg: Total number processed: 1
gpg:               imported: 1
Get:1 http://deb.debian.org/debian buster InRelease [122 kB]
Get:2 http://deb.debian.org/debian bust



In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

In [None]:
# Build the Chrome options first: run without a visible window and pass the
# two extra switches this notebook uses for its (containerised) runtime.
options = webdriver.ChromeOptions()
for flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
  options.add_argument(flag)

# Point Selenium at the chromedriver binary and start the browser session.
service = Service(executable_path=r'/usr/bin/chromedriver')
driver = webdriver.Chrome(service=service, options=options)

In [None]:
# Category listing to scrape: the site's books root followed by the category slug.
base_url = "https://www.buscalibre.cl/libros/"
categoría = "arte"
url = f"{base_url}{categoría}"

In [None]:
# Open the category listing page in the browser session.
driver.get(url)

In [None]:
def get_href_from_book_html(html):
  """Extract the product URL from the inner HTML of a product box.

  The link is the text from the first occurrence of ``https:`` up to (not
  including) the first ``" title=`` that follows it.

  Args:
    html: Inner HTML string of one product element.

  Returns:
    The extracted URL as a string.

  Raises:
    ValueError: If ``https:`` is missing, or no ``" title=`` follows it.
  """
  link_start = html.index("https:")
  # Look for the closing marker only after the link start: a `" title=` that
  # appears earlier in the markup would otherwise give an empty or wrong slice.
  link_end = html.index('" title=', link_start)
  return html[link_start:link_end]

In [None]:
def get_review_data_from_html(html):
  """Parse the star rating and review text out of one review's inner HTML.

  The rating is read from the one or two characters that follow the
  ``small stars stars-`` class prefix: a single character followed by a
  closing quote is used as-is, otherwise the two characters are joined with
  a decimal point. The review text is the content of the first
  ``<span id="texto-review-...">`` element.

  Args:
    html: Inner HTML string of one review element.

  Returns:
    A ``(stars, review)`` tuple: rating as float, review text as string.

  Raises:
    ValueError: If an expected marker is missing or the rating is not numeric.
    IndexError: If the HTML ends right after the stars prefix.
  """
  marker = "small stars stars-"
  pos = html.index(marker) + len(marker)
  units, tenths = html[pos], html[pos + 1]
  stars = float(units) if tenths == '"' else float(units + "." + tenths)

  # Cut the HTML at the review span, then take what lies between the end of
  # its opening tag and its closing tag.
  tail = html[html.index('<span id="texto-review-'):]
  review = tail[tail.index(">") + 1: tail.index("</span>")]

  return stars, review

In [None]:
import json
def save_list_to_json(obj, path):
  """Write ``obj`` to ``path`` as JSON indented with 4 spaces.

  The object is serialized before the file is opened, so a serialization
  error leaves any existing file untouched.

  Args:
    obj: JSON-serializable object (the notebook passes lists).
    path: Destination file path; an existing file is overwritten.
  """
  serialized = json.dumps(obj, indent=4)
  with open(path, "w") as out_file:
    out_file.write(serialized)

def load_list_from_json(path):
  """Read the JSON document stored at ``path`` and return the decoded object.

  Args:
    path: Path of the JSON file to read.

  Returns:
    The decoded JSON value.
  """
  with open(path, "r") as in_file:
    contents = in_file.read()
  return json.loads(contents)

In [None]:
# Let every later element lookup wait up to 10 seconds for a match.
driver.implicitly_wait(10)

# Dismiss the "cambiarPais" dialog (presumably a change-country prompt) when
# it is shown. find_elements returns an empty list instead of raising when the
# element is absent, so the cell also works when the dialog never appears.
country_dialogs = driver.find_elements(By.ID, "cambiarPais")
if country_dialogs and country_dialogs[0].is_displayed():
  close_buttons = driver.find_elements(By.CLASS_NAME, "cerrar")
  for button in close_buttons:
    # Several elements share the "cerrar" class; click only the one whose
    # label is the "stay here" choice.
    if button.text == "Me quedo aquí":
      button.click()

In [None]:
# JSON files used to persist scraping state between runs.
# NOTE(review): /content is presumably the Colab working directory — adjust if run elsewhere.
book_links_local_path = "/content/book_links.json"  # links collected by the pagination loop
old_book_links_local_path = "/content/old_book_links.json"  # links to exclude from a fresh collection
labeled_data_local_path = "/content/labeled_data.json"  # scraped rating/review/title records

In [None]:
# True: resume from the records already saved on disk.
# False: start with an empty list of records.
load_from_file = True

labeled_data = load_list_from_json(labeled_data_local_path) if load_from_file else []

In [None]:
# True: reuse the saved link set and exclude nothing.
# False: start a fresh link set and exclude the previously collected links.
load_from_file = True

if not load_from_file:
  book_links = set()
  old_book_links = set(load_list_from_json(old_book_links_local_path))
else:
  book_links = set(load_list_from_json(book_links_local_path))
  old_book_links = set()

In [None]:
# Sanity check: how many previously collected links will be excluded.
len(old_book_links)

0

In [None]:
# Query parameters
# Upper bound on how many listing pages the pagination loop steps through
# while collecting book links (despite the name, it does not count review pages).
num_review_pages = 30

In [None]:
# Collect product links from up to `num_review_pages` consecutive listing pages.
# XPath matching every product box on the current page (loop-invariant).
book_substring = "box-producto producto"
book_xpath = f"//*[contains(@class, '{book_substring}')]"

try:
  for _ in range(num_review_pages):
    book_elements = driver.find_elements(By.XPATH, book_xpath)
    for book_element in book_elements:
      book_html = book_element.get_attribute("innerHTML")
      book_link = get_href_from_book_html(book_html)
      # Skip links that belong to the exclusion set.
      if book_link not in old_book_links:
        book_links.add(book_link)

    # find_elements returns an empty list when there is no "next page"
    # control, so the last page ends the loop instead of raising.
    next_buttons = driver.find_elements(By.ID, "pagnNextString")
    if not next_buttons:
      break
    página_siguiente = next_buttons[0]
    página_siguiente.click()
finally:
  # Persist whatever was collected, even if a page failed part-way through.
  file_path = book_links_local_path
  save_list_to_json(list(book_links), file_path)

In [None]:
import math

# Batching parameters for the review-scraping pass.
total_book_links = len(book_links)
batch_size = 50  # number of book links per batch
num_batches = math.ceil(total_book_links / batch_size)  # rounds up, so the last batch may be partial

In [None]:
# Visit every collected book page in batches and append one record per review
# to `labeled_data`, saving the list to disk after each batch.
wait = WebDriverWait(driver, 10)

# Sort once so batch boundaries are stable and the list is not rebuilt per batch.
sorted_book_links = sorted(book_links)

# Cover every batch computed from the link count, not a fixed number of batches.
for j in range(num_batches):
  start = j * batch_size
  for i, book_link in enumerate(sorted_book_links[start:start + batch_size]):
    # Lightweight progress indicator: absolute index every 25 links.
    if i % 25 == 0:
      print(start + i)

    driver.get(book_link)

    try:
      # Waits up to 10 s for the title element; raises if it never appears.
      title = wait.until(EC.presence_of_element_located((By.CLASS_NAME, "tituloProducto"))).text

      # Click the "load-comments" control when the page has one.
      load_more_button = driver.find_elements(By.ID, "load-comments")
      if load_more_button:
        load_more_button[0].click()

      review_substring = "review-n-"
      review_elements = driver.find_elements(By.XPATH, f"//*[contains(@class, '{review_substring}')]")

      for review_element in review_elements:
        review_html = review_element.get_attribute("innerHTML")
        stars, review = get_review_data_from_html(review_html)
        labeled_data.append(
            {
                "rating": stars,
                "review": review,
                "title": title,
            }
        )

    except Exception as error:
      # Best effort: skip this book but report what actually failed.
      # Catching Exception (not a bare except) lets Ctrl-C interrupt the run.
      print(f"Skipping {book_link}: {type(error).__name__}")
  # Checkpoint after every batch so a crash loses at most one batch of work.
  save_list_to_json(labeled_data, labeled_data_local_path)

300
325
350
375
400


ReadTimeoutError: HTTPConnectionPool(host='localhost', port=47951): Read timed out. (read timeout=120)

In [None]:
# Write the records collected so far to disk (useful after an interrupted scraping run).
save_list_to_json(labeled_data, labeled_data_local_path)

In [None]:
# Number of review records collected so far.
len (labeled_data)

8935

Before scraping, check the site's robots.txt: https://www.buscalibre.cl/robots.txt