# PART I: WEB SCRAPING

#### 1. Importing libraries

In [94]:
# Standard library
import os
import re
from urllib.parse import quote

# Third-party
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.service import Service

#web driver wait library
from selenium.webdriver.support.ui import WebDriverWait

##### Create the driver

In [95]:
# Path to the ChromeDriver executable. Set the CHROMEDRIVER_PATH environment
# variable to point at a different location without editing the notebook;
# the original hard-coded path is kept as the fallback.
DRIVER_PATH = os.environ.get(
    "CHROMEDRIVER_PATH", "D:/Applications/SeleniumDriver/chromedriver.exe"
)

# Goodreads search endpoint; the formatted book title is appended to it.
SEARCH_URL = "https://www.goodreads.com/search?q="

# Prefix of a Goodreads book page; the book id is appended to it.
BOOK_URL = "https://www.goodreads.com/book/show/"



In [96]:
def run_driver():
    """Start a new headless Chrome session and return its WebDriver.

    The driver executable is taken from the module-level ``DRIVER_PATH``.
    The caller owns the returned driver and is responsible for shutting it
    down.

    Returns:
        selenium.webdriver.Chrome: a freshly started driver instance.
    """
    driver_service = Service(executable_path=DRIVER_PATH)

    options = webdriver.ChromeOptions()
    for flag in (
        "--headless",
        "--disable-gpu",
        "--disable-dev-shm-usage",
        "--disable-extensions",
    ):
        options.add_argument(flag)

    return webdriver.Chrome(service=driver_service, options=options)

#### Generate book id from the book title:

In [97]:
# Convert a free-text book title into the query string used for the search URL
def format_book_title(book_title):
    """Return ``book_title`` formatted for use after ``?q=`` in a search URL.

    The title is lower-cased, the punctuation ``: , ? ! . ; ( )`` is dropped,
    spaces, hyphens and double underscores become ``+``, and every other
    character that is not URL-safe is percent-encoded so that characters such
    as ``&``, ``#`` or ``%`` cannot break the query string.

    Examples:
        "Pride and Prejudice" -> "pride+and+prejudice"
        "The Handmaid's Tale" -> "the+handmaid%27s+tale"
        "Tom & Jerry"         -> "tom+%26+jerry"

    Args:
        book_title (str): the title as typed by the user.

    Returns:
        str: the formatted, URL-safe query value.
    """
    title = book_title.lower()

    # Punctuation that is simply removed from the query.
    title = title.translate(str.maketrans("", "", ":,?!.;()"))

    # Word separators all collapse to the query-string space marker.
    for separator in (" ", "-", "__"):
        title = title.replace(separator, "+")

    # Percent-encode the rest (apostrophe -> %27, & -> %26, ...); "+" is kept
    # as-is because it is the separator produced above.
    return quote(title, safe="+")

In [98]:
# Ask the user which book to look up
def get_book_title():
    """Prompt on standard input and return the entered title unchanged."""
    return input("Enter the book title: ")

In [99]:
# Build the search URL for an already formatted title
def get_search_url(formatted_book_title):
    """Return ``SEARCH_URL`` with ``formatted_book_title`` appended to it."""
    return "".join((SEARCH_URL, formatted_book_title))

### GET THE SOUP FOR THE BOOK PAGE

### Search for the book using book title

In [100]:
# Prompt for a title and convert it to the form appended to the search URL.
FORMATTED_BOOK_TITLE = format_book_title(get_book_title())

In [101]:
# Full search URL for the formatted title.
FORMATTED_SEARCH_URL = get_search_url(FORMATTED_BOOK_TITLE)

In [102]:
# Show the URL that is about to be requested.
print(FORMATTED_SEARCH_URL)

https://www.goodreads.com/search?q=harry+potter


In [103]:
# Download a page with requests and parse it with BeautifulSoup
def get_soup(search_url, timeout=30):
    """Fetch ``search_url`` and return its HTML parsed as a BeautifulSoup tree.

    Args:
        search_url (str): the URL to request.
        timeout (float): seconds to wait for the server before giving up, so
            a stalled connection cannot hang the notebook indefinitely.

    Returns:
        BeautifulSoup: the parsed response body.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status, so an
            error page is not silently parsed as if it were search results.
        requests.RequestException: on connection problems or timeout.
    """
    response = requests.get(search_url, timeout=timeout)
    try:
        response.raise_for_status()
        return BeautifulSoup(response.text, 'html.parser')
    finally:
        # Release the connection even when the status check or parsing fails.
        response.close()

In [104]:
# Download and parse the search results page.
SOUP = get_soup(FORMATTED_SEARCH_URL)

#print(SOUP.prettify())

In [105]:
# Pick the top hit of a parsed search results page
def get_first_result(soup):
    """Return the first ``<a class="bookTitle">`` element found in ``soup``.

    The value is whatever ``soup.find`` returns for that query.
    """
    return soup.find('a', class_='bookTitle')

In [106]:
# Inspect the first search hit.
print(get_first_result(SOUP))

<a class="bookTitle" href="/book/show/72193.Harry_Potter_and_the_Philosopher_s_Stone?from_search=true&amp;from_srp=true&amp;qid=IpEqYTU8Pt&amp;rank=1" itemprop="url">
<span aria-level="4" itemprop="name" role="heading">Harry Potter and the Philosopher’s Stone (Harry Potter, #1)</span>
</a>


In [107]:
# Extract the part of a search-result link that follows "/book/show/"
def get_book_show_query(first_result):
    """Return the book path taken from the ``href`` of a search result.

    The ``/book/show/`` prefix is removed and the remainder is cut at the
    first ``-``. For links without a hyphen the result therefore still
    carries the ``?...`` query string.

    Args:
        first_result: the link element of the first search hit (any object
            supporting ``first_result['href']``), or ``None`` when the search
            page contained no hit.

    Returns:
        str: the text after ``/book/show/`` up to the first ``-``.

    Raises:
        ValueError: if ``first_result`` is ``None`` (no book was found).
    """
    if first_result is None:
        raise ValueError("No search result found: cannot extract a book id.")

    href = first_result['href']
    return href.replace("/book/show/", "").split("-")[0]

In [108]:
# Book path (possibly still carrying a query string) of the first search hit.
BOOK_SHOW_QUERY = get_book_show_query(get_first_result(SOUP))
print(BOOK_SHOW_QUERY)

72193.Harry_Potter_and_the_Philosopher_s_Stone?from_search=true&from_srp=true&qid=IpEqYTU8Pt&rank=1


In [109]:
# Drop the query string from a book path
def get_book_id(book_show_query):
    """Return the part of ``book_show_query`` that precedes the first ``?``.

    When there is no ``?`` the input is returned unchanged.
    """
    head, _, _ = book_show_query.partition("?")
    return head

In [110]:
# Book identifier without the query string; appended to BOOK_URL later on.
BOOK_FULL_ID = get_book_id(BOOK_SHOW_QUERY)
print(BOOK_FULL_ID)

72193.Harry_Potter_and_the_Philosopher_s_Stone


In [111]:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [112]:
# Load a book page in headless Chrome and return it parsed
def get_book_details_page_source(book_show_url, book_full_id, max_wait=30):
    """Open the book page in a browser and return its HTML as a soup.

    The page is requested at ``book_show_url + book_full_id``. The function
    then polls until the text "Loading" no longer appears in the page source
    or ``max_wait`` seconds have passed, whichever comes first, and parses
    whatever the browser holds at that point.

    Args:
        book_show_url (str): URL prefix of a book page.
        book_full_id (str): identifier appended to ``book_show_url``.
        max_wait (float): upper bound, in seconds, for the polling phase.

    Returns:
        BeautifulSoup: the parsed page source.
    """
    driver = run_driver()
    try:
        driver.get(book_show_url + book_full_id)

        try:
            WebDriverWait(driver, max_wait).until(
                lambda d: "Loading" not in d.page_source
            )
        except TimeoutException:
            # Best effort: the marker never disappeared, so continue with the
            # content rendered so far instead of spinning forever.
            pass

        return BeautifulSoup(driver.page_source, "html.parser")
    finally:
        # Always shut the browser down so no Chrome process is left behind.
        driver.quit()

In [113]:
# Parsed HTML of the book page as rendered by the browser.
BOOK_DETAILS_PAGE_SOURCE = get_book_details_page_source(BOOK_URL, BOOK_FULL_ID)

In [134]:
# Dump the parsed page for inspection.
# NOTE(review): this prints the whole document; consider printing only a slice.
print(BOOK_DETAILS_PAGE_SOURCE.prettify())

<html data-theme="light" lang="en">
 <head>
  <script async="" src="//c.amazon-adsystem.com/aax2/apstag.js">
  </script>
  <script async="" src="https://images-na.ssl-images-amazon.com/images/G/01/csminstrumentation/ue-full-ef584a44e8ea58e3d4d928956600a9b6._V1_.js">
  </script>
  <script>
   var ue_t0=window.ue_t0||+new Date();(function(e){var c=e,a={main_scope:"mainscopecsm",q:[],t0:c.ue_t0||+new Date(),d:g};function g(h){return +new Date()-(h?0:a.t0)}function d(h){return function(){a.q.push({n:h,a:arguments,t:a.d()})}}function b(k,j,h){var i={m:k,f:j,l:h,fromOnError:1,args:arguments};c.ueLogError(i);return false}b.skipTrace=1;e.onerror=b;function f(){c.uex("ld")}if(e.addEventListener){e.addEventListener("load",f,false)}else{if(e.attachEvent){e.attachEvent("onload",f)}}a.tag=d("tag");a.log=d("log");a.reset=d("rst");c.ue_csm=c;c.ue=a;c.ueLogError=d("err");c.ues=d("ues");c.uet=d("uet");c.uex=d("uex");c.uet("ue")})(window);(function(e,d){var a=e.ue||{};function c(g){if(!g){return}var f=d

#### 2. A function to get the soup and extract review details:

In [115]:
# Collect every review card found in a parsed book page
def get_review_cards(soup):
    """Return all elements with class ``ReviewCard`` in ``soup`` as a list.

    Both ``<article>`` and ``<div>`` elements are accepted: the cards on the
    scraped page are ``<article class="ReviewCard">`` elements, which a
    ``div``-only search does not match.

    Args:
        soup: a parsed page (or element) exposing ``find_all``.

    Returns:
        list: the matching elements, in the order ``find_all`` yields them.
    """
    return list(soup.find_all(['article', 'div'], {'class': 'ReviewCard'}))

In [116]:
# Find the review list containers of a parsed book page
def get_reviewsList(soup):
    """Return every ``<div class="ReviewsList">`` found in ``soup``.

    The value is passed through exactly as ``soup.find_all`` returns it.
    """
    containers = soup.find_all('div', {'class': 'ReviewsList'})
    return containers

In [117]:
# All ReviewsList containers of the book page.
# NOTE(review): not referenced again in the cells visible here.
REVIEWS_LIST = get_reviewsList(BOOK_DETAILS_PAGE_SOURCE)

In [118]:
# Copy an iterable of review cards into a plain list.
# NOTE: the next cell defines get_all_reviews(soup) under the same name,
# which replaces this version once that cell has run.
def get_all_reviews(allReviewsCards):
    """Return a new list holding the items of ``allReviewsCards`` in order."""
    return list(allReviewsCards)

In [119]:
def get_all_reviews(soup, list_index=1):
    """Return the review cards of one ReviewsList container as a list.

    Args:
        soup: the parsed book page.
        list_index (int): which ``ReviewsList`` container (as returned by
            ``get_reviewsList``) to read the cards from. Defaults to 1, i.e.
            the second container on the page.

    Returns:
        list: every ``<article class="ReviewCard">`` inside that container.

    Raises:
        IndexError: if the page has no container at ``list_index``.
    """
    reviews_lists = get_reviewsList(soup)
    try:
        container = reviews_lists[list_index]
    except IndexError:
        raise IndexError(
            f"ReviewsList container {list_index} not found: "
            f"the page only has {len(reviews_lists)}."
        ) from None

    return list(container.find_all('article', {'class': 'ReviewCard'}))
    


In [120]:
# Review card elements scraped from the book page.
all_reviews_detail = get_all_reviews(BOOK_DETAILS_PAGE_SOURCE)

In [121]:
# Display the scraped review cards.
all_reviews_detail

[<article aria-label="Review by Miranda Reads" class="ReviewCard"><div class="ReviewCard__profile"><div class="ReviewerProfile ReviewerProfile--medium"><section class="ReviewerProfile__avatar"><a as="div" class="Avatar Avatar--medium" href="https://www.goodreads.com/user/show/71848701-miranda-reads"><img alt="Profile Image for Miranda Reads." class="Avatar__image" data-testid="image" src="https://i.gr-assets.com/images/S/compressed.photo.goodreads.com/users/1620306186i/71848701._UX200_CR0,0,200,200_.jpg"/></a></section><section class="ReviewerProfile__info"><span class="Text Text__title4"><div class="ReviewerProfile__name" data-testid="name"><a href="https://www.goodreads.com/user/show/71848701-miranda-reads">Miranda Reads</a></div></span><span class="Text Text__body3 Text__subdued"><div class="ReviewerProfile__meta"><span>1,589 reviews</span><span><span>154k followers</span></span></div></span></section><div class="ReviewerProfile__follow" data-testid="follow"><div class="FollowButton

In [122]:
# Read the reviewer's display name from a review card
def get_reviewer_name(review):
    """Return the text of the ``ReviewerProfile__name`` div inside ``review``."""
    name_block = review.find('div', {'class': 'ReviewerProfile__name'})
    return name_block.text

In [123]:
# Read the body text of a review card
def get_review_text(review):
    """Return the text of the ``TruncatedContent`` div inside ``review``.

    Args:
        review: a review card element exposing ``find``.

    Returns:
        str: the review text, or an empty string when the card has no
        ``TruncatedContent`` div, so one card without text does not abort
        the whole scrape.
    """
    content = review.find('div', {'class': 'TruncatedContent'})
    return "" if content is None else content.text

In [124]:
import re

# Read the star rating of a review card
def get_review_rating(review):
    """Return the rating found in a review card's star widget.

    The rating is the first run of digits in the ``aria-label`` of the
    ``RatingStars RatingStars__small`` span.

    Args:
        review: a review card element exposing ``find``.

    Returns:
        str | None: the digits as a string (e.g. ``'5'``), or ``None`` when
        the card has no star widget, the widget has no ``aria-label``, or the
        label contains no number.
    """
    stars = review.find('span', {'class': 'RatingStars RatingStars__small'})
    if stars is None:
        return None

    label = stars.get('aria-label') or ''
    found = re.search(r'\d+', label)
    return found.group() if found else None
    

In [125]:
def get_review_detail(review):
    """Summarise one review card as a dict with keys user, review, rating.

    ``user`` and ``review`` come from ``get_reviewer_name`` and
    ``get_review_text``. ``rating`` is the value of ``get_review_rating``, or
    the integer 0 when that function returns ``None``.

    NOTE(review): a rating that is present is passed through as returned by
    ``get_review_rating`` while the fallback is an int, so the column can hold
    mixed types.
    """
    detail = {
        'user': get_reviewer_name(review),
        'review': get_review_text(review),
    }

    found_rating = get_review_rating(review)
    detail['rating'] = 0 if found_rating is None else found_rating

    return detail


In [126]:
# Turn a collection of review cards into a list of review summaries
def get_all_reviews_info(all_reviews):
    """Return ``get_review_detail(card)`` for every card, in input order."""
    return [get_review_detail(card) for card in all_reviews]

In [127]:
# One dict (user, review, rating) per scraped review card.
all_reviews_info = get_all_reviews_info(all_reviews_detail)

In [128]:
# Display the extracted review records.
all_reviews_info

[{'user': 'Miranda Reads',
  'review': " \nStuck at home? Got some time on your hands? Want to start a long series? But you don't want a dud?Check out this\n booktube video\n all about which series are worth your time (and which ones aren't)! \nHere's the Written Review!\nCan you hear me screaming?As expected, the illustrations brought this book to\xa0a whole new level.\xa0I legitimately want to buy another copy, solely to take it apart and\xa0frame it.Each page is just bursting with new life.\xa0Honestly, how could you\xa0not\xa0love such beautiful images?Hogwarts, Hogwarts. Hoggy Hoggy warty warts.I think at this point, everyone and their great-great-grandmother has heard of this book.Harry Potter, orphaned before he was one, was sent to live with his Aunt Petunia and Uncle Vernon. He was always a bit of\xa0an odd child\xa0- much to his family's dismay.Things just...\xa0happened\xa0...around him. Like when he was running from bullies and jumped to the school roof. Or when he got an a

In [129]:
# Build a DataFrame from the review records: one row per review, with the
# columns fixed to user, review, rating.
import pandas as pd
reviews_df = pd.DataFrame(all_reviews_info, columns=['user', 'review', 'rating'])


In [130]:
# Display the reviews table.
reviews_df

Unnamed: 0,user,review,rating
0,Miranda Reads,\nStuck at home? Got some time on your hands?...,5
1,Matthew,Update – 4/4/2022 – Reread out loud to my kids...,5
2,Lora,I'm going to keep this brief since there isn't...,5
3,★ Jess,My original review was a comparison of sorts b...,5
4,"Khanh, first of her name, mother of bunnies",Rereading for the 3rd time for the group read....,5
5,Shep,,5
6,Zoë,I will never ever rate this lower than 5 stars...,5
7,Navessa,There are no words to do this book justice.,5
8,Jayson,(A-) 83% | Very GoodNotes: Despite a weak clim...,4
9,Voldemort,This is a disgrace to all pure bloods. My head...,1


In [131]:
# Collect the review texts as a plain list, in DataFrame row order.
all_reviews_texts = reviews_df['review'].tolist()
# NOTE(review): this prints every review in full; the output can be very long.
print(all_reviews_texts)

[" \nStuck at home? Got some time on your hands? Want to start a long series? But you don't want a dud?Check out this\n booktube video\n all about which series are worth your time (and which ones aren't)! \nHere's the Written Review!\nCan you hear me screaming?As expected, the illustrations brought this book to\xa0a whole new level.\xa0I legitimately want to buy another copy, solely to take it apart and\xa0frame it.Each page is just bursting with new life.\xa0Honestly, how could you\xa0not\xa0love such beautiful images?Hogwarts, Hogwarts. Hoggy Hoggy warty warts.I think at this point, everyone and their great-great-grandmother has heard of this book.Harry Potter, orphaned before he was one, was sent to live with his Aunt Petunia and Uncle Vernon. He was always a bit of\xa0an odd child\xa0- much to his family's dismay.Things just...\xa0happened\xa0...around him. Like when he was running from bullies and jumped to the school roof. Or when he got an atrocious haircut (courtesy of Aunt Pet

In [132]:
# Save the review texts to ./files/<BOOK_FULL_ID>.txt, each followed by a
# newline. The folder is created first so the cell also works on a fresh
# checkout.
os.makedirs('./files', exist_ok=True)
with open(f'./files/{BOOK_FULL_ID}.txt', 'w', encoding='utf-8') as file:
    for review in all_reviews_texts:
        file.write(review + '\n')
    # No explicit close needed: the with-block closes the file on exit.