# Webscraping Notebook

## Setup and imports

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import requests
from bs4 import BeautifulSoup
import math
import numpy as np
import pandas as pd

## Scrape list of books and urls 

In [None]:
# First page of the Goodreads "fiction" shelf listing.
page = "1"
url = f"https://www.goodreads.com/shelf/show/fiction?page={page}"
url

In [None]:
# Download the shelf listing page and parse its HTML.
response=requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")

In [None]:
# Visible text of every <a class="bookTitle"> link, stripped of surrounding whitespace.
book_titles = [book.text.strip() for book in soup.find_all('a', class_='bookTitle')]

In [None]:
len(book_titles)

In [None]:
book_titles[1]

In [None]:
# NOTE(review): the "tkamb" prefix presumably stands for the title found at
# index 1 of the live listing -- the listing order is not visible here; confirm.
tkamb_title = book_titles[1]

In [None]:
# Absolute URL for each listed book: site root + the link's href.
book_urls = ['https://www.goodreads.com' + book['href'] for book in soup.find_all('a', class_='bookTitle')]

In [None]:
len(books)

In [None]:
books[1]

## Get top reviews for each book

In [None]:
response=requests.get(books[1])
soup = BeautifulSoup(response.content, "html.parser")

In [None]:
type(soup)

In [None]:
len(soup.find_all('section', 'ReviewText'))

In [None]:
len(soup.find_all('article', 'ReviewCard'))

In [None]:
review_containers = soup.find_all('article', 'ReviewCard')
reviews = []
for container in review_containers:
    review_text = container.find('section', 'ReviewText').text.strip()
    reviews.append(review_text)

In [None]:
len(reviews)

In [None]:
type(reviews[1])

In [None]:
reviews[0]

In [None]:
tkamb_reviews = reviews.copy()

## Get the average rating and title

In [None]:
# aria-label text of the first <div> carrying the RatingStatistics__column class.
rating = soup.find('div', 'RatingStatistics__column')["aria-label"]

In [None]:
rating

In [None]:
# NOTE(review): characters 18-21 are presumably the numeric rating inside the
# label text -- the label itself is not shown here; confirm the offsets.
float(rating[18:22])

In [None]:
# Same slice as above, kept as a float for later cells.
book_rating = float(rating[18:22])

In [None]:
# Text of the first element returned by select().
# NOTE(review): `class_` is a find()/find_all() keyword; select() takes a CSS
# selector string, so this keyword may not be filtering by class -- confirm.
book_title = soup.select('h1', class_='Text Text__Title1')[0].text

## Saving title, rating, and 30 reviews together in a class

In [None]:
class BookInfo:
    """Container for one book's scraped details: title, rating and reviews."""

    def __init__(self, rating, reviews, title):
        """Store the three values exactly as given (no copying or validation)."""
        self.title, self.rating, self.reviews = title, rating, reviews

    def get_title(self):
        """Return the stored title."""
        return self.title

    def get_rating(self):
        """Return the stored rating."""
        return self.rating

    def get_reviews(self):
        """Return the stored reviews object (the same object that was passed in)."""
        return self.reviews

In [None]:
book1 = BookInfo(rating=tkamb_rating, reviews=tkamb_reviews, title=tkamb_title)

In [None]:
book1.get_rating()

## Saving data into dataframe

In [None]:
print(book_urls[0])

In [None]:
response=requests.get(book_urls[0])
soup = BeautifulSoup(response.content, "html.parser")

In [None]:
review_containers = soup.find_all('article', 'ReviewCard')
reviews = []
for container in review_containers:
    review_text = container.find('section', 'ReviewText').text.strip()
    reviews.append(review_text)

In [None]:
print(reviews[0])

In [None]:
rating = soup.find('div', 'RatingStatistics__column')["aria-label"][18:22]

In [None]:
print(rating)

In [None]:
book_title = soup.select('h1', class_='Text Text__Title1')[0].text

In [None]:
print(book_title)

In [None]:
book_dict = {
    'title': book_title,
    'rating': rating
}

In [None]:
reviews_titles = [f'Review {str(num)}' for num in range(1,31)]

In [None]:
reviews_titles

In [None]:
for heading, review in zip(reviews_titles, reviews):
    book_dict[heading] = review

In [None]:
print(book_dict['Review 5'])

In [None]:
books_df = pd.DataFrame(book_dict, index=[0])

In [None]:
books_df.info()

# Making functions

## Retrieving book urls

In [None]:
def retrieve_books(num_books, shelf="fiction"):
    """Collect Goodreads book-page URLs from a shelf listing.

    Listing pages are requested in order, starting at page 1. On each page the
    ``href`` of every ``<a class="bookTitle">`` link is turned into an
    absolute URL.

    Args:
        num_books: Number of book URLs wanted. Zero or a negative number
            returns an empty list without making any request.
        shelf: Shelf name placed in the listing URL. Defaults to ``"fiction"``.

    Returns:
        A list of absolute book URLs in listing order: every link from each
        full page, plus the first ``num_books % 50`` links of the page after.
    """
    if num_books <= 0:
        return []

    # Assumes a full listing page holds 50 books -- the listing markup is not
    # visible from this code; confirm against the live site.
    books_per_page = 50
    full_pages = math.floor(num_books / books_per_page)
    leftover = num_books % books_per_page

    # One entry per page to request: None keeps every link on the page,
    # a number caps how many links find_all() returns.
    page_limits = [None] * full_pages
    if leftover:
        page_limits.append(leftover)

    book_urls = []
    for page, limit in enumerate(page_limits, start=1):
        url = f"https://www.goodreads.com/shelf/show/{shelf}?page={page}"
        response = requests.get(url)
        soup = BeautifulSoup(response.content, "html.parser")
        for book in soup.find_all('a', class_='bookTitle', limit=limit):
            book_urls.append('https://www.goodreads.com' + book['href'])
    return book_urls

In [None]:
# NOTE(review): no visible cell assigns the result of retrieve_books(), so
# `book_urls` here is presumably still the list built from the shelf page
# earlier in the notebook -- confirm.
len(book_urls)

## Retrieving top 30 reviews for each book and storing in dataframe

In [None]:
def retrieve_data_for_book(book_url):
    """Scrape the rating, title and review texts from one book page.

    Args:
        book_url: URL of the book page to fetch.

    Returns:
        A dict with ``'rating'`` (float), ``'book_title'`` (str) and one
        ``'Review N'`` entry per review text found, N counting from 1 in
        page order.

    Raises:
        AttributeError: If the page has no ``RatingStatistics__rating`` div.
        IndexError: If the page has no ``<h1>`` element.
    """
    response = requests.get(book_url)
    soup = BeautifulSoup(response.content, "html.parser")

    reviews = []
    for card in soup.find_all('article', 'ReviewCard'):
        text_section = card.find('section', 'ReviewText')
        # A card without a text section has nothing to record; skipping it
        # avoids calling .text on None.
        if text_section is not None:
            reviews.append(text_section.text.strip())

    book_dict = {
        'rating': float(soup.find('div', class_='RatingStatistics__rating').text),
        # select() matches on its CSS selector string only, so the title is the
        # text of the first <h1> on the page.
        'book_title': soup.select('h1')[0].text,
    }
    for num, review in enumerate(reviews, start=1):
        book_dict[f'Review {num}'] = review
    return book_dict

In [None]:
def retrieve_data_for_all_books_in_df(book_urls):
    """Scrape every URL in ``book_urls`` and tabulate the results.

    Args:
        book_urls: Iterable of book page URLs; each one is passed to
            ``retrieve_data_for_book``.

    Returns:
        A ``pandas.DataFrame`` with one row per URL, in input order. Columns
        are the union of the keys of the per-book dicts; a key missing for a
        given book is left as NaN in that row. An empty input gives an empty
        frame.
    """
    book_dicts = [retrieve_data_for_book(book_url) for book_url in book_urls]
    return pd.DataFrame(book_dicts)

In [None]:
# Build the table from the first five URLs only.
books_df = retrieve_data_for_all_books_in_df(book_urls[:5])

## Testing Stuff

In [None]:
book_urls[4]

In [None]:
response=requests.get(book_urls[4])
soup = BeautifulSoup(response.content, "html.parser")
book_title = soup.select('h1', class_='Text Text__Title1')[0].text
book_title

In [None]:
response=requests.get(book_urls[2])
book_rating = float(soup.find('div', class_='RatingStatistics__rating').text)
book_rating