In [1]:
import csv
import os
import time
import urllib.parse
import urllib.request

import requests
from bs4 import BeautifulSoup as _BeautifulSoup
from dotenv import load_dotenv

In [2]:
# Load settings from the local .env file (python-dotenv) so that scrape() can
# read the 'cookies' entry through os.environ.
load_dotenv()

In [3]:
# Shelf names to scrape. Each name is inserted into the shelf URL and is also
# used as the base name of the CSV file the rows are written to.
genre = [
    'mystery',
    'nonfiction',
    'poetry',
    'religion',
    'romance',
    'science-fiction',
    'self-help',
    'suspense',
    'spirituality',
    'sports',
    'thriller',
    'travel',
    'young-adult',
]

In [4]:
def create_csv(genre):
    """Make sure ``<genre>.csv`` exists and starts with the column header row.

    The file is opened in append mode, so existing contents are never
    truncated. The header is written only when the file is empty; calling
    this again on a file that already holds data (for example when the
    notebook is re-run) does not insert a second header row between the
    data rows.

    Args:
        genre: Shelf name, used as the base name of the CSV file in the
            current working directory.
    """
    fieldnames = ['Id', 'Title', 'Author', 'Average Rating', 'Number of ratings', 'Year of publishing', 'Url of image', 'Genre']
    with open(genre + ".csv", "a", newline="", encoding="utf-8") as csv_file:
        # A stream opened for appending starts positioned at end-of-file, so
        # position 0 means the file is new or empty.
        if csv_file.tell() == 0:
            writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
            writer.writeheader()

In [5]:
def get_id(titles):
    """Return the book identifier taken from each title link's ``href``.

    The identifier is everything after the leading ``/book/show/`` path
    prefix. An ``href`` that does not start with that prefix is returned
    unchanged.

    Args:
        titles: Iterable of elements that support ``element['href']``.

    Returns:
        A list of identifier strings, in input order.
    """
    prefix = '/book/show/'
    book_id = []
    for link in titles:
        href = link['href']
        # str.strip(chars) removes any of the given *characters* from both
        # ends, so it would also eat trailing letters of the slug (for
        # example the final "s" of "...-hunger-games"). Drop the exact
        # prefix instead.
        if href.startswith(prefix):
            href = href[len(prefix):]
        book_id.append(href)
    return book_id
def get_title(titles):
    """Collect the ``contents`` attribute of every element in ``titles``.

    Returns:
        A list with one entry per input element, in input order; each entry
        is that element's ``contents`` value as-is.
    """
    return [link.contents for link in titles]

In [6]:
def get_author(authors):
    """Collect the contents of the first ``span`` found inside each element.

    Args:
        authors: Iterable of elements exposing ``find('span')``; the object
            returned by ``find`` must have a ``contents`` attribute.

    Returns:
        A list with one ``contents`` value per input element, in input order.
    """
    return [link.find('span').contents for link in authors]

In [7]:
def get_details(details):
    """Split each details element into its cleaned-up text fields.

    For every element the ``contents`` are joined into one string, which is
    split on the em dash character. Each piece then goes through four
    successive ``str.strip`` calls. Note that ``str.strip`` treats its
    argument as a set of characters to remove from both ends, not as a
    literal prefix or suffix.

    NOTE(review): the text presumably reads
    "avg rating X — N ratings — published Y"; confirm against the page markup.

    Args:
        details: Sequence of elements whose ``contents`` is a list of strings.

    Returns:
        A list with one list of stripped field strings per input element.
    """
    details_list = []
    for element in details:
        raw_text = ''.join(element.contents)
        fields = []
        for part in raw_text.split('—'):
            # Same strip chain as before, one character set per step.
            part = part.strip('\n                ')
            part = part.strip('avg rating ')
            part = part.strip(' ratings')
            part = part.strip('published ')
            fields.append(part)
        details_list.append(fields)
    return details_list

In [8]:
def get_image(images):
    """Return the cover image URL for each element, or ``'Null'`` if absent.

    For every element the first ``img`` descendant is looked up and its
    ``src`` attribute is taken. The placeholder string ``'Null'`` is used
    when the element itself is ``None``, when it contains no ``img``, or
    when the ``img`` has no ``src`` attribute, so one missing cover does not
    abort the whole page.

    Args:
        images: Iterable of elements exposing ``find('img')`` (or ``None``).

    Returns:
        A list of URL strings / ``'Null'`` placeholders, in input order.
    """
    img = []
    for link in images:
        # Identity checks are used for None so a custom __eq__/__ne__ on the
        # element type can never influence the result.
        img_tag = link.find('img') if link is not None else None
        src = img_tag.get('src') if img_tag is not None else None
        img.append(src if src is not None else 'Null')
    return img

In [9]:
def write_data(book_id, titles_content, author_name, details_list, img, genre):
    """Append one CSV row per book to ``<genre>.csv``.

    The lists are paired by position: row ``i`` is built from element ``i``
    of every list. No header row is written here.

    Args:
        book_id: List of book identifiers; its length sets the row count.
        titles_content: Per-book list of string fragments, joined into the title.
        author_name: Per-book list of string fragments, joined into the author.
        details_list: Per-book list ``[average rating, ratings count, year]``
            of strings. The count may contain thousands separators; the year
            may be an empty string or missing entirely, in which case
            ``'Null'`` is written.
        img: Per-book image URL (or placeholder) strings.
        genre: Shelf name; written in the ``Genre`` column and used as the
            base name of the CSV file.

    Raises:
        IndexError: If any list other than ``book_id`` is shorter than
            ``book_id``, or a details entry has fewer than two fields.
        ValueError: If a rating, count, or non-empty year is not numeric.
    """
    fieldnames = ['Id', 'Title', 'Author', 'Average Rating', 'Number of ratings', 'Year of publishing', 'Url of image', 'Genre']
    with open(genre + ".csv", "a", newline="", encoding="utf-8") as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        for index, identifier in enumerate(book_id):
            details = details_list[index]
            # A details entry without a third field is treated like an empty
            # year instead of raising IndexError.
            year = details[2] if len(details) > 2 else ''
            # A fresh dict per book (not named `dict`, which would shadow the
            # builtin) so no value can leak from one row into the next.
            row = {
                'Id': identifier,
                'Title': ''.join(titles_content[index]),
                'Author': ''.join(author_name[index]),
                'Average Rating': float(details[0]),
                'Number of ratings': int(details[1].replace(',', '')),
                'Year of publishing': int(year) if year != '' else 'Null',
                'Url of image': img[index],
                'Genre': genre,
            }
            writer.writerow(row)

In [10]:
def scrape(genre, page_number, timeout=30, delay=1.0):
    """Scrape pages 1..``page_number`` of a Goodreads shelf into ``<genre>.csv``.

    Each page is fetched with browser-like headers plus the cookie string
    read from the ``cookies`` environment variable, parsed with
    BeautifulSoup, and the extracted rows are appended to the CSV through
    ``write_data``. A page that comes back with a non-success HTTP status is
    reported and skipped rather than parsed.

    Args:
        genre: Shelf name; inserted into the URL and used as the CSV base name.
        page_number: Number of pages to fetch, starting at page 1.
        timeout: Seconds to wait for the server before giving up on a
            request. Without a timeout a stalled connection blocks forever.
        delay: Seconds to pause between consecutive page requests; pass 0
            to disable the pause.

    Raises:
        requests.exceptions.RequestException: On connection errors or timeouts.
    """
    # Everything except the referer is identical for all pages, so it is
    # built once outside the loop.
    base_headers = {'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
                    'accept-encoding': 'gzip, deflate, br',
                    'accept-language': 'en-GB,en;q=0.9,en-US;q=0.8,ml;q=0.7',
                    'cache-control': 'max-age=0',
                    'dnt': '1', 'sec-fetch-dest': 'document',
                    'sec-fetch-mode': 'navigate',
                    'sec-fetch-site': 'none',
                    'sec-fetch-user': '?1',
                    'upgrade-insecure-requests': '1',
                    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36',
                    'Cookie': os.environ.get('cookies')      #load this from your .env file which stores _session_id2 cookie
                   }

    # The context manager closes the session's pooled connections on exit,
    # including when a request raises.
    with requests.Session() as session_requests:
        for page in range(1, page_number + 1):
            if page > 1 and delay:
                time.sleep(delay)

            url = 'https://www.goodreads.com/shelf/show/' + genre + '?page=' + str(page)
            headers = dict(base_headers, referer=url)
            result = session_requests.get(url, headers=headers, timeout=timeout)

            # An error page has none of the expected markup; say so instead
            # of silently writing nothing for this page.
            if not result.ok:
                print('Skipping', url, '- HTTP status', result.status_code)
                continue

            bs4_page = _BeautifulSoup(result.content, features="html.parser")

            titles = bs4_page.find_all('a', class_='bookTitle')
            titles_content = get_title(titles)

            book_id = get_id(titles)

            # NOTE(review): this collects every author link on the page. If a
            # book can list more than one author (confirm against the live
            # markup), this list is longer than `titles` and the positional
            # pairing done in write_data drifts for all following books.
            authors = bs4_page.find_all('a', class_='authorName')
            author_name = get_author(authors)

            details = bs4_page.find_all('span', class_='greyText smallText')
            details_list = get_details(details)

            images = bs4_page.find_all('a', class_='leftAlignedImage')
            img = get_image(images)

            write_data(book_id, titles_content, author_name, details_list, img, genre)



In [11]:
# Driver: for every shelf name, write the CSV header row to <shelf>.csv and
# then append the rows scraped from the first 25 pages of that shelf.
for i in genre:
    create_csv(i)
    scrape(i, 25)