# Scrapping from goodreads

## Installations, Imports, and loging into browser

In [0]:
!pip install -q MechanicalSoup

In [0]:
import mechanicalsoup
import bs4
import json
import re
from google.colab import files

In [0]:
# Connect to goodreads login
browser = mechanicalsoup.StatefulBrowser()
browser.open("https://www.goodreads.com/user/sign_in?source=home")

# login using credentials
browser.get_current_page()
browser.select_form('form[name="sign_in"]')
browser["user[email]"] = 'supermoz125@gmail.com'
browser["user[password]"] = 'superbat999'
resp = browser.submit_selected()

## Scrapping Genres

In [0]:
# if login successful
dic = {}
if(str(resp) == '<Response [200]>'):
    for page_num in range(1, 6):
        # opening genres
        url = 'https://www.goodreads.com/genres/list?page=' + str(page_num)
        browser.open(url)
        
        # getting all links with specific href
        links = browser.links('/genres/')

        for link in links:
            name = link.getText()
            href = 'https://www.goodreads.com/shelf/show/' + name
            dic[name] = href
        
    with open("genres.json", "w", encoding="utf-8") as myfile:
        json.dump(dic, myfile, indent=4, ensure_ascii=False,)

else:
    print("oh noo!")

## Scrapping Book Lists

In [0]:
dic = {}
with open("genres.json", encoding="utf8") as myfile:

  data = json.load(myfile)
  sr = 1
  for shelf, href in data.items():
    dic[shelf] = {}

    for page_num in range(1, 10):
      browser.open(href + '?page=' + str(page_num))

      links = browser.links(class_='bookTitle')
      
      for link in links:
          name = link.getText()
          href_ = 'https://www.goodreads.com' + link.get('href')
          dic[shelf][name] = href_

      if len(links) < 50: 
        break
    
    print(sr, shelf, "done")
    sr += 1
      
  with open("book_list.json", "w", encoding="utf-8") as myfile:
    json.dump(dic, myfile, indent=4, ensure_ascii=False,) 


## Scrapping Books


In [0]:
def getBook(url):
  browser = mechanicalsoup.StatefulBrowser()
  browser.open(url)
  page = browser.get_current_page()
  page = bs4.BeautifulSoup(str(page.select("body")))

  details = {}

  # details # id, div for date, and other details

  # GENRES
  genres = []
  for link in browser.links('/genres/', class_='actionLinkLite'):
    genres.append(link.getText())
  details['genres'] = genres

  # COVER IMAGE
  img = page.find(id='coverImage')['src']
  details['cover_image'] = img

  # AUTHORS
  authors = []
  for author in browser.links(class_='authorName'):
    authors.append(author.getText())
  details['authors'] = authors

  # AVERAGE RATING
  avg_ratings = page.find(itemprop='ratingValue').text
  details['avg_ratings'] = avg_ratings

  # DESCRIPTION
  details['description'] = "No description";
  raw_description = page.find(id='description')
  if raw_description != None:
    des1 = raw_description.select_one('span[style]')
    des2 = raw_description.select_one('span')
    if des1 != None:
      details['description'] = des1.text
    else:
      details['description'] = des2.text

  # ONLINE LINKS
  links = {}
  for link in browser.links('/book_link/'):
    links[link.getText()] = 'https://www.goodreads.com/' + link.get('href')
  details['links'] = links


  # REVIEWS
  raw_reviews = page.find_all(class_="reviewText stacked")
  reviews = []
  for raw_review in raw_reviews:
    review1 = raw_review.span.select_one('span[style]')
    review2 = raw_review.span.select_one('span')
    if review1 == None:
      reviews.append(review2.text)
    else:
      reviews.append(review1.text)

  details['reviews'] = reviews
  
  # BOOK DETAILS
  raw_details = page.find(class_='uitext darkGreyText')

  top_details = ""
  for span in raw_details.div.find_all('span'):
    top_details += span.text + " "
  details['top_details'] = top_details.strip()

  published = raw_details.div.find_next_sibling().text
  published = re.sub('\s+', ' ', published)
  details['published'] = published.strip()


  infoKeys = raw_details.find_all(class_='infoBoxRowTitle')
  infoValues = raw_details.find_all(class_='infoBoxRowItem')
  for x in range(len(infoKeys) - 1):
    key = re.sub('\s+', ' ', infoKeys[x].text)
    
    value = re.sub('\s+', ' ', infoValues[x].text)
    value = re.sub('...more', '', value)
    value = re.sub('...less', '', value)

    details[key.strip()] = value.strip()

  return details

In [0]:
with open("book_list.json", encoding="utf8") as myfile:
  data = json.load(myfile)
  for shelf in data:
    book_list = {}
    for book_name, url in data[shelf].items():
      if x == 0: break
      book_list[book_name] = getBook(url)
      x -= 1
    
    with open("books/" + shelf + ".json", "w", encoding="utf8") as myfile:
      json.dump(book_list, myfile, indent=4, ensure_ascii=False,) 
    break


In [0]:
browser = mechanicalsoup.StatefulBrowser()
browser.open("https://www.goodreads.com/book/show/1051091.Dude_You_re_a_Fag")
page = browser.get_current_page()
page = bs4.BeautifulSoup(str(page))

In [19]:
data = getBook("https://www.goodreads.com/book/show/52357.Beowulf")
print(data)

{'genres': ['Classics', 'Poetry', 'Fiction', 'Fantasy', 'Academic', 'School', 'Fantasy', 'Mythology'], 'cover_image': 'https://i.gr-assets.com/images/S/compressed.photo.goodreads.com/books/1327878125l/52357.jpg', 'authors': ['Unknown', 'Seamus Heaney', 'Francis Barton Gummere'], 'avg_ratings': '\n  3.44\n', 'description': 'Beowulf is a major epic of Anglo-Saxon literature, probably composed between the first half of the seventh century and the end of the first millennium. The poem was inspired by the oral tradition Anglo-Saxon and Germanic transcribed a verse epic, recounting the exploits of Beowulf hero who gave his name to the poem, on which are grafted chretiens additions.', 'links': {'Audible': 'https://www.goodreads.com//book_link/follow/10?book_id=52357&source=dropdown', 'Barnes & Noble': 'https://www.goodreads.com//book_link/follow/3?book_id=52357&source=dropdown', 'Walmart eBooks': 'https://www.goodreads.com//book_link/follow/1027?book_id=52357&source=dropdown', 'Apple Books': 

In [20]:
import pprint

pp = pprint.PrettyPrinter(depth=6)

pp.pprint(data)

{'Characters': 'Beowulf, Grendel, Onela, Unferth, Breca, Wiglaf, Wealhtheow, '
               'Hrothgar',
 'Edition Language': 'English',
 'ISBN': '0393320979 (ISBN13: 9780393320978)',
 'authors': ['Unknown', 'Seamus Heaney', 'Francis Barton Gummere'],
 'avg_ratings': '\n  3.44\n',
 'cover_image': 'https://i.gr-assets.com/images/S/compressed.photo.goodreads.com/books/1327878125l/52357.jpg',
 'description': 'Beowulf is a major epic of Anglo-Saxon literature, probably '
                'composed between the first half of the seventh century and '
                'the end of the first millennium. The poem was inspired by the '
                'oral tradition Anglo-Saxon and Germanic transcribed a verse '
                'epic, recounting the exploits of Beowulf hero who gave his '
                'name to the poem, on which are grafted chretiens additions.',
 'genres': ['Classics',
            'Poetry',
            'Fiction',
            'Fantasy',
            'Academic',
            'Sch