In [1]:
import requests 
from bs4 import BeautifulSoup
import re
import pandas as pd
import numpy as cp
import itertools


# Scraping Books

## Step 1: Scrape the book links from the search result pages

In [None]:
def scrapeBookTags(link, timeout=30):
  """Collect the unique book paths linked from one search-result page.

  Args:
    link: URL of the page to download.
    timeout: Seconds to wait for the server before giving up.

  Returns:
    The href values of every <a class="link"> element whose href starts
    with '/ebooks' and ends in a digit, de-duplicated, in page order.

  Raises:
    requests.RequestException: the download failed, timed out, or the
      server answered with an HTTP error status.
  """
  response = requests.get(link, timeout=timeout)
  # Fail loudly instead of silently parsing an error page into zero results
  response.raise_for_status()
  book_soup = BeautifulSoup(response.text, 'html.parser')

  # An anchor may carry no href at all; treat that as an empty string
  hrefs = [tag.attrs.get('href', '')
           for tag in book_soup.find_all('a', attrs={'class': 'link'})]

  # `and` short-circuits, so an empty href never reaches the [-1] lookup
  book_tags = [href for href in hrefs
               if href.startswith('/ebooks') and href[-1].isdigit()]

  # dict keys keep first-seen order, so duplicates are dropped stably
  return list(dict.fromkeys(book_tags))

# Search-result listing sorted by download count; the start offset is appended
url = "https://www.gutenberg.org/ebooks/search/?sort_order=downloads&start_index="
# Start offsets of the result pages to visit
indices = ['0', '25', '50', '75']

# Visit each result page in turn and pool the book paths it yields
book_tags = []
for index in indices:
  book_tags.extend(scrapeBookTags(url + index))

# Debug output below -- comment out afterwards
print("In total we have " + str(len(book_tags)) + " book titles")
print("Displaying 10 titles")
print(book_tags[:10])

In total we have 100 book titles
Displaying 10 titles
['/ebooks/1342', '/ebooks/84', '/ebooks/11', '/ebooks/16328', '/ebooks/1661', '/ebooks/2701', '/ebooks/25344', '/ebooks/1232', '/ebooks/1952', '/ebooks/174']


## Step 2: Extract the longest paragraph from each book and clean text

In [None]:
def cleanTitle(title):
    """Reduce a Project Gutenberg page title to the bare book title.

    Everything up to and including a 'gutenberg ebook of ' style prefix is
    removed, and the title is cut at the first author/translator credit
    (', by', '. by', 'translated by', ...). Matching is case-insensitive.

    Args:
        title: The page title text.

    Returns:
        The title without the Gutenberg prefix or the credit, with
        surrounding whitespace removed.
    """
    # Boilerplate that precedes the real title. Order matters: the longer
    # '... of ' forms must be tried before their shorter variants.
    prefixes = (
        'gutenberg ebook of ',
        'gutenberg e-text of ',
        'gutenberg ebook ',
        'gutenberg book of ',
        'gutenberg book ',
    )
    # Markers that introduce the author / translator credit
    bylines = (', by', ',by', '. by', '.by', 'translated by')

    # Search the original string case-insensitively instead of a lowercased
    # copy: lower() can change the length of non-ASCII text, which would make
    # an index found in the copy point at the wrong character of the original.
    for prefix in prefixes:
        found = re.search(re.escape(prefix), title, flags=re.IGNORECASE)
        if found:
            title = title[found.end():]

    for byline in bylines:
        # \b keeps 'by' a whole word, so ', Bygone' is not taken for a credit
        found = re.search(re.escape(byline) + r'\b', title, flags=re.IGNORECASE)
        if found:
            # Drop the separator left dangling in front of the credit
            title = title[:found.start()].rstrip(' ,;:')

    return title.strip()

In [None]:
def cleanText(text):
  """Normalise scraped text to single-spaced ASCII with tidy punctuation.

  Non-ASCII runs, carriage returns, newlines and tabs become spaces; each
  ':', ';', ',' and '.' is followed by a space and never preceded by one;
  runs of spaces collapse to a single space.

  Args:
    text: The raw text to clean.

  Returns:
    The cleaned text without leading or trailing whitespace.
  """
  text = re.sub(r'[^\x00-\x7F]+', ' ', text) # replace non ascii char
  text = re.sub(r'[\r\n\t]', ' ', text) # line breaks and tabs become spaces
  text = re.sub(r'([:;,.])', r'\1 ', text) # add space after the symbols
  text = re.sub(r' {2,}', ' ', text) # remove redundant spaces
  text = re.sub(r' ([:;,.])', r'\1', text) # remove space before the symbols

  # Strip last: the space added after a final '.' would otherwise survive
  return text.strip()

In [None]:
# Numeric part of each book path, e.g. '/ebooks/1342' -> '1342'
list_of_tags = [tag.replace("/ebooks/", "") for tag in book_tags]

book_links = [f"https://www.gutenberg.org/files/{tag}/{tag}-h/{tag}-h.htm" for tag in list_of_tags]

# Rows are gathered in a plain list and turned into a DataFrame once at the
# end, instead of enlarging the frame through .loc on every iteration.
book_rows = []
for index, book_link in enumerate(book_links):
    print('\rProcessing[' + str(index).zfill(3) + ']: ' + book_link, end = '', flush=True)

    # Getting book (the timeout keeps one stalled download from hanging the run)
    r = requests.get(book_link, timeout=30)
    # Skip books without an online htm edition (404) and any other failed
    # download, so an error page is never parsed as if it were a book
    if not r.ok:
        continue

    # Reading book: pass the response encoding on only when the server
    # declared a charset; otherwise BeautifulSoup is given no encoding hint
    encoding = r.encoding if 'charset' in r.headers.get('content-type', '').lower() else None
    # Parser named explicitly so the result does not depend on which optional
    # parsers happen to be installed (and matches scrapeBookTags)
    book = BeautifulSoup(r.content, 'html.parser', from_encoding=encoding)

    # Get title; a page without a <title> element yields an empty title
    raw_title = book.title.get_text() if book.title is not None else ''
    title = cleanTitle(title=cleanText(raw_title).replace('"', ''))

    # Get paragraphs
    paragraphs = [cleanText(paragraph.get_text()) for paragraph in book.find_all('p')]
    if not paragraphs:
        # max() raises ValueError on an empty sequence; nothing to keep here
        continue

    # Save the longest paragraph
    book_rows.append({'Book': title, 'Url': book_link, 'Paragraph': max(paragraphs, key=len)})

bookDf = pd.DataFrame(book_rows, columns=['Book', 'Url', 'Paragraph'])


Processing[099]: https://www.gutenberg.org/files/10/10-h/10-h.htm

## Step 3: Write the results to a CSV file


In [None]:
# Write the collected rows to disk; index=False leaves the row numbers out
bookDf.to_csv(path_or_buf='gutenBerg_100_paragraphs.csv', index=False)