In [30]:
import requests 
from bs4 import BeautifulSoup
import re
import pandas as pd
import numpy as cp
import itertools


# Scraping Books

## Step 1: Scraping book links or titles

In [31]:
def scrapeBookTags(link):
  response = requests.get(link)
  book_soup = BeautifulSoup(response.text, 'html.parser')

  book_tags = book_soup.find_all('a', attrs={'class': 'link'})

  book_tags = [tag.attrs['href'] for tag in book_tags
                if tag.attrs['href'].startswith('/ebooks') & tag.attrs['href'][-1].isdigit()]

  book_tags = list(dict.fromkeys(book_tags))

  return book_tags

book_tags = []
indices = ['0', '25', '50', '75', '100']
url = "https://www.gutenberg.org/ebooks/search/?sort_order=downloads&start_index="

for index in indices:
  book_tags = book_tags + scrapeBookTags(url+index)

print("In total we have " + str(len(book_tags)) + " book titles") # Comment out afterwards
print("Displaying 10 titles") # Comment out afterwards
print(book_tags[:10]) # Comment out afterwards

In total we have 125 book titles
Displaying 10 titles
['/ebooks/1342', '/ebooks/84', '/ebooks/11', '/ebooks/16328', '/ebooks/25344', '/ebooks/1661', '/ebooks/2701', '/ebooks/1952', '/ebooks/1232', '/ebooks/174']


## Step 2: Extract the longest paragraph and title from each book and clean text

In [32]:
def cleanTitle(title):
    # try remove 'gutenberg ebook of ' and ', by .....'
    title = title.lower()
    if (title.find('gutenberg ebook of ') != -1):
        title = title[title.find('gutenberg ebook of ') + len('gutenberg ebook of '):]
    if (title.find('gutenberg e-text of ') != -1):
        title = title[title.find('gutenberg e-text of ') + len('gutenberg e-text of '):]
    if (title.find('gutenberg ebook ') != -1):
        title = title[title.find('gutenberg ebook ') + len('gutenberg ebook '):]
    if (title.find('gutenberg book of ') != -1):
        title = title[title.find('gutenberg book of ') + len('gutenberg book of '):]
    if (title.find('gutenberg book ') != -1):
        title = title[title.find('gutenberg book ') + len('gutenberg book '):]

    if (title.find(', by') != -1):
        title = title[:title.find(', by')]
    if (title.find(',by') != -1):
        title = title[:title.find(',by')]
    if (title.find('. by') != -1):
        title = title[:title.find('. by')]
    if (title.find('.by') != -1):
        title = title[:title.find('.by')]
    if (title.find('translated by') != -1):
        title = title[:title.find('translated by')]
    return title

In [33]:
def cleanText(text):
  text = re.sub(r'[^\x00-\x7F]+',' ', text) # replace non ascii char
  text = text.replace('\r', '  ').replace('\n', '  ').replace('\t', '  ').strip() # remove \r \n \t and spaces in head and tail
  text = text.replace(':', ': ').replace(';', '; ').replace(',', ', ').replace('.', '. ') # add space after the symbols

  while '  ' in text:
      text = text.replace('  ',  ' ') # remove redundant spaces
  text = text.replace(' :', ':').replace(' ;', ';').replace(' ,', ',').replace(' .', '.') # remove space before the symbols

  return text

In [34]:
list_of_tags = [tag.replace("/ebooks/", "") for tag in book_tags]

book_links = [f"https://www.gutenberg.org/files/{tag}/{tag}-h/{tag}-h.htm" for tag in list_of_tags]

bookDf = pd.DataFrame(data={'Book': [], 'Url': [], 'Paragraph': []})

max_paragraphs = 100

for index, book_link in enumerate(book_links):
    print('\rProcessing[' + str(index).zfill(3) + ']: ' + book_link, end = ' ', flush=True)

    # Getting book
    request = requests.get(book_link)
    # Check if book has a htm online reading
    if (request.status_code == 404):
        print(f"\nError, link {book_link} was not found.")
        continue
    
    # Reading book
    encoding = request.encoding if 'charset' in request.headers.get('content-type', '').lower() else None
    book = BeautifulSoup(request.content, from_encoding=encoding)

    # Get title
    title = cleanText(book.title.get_text()).replace('"', '')
    title = cleanTitle(title=title)

    # Get paragraphs
    paragraphs = book.find_all('p')
    paragraphs = [cleanText(paragraph.get_text()) for paragraph in paragraphs]

    # Save the longest paragraph
    paragraph = max(paragraphs, key=len)
    bookDf.loc[bookDf.shape[0]] = [title, book_link, paragraph]
    
    del request, encoding, book, title, paragraphs, paragraph

    # stop scraping if the limit of 100 paragraphs have been scraped
    if len(bookDf.index) == max_paragraphs:
        print("\n100 paragraphs have been scraped, ending process")
        break
    
bookDf


Processing[061]: https://www.gutenberg.org/files/26184/26184-h/26184-h.htm 
Error, link https://www.gutenberg.org/files/26184/26184-h/26184-h.htm was not found.
Processing[100]: https://www.gutenberg.org/files/3825/3825-h/3825-h.htm 
100 paragraphs have been scraped, ending process


Unnamed: 0,Book,Url,Paragraph
0,pride and prejudice,https://www.gutenberg.org/files/1342/1342-h/13...,"By this time, my dearest sister, you have rece..."
1,frankenstein,https://www.gutenberg.org/files/84/84-h/84-h.htm,One of the phenomena which had peculiarly attr...
2,alice s adventures in wonderland,https://www.gutenberg.org/files/11/11-h/11-h.htm,"You are old, Father William, the young man sai..."
3,beowulf: an anglo-saxon epic poem,https://www.gutenberg.org/files/16328/16328-h/...,Perhaps every Anglo-Saxon scholar has his own ...
4,the scarlet letter,https://www.gutenberg.org/files/25344/25344-h/...,"The father of the Custom-House the patriarch, ..."
...,...,...,...
95,"the extraordinary adventures of ars ne lupin, ...",https://www.gutenberg.org/files/6133/6133-h/61...,"And then, exclaimed Ars ne Lupin, I held in my..."
96,the entire original maupassant short stories,https://www.gutenberg.org/files/3090/3090-h/30...,For a whole week his mind was occupied with th...
97,sense and sensibility,https://www.gutenberg.org/files/161/161-h/161-...,"That is exactly what I said, my dear. Lord! sa..."
98,the mysterious affair at styles,https://www.gutenberg.org/files/863/863-h/863-...,"One can only guess, but I believe my guess to ..."


## Step 3: Output into csv 


In [35]:
bookDf.to_csv('gutenBerg_100_paragraphs.csv', index=False)