In [1]:
import json
import re

import requests
from bs4 import BeautifulSoup

### Helper functions

In [43]:
def scrapeText(url: str, currentPage: int, timeout: float = 30) -> str:
    """Download consecutive pages of a paginated text and collect their paragraphs.

    Parameters
    ----------
    url
        URL template containing one ``{}`` placeholder. Page 1 is requested
        with an empty placeholder, every other page N with ``_N``.
    currentPage
        Page number to start from. A falsy value (e.g. 0) skips scraping and
        returns an empty string.
    timeout
        Seconds passed to ``requests.get`` as the response timeout, so a
        stalled connection cannot hang the notebook forever.

    Returns
    -------
    str
        The text of every ``<p>`` found inside ``<div class="text">`` on each
        page, each paragraph followed by ``'%'`` as a delimiter.

    Scraping stops at the first page whose title contains ``404`` or ``301``,
    or that has no ``<div class="text">`` container.
    """
    paragraphTexts = []

    while currentPage:
        pageSuffix = '' if currentPage == 1 else '_' + str(currentPage)
        html = requests.get(url.format(pageSuffix), timeout=timeout).text
        soup = BeautifulSoup(html, 'lxml')

        # The end of the text is recognised by the page title mentioning an
        # HTTP error code. A page without <title> is not treated as an error
        # page (and no longer crashes on `None.text`).
        titleTag = soup.find('title')
        titleText = titleTag.text if titleTag is not None else ''
        if '404' in titleText or '301' in titleText:
            break

        textDiv = soup.find('div', {'class': 'text'})
        if textDiv is None:
            # Nothing to extract: stop cleanly instead of raising AttributeError.
            print('Page No. ' + str(currentPage) + ': no text container found, stopping')
            break

        paragraphs = textDiv.find_all('p')
        # '%' marks paragraph boundaries in the returned string.
        paragraphTexts.extend(paragraph.text + '%' for paragraph in paragraphs)

        print('Page No. ' + str(currentPage) + ': ' + str(len(paragraphs)) + ' paragraphs')
        currentPage += 1

    return ''.join(paragraphTexts)

def cleanText(text, removeUntil):
    """Turn scraped raw text into a list of cleaned sentences.

    Parameters
    ----------
    text
        Raw text in which paragraphs are delimited by ``'%'``.
    removeUntil
        Literal marker text. On every line of ``text``, everything up to and
        including the last occurrence of the marker is removed. An empty
        marker disables this step.

    Returns
    -------
    list of str
        The non-empty sentences obtained by splitting on ``'.'``, stripped of
        surrounding whitespace, with quotes and parentheses removed.
    """
    if removeUntil:
        # re.escape keeps markers containing regex metacharacters literal.
        text = re.sub(fr'.*(?<={re.escape(removeUntil)})', '', text)

    # 1. Split in paragraphs to filter headings, chapter numbers etc.
    #    A leading "Chapter <n>" and a trailing run of capitals, whitespace
    #    and hyphens are dropped from each paragraph.
    headingPattern = re.compile(r'^Chapter [0-9]+|[A-Z\s-]+$')
    paragraphs = [headingPattern.sub('', el) for el in text.split('%')]
    # Join with a space so a paragraph that does not end in '.' is not glued
    # to the first word of the next one; emptied paragraphs are skipped.
    text = ' '.join(el for el in paragraphs if el)

    # 2. Mask the periods inside abbreviations temporarily so they do not
    #    act as sentence boundaries in step 3.
    abbreviations = ('S.R.', 'F.A.', 'Mr.', 'Mrs.', 'S.-B.')
    masks = [(abbr, abbr.replace('.', '#')) for abbr in abbreviations]
    for abbr, masked in masks:
        text = text.replace(abbr, masked)

    # 3. Remove quotes and parentheses, then split into single sentences
    text = text.translate(str.maketrans('', '', '’‘()“”'))

    sentences = []
    for el in text.split('.'):
        el = el.strip()
        # Restore the abbreviation periods masked in step 2.
        for abbr, masked in masks:
            el = el.replace(masked, abbr)
        if el:
            sentences.append(el)

    return sentences

### Scrape and preprocess texts

In [44]:
# Every source is a URL template whose `{}` is filled with the page suffix by
# scrapeText; `currentPage` is the page number at which scraping starts.
lotr1Raw = scrapeText(
    url='https://thefreebooksonline.net/classics/u5691{}.html',
    currentPage=7,
)
lotr2Raw = scrapeText(
    url='https://thefreebooksonline.net/classics/u5690{}.html',
    currentPage=1,
)
lotr3Raw = scrapeText(
    url='https://thefreebooksonline.net/classics/u5689{}.html',
    currentPage=2,
)
hobbitRaw = scrapeText(
    url='https://thefreebooksonline.net/classics/u5688{}.html',
    currentPage=1,
)
silmarillionRaw = scrapeText(
    url='https://thefreebooksonline.net/classics/u5687{}.html',
    currentPage=1,
)

Page No. 7: 16 paragraphs
Page No. 8: 19 paragraphs
Page No. 9: 24 paragraphs
Page No. 10: 12 paragraphs
Page No. 11: 17 paragraphs
Page No. 12: 33 paragraphs
Page No. 13: 31 paragraphs
Page No. 14: 25 paragraphs
Page No. 15: 33 paragraphs
Page No. 16: 25 paragraphs
Page No. 17: 25 paragraphs
Page No. 18: 27 paragraphs
Page No. 19: 21 paragraphs
Page No. 20: 20 paragraphs
Page No. 21: 23 paragraphs
Page No. 22: 23 paragraphs
Page No. 23: 25 paragraphs
Page No. 24: 26 paragraphs
Page No. 25: 20 paragraphs
Page No. 26: 23 paragraphs
Page No. 27: 22 paragraphs
Page No. 28: 26 paragraphs
Page No. 29: 24 paragraphs
Page No. 30: 26 paragraphs
Page No. 31: 36 paragraphs
Page No. 32: 33 paragraphs
Page No. 33: 22 paragraphs
Page No. 34: 23 paragraphs
Page No. 35: 25 paragraphs
Page No. 36: 25 paragraphs
Page No. 37: 28 paragraphs
Page No. 38: 26 paragraphs
Page No. 39: 24 paragraphs
Page No. 40: 25 paragraphs
Page No. 41: 19 paragraphs
Page No. 42: 15 paragraphs
Page No. 43: 26 paragraphs
Page

Page No. 41: 23 paragraphs
Page No. 42: 17 paragraphs
Page No. 43: 26 paragraphs
Page No. 44: 24 paragraphs
Page No. 45: 27 paragraphs
Page No. 46: 19 paragraphs
Page No. 47: 25 paragraphs
Page No. 48: 30 paragraphs
Page No. 49: 22 paragraphs
Page No. 50: 22 paragraphs
Page No. 51: 18 paragraphs
Page No. 52: 20 paragraphs
Page No. 53: 14 paragraphs
Page No. 54: 17 paragraphs
Page No. 55: 18 paragraphs
Page No. 56: 18 paragraphs
Page No. 57: 12 paragraphs
Page No. 58: 16 paragraphs
Page No. 59: 18 paragraphs
Page No. 60: 17 paragraphs
Page No. 61: 23 paragraphs
Page No. 62: 20 paragraphs
Page No. 63: 26 paragraphs
Page No. 64: 22 paragraphs
Page No. 65: 16 paragraphs
Page No. 66: 15 paragraphs
Page No. 67: 28 paragraphs
Page No. 68: 19 paragraphs
Page No. 69: 20 paragraphs
Page No. 70: 15 paragraphs
Page No. 71: 18 paragraphs
Page No. 72: 23 paragraphs
Page No. 73: 15 paragraphs
Page No. 74: 23 paragraphs
Page No. 75: 20 paragraphs
Page No. 76: 23 paragraphs
Page No. 77: 23 paragraphs
P

In [46]:
# `removeUntil` is a marker text passed to cleanText; an empty marker means
# nothing is cut from the start of the raw text.
lotr1Clean = cleanText(text=lotr1Raw, removeUntil='Chapter 1')
lotr2Clean = cleanText(text=lotr2Raw, removeUntil='')
lotr3Clean = cleanText(text=lotr3Raw, removeUntil='battle in the West')
# From here on lotr1Clean holds the sentences of all three lotr*Raw texts.
lotr1Clean = [*lotr1Clean, *lotr2Clean, *lotr3Clean]
hobbitClean = cleanText(text=hobbitRaw, removeUntil='Chapter I')
silmarillionClean = cleanText(text=silmarillionRaw, removeUntil='Christopher Tolkien')

### Save text to .js files

In [47]:
# Map each JavaScript module/constant name to the sentence list it exports.
jsModules = {
    'lotr': lotr1Clean,
    'hobbit': hobbitClean,
    'silmarillion': silmarillionClean,
}

for moduleName, sentences in jsModules.items():
    # Serialise with json.dumps instead of the Python list repr: JSON array
    # syntax is valid JavaScript (ES2019+), whereas repr can emit Python-only
    # escapes. ensure_ascii=False keeps non-ASCII characters readable in the
    # UTF-8 encoded file.
    arrayLiteral = json.dumps(sentences, ensure_ascii=False)
    with open(f'{moduleName}.js', 'w', encoding='utf-8') as f:
        f.write(f'const {moduleName} = {arrayLiteral}\n\n')
        f.write(f'module.exports = {moduleName};')