### Extract all Books about Poetry form Project Guthenberg

In [11]:
import json
import re

import requests
from tqdm import tqdm

# for downloading the txt file directly
TXT_URL = "https://www.gutenberg.org/cache/epub/{}/pg{}.txt"


In [None]:
poetry_books = {}
topic = "poetry"
result = requests.get(url=f"https://gutendex.com/books/?page=1&topic={topic}")
content = json.loads(result.content)

while True:
    for book in content["results"]:
        if book["languages"] == ["en"]:
            # account for different translations
            if book["title"] in poetry_books.keys():
                poetry_books[book["title"]].append(
                    TXT_URL.format(book["id"], book["id"])
                )
            else:
                poetry_books[book["title"]] = [TXT_URL.format(book["id"], book["id"])]
    print(f"Total number of books so far: {len(list(poetry_books.keys()))}")
    if not content["next"]:
        break
    result = requests.get(content["next"])
    content = json.loads(result.content)

with open("guthenberg_poetry_books.json", "w") as f:
    f.write(json.dumps(poetry_books, indent=4))


### Download all Books

In [2]:
with open("guthenberg_poetry_books.json", "r") as f:
    poetry_books = json.load(f)


In [14]:
failed_to_download = {}
for title, urls in tqdm(poetry_books.items()):
    downloaded = False
    for url in urls:
        result = requests.get(url=url)
        # always download the first available translation
        if result.status_code == 200:
            downloaded = True
            break
    if downloaded:
        clean_title = re.sub("[^A-Za-z]", "", title)
        # truncate titles that are way too long
        clean_title = clean_title[:75]
        with open(f"guthenberg_books/{clean_title}.txt", "w", encoding="utf-8") as f:
            f.write(result.content.decode("utf-8"))
    else:
        failed_to_download[title] = urls

with open("failed_to_download.json", "w") as f:
    f.write(json.dumps(failed_to_download, indent=4))

100%|██████████| 2713/2713 [34:13<00:00,  1.32it/s]


### Process books

In [25]:
import numpy as np
from tqdm import tqdm
import os
import re

In [3]:
books = os.listdir("guthenberg_books")
random_book = np.random.choice(books)

In [19]:
def extract_content(filename: str):
    with open(filename, "r", encoding="utf-8") as f:
        book_data = f.read()
    lines = book_data.split("\n")
    start_idx = None
    end_idx = None
    for i, line in enumerate(lines):
        if line.strip().startswith("*** END OF THE PROJECT"):
           end_idx = i
        elif line.strip().startswith("*** START OF THE PROJECT"):
           start_idx = i
    if not start_idx or not end_idx:
        print(f"Could not find content markers for {filename}")
        return None
    return "\n".join(lines[start_idx+1:end_idx-1])

In [21]:
for book_name in tqdm(books):
    try:
        extracted_content = extract_content(f"guthenberg_books/{book_name}")
    except Exception as _:
        print(f"Could not process {book_name}")
    with open(f"guthenberg_books_clean/{book_name}", "w", encoding="utf-8") as f:
        f.write(extracted_content)

  0%|          | 0/2511 [00:00<?, ?it/s]

 31%|███       | 782/2511 [00:08<00:44, 38.62it/s] 

Could not process Kalevala 


 63%|██████▎   | 1572/2511 [00:17<00:08, 106.68it/s]

Could not process The complete works of John Gower, volume 4 
Could not process The vision of hell.


100%|██████████| 2511/2511 [00:34<00:00, 73.02it/s] 


In [28]:
def remove_illustrations(book_content: str):
    pattern = re.compile(r'\[ILLUSTRATION\]\n')
    return pattern.sub('', book_content)

def remove_too_many_newlines(book_content: str):
    pattern = re.compile(r'\n{3,}')
    return pattern.sub('\n', book_content)

In [29]:
for book_name in tqdm(os.listdir("guthenberg_books_clean")):
    with open(f"guthenberg_books_clean/{book_name}", "r", encoding="utf-8") as f:
        content = f.read()
    clean_content = remove_illustrations(content)
    clean_content = remove_too_many_newlines(clean_content)
    with open(f"guthenberg_books_clean/{book_name}", "w", encoding="utf-8") as f:
        f.write(clean_content)

100%|██████████| 2511/2511 [00:24<00:00, 102.28it/s]


### Count words

In [30]:
number_of_words = 0
for book_name in tqdm(os.listdir("guthenberg_books_clean")):
    with open(f"guthenberg_books_clean/{book_name}", "r", encoding="utf-8") as f:
        content = f.read()
    number_of_words += len(content.split())
print(f"Number of words from all books: {number_of_words:,}")

100%|██████████| 2511/2511 [00:12<00:00, 202.76it/s]

Number of words from all books: 77,696,677



