In [8]:
import os
import requests
import re
import tarfile
import glob
import time

In [9]:
# Downloading tar 
catalog_url = "https://www.gutenberg.org/cache/epub/feeds/rdf-files.tar.bz2"
catalog_file = "gutenberg_catalog.rdf"

response = requests.get(catalog_url)

# Save tar for future use
with open("rdf-files.tar.bz2", "wb") as file:
    file.write(response.content)

print("✅ Catalog downloaded, let's unpack it")

✅ Каталог завантажено! Тепер потрібно його розпакувати.


In [10]:
# Unpacking tar
with tarfile.open("rdf-files.tar.bz2", "r:bz2") as tar:
    tar.extractall("rdf_catalog")

print("📂 Done")

📂 Каталог розпаковано!


In [11]:
# Looking for all files in tar
rdf_files = glob.glob("rdf_catalog/**/*.rdf", recursive=True)

ebook_ids = []

for rdf_file in rdf_files:
    match = re.search(r"ebooks/(\d+)", rdf_file)
    if match:
        ebook_ids.append(int(match.group(1)))

ebook_ids = list(set(ebook_ids)) 
ebook_ids.sort()

print(f"📚 Found {len(ebook_ids)} books in the catalog!")

📚 Знайдено 0 книг у каталозі!


In [None]:
# Creating folder for saving books 
os.makedirs("poetry_books", exist_ok=True)

# Download only first 25000 books 
ebook_ids = ebook_ids[:25000]

def download_gutenberg_book(ebook_id):
    url = f"https://www.gutenberg.org/cache/epub/{ebook_id}/pg{ebook_id}.txt"
    
    try:
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            file_path = f"poetry_books/book_{ebook_id}.txt"

            with open(file_path, "w", encoding="utf-8") as file:
                file.write(response.text)
            
            print(f"✅ Downloaded: {file_path}")
            return True
        else:
            print(f"❌ Can't download book with id: {ebook_id}")
            return False
    except Exception as e:
        print(f"⚠ Something wrong with downloading {ebook_id}: {e}")
        return False

# Downloading all books from list
count = 0
for ebook_id in ebook_ids:
    if download_gutenberg_book(ebook_id):
        count += 1
        if count >= 25000:  
            break
    time.sleep(1)  

print(f"🎉 Downloaded {count} books!")

✅ Завантажено: poetry_books/book_12242.txt
✅ Завантажено: poetry_books/book_1041.txt
✅ Завантажено: poetry_books/book_16328.txt
✅ Завантажено: poetry_books/book_22681.txt
✅ Завантажено: poetry_books/book_15553.txt
📚 Всі книги завантажено!


In [5]:
def clean_text(text):
    """ Deleting meta data in book"""
    parts = text.split("*** START OF THE PROJECT GUTENBERG EBOOK")
    if len(parts) > 1:
        text = parts[1]
    text = text.split("*** END OF THE PROJECT GUTENBERG EBOOK")[0]
    text = re.sub(r'\n+', '\n', text)  
    text = re.sub(r'[^\w\s.,!?;:\'-]', '', text)  
    return text.strip()

# Clearing all downloaded books 
for ebook_id in ebook_ids[:25000]:
    file_path = f"poetry_books/book_{ebook_id}.txt"

    try:
        with open(file_path, "r", encoding="utf-8") as file:
            raw_text = file.read()

        cleaned_text = clean_text(raw_text)

        with open(file_path, "w", encoding="utf-8") as file:
            file.write(cleaned_text)

        print(f"🧹 Clear: {file_path}")

    except FileNotFoundError:
        print(f"⚠ File don't found : {file_path}")

print("✅ All books clear!")

🧹 Очищено: poetry_books/book_12242.txt
🧹 Очищено: poetry_books/book_1041.txt
🧹 Очищено: poetry_books/book_16328.txt
🧹 Очищено: poetry_books/book_22681.txt
🧹 Очищено: poetry_books/book_15553.txt
✅ Усі книги очищено!
