In [None]:
import concurrent.futures
import pandas as pd
import requests
import random

In [None]:
def get_book(book_id: int, timeout: float = 30.0) -> tuple:
    """Get a book from Project Gutenberg

    The book's record is fetched from Gutendex, then the plain-text
    rendition listed in that record is downloaded.

    Args:
        book_id: The id of the book to get
        timeout: Value passed as ``timeout`` to each ``requests.get`` call,
            so a stalled server cannot block the caller forever

    Returns:
        A tuple ``(book_id, languages, title, text)``: the id that was
        requested, the ``"languages"`` and ``"title"`` fields of the
        Gutendex record, and the body of the downloaded text file

    Raises:
        requests.HTTPError: If either request returns an error status
        requests.Timeout: If either request exceeds ``timeout``
        LookupError: If the record lists no ``text/plain`` format
    """

    gutendex_url = f"https://gutendex.com/books/{book_id}/"

    response = requests.get(gutendex_url, timeout=timeout)
    response.raise_for_status()
    data = response.json()

    book_language = data["languages"]
    book_title = data["title"]
    formats = data["formats"]

    # Try the two historically supported keys first so existing results do not
    # change, then fall back to any other text/plain variant (for example a
    # different charset) instead of rejecting the book outright.
    preferred = ["text/plain; charset=us-ascii", "text/plain"]
    fallbacks = [
        mime_type
        for mime_type in formats
        if mime_type.startswith("text/plain") and mime_type not in preferred
    ]
    book_url = next(
        (formats[mime_type] for mime_type in preferred + fallbacks if mime_type in formats),
        None,
    )

    if book_url is None:
        # LookupError is still an Exception, so broad handlers keep working.
        raise LookupError(f"No text/plain format found for {book_id}")

    response = requests.get(book_url, timeout=timeout)
    response.raise_for_status()

    return book_id, book_language, book_title, response.text

In [None]:
def get_n_books(n: int) -> pd.DataFrame:
    """Get n books from Project Gutenberg

    Book ids are drawn at random without replacement and downloaded
    concurrently. A book whose download fails is reported on stdout and
    skipped, so the result may contain fewer than ``n`` rows.

    Args:
        n: The number of books to request

    Returns:
        A dataframe with the columns ``book_id``, ``languages``, ``title``
        and ``text``, one row per book that was fetched successfully (rows
        are in completion order, not in sampling order)

    Raises:
        requests.HTTPError: If the catalogue size request fails
        ValueError: If ``n`` is negative or larger than the reported count
    """

    count_response = requests.get("https://gutendex.com/books/", timeout=30)
    count_response.raise_for_status()
    max_book_count = count_response.json()["count"]

    # Sample from 1..count inclusive: range(count) would include id 0, which
    # is never a valid book, and would exclude the id equal to count.
    # NOTE(review): "count" is the catalogue size, which presumably is not
    # the highest existing id - confirm if full id coverage matters.
    random_book_numbers = random.sample(range(1, max_book_count + 1), n)

    books = []

    with concurrent.futures.ThreadPoolExecutor() as executor:
        # Map each future back to its id so failures can name the book.
        futures = {
            executor.submit(get_book, book_id): book_id
            for book_id in random_book_numbers
        }

        for future in concurrent.futures.as_completed(futures):
            try:
                book_id, book_language, book_title, book_text = future.result()
            except Exception as e:
                # Best effort: one failed download must not abort the batch.
                print(f"Skipping book {futures[future]}: {e}")
                continue
            books.append((book_id, book_language, book_title, book_text))

    return pd.DataFrame(books, columns=["book_id", "languages", "title", "text"])

In [None]:
# Download a random sample of 1000 books, then persist the resulting frame as
# a gzip-compressed parquet file written with the fastparquet engine.
df_books1000 = get_n_books(n=1000)
df_books1000.to_parquet(
    "books1000.parquet",
    engine="fastparquet",
    compression="gzip",
)