# Project Gutenberg Book Sampling Pipeline

This notebook extracts the individual Project Gutenberg books from the BabyLM corpus, fetches their metadata from the Gutendex API, and organizes the books by genre for targeted sampling.

## Features
- Extract individual books from the corpus
- Fetch metadata (author, title, subjects, bookshelves) from Gutendex
- Organize books by literary genres
- Generate genre-specific datasets

In [None]:
import requests
import os
import json

## 1. Book Extraction and Metadata Collection

In [None]:

# Load the raw Gutenberg training split (too large to upload to GitHub).
with open("../datasets/train_100M/gutenberg.train", "r", encoding="utf-8") as f:
    all_books = f.read()
lines = all_books.split("\n")

# Every book starts with a header line that begins with "= = = ".
beginning_indices = [i for i, line in enumerate(lines) if line.startswith("= = = ")]

# Split the corpus into one text per book: Gutenberg ID (str) -> text.
texts = {}
# Pair each header with the following one; the len(lines) sentinel closes the
# final book, which would otherwise never be extracted.
for start, end in zip(beginning_indices, beginning_indices[1:] + [len(lines)]):
    # The ID is the first token after the first 8 characters of the header
    # (presumably the "= = = " marker plus a two-character prefix -- TODO confirm).
    header_tokens = lines[start][8:].split()
    # A header with nothing after the prefix yields "" and is reported below
    # instead of raising IndexError.
    gutenberg_id = header_tokens[0] if header_tokens else ""
    if not gutenberg_id.isdigit():
        print("Id-length: ", len(gutenberg_id), "for id:", gutenberg_id)
        continue
    # NOTE(review): the line directly before the next header is left out, as in
    # the original slicing -- presumably a blank separator line; confirm.
    text = " ".join(lines[start + 1:end - 1])
    texts[gutenberg_id] = text
print(f"Found {len(texts)} books in the Gutenberg dataset.")
print("Example book text:", texts.get("52018", "")[:500])  # Print first 500 characters of a sample book

# Metadata cache: Gutenberg ID (str) -> { 'author': str, 'title': str, 'subjects': list, 'bookshelves': list }.
# A 'text' entry is attached at the end for every ID that is also present in `texts`.
metadata_path = "../datasets/gutenberg/dataset_books_with_metadata.json"
books_with_metadata = {}

if os.path.exists(metadata_path):
    # Metadata is locally available: load it instead of querying Gutendex again.
    with open(metadata_path, "r", encoding="utf-8") as f:
        books_with_metadata = json.load(f)
    print(f"Loaded {len(books_with_metadata)} books with metadata from local file.")
    print("Example book metadata:", books_with_metadata.get("52018", {}))
else:
    # Otherwise fetch the metadata for each extracted book from Gutendex.
    for count, gutenberg_id in enumerate(texts, start=1):
        if count % 50 == 0:
            print(f"Processing book ID {gutenberg_id}... ({count}/{len(texts)})")
        try:
            response = requests.get(f"https://gutendex.com/books/{gutenberg_id}", timeout=30)
            response.raise_for_status()
            metadata = response.json()

            # Only the first listed author is kept; books without one are "Unknown".
            authors = metadata.get("authors", [])
            author = authors[0].get("name", "Unknown") if authors else "Unknown"

            books_with_metadata[gutenberg_id] = {
                'author': author,
                'title': metadata["title"],
                'subjects': metadata["subjects"],
                'bookshelves': metadata["bookshelves"]
            }
        except requests.exceptions.RequestException as e:
            print(f"Failed to get metadata for book {gutenberg_id}: {e}")
        except Exception as e:
            # Best effort: a malformed record (e.g. a missing key) skips this book only.
            print(f"Error processing book {gutenberg_id}: {e}")

    # Create the cache directory first so a missing folder cannot discard the
    # metadata that was just fetched.
    os.makedirs(os.path.dirname(metadata_path), exist_ok=True)
    with open(metadata_path, "w", encoding="utf-8") as f:
        json.dump(books_with_metadata, f, ensure_ascii=False, indent=4)
    print(len(books_with_metadata), "books with metadata found.")
    # IDs are stored as strings, so the example lookup must use a string key.
    print("Example book metadata:", books_with_metadata.get("52018", {}))

# Attach the corpus text to every metadata entry whose ID was extracted above.
for gutenberg_id, book in books_with_metadata.items():
    if gutenberg_id in texts:
        book['text'] = texts[gutenberg_id]


In [None]:
# Tally, for every bookshelf and every subject, how many books carry the label
# and how many whitespace-separated words those books contribute in total.
all_bookshelves = set()
all_subjects = set()
bookshelve_counts = {}
bookshelve_word_counts = {}
subject_counts = {}
subject_word_counts = {}
for book in books_with_metadata.values():
    # 'text' is only attached when the book ID was found in the corpus, so a
    # metadata-only entry counts as a book with zero words instead of raising KeyError.
    word_count = len(book.get("text", "").split())
    for bookshelve in book["bookshelves"]:
        all_bookshelves.add(bookshelve)
        bookshelve_counts[bookshelve] = bookshelve_counts.get(bookshelve, 0) + 1
        bookshelve_word_counts[bookshelve] = bookshelve_word_counts.get(bookshelve, 0) + word_count
    for subject in book["subjects"]:
        all_subjects.add(subject)
        subject_counts[subject] = subject_counts.get(subject, 0) + 1
        subject_word_counts[subject] = subject_word_counts.get(subject, 0) + word_count

print(f"Total unique bookshelves: {len(all_bookshelves)}")
print(f"Total unique subjects: {len(all_subjects)}")


In [None]:

# Alphabetical listings of every bookshelf and subject label collected above.
bookshelf_list = sorted(all_bookshelves)
subject_list = sorted(all_subjects)

# Rank the labels by the number of books carrying them (most frequent first).
sorted_bookshelves = sorted(bookshelve_counts.items(), key=lambda item: item[1], reverse=True)
sorted_subjects = sorted(subject_counts.items(), key=lambda item: item[1], reverse=True)

# Show the ten most common bookshelves, then the ten most common subjects.
print("\nMost common bookshelves:")
for bookshelf, count in sorted_bookshelves[:10]:
    print(f"{bookshelf}: {count} books")
print("\nMost common subjects:")
for subject, count in sorted_subjects[:10]:
    print(f"{subject}: {count} books")

# Rank the labels by the total number of words of their books (largest first).
sorted_bookshelves_by_length = sorted(bookshelve_word_counts.items(), key=lambda item: item[1], reverse=True)
sorted_subjects_by_length = sorted(subject_word_counts.items(), key=lambda item: item[1], reverse=True)

# Show the ten largest bookshelves.
print("\nLongest bookshelves:")
for bookshelf, word_count in sorted_bookshelves_by_length[:10]:
    print(f"{bookshelf}: {word_count} words")

# Show every subject holding at least 500,000 words, numbered from 1; the list
# is sorted in descending order, so the first smaller subject ends the listing.
print("\nLongest subjects:")
for count, (subject, word_count) in enumerate(sorted_subjects_by_length, start=1):
    if word_count < 500_000:
        break
    print(f"{count}: {subject}: {word_count} words")



## 2. Metadata Analysis

Analysis of book subjects and bookshelves to understand genre distribution and content availability.

In [None]:
# Map every subject to the IDs of the books tagged with it.
subject_to_book_map = {subject: [] for subject in all_subjects}
for book_id, book in books_with_metadata.items():
    for subject in book["subjects"]:
        subject_to_book_map[subject].append(book_id)

# Count, for every pair of subjects, how many books carry both of them.
# Only pairs that actually co-occur on a book are visited: walking each book's
# own subject list avoids intersecting the book sets of every possible pair of
# subjects, which is quadratic in the number of unique subjects.
subject_pairs = {}
for book in books_with_metadata.values():
    # Deduplicate and sort so that each pair is counted once per book and is
    # always keyed as (alphabetically smaller subject, larger subject).
    book_subjects = sorted(set(book["subjects"]))
    for i, subject1 in enumerate(book_subjects):
        for subject2 in book_subjects[i + 1:]:
            pair = (subject1, subject2)
            subject_pairs[pair] = subject_pairs.get(pair, 0) + 1

print("\nMost common subject pairs:")
# Highest count first; ties are listed in alphabetical pair order.
sorted_subject_pairs = sorted(subject_pairs.items(), key=lambda x: (-x[1], x[0]))
for (subject1, subject2), count in sorted_subject_pairs[:10]:
    print(f"{subject1} & {subject2}: {count} common books")

In [None]:
def create_dataset_from_list(books, filename, word_limit):
    """Download books from Project Gutenberg and write them to one dataset file.

    The books are fetched in the given order and appended to
    ``../datasets/gutenberg/genres/{filename}.train``, each preceded by a
    ``= = = {gutenberg_id} {name}`` header line. Books are written in full
    until the next one would exceed ``word_limit``; that book is cut to the
    remaining number of whitespace-separated words (re-joined with single
    spaces) so the file holds exactly ``word_limit`` words, and processing
    stops. If the books run out first, the file holds fewer words.

    An existing file with the same name is replaced. Books that fail to
    download or to be written are reported on stdout and skipped.

    Args:
        books: iterable of ``(gutenberg_id, name)`` tuples.
        filename: name of the output file without the ``.train`` extension.
        word_limit: maximum number of words (header lines excluded) to write.

    Returns:
        None. Progress and the final word total are printed.
    """
    output_dir = "../datasets/gutenberg/genres"
    output_path = f"{output_dir}/{filename}.train"

    # Without the directory every write below would fail and merely be logged,
    # leaving no dataset behind.
    os.makedirs(output_dir, exist_ok=True)
    if os.path.exists(output_path):
        os.remove(output_path)

    word_count = 0
    for gutenberg_id, name in books:
        # The limit was hit exactly by the previous book: do not download
        # another one only to write an empty section.
        if word_count >= word_limit:
            break
        try:
            response = requests.get(f"https://www.gutenberg.org/cache/epub/{gutenberg_id}/pg{gutenberg_id}.txt", timeout=30)
            response.raise_for_status()
            text = response.text
            words = text.split()
            print(f"Processing book {gutenberg_id} - {name}... ({len(words)} words)")

            limit_reached = word_count + len(words) > word_limit
            if limit_reached:
                print("Word limit reached. Stopping.")
                # Keep only the remaining number of words to hit the word limit exactly.
                words = words[:word_limit - word_count]
                text = " ".join(words)

            with open(output_path, "a", encoding="utf-8") as f:
                f.write(f"= = = {gutenberg_id} {name}\n")
                f.write(text + "\n")
            # Count only after a successful write so the total matches the file,
            # including the truncated final book.
            word_count += len(words)

            if limit_reached:
                print(f"Wrote {word_count} words to {filename}.train with word limit {word_limit}.")
                break
        except requests.exceptions.RequestException as e:
            print(f"Failed to get book {gutenberg_id}: {e}")
        except Exception as e:
            # Best effort: any other failure skips this book only.
            print(f"Error processing book {gutenberg_id}: {e}")
    print(f"Total words written to {filename}.train: {word_count}")

## 3. Genre Dataset Creation

Creates focused datasets for specific literary genres: up to 1M words per genre for training and up to 200k words per genre for development, downloaded book by book from Project Gutenberg.

In [None]:
# Science-fiction / fantasy training books as (Gutenberg ID, title), in download order.
# The former entry (1505, "The First Men in the Moon") was removed: the same title
# is part of the dev list below (ID 1013), so it must not appear in the training
# set, and ID 1505 is listed elsewhere in this notebook as "The Rape of Lucrece".
sci_fi_fantasy = [
    (55, "The Wonderful Wizard of Oz"),
    (36, "The War of the Worlds"),
    (20000, "Twenty Thousand Leagues Under the Sea"),
    (7477, "The Book of Wonder"),
    (20782, "Triplanetary"),
    (10002, "The House on the Borderland"),
    (35, "The Time Machine"),
    (18857, "A Journey to the Centre of the Earth"),
    (1250, "Anthem"),
    (159, "The Island of Doctor Moreau"),
    (21279, "2 B R 0 2 B"),
    (8395, "The Gods of Pegāna"),
    (11, "Alice's Adventures in Wonderland"),
    (7506, "The Steam Man of the Prairies"),
    (289, "The Night Land"),
    (1230, "The Lost World"),
    (22615, "The Princess Nobody: A Tale of Fairyland"),
    (12163, "The Sleeper Awakes"),
    (3479, "The Metal Monster"),
    (17355, "The Runaway Skyscraper"),
    (829, "Gulliver's Travels"),
    (12, "Through the Looking‑Glass"),
    (780, "The War in the Air"),
]

# Held-out development books; none of these titles may occur in the list above.
sci_fi_fantasy_dev = [
    (765, "The Moon Pool"),
    (1013, "The First Men in the Moon"),
]

# NOTE(review): the IDs are not validated against the downloaded text -- confirm
# that each ID really resolves to the listed title.
create_dataset_from_list(sci_fi_fantasy, "1M_sci-fi_fantasy", 1_000_000)
create_dataset_from_list(sci_fi_fantasy_dev, "1M_sci-fi_fantasy_dev", 200_000)

In [None]:
# Romance training books as (Gutenberg ID, title), in download order.
romance = [
    (1342, "Pride and Prejudice"),
    (161, "Sense and Sensibility"),
    (158, "Emma"),
    (105, "Persuasion"),
    (121, "Northanger Abbey"),
    (946, "Lady Susan"),
    (768, "Wuthering Heights"),
    (2413, "Madame Bovary"),
    (107, "Far from the Madding Crowd"),
    (2641, "A Room with a View"),
    (541, "The Age of Innocence"),
    (284, "The House of Mirth"),
    (40619, "Camilla; or, A Picture of Youth"),
    (6346, "Cecilia; Or, Memoirs of an Heiress"),
    (498, "Rebecca of Sunnybrook Farm"),
]

# Held-out development books for the same genre.
romance_dev = [
    (1399, "Anna Karenina"),
    (1260, "Jane Eyre"),
    (4517, "Ethan Frome"),
    (969, "The Tenant of Wildfell Hall"),
    (6053, "Evelina"),
]

# Write the training file (1M-word limit) first, then the dev file (200k-word limit).
for genre_books, dataset_name, dataset_word_limit in (
    (romance, "1M_romance", 1_000_000),
    (romance_dev, "1M_romance_dev", 200_000),
):
    create_dataset_from_list(genre_books, dataset_name, dataset_word_limit)

In [None]:
# Self-help / non-fiction training books as (Gutenberg ID, title), in download order.
self_help_non_fiction = [
    (935, "Self-Help"),
    (14418, "Thrift"),
    (2541, "Character"),
    (4507, "As a Man Thinketh"),
    (59844, "The Science of Getting Rich"),
    (33917, "The Science of Being Well"),
    (36898, "Increasing Personal Efficiency"),
    (2274, "How to Live on 24 Hours a Day"),
    (74178, "Out from the Heart"),
    (74878, "The Game of Life and How to Play It"),
    (147, "Common Sense"),
    (20203, "Autobiography of Benjamin Franklin"),
    (43855, "Franklin's Way to Wealth; or, \"Poor Richard Improved\""),
    (205, "Walden, and On The Duty Of Civil Disobedience"),
    (3600, "Essays of Michel de Montaigne — Complete"),
    (1232, "The Prince"),
    (816, "Democracy in America — Volume 2"),
]

# Held-out development books for the same genre.
self_help_non_fiction_dev = [
    (815, "Democracy in America — Volume 1"),
    (3741, "The American Crisis"),
]

# Write the training file (1M-word limit) first, then the dev file (200k-word limit).
for genre_books, dataset_name, dataset_word_limit in (
    (self_help_non_fiction, "1M_self_help_non_fiction", 1_000_000),
    (self_help_non_fiction_dev, "1M_self_help_non_fiction_dev", 200_000),
):
    create_dataset_from_list(genre_books, dataset_name, dataset_word_limit)

In [None]:
# Early modern English drama and poetry training books as (Gutenberg ID, title),
# in download order.
old_english_drama_poetry = [
    (1514, "A Midsummer Night's Dream"),
    (1523, "As You Like It"),
    (1531, "Othello, the Moor of Venice"),
    (1524, "Hamlet, Prince of Denmark"),
    (1533, "Macbeth"),
    (1532, "King Lear"),
    (1515, "The Merchant of Venice"),
    (1774, "Love's Labour's Lost"),
    (1103, "King Richard III"),
    (1508, "The Taming of the Shrew"),
    (15272, "The Faerie Queene, Book I"),
    (779, "The Tragical History of Doctor Faustus"),
    (56375, "Sir P.S.: His Astrophel and Stella"),
    (2232, "The Duchess of Malfi"),
    (1094, "Tamburlaine the Great — Part 1"),
    (35330, "The Spanish Tragedy"),
    (1589, "Tamburlaine the Great — Part 2"),
    (4039, "Volpone; Or, The Fox"),
    (4081, "The Alchemist"),
    (3694, "Every Man in His Humour"),
    (1041, "Shakespeare's Sonnets"),
    (1045, "Venus and Adonis"),
    (1505, "The Rape of Lucrece"),
    (18781, "Hero and Leander"),
    (20288, "Edward the Second"),
    (901, "The Jew of Malta"),
    (4011, "Epicoene; Or, The Silent Woman"),
    (12915, "The White Devil"),
    (20, "Paradise Lost"),
    (58, "Paradise Regained"),
    (42607, "The Shepheard's Calender"),
    (72698, "Spenser's Faerie Queene, Vol. 2: Books IV–VII"),
]

# Held-out development books. The former entry (1107, "The Taming of the Shrew")
# was removed because the same play is already part of the training list above
# (ID 1508); keeping it would leak training material into the dev set.
old_english_drama_poetry_dev = [
    (1540, "The Tempest"),
    (1526, "Twelfth Night; Or, What You Will"),
    (1504, "The Comedy of Errors"),
    (1513, "Romeo and Juliet"),
    (1522, "Julius Caesar"),
    (1539, "The Winter's Tale"),
    (1509, "The Two Gentlemen of Verona"),
    (1519, "Much Ado About Nothing"),
    (2237, "The Merry Wives of Windsor"),
]

create_dataset_from_list(old_english_drama_poetry, "1M_old_english_drama_poetry", 1_000_000)
create_dataset_from_list(old_english_drama_poetry_dev, "1M_old_english_drama_poetry_dev", 200_000)

In [None]:
# Mystery / detective / espionage training books as (Gutenberg ID, title), in
# download order. The trailing notes are the short descriptions kept from the
# original list.
mystery_books = [
    (1661, "The Adventures of Sherlock Holmes"),  # Conan Doyle's famed short stories
    (244, "A Study in Scarlet"),  # Holmes's first novel
    (3070, "The Hound of the Baskervilles"),  # Classic Holmes thriller
    (863, "The Mysterious Affair at Styles"),  # Christie's debut detective novel
    (2097, "The Sign of the Four"),  # Holmes mystery sequel
    (1685, "The Mystery of the Yellow Room"),  # Ingenious locked-room puzzle
    (564, "The Mystery of Edwin Drood"),  # Dickens's final (unfinished) mystery
    (1155, "The Secret Adversary"),  # Christie's Tommy & Tuppence spy yarn
    (558, "The Thirty-Nine Steps"),  # Buchan's espionage thriller
    (2147, "The Mystery of Marie Rogêt"),  # Poe's true-crime inspired tale
    (220, "The Secret Sharer"),  # Conrad's psychological thriller
    (34973, "The Spy of the Rebellion"),  # Pinkerton's Civil War espionage memoir
    (39940, "Ashton Kirk, Secret Agent"),  # Early pulp spy adventures
    (155, "The Moonstone"),  # Early English detective novel
    (583, "The Woman in White"),  # Collins's suspenseful thriller
    (4919, "The Murder on the Links"),  # Poirot's second case
]

# Held-out development books for the same genre.
mystery_books_dev = [
    (38131, "On Secret Service"),  # Taft's real-life government cases
    (48823, "Spies and Secret Service"),  # Le Queux's WWI espionage tales
    (61069, "German Spies in England: An Exposure"),  # Le Queux's expose of wartime espionage
    (41186, "Sant of the Secret Service: Some Revelations of Spies and Spying"),  # Le Queux on WWI spycraft
]

# Write the training file (1M-word limit) first, then the dev file (200k-word limit).
for genre_books, dataset_name, dataset_word_limit in (
    (mystery_books, "1M_mystery_books", 1_000_000),
    (mystery_books_dev, "1M_mystery_books_dev", 200_000),
):
    create_dataset_from_list(genre_books, dataset_name, dataset_word_limit)

In [None]:
# Children's and young-adult training books as (Gutenberg ID, title), in download order.
youth_and_ya_gutenberg = [
    (11, "Alice's Adventures in Wonderland"),
    (74, "The Adventures of Tom Sawyer"),
    (76, "Adventures of Huckleberry Finn"),
    (16, "Peter Pan"),
    (514, "Little Women"),
    (1448, "Heidi"),
    (45, "Anne of Green Gables"),
    (17396, "The Secret Garden"),
    (120, "Treasure Island"),
    (271, "Black Beauty"),
    (55, "The Wonderful Wizard of Oz"),
    (1450, "Pollyanna"),
    (146, "A Little Princess"),
]

# Held-out development books for the same genre.
youth_and_ya_gutenberg_dev = [
    (236, "The Jungle Book"),
    (1874, "The Railway Children"),
    (3836, "Swiss Family Robinson"),
    (17314, "Five Children and It"),
    (1480, "Tom Brown's School Days"),
    (421, "Kidnapped"),
    (1018, "The Water-Babies"),
]

# Write the training file (1M-word limit) first, then the dev file (200k-word limit).
for genre_books, dataset_name, dataset_word_limit in (
    (youth_and_ya_gutenberg, "1M_youth_and_ya_gutenberg", 1_000_000),
    (youth_and_ya_gutenberg_dev, "1M_youth_and_ya_gutenberg_dev", 200_000),
):
    create_dataset_from_list(genre_books, dataset_name, dataset_word_limit)