In [1]:
import requests
from bs4 import BeautifulSoup
import json
import time

BASE_URL = "https://poeticneuroscience.blogspot.com/"
HEADERS = {'User-Agent': 'Mozilla/5.0'}

def get_all_post_links():
    links = set()
    next_page = BASE_URL

    while next_page:
        print(f"Scraping: {next_page}")
        res = requests.get(next_page, headers=HEADERS)
        soup = BeautifulSoup(res.text, 'html.parser')

        # Find all individual post links
        for a in soup.find_all('a', href=True):
            href = a['href']
            if '/20' in href and 'html' in href:  # heuristic to filter posts
                links.add(href)

        # Find next page link
        next_link = soup.find('a', {'class': 'blog-pager-older-link'})
        next_page = next_link['href'] if next_link else None
        time.sleep(1)  # be kind to the server

    return list(links)

def extract_poem(url):
    res = requests.get(url, headers=HEADERS)
    soup = BeautifulSoup(res.text, 'html.parser')

    title_tag = soup.find('h3', class_='post-title')
    content_div = soup.find('div', class_='post-body')

    if not title_tag or not content_div:
        return None

    title = title_tag.text.strip()
    content = content_div.get_text(separator="\n").strip()

    return {
        'title': title,
        'url': url,
        'text': content
    }

def main():
    links = get_all_post_links()
    poems = []

    for link in links:
        poem = extract_poem(link)
        if poem:
            poems.append(poem)
        time.sleep(0.5)

    with open('poetic_neuroscience.json', 'w', encoding='utf-8') as f:
        json.dump(poems, f, indent=2, ensure_ascii=False)

    print(f"Saved {len(poems)} poems to poetic_neuroscience.json")

if __name__ == "__main__":
    main()


Scraping: https://poeticneuroscience.blogspot.com/
Scraping: https://poeticneuroscience.blogspot.com/search?updated-max=2021-09-16T06:14:00-07:00&max-results=7
Scraping: https://poeticneuroscience.blogspot.com/search?updated-max=2021-02-02T20:13:00-08:00&max-results=7&start=7&by-date=false
Scraping: https://poeticneuroscience.blogspot.com/search?updated-max=2021-02-02T16:59:00-08:00&max-results=7&start=14&by-date=false
Scraping: https://poeticneuroscience.blogspot.com/search?updated-max=2020-03-23T21:51:00-07:00&max-results=7&start=21&by-date=false
Scraping: https://poeticneuroscience.blogspot.com/search?updated-max=2016-11-05T05:14:00-07:00&max-results=7&start=28&by-date=false
Scraping: https://poeticneuroscience.blogspot.com/search?updated-max=2016-08-13T14:51:00-07:00&max-results=7&start=35&by-date=false
Scraping: https://poeticneuroscience.blogspot.com/search?updated-max=2016-06-19T18:07:00-07:00&max-results=7&start=42&by-date=false
Scraping: https://poeticneuroscience.blogspot.com

In [8]:
import re
import json

# Load the text
with open("pl2kespeare.txt", "r", encoding="utf-8") as f:
    full_text = f.read()

# List of all Shakespeare play titles in ALL CAPS (simplified from Gutenberg)
play_titles = [
    "ALL'S WELL THAT ENDS WELL",
    "ANTONY AND CLEOPATRA",
    "AS YOU LIKE IT",
    "THE COMEDY OF ERRORS",
    "CORIOLANUS",
    "CYMBELINE",
    "HAMLET, PRINCE OF DENMARK",
    "JULIUS CAESAR",
    "KING HENRY IV, PART I",
    "KING HENRY IV, PART II",
    "KING HENRY V",
    "KING HENRY VI, PART I",
    "KING HENRY VI, PART II",
    "KING HENRY VI, PART III",
    "KING HENRY VIII",
    "KING JOHN",
    "KING LEAR",
    "LOVE'S LABOUR'S LOST",
    "MACBETH",
    "MEASURE FOR MEASURE",
    "THE MERCHANT OF VENICE",
    "THE MERRY WIVES OF WINDSOR",
    "A MIDSUMMER NIGHT'S DREAM",
    "MUCH ADO ABOUT NOTHING",
    "OTHELLO, THE MOOR OF VENICE",
    "PERICLES, PRINCE OF TYRE",
    "RICHARD II",
    "RICHARD III",
    "ROMEO AND JULIET",
    "THE TAMING OF THE SHREW",
    "THE TEMPEST",
    "TIMON OF ATHENS",
    "TITUS ANDRONICUS",
    "TROILUS AND CRESSIDA",
    "TWELFTH NIGHT; OR, WHAT YOU WILL",
    "TWO GENTLEMEN OF VERONA",
    "THE WINTER'S TALE"
]

# Create a regex that matches any title on its own line
title_pattern = r"\n({})\n".format("|".join(re.escape(title) for title in play_titles))

# Split the file using the play titles
splits = re.split(title_pattern, full_text)

# The first chunk is the preamble — skip it
plays = []
for i in range(1, len(splits) - 1, 2):
    title = splits[i].title()
    text = splits[i + 1].strip()
    if len(text) > 100:
        plays.append({"title": title, "text": text})

# Save to JSON
with open("shakespeare_gutenberg.json", "w", encoding="utf-8") as f:
    json.dump(plays, f, indent=2, ensure_ascii=False)

print(f"Saved {len(plays)} plays to shakespeare_gutenberg.json")


Saved 13 plays to shakespeare_gutenberg.json
