In [1]:
import requests
from bs4 import BeautifulSoup
import json
import time

BASE_URL = "https://poeticneuroscience.blogspot.com/"
HEADERS = {'User-Agent': 'Mozilla/5.0'}

def get_all_post_links():
    links = set()
    next_page = BASE_URL

    while next_page:
        print(f"Scraping: {next_page}")
        res = requests.get(next_page, headers=HEADERS)
        soup = BeautifulSoup(res.text, 'html.parser')

        # Find all individual post links
        for a in soup.find_all('a', href=True):
            href = a['href']
            if '/20' in href and 'html' in href:  # heuristic to filter posts
                links.add(href)

        # Find next page link
        next_link = soup.find('a', {'class': 'blog-pager-older-link'})
        next_page = next_link['href'] if next_link else None
        time.sleep(1)  # be kind to the server

    return list(links)

def extract_poem(url):
    res = requests.get(url, headers=HEADERS)
    soup = BeautifulSoup(res.text, 'html.parser')

    title_tag = soup.find('h3', class_='post-title')
    content_div = soup.find('div', class_='post-body')

    if not title_tag or not content_div:
        return None

    title = title_tag.text.strip()
    content = content_div.get_text(separator="\n").strip()

    return {
        'title': title,
        'url': url,
        'text': content
    }

def main():
    links = get_all_post_links()
    poems = []

    for link in links:
        poem = extract_poem(link)
        if poem:
            poems.append(poem)
        time.sleep(0.5)

    with open('poetic_neuroscience.json', 'w', encoding='utf-8') as f:
        json.dump(poems, f, indent=2, ensure_ascii=False)

    print(f"Saved {len(poems)} poems to poetic_neuroscience.json")

if __name__ == "__main__":
    main()


Scraping: https://poeticneuroscience.blogspot.com/
Scraping: https://poeticneuroscience.blogspot.com/search?updated-max=2021-09-16T06:14:00-07:00&max-results=7
Scraping: https://poeticneuroscience.blogspot.com/search?updated-max=2021-02-02T20:13:00-08:00&max-results=7&start=7&by-date=false
Scraping: https://poeticneuroscience.blogspot.com/search?updated-max=2021-02-02T16:59:00-08:00&max-results=7&start=14&by-date=false
Scraping: https://poeticneuroscience.blogspot.com/search?updated-max=2020-03-23T21:51:00-07:00&max-results=7&start=21&by-date=false
Scraping: https://poeticneuroscience.blogspot.com/search?updated-max=2016-11-05T05:14:00-07:00&max-results=7&start=28&by-date=false
Scraping: https://poeticneuroscience.blogspot.com/search?updated-max=2016-08-13T14:51:00-07:00&max-results=7&start=35&by-date=false
Scraping: https://poeticneuroscience.blogspot.com/search?updated-max=2016-06-19T18:07:00-07:00&max-results=7&start=42&by-date=false
Scraping: https://poeticneuroscience.blogspot.com

In [None]:
import re
import json

# Load the full Shakespeare file
with open("pl2kespeare.txt", "r", encoding="utf-8") as f:
    full_text = f.read()

# Use regex to find the start of each play
play_splits = re.split(r"\*\*\* *THE .*? OF .*? \*\*\*", full_text, flags=re.IGNORECASE)
play_titles = re.findall(r"\*\*\* *THE (.*?) \*\*\*", full_text, flags=re.IGNORECASE)

# Clean and pair titles with corresponding texts
plays = []
for title, text in zip(play_titles, play_splits[1:]):  # skip the preamble
    clean_title = title.title().strip()
    clean_text = text.strip()
    if len(clean_text) > 100:  # filter out empty chunks
        plays.append({
            "title": clean_title,
            "text": clean_text
        })

# Save to JSON
with open("shakespeare_gutenberg.json", "w", encoding="utf-8") as f:
    json.dump(plays, f, indent=2, ensure_ascii=False)

print(f"Saved {len(plays)} plays to shakespeare_gutenberg.json")


Loaded 5378662 characters from pl2kespeare.txt
Saved 0 plays to shakespeare_gutenberg.json
