In [1]:
import requests
from bs4 import BeautifulSoup
import json
import re

class NeuralNetScraper:
    def __init__(self, base_url='http://neuralnetworksanddeeplearning.com/'):
        self.base_url = base_url
        self.scraped_content = {}

    def extract_text_by_topic(self, soup):
        """Extract main topics and subtopics along with their content."""
        content = {}
        main_heading = soup.find('h1')  # Assuming chapter title is in <h1>
        
        if main_heading:
            chapter_title = main_heading.get_text(strip=True)
            content[chapter_title] = {}

            # Find all headings and paragraphs under the main content area
            for section in soup.find_all(['h2', 'h3', 'p']):
                if section.name in ['h2', 'h3']:
                    subtopic_title = section.get_text(strip=True)
                    content[chapter_title][subtopic_title] = []
                elif section.name == 'p':
                    # Append paragraphs to the last seen subtopic
                    if chapter_title in content and content[chapter_title]:
                        last_subtopic = list(content[chapter_title].keys())[-1]
                        content[chapter_title][last_subtopic].append(section.get_text(strip=True))
        
        return content

    def scrape_page(self, page_name):
        """Scrape a single page and extract chapter/subtopic content."""
        url = self.base_url + page_name
        try:
            response = requests.get(url)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            
            # Extract chapter/subtopic content
            page_content = self.extract_text_by_topic(soup)
            
            # Find all links for subtopics
            subtopic_links = [a['href'] for a in soup.find_all('a', href=True) if a['href'].endswith('.html') and a['href'] != page_name]
            
            return page_content, subtopic_links
        
        except requests.RequestException as e:
            print(f"Error scraping {url}: {e}")
            return {}, []

    def scrape_all_pages(self, start_page='index.html'):
        """Scrape all chapters and their subtopics starting from the index page."""
        pages_to_visit = [start_page]
        visited_pages = set()

        while pages_to_visit:
            current_page = pages_to_visit.pop(0)
            if current_page in visited_pages:
                continue
            
            print(f"Scraping: {current_page}")
            visited_pages.add(current_page)
            
            chapter_content, subtopics = self.scrape_page(current_page)
            self.scraped_content.update(chapter_content)
            pages_to_visit.extend(subtopics)

    def save_to_json(self, filename='nn_deeplearning_content.json'):
        """Save the scraped content to a JSON file."""
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(self.scraped_content, f, ensure_ascii=False, indent=4)
        print(f"Content saved to {filename}")

def main():
    scraper = NeuralNetScraper()
    scraper.scrape_all_pages()
    scraper.save_to_json()

if __name__ == '__main__':
    main()


Scraping: index.html
Scraping: about.html
Scraping: exercises_and_problems.html
Scraping: chap1.html
Scraping: chap2.html
Scraping: chap3.html
Scraping: chap4.html
Scraping: chap5.html
Scraping: chap6.html
Scraping: sai.html
Scraping: acknowledgements.html
Scraping: faq.html
Scraping: supporters.html
Scraping: bugfinder.html
Scraping: http://www.deeplearning.net/tutorial/gettingstarted.html
Error scraping http://neuralnetworksanddeeplearning.com/http://www.deeplearning.net/tutorial/gettingstarted.html: 404 Client Error: Not Found for url: http://neuralnetworksanddeeplearning.com/http://www.deeplearning.net/tutorial/gettingstarted.html
Scraping: http://www.scipy.org/install.html
Error scraping http://neuralnetworksanddeeplearning.com/http://www.scipy.org/install.html: 404 Client Error: Not Found for url: http://neuralnetworksanddeeplearning.com/http://www.scipy.org/install.html
Scraping: http://peekaboo-vision.blogspot.de/2010/09/mnist-for-ever.html
Error scraping http://neuralnetworksa