In [3]:
import json
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

class ML101Scraper:
    """Crawl a chain of documentation pages and collect their text by topic.

    Starting from one page, the scraper follows each page's "next" link
    (the ``<a>`` element carrying ``accesskey="n"``) and accumulates the
    extracted text in ``scraped_content`` with the shape
    ``{chapter_title: {subtopic_title: text}}``.
    """

    def __init__(self, base_url='https://machinelearning101.readthedocs.io/en/latest/_pages/', timeout=30):
        """Create a scraper.

        Args:
            base_url: URL prefix that ``start_page`` is appended to.
            timeout: Seconds to wait for each HTTP request before giving up.
        """
        self.base_url = base_url
        self.timeout = timeout
        self.scraped_content = {}

    def extract_text_by_topic(self, soup):
        """Extract the chapter title and the text under each subheading.

        The chapter title is taken from the first ``<h1>``. Every ``<h2>`` and
        ``<h3>`` becomes a subtopic whose content is the text of the ``<p>``,
        ``<ul>`` and ``<ol>`` siblings that follow it, up to the next heading.

        Args:
            soup: Parsed page (a BeautifulSoup object).

        Returns:
            ``{chapter_title: {subtopic_title: text}}``, or an empty dict when
            the page has no ``<h1>`` to use as the chapter key.
        """
        chapter_heading = soup.find('h1')
        if chapter_heading is None:
            # Without a title there is no key to file the page under; skip the
            # page instead of raising AttributeError and aborting the crawl.
            return {}
        chapter_title = chapter_heading.get_text(strip=True)

        subtopics = {}
        for heading in soup.find_all(['h2', 'h3']):
            parts = []
            node = heading.find_next_sibling()
            # Walk forward through siblings until the next heading closes
            # this subtopic (or the siblings run out).
            while node is not None and node.name not in ['h1', 'h2', 'h3']:
                if node.name in ['p', 'ul', 'ol']:
                    parts.append(node.get_text(strip=True))
                node = node.find_next_sibling()
            subtopics[heading.get_text(strip=True)] = ' '.join(parts)

        return {chapter_title: subtopics}

    def scrape_page(self, page_url):
        """Fetch one page and return its content plus the "next" link.

        Args:
            page_url: Absolute URL of the page to fetch.

        Returns:
            A ``(page_content, next_href)`` tuple. ``next_href`` is the raw
            ``href`` of the next-page anchor (possibly relative), or ``None``
            when the page has no such link. On a request failure the error is
            printed and ``({}, None)`` is returned.
        """
        # Only the network call is guarded, so parsing problems are not
        # mistaken for (or hidden behind) request errors.
        try:
            response = requests.get(page_url, timeout=self.timeout)
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"Error scraping {page_url}: {e}")
            return {}, None

        soup = BeautifulSoup(response.text, 'html.parser')
        page_content = self.extract_text_by_topic(soup)

        # The next-page anchor is identified by its 'n' access key.
        next_button = soup.find('a', {'accesskey': 'n'})
        # .get() tolerates an anchor that has no href attribute.
        next_page_url = next_button.get('href') if next_button else None

        return page_content, next_page_url

    def scrape_all_pages(self, start_page='01_introduction.html'):
        """Scrape every page reachable through "next" links.

        Results are merged into ``self.scraped_content``. The crawl stops when
        a page has no next link, when a request fails, or when the next link
        leads to a page that was already visited.

        Args:
            start_page: Page name appended to ``base_url`` to form the first URL.
        """
        current_url = self.base_url + start_page
        visited = set()

        while current_url and current_url not in visited:
            visited.add(current_url)  # guards against cyclic "next" links
            print(f"Scraping: {current_url}")
            page_content, next_href = self.scrape_page(current_url)
            self.scraped_content.update(page_content)

            # Resolve the link against the page it came from, so hrefs such as
            # '../index.html' or absolute URLs are followed correctly.
            current_url = urljoin(current_url, next_href) if next_href else None

    def save_to_json(self, filename='ml101_content.json'):
        """Write the scraped content to ``filename`` as UTF-8 JSON.

        Args:
            filename: Path of the JSON file to create or overwrite.
        """
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(self.scraped_content, f, ensure_ascii=False, indent=4)
        print(f"Content saved to {filename}")

def main():
    """Run a full crawl with default settings and write the result to JSON."""
    ml101 = ML101Scraper()
    ml101.scrape_all_pages()  # fills ml101.scraped_content
    ml101.save_to_json()

# Start the crawl only when this module is run directly, not when imported.
if __name__ == '__main__':
    main()


Scraping: https://machinelearning101.readthedocs.io/en/latest/_pages/01_introduction.html
Scraping: https://machinelearning101.readthedocs.io/en/latest/_pages/02_learning_models.html
Scraping: https://machinelearning101.readthedocs.io/en/latest/_pages/03_bias_variance.html
Scraping: https://machinelearning101.readthedocs.io/en/latest/_pages/04_covariance_correlation.html
Scraping: https://machinelearning101.readthedocs.io/en/latest/_pages/05_model_metrics.html
Scraping: https://machinelearning101.readthedocs.io/en/latest/_pages/06_underfitting_overfitting.html
Scraping: https://machinelearning101.readthedocs.io/en/latest/_pages/07_model_performance.html
Scraping: https://machinelearning101.readthedocs.io/en/latest/_pages/08_gradient_decent.html
Scraping: https://machinelearning101.readthedocs.io/en/latest/_pages/09_regression.html
Scraping: https://machinelearning101.readthedocs.io/en/latest/_pages/10_simple_linear_regression.html
Scraping: https://machinelearning101.readthedocs.io/en/