In [1]:
import requests
from bs4 import BeautifulSoup
import json

# Base URL
base_url = "https://builtin.com/machine-learning/machine-learning-basics"

# JSON object to store the extracted data
extracted_data = {}

def scrape_page(url):
    """Scrape the content of a single page."""
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Extract topic and content
        # Assuming topics are in <h2> tags and content in <p> tags
        topics = soup.find_all('h2')
        paragraphs = soup.find_all('p')
        
        for topic in topics:
            topic_text = topic.get_text(strip=True)
            related_content = []
            
            # Extract following paragraphs as content related to the topic
            for sibling in topic.find_next_siblings():
                if sibling.name == 'h2':  # Stop at the next topic
                    break
                if sibling.name == 'p':
                    related_content.append(sibling.get_text(strip=True))
            
            extracted_data[topic_text] = related_content
    else:
        print(f"Failed to retrieve {url} - Status Code: {response.status_code}")

def find_related_links(url):
    """Find all links related to machine learning from the base page."""
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Find all links related to machine learning
        # Assuming links contain the term "machine-learning"
        links = soup.find_all('a', href=True)
        related_links = [link['href'] for link in links if 'machine-learning' in link['href']]
        
        # Ensure full URLs
        related_links = [link if link.startswith('http') else f"https://builtin.com{link}" for link in related_links]
        return set(related_links)  # Use a set to avoid duplicates
    else:
        print(f"Failed to retrieve {url} - Status Code: {response.status_code}")
        return []

# Find related links
related_links = find_related_links(base_url)
print(f"Found {len(related_links)} related links.")

# Scrape each related link
for link in related_links:
    print(f"Scraping: {link}")
    scrape_page(link)

# Save data as JSON
with open('extracted_machine_learning_content.json', 'w', encoding='utf-8') as f:
    json.dump(extracted_data, f, ensure_ascii=False, indent=4)

print("Data extraction completed and saved to 'extracted_machine_learning_content.json'.")

Found 15 related links.
Scraping: https://builtin.com/auth/signup?destination=%2fmachine-learning%2fmachine-learning-basics
Scraping: https://builtin.com/tag/machine-learning-algorithms
Scraping: https://www.linkedin.com/shareArticle?url=https%3A%2F%2Fbuiltin.com%2Fmachine-learning%2Fmachine-learning-basics&mini=true
Scraping: https://builtin.com/machine-learning/unsupervised-learning
Scraping: https://builtin.com/machine-learning/common-loss-functions
Scraping: https://builtin.com/tag/machine-learning
Scraping: https://builtin.com/data-science/supervised-machine-learning-classification
Scraping: https://builtin.com/machine-learning/sigmoid-activation-function#:~:text=The%20sigmoid%20activation%20function%20has,chaining%20such%20matrix%20multiplications%20together.
Scraping: https://builtin.com/data-science/machine-learning-models-python
Scraping: https://builtin.com/machine-learning/machine-learning-models-explained
Scraping: https://builtin.com/machine-learning/classification-machine