In [14]:
import requests
from bs4 import BeautifulSoup

url = "https://www.cdc.gov/heart-disease/prevention/"

# 1. Fetch the page
response = requests.get(url)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

# 2. Try multiple possible containers
candidates = [
    {"name": "div", "attrs": {"class": "sf-detail-body-wrapper"}},
    {"name": "div", "attrs": {"role": "main"}},
    {"name": "div", "attrs": {"class": "main-content"}},
    {"name": "article", "attrs": {}},
]

content_div = None
for candidate in candidates:
    content_div = soup.find(candidate["name"], attrs=candidate["attrs"])
    if content_div:
        print(f"Found container: <{candidate['name']} {candidate['attrs']}>")
        break

if not content_div:
    # Debug: list top divs to inspect
    print("Could not find main content container. Here are some top-level divs for inspection:\n")
    for div in soup.find_all("div", recursive=False)[:10]:
        print(div.prettify()[0:300] + "\n---\n")
    raise ValueError("Main content not found. Please inspect the printed divs and update the selector.")

# 3. Extract relevant text
lines = []
for elem in content_div.find_all(["h1", "h2", "h3", "p", "li"]):
    text = elem.get_text(strip=True)
    if not text:
        continue

    prefix = ""
    if elem.name == "h1":
        prefix = "# "
    elif elem.name == "h2":
        prefix = "## "
    elif elem.name == "h3":
        prefix = "### "

    lines.append(prefix + text)

# 4. Write to file
out_path = "heart_disease_prevention.txt"
with open(out_path, "w", encoding="utf-8") as f:
    for line in lines:
        f.write(line + "\n")

print(f"✅ Scraping complete! Output saved to {out_path}")


Could not find main content container. Here are some top-level divs for inspection:



ValueError: Main content not found. Please inspect the printed divs and update the selector.

In [24]:
import requests
from bs4 import BeautifulSoup

url = "https://www.cdc.gov/heart-disease/prevention/"  # example page

response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")

# Get all visible text (without HTML tags)
page_text = soup.get_text(separator="\n", strip=True)

print(page_text[:1000])  # print first 1000 characters

Preventing Heart Disease | Heart Disease | CDC
Skip directly to site content
Skip directly to search
An official website of the United States government
Here's how you know
Official websites use .gov
A
.gov
website belongs to an official government organization in the United States.
Secure .gov websites use HTTPS
A
lock
(
) or
https://
means you've safely connected to the .gov website. Share sensitive information only on official, secure websites.
Heart Disease
Explore Topics
Search
Search
Clear
Search
For Everyone
About Heart Disease
Risk Factors
Preventing
Heart Disease Facts
Healthy People 2030
View all
Public Health
Heart Disease Communications Toolkit
Grady Implementation Guide
Emergency Medical Services (EMS) and Heart Disease
Heart Valve Disease Toolkits
American Heart Month Communications Toolkit
Healthy Eating Communications Kit
View all
Related Topics:
About Stroke
|
About High Blood Pressure
|
About Cholesterol
View All
search
close search
clear
search
Heart Disease
Menu
cle

In [31]:
import requests
from bs4 import BeautifulSoup

url = "https://www.cdc.gov/heart-disease/prevention/"

# Fetch page
response = requests.get(url)
response.raise_for_status()

soup = BeautifulSoup(response.text, "html.parser")

# ✅ CDC pages wrap article inside <main>
content = soup.find("main")
if not content:
    raise ValueError("Main content not found. Inspect HTML structure.")

# Collect only paragraphs and list items
lines = []
for elem in content.find_all(["p", "li"]):
    text = elem.get_text(strip=True)
    if text:
        lines.append(text)

# Save to .txt file
with open("./content_data/cdc_heart_disease_prevention.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(lines))

print("✅ Scraping complete! Saved to cdc_heart_disease_prevention.txt")


✅ Scraping complete! Saved to cdc_heart_disease_prevention.txt


In [35]:
import requests
from bs4 import BeautifulSoup

url = "https://www.nhs.uk/live-well/eat-well/food-types/salt-in-your-diet/"

# Fetch page
response = requests.get(url)
response.raise_for_status()

soup = BeautifulSoup(response.text, "html.parser")

# ✅ CDC pages wrap article inside <main>
content = soup.find("main")
if not content:
    raise ValueError("Main content not found. Inspect HTML structure.")

# Collect only paragraphs and list items
lines = []
for elem in content.find_all(["p", "li"]):
    text = elem.get_text(strip=True)
    if text:
        lines.append(text)

# Save to .txt file
with open("./content_data/salt_in_diet.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(lines))

print("✅ Scraping complete! ")


✅ Scraping complete! 


In [62]:
import requests
from bs4 import BeautifulSoup

url = "https://medlineplus.gov/diabetestype1.html"

responses = requests.get(url)
soup = BeautifulSoup(responses.text,"html.parser")


In [69]:
soup

<!DOCTYPE html>

<html class="nojs us" data-root="https://medlineplus.gov/" id="health_topic" lang="en">
<head>
<meta charset="utf-8"/>
<meta content="IE=edge,chrome=1" http-equiv="X-UA-Compatible"/>
<meta content="_top" http-equiv="window-target"/>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<link href="https://medlineplus.gov/diabetestype1.html" rel="canonical"/>
<link href="https://medlineplus.gov/diabetestype1.html" hreflang="en" rel="alternate"/>
<link href="https://medlineplus.gov/spanish/diabetestype1.html" hreflang="es" rel="alternate"/>
<meta content="medlineplus-ac-dictionary" name="ac-dictionary"/>
<meta content="If the pancreas doesn't make needed insulin, causing blood sugar levels to be too high you have type 1 or juvenile diabetes. Learn the signs." name="description"/>
<link href="http://purl.org/dc/elements/1.1/" rel="schema.DC" title="The Dublin Core metadata Element Set"/>


In [67]:
import requests
from bs4 import BeautifulSoup

def scrape_medlineplus_text(url, output_path="output.txt"):
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0 Safari/537.36"
        )
    }
    resp = requests.get(url, headers=headers)
    resp.raise_for_status()

    soup = BeautifulSoup(resp.text, "html.parser")

    # The content is in <div> elements that follow the "## Summary" heading.
    # We'll locate this heading and then gather only <p> and <li> tags below it.
    start = soup.find(string=lambda s: s and "Summary" in s and s.startswith("##"))
    if not start:
        raise ValueError("Could not locate the Summary section on the page.")

    # Traverse siblings after the Summary heading
    content_items = []
    for sibling in start.find_parent().find_next_siblings():
        # Stop when reaching another '## ' heading (next section like "Start Here")
        if sibling.name and sibling.name.startswith("h") and sibling.get_text(strip=True).startswith("##"):
            break
        # Collect paragraphs and list items
        for tag in sibling.find_all(["p", "li"]):
            text = tag.get_text(strip=True)
            if text:
                content_items.append(text)

    # Save the cleaned content to a file
    with open(output_path, "w", encoding="utf-8") as f:
        f.write("\n\n".join(content_items))

    print(f"✅ Scraped content saved to '{output_path}'.")

if __name__ == "__main__":
    scrape_medlineplus_text("https://medlineplus.gov/diabetestype1.html", "diabetes_type1.txt")


ValueError: Could not locate the Summary section on the page.