In [27]:
import hashlib
import requests
import feedparser
from bs4 import BeautifulSoup

In [28]:
url = "https://www.nasa.gov/rss/dyn/breaking_news.rss"
feed = feedparser.parse(url)
print(feed.keys())
print(feed.feed.link)
print(feed.feed.description)

dict_keys(['bozo', 'entries', 'feed', 'headers', 'etag', 'updated', 'updated_parsed', 'href', 'status', 'encoding', 'version', 'namespaces'])
https://www.nasa.gov
Official National Aeronautics and Space Administration Website


In [29]:
for entry in feed.entries[:4]:
    print("Entry Title:", entry.title)
    print("Entry Link:", entry.link)
    print("Entry Published Date:", entry.published)
    print("Entry Summary:", entry.summary[:50])
    print("\n")

Entry Title: Combustor Facilities
Entry Link: https://www.nasa.gov/centers-and-facilities/glenn/combustor-facilities/
Entry Published Date: Mon, 03 Feb 2025 22:48:45 +0000
Entry Summary: Sector Combustor Studies (CE-5B-1) Combustion stud


Entry Title: NASA Awards Contract for Airborne Science Flight Services Support
Entry Link: https://www.nasa.gov/centers-and-facilities/ames/nasa-awards-contract-for-airborne-science-flight-services-support/
Entry Published Date: Mon, 03 Feb 2025 21:31:55 +0000
Entry Summary: NASA has awarded Dynamic Aviation Group Inc. of Br


Entry Title: NASA Presses Forward Search for VIPER Moon Rover Partner
Entry Link: https://www.nasa.gov/news-release/nasa-presses-forward-search-for-viper-moon-rover-partner/
Entry Published Date: Mon, 03 Feb 2025 21:22:57 +0000
Entry Summary: To advance plans of securing a public/private part


Entry Title: Station Science Top News: Jan. 31, 2025
Entry Link: https://www.nasa.gov/missions/station-science-top-news-jan-31-2025/
En

In [30]:
def is_full_content(entry):
    """Checks if an RSS entry contains the full article."""
    if hasattr(entry, "content"):
        return True  # Likely full content
    elif hasattr(entry, "description") and len(entry.description) < 300:
        return False  # Short description → likely a summary
    return False  # Assume it's a summary if no clear full content

In [31]:
for entry in feed.entries:
    print(is_full_content(entry))

True
True
True
True
True
True
True
True
True
True


In [46]:
import html
import re


def clean_rss_content(html_content):
    """Extracts meaningful text from RSS HTML while removing unnecessary elements."""
    
    # Parse HTML
    soup = BeautifulSoup(html_content, "html.parser")
    
    # Extract all paragraph content (p tags contain the actual text)
    paragraphs = [p.get_text(separator=" ", strip=True) for p in soup.find_all("p")]
    
    # Join paragraphs into a single readable text block
    text = "\n\n".join(paragraphs)  # Keeps paragraph breaks for readability

    # Convert HTML entities (e.g., &nbsp; → space, &quot; → ")
    text = html.unescape(text)

    # Remove extra spaces, newlines, and multiple consecutive blank lines
    text = re.sub(r"\s+", " ", text).strip()
    text = re.sub(r"\n{2,}", "\n\n", text)  # Preserve paragraph separation

    return text

raw = feed.entries[0].content[0].value  # Extract HTML from RSS entry
clean = clean_rss_content(raw)

print(clean)  # Now readable without HTML tags or artifacts

9 min read Combustion studies are conducted in this two-test position facility specifically in support of the NOx-reduction research for the High Speed Research program and the Advanced Subsonic Technology program. CE-5B-1 is large enough to test sector arrangements of injector elements to include interactions of the elements and single larger elements. The facility receives filtered combustion air from the 450-psig system. The air is heated in a 1,100°F non-vitiated heater at flows up to 20 lb/s, which can be valved to either test stand. The airflow passes through the test section, is water spray quenched, and is then discharged to the altitude exhaust system or the atmospheric exhaust system. The facility preheater consists of a heat exchanger fired by four J-47 burner cans using natural gas for a fuel and the 40-psig combustion air. The research hardware uses ASTM Jet-A, JP-5, or JP-8 as a fuel. CE-5B-1 Special Features In addition to inlet and exit rakes and standard instrumentatio

In [63]:
import time
import hashlib
import psycopg
import requests
from dateutil import parser
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
from urllib.robotparser import RobotFileParser

class WebScraper:
    def __init__(self, targets):
        self.targets = targets

    def permission_check(self, source):
        """Check robots.txt to determine if scraping is allowed."""
        
        # Construct robots.txt URL
        robots_url = urljoin("https://"+source, "/robots.txt")
        
        # Initialize parser
        rp = RobotFileParser()
        
        try:
            rp.set_url(robots_url)
            rp.read()  # Fetch and parse robots.txt
        except Exception as e:
            print(f"Error fetching robots.txt: {e}")
            return True  # Assume scraping is allowed if robots.txt is inaccessible

        # Check if we are allowed to scrape the homepage (change to any relevant path)
        return rp.can_fetch("*", source)
    

    def crawl_delay(self, source):
        """Fetch crawl delay from robots.txt if specified."""
        
        robots_url = urljoin("https://"+source, "/robots.txt")
        rp = RobotFileParser()
        
        try:
            rp.set_url(robots_url)
            rp.read()
            return rp.crawl_delay("*") or 0  # Default to 0 if no delay is specified
        except Exception as e:
            print(f"Error fetching robots.txt for crawl delay: {e}")
            return 0  # Assume no delay if robots.txt is inaccessible


    def scrape(self):
        """Scrapes web pages."""
        articles = []
        headers = {"User-Agent": "Mozilla/5.0"} # spoofs browser to avoid bot detection
        
        for url in self.targets:
            source = urlparse(url).netloc

            if not self.permission_check(source):
                print(f"Skipping {url} as per robots.txt")
                continue

            # Get crawl delay
            delay = self.crawl_delay(source)
            if delay > 0:
                print(f"Respecting crawl delay of {delay} seconds...")
                time.sleep(delay)  # Respect the delay before scraping
            try:
                # Retrieve HTML content of the page
                response = requests.get(url, headers=headers, timeout=10)
                response.raise_for_status()
            except requests.RequestException as e:
                print(f"Error fetching {url}: {e}")
                return None

            # Parse the HTML content
            soup = BeautifulSoup(response.text, 'html.parser')

            # Extract title and main content
            soupTitle = soup.find("title")
            title = soupTitle.text.strip() if soupTitle else "No Title"
            soupContent = soup.find("article")
            content = soupContent.text.strip() if soupContent else "Content not found"

            # Extract metadata: search for <meta> tag with property = the published date
            date_meta = soup.find("meta", {"property": "article:published_time"})
            if date_meta and "content" in date_meta.attrs:
                published = date_meta["content"]
                if published:
                    try:
                        published = parser.parse(published)
                    except Exception as e:
                        print(f"Error parsing date '{published}': {e}")
                        published = None

            # Compute hash for deduplication
            hash = hashlib.md5((title + source).encode('utf-8')).hexdigest()

            articles.append({
                 "title": title,
                 "url": url,
                 "content": content,
                 "published": published,
                 "source": source, # Extract domain as source
                 "hash": hash
                 })

        return articles

In [62]:
targets = [
    "https://www.nasa.gov/news",
    "https://www.nasa.gov/news",
    "https://mail.google.com/mail/u/4/#inbox"
]

scraper = WebScraper(targets)
articles = scraper.scrape()

None
None


In [58]:
articles

[{'title': 'NASA News',
  'url': 'https://www.nasa.gov/news',
  'content': "News and EventsLatest NewsBlogsNewslettersSocial MediaBudgets and ReportsUpcoming LaunchesMoreMedia Contacts\n\n\n\n\nNASA News\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n \n\n\nLatest News \n\n\n\n\n\t\t\t\t2025 News Releases                \t\t\t\n\n\n\n\n News Release3 Min ReadNASA Presses Forward Search for VIPER Moon Rover Partner News Release3 Min ReadNASA to Talk Science, Tech Aboard Next Intuitive Machines Moon Flight News Release2 Min ReadNASA Invites Media to Discuss PUNCH Mission to Study Solar Wind News Release4 Min ReadNASA, Partners to Welcome Fourth Axiom Space Mission to Space Station \n\nSign up to receive NASA news releases and other information.\n \n\n\n\t\t\t\tResources for News Media\t\t\t\n\n\n\n\n\n\n \n\n\n\n\n\nCommunications Policy\n\n\nPolicy on the release of information to news and information media\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n \n\n\n\n\n\nMedia Accreditation Policy\n