In [None]:
#Official Objective: Create a python script that crawls specified news websites for articles matching given keywords, categorizes them by defined topics
#Unofficial Objective: to eliminate my daily habit of doomscorlling the news :D

#Assumptions: the results will all be relevant and the keywords selection is sufficient to capture the desired articles
#Constraints: must handle errors gracefully, avoid duplicate articles, and respect website crawling policies (robots

#first we import everything we may need

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from collections import defaultdict
import time
import re

In [None]:
#create the news web crawler as a class with functions

class NewsCrawler:
    def __init__(self, urls, keywords):
        """
        Initialize the news crawler.
        Args:
            urls: List of news site URLs to crawl
            keywords: Dictionary mapping topic names to lists of keywords
                     e.g., {'Technology': ['AI', 'tech', 'software'], 'Politics': ['election', 'senate']}
        """
        self.urls = urls
        self.keywords = keywords
        self.articles_by_topic = defaultdict(list)
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
    
    def fetch_page(self, url):
        """Fetch a webpage and return its content."""
        try:
            response = requests.get(url, headers=self.headers, timeout=10)
            response.raise_for_status()
            return response.text
        except Exception as e:
            print(f"Error fetching {url}: {e}")  #show the error message.  Some sites may require login
            return None
    
    def extract_articles(self, html, base_url):
        """Extract article titles and links from HTML."""
        soup = BeautifulSoup(html, 'html.parser')
        articles = []
        
        # Look for common article elements
        article_tags = soup.find_all(['article', 'div', 'li'], class_=re.compile(r'(article|story|post|item|card)'))
        
        for tag in article_tags:
            # Try to find headline and link
            link_tag = tag.find('a', href=True)
            if not link_tag:
                continue
            
            # Get title from various possible locations
            title = None
            title_tag = tag.find(['h1', 'h2', 'h3', 'h4'])
            if title_tag:
                title = title_tag.get_text(strip=True)
            elif link_tag.get_text(strip=True):
                title = link_tag.get_text(strip=True)
            
            if title and len(title) > 10:  # Filter out very short titles
                url = urljoin(base_url, link_tag['href'])
                articles.append({'title': title, 'url': url})
        
        # Also search for standard headline links if no articles found
        if not articles:
            for tag in soup.find_all('a', href=True):
                text = tag.get_text(strip=True)
                if len(text) > 20 and len(text) < 200:  # Reasonable title length
                    url = urljoin(base_url, tag['href'])
                    if '/article/' in url or '/news/' in url or '/story/' in url:
                        articles.append({'title': text, 'url': url})
        
        return articles
    
    def categorize_article(self, article_text):
        """Determine which topic(s) an article belongs to based on keywords."""
        topics = []
        article_lower = article_text.lower()
        
        for topic, keywords in self.keywords.items():
            for keyword in keywords:
                if keyword.lower() in article_lower:
                    topics.append(topic)
                    break  # Move to next topic once we find a match
        
        return topics
    
    def crawl(self):
        """Crawl all news sites and categorize articles."""
        print("Starting news crawl...\n")
        
        for url in self.urls:
            print(f"Crawling {url}...")
            html = self.fetch_page(url)
            
            if not html:
                continue
            
            articles = self.extract_articles(html, url)
            print(f"Found {len(articles)} articles")
            
            for article in articles:
                topics = self.categorize_article(article['title'])
                for topic in topics:
                    self.articles_by_topic[topic].append(article)
            
            #add delay between requests
            time.sleep(1)
        
        print("\nCrawl complete!\n")
    
    def generate_summary(self):
        """Generate a bulleted summary of news by topic."""
        summary = []
        summary.append("=" * 60)
        summary.append("NEWS SUMMARY BY TOPIC")
        summary.append("=" * 60)
        summary.append("")
        
        if not self.articles_by_topic:
            summary.append("No articles found matching the specified topics.")
            return "\n".join(summary)
        
        for topic in sorted(self.articles_by_topic.keys()):
            articles = self.articles_by_topic[topic]
            #Remove duplicates based on title by creating a dictionary of unique articles
            unique_articles = {a['title']: a for a in articles}.values()
            
            summary.append(f"\n{topic.upper()}")
            summary.append("-" * 60)
            
            for article in list(unique_articles)[:10]:  #Limit to 10 per topic
                summary.append(f"  • {article['title']}")
                summary.append(f"    {article['url']}")
            
            summary.append(f"\n  Total articles: {len(unique_articles)}")
            summary.append("")
        
        return "\n".join(summary)




In [2]:
# List of news site URLs to crawl
    
news_urls = [
    "https://www.wsj.com",                     # Wall Street Journal
    "https://news.ycombinator.com",            # Y Combinator News (Hacker News)
    "https://techcrunch.com",                  # TechCrunch
    "https://www.wired.com",                   # Wired
    "https://www.economist.com",               # The Economist
    "https://www.mddionline.com",              # MD+DI (Medical Device and Diagnostic Industry)
    "https://www.bbc.com/news",                # BBC News
    "https://www.nytimes.com",                 # New York Times
    "https://www.ocregister.com"               # Orange County Register
]



# Define topics and their associated keywords
topics_keywords = {
        "Technology": ["AI", "artificial intelligence", "tech", "software", "computer", "digital"],
        "Politics": ["election", "president", "senate", "congress", "政治", "政府"],
        "Business": ["economy", "market", "stock", "company", "business", "financial"],
        "Science": ["research", "study", "science", "discovery", "climate"],
        "Health": ["health", "medical", "vaccine", "disease", "hospital"]
    }

In [3]:
# Example usage
if __name__ == "__main__":
    
    # Create crawler and run
    crawler = NewsCrawler(news_urls, topics_keywords)
    crawler.crawl()
    
    # Print summary
    print(crawler.generate_summary())
    
    # Optionally save to file
    with open("news_summary.txt", "w", encoding="utf-8") as f:
        f.write(crawler.generate_summary())

Starting news crawl...

Crawling https://www.wsj.com...
Error fetching https://www.wsj.com: 401 Client Error: HTTP Forbidden for url: https://www.wsj.com/
Crawling https://news.ycombinator.com...
Found 2 articles
Crawling https://techcrunch.com...
Found 366 articles
Crawling https://www.wired.com...
Found 134 articles
Crawling https://www.economist.com...
Error fetching https://www.economist.com: 403 Client Error: Forbidden for url: https://www.economist.com/
Crawling https://www.mddionline.com...
Error fetching https://www.mddionline.com: 403 Client Error: Forbidden for url: https://www.mddionline.com/
Crawling https://www.bbc.com/news...
Found 39 articles
Crawling https://www.nytimes.com...
Found 29 articles
Crawling https://www.ocregister.com...
Found 220 articles

Crawl complete!

NEWS SUMMARY BY TOPIC


BUSINESS
------------------------------------------------------------
  • Coinbase CEO Brian Armstrong trolls the prediction markets
    https://techcrunch.com/2025/11/01/coinbase-