In [2]:
import feedparser
import requests
from datetime import datetime, timedelta
from bs4 import BeautifulSoup
import re
import json
import os

# Output directory
output_path = "/workspaces/langgraph/data/"

def scrape_govexec_from_date(rss_url, start_date):
    """
    Scrape articles from the given RSS feed starting from a specific date.

    Args:
        rss_url (str): The RSS feed URL.
        start_date (datetime): The date from which to fetch articles.

    Returns:
        list: A list of articles with titles, links, and publication dates.
    """
    feed = feedparser.parse(rss_url)
    return [{'title': entry.title, 
             'link': entry.link, 
             'published': datetime(*entry.published_parsed[:6]).strftime('%Y-%m-%d')}
            for entry in feed.entries 
            if datetime(*entry.published_parsed[:6]) >= start_date]

def extract_clean_content(url):
    """
    Extract and clean the content of an article from the given URL.

    Args:
        url (str): The URL of the article.

    Returns:
        dict: A dictionary containing the title, author, content, and link of the article.
    """
    response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Title
    title = soup.select_one('h1.content-title')
    title = title.get_text(strip=True) if title else "No title"
    
    # Author  
    author = soup.select_one('span.authors-multiple a, a.gemg-author-link')
    author = author.get_text(strip=True) if author else "No author"
    
    # Content - handle embedded links properly
    main_content = soup.select_one('div.content-body')
    paragraphs = []
    
    if main_content:
        # Remove unwanted elements
        for element in main_content.find_all(['svg', 'script', 'noscript']):
            element.decompose()
        
        # Process each paragraph
        for p in main_content.find_all('p'):
            # Get clean text content, preserving text from links
            text = p.get_text(separator=' ', strip=True)
            
            # Clean up extra whitespace
            text = re.sub(r'\s+', ' ', text).strip()

            # Replace " , " with ","
            text = re.sub(r'\s*,\s*', ', ', text)

            # Filter out unwanted content
            if (len(text) > 30 and 
                not any(phrase in text for phrase in [
                    'Share your', 'NEXT STORY:', 'Help us tailor', 
                    'Thank you', 'Stay Connected', 'Newsletter page'
                ])):
                paragraphs.append(text)
    
    return {
        'title': title,
        'author': author, 
        'content': ''.join(paragraphs),
        'link': url
    }

# Create output directory if it doesn't exist
os.makedirs(output_path, exist_ok=True)

# Input: Specify the start date
start_date_str = input("Enter the start date (YYYY-MM-DD): ")
try:
    start_date = datetime.strptime(start_date_str, "%Y-%m-%d")
except ValueError:
    print("Invalid date format. Please use YYYY-MM-DD.")
    exit()

# Process all articles
articles = scrape_govexec_from_date("https://govexec.com/rss/all/", start_date)
all_results = []  # Store all results for saving

print(f"Found {len(articles)} articles from {start_date.strftime('%Y-%m-%d')} onwards\n")

for i, article in enumerate(articles, 1):
    try:
        print(f"[{i}/{len(articles)}] Processing: {article['title']}")
        result = extract_clean_content(article['link'])
        
        # Add publication date to the result
        result['published'] = article['published']
        
        # Add to results list
        all_results.append(result)
        
        # Your original console output
        print(f"TITLE: {result['title']}")
        print(f"AUTHOR: {result['author']}")
        print(f"PUBLISHED: {result['published']}")
        print("CONTENT:")
        
        for j, para in enumerate(result['content'], 1):
            print(f"{j}. {para}")
        
        print("-" * 80)
        
    except Exception as e:
        print(f"Error processing {article['title']}: {e}")
        print("-" * 80)

# Also save as "latest" for easy loading
latest_filepath = os.path.join(output_path, "govexec_articles_latest.json")
with open(latest_filepath, 'w', encoding='utf-8') as f:
    json.dump(all_results, f, indent=2, ensure_ascii=False)

Found 25 articles from 2025-01-01 onwards

[1/25] Processing: Withholding agency funds at the end of the year under consideration, White House says
TITLE: Withholding agency funds at the end of the year under consideration, White House says
AUTHOR: Eric Katz
PUBLISHED: 2025-06-25
CONTENT:
1. T
2. h
3. e
4.  
5. h
6. e
7. a
8. d
9.  
10. o
11. f
12.  
13. t
14. h
15. e
16.  
17. W
18. h
19. i
20. t
21. e
22.  
23. H
24. o
25. u
26. s
27. e
28. ’
29. s
30.  
31. b
32. u
33. d
34. g
35. e
36. t
37.  
38. o
39. f
40. f
41. i
42. c
43. e
44.  
45. c
46. o
47. n
48. f
49. i
50. r
51. m
52. e
53. d
54.  
55. t
56. h
57. e
58.  
59. T
60. r
61. u
62. m
63. p
64.  
65. a
66. d
67. m
68. i
69. n
70. i
71. s
72. t
73. r
74. a
75. t
76. i
77. o
78. n
79.  
80. i
81. s
82.  
83. c
84. o
85. n
86. s
87. i
88. d
89. e
90. r
91. i
92. n
93. g
94.  
95. t
96. a
97. k
98. i
99. n
100. g
101.  
102. u
103. n
104. i
105. l
106. a
107. t
108. e
109. r
110. a
111. l
112.  
113. a
114. c
115. t
116. i
117. o