In [16]:
import feedparser
import requests
from datetime import datetime, timedelta
from bs4 import BeautifulSoup
import re
import json
import os

# Output directory
output_path = "/workspaces/langgraph/data/"

def scrape_govexec_past_week(rss_url):
    feed = feedparser.parse(rss_url)
    week_ago = datetime.now() - timedelta(days=7)
    return [{'title': entry.title, 'link': entry.link} 
            for entry in feed.entries 
            if datetime(*entry.published_parsed[:6]) >= week_ago]

def extract_clean_content(url):
    response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Title
    title = soup.select_one('h1.content-title')
    title = title.get_text(strip=True) if title else "No title"
    
    # Author  
    author = soup.select_one('span.authors-multiple a, a.gemg-author-link')
    author = author.get_text(strip=True) if author else "No author"
    
    # Content - handle embedded links properly
    main_content = soup.select_one('div.content-body')
    paragraphs = []
    
    if main_content:
        # Remove unwanted elements
        for element in main_content.find_all(['svg', 'script', 'noscript']):
            element.decompose()
        
        # Process each paragraph
        for p in main_content.find_all('p'):
            # Get clean text content, preserving text from links
            text = p.get_text(separator=' ', strip=True)
            
            # Clean up extra whitespace
            text = re.sub(r'\s+', ' ', text).strip()
            
            # Filter out unwanted content
            if (len(text) > 30 and 
                not any(phrase in text for phrase in [
                    'Share your', 'NEXT STORY:', 'Help us tailor', 
                    'Thank you', 'Stay Connected', 'Newsletter page'
                ])):
                paragraphs.append(text)
    
    return {
        'title': title,
        'author': author, 
        'content': ' '.join(paragraphs),
        'link': url
    }

# Create output directory if it doesn't exist
os.makedirs(output_path, exist_ok=True)

# Process all articles
articles = scrape_govexec_past_week("https://govexec.com/rss/all/")
all_results = []  # Store all results for saving

print(f"Found {len(articles)} articles from the past week\n")

for i, article in enumerate(articles, 1):
    try:
        print(f"[{i}/{len(articles)}] Processing: {article['title']}")
        result = extract_clean_content(article['link'])
        
        # Add to results list
        all_results.append(result)
        
        # Your original console output
        print(f"TITLE: {result['title']}")
        print(f"AUTHOR: {result['author']}")
        print("CONTENT:")
        
        for j, para in enumerate(result['content'], 1):
            print(f"{j}. {para}")
        
        print("-" * 80)
        
    except Exception as e:
        print(f"Error processing {article['title']}: {e}")
        print("-" * 80)

# Also save as "latest" for easy loading
latest_filepath = os.path.join(output_path, "govexec_articles_latest.json")
with open(latest_filepath, 'w', encoding='utf-8') as f:
    json.dump(all_results, f, indent=2, ensure_ascii=False)

Found 20 articles from the past week

[1/20] Processing: With DeJoy out, postal stakeholders push for pause in criticized Delivering for America overhaul plan
TITLE: With DeJoy out, postal stakeholders push for pause in criticized Delivering for America overhaul plan
AUTHOR: Sean Michael Newhouse
CONTENT:
1. A
2. s
3.  
4. a
5.  
6. n
7. e
8. w
9.  
10. l
11. e
12. a
13. d
14. e
15. r
16.  
17. t
18. a
19. k
20. e
21. s
22.  
23. t
24. h
25. e
26.  
27. h
28. e
29. l
30. m
31.  
32. o
33. f
34.  
35. t
36. h
37. e
38.  
39. U
40. .
41. S
42. .
43.  
44. P
45. o
46. s
47. t
48. a
49. l
50.  
51. S
52. e
53. r
54. v
55. i
56. c
57. e
58. ,
59.  
60. a
61.  
62. H
63. o
64. u
65. s
66. e
67.  
68. p
69. a
70. n
71. e
72. l
73.  
74. o
75. n
76.  
77. T
78. u
79. e
80. s
81. d
82. a
83. y
84.  
85. d
86. e
87. b
88. a
89. t
90. e
91. d
92.  
93. t
94. h
95. e
96.  
97. f
98. u
99. t
100. u
101. r
102. e
103.  
104. o
105. f
106.  
107. t
108. h
109. e
110.  
111. i
112. n
113. d
114. e
115

In [None]:
# Example: How to load the data later
def load_saved_articles(filename="govexec_articles_latest.json"):
    """Load articles from saved JSON file"""
    filepath = os.path.join(output_path, filename)
    
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            articles = json.load(f)
        print(f"Loaded {len(articles)} articles from {filename}")
        return articles
    except FileNotFoundError:
        print(f"File {filename} not found in {output_path}")
        return []

# Uncomment to test loading:
loaded_articles = load_saved_articles()
if loaded_articles:
    print(f"First article: {loaded_articles[0]['title']}")