In [1]:
from bs4 import BeautifulSoup
from dotenv import load_dotenv
import json
import os
import requests
    
load_dotenv()

def _scrape_content(url, timeout):
    """Fetch *url* and return the text of its main content element.

    Never raises: any failure is reported in the returned string so that a
    single bad page does not abort the surrounding search loop.

    Args:
        url: Address of the article page (the search result's ``link``).
        timeout: Seconds to wait for the page request.

    Returns:
        The whitespace-joined text of the first ``<article>``, ``<main>`` or
        ``<div class="content">`` element, ``'Content not found'`` when none
        of those exist, or ``'Content extraction failed: ...'`` on error.
    """
    if not url:
        return 'Content extraction failed: search result has no link'

    try:
        page = requests.get(url, timeout=timeout)
        # Report 4xx/5xx responses as failures instead of scraping the
        # error page as if it were the article.
        page.raise_for_status()
        soup = BeautifulSoup(page.content, 'html.parser')

        # Generic content extraction (customize per site)
        main_content = (
            soup.find('article')
            or soup.find('main')
            or soup.find('div', class_='content')
        )
        if main_content is None:
            return 'Content not found'

        # Remove elements that do not belong to the article text.
        for elem in main_content(['script', 'style', 'nav', 'footer']):
            elem.decompose()
        return ' '.join(main_content.stripped_strings)
    except Exception as e:
        # Deliberately broad: this is the per-article boundary, and both the
        # HTTP layer and the HTML parser can fail in many ways.
        return f'Content extraction failed: {e}'


def get_recent_articles(sites, topics, api_key, cse_id, *,
                        date_restrict='w2', num_results=10, timeout=10):
    """Search each site for each topic and scrape the matching articles.

    One Google Custom Search request is issued per (site, topic) pair; every
    result is then downloaded and its main text extracted. Search failures
    are printed and skipped, scraping failures are recorded in the article's
    ``content`` field, so the function itself does not raise on network
    errors.

    Args:
        sites: Iterable of domains to search (e.g. ``'bbc.com'``).
        topics: Iterable of search phrases.
        api_key: Google API key sent as the ``key`` parameter.
        cse_id: Custom Search Engine id sent as the ``cx`` parameter.
        date_restrict: Value for the API's ``dateRestrict`` parameter;
            the default ``'w2'`` is the value the function always used.
        num_results: Value for the API's ``num`` parameter (results per
            request).
        timeout: Seconds to wait for each HTTP request (search and scrape).

    Returns:
        A list of dicts with the keys ``title``, ``url``, ``snippet``,
        ``source`` and ``content``. The same article may appear more than
        once if it matches several topics.
    """
    search_url = "https://www.googleapis.com/customsearch/v1"
    articles = []

    for site in sites:
        for topic in topics:
            # Build Google Custom Search query. The site is restricted both
            # through the `site:` operator and the `siteSearch` parameter.
            params = {
                'q': f'site:{site} {topic}',
                'key': api_key,
                'cx': cse_id,
                'num': num_results,
                'siteSearch': site,
                'dateRestrict': date_restrict,
                'lr': 'lang_en',
            }

            try:
                response = requests.get(search_url, params=params, timeout=timeout)
                # Surface API errors (bad key, exhausted quota, ...) instead
                # of silently treating the error payload as "no results".
                response.raise_for_status()
                results = response.json()
            except (requests.RequestException, ValueError) as e:
                # ValueError covers a body that is not valid JSON.
                print(f"Error searching {site} for {topic}: {e}")
                continue

            for item in results.get('items', []):
                link = item.get('link')
                articles.append({
                    'title': item.get('title'),
                    'url': link,
                    'snippet': item.get('snippet'),
                    'source': site,
                    # Scrape full content
                    'content': _scrape_content(link, timeout),
                })

    return articles

# Configuration
parent_sites = ['cnn.com', 'bbc.com']
topics = ['Donald Trump', 'Bitcoin']
# Credentials are read from the environment (load_dotenv() above fills it
# from a local .env file when one exists).
google_api_key = os.environ.get("GOOGLE_API_KEY")
google_cse_id = os.environ.get("GOOGLE_CSE_ID")

# Fail fast: without credentials every search request would be rejected and
# the script would just write an empty result file.
if not google_api_key or not google_cse_id:
    raise RuntimeError(
        "GOOGLE_API_KEY and GOOGLE_CSE_ID must be set in the environment or .env file"
    )

# Get articles
recent_articles = get_recent_articles(parent_sites, topics, google_api_key, google_cse_id)

# Save results
with open('recent_articles.json', 'w') as f:
    json.dump(recent_articles, f, indent=2)

# The search restricts results with dateRestrict='w2', i.e. two weeks, so
# the summary reports that window rather than "7 days".
print(f"Found {len(recent_articles)} articles in the last 2 weeks")


Found 40 articles in the last 7 days
