In [1]:
import feedparser
import requests
import time

def scrape_google_news(topic, limit, parse=None, delay=1):
    """Collect up to ``limit`` unique articles for ``topic`` from the Google News RSS search feed.

    Args:
        topic: Search phrase. It is URL-encoded before being placed in the
            query string, so spaces and reserved characters are safe.
        limit: Maximum number of articles to return.
        parse: Optional callable that takes the feed URL and returns an object
            with an ``entries`` list of dict-like items. Defaults to
            ``feedparser.parse``.
        delay: Seconds to sleep between successive feed requests (default 1).

    Returns:
        list[dict]: One dict per article with the keys ``'title'``, ``'link'``,
        ``'published'`` and ``'summary'``. Fields missing from a feed entry
        are returned as empty strings.
    """
    # Local import: the surrounding cell only imports feedparser, requests and time.
    from urllib.parse import urlencode

    if parse is None:
        parse = feedparser.parse

    base_url = "https://news.google.com/rss"
    results = []
    seen = set()  # links (or titles) already collected, used to drop repeats
    page = 1

    while len(results) < limit:
        # Construct the URL for the Google News RSS feed
        params = {"q": topic, "hl": "en-US", "gl": "US", "ceid": "US:en"}
        if page > 1:
            params["page"] = page
        url = f"{base_url}/search?{urlencode(params, safe=':')}"

        # Parse the RSS feed
        feed = parse(url)

        # Check if there are no more articles
        if len(feed.entries) == 0:
            break

        added = 0
        for article in feed.entries:
            if len(results) >= limit:
                break
            link = article.get('link', '')
            title = article.get('title', '')
            key = link or title
            if key in seen:
                continue
            seen.add(key)
            results.append({
                'title': title,
                'link': link,
                'published': article.get('published', ''),
                'summary': article.get('summary', '')
            })
            added += 1

        # NOTE(review): the feed may ignore `page` and return the same entries
        # again; a page that contributes nothing new ends the loop instead of
        # padding the result with duplicates.
        if added == 0:
            break
        if len(results) >= limit:
            break

        page += 1
        time.sleep(delay)  # Add a delay to avoid overwhelming the server

    return results[:limit]

# Example usage: fetch articles for one topic and print a short listing.
topic = "artificial intelligence"
limit = 200
news_articles = scrape_google_news(topic, limit)

print(f"Total articles fetched: {len(news_articles)}")

# One print call per article; `sep` puts each field on its own line.
for i, article in enumerate(news_articles, start=1):
    print(
        f"{i}. Title: {article['title']}",
        f"   Link: {article['link']}",
        f"   Published: {article['published']}",
        f"   Summary: {article['summary'][:100]}...",  # Truncate summary for brevity
        "-" * 50,
        sep="\n",
    )

ModuleNotFoundError: No module named 'feedparser'

In [2]:
import urllib.parse
from math import ceil

def get_time_range():
    """Prompt for a time window and return the matching search filter string.

    The returned value is what this notebook's ``generate_urls`` places in the
    ``tbs`` query parameter.

    Returns:
        str: ``"qdr:h<N>"`` for the past N hours, ``"qdr:d"``, ``"qdr:w"``,
        ``"qdr:m"`` or ``"qdr:y"`` for the fixed windows,
        ``"cdr:1,cd_min:<start>,cd_max:<end>"`` for a custom range, or an
        empty string when the menu choice or the hour count is invalid.
    """
    print("\nSelect a time range:")
    print("1. Recent past hours")
    print("2. Last 24 hours")
    print("3. Past week")
    print("4. Last month")
    print("5. Last year")
    print("6. Custom date range")

    # Tolerate stray whitespace around the menu choice.
    choice = input("Enter your choice (1-6): ").strip()

    fixed_ranges = {'2': "qdr:d", '3': "qdr:w", '4': "qdr:m", '5': "qdr:y"}
    if choice in fixed_ranges:
        return fixed_ranges[choice]

    if choice == '1':
        raw_hours = input("Enter the number of past hours: ")
        try:
            hours = int(raw_hours)
        except ValueError:
            hours = 0
        # Non-numeric, zero or negative input cannot form a usable filter.
        if hours < 1:
            print("Invalid number of hours. Using default (no time range).")
            return ""
        return f"qdr:h{hours}"

    if choice == '6':
        start_date = input("Enter start date (MM/DD/YYYY): ").strip()
        end_date = input("Enter end date (MM/DD/YYYY): ").strip()
        return f"cdr:1,cd_min:{start_date},cd_max:{end_date}"

    print("Invalid choice. Using default (no time range).")
    return ""

def generate_urls(query, time_range, limit):
    """Build the paginated Google news-search URLs needed to cover ``limit`` results.

    Args:
        query: Search text placed in the ``q`` parameter.
        time_range: Value for the ``tbs`` parameter; omitted when falsy.
        limit: Total number of results wanted across all pages.

    Returns:
        list[str]: One URL per page of at most 100 results. The final page
        requests only the remainder.
    """
    endpoint = "https://www.google.com/search"
    page_size = 100
    page_count = ceil(limit / page_size)

    def build(offset):
        # Key order here determines the order of parameters in the URL.
        query_args = {
            "q": query,
            "tbm": "nws",
            "num": min(page_size, limit - offset),
            "start": offset,
        }
        if time_range:
            query_args["tbs"] = time_range
        return endpoint + "?" + urllib.parse.urlencode(query_args)

    return [build(index * page_size) for index in range(page_count)]

def main():
    """Ask for a query, a time range and a result count, then print the search URLs.

    Counts above 1000 are reduced to 1000 with a notice before the URLs are
    generated.
    """
    max_results = 1000

    search_terms = input("Enter your search query: ")
    tbs_filter = get_time_range()
    limit = int(input("Enter the number of news titles to scrape (max 1000): "))

    if limit > max_results:
        print("Limiting to 1000 results as per Google's typical maximum.")
        limit = max_results

    page_urls = generate_urls(search_terms, tbs_filter, limit)

    print(f"\nHere are the URLs to scrape {limit} news titles:")
    for i, url in enumerate(page_urls, start=1):
        print(f"\nURL {i}:", url, sep="\n")


if __name__ == "__main__":
    main()


Select a time range:
1. Recent past hours
2. Last 24 hours
3. Past week
4. Last month
5. Last year
6. Custom date range

Here are the URLs to scrape 500 news titles:

URL 1:
https://www.google.com/search?q=gamese&tbm=nws&num=100&start=0&tbs=qdr%3Aw

URL 2:
https://www.google.com/search?q=gamese&tbm=nws&num=100&start=100&tbs=qdr%3Aw

URL 3:
https://www.google.com/search?q=gamese&tbm=nws&num=100&start=200&tbs=qdr%3Aw

URL 4:
https://www.google.com/search?q=gamese&tbm=nws&num=100&start=300&tbs=qdr%3Aw

URL 5:
https://www.google.com/search?q=gamese&tbm=nws&num=100&start=400&tbs=qdr%3Aw


In [3]:
import urllib.parse
import requests
from bs4 import BeautifulSoup
from math import ceil
from requests_html import HTMLSession
def get_time_range():
    """Prompt for a time window and return the matching search filter string.

    The returned value is what this notebook's ``generate_urls`` and
    ``get_total_results`` place in the ``tbs`` query parameter.

    Returns:
        str: ``"qdr:h<N>"`` for the past N hours, ``"qdr:d"``, ``"qdr:w"``,
        ``"qdr:m"`` or ``"qdr:y"`` for the fixed windows,
        ``"cdr:1,cd_min:<start>,cd_max:<end>"`` for a custom range, or an
        empty string when the menu choice or the hour count is invalid.
    """
    print("\nSelect a time range:")
    print("1. Recent past hours")
    print("2. Last 24 hours")
    print("3. Past week")
    print("4. Last month")
    print("5. Last year")
    print("6. Custom date range")

    # Tolerate stray whitespace around the menu choice.
    choice = input("Enter your choice (1-6): ").strip()

    fixed_ranges = {'2': "qdr:d", '3': "qdr:w", '4': "qdr:m", '5': "qdr:y"}
    if choice in fixed_ranges:
        return fixed_ranges[choice]

    if choice == '1':
        raw_hours = input("Enter the number of past hours: ")
        try:
            hours = int(raw_hours)
        except ValueError:
            hours = 0
        # Non-numeric, zero or negative input cannot form a usable filter.
        if hours < 1:
            print("Invalid number of hours. Using default (no time range).")
            return ""
        return f"qdr:h{hours}"

    if choice == '6':
        start_date = input("Enter start date (MM/DD/YYYY): ").strip()
        end_date = input("Enter end date (MM/DD/YYYY): ").strip()
        return f"cdr:1,cd_min:{start_date},cd_max:{end_date}"

    print("Invalid choice. Using default (no time range).")
    return ""

def get_total_results(query, time_range):
    """Estimate how many Google news results exist for ``query``, capped at 1000.

    Fetches a one-result news search page, renders it, and reads the count
    from the element with id ``result-stats``.

    Args:
        query: Search text placed in the ``q`` parameter.
        time_range: Value for the ``tbs`` parameter; omitted when falsy.

    Returns:
        int: ``min(count, 1000)`` when a count is found. Returns 1000 when the
        stats element or a number inside it is missing, or when any error
        occurs while fetching or parsing (the error is printed, not raised).
    """
    # Local import: the surrounding cell does not import `re`.
    import re

    base_url = "https://www.google.com/search"
    params = {
        "q": query,
        "tbm": "nws",
        "num": 1
    }
    if time_range:
        params["tbs"] = time_range

    url = f"{base_url}?{urllib.parse.urlencode(params)}"

    try:
        # The context manager closes the session, and whatever render()
        # started through it, even when the request or rendering fails.
        with HTMLSession() as session:
            r = session.get(url)
            r.html.render(sleep=4)
            soup = BeautifulSoup(r.html.raw_html, "html.parser")

        result_stats = soup.find('div', {'id': 'result-stats'})
        # Take only the first number in the text. Joining every digit would
        # also absorb later figures such as an elapsed-time value.
        # NOTE(review): assumes the count comes first and uses "," or "." as
        # the thousands separator; confirm against a live page.
        count_match = re.search(r"\d[\d,.]*", result_stats.text) if result_stats else None
        if count_match:
            total_results = int(re.sub(r"\D", "", count_match.group()))
            print(total_results)
            return min(total_results, 1000)  # Google typically limits to 1000 results

        print("Couldn't find total results. Defaulting to 1000.")
        return 1000
    except Exception as e:
        # Deliberate best-effort: any failure falls back to the 1000 cap.
        print(f"Error fetching total results: {e}")
        return 1000

def generate_urls(query, time_range, limit):
    """Build paginated news-search URLs, capping ``limit`` at the reported result total.

    Args:
        query: Search text placed in the ``q`` parameter.
        time_range: Value for the ``tbs`` parameter; omitted when falsy.
        limit: Number of results the caller asked for.

    Returns:
        tuple[list[str], int]: The page URLs (at most 100 results each) and
        the effective limit after capping by ``get_total_results``.
    """
    endpoint = "https://www.google.com/search"
    page_size = 100

    # Never ask for more than the search reports as available.
    available = get_total_results(query, time_range)
    if available < limit:
        limit = available

    links = []
    for offset in (index * page_size for index in range(ceil(limit / page_size))):
        still_needed = limit - offset
        query_args = {
            "q": query,
            "tbm": "nws",
            "num": min(page_size, still_needed),
            "start": offset,
        }
        if time_range:
            query_args["tbs"] = time_range
        links.append(endpoint + "?" + urllib.parse.urlencode(query_args))

    return links, limit

def main():
    """Ask for a query, a time range and a result count, then print the search URLs.

    The count shown is the effective limit returned by ``generate_urls``.
    """
    search_terms = input("Enter your search query: ")
    tbs_filter = get_time_range()
    requested = int(input("Enter the number of news titles to scrape: "))

    urls, actual_limit = generate_urls(search_terms, tbs_filter, requested)

    print(f"\nURLs to scrape {actual_limit} news titles (or all available if less):")
    for i, url in enumerate(urls, start=1):
        print(f"\nURL {i}:", url, sep="\n")


if __name__ == "__main__":
    main()


Select a time range:
1. Recent past hours
2. Last 24 hours
3. Past week
4. Last month
5. Last year
6. Custom date range


In [2]:
from requests_html import HTMLSession
from bs4 import BeautifulSoup
session = HTMLSession()

# Define the URL
url = "https://www.google.com/search?q=games&tbm=nws&num=100&start=0&tbs=qdr%3Aw"

try:
    # Send a GET request to the URL
    r = session.get(url)
    r.html.render(sleep=4)  # Render the JavaScript content
    # Parse the response content with BeautifulSoup
    soup = BeautifulSoup(r.html.raw_html, "html.parser")
finally:
    # Release the session, and whatever render() started through it, even if
    # the request or rendering fails. Re-run this cell to get a fresh session.
    session.close()

# Save the parsed HTML to a file
with open("parsed_response.html", "w", encoding="utf-8") as file:
    file.write(soup.prettify())

print("Parsed response saved to 'parsed_response.html'")


Parsed response saved to 'parsed_response.html'
