In [19]:
import csv
import requests
import random
from bs4 import BeautifulSoup
import time
import json
from datetime import datetime

In [20]:
def fill_headers_list(headers_list):
    """Append one header dict per data row of ``headers.csv`` to ``headers_list``.

    The file is read from the current working directory as UTF-8 and its first
    line supplies the column names. Columns whose value in a row is empty or
    missing are left out of that row's dict. ``headers_list`` is extended in
    place; nothing is returned.
    """
    with open('headers.csv', 'r', newline='', encoding='utf-8') as csv_handle:
        for record in csv.DictReader(csv_handle):
            non_empty = {}
            for name, value in record.items():
                # Skip blank cells so no empty header value is ever sent.
                if value:
                    non_empty[name] = value
            headers_list.append(non_empty)
def get_soup(url, headers_list, timeout=10):
    """Fetch ``url`` with HTTP GET and return the body parsed by BeautifulSoup.

    Args:
        url: Address to request.
        headers_list: Sequence of header dicts; one is picked at random for
            this request. When it is empty, no custom headers are sent.
        timeout: Seconds to wait for the server before ``requests`` gives up.

    Returns:
        BeautifulSoup: ``response.text`` parsed with the ``html.parser`` backend.

    Raises:
        requests.exceptions.RequestException: On connection failure or timeout.
    """
    # random.choice raises IndexError on an empty sequence, so fall back to
    # the library's default headers instead of crashing.
    headers = random.choice(headers_list) if headers_list else None
    # Without a timeout a stalled server would block this call indefinitely.
    response = requests.get(url, headers=headers, timeout=timeout)
    # NOTE(review): the HTTP status is not checked here, so an error page is
    # parsed like any other response — confirm whether callers want that.
    return BeautifulSoup(response.text, "html.parser")
def get_cards_by_headers(soup):
    """Return all ``<c-wiz>`` elements in ``soup`` that match the card filter.

    The filter requires specific ``jsrenderer``, ``jsmodel`` and ``class``
    attribute values; the result is whatever ``soup.find_all`` yields for it.
    """
    card_filter = {
        "jsrenderer": "ARwRbe",
        "jsmodel": "hc6Ubd",
        "class": "PO9Zff Ccj79 kUVvS",
    }
    return soup.find_all("c-wiz", card_filter)
def get_news_cards(cards):
    """Return a new list holding ``str(card)`` for every item in ``cards``."""
    return list(map(str, cards))
def peel_outer_html(cards):
    """Strip the wrapper layers from each card and collect the inner div.

    For every card: take the first nested ``<c-wiz>``, then that element's
    first direct-child ``<div>``, then that div's first direct-child ``<div>``.
    Cards missing any of those layers are skipped.

    Returns:
        list: The innermost divs found, in card order.
    """
    peeled = []
    for card in cards:
        nested_cwiz = card.find("c-wiz", recursive=True)
        if not nested_cwiz:
            continue
        outer_div = nested_cwiz.find("div", recursive=False)
        if not outer_div:
            continue
        target_div = outer_div.find("div", recursive=False)
        if target_div:
            peeled.append(target_div)
    return peeled

In [21]:
def element_to_dom(element):
    """Turn the children of ``element`` into a nested dict keyed by tag name.

    Mapping rules for each child:
      - text node: stored (stripped) under ``"text"`` if non-blank; a later
        text node replaces an earlier one
      - ``<a>``: its ``href`` attribute
      - ``<img>``: its ``src`` attribute
      - ``<svg>``: the element rendered as a string
      - headings, ``p``, ``span``, ``time``, ``button``: stripped inner markup
      - any other tag: the result of recursing into it, or its stripped inner
        markup when the recursion yields nothing

    When a tag name occurs more than once among the children, the values are
    gathered into a list in document order.
    """
    inline_tags = ("h1", "h2", "h3", "h4", "h5", "h6", "p", "span", "time", "button")
    tree = {}

    for node in element.children:
        # Nodes without a tag name are text; keep only non-blank text.
        if node.name is None:
            stripped = node.strip()
            if stripped:
                tree["text"] = stripped
            continue

        name = node.name

        if name == "a":
            payload = node.get("href")
        elif name == "img":
            payload = node.get("src")
        elif name == "svg":
            payload = str(node)
        elif name in inline_tags:
            payload = node.decode_contents().strip()
        else:
            # Container tag: descend, falling back to raw inner markup.
            subtree = element_to_dom(node)
            payload = subtree or node.decode_contents().strip()

        # Repeated tag names are promoted from a single value to a list.
        if name not in tree:
            tree[name] = payload
        elif isinstance(tree[name], list):
            tree[name].append(payload)
        else:
            tree[name] = [tree[name], payload]

    return tree

In [22]:
def get_news_data(dom_structure, i, inner_divs_html):
    """
    Extract structured news data for one card.

    Args:
        dom_structure: Nested dict for the card, in the shape produced by
            ``element_to_dom`` (tag names as keys; repeated tags as lists).
        i: Index of the card; selects the matching entry of ``inner_divs_html``.
        inner_divs_html: List of HTML strings, one per card. The headline is
            read from the ``aria-label`` of the first ``<button>`` in entry ``i``.

    Returns:
        dict with:
          - primary_article: headline, author, article_link, featured_image,
            source_logo, source_name, publish_date
          - related_articles: list of dicts with the same fields except
            featured_image
          - total_related_articles: length of related_articles

        Any field that cannot be located is the string ``'N/A'``.
    """
    # Local import keeps this cell runnable without touching the import cell.
    from urllib.parse import urljoin

    BASE_URL = "https://news.google.com"
    # NOTE(review): this is the scrape date, not a date read from the page.
    today_date = datetime.now().strftime("%d-%m-%Y")  # current date in dd-mm-yyyy

    def _dig(node, *path, default='N/A'):
        """Follow dict keys / list indexes in ``path``; return ``default`` on any miss."""
        for step in path:
            if isinstance(step, int):
                if not isinstance(node, list) or step >= len(node):
                    return default
            elif not isinstance(node, dict) or step not in node:
                return default
            node = node[step]
        return node

    def _absolute_link(href):
        """Resolve an href against BASE_URL, or return 'N/A' when there is none."""
        # Several <a> children collapse into a list; use the first real href.
        if isinstance(href, list):
            href = next((h for h in href if isinstance(h, str) and h), None)
        if not isinstance(href, str) or href in ('', 'N/A'):
            return 'N/A'
        # urljoin handles "./path", "/path" and already-absolute URLs, which
        # plain string concatenation does not.
        return urljoin(BASE_URL + "/", href)

    def get_headline(i):
        """Return the aria-label of the first button in card ``i``, else None."""
        temp_soup = BeautifulSoup(inner_divs_html[i], 'html.parser')
        button = temp_soup.find('button')
        if button and button.has_attr('aria-label'):
            return button['aria-label']
        return None

    # --- PRIMARY ARTICLE ---
    article = _dig(dom_structure, 'div', 'article', default={})
    if isinstance(article, list):
        # Sibling <article> tags are merged into a list; treat the first
        # dict entry as the primary one.
        article = next((a for a in article if isinstance(a, dict)), {})
    primary_divs = _dig(article, 'div', default=[])
    headline = get_headline(i)

    news_data = {
        'headline': headline if headline is not None else 'N/A',
        'author': _dig(primary_divs, 2, 'div', 'span'),
        'article_link': _absolute_link(_dig(article, 'a', default=None)),
        'featured_image': _dig(article, 'figure', 'img'),
        'source_logo': _dig(primary_divs, 1, 'div', 0, 'img'),
        'source_name': _dig(primary_divs, 1, 'div', 0, 'div', 'div', 'text'),

        # --- NEW FIELD ---
        'publish_date': today_date
    }

    # --- RELATED ARTICLES ---
    related_articles = []
    possible_related = _dig(dom_structure, 'div', 'div', 'article', default=[])
    if isinstance(possible_related, dict):
        # A lone related article arrives as a bare dict rather than a list.
        possible_related = [possible_related]

    if isinstance(possible_related, list):
        for related in possible_related:
            if not isinstance(related, dict):
                continue

            divs = related.get('div', [])
            if not isinstance(divs, list) or len(divs) < 3:
                continue

            byline = _dig(divs, 2, 'div', 'span')
            related_articles.append({
                # NOTE(review): headline and author are read from the same
                # node, as in the original code — the real headline path for
                # related articles still needs to be confirmed.
                'headline': byline,
                'author': byline,
                'article_link': _absolute_link(related.get('a')),
                'source_logo': _dig(divs, 1, 'div', 0, 'img'),
                'source_name': _dig(divs, 1, 'div', 0, 'div', 'div', 'text'),

                # --- NEW FIELD ---
                'publish_date': today_date
            })

    # --- FINAL OUTPUT ---
    return {
        'primary_article': news_data,
        'related_articles': related_articles,
        'total_related_articles': len(related_articles)
    }

In [23]:
def scrape(url, file):
    """Scrape the news cards at ``url`` and write them to ``file`` as JSON.

    Steps: load request headers from ``headers.csv``, fetch and parse the
    page, select the card elements, peel each down to its inner div, convert
    that div to a nested dict, and extract the article fields from it.

    Args:
        url: Page to fetch.
        file: Path of the JSON file to write (UTF-8, overwritten if present).

    A card whose extraction raises is reported on stdout and skipped; the
    remaining cards are still written.
    """
    headers_list = []
    fill_headers_list(headers_list)
    soup = get_soup(url, headers_list)
    cards = get_cards_by_headers(soup)
    inner_divs_html = [str(div) for div in peel_outer_html(cards)]

    # Each snippet is re-parsed on its own so element_to_dom walks a fresh
    # document whose only child is the peeled div; get_news_data looks up
    # its fields starting from that top-level "div" key.
    dom_structures = [
        element_to_dom(BeautifulSoup(html, "html.parser"))
        for html in inner_divs_html
    ]

    news_data = []
    for i, dom in enumerate(dom_structures):
        try:
            news_data.append(get_news_data(dom, i, inner_divs_html))
        except Exception as e:
            # Best effort: one malformed card must not abort the whole run.
            print(f"Error processing item {i + 1}: {e}")

    if not news_data:
        # Make an empty result visible instead of reporting plain success.
        print("Warning: no news items were extracted; " + file + " will contain an empty list.")

    with open(file, "w", encoding="utf-8") as f:
        json.dump(news_data, f, ensure_ascii=False, indent=2)
    print("\n✅ File "+file+" saved successfully in the working directory!")

In [24]:
# Google News topic page to scrape; the query string carries hl=en-IN, gl=IN and ceid=IN:en.
world = "https://news.google.com/topics/CAAqKggKIiRDQkFTRlFvSUwyMHZNRGx1YlY4U0JXVnVMVWRDR2dKSlRpZ0FQAQ?hl=en-IN&gl=IN&ceid=IN%3Aen"

# Run the full pipeline and write the extracted items to world_news.json.
scrape(world, 'world_news.json')


✅ File world_news.json saved successfully in the working directory!
