# In [4]:
import csv
import requests
import random
from bs4 import BeautifulSoup
import time
import json
from datetime import datetime

# -----------------------------------------
# LOAD HEADERS
# -----------------------------------------
def fill_headers_list(headers_list, path='headers.csv'):
    """Append one HTTP-header dict per CSV row to *headers_list*, in place.

    The first CSV row supplies the header names; every following row becomes
    a ``{name: value}`` dict holding only the columns that have a non-empty
    value in that row.

    Args:
        headers_list: List that receives the header dicts (mutated in place).
        path: CSV file to read. Defaults to ``'headers.csv'`` in the current
            working directory.

    Raises:
        OSError: If *path* cannot be opened.
    """
    # 'utf-8-sig' strips a leading byte-order mark when present (otherwise it
    # would end up glued to the first header name) and reads BOM-less UTF-8
    # unchanged.
    with open(path, 'r', newline='', encoding='utf-8-sig') as csvfile:
        for row in csv.DictReader(csvfile):
            # DictReader stores surplus columns under the key None (as a
            # list); that entry is not a usable HTTP header, so drop it.
            headers = {k: v for k, v in row.items() if k is not None and v}
            headers_list.append(headers)


# -----------------------------------------
# FETCH SOUP WITH RANDOM HEADERS
# -----------------------------------------
def get_soup(url, headers_list):
    """Download *url* and return its body parsed with ``html.parser``.

    Args:
        url: Address to fetch with an HTTP GET (10 second timeout).
        headers_list: Pool of header dicts; one is picked at random for the
            request. When the pool is empty no custom headers are sent.

    Returns:
        A ``BeautifulSoup`` object built from the response text. The HTTP
        status code is not checked, so error pages are parsed as well.

    Raises:
        Whatever ``requests.get`` raises (timeouts, connection errors, ...).
    """
    # random.choice raises IndexError on an empty sequence, so an empty
    # header pool falls back to the library's default headers instead.
    headers = random.choice(headers_list) if headers_list else None
    response = requests.get(url, headers=headers, timeout=10)
    return BeautifulSoup(response.text, "html.parser")


# -----------------------------------------
# GOOGLE NEWS CARD EXTRACTION
# -----------------------------------------
def get_cards_by_headers(soup):
    """Return the result of searching *soup* for the news-card ``c-wiz`` tags.

    The search filters on the ``jsrenderer``, ``jsmodel`` and ``class``
    attribute values listed below.
    """
    card_attrs = {
        "jsrenderer": "ARwRbe",
        "jsmodel": "hc6Ubd",
        "class": "PO9Zff Ccj79 kUVvS",
    }
    return soup.find_all("c-wiz", card_attrs)


def get_news_cards(cards):
    """Return a list with the ``str()`` form of every item in *cards*."""
    return list(map(str, cards))


def peel_outer_html(cards):
    """Strip the wrapper layers off each card and return the inner ``div``s.

    For every card: locate the first nested ``c-wiz`` (searched at any
    depth), take its first direct-child ``div``, then that div's first
    direct-child ``div``. Cards missing any of these layers are skipped.
    """
    peeled = []
    for card in cards:
        wiz = card.find("c-wiz", recursive=True)
        if not wiz:
            continue

        outer_div = wiz.find("div", recursive=False)
        if not outer_div:
            continue

        inner_div = outer_div.find("div", recursive=False)
        if inner_div:
            peeled.append(inner_div)
    return peeled


# -----------------------------------------
# DOM CONVERSION
# -----------------------------------------
def element_to_dom(element):
    """Turn the children of *element* into a nested ``{tag: value}`` dict.

    How each child is represented:
      * nameless (text) nodes: stripped text under the key ``"text"``; a
        later non-empty text node replaces an earlier one
      * ``a``: its ``href`` attribute
      * ``img``: the first non-empty of ``srcset`` / ``data-src`` / ``src``
        (``"N/A"`` if none); values containing ``"gstatic"`` are cut at the
        first space
      * ``svg``: the element rendered with ``str()``
      * headings, ``p``, ``span``, ``time``, ``button``: stripped inner markup
      * anything else: the recursive conversion, or the stripped inner
        markup when that conversion comes back empty

    When the same tag name occurs more than once among the children, the
    values are gathered into a list in document order.
    """
    leaf_tags = ("h1", "h2", "h3", "h4", "h5", "h6", "p", "span", "time", "button")
    result = {}

    for node in element.children:
        name = node.name

        # Text nodes carry no tag name.
        if name is None:
            stripped = node.strip()
            if stripped:
                result["text"] = stripped
            continue

        if name == "a":
            entry = node.get("href")
        elif name == "img":
            entry = node.get("srcset") or node.get("data-src") or node.get("src") or "N/A"
            if "gstatic" in entry:
                # Keep only the part before the first space.
                entry = entry.split(" ")[0]
        elif name == "svg":
            entry = str(node)
        elif name in leaf_tags:
            entry = node.decode_contents().strip()
        else:
            entry = element_to_dom(node) or node.decode_contents().strip()

        if name not in result:
            result[name] = entry
        elif isinstance(result[name], list):
            result[name].append(entry)
        else:
            result[name] = [result[name], entry]

    return result


# -----------------------------------------
# REAL IMAGE SCRAPER (FROM PUBLISHER SITE)
# -----------------------------------------
def get_real_image_from_article(url, headers_list):
    """
    Fetch the article page and extract its lead image URL.

    Lookup order:
      1. ``<meta property="og:image">``
      2. ``<meta property="twitter:image">``
      3. ``<meta name="twitter:image">`` (the attribute Twitter Cards use)
      4. the first ``<img>`` that has a non-empty ``src``

    Relative image paths are resolved against *url*.

    Args:
        url: Article page to fetch.
        headers_list: Header pool handed to ``get_soup``.

    Returns:
        The image URL, or ``"N/A"`` when nothing is found or when fetching or
        parsing fails (the error is printed, not raised).
    """
    from urllib.parse import urljoin

    meta_selectors = (
        {"property": "og:image"},
        {"property": "twitter:image"},
        {"name": "twitter:image"},
    )

    # Best-effort lookup: a failure for one article must not abort the scrape.
    try:
        soup = get_soup(url, headers_list)

        for attrs in meta_selectors:
            # attrs= is required here: "name" as a keyword would be taken as
            # the tag name, not as an attribute filter.
            meta = soup.find("meta", attrs=attrs)
            if meta and meta.get("content"):
                return urljoin(url, meta.get("content"))

        # fallback: any <img>
        main_img = soup.select_one("img[src]")
        if main_img and main_img["src"]:
            return urljoin(url, main_img["src"])

    except Exception as e:
        print("⚠️ Error fetching real image:", e)

    return "N/A"


# -----------------------------------------
# STRUCTURED DATA EXTRACTION
# -----------------------------------------
def get_news_data(dom_structure, i, inner_divs_html, headers_list):
    """Build the structured record for one news card.

    Args:
        dom_structure: Nested dict produced by ``element_to_dom`` for the card.
        i: Index of the card inside *inner_divs_html*.
        inner_divs_html: HTML strings of all cards; entry *i* is parsed to
            read the headline from the first ``<button>``'s ``aria-label``.
        headers_list: Header pool passed on to ``get_real_image_from_article``.

    Returns:
        ``{'primary_article': {...}, 'related_articles': [...],
        'total_related_articles': int}``. Every article dict holds the keys
        ``headline``, ``author``, ``article_link``, ``featured_image``,
        ``source_logo``, ``source_name`` and ``publish_date``. A field whose
        expected DOM path is missing or has an unexpected shape is ``"N/A"``.
    """
    from urllib.parse import urljoin

    BASE_URL = "https://news.google.com"
    # NOTE: this is the date the scrape runs, not a date read from the page.
    today_date = datetime.now().strftime("%d-%m-%Y")

    def dig(node, *path, default="N/A"):
        """Follow dict keys / list indices along *path*; *default* on any mismatch."""
        for step in path:
            if isinstance(step, int):
                if not isinstance(node, list) or step >= len(node):
                    return default
                node = node[step]
            elif isinstance(node, dict) and step in node:
                node = node[step]
            else:
                return default
        return node

    def absolute_link(raw_link):
        """Resolve an ``href`` taken from the DOM dict against BASE_URL."""
        # element_to_dom collects repeated <a> siblings into a list; use the
        # first usable href in that case.
        if isinstance(raw_link, list):
            raw_link = next((h for h in raw_link if isinstance(h, str) and h), "")
        if not isinstance(raw_link, str) or not raw_link:
            return "N/A"
        # urljoin normalises "./read/..." and "/read/..." alike; plain string
        # concatenation turned the former into "https://news.google.com./read/...".
        return urljoin(BASE_URL + "/", raw_link)

    def get_headline(i):
        html = inner_divs_html[i]
        temp_soup = BeautifulSoup(html, 'html.parser')
        button = temp_soup.find('button')
        return button['aria-label'] if button and button.has_attr('aria-label') else "N/A"

    def build_article(article, headline):
        """Assemble one article dict; fetches the publisher image when a link exists."""
        link = absolute_link(article.get('a', ''))
        image = (
            get_real_image_from_article(link, headers_list)
            if link != "N/A"
            else "N/A"
        )
        return {
            'headline': headline,
            'author': dig(article, 'div', 2, 'div', 'span'),
            'article_link': link,
            'featured_image': image,
            'source_logo': dig(article, 'div', 1, 'div', 0, 'img'),
            'source_name': dig(article, 'div', 1, 'div', 0, 'div', 'div', 'text'),
            'publish_date': today_date,
        }

    # -----------------------------------------
    # PRIMARY ARTICLE
    # -----------------------------------------
    article = dig(dom_structure, "div", "article", default={})
    if not isinstance(article, dict):
        # Unexpected shape (e.g. several sibling <article> tags became a
        # list): keep the card, with "N/A" for the fields we cannot locate.
        article = {}
    news_data = build_article(article, get_headline(i))

    # -----------------------------------------
    # RELATED ARTICLES
    # -----------------------------------------
    possible_related = dig(dom_structure, "div", "div", "article", default=[])
    if isinstance(possible_related, dict):
        # element_to_dom only builds a list when a tag repeats, so a single
        # related article arrives as a bare dict.
        possible_related = [possible_related]

    related_articles = []
    if isinstance(possible_related, list):
        for ra in possible_related:
            if not isinstance(ra, dict):
                continue

            divs = ra.get('div', [])
            if not isinstance(divs, list) or len(divs) < 3:
                continue

            # NOTE(review): headline and author are both read from the same
            # path (div[2] > div > span), so they always carry the same value.
            # Looks like a copy/paste slip; confirm the right headline path
            # against the live markup before changing it.
            related_articles.append(
                build_article(ra, dig(ra, 'div', 2, 'div', 'span'))
            )

    # Final Combined Output
    return {
        'primary_article': news_data,
        'related_articles': related_articles,
        'total_related_articles': len(related_articles)
    }


# -----------------------------------------
# SCRAPE FUNCTION
# -----------------------------------------
def scrape(url, file):
    """Scrape the news page at *url* and dump the extracted records as JSON.

    Steps: load the header pool, fetch the page, isolate each card's inner
    ``div``, convert every card to a nested dict, build the per-card record
    and finally write the list of records to *file* (UTF-8, indented).
    A card whose processing raises is reported on stdout and left out.
    """
    headers_list = []
    fill_headers_list(headers_list)

    page = get_soup(url, headers_list)
    inner_divs_html = [
        str(div) for div in peel_outer_html(get_cards_by_headers(page))
    ]

    # Each card is re-parsed from its own HTML string before conversion.
    dom_structures = [
        element_to_dom(BeautifulSoup(fragment, "html.parser"))
        for fragment in inner_divs_html
    ]

    news_data = []
    for position, dom in enumerate(dom_structures):
        try:
            news_data.append(
                get_news_data(dom, position, inner_divs_html, headers_list)
            )
        except Exception as e:
            print(f"⚠️ Error processing item {position + 1}:", e)

    with open(file, "w", encoding="utf-8") as out:
        json.dump(news_data, out, ensure_ascii=False, indent=2)

    print("\n✅ File saved:", file)

# In [5]:
# Google News topic page to scrape; the query string carries hl=en-IN, gl=IN
# and ceid=IN:en.
india = "https://news.google.com/topics/CAAqJQgKIh9DQkFTRVFvSUwyMHZNRE55YXpBU0JXVnVMVWRDS0FBUAE?hl=en-IN&gl=IN&ceid=IN%3Aen"

# Run the scraper on that page and write the records to india_news.json.
scrape(india, 'india_news.json')


# Output: ✅ File saved: india_news.json
