## Import Everything

In [136]:
import csv
import requests
import random
from bs4 import BeautifulSoup
import time
import json
from datetime import datetime

## Load the request headers into a list, fetch the page as soup, and collect the news cards in an array

## Then Remove the outer HTML of the news cards and reduce them to their essentials

In [137]:
def fill_headers_list(headers_list):
    """Append one header dict per data row of ``headers.csv`` to *headers_list*.

    The CSV's first row supplies the header names. For each following row,
    columns whose value is empty are left out of that row's dict, so no
    empty header values are sent later. The list is extended in place and
    nothing is returned.
    """
    with open('headers.csv', 'r', newline='', encoding='utf-8') as csvfile:
        headers_list.extend(
            {name: value for name, value in row.items() if value}
            for row in csv.DictReader(csvfile)
        )
def get_soup(url, headers_list, timeout=30):
    """Fetch *url* with a randomly chosen header set and parse the body as HTML.

    Args:
        url: Address to request.
        headers_list: List of header dicts; one is picked at random for this
            request. If the list is empty, no custom headers are passed.
        timeout: Seconds handed to ``requests.get`` so a stalled connection
            raises instead of blocking forever.

    Returns:
        A ``BeautifulSoup`` object built from the response text with the
        ``html.parser`` backend.

    Raises:
        Whatever ``requests.get`` raises (connection errors, timeouts, ...).
        The HTTP status code is not checked here, so an error page is parsed
        like any other body.
    """
    # random.choice() raises IndexError on an empty sequence; fall back to
    # sending no custom headers in that case.
    headers = random.choice(headers_list) if headers_list else None
    response = requests.get(url, headers=headers, timeout=timeout)
    return BeautifulSoup(response.text, "html.parser")
def get_cards_by_headers(soup):
    """Return every ``<c-wiz>`` element in *soup* that matches the card filter.

    The filter is the fixed ``jsrenderer`` / ``jsmodel`` / ``class``
    attribute combination below; the result is whatever ``find_all`` yields.
    """
    card_attrs = {
        "jsrenderer": "ARwRbe",
        "jsmodel": "hc6Ubd",
        "class": "PO9Zff Ccj79 kUVvS",
    }
    return soup.find_all("c-wiz", card_attrs)
def get_news_cards(cards):
    """Return a list holding ``str(card)`` for each item of *cards*, in order."""
    return list(map(str, cards))
def peel_outer_html(cards):
    """Strip the wrapper markup from each card and keep only its inner div.

    For every card the path followed is: first descendant ``<c-wiz>`` ->
    its first direct-child ``<div>`` -> that div's first direct-child
    ``<div>``. Cards where any step of the path is missing are skipped, so
    the result can be shorter than *cards*.
    """
    peeled = []
    for card in cards:
        nested_cwiz = card.find("c-wiz", recursive=True)
        if not nested_cwiz:
            continue
        outer_div = nested_cwiz.find("div", recursive=False)
        if not outer_div:
            continue
        inner_div = outer_div.find("div", recursive=False)
        if inner_div:
            peeled.append(inner_div)
    return peeled

## This function takes a card's HTML and returns a nested dictionary that mirrors the DOM structure

In [131]:
def element_to_dom(element):
    """Recursively convert a BeautifulSoup element to a nested dictionary structure.

    Each direct child contributes one entry keyed by its tag name:
      - ``a`` / ``img``: the ``href`` / ``src`` attribute value
      - ``svg``: the element's full markup as a string
      - headings, ``p``, ``span``, ``time``, ``button``: stripped inner HTML
      - anything else: the recursive conversion of the child, or its
        stripped inner HTML when that conversion comes back empty
    Non-blank text children are stored under the ``"text"`` key (a later
    text child replaces an earlier one). Repeated tags are collected into
    a list in document order.
    """
    attribute_for_tag = {"a": "href", "img": "src"}
    inner_html_tags = ("h1", "h2", "h3", "h4", "h5", "h6", "p", "span", "time", "button")
    dom = {}

    for node in element.children:
        tag = node.name

        # Text nodes carry no tag name; keep them only when non-blank.
        if tag is None:
            stripped = node.strip()
            if stripped:
                dom["text"] = stripped
            continue

        if tag in attribute_for_tag:
            value = node.get(attribute_for_tag[tag])
        elif tag == "svg":
            value = str(node)
        elif tag in inner_html_tags:
            value = node.decode_contents().strip()
        else:
            # Containers: descend, falling back to raw inner HTML if empty.
            value = element_to_dom(node) or node.decode_contents().strip()

        # First occurrence is stored bare; later ones turn the slot into a list.
        if tag not in dom:
            dom[tag] = value
        elif isinstance(dom[tag], list):
            dom[tag].append(value)
        else:
            dom[tag] = [dom[tag], value]

    return dom

## This function takes that nested dictionary and returns the data in a more usable format, which is the format written to the JSON file

In [132]:
def _dig_dom(node, *path, default='N/A'):
    """Follow *path* (dict keys and list indices) through nested dicts/lists.

    Returns *default* as soon as a step cannot be taken (wrong container
    type, missing key, index out of range) or when the final value is None.
    """
    for step in path:
        if isinstance(step, int):
            if not isinstance(node, list) or not 0 <= step < len(node):
                return default
        elif not isinstance(node, dict) or step not in node:
            return default
        node = node[step]
    return default if node is None else node


def get_news_data(dom_structure, i, inner_divs_html=None):
    """
    Extracts structured news data from a single dom_structure element.

    Args:
        dom_structure: Nested dict for one card, as built by element_to_dom().
        i: Position of this card in *inner_divs_html*; only used to look up
            the headline.
        inner_divs_html: Optional sequence of per-card HTML strings. When
            omitted, a module-level variable of the same name is used if one
            exists; otherwise the headline is reported as 'N/A'.

    Returns a dictionary containing:
      - primary_article (headline, link, author, etc.)
      - related_articles (list of dictionaries)
      - total_related_articles (count)

    Fields that cannot be located in the structure are set to 'N/A' instead
    of raising, so one oddly shaped card does not lose the whole item.
    """
    BASE_URL = "https://news.google.com"
    today_date = datetime.now().strftime("%d-%m-%Y")  # current date in dd-mm-yyyy

    if inner_divs_html is None:
        # This function used to read `inner_divs_html` as a bare global name,
        # which raised NameError whenever no such global existed. Keep
        # honouring a global if present, but do not require it.
        inner_divs_html = globals().get('inner_divs_html')

    def get_headline(i):
        """Return the aria-label of the first <button> in card *i*, or None."""
        if not isinstance(inner_divs_html, (list, tuple)):
            return None
        if not isinstance(i, int) or not 0 <= i < len(inner_divs_html):
            return None
        temp_soup = BeautifulSoup(inner_divs_html[i], 'html.parser')
        button = temp_soup.find('button')
        if button is not None and button.has_attr('aria-label'):
            return button['aria-label']
        return None

    def full_link(href):
        """Turn an href taken from the DOM dict into an absolute URL or 'N/A'."""
        if isinstance(href, list):
            # Several <a> children were merged into a list; use the first usable one.
            href = next((h for h in href if isinstance(h, str) and h), None)
        if not isinstance(href, str) or not href or href == 'N/A':
            return 'N/A'
        if href.startswith(('http://', 'https://')):
            return href
        if href.startswith('./'):
            # "./read/..." -> "/read/..."; plain concatenation would yield
            # "https://news.google.com./read/...".
            href = href[1:]
        return BASE_URL + href

    # --- PRIMARY ARTICLE ---
    article = _dig_dom(dom_structure, 'div', 'article', default={})
    if isinstance(article, list):
        article = next((a for a in article if isinstance(a, dict)), {})
    elif not isinstance(article, dict):
        article = {}

    headline = get_headline(i)
    news_data = {
        'headline': headline if headline is not None else 'N/A',
        'author': _dig_dom(article, 'div', 2, 'div', 'span'),
        'article_link': full_link(article.get('a')),
        'featured_image': _dig_dom(article, 'figure', 'img'),
        'source_logo': _dig_dom(article, 'div', 1, 'div', 0, 'img'),
        'source_name': _dig_dom(article, 'div', 1, 'div', 0, 'div', 'div', 'text'),

        # --- NEW FIELD ---
        'publish_date': today_date
    }

    # --- RELATED ARTICLES ---
    related_articles = []
    possible_related = _dig_dom(dom_structure, 'div', 'div', 'article', default=[])
    if isinstance(possible_related, dict):
        # A lone related <article> is stored as a dict rather than a
        # one-element list; treat it the same way.
        possible_related = [possible_related]
    elif not isinstance(possible_related, list):
        possible_related = []

    for related in possible_related:
        if not isinstance(related, dict):
            continue

        divs = related.get('div')
        if not isinstance(divs, list) or len(divs) < 3:
            continue

        # NOTE(review): headline and author are read from the same DOM path,
        # so they always carry the same value. The DOM dict does not show
        # which path holds the real headline - confirm against live markup.
        span_text = _dig_dom(related, 'div', 2, 'div', 'span')

        related_articles.append({
            'headline': span_text,
            'author': span_text,
            'article_link': full_link(related.get('a')),
            'source_logo': _dig_dom(related, 'div', 1, 'div', 0, 'img'),
            'source_name': _dig_dom(related, 'div', 1, 'div', 0, 'div', 'div', 'text'),

            # --- NEW FIELD ---
            'publish_date': today_date
        })

    # --- FINAL OUTPUT ---
    complete_news_data = {
        'primary_article': news_data,
        'related_articles': related_articles,
        'total_related_articles': len(related_articles)
    }

    return complete_news_data

## The main function, which performs the scraping and then saves the result to the given file

In [133]:
def scrape(url, file):
    """Scrape one Google News page via the DOM-dictionary pipeline and save it as JSON.

    Steps: load request headers, fetch and parse *url*, select the news
    cards, peel each card down to its inner div, convert that div to a
    nested dict with element_to_dom(), and turn each dict into a news record
    with get_news_data(). Records that raise are reported and skipped.

    Args:
        url: Page to fetch.
        file: Path of the JSON file to write (overwritten if present).

    NOTE: a later definition of ``scrape`` in this file replaces this one
    once both have been executed.
    """
    # get_news_data() looks up the per-card HTML through the module-level
    # name `inner_divs_html`; a function-local list would be invisible to it,
    # so the list is published as a global here.
    global inner_divs_html

    headers_list = []
    fill_headers_list(headers_list)
    soup = get_soup(url, headers_list)
    cards = get_cards_by_headers(soup)
    inner_divs_html = [str(div) for div in peel_outer_html(cards)]

    # Re-parse each snippet so the dict gets a top-level "div" key, which is
    # where get_news_data() starts its lookups.
    dom_structures = [
        element_to_dom(BeautifulSoup(html, "html.parser"))
        for html in inner_divs_html
    ]

    news_data = []
    for i, dom in enumerate(dom_structures):
        try:
            news_data.append(get_news_data(dom, i))
        except Exception as e:
            # Best effort: one malformed card must not abort the whole page.
            print(f"Error processing item {i + 1}: {e}")

    with open(file, "w", encoding="utf-8") as f:
        json.dump(news_data, f, ensure_ascii=False, indent=2)
    print(f"\n✅ File {file} saved successfully in the working directory!")

## Scrape it all finally

In [138]:
# Root of the Google News site; make_full_link() uses it as the base for
# hrefs that are not already absolute.
BASE_URL = "https://news.google.com"

def safe_text(el):
    """Return the element's text, stripped and joined with single spaces.

    Gives None when *el* is falsy or when its text comes back empty.
    """
    if el:
        cleaned = el.get_text(separator=" ", strip=True)
        if cleaned:
            return cleaned
    return None

def make_full_link(href):
    """Resolve *href* against BASE_URL and return an absolute URL.

    Args:
        href: Value of an ``href`` attribute; may be None or empty.

    Returns:
        None for a missing/blank href; the href unchanged when it already
        starts with ``http://`` or ``https://``; otherwise the href resolved
        relative to BASE_URL (so ``./articles/x``, ``/articles/x`` and
        ``articles/x`` all land under the site root).
    """
    # Function-scope import keeps this helper self-contained.
    from urllib.parse import urljoin

    if not href:
        return None
    href = href.strip()
    if not href:
        return None
    # Absolute links are passed through untouched.
    if href.startswith(("http://", "https://")):
        return href
    # urljoin handles "./x", "x", "/x" and protocol-relative "//host/x";
    # plain string concatenation produced malformed URLs for several of these.
    return urljoin(BASE_URL, href)

def _is_small_dimension(value, limit=100):
    """Return True when an <img> width/height attribute is an integer below *limit*."""
    try:
        return int(value) < limit
    except (TypeError, ValueError):
        # Attribute missing, or not a plain integer (e.g. "100%", "24px").
        return False


def _looks_like_timestamp(text):
    """Return True for relative-time snippets such as "2 hours ago" or "5h"."""
    tokens = text.lower().split()
    if "ago" in tokens:
        return True
    # Digits followed by "h" only; a bare endswith("h") test would also
    # reject ordinary words that happen to end in that letter.
    return any(
        len(tok) > 1 and tok.endswith("h") and tok[:-1].isdigit()
        for tok in tokens
    )


def parse_card(card_tag):
    """Extract structured data from a single c-wiz / card Tag.

    Every field is located with a heuristic over the card's markup; a field
    that no heuristic finds keeps the placeholder "N/A".

    Args:
        card_tag: BeautifulSoup Tag for one card.

    Returns:
        dict with ``primary_article`` (headline, author, article_link,
        featured_image, source_logo, source_name, publish_date),
        ``related_articles`` (list of dicts) and ``total_related_articles``.
        ``publish_date`` is the date of the scrape (dd-mm-YYYY), not a date
        read from the page.
    """
    today_date = datetime.now().strftime("%d-%m-%Y")
    primary = {
        "headline": "N/A",
        "author": "N/A",
        "article_link": "N/A",
        "featured_image": "N/A",
        "source_logo": "N/A",
        "source_name": "N/A",
        "publish_date": today_date
    }
    data = {
        "primary_article": primary,
        "related_articles": [],
        "total_related_articles": 0
    }

    # 1) HEADLINE: first button carrying an aria-label ...
    button = card_tag.find("button", attrs={"aria-label": True})
    if button:
        primary["headline"] = button["aria-label"]

    # ... falling back to the first heading-like element.
    if primary["headline"] in (None, "N/A"):
        heading = card_tag.select_one("h3, h4, .DY5T1d")
        if heading:
            val = safe_text(heading)
            if val:
                primary["headline"] = val

    # 2) ARTICLE LINK: first <a> with an href anywhere in the card
    a = card_tag.find("a", href=True)
    if a:
        primary["article_link"] = make_full_link(a["href"])

    # 3) FEATURED IMAGE: first <img> with a src anywhere in the card
    img = card_tag.find("img", src=True)
    if img and img.get("src"):
        primary["featured_image"] = img["src"]

    # 4) SOURCE NAME: first short text (<= 6 words) among the first 25
    #    div/span elements that is neither the headline nor a timestamp.
    for el in card_tag.select("div, span")[:25]:
        txt = safe_text(el)
        if not txt or len(txt.split()) > 6 or txt == primary["headline"]:
            continue
        if _looks_like_timestamp(txt):
            continue
        primary["source_name"] = txt
        break

    # SOURCE LOGO: first image that is small (width or height under 100)
    # or whose alt text mentions "logo".
    for im in card_tag.find_all("img"):
        src = im.get("src") or im.get("data-src")
        if not src:
            continue
        alt = im.get("alt")
        if (
            _is_small_dimension(im.get("width"))
            or _is_small_dimension(im.get("height"))
            or (alt and "logo" in alt.lower())
        ):
            primary["source_logo"] = src
            break

    # 5) AUTHOR: take the part after the last "-" of the first div/span text
    #    that has at most two dashes, if that part is short and does not
    #    start with digits.
    byline = None
    for el in card_tag.select("div, span"):
        txt = safe_text(el)
        if not txt:
            continue
        if "-" in txt and len(txt.split("-")) <= 3:
            parts = [p.strip() for p in txt.split("-")]
            candidate = parts[-1]
            if len(candidate.split()) <= 6 and not any(ch.isdigit() for ch in candidate[:3]):
                byline = candidate
                break
    if byline:
        primary["author"] = byline

    # 6) RELATED ARTICLES
    related = []
    article_tags = card_tag.find_all("article")
    if len(article_tags) > 1:
        # Several <article> tags: the first is taken as primary, the rest as related.
        for art in article_tags[1:]:
            ra = {}
            heading = art.find("h3") or art.find("h4")
            ra['headline'] = safe_text(heading) or "N/A"
            a = art.find("a", href=True)
            ra['article_link'] = make_full_link(a['href']) if a else "N/A"
            logo_img = art.find("img")
            ra['source_logo'] = logo_img.get("src") if logo_img and logo_img.get("src") else "N/A"
            # Source name: first short text (<= 5 words) among the first 10
            # div/span elements that is not the headline.
            ra_source = None
            for el in art.select("div, span")[:10]:
                txt = safe_text(el)
                if txt and len(txt.split()) <= 5 and txt != ra['headline']:
                    ra_source = txt
                    break
            ra['source_name'] = ra_source or "N/A"
            ra['author'] = "N/A"
            ra['publish_date'] = today_date
            related.append(ra)
    else:
        # No separate <article> tags: treat up to five further anchors that
        # do not repeat the primary link as related items.
        anchors = card_tag.find_all("a", href=True)
        primary_href = primary.get("article_link")
        for a in anchors[1:6]:
            href = make_full_link(a["href"])
            text = safe_text(a)
            if href == primary_href:
                continue
            related.append({
                "headline": text or "N/A",
                "article_link": href,
                "source_logo": "N/A",
                "source_name": "N/A",
                "author": "N/A",
                "publish_date": today_date
            })

    data["related_articles"] = related
    data["total_related_articles"] = len(related)

    # Replace any None left by the helpers with the "N/A" placeholder.
    for k, v in primary.items():
        if v is None:
            primary[k] = "N/A"

    return data


def scrape(url, file):
    """Fetch *url*, parse every candidate card with parse_card() and dump the result to *file*.

    Cards are selected with a CSS selector covering c-wiz elements (by
    ``jsrenderer`` or ``jsmodel``) and divs whose class contains "WHE7ib".
    A card that raises while being parsed is reported on stdout and left
    out of the output. The JSON file is written even when no card was found.
    """
    header_pool = []
    fill_headers_list(header_pool)
    page = get_soup(url, header_pool)

    # choose a flexible selector; google markup sometimes uses c-wiz or divs with certain classes
    card_selector = 'c-wiz[jsrenderer="ARwRbe"], c-wiz[jsmodel="hc6Ubd"], div[class*="WHE7ib"]'
    cards = page.select(card_selector)

    print(f"Found {len(cards)} candidate cards for {url}")

    parsed_cards = []
    for position, card in enumerate(cards):
        try:
            parsed = parse_card(card)
        except Exception as exc:
            print(f"Error parsing card {position}: {exc}")
        else:
            parsed_cards.append(parsed)

    # write json
    with open(file, "w", encoding="utf-8") as out:
        json.dump(parsed_cards, out, ensure_ascii=False, indent=2)
    print(f"✅ File {file} saved successfully with {len(parsed_cards)} items.")

## Get the entire data in a json

In [139]:
# Google News topic pages (India edition, English), one per section.
business = "https://news.google.com/topics/CAAqKggKIiRDQkFTRlFvSUwyMHZNRGx6TVdZU0JXVnVMVWRDR2dKSlRpZ0FQAQ?hl=en-IN&gl=IN&ceid=IN%3Aen"
india = "https://news.google.com/topics/CAAqJQgKIh9DQkFTRVFvSUwyMHZNRE55YXpBU0JXVnVMVWRDS0FBUAE?hl=en-IN&gl=IN&ceid=IN%3Aen"
technology = "https://news.google.com/topics/CAAqKggKIiRDQkFTRlFvSUwyMHZNRGRqTVhZU0JXVnVMVWRDR2dKSlRpZ0FQAQ?hl=en-IN&gl=IN&ceid=IN%3Aen"
world = "https://news.google.com/topics/CAAqKggKIiRDQkFTRlFvSUwyMHZNRGx1YlY4U0JXVnVMVWRDR2dKSlRpZ0FQAQ?hl=en-IN&gl=IN&ceid=IN%3Aen"
local = "https://news.google.com/topics/CAAqHAgKIhZDQklTQ2pvSWJHOWpZV3hmZGpJb0FBUAE/sections/CAQiTkNCSVNORG9JYkc5allXeGZkakpDRUd4dlkyRnNYM1l5WDNObFkzUnBiMjV5Q2hJSUwyMHZNR1JzZGpCNkNnb0lMMjB2TUdSc2RqQW9BQSowCAAqLAgKIiZDQklTRmpvSWJHOWpZV3hmZGpKNkNnb0lMMjB2TUdSc2RqQW9BQVABUAE?hl=en-IN&gl=IN&ceid=IN%3Aen"
entertainment = "https://news.google.com/topics/CAAqKggKIiRDQkFTRlFvSUwyMHZNREpxYW5RU0JXVnVMVWRDR2dKSlRpZ0FQAQ?hl=en-IN&gl=IN&ceid=IN%3Aen"
sports = "https://news.google.com/topics/CAAqKggKIiRDQkFTRlFvSUwyMHZNRFp1ZEdvU0JXVnVMVWRDR2dKSlRpZ0FQAQ?hl=en-IN&gl=IN&ceid=IN%3Aen"
science = "https://news.google.com/topics/CAAqKggKIiRDQkFTRlFvSUwyMHZNRFp0Y1RjU0JXVnVMVWRDR2dKSlRpZ0FQAQ?hl=en-IN&gl=IN&ceid=IN%3Aen"
health = "https://news.google.com/topics/CAAqJQgKIh9DQkFTRVFvSUwyMHZNR3QwTlRFU0JXVnVMVWRDS0FBUAE?hl=en-IN&gl=IN&ceid=IN%3Aen"

# Output file names, kept in the same order as the URL list below so that
# position k of one list belongs to position k of the other.
file = [
    'business_news.json',
    'india_news.json',
    'technology_news.json',
    'world_news.json',
    'local_news.json',
    'entertainment_news.json',
    'sports_news.json',
    'science_news.json',
    'health_news.json',
]
url = [
    business,
    india,
    technology,
    world,
    local,
    entertainment,
    sports,
    science,
    health,
]
n = len(file)

# Scrape every topic page into its matching JSON file.
for i, (u, f) in enumerate(zip(url, file)):
    scrape(u, f)

Found 145 candidate cards for https://news.google.com/topics/CAAqKggKIiRDQkFTRlFvSUwyMHZNRGx6TVdZU0JXVnVMVWRDR2dKSlRpZ0FQAQ?hl=en-IN&gl=IN&ceid=IN%3Aen
✅ File business_news.json saved successfully with 145 items.
Found 121 candidate cards for https://news.google.com/topics/CAAqJQgKIh9DQkFTRVFvSUwyMHZNRE55YXpBU0JXVnVMVWRDS0FBUAE?hl=en-IN&gl=IN&ceid=IN%3Aen
✅ File india_news.json saved successfully with 121 items.
Found 67 candidate cards for https://news.google.com/topics/CAAqKggKIiRDQkFTRlFvSUwyMHZNRGRqTVhZU0JXVnVMVWRDR2dKSlRpZ0FQAQ?hl=en-IN&gl=IN&ceid=IN%3Aen
✅ File technology_news.json saved successfully with 67 items.
Found 73 candidate cards for https://news.google.com/topics/CAAqKggKIiRDQkFTRlFvSUwyMHZNRGx1YlY4U0JXVnVMVWRDR2dKSlRpZ0FQAQ?hl=en-IN&gl=IN&ceid=IN%3Aen
✅ File world_news.json saved successfully with 73 items.
Found 207 candidate cards for https://news.google.com/topics/CAAqHAgKIhZDQklTQ2pvSWJHOWpZV3hmZGpJb0FBUAE/sections/CAQiTkNCSVNORG9JYkc5allXeGZkakpDRUd4dlkyRnNYM1l5W