## Get random headers + soup content + content in cards

In [1]:
import csv
import requests
import random
from bs4 import BeautifulSoup
import time

# Load headers from CSV
headers_list = []
with open('headers.csv', 'r', newline='', encoding='utf-8') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        # Remove empty values to avoid sending None/empty headers
        headers = {k: v for k, v in row.items() if v}
        headers_list.append(headers)

url = "https://news.google.com/topics/CAAqJQgKIh9DQkFTRVFvSUwyMHZNRE55YXpBU0JXVnVMVWRDS0FBUAE?hl=en-IN&gl=IN&ceid=IN%3Aen"
headers = random.choice(headers_list)
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, "html.parser")

cards = soup.find_all("c-wiz", {
    "jsrenderer": "ARwRbe",
    "jsmodel": "hc6Ubd",
    "class": "PO9Zff Ccj79 kUVvS"
})

print(f"Found {len(cards)} news cards.\n")

news_cards_html = [str(card) for card in cards]

#Print the first one
print(news_cards_html[0][:100], "...")

Found 60 news cards.

<c-wiz c-wiz="" class="PO9Zff Ccj79 kUVvS" data-node-index="1;0" data-p="%.@.]" jsdata="deferred-i12 ...


## remove the outer covering + get dictionary of dictionaries in an array

In [2]:
inner_divs = []

for card in cards:
    # find the inner <c-wiz> inside each outer c-wiz
    inner_cwiz = card.find("c-wiz", recursive=True)
    if inner_cwiz:
        # find the first <div> inside the <div> of the inner c-wiz
        # structure: <c-wiz> -> <div> -> <div>
        first_div = inner_cwiz.find("div", recursive=False)
        if first_div:
            second_div = first_div.find("div", recursive=False)
            if second_div:
                inner_divs.append(second_div)

print(f"Extracted {len(inner_divs)} inner divs.")

# Convert them to HTML strings if needed
inner_divs_html = [str(div) for div in inner_divs]

def element_to_dom(element):
    """Recursively convert a BeautifulSoup element to a nested dictionary structure."""
    dom = {}

    for child in element.children:
        # Skip text nodes that are just whitespace
        if child.name is None:
            text = child.strip()
            if text:
                dom["text"] = text
            continue

        tag = child.name

        # Determine value based on tag type
        if tag == "a":
            value = child.get("href")
        elif tag == "img":
            value = child.get("src")
        elif tag == "svg":
            value = str(child)
        elif tag in ["h1", "h2", "h3", "h4", "h5", "h6", "p", "span", "time", "button"]:
            value = child.decode_contents().strip()
        else:
            # If element has nested tags, recurse
            nested = element_to_dom(child)
            value = nested if nested else child.decode_contents().strip()

        # Handle multiple same-tag children by converting value into a list
        if tag in dom:
            if isinstance(dom[tag], list):
                dom[tag].append(value)
            else:
                dom[tag] = [dom[tag], value]
        else:
            dom[tag] = value

    return dom


# Assuming inner_divs_html is already available
dom_structures = []

for html in inner_divs_html:
    soup = BeautifulSoup(html, "html.parser")
    dom_dict = element_to_dom(soup)
    dom_structures.append(dom_dict)

Extracted 59 inner divs.


In [3]:
import json
print(json.dumps(dom_structures, indent=2, ensure_ascii=False))

[
  {
    "div": {
      "article": {
        "div": [
          {
            "a": "./read/CBMigwJBVV95cUxOdWh5azM5OWRIenk5OFh1T2NDY2FMSmk5eUFaUHh3eFVMakxjQVNsNnFadVVjbDlKanp0UVdHeFFUa2tQRU1nWVhVUWZiZWxDVGo0cVFFcmVpRUZJUk1EUlVmVkFrYllJX3g0YmZXRjYzR1RJYTMwLUpadlRoZmk3LWpyVTJ2bWQ5azZKT0pwSHBrdzRodnpidV9fTHRRZnJzbDFVaGs2ZlktNU5kUzhVN1BHZ2EwQ1czWjQzTDRPZklua2lqQVZid0VzVUg5ZXJ1MUYyZ1IzcDlPNS1lOEtQRkxaMVVESmxQWmVabkotcTBhMHlpVVJtOVNXUVFaV2kxY3Zn?hl=en-IN&gl=IN&ceid=IN%3Aen"
          },
          {
            "div": [
              {
                "img": "https://encrypted-tbn1.gstatic.com/faviconV2?url=https://www.newindianexpress.com&client=NEWS_360&size=96&type=FAVICON&fallback_opts=TYPE,SIZE,URL",
                "div": {
                  "div": {
                    "text": "The New Indian Express"
                  }
                }
              },
              {
                "div": {
                  "div": {
                    "div": [
                      {
          

## json ready the data

In [4]:
from datetime import datetime

def get_news_data(dom_structure):
    """
    Extracts structured news data from a single dom_structure[i] element.
    Returns a dictionary containing:
      - primary_article (headline, link, author, etc.)
      - related_articles (list of dictionaries)
      - total_related_articles (count)
    """
    BASE_URL = "https://news.google.com"
    today_date = datetime.now().strftime("%d-%m-%Y")  # current date in dd-mm-yyyy

    def extract_headline(article_div):
        """Look for <a> tag with class 'gPFEn' and return its aria-label"""
        divs = article_div.get("div", [])
        if not isinstance(divs, list):
            return "N/A"
        for d in divs:
            a_tag = d.get("a", {})
            if isinstance(a_tag, dict):
                cls = a_tag.get("class", "")
                if cls == "gPFEn":
                    return a_tag.get("aria-label", "N/A")
        return "N/A"
    article = dom_structure.get("div", {}).get("article", {})
    print(article)
    news_data = {
        'headline': extract_headline(article),

        'author': article.get('div', [])[2].get('div', {}).get('span', 'N/A')
        if isinstance(article.get('div', []), list) and len(article.get('div', [])) > 2 else 'N/A',

        'article_link': (
            BASE_URL + article.get('a', '')
            if article.get('a') not in (None, 'N/A') else 'N/A'
        ),

        'featured_image': article.get('figure', {}).get('img', 'N/A'),

        'source_logo': (
            article.get('div', [])[1].get('div', [])[0].get('img', 'N/A')
            if isinstance(article.get('div', []), list)
            and len(article.get('div', [])) > 1
            and isinstance(article.get('div', [])[1].get('div', []), list)
            else 'N/A'
        ),

        'source_name': (
            article.get('div', [])[1]
            .get('div', [])[0]
            .get('div', {})
            .get('div', {})
            .get('text', 'N/A')
            if isinstance(article.get('div', []), list)
            and len(article.get('div', [])) > 1
            and isinstance(article.get('div', [])[1].get('div', []), list)
            else 'N/A'
        ),

        # --- NEW FIELD ---
        'publish_date': today_date
    }

    # --- RELATED ARTICLES ---
    related_articles = []
    possible_related = dom_structure.get("div", {}).get("div", {}).get("article", [])

    if isinstance(possible_related, list):
        for article in possible_related:
            if not isinstance(article, dict):
                continue

            divs = article.get('div', [])
            if not isinstance(divs, list) or len(divs) < 3:
                continue

            raw_link = article.get('a', '')
            full_link = BASE_URL + raw_link if raw_link not in (None, 'N/A') else 'N/A'

            article_data = {
                'headline': extract_headline(divs[2].get('div', {})),
                'author': divs[2].get('div', {}).get('span', 'N/A'),
                'article_link': full_link,
                'source_logo': (
                    divs[1].get('div', [])[0].get('img', 'N/A')
                    if len(divs) > 1 and isinstance(divs[1].get('div', []), list)
                    else 'N/A'
                ),
                'source_name': (
                    divs[1].get('div', [])[0]
                    .get('div', {})
                    .get('div', {})
                    .get('text', 'N/A')
                    if len(divs) > 1 and isinstance(divs[1].get('div', []), list)
                    else 'N/A'
                ),

                'publish_date': today_date
            }
            related_articles.append(article_data)

    # --- FINAL OUTPUT ---
    complete_news_data = {
        'primary_article': news_data,
        'related_articles': related_articles,
        'total_related_articles': len(related_articles)
    }

    return complete_news_data

business_news_data = []

# Loop through all items in dom_structures
for i, dom in enumerate(dom_structures):
    try:
        news_item = get_news_data(dom)
        business_news_data.append(news_item)
        print(f"Processed item {i + 1}/{len(dom_structures)}")
    except Exception as e:
        print(f"Error processing item {i + 1}: {e}")

# Save to JSON file
with open("india_news.json", "w", encoding="utf-8") as f:
    json.dump(business_news_data, f, ensure_ascii=False, indent=2)

print("\n✅ File 'india_news.json' saved successfully in the working directory!")

{'div': [{'a': './read/CBMigwJBVV95cUxOdWh5azM5OWRIenk5OFh1T2NDY2FMSmk5eUFaUHh3eFVMakxjQVNsNnFadVVjbDlKanp0UVdHeFFUa2tQRU1nWVhVUWZiZWxDVGo0cVFFcmVpRUZJUk1EUlVmVkFrYllJX3g0YmZXRjYzR1RJYTMwLUpadlRoZmk3LWpyVTJ2bWQ5azZKT0pwSHBrdzRodnpidV9fTHRRZnJzbDFVaGs2ZlktNU5kUzhVN1BHZ2EwQ1czWjQzTDRPZklua2lqQVZid0VzVUg5ZXJ1MUYyZ1IzcDlPNS1lOEtQRkxaMVVESmxQWmVabkotcTBhMHlpVVJtOVNXUVFaV2kxY3Zn?hl=en-IN&gl=IN&ceid=IN%3Aen'}, {'div': [{'img': 'https://encrypted-tbn1.gstatic.com/faviconV2?url=https://www.newindianexpress.com&client=NEWS_360&size=96&type=FAVICON&fallback_opts=TYPE,SIZE,URL', 'div': {'div': {'text': 'The New Indian Express'}}}, {'div': {'div': {'div': [{'span': '<button aria-expanded="false" aria-haspopup="menu" aria-label="More - 21 killed as Hyderabad-Bengaluru private bus catches fire in Andhra; Police collect DNA from charred bodies" class="VfPpkd-Bz112c-LgbsSe yHy1rc eT1oJ mN1ivc hUJSud" data-idom-class="yHy1rc eT1oJ mN1ivc hUJSud" data-n-tid="27" data-tooltip-enabled="true" data-tooltip-i

In [5]:
news_item = get_news_data(dom_structures[2])
import pprint
pprint.pprint(news_item)

{'div': [{'a': './read/CBMi4gFBVV95cUxOa1AwcVJUQVZNLTJnRUZkWEdYamk0MUZhLWQyWWthd2F5VjNJRkRuWDhYQnYtYTBBTkx1NW1QY242alJUSDRaRTgwUlNRamhpN2ViYktBd2JoYkxNMHk4Vnlna2toQTdiUWJNdmpWdTkzbFM5Z3lRUDQ0dVZHVjFwcmtiTTFCRXhsTXBpUUZteXpzSVQ3dVJLZ1dJMy1CdHhFODJkQ0paOG9ud3owVEp1Szk2TVlJcnM4aFJEVVYxWmFFeVRIVHZSaW1rZV8wcEhUX0c3R1lqS2JodHlUY3lUNi1n?hl=en-IN&gl=IN&ceid=IN%3Aen'}, {'div': [{'img': 'https://encrypted-tbn2.gstatic.com/faviconV2?url=https://timesofindia.indiatimes.com&client=NEWS_360&size=96&type=FAVICON&fallback_opts=TYPE,SIZE,URL', 'div': {'div': {'text': 'Times of India'}}}, {'div': {'div': {'div': [{'span': '<button aria-expanded="false" aria-haspopup="menu" aria-label="More - Bihar elections: Is it official? Nitish Kumar to be NDA\'s CM face — What PM Modi said" class="VfPpkd-Bz112c-LgbsSe yHy1rc eT1oJ mN1ivc hUJSud" data-idom-class="yHy1rc eT1oJ mN1ivc hUJSud" data-n-tid="27" data-tooltip-enabled="true" data-tooltip-id="tt-i26" jsaction="click:cOuCgd; mousedown:UX7yZ; mouseup:lbsD7e; mo