## 1. Get random headers out of a csv list of 20 random headers

In [26]:
import csv
import requests
import random

# Load headers from CSV
headers_list = []
with open('headers.csv', 'r', newline='', encoding='utf-8') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        # Remove empty values to avoid sending None/empty headers
        headers = {k: v for k, v in row.items() if v}
        headers_list.append(headers)

## 2. Get soup content

In [27]:
from bs4 import BeautifulSoup
import time
url = "https://news.google.com/topics/CAAqKggKIiRDQkFTRlFvSUwyMHZNRGx6TVdZU0JXVnVMVWRDR2dKSlRpZ0FQAQ?hl=en-IN&gl=IN&ceid=IN%3Aen"
headers = random.choice(headers_list)
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, "html.parser")

## 3. Get content in cards

In [28]:
cards = soup.find_all("c-wiz", {
    "jsrenderer": "ARwRbe",
    "jsmodel": "hc6Ubd",
    "class": "PO9Zff Ccj79 kUVvS"
})

print(f"Found {len(cards)} news cards.\n")

news_cards_html = [str(card) for card in cards]

print(news_cards_html[0][:100], "...")

Found 71 news cards.

<c-wiz c-wiz="" class="PO9Zff Ccj79 kUVvS" data-node-index="1;0" data-p="%.@.]" jsdata="deferred-i18 ...


## 4. Remove the outer covering and getting the useful HTML within 

In [29]:
inner_divs = []

for card in cards:
    # find the inner <c-wiz> inside each outer c-wiz
    inner_cwiz = card.find("c-wiz", recursive=True)
    if inner_cwiz:
        # find the first <div> inside the <div> of the inner c-wiz
        # structure: <c-wiz> -> <div> -> <div>
        first_div = inner_cwiz.find("div", recursive=False)
        if first_div:
            second_div = first_div.find("div", recursive=False)
            if second_div:
                inner_divs.append(second_div)

print(f"Extracted {len(inner_divs)} inner divs.")

# Convert them to HTML strings if needed
inner_divs_html = [str(div) for div in inner_divs]

# Example: print one
print(inner_divs_html[0][:500], "...")

Extracted 69 inner divs.
<div class="VGgDjd"><a class="taS2Yb Mi07Bb hpDt6e XPO28d" href="https://www.google.com/search?q=INDEXBOM+SENSEX&amp;hl=en-GB&amp;source=news" jslog="181750; track:click" target="_blank"><svg class="JMZsIc NMm5M" focusable="false" height="24" role="presentation" viewbox="0 0 24 24" width="24"><path d="M4 12l1.41 1.41L11 7.83V20h2V7.83l5.58 5.59L20 12l-8-8-8 8z"></path></svg><div class="IWF7Od"><div class="xWTvcb">Sensex</div><div class="vvivme">84,556.40</div></div><div class="IWF7Od"><div class ...


## 5. This is function which takes a card's HTML and returns a dictionary of dictionaries which is arranged like the DOM's structure

In [30]:
def element_to_dom(element):
    """Recursively convert a BeautifulSoup element to a nested dictionary structure."""
    dom = {}

    for child in element.children:
        # Skip text nodes that are just whitespace
        if child.name is None:
            text = child.strip()
            if text:
                dom["text"] = text
            continue

        tag = child.name

        # Determine value based on tag type
        if tag == "a":
            value = child.get("href")
        elif tag == "img":
            value = child.get("src")
        elif tag == "svg":
            value = str(child)
        elif tag in ["h1", "h2", "h3", "h4", "h5", "h6", "p", "span", "time", "button"]:
            value = child.decode_contents().strip()
        else:
            # If element has nested tags, recurse
            nested = element_to_dom(child)
            value = nested if nested else child.decode_contents().strip()

        # Handle multiple same-tag children by converting value into a list
        if tag in dom:
            if isinstance(dom[tag], list):
                dom[tag].append(value)
            else:
                dom[tag] = [dom[tag], value]
        else:
            dom[tag] = value

    return dom


# Assuming inner_divs_html is already available
dom_structures = []

for html in inner_divs_html:
    soup = BeautifulSoup(html, "html.parser")
    dom_dict = element_to_dom(soup)
    dom_structures.append(dom_dict)

## 6. To better view the structure of a single card's HTML

In [31]:
import json
print(json.dumps(dom_structures, indent=2, ensure_ascii=False))

[
  {
    "div": {
      "a": [
        "https://www.google.com/search?q=INDEXBOM+SENSEX&hl=en-GB&source=news",
        "https://www.google.com/search?q=INDEXNSE+NIFTY_50&hl=en-GB&source=news",
        "https://www.google.com/search?q=INDEXNIKKEI+NI225&hl=en-GB&source=news",
        "https://www.google.com/search?q=SHA+000001&hl=en-GB&source=news"
      ]
    }
  },
  {
    "div": {
      "article": {
        "div": [
          {
            "a": "./read/CBMi3wFBVV95cUxPUkNxdTZWQ2VMZGdHVE82ZHMzaHV5YlVScW1QVGZCU2V3eVVvYlhva1F1UDcxZnhRN3BsVW9DamcyZnlaejEwM0hHbElHa2NjdnBka0ZCVjhFU1QzV0JhU0pLd0lPUHBacS1ZTjluakk5TWdLYXBvdmR4cS04UDZjendwMHJDbU8xdk1CNGdjTk9iRm05blNJa0hNVktCMGtqVHVNV3JacmpEc1FueFRKQldneTBISWh0ZjdKRFNLbVdOSVNqTWhXZ3JfR19NWWZQMDA4cFRJS1ZTVWhiMXRV0gHkAUFVX3lxTE1Zd0NtTE8wVXZIMlI4YzNqTE9PVVNzT29TWlpKb2Nka0NsLXA5NjljY242Y2VJbWEzUk15UDgtazNzR2lrRVRZUnhWUkFBQU5ENzMtaV9rSVV5bGltVW1ZWkM1ZnQtRG5BQ3hCRnR4SlJGS2FfckZRZ3R3eC1SMkZNaEhwNDUwSVBtMlVvNkdfR3V3VDdBS0JVNnRna3VGVm1iOWlVbzNCd19rRm1uQ

## Just for viewing purpose, ignore and move on 

In [32]:
dom_structures[1]["div"]["article"]

{'div': [{'a': './read/CBMi3wFBVV95cUxPUkNxdTZWQ2VMZGdHVE82ZHMzaHV5YlVScW1QVGZCU2V3eVVvYlhva1F1UDcxZnhRN3BsVW9DamcyZnlaejEwM0hHbElHa2NjdnBka0ZCVjhFU1QzV0JhU0pLd0lPUHBacS1ZTjluakk5TWdLYXBvdmR4cS04UDZjendwMHJDbU8xdk1CNGdjTk9iRm05blNJa0hNVktCMGtqVHVNV3JacmpEc1FueFRKQldneTBISWh0ZjdKRFNLbVdOSVNqTWhXZ3JfR19NWWZQMDA4cFRJS1ZTVWhiMXRV0gHkAUFVX3lxTE1Zd0NtTE8wVXZIMlI4YzNqTE9PVVNzT29TWlpKb2Nka0NsLXA5NjljY242Y2VJbWEzUk15UDgtazNzR2lrRVRZUnhWUkFBQU5ENzMtaV9rSVV5bGltVW1ZWkM1ZnQtRG5BQ3hCRnR4SlJGS2FfckZRZ3R3eC1SMkZNaEhwNDUwSVBtMlVvNkdfR3V3VDdBS0JVNnRna3VGVm1iOWlVbzNCd19rRm1uQ2c2c19aeURBMVNhRnVSMTJ3OXNZM3Y5NHcxRkdrekxGUVVtMS1DeHZlOFBkOWM0T1Q1am1uSw?hl=en-IN&gl=IN&ceid=IN%3Aen'},
  {'div': [{'img': 'https://encrypted-tbn1.gstatic.com/faviconV2?url=https://www.livemint.com&client=NEWS_360&size=96&type=FAVICON&fallback_opts=TYPE,SIZE,URL',
     'div': {'div': {'text': 'Mint'}}},
    {'div': {'div': {'div': [{'span': '<button aria-expanded="false" aria-haspopup="menu" aria-label="More - Crypto-exchange Binan

## just for viewing purpose, ignore and move on

In [33]:
dom_structures[1]["div"]["div"]

{'article': [{'div': [{'a': './read/CBMiWkFVX3lxTE5McDRBaTdEUXNKVmhRbmRxU2szYzN5MDk0MHpPOGZhdGFmWnpxRGR5TDFGNjFWa252WE55anBmclpnbWx0SjNfbWVVVUUtME42SV9nN05nY0RoUQ?hl=en-IN&gl=IN&ceid=IN%3Aen'},
    {'div': [{'img': 'https://encrypted-tbn2.gstatic.com/faviconV2?url=https://www.bbc.com&client=NEWS_360&size=96&type=FAVICON&fallback_opts=TYPE,SIZE,URL',
       'div': {'div': {'text': 'BBC'}}},
      {'div': {'div': {'div': [{'span': '<button aria-expanded="false" aria-haspopup="menu" aria-label="More - President Trump pardons Binance founder Changpeng Zhao" class="VfPpkd-Bz112c-LgbsSe yHy1rc eT1oJ mN1ivc hUJSud" data-idom-class="yHy1rc eT1oJ mN1ivc hUJSud" data-n-tid="27" data-tooltip-enabled="true" data-tooltip-id="tt-i23" jsaction="click:cOuCgd; mousedown:UX7yZ; mouseup:lbsD7e; mouseenter:tfO1Yc; mouseleave:JywGue; touchstart:p6p2H; touchmove:FwuNnf; touchend:yfqBxc; touchcancel:JMtRjd; focus:AHmuwe; blur:O22p3e; contextmenu:mg9Pef;mlnRJb:fLiPzd;" jscontroller="soHxf"><div class="VfPpkd-

## The articles to the links are given in relative form, so to open them we have to add a prefix as shown

In [34]:
url_to_search = "https://news.google.com" + dom_structures[1]["div"]["article"]["div"][0]["a"][1:]
url_to_search

'https://news.google.com/read/CBMi3wFBVV95cUxPUkNxdTZWQ2VMZGdHVE82ZHMzaHV5YlVScW1QVGZCU2V3eVVvYlhva1F1UDcxZnhRN3BsVW9DamcyZnlaejEwM0hHbElHa2NjdnBka0ZCVjhFU1QzV0JhU0pLd0lPUHBacS1ZTjluakk5TWdLYXBvdmR4cS04UDZjendwMHJDbU8xdk1CNGdjTk9iRm05blNJa0hNVktCMGtqVHVNV3JacmpEc1FueFRKQldneTBISWh0ZjdKRFNLbVdOSVNqTWhXZ3JfR19NWWZQMDA4cFRJS1ZTVWhiMXRV0gHkAUFVX3lxTE1Zd0NtTE8wVXZIMlI4YzNqTE9PVVNzT29TWlpKb2Nka0NsLXA5NjljY242Y2VJbWEzUk15UDgtazNzR2lrRVRZUnhWUkFBQU5ENzMtaV9rSVV5bGltVW1ZWkM1ZnQtRG5BQ3hCRnR4SlJGS2FfckZRZ3R3eC1SMkZNaEhwNDUwSVBtMlVvNkdfR3V3VDdBS0JVNnRna3VGVm1iOWlVbzNCd19rRm1uQ2c2c19aeURBMVNhRnVSMTJ3OXNZM3Y5NHcxRkdrekxGUVVtMS1DeHZlOFBkOWM0T1Q1am1uSw?hl=en-IN&gl=IN&ceid=IN%3Aen'

In [35]:
import webbrowser
url_to_search

webbrowser.open_new_tab(url_to_search)

True

## 7. This function takes a dictionary of dictionaries and returns the data in more usable format which will be the format of the json

In [36]:
from datetime import datetime

def get_news_data(dom_structure):
    """
    Extracts structured news data from a single dom_structure[i] element.
    Returns a dictionary containing:
      - primary_article (headline, link, author, etc.)
      - related_articles (list of dictionaries)
      - total_related_articles (count)
    """
    BASE_URL = "https://news.google.com"
    today_date = datetime.now().strftime("%d-%m-%Y")  # current date in dd-mm-yyyy

    # --- PRIMARY ARTICLE ---
    article = dom_structure.get("div", {}).get("article", {})
    news_data = {
        'headline': article.get('div', [])[2].get('div', {}).get('span', 'N/A')
        if isinstance(article.get('div', []), list) and len(article.get('div', [])) > 2 else 'N/A',

        'author': article.get('div', [])[2].get('div', {}).get('span', 'N/A')
        if isinstance(article.get('div', []), list) and len(article.get('div', [])) > 2 else 'N/A',

        'article_link': (
            BASE_URL + article.get('a', '')
            if article.get('a') not in (None, 'N/A') else 'N/A'
        ),

        'featured_image': article.get('figure', {}).get('img', 'N/A'),

        'source_logo': (
            article.get('div', [])[1].get('div', [])[0].get('img', 'N/A')
            if isinstance(article.get('div', []), list)
            and len(article.get('div', [])) > 1
            and isinstance(article.get('div', [])[1].get('div', []), list)
            else 'N/A'
        ),

        'source_name': (
            article.get('div', [])[1]
            .get('div', [])[0]
            .get('div', {})
            .get('div', {})
            .get('text', 'N/A')
            if isinstance(article.get('div', []), list)
            and len(article.get('div', [])) > 1
            and isinstance(article.get('div', [])[1].get('div', []), list)
            else 'N/A'
        ),

        # --- NEW FIELD ---
        'publish_date': today_date
    }

    # --- RELATED ARTICLES ---
    related_articles = []
    possible_related = dom_structure.get("div", {}).get("div", {}).get("article", [])

    if isinstance(possible_related, list):
        for article in possible_related:
            if not isinstance(article, dict):
                continue

            divs = article.get('div', [])
            if not isinstance(divs, list) or len(divs) < 3:
                continue

            raw_link = article.get('a', '')
            full_link = BASE_URL + raw_link if raw_link not in (None, 'N/A') else 'N/A'

            article_data = {
                'headline': divs[2].get('div', {}).get('span', 'N/A'),
                'author': divs[2].get('div', {}).get('span', 'N/A'),
                'article_link': full_link,
                'source_logo': (
                    divs[1].get('div', [])[0].get('img', 'N/A')
                    if len(divs) > 1 and isinstance(divs[1].get('div', []), list)
                    else 'N/A'
                ),
                'source_name': (
                    divs[1].get('div', [])[0]
                    .get('div', {})
                    .get('div', {})
                    .get('text', 'N/A')
                    if len(divs) > 1 and isinstance(divs[1].get('div', []), list)
                    else 'N/A'
                ),

                # --- NEW FIELD ---
                'publish_date': today_date
            }
            related_articles.append(article_data)

    # --- FINAL OUTPUT ---
    complete_news_data = {
        'primary_article': news_data,
        'related_articles': related_articles,
        'total_related_articles': len(related_articles)
    }

    return complete_news_data

## printing the format in which the json stores the data for a single news card

In [37]:
news_item = get_news_data(dom_structures[2])
import pprint
pprint.pprint(news_item)

{'primary_article': {'article_link': 'https://news.google.com./read/CBMiwwFBVV95cUxPUVdOQkxVdXEwdURtTlB5ZEQxS2ZtczRsSWhCbFhIWG1LbTN6d3U1cFFlMUxBdlFyS2V5R01uZ2Nwa19DNTgyTmdsQXpaMW05Mmg0bjRpeDdJbWJRWlJWUDdoYUlWRENydDV5LVBkMjE2NTdyOHBuNE5TLVRZS05ya1N2RDNmbXZJS2lCNlpONlFPTFk0b1JYOUZkWW80bnVXYWpRQW5HUzVaMnFiXzRhVzh0UzBvQ2I3Y25SeDBqdUFMM0k?hl=en-IN&gl=IN&ceid=IN%3Aen',
                     'author': 'By Sam Tobin',
                     'featured_image': '/api/attachments/CC8iK0NnNDBTbWszVlRoeVprdHlSelp3VFJERUF4aW1CU2dLTWdhbGRJcXRRUVU=-w280-h168-p-df',
                     'headline': 'By Sam Tobin',
                     'publish_date': '24-10-2025',
                     'source_logo': 'https://encrypted-tbn2.gstatic.com/faviconV2?url=https://www.reuters.com&client=NEWS_360&size=96&type=FAVICON&fallback_opts=TYPE,SIZE,URL',
                     'source_name': 'Reuters'},
 'related_articles': [{'article_link': 'https://news.google.com./read/CBMinAFBVV95cUxQMEF6cVduVUl2UGZDcWpPSkpYbVgwY0UxalBOO

## 8. Saving the data in the json

In [38]:
import json

# Assuming get_news_data() is already defined as before

business_news_data = []

# Loop through all items in dom_structures
for i, dom in enumerate(dom_structures):
    try:
        news_item = get_news_data(dom)
        business_news_data.append(news_item)
        print(f"Processed item {i + 1}/{len(dom_structures)}")
    except Exception as e:
        print(f"Error processing item {i + 1}: {e}")

# Save to JSON file
with open("business_news.json", "w", encoding="utf-8") as f:
    json.dump(business_news_data, f, ensure_ascii=False, indent=2)

print("\n✅ File 'business_news.json' saved successfully in the working directory!")

Processed item 1/69
Processed item 2/69
Processed item 3/69
Processed item 4/69
Processed item 5/69
Processed item 6/69
Processed item 7/69
Processed item 8/69
Processed item 9/69
Processed item 10/69
Processed item 11/69
Processed item 12/69
Processed item 13/69
Processed item 14/69
Processed item 15/69
Processed item 16/69
Processed item 17/69
Processed item 18/69
Processed item 19/69
Processed item 20/69
Processed item 21/69
Processed item 22/69
Processed item 23/69
Processed item 24/69
Processed item 25/69
Processed item 26/69
Processed item 27/69
Processed item 28/69
Processed item 29/69
Processed item 30/69
Processed item 31/69
Processed item 32/69
Processed item 33/69
Processed item 34/69
Processed item 35/69
Processed item 36/69
Processed item 37/69
Processed item 38/69
Processed item 39/69
Processed item 40/69
Processed item 41/69
Processed item 42/69
Processed item 43/69
Processed item 44/69
Processed item 45/69
Processed item 46/69
Processed item 47/69
Processed item 48/69
P