In [21]:
import csv
import json
import random
from datetime import datetime
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Category key -> Hindustan Times listing page fetched for that category.
# scrape_hindustan_times() looks its URL up here, and the driver loop iterates
# this dict, so the insertion order below is the order categories are scraped in.
CATEGORY_URLS = {
    "india": "https://www.hindustantimes.com/india-news",
    "entertainment": "https://www.hindustantimes.com/entertainment/bollywood",
    "world": "https://www.hindustantimes.com/world-news",
    "business": "https://www.hindustantimes.com/business",
    "science": "https://www.hindustantimes.com/science",
    "technology": "https://www.hindustantimes.com/technology",
    "tamil":"https://www.hindustantimes.com/entertainment/tamil-cinema",
    "health": "https://www.hindustantimes.com/lifestyle/health",
    "lifestyle":"https://www.hindustantimes.com/lifestyle",
    "football":"https://www.hindustantimes.com/sports/football"
}
# Category key -> JSON file that scraped articles for that category are appended to.
# Several keys deliberately share one file (e.g. "entertainment", "tamil" and
# "lifestyle" all write to entertainment_news.json).
# Keys without an entry in CATEGORY_URLS ("politics", "sports", "real_estate",
# "agriculture", "cities", "internet") are not reached by the scraping loop in
# this notebook — presumably they are used by other scrapers; confirm before removing.
JSON_NAMES = {
    "india": "india_news.json",
    "entertainment": "entertainment_news.json",
    "politics": "politics_news.json", 
    "sports": "sports_news.json",
    "world": "world_news.json",
    "business": "business_news.json",
    "technology": "technology_news.json",
    "health":"health_news.json",
    "science":"science_news.json",
    "real_estate":"local_news.json",
    "agriculture": "india_news.json",
    "cities": "india_news.json",
    "internet": "technology_news.json",
    "tamil":"entertainment_news.json",
    "lifestyle":"entertainment_news.json",
    "football":"sports_news.json"
}

In [22]:
import json
import random
import requests
from bs4 import BeautifulSoup
from datetime import datetime

# Build the pool of request headers that get_header() draws from.
# headers.csv must provide "User-Agent" and "Accept-Language" columns; a row
# missing either column raises KeyError.
header_list = []
# newline="" is what the csv module requires of the file object so that quoted
# fields containing line breaks are parsed correctly.
with open("headers.csv", "r", encoding="utf-8", newline="") as headers_file:
    header_list.extend(
        {
            "User-Agent": record["User-Agent"],
            "Accept-Language": record["Accept-Language"],
        }
        for record in csv.DictReader(headers_file)
    )

def get_header():
    """Return one randomly chosen header dict from the module-level ``header_list``.

    Raises:
        IndexError: If ``header_list`` is empty.
    """
    chosen_header = random.choice(header_list)
    return chosen_header

In [25]:
def _load_existing_articles(json_path):
    """Return the list already stored at ``json_path``, or ``[]`` if there is none.

    A missing file, a blank (zero-length or whitespace-only) file, and a file
    whose top-level JSON value is not a list all yield an empty list. Malformed
    JSON still raises ``json.JSONDecodeError`` so a damaged file is never
    silently overwritten.
    """
    try:
        with open(json_path, "r", encoding="utf-8") as f:
            raw_text = f.read()
    except FileNotFoundError:
        return []

    if not raw_text.strip():
        return []

    stored = json.loads(raw_text)
    return stored if isinstance(stored, list) else []


def _stored_links(entries):
    """Collect the ``primary_article.article_link`` strings present in ``entries``."""
    links = set()
    for entry in entries:
        primary = entry.get("primary_article") if isinstance(entry, dict) else None
        link = primary.get("article_link") if isinstance(primary, dict) else None
        if isinstance(link, str):
            links.add(link)
    return links


def scrape_hindustan_times(index, timeout=30):
    """Scrape one Hindustan Times category page and append new articles to its JSON file.

    The page URL comes from ``CATEGORY_URLS[index]`` and the output file from
    ``JSON_NAMES[index]``. Articles whose link is already present in the output
    file (or repeated within the same page) are not appended again, so the cell
    can be re-run without piling up duplicates.

    Args:
        index: Category key; must exist in both ``CATEGORY_URLS`` and ``JSON_NAMES``.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        list[dict]: Every article record parsed from the page on this call,
        including records that were already stored and therefore not re-appended.

    Raises:
        KeyError: If ``index`` is missing from ``CATEGORY_URLS`` or ``JSON_NAMES``.
        requests.RequestException: If the request fails, times out, or the
            server responds with an HTTP error status.
        json.JSONDecodeError: If the existing output file contains malformed JSON.
    """
    url = CATEGORY_URLS[index]
    json_path = JSON_NAMES[index]
    base_url = "https://www.hindustantimes.com/"

    print(f"Scraping Hindustan Times: {url}")

    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(url, headers=get_header(), timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page into zero articles.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # Article cards are direct <div> children of #dataHolder.
    data_holder = soup.select_one("#dataHolder")
    cards = []
    if data_holder is not None:
        # The first five direct children are skipped — presumably non-article
        # page furniture; TODO confirm against the current page layout.
        cards = data_holder.find_all("div", recursive=False)[5:]

    # The page does not expose a per-article date here, so the scrape date is
    # recorded instead; computed once because it is the same for every card.
    publish_date = datetime.now().strftime("%Y-%m-%d")

    articles = []
    for card in cards:
        try:
            # Title
            title_tag = card.select_one("h3, h2")
            title = title_tag.get_text(strip=True) if title_tag else None

            # URL — urljoin keeps absolute hrefs intact and resolves relative
            # ones against the site root, unlike plain string concatenation.
            link_tag = card.select_one("a.storylink, a.storylink.articleClick, a")
            href = link_tag.get("href") if link_tag else None
            article_url = urljoin(base_url, href.strip()) if href else None

            # Image — lazy-loaded images keep the real URL in data-src.
            img_tag = card.select_one("figure img, img")
            image_url = None
            if img_tag:
                image_url = img_tag.get("data-src") or img_tag.get("src")

            if title and article_url:
                articles.append({
                    "primary_article": {
                        "headline": title,
                        "author": "N/A",
                        "article_link": article_url,
                        "featured_image": image_url if image_url else "N/A",
                        "source_logo": "N/A",
                        "source_name": "Hindustan Times",
                        "publish_date": publish_date,
                        "summary": "N/A"
                    },
                    "related_articles": [],
                    "total_related_articles": 0
                })

        except Exception as e:
            # Best effort: one malformed card must not abort the whole page.
            print("Error parsing article:", e)
            continue

    # -------------------------------------------
    # APPEND TO JSON INSTEAD OF OVERWRITING
    # -------------------------------------------
    existing_data = _load_existing_articles(json_path)

    # Only append links that are not stored yet (several categories share a file,
    # and re-running the notebook would otherwise duplicate every article).
    known_links = _stored_links(existing_data)
    new_articles = []
    for article in articles:
        link = article["primary_article"]["article_link"]
        if link not in known_links:
            known_links.add(link)
            new_articles.append(article)

    existing_data.extend(new_articles)

    # Save updated list
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(existing_data, f, indent=4, ensure_ascii=False)

    print(f"APPENDED {len(new_articles)} articles → {json_path}")

    return articles

In [26]:
# Scrape every configured category in declaration order. A failure in one
# category is reported and the loop moves on to the next one.
for index in CATEGORY_URLS:
    try:
        banner = f"\n========== Scraping: {index.upper()} =========="
        print(banner)
        scrape_hindustan_times(index)
    except Exception as scrape_error:
        print(f"Error scraping {index}: {scrape_error}")


Scraping Hindustan Times: https://www.hindustantimes.com/india-news
APPENDED 29 articles → india_news.json

Scraping Hindustan Times: https://www.hindustantimes.com/entertainment/bollywood
APPENDED 29 articles → entertainment_news.json

Scraping Hindustan Times: https://www.hindustantimes.com/world-news
APPENDED 29 articles → world_news.json

Scraping Hindustan Times: https://www.hindustantimes.com/business
APPENDED 29 articles → business_news.json

Scraping Hindustan Times: https://www.hindustantimes.com/science
APPENDED 29 articles → science_news.json

Scraping Hindustan Times: https://www.hindustantimes.com/technology
APPENDED 29 articles → technology_news.json

Scraping Hindustan Times: https://www.hindustantimes.com/entertainment/tamil-cinema
APPENDED 29 articles → entertainment_news.json

Scraping Hindustan Times: https://www.hindustantimes.com/lifestyle/health
APPENDED 29 articles → health_news.json

Scraping Hindustan Times: https://www.hindustantimes.com/lifestyle
APPENDED 29