In [18]:
import requests
from bs4 import BeautifulSoup
import json
from datetime import datetime
import time

In [19]:
import csv
import random

def get_headers():
    """Build HTTP request headers with a User-Agent picked at random from headers.csv.

    The first column of every non-empty row of ``headers.csv`` (opened relative
    to the current working directory) is treated as a candidate User-Agent.
    Candidates are stripped of surrounding whitespace, and candidates that are
    empty after stripping are discarded, so no padded or empty string is sent
    as the User-Agent.

    Returns:
        dict: The 'User-Agent', 'Accept', 'Accept-Language' and 'Connection'
        headers. A built-in Chrome User-Agent is used (and a message printed)
        when the file is missing, cannot be read, or has no usable entries.
    """
    default_user_agent = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    )

    def build_headers(user_agent):
        # Single place that defines the non-User-Agent headers, so every
        # return path stays in sync.
        return {
            'User-Agent': user_agent,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Connection': 'keep-alive',
        }

    try:
        # newline='' is what the csv module expects for files handed to csv.reader.
        with open('headers.csv', 'r', encoding='utf-8', newline='') as f:
            candidates = [row[0].strip() for row in csv.reader(f) if row]
    except FileNotFoundError:
        print("Error: headers.csv not found, using default header")
        return build_headers(default_user_agent)
    except Exception as e:
        # Best effort: any other read/decoding problem also falls back to the default.
        print(f"Error reading headers.csv: {e}, using default header")
        return build_headers(default_user_agent)

    # Drop rows whose first cell was blank (e.g. ",foo" or whitespace only).
    candidates = [user_agent for user_agent in candidates if user_agent]

    if not candidates:
        print("Warning: No headers found in CSV, using default")
        return build_headers(default_user_agent)

    # Pick a random User-Agent from the list
    return build_headers(random.choice(candidates))

In [20]:
def fetch_page(url):
    """Download ``url`` and return it parsed with BeautifulSoup's 'html.parser'.

    The request is sent with the headers from get_headers() and a 10 second
    timeout. Any exception raised while requesting or parsing (including a
    non-success status reported by raise_for_status()) is printed, and None
    is returned instead of a parsed document.
    """
    try:
        resp = requests.get(url, timeout=10, headers=get_headers())
        resp.raise_for_status()
        soup = BeautifulSoup(resp.content, 'html.parser')
    except Exception as err:
        print(f"Error fetching page: {err}")
        soup = None
    return soup

In [21]:
def parse_article_div(div):
    """Extract one article record from an article container element.

    Args:
        div: Element exposing ``find(name, class_=...)`` whose matches expose
            ``get(attr, default)``, ``find(name)`` and ``text`` (in this
            notebook, an element returned by ``soup.find_all``).

    Returns:
        dict: ``{"primary_article": {...}, "related_articles": [],
        "total_related_articles": 0}``. Every primary field starts as "N/A"
        and is overwritten only when the matching element is found.
        ``publish_date`` is formatted ``DD-MM-YYYY`` when the ``datetime``
        attribute parses as ISO 8601; otherwise the element's visible text is
        used, and "N/A" if that text is empty.

    Any unexpected exception is printed and the partially filled record is
    returned, so one malformed article does not abort a whole page.
    """
    primary = {
        "headline": "N/A",
        "author": "N/A",
        "article_link": "N/A",
        "featured_image": "N/A",
        "source_logo": "N/A",
        "source_name": "N/A",
        "publish_date": "N/A"
    }
    article_data = {
        "primary_article": primary,
        "related_articles": [],
        "total_related_articles": 0
    }

    try:
        # Extract headline and link from the anchor inside the title heading
        headline_elem = div.find('h3', class_='entry-title')
        if headline_elem:
            headline_link = headline_elem.find('a')
            if headline_link:
                primary["headline"] = headline_link.get('title', 'N/A')
                primary["article_link"] = headline_link.get('href', 'N/A')

        # Extract author
        author_elem = div.find('span', class_='td-post-author-name')
        if author_elem:
            author_link = author_elem.find('a')
            if author_link:
                primary["author"] = author_link.text.strip()

        # Extract featured image (URL is carried in a data attribute)
        image_elem = div.find('span', class_='entry-thumb')
        if image_elem:
            primary["featured_image"] = image_elem.get('data-img-url', 'N/A')

        # Extract publish date
        date_elem = div.find('time', class_='entry-date')
        if date_elem:
            formatted = None
            date_str = date_elem.get('datetime', 'N/A')
            if date_str != 'N/A':
                # Convert to desired format (24-10-2025)
                try:
                    # The '+05:30' offset is dropped before parsing, as before;
                    # only the calendar date is kept.
                    dt = datetime.fromisoformat(date_str.replace('+05:30', ''))
                    formatted = dt.strftime('%d-%m-%Y')
                except ValueError:
                    # Not ISO 8601: fall through to the visible text below.
                    formatted = None
            if formatted is None:
                # Same fallback whether the attribute is missing or unparseable;
                # keep the "N/A" sentinel when there is no text either.
                formatted = date_elem.text.strip() or "N/A"
            primary["publish_date"] = formatted

        # Source name (OpIndia in this case)
        primary["source_name"] = "OpIndia"

    except Exception as e:
        print(f"Error parsing article: {e}")

    return article_data

In [22]:
def scrape_opindia_politics(urls, delay=0.1):
    """Scrape all articles from one or more OpIndia Politics listing pages.

    Args:
        urls: A single URL string or an iterable of URL strings.
        delay: Seconds to wait between consecutive page requests. The pause
            is applied only between pages, because parsing an article is
            purely local work and issues no request.

    Returns:
        list: One record (as produced by parse_article_div) per element with
        class 'tdb_module_loop' found on the pages that could be fetched.
        Pages for which fetch_page returns nothing are reported and skipped.
    """
    # Ensure `urls` is always a list (also makes len() safe for generators)
    if isinstance(urls, str):
        urls = [urls]
    else:
        urls = list(urls)

    all_articles = []

    for page_index, url in enumerate(urls):
        if page_index:
            # Throttle between page fetches, not before the first one.
            time.sleep(delay)

        print(f"\nFetching page: {url}")
        soup = fetch_page(url)

        if not soup:
            print("Failed to fetch this page")
            continue

        article_divs = soup.find_all('div', class_='tdb_module_loop')
        total = len(article_divs)
        print(f"Found {total} articles")

        for position, div in enumerate(article_divs, start=1):
            print(f"Processing article {position}/{total}")
            all_articles.append(parse_article_div(div))

    print(f"\n✅ Scraped total {len(all_articles)} articles from {len(urls)} pages!")
    return all_articles

In [26]:
# Listing pages to scrape; add more URLs to this list to cover additional pages
urls = ["https://www.opindia.com/category/politics/"]

# Run the scraper over every listed page and keep the combined records
news_data = scrape_opindia_politics(urls)


Fetching page: https://www.opindia.com/category/politics/
Found 10 articles
Processing article 1/10
Processing article 2/10
Processing article 3/10
Processing article 4/10
Processing article 5/10
Processing article 6/10
Processing article 7/10
Processing article 8/10
Processing article 9/10
Processing article 10/10

✅ Scraped total 10 articles from 1 pages!


In [15]:
# View the first article as indented JSON
# (ensure_ascii=False prints non-ASCII characters as-is instead of \u escapes)
print(json.dumps(news_data[0], indent=2, ensure_ascii=False))

{
  "primary_article": {
    "headline": "NCERT removes ‘the great’ epithet from the names of tyrants Akbar and Tipu Sultan, Assam CM says throw ‘them in the sea’",
    "author": "OpIndia Staff",
    "article_link": "https://www.opindia.com/news-updates/ncert-removes-the-great-epithet-from-the-names-of-tyrants-akbar-and-tipu-sultan-assam-cm-says-throw-them-in-the-sea/",
    "featured_image": "https://i0.wp.com/www.opindia.com/wp-content/uploads/2025/11/Untitled-design-3-1.jpg?resize=696%2C398&ssl=1",
    "source_logo": "N/A",
    "source_name": "OpIndia",
    "publish_date": "23-11-2025"
  },
  "related_articles": [],
  "total_related_articles": 0
}


In [16]:
# View all articles (pretty printed)
# NOTE(review): this prints every scraped record; consider slicing news_data
# before dumping if many pages are scraped, to keep the cell output short.
print(json.dumps(news_data, indent=2, ensure_ascii=False))

[
  {
    "primary_article": {
      "headline": "NCERT removes ‘the great’ epithet from the names of tyrants Akbar and Tipu Sultan, Assam CM says throw ‘them in the sea’",
      "author": "OpIndia Staff",
      "article_link": "https://www.opindia.com/news-updates/ncert-removes-the-great-epithet-from-the-names-of-tyrants-akbar-and-tipu-sultan-assam-cm-says-throw-them-in-the-sea/",
      "featured_image": "https://i0.wp.com/www.opindia.com/wp-content/uploads/2025/11/Untitled-design-3-1.jpg?resize=696%2C398&ssl=1",
      "source_logo": "N/A",
      "source_name": "OpIndia",
      "publish_date": "23-11-2025"
    },
    "related_articles": [],
    "total_related_articles": 0
  },
  {
    "primary_article": {
      "headline": "Mission Trishul: How RSS mobilised its swayamsevaks to help BJP-led NDA win big in Bihar",
      "author": "OpIndia Staff",
      "article_link": "https://www.opindia.com/news-updates/mission-trishul-how-rss-mobilised-its-swayamsevaks-to-help-bjp-led-nda-win-big-in