In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
from datetime import datetime, timedelta

# Scraping for "ECONOMIC TIMES"

In [2]:
# Archive "starttime" value that the site uses for January 1, 2020;
# later dates are derived by adding one per elapsed day.
INITIAL_STARTTIME = 43831

# Root of the Economic Times site; archive URLs and relative links hang off it.
BASE_URL = "https://economictimes.indiatimes.com"

In [3]:
def scrape_articles_for_date(year, month, day):
    """Collect candidate article links from the archive-list page for one day.

    The page URL is built from the module-level ``BASE_URL`` and a
    ``starttime`` counter equal to ``INITIAL_STARTTIME`` plus the number of
    days elapsed since January 1, 2020.

    Args:
        year: Four-digit calendar year.
        month: Month number (1-12).
        day: Day of the month.

    Returns:
        A list of dicts with the keys ``'Media Name'``, ``'Article Link'`` and
        ``'Date'`` (formatted ``YYYY-MM-DD``), one per matching anchor found on
        the page. An empty list is returned when the server responds with a
        status code other than 200.

    Raises:
        ValueError: If ``year``/``month``/``day`` is not a valid calendar date.
        requests.exceptions.RequestException: On connection failure or when
            the request times out.
    """
    # Calculate starttime based on the date
    date_str = f'{year}-{month:02}-{day:02}'
    date_object = datetime(year, month, day)

    # Calculate the number of days since January 1, 2020
    days_since_start = (date_object - datetime(2020, 1, 1)).days

    # Calculate starttime
    starttime = INITIAL_STARTTIME + days_since_start

    # Construct the URL for the specific date
    url = f'{BASE_URL}/archivelist/year-{year},month-{month},starttime-{starttime}.cms'
    print(f'Scraping URL: {url}')

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Referer': BASE_URL,
        'Connection': 'keep-alive'
    }

    # Without a timeout a single stalled connection would block the whole
    # crawl indefinitely; a timeout surfaces as an exception instead.
    response = requests.get(url, headers=headers, timeout=30)

    if response.status_code != 200:
        print(f"Failed to retrieve data from {url} - Status code: {response.status_code}")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')

    # Scheme prefix of BASE_URL (e.g. 'https:'), used for protocol-relative links.
    scheme = BASE_URL.split('//', 1)[0]

    article_links = []
    for anchor in soup.find_all('a', href=True):
        link = anchor['href']

        # Resolve the href to an absolute URL; skip anything that is neither
        # site-relative nor an http(s) link (e.g. 'javascript:' or '#').
        if link.startswith('//'):
            # Protocol-relative link: it already names its host, so only the
            # scheme is missing. Prepending BASE_URL would yield a bogus URL.
            full_link = scheme + link
        elif link.startswith('/'):
            full_link = BASE_URL + link
        elif link.startswith('http'):
            full_link = link
        else:
            continue

        # Filter criteria to exclude ads or unrelated links
        if "article" in full_link or "news" in full_link:  # Adjust this condition based on actual URL patterns
            article_links.append({
                'Media Name': 'THE ECONOMIC TIMES',
                'Article Link': full_link,
                'Date': date_str
            })

    return article_links

In [4]:
def scrape_articles(start_year=2020, end_year=2024):
    """Scrape article links for every calendar day in a range of years.

    Walks day by day from January 1 of ``start_year`` through December 31 of
    ``end_year`` (inclusive), calling ``scrape_articles_for_date`` for each day
    and pausing a random 1-3 seconds between days.

    Args:
        start_year: First year to scrape (inclusive).
        end_year: Last year to scrape (inclusive).

    Returns:
        A flat list of the records returned by ``scrape_articles_for_date``
        across all days. Days that raise an exception are reported on stdout
        and contribute no records. An empty list is returned when
        ``start_year`` is greater than ``end_year``.
    """
    all_articles = []

    # Real date arithmetic keeps month lengths and leap years correct for any
    # year, unlike a plain ``year % 4`` test.
    current = datetime(start_year, 1, 1)
    last_day = datetime(end_year, 12, 31)
    one_day = timedelta(days=1)

    while current <= last_day:
        year, month, day = current.year, current.month, current.day
        try:
            articles = scrape_articles_for_date(year, month, day)
            if articles:
                all_articles.extend(articles)
        except Exception as e:
            print(f"Error on {year}-{month:02}-{day:02}: {e}")

        # Pause after failures too, so a run of errors (for example the server
        # throttling us) does not turn into back-to-back requests.
        time.sleep(random.uniform(1, 3))  # Random delay between requests
        current += one_day

    return all_articles

In [5]:
# Crawl every archive day from 2020 through 2024 (one request per day, so this is slow)
all_data = scrape_articles(2020, 2024)

# Tabulate the collected link records
df = pd.DataFrame(data=all_data)

# Report completion and preview the first rows
print("Scraping completed.", df.head(), sep="\n")

Scraping URL: https://economictimes.indiatimes.com/archivelist/year-2024,month-1,starttime-45292.cms
Scraping URL: https://economictimes.indiatimes.com/archivelist/year-2024,month-1,starttime-45293.cms
Scraping URL: https://economictimes.indiatimes.com/archivelist/year-2024,month-1,starttime-45294.cms
Scraping URL: https://economictimes.indiatimes.com/archivelist/year-2024,month-1,starttime-45295.cms
Scraping URL: https://economictimes.indiatimes.com/archivelist/year-2024,month-1,starttime-45296.cms
Scraping URL: https://economictimes.indiatimes.com/archivelist/year-2024,month-1,starttime-45297.cms
Scraping URL: https://economictimes.indiatimes.com/archivelist/year-2024,month-1,starttime-45298.cms
Scraping URL: https://economictimes.indiatimes.com/archivelist/year-2024,month-1,starttime-45299.cms
Scraping URL: https://economictimes.indiatimes.com/archivelist/year-2024,month-1,starttime-45300.cms
Scraping URL: https://economictimes.indiatimes.com/archivelist/year-2024,month-1,starttime-

Scraping URL: https://economictimes.indiatimes.com/archivelist/year-2024,month-3,starttime-45374.cms
Scraping URL: https://economictimes.indiatimes.com/archivelist/year-2024,month-3,starttime-45375.cms
Scraping URL: https://economictimes.indiatimes.com/archivelist/year-2024,month-3,starttime-45376.cms
Scraping URL: https://economictimes.indiatimes.com/archivelist/year-2024,month-3,starttime-45377.cms
Scraping URL: https://economictimes.indiatimes.com/archivelist/year-2024,month-3,starttime-45378.cms
Scraping URL: https://economictimes.indiatimes.com/archivelist/year-2024,month-3,starttime-45379.cms
Scraping URL: https://economictimes.indiatimes.com/archivelist/year-2024,month-3,starttime-45380.cms
Scraping URL: https://economictimes.indiatimes.com/archivelist/year-2024,month-3,starttime-45381.cms
Scraping URL: https://economictimes.indiatimes.com/archivelist/year-2024,month-3,starttime-45382.cms
Scraping URL: https://economictimes.indiatimes.com/archivelist/year-2024,month-4,starttime-

Scraping URL: https://economictimes.indiatimes.com/archivelist/year-2024,month-6,starttime-45456.cms
Scraping URL: https://economictimes.indiatimes.com/archivelist/year-2024,month-6,starttime-45457.cms
Scraping URL: https://economictimes.indiatimes.com/archivelist/year-2024,month-6,starttime-45458.cms
Scraping URL: https://economictimes.indiatimes.com/archivelist/year-2024,month-6,starttime-45459.cms
Scraping URL: https://economictimes.indiatimes.com/archivelist/year-2024,month-6,starttime-45460.cms
Scraping URL: https://economictimes.indiatimes.com/archivelist/year-2024,month-6,starttime-45461.cms
Scraping URL: https://economictimes.indiatimes.com/archivelist/year-2024,month-6,starttime-45462.cms
Scraping URL: https://economictimes.indiatimes.com/archivelist/year-2024,month-6,starttime-45463.cms
Scraping URL: https://economictimes.indiatimes.com/archivelist/year-2024,month-6,starttime-45464.cms
Scraping URL: https://economictimes.indiatimes.com/archivelist/year-2024,month-6,starttime-

Scraping URL: https://economictimes.indiatimes.com/archivelist/year-2024,month-9,starttime-45538.cms
Scraping URL: https://economictimes.indiatimes.com/archivelist/year-2024,month-9,starttime-45539.cms
Scraping URL: https://economictimes.indiatimes.com/archivelist/year-2024,month-9,starttime-45540.cms
Scraping URL: https://economictimes.indiatimes.com/archivelist/year-2024,month-9,starttime-45541.cms
Scraping URL: https://economictimes.indiatimes.com/archivelist/year-2024,month-9,starttime-45542.cms
Scraping URL: https://economictimes.indiatimes.com/archivelist/year-2024,month-9,starttime-45543.cms
Scraping URL: https://economictimes.indiatimes.com/archivelist/year-2024,month-9,starttime-45544.cms
Scraping URL: https://economictimes.indiatimes.com/archivelist/year-2024,month-9,starttime-45545.cms
Scraping URL: https://economictimes.indiatimes.com/archivelist/year-2024,month-9,starttime-45546.cms
Scraping URL: https://economictimes.indiatimes.com/archivelist/year-2024,month-9,starttime-

Scraping URL: https://economictimes.indiatimes.com/archivelist/year-2024,month-11,starttime-45619.cms
Scraping URL: https://economictimes.indiatimes.com/archivelist/year-2024,month-11,starttime-45620.cms
Scraping URL: https://economictimes.indiatimes.com/archivelist/year-2024,month-11,starttime-45621.cms
Scraping URL: https://economictimes.indiatimes.com/archivelist/year-2024,month-11,starttime-45622.cms
Scraping URL: https://economictimes.indiatimes.com/archivelist/year-2024,month-11,starttime-45623.cms
Scraping URL: https://economictimes.indiatimes.com/archivelist/year-2024,month-11,starttime-45624.cms
Scraping URL: https://economictimes.indiatimes.com/archivelist/year-2024,month-11,starttime-45625.cms
Scraping URL: https://economictimes.indiatimes.com/archivelist/year-2024,month-11,starttime-45626.cms
Scraping URL: https://economictimes.indiatimes.com/archivelist/year-2024,month-12,starttime-45627.cms
Scraping URL: https://economictimes.indiatimes.com/archivelist/year-2024,month-12,

In [6]:
# Number of scraped link rows
df.shape[0]

129491

In [7]:
# Persist the link table to disk; the row index is omitted from the file
df.to_csv(
    path_or_buf='economic_times_articles_2020_to_2024.csv',
    index=False,
)

print("Scraping completed and saved to CSV.")

Scraping completed and saved to CSV.


# Scraping for 'TIMES OF INDIA'

In [8]:
# Archive "starttime" value that the site uses for January 1, 2020;
# later dates are derived by adding one per elapsed day.
INITIAL_STARTTIME = 43831

# Root of the Times of India site; archive URLs and relative links hang off it.
# NOTE: this rebinds the BASE_URL used by the Economic Times cells above.
BASE_URL = "https://timesofindia.indiatimes.com"

In [9]:
def scrape_articles_for_date(year, month, day):
    """Collect candidate article links from the archive-list page for one day.

    The page URL is built from the module-level ``BASE_URL``, the date's
    ``year/month/day`` path segments, and a ``starttime`` counter equal to
    ``INITIAL_STARTTIME`` plus the number of days elapsed since
    January 1, 2020.

    Args:
        year: Four-digit calendar year.
        month: Month number (1-12).
        day: Day of the month.

    Returns:
        A list of dicts with the keys ``'Media Name'``, ``'Article Link'`` and
        ``'Date'`` (formatted ``YYYY-MM-DD``), one per matching anchor found on
        the page. An empty list is returned when the server responds with a
        status code other than 200.

    Raises:
        ValueError: If ``year``/``month``/``day`` is not a valid calendar date.
        requests.exceptions.RequestException: On connection failure or when
            the request times out.
    """
    # Calculate starttime based on the date
    date_str = f'{year}-{month:02}-{day:02}'
    date_object = datetime(year, month, day)

    # Calculate the number of days since January 1, 2020
    days_since_start = (date_object - datetime(2020, 1, 1)).days

    # Calculate starttime
    starttime = INITIAL_STARTTIME + days_since_start

    # Construct the URL for the specific date
    url = f'{BASE_URL}/{year}/{month}/{day}/archivelist/year-{year},month-{month},starttime-{starttime}.cms'
    print(f'Scraping URL: {url}')

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Referer': BASE_URL,
        'Connection': 'keep-alive'
    }

    # Without a timeout a single stalled connection would block the whole
    # crawl indefinitely; a timeout surfaces as an exception instead.
    response = requests.get(url, headers=headers, timeout=30)

    if response.status_code != 200:
        print(f"Failed to retrieve data from {url} - Status code: {response.status_code}")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')

    # Scheme prefix of BASE_URL (e.g. 'https:'), used for protocol-relative links.
    scheme = BASE_URL.split('//', 1)[0]

    article_links = []
    for anchor in soup.find_all('a', href=True):
        link = anchor['href']

        # Resolve the href to an absolute URL; skip anything that is neither
        # site-relative nor an http(s) link (e.g. 'javascript:' or '#').
        if link.startswith('//'):
            # Protocol-relative link: it already names its host, so only the
            # scheme is missing. Prepending BASE_URL would yield a bogus URL.
            full_link = scheme + link
        elif link.startswith('/'):
            full_link = BASE_URL + link
        elif link.startswith('http'):
            full_link = link
        else:
            continue

        # Filter criteria to exclude ads or unrelated links
        if "articles" in full_link or "news" in full_link:  # Adjust this condition based on actual URL patterns
            article_links.append({
                'Media Name': 'THE TIMES OF INDIA',
                'Article Link': full_link,
                'Date': date_str
            })

    return article_links

In [10]:
def scrape_articles(start_year=2020, end_year=2024):
    """Scrape article links for every calendar day in a range of years.

    Walks day by day from January 1 of ``start_year`` through December 31 of
    ``end_year`` (inclusive), calling ``scrape_articles_for_date`` for each day
    and pausing a random 1-3 seconds between days.

    Args:
        start_year: First year to scrape (inclusive).
        end_year: Last year to scrape (inclusive).

    Returns:
        A flat list of the records returned by ``scrape_articles_for_date``
        across all days. Days that raise an exception are reported on stdout
        and contribute no records. An empty list is returned when
        ``start_year`` is greater than ``end_year``.
    """
    all_articles = []

    # Real date arithmetic keeps month lengths and leap years correct for any
    # year, unlike a plain ``year % 4`` test.
    current = datetime(start_year, 1, 1)
    last_day = datetime(end_year, 12, 31)
    one_day = timedelta(days=1)

    while current <= last_day:
        year, month, day = current.year, current.month, current.day
        try:
            articles = scrape_articles_for_date(year, month, day)
            if articles:
                all_articles.extend(articles)
        except Exception as e:
            print(f"Error on {year}-{month:02}-{day:02}: {e}")

        # Pause after failures too, so a run of errors (for example the server
        # throttling us) does not turn into back-to-back requests.
        time.sleep(random.uniform(1, 3))  # Random delay between requests
        current += one_day

    return all_articles

In [11]:
# Crawl every archive day from 2020 through 2024 (one request per day, so this is slow)
all_data = scrape_articles(2020, 2024)

# Tabulate the collected link records
df = pd.DataFrame(data=all_data)

# Report completion and preview the first rows
print("Scraping completed.", df.head(), sep="\n")

Scraping URL: https://timesofindia.indiatimes.com/2024/1/1/archivelist/year-2024,month-1,starttime-45292.cms
Scraping URL: https://timesofindia.indiatimes.com/2024/1/2/archivelist/year-2024,month-1,starttime-45293.cms
Scraping URL: https://timesofindia.indiatimes.com/2024/1/3/archivelist/year-2024,month-1,starttime-45294.cms
Scraping URL: https://timesofindia.indiatimes.com/2024/1/4/archivelist/year-2024,month-1,starttime-45295.cms
Scraping URL: https://timesofindia.indiatimes.com/2024/1/5/archivelist/year-2024,month-1,starttime-45296.cms
Scraping URL: https://timesofindia.indiatimes.com/2024/1/6/archivelist/year-2024,month-1,starttime-45297.cms
Scraping URL: https://timesofindia.indiatimes.com/2024/1/7/archivelist/year-2024,month-1,starttime-45298.cms
Scraping URL: https://timesofindia.indiatimes.com/2024/1/8/archivelist/year-2024,month-1,starttime-45299.cms
Scraping URL: https://timesofindia.indiatimes.com/2024/1/9/archivelist/year-2024,month-1,starttime-45300.cms
Scraping URL: https

Scraping URL: https://timesofindia.indiatimes.com/2024/3/16/archivelist/year-2024,month-3,starttime-45367.cms
Scraping URL: https://timesofindia.indiatimes.com/2024/3/17/archivelist/year-2024,month-3,starttime-45368.cms
Scraping URL: https://timesofindia.indiatimes.com/2024/3/18/archivelist/year-2024,month-3,starttime-45369.cms
Scraping URL: https://timesofindia.indiatimes.com/2024/3/19/archivelist/year-2024,month-3,starttime-45370.cms
Scraping URL: https://timesofindia.indiatimes.com/2024/3/20/archivelist/year-2024,month-3,starttime-45371.cms
Scraping URL: https://timesofindia.indiatimes.com/2024/3/21/archivelist/year-2024,month-3,starttime-45372.cms
Scraping URL: https://timesofindia.indiatimes.com/2024/3/22/archivelist/year-2024,month-3,starttime-45373.cms
Scraping URL: https://timesofindia.indiatimes.com/2024/3/23/archivelist/year-2024,month-3,starttime-45374.cms
Scraping URL: https://timesofindia.indiatimes.com/2024/3/24/archivelist/year-2024,month-3,starttime-45375.cms
Scraping U

Scraping URL: https://timesofindia.indiatimes.com/2024/5/30/archivelist/year-2024,month-5,starttime-45442.cms
Scraping URL: https://timesofindia.indiatimes.com/2024/5/31/archivelist/year-2024,month-5,starttime-45443.cms
Scraping URL: https://timesofindia.indiatimes.com/2024/6/1/archivelist/year-2024,month-6,starttime-45444.cms
Scraping URL: https://timesofindia.indiatimes.com/2024/6/2/archivelist/year-2024,month-6,starttime-45445.cms
Scraping URL: https://timesofindia.indiatimes.com/2024/6/3/archivelist/year-2024,month-6,starttime-45446.cms
Scraping URL: https://timesofindia.indiatimes.com/2024/6/4/archivelist/year-2024,month-6,starttime-45447.cms
Scraping URL: https://timesofindia.indiatimes.com/2024/6/5/archivelist/year-2024,month-6,starttime-45448.cms
Scraping URL: https://timesofindia.indiatimes.com/2024/6/6/archivelist/year-2024,month-6,starttime-45449.cms
Scraping URL: https://timesofindia.indiatimes.com/2024/6/7/archivelist/year-2024,month-6,starttime-45450.cms
Scraping URL: htt

Scraping URL: https://timesofindia.indiatimes.com/2024/8/13/archivelist/year-2024,month-8,starttime-45517.cms
Scraping URL: https://timesofindia.indiatimes.com/2024/8/14/archivelist/year-2024,month-8,starttime-45518.cms
Scraping URL: https://timesofindia.indiatimes.com/2024/8/15/archivelist/year-2024,month-8,starttime-45519.cms
Scraping URL: https://timesofindia.indiatimes.com/2024/8/16/archivelist/year-2024,month-8,starttime-45520.cms
Scraping URL: https://timesofindia.indiatimes.com/2024/8/17/archivelist/year-2024,month-8,starttime-45521.cms
Scraping URL: https://timesofindia.indiatimes.com/2024/8/18/archivelist/year-2024,month-8,starttime-45522.cms
Scraping URL: https://timesofindia.indiatimes.com/2024/8/19/archivelist/year-2024,month-8,starttime-45523.cms
Scraping URL: https://timesofindia.indiatimes.com/2024/8/20/archivelist/year-2024,month-8,starttime-45524.cms
Scraping URL: https://timesofindia.indiatimes.com/2024/8/21/archivelist/year-2024,month-8,starttime-45525.cms
Scraping U

Scraping URL: https://timesofindia.indiatimes.com/2024/10/27/archivelist/year-2024,month-10,starttime-45592.cms
Scraping URL: https://timesofindia.indiatimes.com/2024/10/28/archivelist/year-2024,month-10,starttime-45593.cms
Scraping URL: https://timesofindia.indiatimes.com/2024/10/29/archivelist/year-2024,month-10,starttime-45594.cms
Scraping URL: https://timesofindia.indiatimes.com/2024/10/30/archivelist/year-2024,month-10,starttime-45595.cms
Scraping URL: https://timesofindia.indiatimes.com/2024/10/31/archivelist/year-2024,month-10,starttime-45596.cms
Scraping URL: https://timesofindia.indiatimes.com/2024/11/1/archivelist/year-2024,month-11,starttime-45597.cms
Scraping URL: https://timesofindia.indiatimes.com/2024/11/2/archivelist/year-2024,month-11,starttime-45598.cms
Scraping URL: https://timesofindia.indiatimes.com/2024/11/3/archivelist/year-2024,month-11,starttime-45599.cms
Scraping URL: https://timesofindia.indiatimes.com/2024/11/4/archivelist/year-2024,month-11,starttime-45600.

In [12]:
# Number of scraped link rows
df.shape[0]

167898

In [13]:
# Persist the link table to disk; the row index is omitted from the file
df.to_csv(
    path_or_buf='times_of_india_articles_2020_to_2024.csv',
    index=False,
)

print("Scraping completed and saved to CSV.")

Scraping completed and saved to CSV.
