In [1]:
import requests
import json
import pandas as pd
import re

from datetime import datetime
from bs4 import BeautifulSoup

In [8]:
def format_dates(date_input):
    """
    This function takes a date input which could be 'latest', a single date,
    or a date range, and returns a list of dates in the datetime format.
    """
    today_date = datetime.today()

    if date_input.lower() == "latest":
        return [today_date]
    
    date_parts = date_input.split("|")
    
    if len(date_parts) == 1:
        return [datetime.strptime(date_parts[0], '%d-%m-%Y')]
    
    elif len(date_parts) == 2:
        return [
            datetime.strptime(date_parts[0], '%d-%m-%Y'),
            datetime.strptime(date_parts[1], '%d-%m-%Y')
        ]
    
    else:
        raise ValueError("Invalid date input format")

In [9]:
# Examples of using the function
print(format_dates("latest"))  # Example 1: 'latest'
print(format_dates("16-09-2023"))  # Example 2: '16-09-2023'
print(format_dates("16-09-2023|20-09-2023"))  # Example 3: '16-09-2023|20-09-2023'

[datetime.datetime(2024, 6, 30, 1, 52, 40, 389385)]
[datetime.datetime(2023, 9, 16, 0, 0)]
[datetime.datetime(2023, 9, 16, 0, 0), datetime.datetime(2023, 9, 20, 0, 0)]


In [28]:
base_url, data = "https://www.dailyfx.com/archive/" , {"title" : [], "date" : []}

month_start, year_start = date[0].month, date[0].year

if len(date) == 1:
    month_end, year_end = month_start + 1, year_start + 1

else:
    month_end = month_start + 1 if (date[1].month == month_start) else date[1].month + 1
    year_end = year_start + 1 if (date[1].year == year_start) else date[1].year + 1


# Loop over the years from 2019 to 2023
for year in range(year_start, year_end):
    # Loop over the months from January to November
    for month in range(month_start, month_end):
        # Format the month as '01', '02', ..., '09', '10', '11'
        formatted_month = f"{month:02d}"
        
        # Create the complete URL
        url = f"{base_url}{year}/{formatted_month}"
        html_text = requests.get(url).text
        soup = BeautifulSoup(html_text, 'lxml')
        titles = soup.find_all('section', class_='my-6')
        
        for j in titles:
            date = datetime.strptime(j.find('h2', class_ = 'text-black dfx-h-3').text, "%d %B, %Y (%A)").strftime("%Y-%m-%d")
            news = j.find_all("span", class_ = 'dfx-articleListItem__title')
            headlines = []
            for j in news:
                data["date"].append(date)
                data["title"].append(j.text)
                headlines.append(j.text)

            print(f"Date : {date}")
            print(f"News : {headlines}\n")


Date : 2023-09-30
News : ['Japanese Yen Q4 Fundamental Forecast: Bearish Kick-off, Year-End Revival Chance', 'Crude Oil Q4 Technical Forecast: How High Can it Go?', 'Euro Q4 Fundamental Forecast: EUR/USD in Peril on Growing Economic Risks', 'Bitcoin Technical Outlook: Price Action Remains Choppy Heading into Q4']

Date : 2023-09-29
News : ['Australian Dollar Q4 Fundamental Forecast: AUD/USD, AUD/JPY', 'British Pound Q4 Technical Forecast: GBP/USD, EUR/GBP, GBP/JPY', 'S&P 500 Futures Largely Unchanged as the Fed’s Preferred Gauge of Inflation Cools to 3.9%', 'Gold Prices Bounce Likely Short-Lived As US Rates, China Import Move Both Weigh', '\u200b\u200b\u200bNikkei 225, FTSE 100 and S&P 500 Try to Recover Into Month End\u200b\u200b\u200b', 'GBP/USD Price Forecast: Pound Rallies on Positive UK GDP Report', 'Asia Day Ahead: Gold at March 2023 low, USD/JPY Hovers Below Key 150.00 Level', 'Australian Dollar Update: AUD/USD, AUD/JPY Soar but is There Enough Momentum to Sustain?']

Date : 202

In [15]:
for year in range(year_start, year_end):
    # Loop over the months from January to November
    for month in range(month_start, month_end):
        print(month)

In [29]:
def date_to_integer(dates):
    """
    This function converts datetime objects to integer dates based on a reference date.
    Reference date: 43466 represents 01 January 2019.
    """
    reference_date = datetime(2019, 1, 1)
    reference_integer = 43466
    
    integer_dates = [(reference_integer + (date - reference_date).days) for date in dates]
    return integer_dates

In [32]:
date_to_integer(format_dates("01-01-2018|01-01-2019"))

[43101, 43466]

In [35]:
# Parameters
base_url, data = "https://www.wsj.com/news/archive/" , {"title" : [], "date" : [], "url" : [], "category" : []}
HEADERS = {
'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'
}

start, end = date_to_integer(date)

for index in range(start, end + 1):
    url = f'https://economictimes.indiatimes.com/archivelist/starttime-{index}.cms'
    response = requests.get(url, headers=HEADERS).text
    soup = BeautifulSoup(response, 'lxml')
    cols = soup.find_all('ul', class_='content')
    date_obj = soup.find('td', class_ = 'contentbox5').find_all('b')[1].text
    date = datetime.strptime(date_obj, "%d %b, %Y").strftime("%Y-%m-%d")
    print(f"Processing date {date}")

    for i in range(2):
        for j in cols[i]:
            # Get title and URL
            urlhead  = 'https://economictimes.indiatimes.com/'
            urlbody = str(j.find('a').get('href'))
            itemstitle = j.find('a').text
            pattern = r"\/([^\/]+)\/([^\/]+)\/([^\/]+)\/"

            try:
                matches = re.search(pattern, urlbody).groups()
                
                for x in matches:
                    if x in ["banking", "economy", "market", "forex"]:
                        data["title"].append(itemstitle)
                        data["url"].append(urlhead + urlbody)
                        data["category"].append(x) 
                        data["date"].append(date)
                        break

                    else:
                        continue
            except:
                continue

# Export to DataFrame
final_df = pd.DataFrame(data)

Processing date 2019-01-01
Processing date 2019-01-02
