In [1]:
# imports
import requests
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
import pandas as pd

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By

In [2]:
# scrape one WSJ page
def scrape_page(url, KEYWORD, headlines, dates, date_str, timeout=30):
    """Collect the headlines on one archive page that mention a keyword.

    Parameters
    ----------
    url : str
        Address of the page to download.
    KEYWORD : str
        Text to look for. It is matched case-insensitively as a substring of
        each headline.
    headlines : list
        Accumulator; every matching headline is appended to it in place.
    dates : list
        Accumulator; ``date_str`` is appended once per matching headline, so
        it stays index-aligned with ``headlines``.
    date_str : str
        Label stored next to every match found on this page.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` gives up
        (default 30). Without it a stalled connection would block forever.

    Returns
    -------
    tuple
        ``(headlines, dates)`` -- the same list objects that were passed in.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:80.0) Gecko/20100101 Firefox/80.0'}

    # send a GET request to the URL and parse the HTML content with Beautiful Soup
    response = requests.get(url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')

    # headlines are lowercased before the comparison, so the keyword must be too
    needle = KEYWORD.lower()

    for article in soup.select('article'):
        heading = article.h2

        # some <article> elements carry no <h2>; skip them instead of crashing
        if heading is None:
            continue

        headline = heading.text

        if needle in headline.lower():
            headlines.append(headline)
            dates.append(date_str)

    return headlines, dates

In [6]:
# read the page count shown by the archive page picker
def _read_total_pages(driver):
    """Return the number of archive pages for the page currently loaded in ``driver``.

    The count is taken from the last whitespace-separated token of the page
    picker's text. If the picker element is absent, or its text does not end
    in an integer, the day is treated as having a single page.
    """
    try:
        picker = driver.find_element(By.CSS_SELECTOR, 'span.WSJTheme--pagepicker-total--Kl350I1l')
        return int(picker.text.split()[-1])
    except (NoSuchElementException, IndexError, ValueError):
        return 1


# scrape multiple pages of a range of dates
def scrape_news_headlines(start_date, end_date, delta, KEYWORD, headlines, dates, file_name):
    """Scrape keyword-matching headlines for every date in a range and save them as CSV.

    For each date from ``start_date`` to ``end_date`` (inclusive), stepping by
    ``delta``, the first archive page is opened in a Chrome WebDriver to read
    how many pages that day has; each page is then passed to ``scrape_page``.

    Parameters
    ----------
    start_date, end_date : datetime
        Inclusive bounds of the range; formatted as ``%Y/%m/%d`` in the URL.
    delta : timedelta
        Step between consecutive dates. It must move the date forward.
    KEYWORD : str
        Forwarded unchanged to ``scrape_page``.
    headlines, dates : list
        Accumulators forwarded to ``scrape_page``; they also supply the two
        columns of the CSV.
    file_name : str
        Path of the CSV written once the whole range has been processed.

    Returns
    -------
    tuple
        ``(headlines, dates)`` as returned by the last ``scrape_page`` call.

    Raises
    ------
    ValueError
        If ``delta`` does not advance ``start_date`` (the loop could never end).
    """
    # a zero or negative step would never reach end_date
    if start_date + delta <= start_date:
        raise ValueError("delta must move start_date forward in time")

    driver = webdriver.Chrome()

    try:
        while start_date <= end_date:

            # construct the URL for the WSJ archive page for the current date
            date_str = start_date.strftime("%Y/%m/%d")
            first_page_url = f"https://www.wsj.com/news/archive/{date_str}?page=1"

            # check how many pages
            driver.get(first_page_url)
            total_pages = _read_total_pages(driver)

            if total_pages > 1:
                # scrape each page
                for page_num in range(1, total_pages + 1):
                    url = f"https://www.wsj.com/news/archive/{date_str}?page={page_num}"
                    print("Scraping page(s): ", date_str, page_num)
                    headlines, dates = scrape_page(url, KEYWORD, headlines, dates, date_str)
            else:
                print("Scraping page: ", date_str, 1)
                headlines, dates = scrape_page(first_page_url, KEYWORD, headlines, dates, date_str)

            # move on to the next day
            start_date += delta
            print("-" * 50)
    finally:
        # always shut the browser down, even when a page fails mid-run
        driver.quit()

    # Combine the two lists into a DataFrame
    df = pd.DataFrame({'date': dates, 'news_headline': headlines})

    # Save the DataFrame as a CSV file
    df.to_csv(file_name, index=False)

    return headlines, dates

# Run the cell below to scrape headlines for an individual stock
Before running it, change the keyword, the start and end dates, and the output file name.

In [7]:
# what to search for, and where the matches are saved
KEYWORD = 'zoom'
file_name = "test_zoom_stock.csv"

# inclusive date window, walked one day at a time
start_date = datetime(2020, 3, 4)
end_date = datetime(2020, 3, 5)
delta = timedelta(days=1)

# fresh accumulators: the scraper appends to whatever lists it is given
headlines, dates = [], []

# call function!
headlines, dates = scrape_news_headlines(
    start_date=start_date,
    end_date=end_date,
    delta=delta,
    KEYWORD=KEYWORD,
    headlines=headlines,
    dates=dates,
    file_name=file_name,
)

Scraping page(s):  2020/03/04 1
Scraping page(s):  2020/03/04 2
Scraping page(s):  2020/03/04 3
--------------------------------------------------
Scraping page(s):  2020/03/05 1
Scraping page(s):  2020/03/05 2
Scraping page(s):  2020/03/05 3
--------------------------------------------------
