# Web Scraping News Headlines from Financial News Websites Using BeautifulSoup

Original Websites: 
- https://www.cnbc.com/sp-500/
- https://www.reuters.com/news/archive/businessnews
- https://www.theguardian.com/business/all

Since the news is updated every day, the data used in this study can be found here: https://github.com/notlongp/news-headlines-webscrape. This script is a template for future reference.

## CNBC Scraping

In [None]:
import requests

# Base address of CNBC's S&P 500 news listing.
URL = 'https://www.cnbc.com/sp-500/'
# requests has no default timeout; bound the wait so an unresponsive
# server cannot hang the cell indefinitely.
page = requests.get(URL, timeout=30)

In [None]:
from bs4 import BeautifulSoup

# Parse the downloaded HTML and keep the first "stories-lineup bigHeader"
# container (the page is expected to hold only one of them).
soup = BeautifulSoup(page.content, 'html.parser')
results = soup.find_all("div", attrs={"class": "stories-lineup bigHeader"})[0]

In [None]:
# Example: print headline, time and preview text of the first list item
temp = results.find_all("li")
first_story = temp[0]

headline_tag = first_story.find("div", {"class": "headline"})
print(headline_tag.get_text().strip())

time_tag = first_story.find("time")
print(time_tag.get_text())

desc_tag = first_story.find("p", {"class": "desc"})
print(desc_tag.get_text())

In [None]:
# Helpers that tolerate list items lacking the expected tags
# (e.g. commercial containers)
def cnbc_headline_try_except(var):
    """Return the stripped headline text of a CNBC list item, or None.

    ``var`` is searched for a ``<div class="headline">``. If any step of
    the lookup raises AttributeError (for instance because ``find``
    returned None), None is returned instead of propagating the error.
    """
    try:
        headline_tag = var.find("div", {"class": "headline"})
        headline = headline_tag.get_text().strip()
    except AttributeError:
        return None
    return headline

def cnbc_time_try_except(var):
    """Return the text of the item's ``<time>`` tag, or None.

    The text is returned exactly as ``get_text()`` yields it (no
    stripping). An AttributeError during the lookup, e.g. when no
    ``<time>`` tag is found, results in None.
    """
    try:
        time_tag = var.find("time")
        timestamp = time_tag.get_text()
    except AttributeError:
        return None
    return timestamp
    
def cnbc_text_try_except(var):
    """Return the preview text of a CNBC list item, or None.

    Looks up ``<p class="desc">`` inside ``var`` and returns its text
    unmodified. An AttributeError during the lookup, e.g. when the
    paragraph is absent, results in None.
    """
    try:
        desc_tag = var.find("p", {"class": "desc"})
        description = desc_tag.get_text()
    except AttributeError:
        return None
    return description

In [None]:
meta = []
# Walk pages 1-140 of CNBC's news listing and record, for every list item,
# its headline, last-updated time and preview text as one row of `meta`.
# Fields that cannot be found are stored as None by the helper functions.
for i in range(1, 141):
    currURL = URL + "?page=" + str(i)
    # Bound the wait so one unresponsive request cannot stall the whole crawl.
    page = requests.get(currURL, timeout=30)
    soup = BeautifulSoup(page.content, 'html.parser')
    # find() returns the first matching container, or None when there is none.
    results = soup.find("div", {"class": "stories-lineup bigHeader"})
    if results is None:
        # No story list on this page: skip it instead of aborting the crawl
        # with an IndexError and losing the remaining pages.
        print("Skipping " + currURL + ": no story list found")
        continue
    items = results.find_all("li")
    for item in items:
        meta.append([
            cnbc_headline_try_except(item),
            cnbc_time_try_except(item),
            cnbc_text_try_except(item),
        ])

In [None]:
import pandas as pd
# Build the table with its column labels supplied at construction time, so
# the labels are in place even when `meta` is empty (assigning df.columns
# afterwards raises a length-mismatch error on an empty frame).
df = pd.DataFrame.from_records(meta, columns=["Headlines", "Time", "Description"])
df

In [None]:
# Print pandas' summary of the scraped CNBC table (DataFrame.info)
df.info()

In [None]:
# Export the CNBC table to CSV (without the index column).
# to_csv does not create missing parent folders, so make sure ./data exists.
import os

os.makedirs("./data", exist_ok=True)
df.to_csv("./data/cnbc_headlines.csv", index = False)

## Reuters Scraping

In [None]:
# The same method is applied to scrape data from Reuters
URL = 'https://www.reuters.com/news/archive/businessnews'
# requests has no default timeout; bound the wait so an unresponsive
# server cannot hang the cell indefinitely.
page = requests.get(URL, timeout=30)

In [None]:
# Parse the Reuters page and keep the first "column1 col col-10" container
soup = BeautifulSoup(page.content, 'html.parser')
results = soup.find_all("div", attrs={"class": "column1 col col-10"})[0]

In [None]:
# Example: print headline, time and preview text of the first story
temp = results.find_all("article", {"class": "story"})
first_story = temp[0]

headline_tag = first_story.find("h3")
print(headline_tag.get_text().strip())

time_tag = first_story.find("time")
print(time_tag.get_text().strip())

preview_tag = first_story.find("p")
print(preview_tag.get_text().strip())

In [None]:
def reuters_headline_try_except(var):
    """Return the stripped text of the story's ``<h3>`` tag, or None.

    An AttributeError raised during the lookup (for instance because
    ``find`` returned None) results in None instead of an exception.
    """
    try:
        headline_tag = var.find("h3")
        headline = headline_tag.get_text().strip()
    except AttributeError:
        return None
    return headline

def reuters_time_try_except(var):
    """Return the stripped text of the story's ``<time>`` tag, or None.

    An AttributeError raised during the lookup (for instance because
    ``find`` returned None) results in None instead of an exception.
    """
    try:
        time_tag = var.find("time")
        timestamp = time_tag.get_text().strip()
    except AttributeError:
        return None
    return timestamp
    
def reuters_text_try_except(var):
    """Return the stripped text of the story's first ``<p>`` tag, or None.

    An AttributeError raised during the lookup (for instance because
    ``find`` returned None) results in None instead of an exception.
    """
    try:
        preview_tag = var.find("p")
        preview = preview_tag.get_text().strip()
    except AttributeError:
        return None
    return preview

In [None]:
meta = []
# Walk pages 1-3277 of the Reuters business-news archive (noted by the
# original author as the maximum number of archive pages) and record each
# story's headline, last-updated date and preview text as one row of `meta`.
# Author's note: Reuters data only goes as far back as March 2018.
for i in range(1, 3278):
    currURL = URL + "?view=page&page=" + str(i) + "&pageSize=10"
    # Bound the wait so one unresponsive request cannot stall the whole crawl.
    page = requests.get(currURL, timeout=30)
    soup = BeautifulSoup(page.content, 'html.parser')
    # find() returns the first matching container, or None when there is none.
    results = soup.find("div", {"class": "column1 col col-10"})
    if results is None:
        # No article column on this page: skip it instead of aborting the
        # crawl with an IndexError and losing the remaining pages.
        print("Skipping " + currURL + ": no article column found")
        continue
    items = results.find_all("article", {"class": "story"})
    for item in items:
        # Use the reuters_* helpers; un-prefixed names are not defined in
        # this notebook and would raise NameError.
        meta.append([
            reuters_headline_try_except(item),
            reuters_time_try_except(item),
            reuters_text_try_except(item),
        ])

In [None]:
# Build the table with its column labels supplied at construction time, so
# the labels are in place even when `meta` is empty (assigning df.columns
# afterwards raises a length-mismatch error on an empty frame).
df = pd.DataFrame.from_records(meta, columns=["Headlines", "Time", "Description"])
df

In [None]:
# Print pandas' summary of the scraped Reuters table (DataFrame.info)
df.info()

In [None]:
# Export the Reuters table to CSV (without the index column).
# to_csv does not create missing parent folders, so make sure ./data exists.
import os

os.makedirs("./data", exist_ok=True)
df.to_csv("./data/reuters_headlines.csv", index = False)

## The Guardian Scraping

In [None]:
# The same approach is applied to the Guardian.
# However, articles from the Guardian do not contain preview text,
# so only the headlines can be scraped.
URL = 'https://www.theguardian.com/business/all'
# requests has no default timeout; bound the wait so an unresponsive
# server cannot hang the cell indefinitely.
page = requests.get(URL, timeout=30)

In [None]:
# Parse the Guardian page and collect every "fc-container__inner" block.
# Author's note: unlike the first two sites, each Guardian page holds exactly
# two of these <div> elements, so the full list is kept and indexed later.
soup = BeautifulSoup(page.content, 'html.parser')
container_filter = {"class": "fc-container__inner"}
results = soup.find_all("div", container_filter)

In [None]:
# Example: for each of the first two containers, print its date followed by
# the text of the link in its second item
item_filter = {"class": "fc-item__container"}
day1 = results[0].find_all("div", item_filter)
day2 = results[1].find_all("div", item_filter)

for container, day_items in ((results[0], day1), (results[1], day2)):
    print(container.find("time").get_text())
    print(day_items[1].find("a").get_text().strip())

In [None]:
meta = []
# Walk pages 1-890 of the Guardian's business listing and record, for every
# item, the date of the container it belongs to and its headline.
for i in range(1, 891):
    currURL = URL + "?page=" + str(i)
    # Bound the wait so one unresponsive request cannot stall the whole crawl.
    page = requests.get(currURL, timeout=30)
    soup = BeautifulSoup(page.content, 'html.parser')
    results = soup.find_all("div", {"class": "fc-container__inner"})

    for result in results:
        # Read the date from the current container, not from results[0];
        # otherwise items of later containers get the first container's date.
        time_tag = result.find("time")
        day_text = time_tag.get_text() if time_tag is not None else None
        days = result.find_all("div", {"class": "fc-item__container"})
        for day in days:
            # Record None for an item without a link instead of crashing,
            # matching how the CNBC and Reuters helpers treat missing fields.
            link = day.find("a")
            headline = link.get_text().strip() if link is not None else None
            meta.append([day_text, headline])

In [None]:
# Build the table with its column labels supplied at construction time, so
# the labels are in place even when `meta` is empty (assigning df.columns
# afterwards raises a length-mismatch error on an empty frame).
df = pd.DataFrame.from_records(meta, columns=["Time", "Headlines"])
df

In [None]:
# Print pandas' summary of the scraped Guardian table (DataFrame.info)
df.info()

In [None]:
# Export the Guardian table to CSV (without the index column).
# to_csv does not create missing parent folders, so make sure ./data exists.
import os

os.makedirs("./data", exist_ok=True)
df.to_csv("./data/guardian_headlines.csv", index = False)