# Web Scraping News Headlines from CNBC Using BeautifulSoup

Original Website: https://www.cnbc.com/sp-500/

In [1]:
import requests

# Landing page of the section being scraped.
URL = 'https://www.cnbc.com/sp-500/'

# Without an explicit timeout, requests can wait indefinitely on an unresponsive server.
page = requests.get(URL, timeout=30)
# Fail here with the HTTP status instead of parsing an error page further down.
page.raise_for_status()

In [2]:
from bs4 import BeautifulSoup

soup = BeautifulSoup(page.content, 'html.parser')

# find_all returns a (possibly empty) list of matching containers.
lineups = soup.find_all("div", {"class": "stories-lineup bigHeader"})
if not lineups:
    # Same exception type the bare [0] lookup would raise, but with a message
    # that says what was missing and where.
    raise IndexError(
        'no <div class="stories-lineup bigHeader"> found in the response for ' + URL
    )
results = lineups[0] # since there's only one "stories-lineup bigHeader" class per page

## Examples

In [3]:
# Collect every <li> inside the lineup container; the cells below inspect individual items.
temp = results.find_all(name="li")

In [4]:
# Headline of the first list item, with surrounding whitespace removed.
temp[0].find("div", attrs={"class": "headline"}).text.strip()

'Jim Cramer: A better way to invest in the Covid-19 vaccine gold rush'

In [5]:
# Raw (untrimmed) text of the first list item's <time> element.
temp[0].find("time").text

' 7:51  PM ET Fri, 17 July 2020'

In [6]:
# Description paragraph of the first list item.
temp[0].find("p", attrs={"class": "desc"}).text

'"Mad Money" host Jim Cramer recommended buying four companies that are supporting vaccine developers.'

In [7]:
# Headline of the second list item, with surrounding whitespace removed.
temp[1].find("div", attrs={"class": "headline"}).text.strip()

"Cramer's lightning round: I would own Teradyne"

In [8]:
# Raw (untrimmed) text of the second list item's <time> element.
temp[1].find("time").text

' 7:33  PM ET Fri, 17 July 2020'

In [9]:
# Description paragraph of the second list item.
temp[1].find("p", attrs={"class": "desc"}).text

'"Mad Money" host Jim Cramer rings the lightning round bell, which means he\'s giving his answers to callers\' stock questions at rapid speed.'

## Functions to Tolerate Commercial Containers (Return `None` When a Field Is Missing)

In [10]:
def headline_try_except(var):
    """Return the stripped headline text found in ``var``, or ``None``.

    Looks up the first ``<div class="headline">`` under ``var`` and returns its
    text with leading/trailing whitespace removed.  Any ``AttributeError``
    raised along the way (for example because ``find`` returned ``None``)
    results in ``None`` instead of an exception.
    """
    try:
        headline_div = var.find("div", {"class": "headline"})
        headline = headline_div.get_text().strip()
    except AttributeError:
        # No usable headline element in this item.
        return None
    return headline

def time_try_except(var):
    """Return the whitespace-normalised text of the ``<time>`` element in ``var``, or ``None``.

    Any ``AttributeError`` raised during the lookup (for example because
    ``find`` returned ``None``) results in ``None`` instead of an exception.

    The raw text can carry a leading space and doubled inner spaces
    (e.g. ``' 7:51  PM ET Fri, 17 July 2020'``); runs of whitespace are
    collapsed to single spaces and the ends are trimmed, so the value is as
    clean as the headline helper's output.
    """
    try:
        raw_text = var.find("time").get_text()
    except AttributeError:
        # No <time> element in this item.
        return None
    # Normalise outside the try block so only the lookup failure is tolerated.
    return " ".join(raw_text.split())
    
def text_try_except(var):
    """Return the text of the ``<p class="desc">`` element in ``var``, or ``None``.

    The text is returned exactly as ``get_text()`` produces it.  Any
    ``AttributeError`` raised along the way (for example because ``find``
    returned ``None``) results in ``None`` instead of an exception.
    """
    try:
        description = var.find("p", {"class": "desc"}).get_text()
    except AttributeError:
        # No description paragraph in this item.
        description = None
    return description

## Loop Over All Pages

In [11]:
# Listing pages are requested as ?page=1 .. ?page=N_PAGES.
# NOTE(review): 140 is carried over from the previously hard-coded range(1, 141) —
# confirm it still matches the number of pages the site serves.
N_PAGES = 140
REQUEST_TIMEOUT = 30  # seconds; without one a stalled request would block the whole run

meta = []           # one [headline, time, description] row per <li>
skipped_pages = []  # (page number, reason) for pages that could not be scraped

# A Session reuses the underlying connection across the many requests.
with requests.Session() as session:
    for page_number in range(1, N_PAGES + 1):
        try:
            # params= produces the same "<URL>?page=<n>" address as manual concatenation.
            response = session.get(URL, params={"page": page_number}, timeout=REQUEST_TIMEOUT)
            response.raise_for_status()
        except requests.RequestException as exc:
            # One bad page should not discard everything scraped so far.
            skipped_pages.append((page_number, repr(exc)))
            continue

        # Loop-local names so the `page`, `soup` and `results` objects used by the
        # example cells are not overwritten by this cell.
        page_soup = BeautifulSoup(response.content, 'html.parser')
        page_lineups = page_soup.find_all("div", {"class": "stories-lineup bigHeader"})
        if not page_lineups:
            skipped_pages.append((page_number, "no stories-lineup container"))
            continue

        for item in page_lineups[0].find_all("li"):
            # Items lacking a field yield None for it; they are kept as rows.
            meta.append([
                headline_try_except(item),
                time_try_except(item),
                text_try_except(item),
            ])

if skipped_pages:
    print("Skipped pages:", skipped_pages)

In [12]:
import pandas as pd
# Naming the columns in the constructor labels the frame in one step and also
# works when `meta` is empty (assigning three names to a zero-column frame raises).
df = pd.DataFrame(meta, columns=["Headlines", "Time", "Description"])
df

Unnamed: 0,Headlines,Time,Description
0,Jim Cramer: A better way to invest in the Covi...,"7:51 PM ET Fri, 17 July 2020","""Mad Money"" host Jim Cramer recommended buying..."
1,Cramer's lightning round: I would own Teradyne,"7:33 PM ET Fri, 17 July 2020","""Mad Money"" host Jim Cramer rings the lightnin..."
2,,,
3,"Cramer's week ahead: Big week for earnings, ev...","7:25 PM ET Fri, 17 July 2020","""We'll pay more for the earnings of the non-Co..."
4,IQ Capital CEO Keith Bliss says tech and healt...,"4:24 PM ET Fri, 17 July 2020","Keith Bliss, IQ Capital CEO, joins ""Closing Be..."
...,...,...,...
3075,Markets lack Christmas cheer,"10:15 AM ET Tue, 26 Dec 2017","According to Kensho, here's how markets have f..."
3076,Cramer Remix: The biggest mistake you can make...,"11:12 AM ET Thu, 20 Sept 2018",Jim Cramer revealed his top rule when it comes...
3077,Cramer says owning too many stocks and too lit...,"7:07 PM ET Fri, 22 Dec 2017",Jim Cramer broke down why owning fewer stocks ...
3078,Cramer: I helped investors through the 2010 fl...,"7:07 PM ET Fri, 22 Dec 2017","Jim Cramer built on his ""nobody ever made a di..."


In [13]:
# Column dtypes and non-null counts; list items for which the extractors
# returned None show up as nulls here.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3080 entries, 0 to 3079
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Headlines    2800 non-null   object
 1   Time         2800 non-null   object
 2   Description  2800 non-null   object
dtypes: object(3)
memory usage: 72.3+ KB


## Export Data

In [14]:
# Write the scraped table to disk; index=False keeps the row index out of the file.
df.to_csv("cnbc_headlines.csv", index=False)