# CNN Web Scraping consolidated script

In [2]:
# jupyter notebook --NotebookApp.iopub_data_rate_limit=1.0e10
# ^ make sure this can actually parse relevant data
import urllib.request, sys, time
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import re
import openpyxl
import urllib.request as client

In [3]:
def parse(URL, timeout=30):
    '''
    Takes in URL of a CNN webpage (str) and returns title of article (str) and body text of article (str).

    Parameters:
        URL (str): address of the article page to download.
        timeout (float): seconds to wait on the server before requests raises a
            timeout error. Without a timeout a single stalled connection would
            block a long scraping run indefinitely.

    Returns:
        (title, body) tuple of str when the server answers with status 200,
        otherwise None.

    Raises:
        IndexError: if the page has no <title> tag, or none of its "@context"
            scripts contains an "articleBody" field.
    '''
    page = requests.get(URL, timeout=timeout)
    if page.status_code != 200:  # anything but 200 is treated as "no article available"
        return None

    soup = BeautifulSoup(page.text, "html.parser")

    # finding title of article: read the tag's text instead of slicing its markup,
    # so attributes on <title> or a missing ' |' suffix cannot corrupt the result
    title_tag = soup.find('title')
    if title_tag is None:
        raise IndexError(f"no <title> tag found in {URL}")
    title = title_tag.get_text().split(' |')[0]

    # find text of article: a page may carry several "@context" scripts, so use the
    # first one that actually holds an articleBody rather than assuming it is script 0
    body_marker = '"articleBody":"'
    for script in soup.find_all("script", string=re.compile("@context")):
        raw = str(script)
        if body_marker in raw:
            body = raw.split(body_marker)[1].split('","articleSection"')[0]
            return title, body.replace('\xa0', ' ')

    raise IndexError(f"no articleBody found in {URL}")

In [4]:
def create_base_urls(base_url, add_ons):
    """
    Appends each entry of add_ons to base_url (str) and returns the resulting URLs as a list,
    in the same order as add_ons.
    """
    section_urls = []
    for suffix in add_ons:
        section_urls.append(base_url + suffix)
    return section_urls

In [13]:
def generate_dates(url, start_year, end_year):
    """
    Builds the list of monthly sitemap URLs under a base url (str), covering every month
    from January of start_year (int) through December of end_year (int), inclusive.
    Months are written without zero padding, e.g. '<url>/article/sitemap-2008-1.html'.
    """
    # help from https://medium.com/swlh/web-scraping-with-less-than-20-lines-of-code-b363c9e0153a
    # common prefix shared by every sitemap page
    prefix = url + '/article/sitemap-'

    sitemap_urls = []
    for year in range(start_year, end_year + 1):
        # twelve pages per year, in calendar order
        sitemap_urls.extend(f'{prefix}{year}-{month}.html' for month in range(1, 13))
    return sitemap_urls

In [17]:
def generate_urls(dated_urls, subject):
    """
    Takes in list of dated urls to do with a specific subject matter (e.g. politics, business) and returns
    list of relevant URLs of articles to parse.

    Every URL in dated_urls is downloaded and all of its <a href="..."> targets are collected.
    A link is kept when it starts with 'https://www.cnn.com/20' (i.e. carries a date) and
    contains '/<subject>/'. Links are returned in the order they were encountered; duplicates
    are not removed.
    """
    # build the section marker first so a bad subject fails before any download starts
    section = '/' + subject + '/'

    hrefs = []
    for sitemap_url in dated_urls:
        # BeautifulSoup reads the whole response in its constructor, so the
        # connection can be closed as soon as the soup has been built
        with client.urlopen(sitemap_url) as response:
            soup = BeautifulSoup(response, 'html.parser')
        hrefs.extend(anchor.get('href') for anchor in soup.find_all('a'))

    # drop anchors without an href, then keep only dated, on-subject article links
    return [
        href for href in hrefs
        if href and href.startswith('https://www.cnn.com/20') and section in href
    ]

In [8]:
def create_file(url_list, subject):
    """
    Takes in list of URLs and subject name (str) and creates a line-separated text file to store URL
    information.

    The file is named 'cnn_<subject>_urls.txt' and is written to the current working
    directory, one URL per line. Returns None.
    """
    # generate file name
    # help from https://www.geeksforgeeks.org/read-a-file-line-by-line-in-python/
    name = 'cnn_' + subject + '_urls.txt'

    # 'w' overwrites any file with the same name; the with-block guarantees the
    # handle is closed even if a write fails part-way through
    with open(name, 'w') as file:
        file.writelines(item + "\n" for item in url_list)
    return None

In [9]:
def create_df(url_file):
    '''
    Parse through generated text file of URLs and return pandas dataframe of titles and body text.

    url_file is the path of a text file holding one URL per line; blank lines are ignored.
    Each URL is passed to parse(). URLs for which parse() returns None (the page did not
    answer with status 200) are reported and skipped, so the dataframe may have fewer rows
    than the file has URLs. The result has two columns, 'Title' and 'Body_Text'.
    '''
    # this will take a LONG time to run, so do so in command line with .py file in virtual environment

    # read all URLs up front; the with-block closes the input file once they are loaded
    with open(url_file, 'r') as file:
        urls = [line.strip() for line in file]
    urls = [url for url in urls if url]

    # initialise lists for data
    titles = []
    body_text = []

    # identify titles and body texts of the articles
    for i, URL in enumerate(urls):
        print(i)
        result = parse(URL)
        if result is None:
            print(f'skipped (no HTTP 200 response): {URL}')
            continue
        title, text = result
        print(title)
        titles.append(title)
        body_text.append(text)

    # plain lists are handed to pandas directly: converting article bodies to a NumPy
    # string array first would pad every entry to the length of the longest article
    return pd.DataFrame({
        'Title': titles,
        'Body_Text': body_text
    })

In [11]:
# Root of the CNN site and the section paths that get appended to it.
base_url = 'https://www.cnn.com/'
add_ons = ['us', 'world', 'politics', 'business', 'opinions']

# One URL per section; the bare name on the last line echoes the list in the notebook.
base_urls = create_base_urls(base_url, add_ons)
base_urls

['https://www.cnn.com/us',
 'https://www.cnn.com/world',
 'https://www.cnn.com/politics',
 'https://www.cnn.com/business',
 'https://www.cnn.com/opinions']

In [15]:
# Monthly sitemap URLs for the 'us' section (base_urls[0]), January 2008 through December 2023.
dated_us = generate_dates(base_urls[0], 2008, 2023)
dated_us[0:5]

['https://www.cnn.com/us/article/sitemap-2008-1.html',
 'https://www.cnn.com/us/article/sitemap-2008-2.html',
 'https://www.cnn.com/us/article/sitemap-2008-3.html',
 'https://www.cnn.com/us/article/sitemap-2008-4.html',
 'https://www.cnn.com/us/article/sitemap-2008-5.html']

In [18]:
# Download every 'us' sitemap page and keep the dated article links that contain '/us/'.
us_urls = generate_urls(dated_us, 'us')
us_urls[0:5]

['https://www.cnn.com/2011/05/02/us/new-levee-breach/index.html',
 'https://www.cnn.com/2011/05/31/us/missouri-joplin-death-toll/index.html',
 'https://www.cnn.com/2011/07/20/us/heat/index.html',
 'https://www.cnn.com/2011/07/11/us/us-new-mexico-fire-flooding/index.html',
 'https://www.cnn.com/2011/07/11/us/us-space-shuttle/index.html']

In [19]:
# Number of 'us' article URLs collected.
len(us_urls)

56717

In [20]:
# Save the collected URLs, one per line, to cnn_us_urls.txt (an existing file is overwritten).
create_file(us_urls, 'us')