In [1]:
!pip install requests beautifulsoup4



In [2]:
import requests
from bs4 import BeautifulSoup
import re
import json
import copy
import numpy as np
import time
from datetime import datetime, timedelta

In [3]:
# Template for one scraped article record. The values are placeholders only:
# the crawl loop deep-copies this dict and overwrites every field.
article_json_struct = dict(
    title="Article 1 Title",
    url="https://www.npr.org/sections/economy/article1",
    author="Author Name",
    published_date="2023-06-18",
    content="Full content of the article...",
    section="Economy",
    disclaimer="Article disclaimer",
)
    

In [4]:
# Sentence punctuation that is glued to the following text, excluding the
# cases where inserting a space would corrupt the text.
_MISSING_SPACE_RE = re.compile(
    r'''
    (?<!\b[A-Z])                 # not a one-letter initial: "U.S.", "J.K."
    ([.!?])                      # sentence-ending punctuation
    (?=\S)                       # immediately followed by non-whitespace
    (?![.!?,)\]\u201d\u2019])    # ...that is not more punctuation or a closer
    (?!(?<=\d\.)\d)              # ...and not the fraction of a decimal: 3.5
    ''',
    re.VERBOSE,
)


def format_string(text):
    """Insert the space that is missing after sentence-ending punctuation.

    Paragraph texts that were concatenated without a separator look like
    ``"Hey, Scott.SCOTT HORSLEY: ..."``; this restores ``"Scott. SCOTT"``.

    A space is added after ``.``, ``!`` or ``?`` when the next character is
    not whitespace, except when that would damage the text:

    * decimal numbers (``3.5``),
    * runs of punctuation (``...``, ``?!``),
    * one-letter initials (``U.S.``, ``J.K.``),
    * punctuation followed by ``,``, ``)``, ``]`` or a curly closing quote.

    Args:
        text: The string to repair.

    Returns:
        A new string with the missing spaces inserted; ``text`` itself is
        returned unchanged in content when nothing matches.
    """
    return _MISSING_SPACE_RE.sub(r'\1 ', text)

The article text needs to be taken from the transcript page, for example:
https://www.npr.org/transcripts/813831863

TODO: the link filter should stop special-casing "sections" and "series" URLs and instead keep only links whose path starts with the date.

In [5]:
# --- Crawl configuration ---------------------------------------------------

# Calendar year to crawl; kept as a string because it is spliced into the
# date strings and the output file name below.
year = str(2019)

# First day of the crawl window (the loop advances curr_date_obj day by day)
curr_date_str = f'{year}-01-01'
curr_date_obj = datetime.strptime(curr_date_str, '%Y-%m-%d')

# Last day of the crawl window
final_date_str = f'{year}-12-31'
final_date_obj = datetime.strptime(final_date_str, '%Y-%m-%d')

# NPR section to crawl ('politics' is the other value this notebook has used)
topic = 'economy'
save_file_name = f'npr-{topic}-article-{year}.json'

# Number of day steps between the first and the last date
# (set to a small number such as 10 for a quick trial run)
num_days = (final_date_obj - curr_date_obj).days

# Collected articles, keyed by article id
json_data = {}

# Elements whose HTML contains one of these markers are skipped
html_ignore_list = ['</b>']

# URL path segments of non-article links
http_ignore_list = ['sections', 'series']

# Seconds to wait for the server before a request is abandoned
REQUEST_TIMEOUT_S = 30


def _fetch_page(url, quiet=False):
    """Download ``url`` after a polite random delay.

    Args:
        url: Address to request.
        quiet: When true, a failed request is not reported on stdout.

    Returns:
        The ``requests.Response`` on success, or ``None`` when the request
        raises or the server answers with an HTTP error status.
    """
    # Random 5-8 second pause before every request
    time.sleep(5 + 3*np.random.rand())
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT_S)
        response.raise_for_status()
    except requests.RequestException as err:
        if not quiet:
            print('Failed to fetch ' + url + ': ' + str(err))
        return None
    return response


def _load_story(url, quiet=False):
    """Fetch ``url`` and return ``(parsed_page, storytext_divs)``.

    Returns ``(None, [])`` when the page cannot be downloaded.
    """
    response = _fetch_page(url, quiet=quiet)
    if response is None:
        return None, []
    page = BeautifulSoup(response.text, 'html.parser')
    # Matches both class="storytext" and class="transcript storytext"
    return page, page.select('div.storytext')


def _extract_metadata(page):
    """Return ``(title, author, category)`` of a parsed story page.

    Each value falls back to a 'No ... found' placeholder when the page does
    not provide it, instead of raising.
    """
    title = page.title.string if page.title is not None else 'No title found'

    author_tag = page.find('meta', attrs={'name': 'author'})
    author = author_tag.get('content', 'No author found') if author_tag else 'No author found'

    # The category lives in a JSON blob stored in a 'data-metrics' attribute
    category = 'No category found'
    header = page.find(attrs={'data-metrics': True})
    if header is not None:
        try:
            category = json.loads(header['data-metrics']).get('category', category)
        except (ValueError, AttributeError):
            # Malformed or non-object JSON: keep the placeholder
            pass

    return title, author, category


def _extract_body(content_divs, ignore_markers):
    """Collect the text of a story from its story-text containers.

    Args:
        content_divs: The ``div.storytext`` elements of the page.
        ignore_markers: Elements whose HTML contains any of these strings are
            skipped.

    Returns:
        ``(content, disclaimer, image_caption, image_url)`` where ``content``
        and ``disclaimer`` are strings and the other two are lists.
    """
    content = ''
    disclaimer = ''
    image_caption = []
    image_url = []

    for content_div in content_divs:
        for element in content_div.find_all(['p', 'img']):
            if any(marker in str(element) for marker in ignore_markers):
                continue

            # A <p> that wraps other <p> elements (as in the transcript markup
            # shown in this notebook's output) would repeat the text of all
            # its children, so only the inner paragraphs are collected.
            if element.name == 'p' and element.find('p') is not None:
                continue

            classes = element.get('class', [])
            if 'caption' in classes:
                image_caption.append(element.text)
            elif 'disclaimer' in classes:
                disclaimer += ' ' + element.text
            elif 'img' in classes:
                # .get() because not every image carries an alt attribute
                if not element.get('alt'):
                    image_url.append(element.get('src', []))
            elif not content:
                content += element.text
            else:
                content += ' ' + element.text

    return content, disclaimer, image_caption, image_url


# Loop through the year, one archive page per day
for _ in range(num_days + 1):

    # Pull the date string
    curr_date_str = curr_date_obj.strftime('%Y-%m-%d')

    # Archive listing of the topic for this day
    archive_link = 'https://www.npr.org/sections/'+topic+'/archive?date=' + curr_date_str
    response = _fetch_page(archive_link)

    # A day whose archive page cannot be downloaded contributes no articles
    articles = []
    if response is not None:
        soup = BeautifulSoup(response.content, 'html.parser')
        articles = soup.find_all('article', attrs={'class': 'item'})

    for article in articles:
        link = article.find('a')
        article_url = link.get('href') if link else None
        if not article_url:
            continue
        link_split = article_url.split('/')

        # Only date-based article URLs are kept:
        # https://www.npr.org/YYYY/MM/DD/<story id>/<slug>
        if len(link_split) < 7 or link_split[3] != curr_date_str[:4]:
            continue

        # Publish date and article id taken from the URL
        article_date = '-'.join(link_split[3:6])
        article_id = article_date + '-' + link_split[6]

        try:
            article_date_obj = datetime.strptime(article_date, '%Y-%m-%d')
        except ValueError:
            # Path segments after the year are not a month/day pair
            continue

        # Keep only articles published on the current day and not yet stored
        if article_date_obj != curr_date_obj or article_id in json_data:
            continue

        # Prefer the transcript page; a missing transcript is expected, so a
        # failed download is not reported.
        transcript_url = 'https://www.npr.org/transcripts/' + link_split[6]
        page, content_divs = _load_story(transcript_url, quiet=True)

        # No transcript text: parse the original article page instead
        if not content_divs:
            page, content_divs = _load_story(article_url)
        if page is None:
            print('Skipped: ' + article_url)
            continue

        # Keep the module-level name bound to the last parsed story page;
        # a later cell inspects `soup`.
        soup = page

        title, author, category = _extract_metadata(soup)
        content, disclaimer, image_caption, image_url = _extract_body(
            content_divs, html_ignore_list)

        # Format the content string
        content = content.replace("\n", "")
        content = format_string(content)

        # Create a json entry for the article
        json_data[article_id] = copy.deepcopy(article_json_struct)

        json_data[article_id]['url'] = article_url
        json_data[article_id]['published_date'] = article_date
        json_data[article_id]['title'] = title
        json_data[article_id]['author'] = author
        json_data[article_id]['content'] = content
        json_data[article_id]['disclaimer'] = disclaimer
        json_data[article_id]['section'] = category

        print('Processed: ' + article_url)
    # End of articles loop

    # Increment the date
    curr_date_obj += timedelta(days=1)
# End of date increment

# Write every collected article to disk as one JSON object keyed by article id.
# The context manager closes the file even if serialisation raises.
with open(save_file_name, 'w') as save_file:
    json.dump(json_data, save_file)
            

Processed: https://www.npr.org/2019/01/01/677390110/from-campbells-to-kellogg-s-classic-brands-are-feeling-the-crunch
Processed: https://www.npr.org/2019/01/01/681208503/homelessness-strains-compassion-for-some-los-angeles-residents
Processed: https://www.npr.org/2019/01/01/681361905/federal-workers-burdened-by-shutdown-face-trump-ordered-pay-freeze
Processed: https://www.npr.org/2019/01/02/681752277/economists-dont-expect-to-see-lasting-fallout-from-shutdown-so-long-as-it-ends-s
Processed: https://www.npr.org/2019/01/02/681752242/even-with-minimum-wage-increases-theres-a-huge-gap-in-the-cost-of-living
Processed: https://www.npr.org/2019/01/02/681264922/how-is-the-government-shutdown-affecting-you
Processed: https://www.npr.org/2019/01/04/681807327/this-new-program-aims-to-train-the-growing-freelance-workforce
Processed: https://www.npr.org/2019/01/04/681940880/strong-jobs-report-expected-even-as-worries-about-economy-grow
Processed: https://www.npr.org/2019/01/07/683021544/president-t

In [6]:
# Scratch inspection: show the story-text container(s) of the page currently
# bound to `soup`. `soup` is not defined in this cell; it is whatever the crawl
# cell above parsed last, so this cell only works after that cell has run.
content_divs = soup.select('div.storytext')
content_divs

[<div aria-label="Transcript" class="transcript storytext">
 <b class="icn-story-transcript-wrap">
 <b class="icn-story-transcript"></b>
 </b>
 <p><p>NOEL KING, HOST: </p><p>Two stories from 2019 that were illustrative about the United States economy - a trade war with China and a strike at General Motors where tens of thousands of workers walked a picket line. Other business stories came and went quickly, but some of them are worth remembering. NPR's chief economics correspondent Scott Horsley rounded a few up. Hey, Scott.</p><p>SCOTT HORSLEY, BYLINE: Good morning, Noel.</p><p>KING: So where do we start here?</p><p>HORSLEY: We start on a hot summer day in Mississippi. That's where immigration officials raided more than half a dozen chicken processing plants last summer in what was described as one of the nation's largest worksite enforcement actions. Almost 700 people who were employed by companies there were arrested on suspicion of working in the country illegally. Mike Hurst is the