In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import datetime
import re
from pathlib import Path
import time

In [None]:
# Homepages of the two news sites that are scraped below.
npr_url = 'https://www.npr.org/'
vox_url = 'https://www.vox.com/'

In [None]:
# Download both homepages. The timeout stops the notebook from hanging
# forever when a site never answers (requests has no timeout by default).
vox_res = requests.get(vox_url, timeout=30)
npr_res = requests.get(npr_url, timeout=30)

# Show the HTTP status of each download.
print(vox_res.status_code)
print(npr_res.status_code)

In [None]:
# Parse the raw bytes of each homepage response with the lxml parser.
vox_soup = BeautifulSoup(vox_res.content, 'lxml')
npr_soup = BeautifulSoup(npr_res.content, 'lxml')

In [None]:
# Parallel accumulators filled by the scraping cells below; each one starts
# out as its own empty list.
(
    headline,       # article titles
    date_scraped,   # time of the scrape, formatted dd/mm/YYYY HH:MM:SS
    article_date,   # value of the article's <time datetime="..."> attribute
    topic_tag,      # topic slug of the story (None when there is none)
    article_text,   # concatenated paragraph text of the article
    website,        # name of the site the article came from
    is_top_story,   # True only for the first story scraped from a site
) = ([] for _ in range(7))

# NPR Scrape

Scraping Featured Stories
- url
- topic tag
- title

In [None]:
# NPR: collect the URL, headline and topic tag of every featured story.

# Section of the homepage holding the headline news and featured articles.
group_section = npr_soup.find('div', {'class':'stories-wrap stories-wrap-featured'})
if group_section is None:
    # Fail with a clear message instead of an AttributeError on None.
    raise RuntimeError('NPR featured-stories section not found; the homepage markup may have changed')

# Individual story cards inside the featured section.
featured_stories = group_section.find_all('div', {'class':'story-wrap'})

npr_urls = []
for featured_story in featured_stories:
    link = featured_story.find('a')                          # first link of the card
    title = featured_story.find('h3', {'class': 'title'})    # headline of the card
    href = link.get('href') if link is not None else None

    # Validate before appending anything: a card without a link or a title is
    # skipped as a whole, so npr_urls, headline and topic_tag stay aligned.
    if href is None or title is None:
        continue

    npr_urls.append(href)            # article URL
    headline.append(title.text)      # article headline

    slug = featured_story.find('h2', {'class':'slug'})   # element containing the topic
    # Not every story carries a topic slug; record None for those.
    topic_tag.append(slug.text.strip() if slug is not None else None)

print(len(npr_urls))

In [None]:
# Sanity check: print the length of every accumulator list, including
# is_top_story, so a list that falls out of step is noticed early.
# When the cells are run in order, only headline and topic_tag have been
# filled at this point.
lists = [headline, date_scraped, article_date, topic_tag, article_text, website, is_top_story]

for item in lists:
    print(len(item))

Scraping each featured story's article page (stories that contain a single article) for:

- date of article
- full story
- date of scrape
- website

In [None]:
# Visit every featured NPR article and record its text, publication date,
# scrape time, source site and whether it was the top story.
# Exactly one value is appended to each list per URL, even when a page lacks
# an expected element, so the lists stay aligned with the headlines that
# were collected from the homepage.

# story_number starts at 1 so that the first (top) story can be flagged
for story_number, url in enumerate(npr_urls, start=1):
    # the timeout keeps one unresponsive page from stalling the whole scrape
    page = requests.get(url, timeout=30)
    print(page.status_code)

    article_soup = BeautifulSoup(page.content, 'lxml')

    # section of the page, including pictures, that holds the p tags (main body of the article)
    text_section = article_soup.select_one('#storytext')

    # some articles keep their p tags under a different container than storytext
    if text_section is None:
        text_section = article_soup.find('div', {'class':'ArticlePage-articleBody'})

    if text_section is None:
        # neither known body container exists: record a missing value
        # instead of crashing on None
        full_story = None
    else:
        # take the image blocks out of the text section because they carry nested 'p' tags
        for image in text_section.find_all('div', {'id': re.compile(r'res\d+')}):
            image.extract()

        # concatenate the text of every remaining paragraph
        full_story = ''.join(p.text for p in text_section.select('p'))

    article_text.append(full_story)

    # time of article; None when the page has no <time> tag or no datetime attribute
    time_tag = article_soup.find('time')
    article_date.append(time_tag.get('datetime') if time_tag is not None else None)

    now = datetime.datetime.now() #time of scrape

    # dd/mm/YY H:M:S
    dt_string = now.strftime("%d/%m/%Y %H:%M:%S")
    #code from https://www.programiz.com/python-programming/datetime/current-datetime

    date_scraped.append(dt_string)

    website.append('NPR')
    print('done')

    # True only for the first story of the homepage
    is_top_story.append(story_number == 1)

    # pause between requests
    time.sleep(4)

In [None]:
# Sanity check after the NPR article scrape: print the length of every
# accumulator list, including is_top_story. When the cells are run in order,
# all of them should now be the same length.
lists = [headline, date_scraped, article_date, topic_tag, article_text, website, is_top_story]

for item in lists:
    print(len(item))

# Vox 

Scraping from the Vox homepage:
- Title
- URL

In [None]:
# Vox: collect the headline and URL of every featured article.

# Main block of the homepage holding the headline news and featured articles.
group_section = vox_soup.find('div', {'class':'c-newspaper__main'})

# Headline links of the featured articles inside that block.
titles = group_section.find_all('a', {'data-chorus-optimize-field':'hed'})

vox_urls = []
for title in titles:
    headline_text = title.text
    print(headline_text)
    headline.append(headline_text)    # headline goes into the shared list
    vox_urls.append(title['href'])    # link target is visited by the article loop

In [None]:
# Sanity check after the Vox homepage scrape: print the length of every
# accumulator list, including is_top_story. When the cells are run in order,
# headline is now ahead of the others by the number of Vox stories.
lists = [headline, date_scraped, article_date, topic_tag, article_text, website, is_top_story]

for item in lists:
    print(len(item))

Scraping each Vox article page for:
- article_text
- date_scraped
- website
- topic_tag (not scraped for Vox; recorded as `None`)
- article_date

In [None]:
# Visit every featured Vox article and record its text, publication date,
# scrape time, source site and whether it was the top story.
# Exactly one value is appended to each list per URL, even when a page lacks
# an expected element, so the lists stay aligned with the headlines that
# were collected from the homepage.

# story_number starts at 1 so that the first (top) story can be flagged
for story_number, url in enumerate(vox_urls, start=1):
    # the timeout keeps one unresponsive page from stalling the whole scrape
    page = requests.get(url, timeout=30)
    print(page.status_code)
    article_soup = BeautifulSoup(page.content, 'lxml')

    # isolate the container that holds only the article text
    text_section = article_soup.find('div', {'class':'c-entry-content'})

    if text_section is None:
        # page without the expected container: record a missing value
        # instead of crashing on None
        full_story = None
    else:
        # concatenate the text of every paragraph tag
        full_story = ''.join(p.text for p in text_section.find_all('p'))

    article_text.append(full_story) #add to list of articles

    # time of article; None when the page has no <time> tag or no datetime attribute
    time_tag = article_soup.find('time')
    article_date.append(time_tag.get('datetime') if time_tag is not None else None)

    now = datetime.datetime.now() #get current time of scrape

    # dd/mm/YY H:M:S
    dt_string = now.strftime("%d/%m/%Y %H:%M:%S")
    date_scraped.append(dt_string)
    #code from https://www.programiz.com/python-programming/datetime/current-datetime
    website.append('Vox')  #record website of article origin

    # no topic tag is scraped for Vox articles
    topic_tag.append(None)

    # True only for the first story of the homepage
    is_top_story.append(story_number == 1)

    # pause between requests, matching the NPR article loop
    time.sleep(4)


In [None]:
# Final sanity check before building the DataFrame: print the length of
# every accumulator list, including is_top_story. All of them must be equal,
# otherwise pandas cannot assemble the columns into one frame.
lists = [headline, date_scraped, article_date, topic_tag, article_text, website, is_top_story]

for item in lists:
    print(len(item))

# Create DataFrame 

In [None]:
# Assemble one row per scraped article from the parallel lists.
# is_top_story is included so that the flag collected in both scraping loops
# is written out with the rest of the data instead of being dropped.
vox_npr_df = pd.DataFrame({'headline':headline,
                              'date_scraped':date_scraped,
                              'article_date':article_date,
                              'topic_tag':topic_tag,
                              'article_text':article_text,
                              'website':website,
                              'is_top_story':is_top_story})

In [None]:
# Timestamped output name: 'scrapes' followed by day-month-year_hour-minute.
# Hour and minute are separated by '-' rather than ':' because a colon is not
# a valid character in Windows file names.
filename = 'scrapes' + datetime.datetime.now().strftime("%d-%m-%Y_%H-%M")

# Write the scrape to <filename>.csv without the DataFrame index.
vox_npr_df.to_csv(filename+'.csv', index = False)

# Saving/Appending to CSV

In [None]:
# NOTE(review): everything in this cell is commented out, so nothing here runs.
# It is a draft for appending each scrape to a single CSV instead of writing a
# new file per run. Before re-enabling it, check two things: the final to_csv
# line is not indented under the `else`, so it would run on every execution,
# and the file name it writes ('scraped_lib_{}.csv') differs from the one
# tested for existence ('scraped_lib_articles.csv') - confirm which is intended.

# file_name = Path("scraped_lib_articles.csv") #may need to change to os if we use s3 buckets

# if file_name.exists():  #if file already exists in directory
#     print('file exists, now appending')
    
#     #append new scrapes to the file without header row
#     vox_npr_df.to_csv('scraped_lib_articles.csv', mode='a', index=False, header=False) 

# else: #if file does not exist yet
#     print('file does not exist, now creating')
# vox_npr_df.to_csv('scraped_lib_{}.csv'.format(dt_string), index=False) #create new file 