This notebook is used to scrape news from https://www.space.com/news. 
A function is provided to:
* retrieve the space.com news page, 
* extract the news stories, and 
* print out the headline, author, synopsis, and date and time for each story.

Then, the spider follows the "Next" link at the bottom of the page to scrape more news stories in the following pages.

Note: Chrome Developer tools (Ctrl + Shift + I, or More tools --> Developer tools) is used to examine the structure of the page to find the HTML for the articles, then requests and BeautifulSoup are used for data extraction.

In [0]:
import requests
from bs4 import BeautifulSoup
from random import randint
from IPython.core.display import clear_output
from warnings import warn
import time

In [0]:
def process_newspage(soup):
  headlines_html = soup.select('.content > header > .article-name')
  headline_list = [headline.get_text().strip() for headline in headlines_html]

  authors_html = soup.select('.by-author > span')
  author_list = [author.get_text().strip() for author in authors_html]

  time_html = soup.select('time')
  datetime_list = [time['datetime'] for time in time_html]

  synopses_html = soup.select('.synopsis')
  synopsis_list = [synopsis.get_text().strip() for synopsis in synopses_html]

  news = [{'headline': headline, 'author': author, 'datetime': datetime, 'synopsis': synopsis} for headline, author, datetime, synopsis in zip(headline_list, author_list, datetime_list, synopsis_list)]

  return news

In [0]:
start = time.time()
  
headers = {'user-agent': 'newsscraper - Thinkful project (example@gmail.com)'}

request_count = 0
max_request = 8
page_number = 1
has_next_page = True

news_allpages_compiled = {}

while has_next_page and request_count <= max_request:
  clear_output(wait = True)

  url = 'https://www.space.com/news/{}'.format(page_number)
  response = requests.get(url, headers = headers)
  key = 'page {}'.format(page_number)

  if response.ok:
    data = response.text
    soup = BeautifulSoup(data, 'html.parser')
    news = process_newspage(soup)
    news_allpages_compiled[key] = news
  
  else:
    # display warning if there is any problem
    warn('Request #: {}, Failed with status code: {}'.format(request_count, response.status_code))
  
  # go to sleep for 1-5 seconds before making the next request
  time.sleep(randint(1,5))
  
  # output some logs for monitoring
  end = time.time()
  elapsed_time = end - start
  print('Requests: {}, Frequency: {} request/s, {} jobs processed.'.format(request_count, request_count/elapsed_time, len(news)))
  
  # prepare for next iteration
  page_number += 1
  request_count += 1

print('Requests: {}, Frequency: {} request/s, {} jobs processed.'.format(request_count, request_count/elapsed_time, len(news)))
print('Scraping complete')
news_allpages_compiled

Requests: 8, Frequency: 0.23098260583078667 request/s, 20 jobs processed.
Requests: 9, Frequency: 0.259855431559635 request/s, 20 jobs processed.
Scraping complete


{'page 1': [{'author': 'Hanneke Weitering',
   'datetime': '2020-02-01T13:23:00Z',
   'headline': 'On This Day in Space: Feb. 1, 2003: Space shuttle Columbia disaster',
   'synopsis': 'On Feb. 1, 2003, the space shuttle Columbia broke apart as it returned to Earth after spending more than two weeks in space. See how it happened in our On This Day in Space video series!'},
  {'author': 'Chris Vaughan',
   'datetime': '2020-02-01T13:10:24Z',
   'headline': 'Best night sky events of February 2020 (stargazing maps)',
   'synopsis': "See what's up in the night sky for February 2020, including stargazing events and the moon's phases, in this Space.com gallery courtesy of Starry Night Software."},
  {'author': 'Joe Rao',
   'datetime': '2020-02-01T13:10:16Z',
   'headline': "The brightest planets in February's night sky: How to see them (and when)",
   'synopsis': "Here's how to see planets visible in February's night sky."},
  {'author': 'Chelsea Gohd',
   'datetime': '2020-02-01T13:03:42Z',