# Scraping from 3 different news sites

**Datetime:** 03-26-2021 1600

**Sites:**

- Inquirer (crawl delay: 5 seconds)
- Manila Bulletin (crawl delay: unspecified; a random delay of 10-20 seconds is used)
- The Guardian (using its API)

## Imports

In [1]:
import requests
import json
import time
import random
from bs4 import BeautifulSoup

## Scraping from Inquirer

### Getting the latest news links

In [3]:
# Download the Inquirer "latest stories" listing page and parse it into a soup.
URL = "https://www.inquirer.net/latest-stories"
# NOTE(review): '*' is the robots.txt wildcard, not a client identifier —
# consider sending a descriptive User-Agent instead.
headers = {'User-agent' : '*'}
# A timeout keeps the cell from hanging forever on a stalled connection.
page = requests.get(URL, headers=headers, timeout=30) # Access time: 03-26-2021, 1644
# Stop here on an HTTP error rather than parsing an error page further down.
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')

In [5]:
# Wrapper element of the latest-stories list, then every story box inside it.
container = soup.find(attrs={'id': 'al-wrap'})
headlines = container.find_all(attrs={'id': 'al-box'})

# Sanity check: show the markup of the first story box.
first_box = headlines[0]
print(first_box)

<div id="al-box">
<div id="al-time">POP - 4:38 PM</div>
<h2><a href="https://pop.inquirer.net/107813/this-teacher-took-his-class-on-a-virtual-field-trip-to-the-zoo">This teacher took his class on a virtual field trip to the zoo</a></h2>
</div>


In [6]:
# Inspect one story box to see at which child index each field lives.
sample_box = headlines[0]

for idx, child in enumerate(sample_box.contents):
    print("Index ", idx, " Content:\n", child)

print("----------")
# Child 3 is the <h2> wrapping the headline link (see the printed structure).
title_wrapper = sample_box.contents[3]
for idx, child in enumerate(title_wrapper.contents):
    print("Index ", idx, " Content:\n", child)


print("----------")
title_link = title_wrapper.contents[0]
print("\nType - time: ", sample_box.contents[1].text.strip())
print("\nHeadline: ", title_link.text.strip())
print("\nURL: ", title_link['href'])

Index  0  Content:
 

Index  1  Content:
 <div id="al-time">POP - 4:38 PM</div>
Index  2  Content:
 

Index  3  Content:
 <h2><a href="https://pop.inquirer.net/107813/this-teacher-took-his-class-on-a-virtual-field-trip-to-the-zoo">This teacher took his class on a virtual field trip to the zoo</a></h2>
Index  4  Content:
 

----------
Index  0  Content:
 <a href="https://pop.inquirer.net/107813/this-teacher-took-his-class-on-a-virtual-field-trip-to-the-zoo">This teacher took his class on a virtual field trip to the zoo</a>
----------

Type - time:  POP - 4:38 PM

Headline:  This teacher took his class on a virtual field trip to the zoo

URL:  https://pop.inquirer.net/107813/this-teacher-took-his-class-on-a-virtual-field-trip-to-the-zoo


In [11]:
# Keep only the stories whose "type - time" label mentions NEWSINFO.

news_headlines = []

for item in headlines:
    label = item.contents[1].text.strip()
    if 'NEWSINFO' not in label:
        continue
    link = item.contents[3].contents[0]
    news_headlines.append({
        'type': label,
        'Headline': link.text.strip(),
        'URL': link['href']
    })

# Just the article URLs, in listing order.
news_urls = []
for entry in news_headlines:
    news_urls.append(entry['URL'])

# Sanity Check
print(news_urls)

['https://newsinfo.inquirer.net/1411623/sc-confirms-another-justice-to-retire-for-health-reasons', 'https://newsinfo.inquirer.net/1411718/phs-covid-19-cases-top-700000-as-doh-reports-about-10000-new-infections', 'https://newsinfo.inquirer.net/1411705/thailand-faces-meth-trafficking-surge-after-myanmar-coup', 'https://newsinfo.inquirer.net/1411707/covid-19-hits-over-15000-health-workers-doh', 'https://newsinfo.inquirer.net/1411686/bureau-of-immigration-warns-public-against-online-scammers', 'https://newsinfo.inquirer.net/1411690/ched-six-heis-partner-with-lgus-to-serve-as-vaccination-centers', 'https://newsinfo.inquirer.net/1411680/solon-seeks-instant-sanctions-for-govt-execs-who-skipped-vaccine-priority-list', 'https://newsinfo.inquirer.net/1411696/zambales-mayor-tests-positive-for-covid-19', 'https://newsinfo.inquirer.net/1411643/thailand-urges-calm-after-death-of-covid-19-vaccine-recipient', 'https://newsinfo.inquirer.net/1411677/covid-19-icu-occupancy-rate-nearing-moderate-risk-trea

### Scraping the news pages

In [63]:
# Prototype: fetch the first Inquirer article and work out how to extract its fields.
curr_url = news_urls[0]
# A timeout keeps the cell from hanging forever on a stalled connection.
curr_page = requests.get(curr_url, headers=headers, timeout=30)
curr_soup = BeautifulSoup(curr_page.content, 'html.parser')

# attempt to find format
headline = curr_soup.find('h1', {'class':'entry-title'}).text.strip()
author = curr_soup.find(id='art_author')['data-byline-strips']
date = curr_soup.find(id='art_plat').contents[2]

body = ''
body_soup = curr_soup.find(id='article_content').contents[1].find_all('p')

# Remove all children from the <p>s found in the article content
# (e.g. advertisements, external links).
# NOTE(review): this also drops the text of inline links/emphasis, which can
# leave gaps in sentences — confirm that is intended.
for paragraph in body_soup:
    for nested in paragraph.find_all():
        nested.extract()
    # to remove the caption
    if paragraph.has_attr('class'):
        classes = paragraph['class']
        if classes and classes[0] == 'wp-caption-text':
            paragraph.extract()

for paragraph in body_soup:
    if paragraph.text.strip() == 'RELATED STORIES': # cut off of article
        break
    if paragraph.has_attr('class'):
        classes = paragraph['class']
        # Boilerplate COVID-19 paragraphs appended to covid-related news mark the end of the article.
        if classes and classes[0] == 'corona_article_tracker':
            break
        # Any other paragraph carrying a class is skipped.
        continue
    body += '\n' + paragraph.text.strip()

print('URL: ', curr_url)
print("\nHeadline: ", headline)
print("\nAuthor: ", author)
# The first four characters of the date string are sliced off before display.
print("\nDate: ", date[4:])
print("\nArticle body: \n", body)

URL:  https://newsinfo.inquirer.net/1411623/sc-confirms-another-justice-to-retire-for-health-reasons

Headline:  SC confirms another justice to retire for health reasons

Author:  Tetch Torres-Tupas

Date:  4:34 PM March 26, 2021

Article body: 
 
MANILA, Philippines—Supreme Court Associate Justice Edgardo Delos Santos is considering retiring early due to health reasons, Public Information Chief and Spokesperson Atty. Brian Keith Hosaka said Friday.
“According to Justice Delos Santos, due to health reasons, he is considering the possibility of retiring ahead of his 70th birthday on 12 June 2022,” Hosaka told reporters.
Hosaka said the magistrate had sent a letter to his staff advising them as early as March 19, 2021, to look for other employment, “knowing the difficulty of finding a job during this pandemic.”
“Justice Delos Santos further added that he remains an incumbent member of the Supreme Court until after a specific date of retirement, as may be indicated in a formal letter from

In [66]:
# actual scraping

def _parse_inquirer_article(article_soup):
    """Extract the fields of interest from one parsed Inquirer article page.

    Parameters:
        article_soup: BeautifulSoup tree of the article page.

    Returns:
        dict with keys 'date', 'title', 'article_body' and 'author', in that
        order. 'author' is '' when the page has no element with id 'art_author'.

    Raises:
        AttributeError / IndexError when the page lacks the expected headline,
        date or article-content elements.
    """
    headline = article_soup.find('h1', {'class':'entry-title'}).text.strip()
    author_tag = article_soup.find(id='art_author')
    author = author_tag['data-byline-strips'] if author_tag is not None else ''
    date = article_soup.find(id='art_plat').contents[2]

    paragraphs = article_soup.find(id='article_content').contents[1].find_all('p')

    # Remove all children from the <p>s (e.g. advertisements, external links).
    # NOTE(review): this also drops the text of inline links/emphasis, which
    # can leave gaps in sentences — confirm that is intended.
    for paragraph in paragraphs:
        for nested in paragraph.find_all():
            nested.extract()
        # to remove the caption
        if paragraph.has_attr('class'):
            classes = paragraph['class']
            if classes and classes[0] == 'wp-caption-text':
                paragraph.extract()

    body = ''
    for paragraph in paragraphs:
        if paragraph.text.strip() == 'RELATED STORIES': # cut off of article
            break
        if paragraph.has_attr('class'):
            classes = paragraph['class']
            # Boilerplate COVID-19 paragraphs mark the end of the article proper.
            if classes and classes[0] == 'corona_article_tracker':
                break
            # Any other paragraph carrying a class is skipped.
            continue
        body += '\n' + paragraph.text.strip()

    return {
        # The first four characters of the date string are sliced off.
        'date': date[4:],
        'title': headline,
        'article_body': body,
        'author': author
    }


inquirer_news = []

# A dedicated loop variable for the URL: the paragraph loops no longer reuse it.
for curr_url in news_urls:
    print("[SCRAPING] URL: ", curr_url)
    # A timeout keeps one stalled request from blocking the whole run.
    curr_page = requests.get(curr_url, headers=headers, timeout=30)
    curr_soup = BeautifulSoup(curr_page.content, 'html.parser')

    record = {'source': curr_url}
    record.update(_parse_inquirer_article(curr_soup))
    inquirer_news.append(record)

    # Respect the site's crawl delay of 5 seconds between requests.
    time.sleep(5)
    print("DONE, NEXT:")

[SCRAPING] URL:  https://newsinfo.inquirer.net/1411623/sc-confirms-another-justice-to-retire-for-health-reasons
DONE, NEXT:
[SCRAPING] URL:  https://newsinfo.inquirer.net/1411718/phs-covid-19-cases-top-700000-as-doh-reports-about-10000-new-infections
DONE, NEXT:
[SCRAPING] URL:  https://newsinfo.inquirer.net/1411705/thailand-faces-meth-trafficking-surge-after-myanmar-coup
DONE, NEXT:
[SCRAPING] URL:  https://newsinfo.inquirer.net/1411707/covid-19-hits-over-15000-health-workers-doh
DONE, NEXT:
[SCRAPING] URL:  https://newsinfo.inquirer.net/1411686/bureau-of-immigration-warns-public-against-online-scammers
DONE, NEXT:
[SCRAPING] URL:  https://newsinfo.inquirer.net/1411690/ched-six-heis-partner-with-lgus-to-serve-as-vaccination-centers
DONE, NEXT:
[SCRAPING] URL:  https://newsinfo.inquirer.net/1411680/solon-seeks-instant-sanctions-for-govt-execs-who-skipped-vaccine-priority-list
DONE, NEXT:
[SCRAPING] URL:  https://newsinfo.inquirer.net/1411696/zambales-mayor-tests-positive-for-covid-19
D

In [74]:
# sanity check: print every field of one randomly chosen scraped article

article_no = random.randint(0,len(inquirer_news)-1)
picked = inquirer_news[article_no]

print('Index: ', article_no)
for label, key in (
    ('\nSource: ', 'source'),
    ("\nTitle: ", 'title'),
    ("\nAuthor: ", 'author'),
    ("\nDate: ", 'date'),
    ("\nArticle body: \n", 'article_body'),
):
    print(label, picked[key])

Index:  28

Source:  https://newsinfo.inquirer.net/1411597/healthcare-workers-in-ncr-bubble-cebu-davao-to-get-400000-sinovac-vaccines

Title:  Healthcare workers in NCR bubble, Cebu, Davao to get 400,000 Sinovac vaccines

Author:  Daphne Galvez

Date:  2:06 PM March 26, 2021

Article body: 
 

MANILA, Philippines — The government will be allocating most of the recently delivered  of Sinovac BioTech for healthcare workers in areas most affected by new coronavirus variants, Malacañang said Friday.
This includes healthcare workers in the National Capital Region “bubble” (NCR, Batangas, Rizal, Laguna, Cavite) Cebu and Davao, Presidential spokesman Harry Roque said.
“Nagkaroon na ng desisyon ang ating NITAG (National Immunization Technical Advisory Group) na ‘yung mga kakarating na pinakahuling donasyon ng China na 400,000 na Sinovac ay ibibigay ang karamihan nito doon sa pinaka-apektado ng new variants kasama na ang NCR plus, Cebu, at Davao,” Presidential spokesman Harry Roque announced in

In [75]:
# Persist the scraped Inquirer articles as indented JSON.
with open('inquirer_news.json', 'w') as out_file:
    out_file.write(json.dumps(inquirer_news, indent=4))

## Scraping from Manila Bulletin

### Getting the latest news links

In [69]:
# Download the Manila Bulletin news listing page and parse it into a soup.
URL = "https://mb.com.ph/news/"
# NOTE(review): '*' is the robots.txt wildcard, not a client identifier —
# consider sending a descriptive User-Agent instead.
headers = {'User-agent' : '*'}
# A timeout keeps the cell from hanging forever on a stalled connection.
page = requests.get(URL, headers=headers, timeout=30) # Access time: 03-26-2021, 2202
# Stop here on an HTTP error rather than parsing an error page further down.
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')

In [70]:
# The <ul> holding the main news articles, and each <li> article entry inside it.
container = soup.find('ul', class_='articles-list')
headlines = container.find_all('li', class_='article')

# Sanity check
first_entry = headlines[0]
print(first_entry)

<li class="article article-highlight">
<div class="article-inner row flex-row-reverse flex-md-row">
<figure class="article-figure col col-sm-auto">
<a class="article-img" href="https://mb.com.ph/2021/03/24/govt-urged-to-employ-soft-mecq-if-ncr-plus-bubble-fails/">
<img alt="Shopping and Entertainment at the SM Mall of Asia Complex" src="https://mb.com.ph/wp-content/uploads/2020/12/openblooomberg.jpg"/>
</a>
</figure>
<div class="article-info col">
<div class="cat">
<a href="https://mb.com.ph/category/news/national/">National</a>, <a href="https://mb.com.ph/category/news/">News</a> </div>
<h3 class="title"><a href="https://mb.com.ph/2021/03/24/govt-urged-to-employ-soft-mecq-if-ncr-plus-bubble-fails/">Gov’t urged to employ ‘soft MECQ’ if ‘NCR Plus’ bubble fails</a></h3>
<div class="desc">
<p>OCTA Research fellow, professor Ranjit Rye on Wednesday, March 24 proposed the implementation of “soft” modified enhanced community quarantine (MECQ) as a “last resort” if the current government inte

In [71]:
# parse format: the title link carries both the article URL and the headline text

title_link = headlines[0].find('h3',{'class': 'title'}).find('a')
print("URL", title_link['href'])
print('\nTitle: ', title_link.text.strip())

URL https://mb.com.ph/2021/03/24/govt-urged-to-employ-soft-mecq-if-ncr-plus-bubble-fails/

Title:  Gov’t urged to employ ‘soft MECQ’ if ‘NCR Plus’ bubble fails


In [77]:
# Print the URL of every listed article; the title sits in an <h3> or, failing that, an <h4>.
count = 0
for entry in headlines:
    for heading in ('h3', 'h4'):
        title_tag = entry.find(heading, {'class': 'title'})
        if title_tag is None:
            continue
        print("\nURL: ", title_tag.find('a')['href'])
        count += 1
        break

print("\nNumber of headlines: ", count)


URL:  https://mb.com.ph/2021/03/24/govt-urged-to-employ-soft-mecq-if-ncr-plus-bubble-fails/

URL:  https://mb.com.ph/2021/03/26/powerful-hospital-ads-urge-people-to-wear-masks-stay-at-home/

URL:  https://mb.com.ph/2021/03/26/duterte-recommends-raising-mav-for-pork-imports-to-350000-metric-tons/

URL:  https://mb.com.ph/2021/03/26/ph-red-cross-asks-public-to-be-exaggerated-avoid-3cs-to-reduce-spread-of-covid-19/

URL:  https://mb.com.ph/2021/03/26/clinical-trials-for-lagundi-as-covid-19-therapeutic-near-completion/

URL:  https://mb.com.ph/2021/03/26/drug-distributors-pledge-to-restock-supply-of-remdesivir-tocilizumab-doh/

URL:  https://mb.com.ph/2021/03/26/report-them-deped-says-instigators-of-distance-cheating-should-be-arrested/

URL:  https://mb.com.ph/2021/03/26/the-story-behind-the-image-teachers-zoom-meeting-at-the-beach/

URL:  https://mb.com.ph/2021/03/26/a-green-ally-pro-environment-energy-group-hopeful-to-end-coal-financing-in-ph-with-help-of-new-manila-archbishop/

URL:  

In [78]:
# get more from second page
URL_2 = "https://mb.com.ph/news/page/2"
# A timeout keeps the cell from hanging forever on a stalled connection.
page_2 = requests.get(URL_2, headers=headers, timeout=30) # Access time: 03-26-2021, 2210
# Stop here on an HTTP error rather than parsing an error page further down.
page_2.raise_for_status()
soup_2 = BeautifulSoup(page_2.content, 'html.parser')
container_2 = soup_2.find('ul',{'class': 'articles-list'})
headlines_2 = container_2.find_all('li',{'class': 'article'})

# Print the URL of every article on page 2; the title sits in an <h3> or, failing that, an <h4>.
count_2 = 0
for entry in headlines_2:
    title_tag = entry.find('h3',{'class': 'title'})
    if title_tag is None:
        title_tag = entry.find('h4',{'class': 'title'})
    if title_tag is not None:
        print("\nURL: ", title_tag.find('a')['href'])
        count_2 += 1

# Report this page's own counter (the first page's `count` was printed here before).
print("\nNumber of headlines: ", count_2)


URL:  https://mb.com.ph/2021/03/26/cyli-launches-digital-skills-training-mentorship-program-for-campus-journalists-in-caloocan/

URL:  https://mb.com.ph/2021/03/26/robredo-covid-19-not-an-excuse-to-postpone-2022-polls/

URL:  https://mb.com.ph/2021/03/26/robredo-we-need-to-talk-about-2022-elections/

URL:  https://mb.com.ph/2021/03/26/duterte-inks-law-lowering-corporate-income-tax-exempting-covid-19-vaccines-from-vat/

URL:  https://mb.com.ph/2021/03/26/solon-mulls-ways-to-solve-sss-pension-deficits/

URL:  https://mb.com.ph/2021/03/26/60-provinces-cities-adopt-dosts-s-pass-travel-app/

URL:  https://mb.com.ph/2021/03/26/its-official-summer-err-hot-dry-season-begins-in-ph/

URL:  https://mb.com.ph/2021/03/26/dont-let-your-guard-down-ph-red-cross-reminds-public-to-be-vigilant-during-holy-week/

URL:  https://mb.com.ph/2021/03/26/27-more-overseas-filipinos-infected-with-covid-19-while-16-others-recovered/

URL:  https://mb.com.ph/2021/03/26/salceda-covid-19-jabs-to-solve-worrisome-ph-cr

In [79]:
# Gather the article URLs from both listing pages, page 1 first.
news_urls = []

for listing in (headlines, headlines_2):
    for entry in listing:
        # The title sits in an <h3> or, failing that, an <h4>.
        title_tag = entry.find('h3',{'class': 'title'})
        if title_tag is None:
            title_tag = entry.find('h4',{'class': 'title'})
        if title_tag is None:
            continue
        news_urls.append(title_tag.find('a')['href'])

print("Number of headlines: ", len(news_urls))

Number of headlines:  27


### Scraping the news pages

In [80]:
# Prototype: fetch the first Manila Bulletin article and extract its fields.
curr_url = news_urls[0]
curr_page = requests.get(curr_url, headers=headers)
curr_soup = BeautifulSoup(curr_page.content, 'html.parser')

# parsing the page
headline = curr_soup.find('h2',{'class':'title'}).text.strip()
# Author and publication date both live inside the "meta" <div>.
meta = curr_soup.find('div',{'class':'meta'})
author = meta.find('p',{'class':'author'}).text.strip()
date = meta.find('p',{'class':'published'}).text.strip()

# Body text: every <p> of the article-content section, one per line.
body_parts = curr_soup.find('section',{'class':'article-content'}).find_all('p')
body = ''.join(part.text.strip() + '\n' for part in body_parts)

print("URL: ", curr_url)
print("\nHeadline: ", headline)
# The leading 3 characters of the author text and 10 of the date text are sliced off.
print("\nAuthor: ", author[3:])
print("\nDate: ", date[10:])
print("\nArticle Body: \n", body)

URL:  https://mb.com.ph/2021/03/24/govt-urged-to-employ-soft-mecq-if-ncr-plus-bubble-fails/

Headline:  Gov’t urged to employ ‘soft MECQ’ if ‘NCR Plus’ bubble fails

Author:  Ellalyn De Vera-Ruiz

Date:  March 24, 2021, 3:34 PM

Article Body: 
 OCTA Research fellow, professor Ranjit Rye on Wednesday, March 24 proposed the implementation of “soft” modified enhanced community quarantine (MECQ) as a “last resort” if the current government interventions fail to address the spread of the coronavirus disease (COVID-19).
“Naintindihan namin ang sitwasyon ng gobyerno, kailangang balansehin ang ekonomiya at saka ang kalusugan. (We understand the situation of the government, we need to balance the economy and public health),” Rye said during the Laging Handa public briefing.
“Pero lumalabas po ay pataas pa rin po iyong kaso natin. May momentum kasi siya. So hintayin natin ang ilang araw, let’s give the bubble a chance. Pero kung pataas pa rin siya after next week eh mukhang kailangan na tayong m

In [81]:
# actual scraping: fetch and parse every collected Manila Bulletin article

mb_news = []

for curr_url in news_urls:
    print("[SCRAPING] URL: ", curr_url)

    # A timeout keeps one stalled request from blocking the whole run.
    curr_page = requests.get(curr_url, headers=headers, timeout=30)
    curr_soup = BeautifulSoup(curr_page.content, 'html.parser')

    headline = curr_soup.find('h2',{'class':'title'}).text.strip()
    # Author and publication date both live inside the "meta" <div>.
    meta = curr_soup.find('div',{'class':'meta'})
    # An article without a byline gets an empty author (as in the Inquirer
    # loop) instead of aborting the whole scrape.
    author_tag = meta.find('p',{'class':'author'})
    author = author_tag.text.strip() if author_tag is not None else ''
    date = meta.find('p',{'class':'published'}).text.strip()

    # Body text: every <p> of the article-content section, one per line.
    body_parts = curr_soup.find('section',{'class':'article-content'}).find_all('p')
    body = ''.join(part.text.strip() + '\n' for part in body_parts)

    mb_news.append({
        'source': curr_url,
        # The leading 10 characters of the date text are sliced off.
        'date': date[10:],
        'title': headline,
        'article_body': body,
        # The leading 3 characters of the author text are sliced off.
        'author': author[3:]
    })

    # No crawl delay is specified for this site, so wait a random 10-20 seconds.
    rand_delay = random.randint(10,20)
    print("DONE. DELAY: ", rand_delay)
    time.sleep(rand_delay)

print("\nArticles scraped: ", len(mb_news))

[SCRAPING] URL:  https://mb.com.ph/2021/03/24/govt-urged-to-employ-soft-mecq-if-ncr-plus-bubble-fails/
DONE. DELAY:  13
[SCRAPING] URL:  https://mb.com.ph/2021/03/26/powerful-hospital-ads-urge-people-to-wear-masks-stay-at-home/
DONE. DELAY:  19
[SCRAPING] URL:  https://mb.com.ph/2021/03/26/duterte-recommends-raising-mav-for-pork-imports-to-350000-metric-tons/
DONE. DELAY:  10
[SCRAPING] URL:  https://mb.com.ph/2021/03/26/ph-red-cross-asks-public-to-be-exaggerated-avoid-3cs-to-reduce-spread-of-covid-19/
DONE. DELAY:  12
[SCRAPING] URL:  https://mb.com.ph/2021/03/26/clinical-trials-for-lagundi-as-covid-19-therapeutic-near-completion/
DONE. DELAY:  19
[SCRAPING] URL:  https://mb.com.ph/2021/03/26/drug-distributors-pledge-to-restock-supply-of-remdesivir-tocilizumab-doh/
DONE. DELAY:  15
[SCRAPING] URL:  https://mb.com.ph/2021/03/26/report-them-deped-says-instigators-of-distance-cheating-should-be-arrested/
DONE. DELAY:  11
[SCRAPING] URL:  https://mb.com.ph/2021/03/26/the-story-behind-the-

In [82]:
# sanity check: print every field of one randomly chosen scraped article

article_no = random.randint(0,len(mb_news)-1)
picked = mb_news[article_no]

print('Index: ', article_no)
for label, key in (
    ('\nSource: ', 'source'),
    ("\nTitle: ", 'title'),
    ("\nAuthor: ", 'author'),
    ("\nDate: ", 'date'),
    ("\nArticle body: \n", 'article_body'),
):
    print(label, picked[key])

Index:  12

Source:  https://mb.com.ph/2021/03/26/carpio-over-analyzing-chinas-vaccine-donations-lacson/

Title:  Carpio ‘over-analyzing’ China’s vaccine donations — Lacson

Author:  Vanne Elaine Terrazola

Date:  March 26, 2021, 8:23 PM

Article body: 
 Former Supreme Court Senior Associate Justice Antonio might just be “over-analyzing” the connection between the China’s vaccine donations and its encroachment in the West Philippines Sea, Senator Panfilo Lacson said on Friday, March 26.
Lacson, chairman of the Senate committee on national defense and security, believed that the Chinese government is giving the Philippines vaccine supplies in good faith, and not in exchange of seizing the country’s territory.
Carpio, who has been vocal against China’s activities in the West Philippines Sea, earlier said the Chinese government is “trying to soften the blow” of its incursions in the Philippines’ maritime zones by sending COVID-19 vaccines. He said the Philippines should not be accepting d

In [83]:
# Persist the scraped Manila Bulletin articles as indented JSON.
with open('mb_news.json', 'w') as out_file:
    out_file.write(json.dumps(mb_news, indent=4))

## Using API for The Guardian

In [95]:
# get the key from local file
with open('key.txt', 'r') as file:
    # Strip surrounding whitespace: a trailing newline in the file would
    # otherwise end up inside the request URL.
    API_KEY = file.read().strip()

# get 50 articles
# Passing the query as `params` lets requests URL-encode each value.
results = requests.get(
    "https://content.guardianapis.com/search",
    params={'api-key': API_KEY, 'section': 'news', 'page-size': 50},
    timeout=30,  # do not hang forever on a stalled connection
)
# Surface an HTTP error (e.g. a rejected key) here, not as a KeyError below.
results.raise_for_status()
articles = json.loads(results.text)
print(len(articles['response']['results']))

print(articles['response']['results'][0])

50
{'id': 'news/2021/mar/25/corrections-and-clarifications', 'type': 'article', 'sectionId': 'news', 'sectionName': 'News', 'webPublicationDate': '2021-03-25T21:00:24Z', 'webTitle': 'Corrections and clarifications', 'webUrl': 'https://www.theguardian.com/news/2021/mar/25/corrections-and-clarifications', 'apiUrl': 'https://content.guardianapis.com/news/2021/mar/25/corrections-and-clarifications', 'isHosted': False, 'pillarId': 'pillar/news', 'pillarName': 'News'}


In [97]:
# get the author and the text as well
# Passing the query as `params` lets requests URL-encode each value; the key is
# stripped so that stray whitespace from the key file cannot corrupt the URL.
results = requests.get(
    "https://content.guardianapis.com/search",
    params={
        'api-key': API_KEY.strip(),
        'section': 'news',
        'page-size': 50,
        'show-tags': 'contributor',
        'show-blocks': 'body',
    },
    timeout=30,  # do not hang forever on a stalled connection
)
# Surface an HTTP error here, not as a KeyError below.
results.raise_for_status()
# Access time: 03-27-2021, 0047
articles = json.loads(results.text)
print(articles['response']['results'][0])

{'id': 'news/2021/mar/25/corrections-and-clarifications', 'type': 'article', 'sectionId': 'news', 'sectionName': 'News', 'webPublicationDate': '2021-03-25T21:00:24Z', 'webTitle': 'Corrections and clarifications', 'webUrl': 'https://www.theguardian.com/news/2021/mar/25/corrections-and-clarifications', 'apiUrl': 'https://content.guardianapis.com/news/2021/mar/25/corrections-and-clarifications', 'tags': [{'id': 'profile/editor-of-the-corrections-and-clarifications-column', 'type': 'contributor', 'webTitle': 'Corrections and clarifications column editor', 'webUrl': 'https://www.theguardian.com/profile/editor-of-the-corrections-and-clarifications-column', 'apiUrl': 'https://content.guardianapis.com/profile/editor-of-the-corrections-and-clarifications-column', 'references': [], 'firstName': 'editor', 'lastName': 'ofthecorrectionsandclarificationscolumn'}], 'blocks': {'body': [{'id': '5e74af488f085c6327bc3b4f', 'bodyHtml': '<p>• Due to incorrect figures supplied to us, an article and an edito

In [98]:
# Show the plain-text summary of the first body block of the first result.
first_result = articles['response']['results'][0]
first_block = first_result['blocks']['body'][0]
print(first_block['bodyTextSummary'])

• Due to incorrect figures supplied to us, an article and an editorial misreported the findings of a survey carried out for the all-party parliamentary group for UN Women. The poll of 1,089 women in the UK found that 86% of those aged 18-24, and 71% of all women, said they had been sexually harassed in public spaces; not 97% and 80% as we stated (Survey reveals sex harassment ordeal faced by young women, 10 March, page 2; Violent crime, 12 March, page 2, Journal). • Other recently amended articles include: China threat to invade Taiwan is ‘closer than most think’, says US admiral Covid vaccine used on apes at San Diego zoo trialled on mink ‘No more shame’: the French women breaking the law to highlight femicide Elderly Asian woman who fought attacker donates nearly $1m from GoFundMe


In [103]:
# filter out other article types (e.g. liveblogs)
# `articles` is the decoded API response on the first run and the already
# filtered list on a re-run; accepting both keeps this cell re-runnable.
# (The original read from `test`, a name this notebook never defines.)
raw_results = articles['response']['results'] if isinstance(articles, dict) else articles
articles = [entry for entry in raw_results if entry['type'] == 'article']

# parsing the json, using the fourth article as the sample
sample = articles[3]
url = sample['webUrl']
headline = sample['webTitle']
# Only tags of type 'contributor' name an author.
author = [tag['webTitle'] for tag in sample['tags'] if tag['type'] == 'contributor']

date = sample['webPublicationDate']
body = sample['blocks']['body'][0]['bodyTextSummary']

print("URL: ", url)
print("\nHeadline: ", headline)
print("\nAuthor: ", author)
print("\nDate: ", date)
print("\nArticle Body: \n", body)

URL:  https://www.theguardian.com/news/2021/mar/24/the-clown-king-making-sense-of-boris-johnson-inside-the-26-march-guardian-weekly

Headline:  The clown king – making sense of Boris Johnson: inside the 26 March Guardian Weekly

Author:  ['Will Dean']

Date:  2021-03-24T09:00:35Z

Article Body: 


In [104]:
# clean the body & headline
# str.replace returns a new string, so each result is assigned back
# (the original discarded it and cleaned nothing).
# Pairs are applied in order: mis-decoded sequence -> plain ASCII replacement.
for garbled, plain in (
    ("â€™", "'"),
    ("â€œ", "\""),
    ("â€�", "\""),
    ("â€¢", "*"),
    ("Ã©", "e"),
    ("Ã¼", "u"),
    ("â€“", "-"),
):
    body = body.replace(garbled, plain)
    headline = headline.replace(garbled, plain)

print(headline)
print(body)

The clown king – making sense of Boris Johnson: inside the 26 March Guardian Weekly


In [105]:
def _clean_text(text):
    """Return `text` with known mis-decoded sequences replaced by plain ASCII.

    The replacements are applied in the listed order. str.replace returns a
    new string, so the result is rebound at every step.
    """
    for garbled, plain in (
        ("â€™", "'"),
        ("â€œ", "\""),
        ("â€�", "\""),
        ("â€¢", "*"),
        ("Ã©", "e"),
        ("Ã¼", "u"),
        ("â€“", "-"),
    ):
        text = text.replace(garbled, plain)
    return text


# Build one record per Guardian article, with the same keys as the scraped sites.
guardian_news = []

for x in articles:
    curr_url = x['webUrl']
    # Only tags of type 'contributor' name an author.
    author = [tag['webTitle'] for tag in x['tags'] if tag['type'] == 'contributor']

    date = x['webPublicationDate']
    # An article without any body block gets an empty body instead of raising IndexError.
    body_blocks = x['blocks']['body']
    body = _clean_text(body_blocks[0]['bodyTextSummary']) if body_blocks else ''
    headline = _clean_text(x['webTitle'])

    guardian_news.append({
        'source': curr_url,
        'date': date,
        'title': headline,
        'article_body': body,
        'author': author
    })

print(len(guardian_news))

49


In [106]:
# Persist the Guardian articles as indented JSON.
with open('guardian_news.json', 'w') as out_file:
    out_file.write(json.dumps(guardian_news, indent=4))