# Web Scraping: The Guardian

In [3]:
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd

### Obtain list of news from the coverpage

URL definition:

In [4]:
# Address of the cover page that the list of news is read from
url = 'https://www.theguardian.com/uk'

List of news:

In [17]:
# Request the cover page. The timeout keeps the cell from hanging forever
# if the server accepts the connection but never answers.
r1 = requests.get(url, timeout=10)

# Fail loudly on an HTTP error status (4xx/5xx) instead of silently parsing
# an error page and ending up with an empty list of news.
r1.raise_for_status()

# We'll save in coverpage the cover page content
coverpage = r1.content

# Soup creation
soup1 = BeautifulSoup(coverpage, 'html5lib')

# News identification: collect the <h3> elements carrying the
# 'fc-item__title' class, then show how many were found.
coverpage_news = soup1.find_all('h3', class_='fc-item__title')
len(coverpage_news)

96

Now we have a list in which every element is a news article:

In [45]:
# Display one element of the list to inspect the markup of a single headline
coverpage_news[4]

<h3 class="fc-item__title"><a class="fc-item__link" data-link-name="article" href="https://www.theguardian.com/media/2019/jan/01/outrage-after-netflix-pulls-comedy-show-criticising-saudi-arabia"><span class="fc-item__kicker">Netflix</span> <span class="u-faux-block-link__cta fc-item__headline"> <span class="js-headline-text">Outrage after comedy show pulled for criticising Saudi Arabia</span></span> </a></h3>

### Let's extract the text from the articles:

First, we'll define the number of articles we want:

In [58]:
# Number of entries of coverpage_news (taken from the start of the list)
# whose article pages are downloaded by the loop below
number_of_articles = 5

In [59]:
# Empty lists for content, links and titles.
# They are kept index-aligned: entry i of each list describes the same article.
news_contents = []
list_links = []
list_titles = []

# Slicing (instead of indexing 0..number_of_articles-1) also copes with a
# cover page that lists fewer headlines than number_of_articles.
for headline in coverpage_news[:number_of_articles]:

    # Getting the link and the title from the headline's anchor tag
    anchor = headline.find('a')
    link = anchor['href']
    title = anchor.get_text()

    # Reading the content (it is divided in paragraphs).
    # The timeout prevents one stalled download from blocking the whole loop.
    article = requests.get(link, timeout=10)
    article_content = article.content
    soup_article = BeautifulSoup(article_content, 'html5lib')
    body = soup_article.find_all('div', class_='content__article-body from-content-api js-article__body')

    # If the page has no element with exactly this class string, `body` is
    # empty; treat that as "no paragraphs" rather than raising IndexError.
    paragraphs = body[0].find_all('p') if body else []

    # Unifying the paragraphs.
    # The text is rebuilt from scratch for every article, so an article
    # without paragraphs yields an empty string instead of reusing the text
    # of the previously processed article.
    final_article = " ".join(paragraph.get_text() for paragraph in paragraphs)

    # Append to the three lists together, only once the article has been
    # fully processed, so a failure mid-iteration cannot leave them misaligned.
    list_links.append(link)
    list_titles.append(title)
    news_contents.append(final_article)

Let's put them into:
* a dataset which will be the input of the models (`df_features`)
* a dataset with the title and the link (`df_show_info`)

In [52]:
# Input of the models: a single-column frame with the text of each article
df_features = pd.DataFrame(data={'Article Content': news_contents})

# Display information: the title and the link of each article
show_info_columns = {
    'Article Title': list_titles,
    'Article Link': list_links,
}
df_show_info = pd.DataFrame(data=show_info_columns)

In [53]:
# Show the frame holding the article texts
df_features

Unnamed: 0,Article Content
0,Desperate councils are being “ripped off” as p...
1,Jair Bolsonaro has been sworn in as the 42nd p...
2,Nasa scientists are celebrating after a spacec...
3,Counter-terrorism police continued to question...
4,Netflix has taken down an episode of a satiric...


In [54]:
# Show the frame holding the article titles and links
df_show_info

Unnamed: 0,Article Title,Article Link
0,Temporary housing Councils ‘being ripped off’...,https://www.theguardian.com/society/2019/jan/0...
1,Brazil Bolsonaro sworn in on wave of conserva...,https://www.theguardian.com/world/2019/jan/01/...
2,"Space Most distant ever flyby successful, say...",https://www.theguardian.com/science/2018/dec/3...
3,"Manchester stabbings Man, 25, quizzed by anti...",https://www.theguardian.com/uk-news/2019/jan/0...
4,Netflix Outrage after comedy show pulled for ...,https://www.theguardian.com/media/2019/jan/01/...
