# Web Scraping: Daily Mail

In [60]:
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import re

### Obtain list of news from the coverpage

URL definition:

In [28]:
# Base URL of the site being scraped. The hrefs found on the cover page are
# site-relative (they start with "/"), so this value is prepended to them
# when building absolute article links.
url = "https://www.dailymail.co.uk"

List of news:

In [29]:
# Request the cover page
r1 = requests.get(url)

# Fail loudly on an HTTP error status instead of silently parsing an error
# page, which would only produce an empty or misleading list of headlines
r1.raise_for_status()

# We'll save in coverpage the cover page content (the raw response body)
coverpage = r1.content

# Soup creation
soup1 = BeautifulSoup(coverpage, 'html5lib')

# News identification: each headline is an <h2 class="linkro-darkred">
# element wrapping the <a> tag that carries the article link and title
coverpage_news = soup1.find_all('h2', class_='linkro-darkred')
len(coverpage_news)

132

Now we have a list in which every element is a news article:

In [25]:
# Display one headline element to inspect the markup that has to be parsed
coverpage_news[4]

<h2 class="linkro-darkred">
    <a href="/news/article-6542519/Its-David-Camer-gone-Ex-PM-loses-access-rights-House-Commons.html" itemprop="url">David Cameron loses his coveted Parliamentary pass after 'forgetting to renew it' in embarrassing admin blunder</a>
  </h2>

In [30]:
# Index of the headline used for the single-article walkthrough
n=4

In [32]:
# The href of the headline's <a> tag is site-relative, so prepend the base
# URL to obtain the absolute address of the article
link = url + coverpage_news[n].find('a')['href']
link

'https://www.dailymail.co.uk/news/article-6542519/Its-David-Camer-gone-Ex-PM-loses-access-rights-House-Commons.html'

In [34]:
# The article title is the text of the headline's <a> tag
title = coverpage_news[n].find('a').get_text()
title

"David Cameron loses his coveted Parliamentary pass after 'forgetting to renew it' in embarrassing admin blunder"

In [40]:
# Download the article page and parse it the same way as the cover page:
# raw response body -> BeautifulSoup tree built with the html5lib parser
article = requests.get(link)
article_content = article.content
soup_article = BeautifulSoup(article_content, 'html5lib')


In [45]:
# Collect the <p> elements carrying the "mol-para-with-font" class; the
# article text is split across them, one element per paragraph
body = soup_article.find_all('p', class_='mol-para-with-font')
body

[<p class="mol-para-with-font"><a href="/news/david_cameron/index.html" id="mol-74a51b80-0e79-11e9-815a-7b8456e996ba" style="font-weight: bold;">David Cameron</a> has lost his privileged access rights to the House of Commons - after apparently forgetting to renew his pass.</p>,
 <p class="mol-para-with-font">The former PM was among more than 400 ex-politicians able to roam the Parliamentary estate and use the subsidised facilities.  </p>,
 <p class="mol-para-with-font">But, despite rumours that he is plotting a comeback, Mr Cameron has fallen off the list due to an embarrassing administrative blunder. </p>,
 <p class="mol-para-with-font">The most recent roster released by the Commons authorities show he does not hold a pass - although his close ally <a href="/news/george_osborne/index.html" id="mol-74a5ded0-0e79-11e9-815a-7b8456e996ba" style="font-weight: bold;">George Osborne</a> has taken possession of one.</p>,
 <p class="mol-para-with-font">The House confirmed he is not currently a

In [51]:
# get_text() drops the markup (including nested <a> tags) and keeps the text
body[3].get_text()

'The most recent roster released by the Commons authorities show he does not hold a pass - although his close ally George Osborne has taken possession of one.'

### Let's extract the text from the articles:

First, we'll define the number of articles we want:

In [53]:
# How many cover page headlines (taken from the start of the list) to scrape
number_of_articles = 5

In [75]:
# Empty lists for content, links and titles
news_contents = []
list_links = []
list_titles = []

# Slice instead of indexing 0..number_of_articles-1 so the loop cannot raise
# IndexError when the cover page has fewer headlines than requested
for headline in coverpage_news[:number_of_articles]:

    # Getting the link and the title from the headline's <a> tag
    anchor = headline.find('a')
    link = url + anchor['href']
    title = anchor.get_text()

    # Reading the content (it is divided in paragraphs)
    article = requests.get(link)
    article_content = article.content
    soup_article = BeautifulSoup(article_content, 'html5lib')
    body = soup_article.find_all('p', class_='mol-para-with-font')

    # Unifying the paragraphs. The join runs exactly once per article and
    # does not depend on a paragraph loop executing: an article with no
    # matching paragraphs yields "" instead of reusing the previous article's
    # text (or raising NameError on the very first iteration)
    final_article = " ".join(paragraph.get_text() for paragraph in body)

    # Removing special characters (non-breaking spaces)
    final_article = re.sub("\\xa0", "", final_article)

    # Append to the three lists together, only once the article has been
    # fetched and parsed, so they stay aligned index-by-index
    list_links.append(link)
    list_titles.append(title)
    news_contents.append(final_article)

Let's put them into:
* a dataset which will be the input of the models (`df_features`)
* a dataset with the title and the link (`df_show_info`)

In [76]:
# df_features: one row per scraped article, holding only its full text
df_features = pd.DataFrame(data={'Article Content': news_contents})

# df_show_info: the title and link of each article, in the same row order
df_show_info = pd.DataFrame(
    data={
        'Article Title': list_titles,
        'Article Link': list_links,
    }
)

In [77]:
# Display the article contents
df_features

Unnamed: 0,Article Content
0,"They are healthy, happy and looking forward to..."
1,A female student has died after a 500ft 'horri...
2,Brexiteers have slammed Sadiq Khan's 'disgrace...
3,Britain woke up to temperatures of just -6C (2...
4,David Cameron has lost his privileged access r...


In [78]:
# Display the titles and links
df_show_info

Unnamed: 0,Article Title,Article Link
0,Conjoined twin girls who were given a one-in-a...,https://www.dailymail.co.uk/news/article-65452...
1,"Bristol University student, 22, becomes second...",https://www.dailymail.co.uk/news/article-65468...
2,Political fireworks! Remainer Sadiq Khan defen...,https://www.dailymail.co.uk/news/article-65441...
3,Britain's deep freeze: Temperatures plunge as ...,https://www.dailymail.co.uk/news/article-65463...
4,David Cameron loses his coveted Parliamentary ...,https://www.dailymail.co.uk/news/article-65425...
