# Web Scraping: The Mirror

In [1]:
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import time

### Obtain list of news from the coverpage

URL definition:

In [2]:
# url definition
url = "https://www.mirror.co.uk/"

List of news:

In [19]:
# Request
r1 = requests.get(url)
print(r1.status_code)

# We'll save in coverpage the cover page content
coverpage = r1.content

# Soup creation
soup1 = BeautifulSoup(coverpage, 'html5lib')

# News identification
coverpage_news = soup1.find_all('a', class_='headline publication-font')
len(coverpage_news)

200


20

Now we have a list in which every element is a news article:

In [58]:
coverpage_news[0]

<a class="headline publication-font" href="https://www.mirror.co.uk/news/uk-news/manhunt-dangerous-ex-woman-found-13807713">Manhunt for 'dangerous' ex of woman found bludgeoned to death in her flat on New Year's Eve</a>

In [71]:
n=0
link = coverpage_news[n]['href']
title = coverpage_news[n].get_text()
article = requests.get(link)
article_content = article.content
soup_article = BeautifulSoup(article_content, 'html5lib')


In [56]:
title

"Manhunt for 'dangerous' ex of woman found bludgeoned to death in her flat on New Year's Eve"

In [61]:
body = soup_article.find_all('p')

In [70]:
body

[<p itemprop="description">Jude Jones, 33, of West Bromwich, was reported missing by her sister on Boxing Day and police want to track down Michael Foran</p>,
 <p>Murder detectives have launched a manhunt for the 'dangerous' ex of a woman who was killed in her flat on <a href="https://www.mirror.co.uk/all-about/new-year">New Year</a>’s Eve.</p>,
 <p>Jude Jones, 33, was found dead at her home in West Bromwich, at around 2am on December 31 and police have been hunting her former partner Michael Foran, 32. </p>,
 <p>Previously she had been reported missing by her sister on Boxing Day.</p>,
 <p><a href="https://www.mirror.co.uk/all-about/police">Police</a> believe Ms Jones died following a “sustained and brutal assault with a weapon” but an official cause of death is yet to be determined.</p>,
 <p>West Midlands Police believes Foran travelled to Liverpool in the wake of her disappearance.</p>,
 <p>Newly-released CCTV images show him at an Esso garage in the Anfield area of the city.</p>,
 

In [67]:
len(body)

17

In [72]:
x = soup_article.find_all('p')

In [76]:
len(x)

17

In [75]:
x[0].get_text()

'Jude Jones, 33, of West Bromwich, was reported missing by her sister on Boxing Day and police want to track down Michael Foran'

In [78]:
list_links

['https://www.mirror.co.uk/news/uk-news/manhunt-dangerous-ex-woman-found-13807713',
 'https://www.mirror.co.uk/news/uk-news/park-lane-murder-abu-hamzas-13808112',
 'https://www.mirror.co.uk/tv/tv-news/dancing-on-ice-2019-cast-13341046',
 'https://www.mirror.co.uk/film/bird-box-six-clues-villain-13803555',
 'https://www.mirror.co.uk/film/favourite-feud-inside-lady-marlboroughs-13807193']

### Let's extract the text from the articles:

First, we'll define the number of articles we want:

In [44]:
number_of_articles = 5

In [97]:
# Empty lists for content, links and titles
news_contents = []
list_links = []
list_titles = []

for n in np.arange(0, number_of_articles):
        
    # Getting the link of the article
    link = coverpage_news[n]['href']
    list_links.append(link)
    
    # Getting the title
    title = coverpage_news[n].get_text()
    list_titles.append(title)
    
    # Reading the content (it is divided in paragraphs)
    article = requests.get(link)
    article_content = article.content
    soup_article = BeautifulSoup(article_content, 'html5lib')
    x = soup_article.find_all('p')
    
    # Unifying the paragraphs
    list_paragraphs = []
    for p in np.arange(0, len(x)):
        paragraph = x[p].get_text()
        list_paragraphs.append(paragraph)
        final_article = " ".join(list_paragraphs)
        
    news_contents.append(final_article)

Let's put them into:
* a dataset which will the input of the models (`df_features`)
* a dataset with the title and the link (`df_show_info`)

In [80]:
# df_features
df_features = pd.DataFrame(
     {'Article Content': news_contents 
    })

# df_show_info
df_show_info = pd.DataFrame(
    {'Article Title': list_titles,
     'Article Link': list_links})

In [81]:
df_features

Unnamed: 0,Article Content
0,"Jude Jones, 33, of West Bromwich, was reported..."
1,"Imran Mostafa Kamel, 26, has been charged with..."
2,Dancing On Ice returns to our screens on ITV o...
3,Dancing On Ice returns to our screens on ITV o...
4,Dancing On Ice returns to our screens on ITV o...


In [82]:
df_show_info

Unnamed: 0,Article Title,Article Link
0,Manhunt for 'dangerous' ex of woman found blud...,https://www.mirror.co.uk/news/uk-news/manhunt-...
1,Hate preacher's son charged with firearms offe...,https://www.mirror.co.uk/news/uk-news/park-lan...
2,Dancing On Ice is back within days! Celebritie...,https://www.mirror.co.uk/tv/tv-news/dancing-on...
3,Bird Box: Did you clock any of the six clues a...,https://www.mirror.co.uk/film/bird-box-six-clu...
4,The Favourite feud: Inside Lady Marlborough's ...,https://www.mirror.co.uk/film/favourite-feud-i...


### Time Elapsed

We are interested in how much time the script takes to get the news because this will impact directly on user experience. For this, we'll put it all into a single function and then call it:

In [115]:
def get_news_themirror():
    
    # url definition
    url = "https://www.mirror.co.uk/"
    
    # Request
    r1 = requests.get(url)
    r1.status_code

    # We'll save in coverpage the cover page content
    coverpage = r1.content

    # Soup creation
    soup1 = BeautifulSoup(coverpage, 'html5lib')

    # News identification
    coverpage_news = soup1.find_all('a', class_='headline publication-font')
    len(coverpage_news)
    
    number_of_articles = 5

    # Empty lists for content, links and titles
    news_contents = []
    list_links = []
    list_titles = []

    for n in np.arange(0, number_of_articles):

        # Getting the link of the article
        link = coverpage_news[n]['href']
        list_links.append(link)

        # Getting the title
        title = coverpage_news[n].get_text()
        list_titles.append(title)

        # Reading the content (it is divided in paragraphs)
        article = requests.get(link)
        article_content = article.content
        soup_article = BeautifulSoup(article_content, 'html5lib')
        body = soup_article.find_all('div', class_='articulo-cuerpo')
        x = soup_article.find_all('p')

        # Unifying the paragraphs
        list_paragraphs = []
        for p in np.arange(0, len(x)):
            paragraph = x[p].get_text()
            list_paragraphs.append(paragraph)
            final_article = " ".join(list_paragraphs)

        news_contents.append(final_article)

    # df_features
    df_features = pd.DataFrame(
         {'Content': news_contents 
        })

    # df_show_info
    df_show_info = pd.DataFrame(
        {'Article Title': list_titles,
         'Article Link': list_links,
         'Newspaper': 'El Pais English'})
    
    return (df_features, df_show_info)

In [116]:
start = time.time()
x, y = get_news_themirror()
end =time.time()
te = end-start
print("The time elapsed is %f seconds" %(te))

The time elapsed is 11.067029 seconds
