### Import libraries 

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

###  Get content from url

In [4]:
url = 'https://www.reuters.com/article/us-shazam-m-a-apple-eu/eu-clears-apples-purchase-of-shazam-idUSKCN1LM1TZ'
html = requests.get(url).content

### Extract info from html

In [5]:
html

b'<!--[if !IE]> This has been served from cache <![endif]-->\n<!--[if !IE]> Request served from apache server: prodie--i-0f16a7b4f3b0ef730 <![endif]-->\n<!--[if !IE]> Cached on Tue, 04 Feb 2020 14:40:48 GMT and will expire on Tue, 04 Feb 2020 14:55:47 GMT <![endif]-->\n<!--[if !IE]> token: 880a0133-c7fb-4388-917b-f92c37ddb0b8 <![endif]-->\n<!--[if !IE]> App Server /prodie--i-09c8872164c281a36/ <![endif]-->\n\n<!doctype html><html lang="en" data-edition="BETAUS">\n    <head>\n\n    <title>\n                EU clears Apple\'s purchase of Shazam - Reuters</title>\n        <meta http-equiv="X-UA-Compatible" content="IE=edge"><meta charset="utf-8"><meta http-equiv="x-dns-prefetch-control" content="on"><link rel="dns-prefetch" href="//s1.reutersmedia.net"/><link rel="dns-prefetch" href="//s2.reutersmedia.net"/><link rel="dns-prefetch" href="//s3.reutersmedia.net"/><link rel="dns-prefetch" href="//s4.reutersmedia.net"/><link rel="dns-prefetch" href="//static.reuters.com"/><link rel="dns-prefe

In [7]:
soup = BeautifulSoup(html, 'lxml')

In [8]:
soup

<!--[if !IE]> This has been served from cache <![endif]--><!--[if !IE]> Request served from apache server: prodie--i-0f16a7b4f3b0ef730 <![endif]--><!--[if !IE]> Cached on Tue, 04 Feb 2020 14:40:48 GMT and will expire on Tue, 04 Feb 2020 14:55:47 GMT <![endif]--><!--[if !IE]> token: 880a0133-c7fb-4388-917b-f92c37ddb0b8 <![endif]--><!--[if !IE]> App Server /prodie--i-09c8872164c281a36/ <![endif]--><!DOCTYPE html>
<html data-edition="BETAUS" lang="en">
<head>
<title>
                EU clears Apple's purchase of Shazam - Reuters</title>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/><meta charset="utf-8"/><meta content="on" http-equiv="x-dns-prefetch-control"/><link href="//s1.reutersmedia.net" rel="dns-prefetch"/><link href="//s2.reutersmedia.net" rel="dns-prefetch"/><link href="//s3.reutersmedia.net" rel="dns-prefetch"/><link href="//s4.reutersmedia.net" rel="dns-prefetch"/><link href="//static.reuters.com" rel="dns-prefetch"/><link href="//www.googletagservices.com" rel="dns-pre

In [9]:
tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'h7', 'p']

text = [element.text for element in soup.find_all(tags)]

text

["EU clears Apple's purchase of Shazam",
 '2 Min Read',
 'BRUSSELS (Reuters) - The European Union approved Apple’s planned acquisition of British music discovery app Shazam on Thursday, saying an EU antitrust investigation showed it would not harm competition in the bloc. ',
 'The deal, announced in December last year, would help the iPhone maker better compete with Spotify, the industry leader in music streaming services. Shazam identifies songs when a smartphone is pointed at an audio source. ',
 '“After thoroughly analyzing Shazam’s user and music data, we found that their acquisition by Apple would not reduce competition in the digital music streaming market,” EU competition commissioner Margrethe Vestager said in a statement. ',
 '“Data is key in the digital economy. We must therefore carefully review transactions which lead to the acquisition of important sets of data, including potentially commercially sensitive ones,” she added. ',
 'The European Commission opened a full-scale 

### More complex tags  

Suppose we wanted to extract data that was contained within an HTML table and store it in a Pandas data frame. This objective makes our scraping task a bit more complex as we would need to identify the table within the HTML, identify the rows within the table, and then read and format the information within those rows so that they fit within a data frame. Let's look at an example of how we would extract a table containing life expectancies for each European country from Wikipedia.

In [10]:
url = 'https://en.wikipedia.org/wiki/List_of_European_countries_by_life_expectancy'

In [12]:
html = requests.get(url).content
soup = BeautifulSoup(html, 'lxml')

In [18]:
table = soup.find_all('table', {'class':'sortable wikitable'})[0]
table

<table class="sortable wikitable">
<tbody><tr bgcolor="#efefef">
<th>Rank
</th>
<th>Country</th>
<th><a href="/wiki/List_of_countries_by_life_expectancy" title="List of countries by life expectancy">Life expectancy</a><sup class="reference" id="cite_ref-:0_1-1"><a href="#cite_note-:0-1">[1]</a></sup>
</th></tr>
<tr>
<td>1
</td>
<td><span class="flagicon"><img alt="" class="thumbborder" data-file-height="800" data-file-width="1000" decoding="async" height="15" src="//upload.wikimedia.org/wikipedia/commons/thumb/e/ea/Flag_of_Monaco.svg/19px-Flag_of_Monaco.svg.png" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/e/ea/Flag_of_Monaco.svg/29px-Flag_of_Monaco.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/e/ea/Flag_of_Monaco.svg/38px-Flag_of_Monaco.svg.png 2x" width="19"/> </span><a href="/wiki/Monaco" title="Monaco">Monaco</a><sup class="reference" id="cite_ref-2"><a href="#cite_note-2">[2]</a></sup>
</td>
<td>89.4
</td></tr>
<tr>
<td>2
</td>
<td><span class="flagicon"><

In [23]:
rows = table.find_all('tr')

rows = [row.text.strip().split("\n") for row in rows]

rows = [list(filter(None, row)) for row in rows]
rows

[['Rank', 'Country', 'Life expectancy[1]'],
 ['1', '\xa0Monaco[2]', '89.4'],
 ['2', '\xa0San Marino[3]', '83.4'],
 ['3', '\xa0\xa0Switzerland', '83.0'],
 ['4', '\xa0Spain', '82.8'],
 ['5', '\xa0Liechtenstein', '82.7'],
 ['6', '\xa0Italy', '82.5'],
 ['7', '\xa0Norway', '82.5'],
 ['8', '\xa0Iceland', '82.5'],
 ['9', '\xa0Luxembourg', '82.3'],
 ['10', '\xa0France', '82.3'],
 ['11', '\xa0Sweden', '82.2'],
 ['12', '\xa0Malta', '81.8'],
 ['13', '\xa0Finland', '81.8'],
 ['14', '\xa0Ireland', '81.6'],
 ['15', '\xa0Netherlands', '81.5'],
 ['16', '\xa0Portugal', '81.1'],
 ['17', '\xa0Greece', '81.0'],
 ['18', '\xa0United Kingdom', '81.0'],
 ['19', '\xa0Belgium', '81.0'],
 ['20', '\xa0Austria', '80.9'],
 ['21', '\xa0Slovenia', '80.8'],
 ['22', '\xa0Denmark', '80.7'],
 ['23', '\xa0Germany', '80.6'],
 ['24', '\xa0Cyprus', '80.5'],
 ['25', '\xa0Albania', '78.3'],
 ['26', '\xa0Czech Republic', '78.3'],
 ['27', '\xa0Croatia', '78.0'],
 ['28', '\xa0Estonia', '77.7'],
 ['29', '\xa0Poland', '77.5'],
 ['3

In [27]:
colnames = ['Rank','Country','Life expectancy']

datos = rows

df = pd.DataFrame(datos[1:], columns = colnames)
df.head()

Unnamed: 0,Rank,Country,Life expectancy
0,1,Monaco[2],89.4
1,2,San Marino[3],83.4
2,3,Switzerland,83.0
3,4,Spain,82.8
4,5,Liechtenstein,82.7
