# Web Scraping using BeautifulSoup

In [1]:
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup

In [2]:
base_site = "https://en.wikipedia.org/wiki/Music"
# requests has no default timeout; without one the cell can hang indefinitely.
response = requests.get(base_site, timeout=10)
# Fail loudly on a 4xx/5xx answer instead of parsing an error page further down.
response.raise_for_status()
response

<Response [200]>

In [3]:
# response.content is the raw response body as bytes (note the b'...' preview below).
html = response.content
html[:100]

b'<!DOCTYPE html>\n<html class="client-nojs" lang="en" dir="ltr">\n<head>\n<meta charset="UTF-8"/>\n<title'

In [4]:
# Build the parse tree using the "html.parser" backend from the standard library.
soup = BeautifulSoup(html, "html.parser")


## Exporting HTML as file

In [5]:
# prettify("utf-8") returns the indented document already encoded as UTF-8 bytes,
# which is why the file is opened in binary mode.
pretty_html = soup.prettify("utf-8")
with open("wiki_response.html", "wb") as out_file:
    out_file.write(pretty_html)

## Searching and navigating the HTML tree

### Searching - find() and find_all()

In [7]:
soup.find("head")

<head>
<meta charset="utf-8"/>
<title>Music - Wikipedia</title>
<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"f6ff61e4-2aaf-4937-923d-ccd7ac5632ec","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"Music","wgTitle":"Music","wgCurRevisionId":1050711744,"wgRevisionId":1050711744,"wgArticleId":18839,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["All articles with incomplete citations","Articles with incomplete citations from July 2019","CS1 maint: archived copy as title","CS1: Julian–Gregorian uncertainty","Webarchive template wayback links","Pages containing links to subscription-only content",
"

In [9]:
print(soup.find("video"))  # the page has no <video> tag, so find() returns None

None


In [10]:
soup.find("a")

<a id="top"></a>

In [12]:
# find_all() collects every matching tag (here every <a>) in document order.
links = soup.find_all("a")
links[:2]

[<a id="top"></a>,
 <a href="/wiki/Wikipedia:Protection_policy#semi" title="This article is semi-protected."><img alt="Page semi-protected" data-file-height="512" data-file-width="512" decoding="async" height="20" src="//upload.wikimedia.org/wikipedia/en/thumb/1/1b/Semi-protection-shackle.svg/20px-Semi-protection-shackle.svg.png" srcset="//upload.wikimedia.org/wikipedia/en/thumb/1/1b/Semi-protection-shackle.svg/30px-Semi-protection-shackle.svg.png 1.5x, //upload.wikimedia.org/wikipedia/en/thumb/1/1b/Semi-protection-shackle.svg/40px-Semi-protection-shackle.svg.png 2x" width="20"/></a>]

In [13]:
soup.find_all("video")  # nothing matches, so find_all() returns an empty list

[]

In [14]:
len(links)

2536

In [15]:
# find() returns only the first <tbody> in the document.
table = soup.find('tbody')

In [16]:
table

<tbody><tr><td class="mbox-image"><div style="width:52px"><a class="image" href="/wiki/File:Question_book-new.svg"><img alt="" data-file-height="399" data-file-width="512" decoding="async" height="39" src="//upload.wikimedia.org/wikipedia/en/thumb/9/99/Question_book-new.svg/50px-Question_book-new.svg.png" srcset="//upload.wikimedia.org/wikipedia/en/thumb/9/99/Question_book-new.svg/75px-Question_book-new.svg.png 1.5x, //upload.wikimedia.org/wikipedia/en/thumb/9/99/Question_book-new.svg/100px-Question_book-new.svg.png 2x" width="50"/></a></div></td><td class="mbox-text"><div class="mbox-text-span">This article <b>needs additional citations for <a href="/wiki/Wikipedia:Verifiability" title="Wikipedia:Verifiability">verification</a></b>.<span class="hide-when-compact"> Please help <a class="external text" href="https://en.wikipedia.org/w/index.php?title=Music&amp;action=edit">improve this article</a> by <a href="/wiki/Help:Referencing_for_beginners" title="Help:Referencing for beginners">a

In [17]:
type(table)

bs4.element.Tag

In [18]:
table.find_all('td')

[<td class="mbox-image"><div style="width:52px"><a class="image" href="/wiki/File:Question_book-new.svg"><img alt="" data-file-height="399" data-file-width="512" decoding="async" height="39" src="//upload.wikimedia.org/wikipedia/en/thumb/9/99/Question_book-new.svg/50px-Question_book-new.svg.png" srcset="//upload.wikimedia.org/wikipedia/en/thumb/9/99/Question_book-new.svg/75px-Question_book-new.svg.png 1.5x, //upload.wikimedia.org/wikipedia/en/thumb/9/99/Question_book-new.svg/100px-Question_book-new.svg.png 2x" width="50"/></a></div></td>,
 <td class="mbox-text"><div class="mbox-text-span">This article <b>needs additional citations for <a href="/wiki/Wikipedia:Verifiability" title="Wikipedia:Verifiability">verification</a></b>.<span class="hide-when-compact"> Please help <a class="external text" href="https://en.wikipedia.org/w/index.php?title=Music&amp;action=edit">improve this article</a> by <a href="/wiki/Help:Referencing_for_beginners" title="Help:Referencing for beginners">adding c

In [21]:
len(table.find_all('td'))

2

### Navigating Tree

In [22]:
table.contents

[<tr><td class="mbox-image"><div style="width:52px"><a class="image" href="/wiki/File:Question_book-new.svg"><img alt="" data-file-height="399" data-file-width="512" decoding="async" height="39" src="//upload.wikimedia.org/wikipedia/en/thumb/9/99/Question_book-new.svg/50px-Question_book-new.svg.png" srcset="//upload.wikimedia.org/wikipedia/en/thumb/9/99/Question_book-new.svg/75px-Question_book-new.svg.png 1.5x, //upload.wikimedia.org/wikipedia/en/thumb/9/99/Question_book-new.svg/100px-Question_book-new.svg.png 2x" width="50"/></a></div></td><td class="mbox-text"><div class="mbox-text-span">This article <b>needs additional citations for <a href="/wiki/Wikipedia:Verifiability" title="Wikipedia:Verifiability">verification</a></b>.<span class="hide-when-compact"> Please help <a class="external text" href="https://en.wikipedia.org/w/index.php?title=Music&amp;action=edit">improve this article</a> by <a href="/wiki/Help:Referencing_for_beginners" title="Help:Referencing for beginners">adding 

In [24]:
len(table.contents)

1

In [25]:
table.parent

<table class="box-More_citations_needed plainlinks metadata ambox ambox-content ambox-Refimprove" role="presentation"><tbody><tr><td class="mbox-image"><div style="width:52px"><a class="image" href="/wiki/File:Question_book-new.svg"><img alt="" data-file-height="399" data-file-width="512" decoding="async" height="39" src="//upload.wikimedia.org/wikipedia/en/thumb/9/99/Question_book-new.svg/50px-Question_book-new.svg.png" srcset="//upload.wikimedia.org/wikipedia/en/thumb/9/99/Question_book-new.svg/75px-Question_book-new.svg.png 1.5x, //upload.wikimedia.org/wikipedia/en/thumb/9/99/Question_book-new.svg/100px-Question_book-new.svg.png 2x" width="50"/></a></div></td><td class="mbox-text"><div class="mbox-text-span">This article <b>needs additional citations for <a href="/wiki/Wikipedia:Verifiability" title="Wikipedia:Verifiability">verification</a></b>.<span class="hide-when-compact"> Please help <a class="external text" href="https://en.wikipedia.org/w/index.php?title=Music&amp;action=edi

In [27]:
# len() of a Tag counts its direct children, i.e. len(tag.contents).
len(table.parent.parent)

514

### Searching the Attributes 
Find the element shown below by its `id` attribute.  

```HTML
    <div id="siteSub">...</div>
```

In [28]:
soup.find('div', id='siteSub')

<div class="noprint" id="siteSub">From Wikipedia, the free encyclopedia</div>

### Passing attributes as keyword arguments

In [31]:
# "class" is a reserved word in Python, so bs4 takes the CSS class via the class_ keyword.
soup.find_all('a', class_="mw-jump-link")

[<a class="mw-jump-link" href="#mw-head">Jump to navigation</a>,
 <a class="mw-jump-link" href="#searchInput">Jump to search</a>]

In [32]:
soup.find('a', class_="mw-jump-link", href = "#mw-head")  

<a class="mw-jump-link" href="#mw-head">Jump to navigation</a>

### Placing the attributes in a dictionary

In [35]:
# Inside an attrs dictionary the key is the plain string 'class'; no trailing underscore is needed.
soup.find('a', attrs= {'class':'mw-jump-link', 'href':"#mw-head"})

<a class="mw-jump-link" href="#mw-head">Jump to navigation</a>

In [34]:
soup.find('a', {'class':'mw-jump-link', 'href':"#mw-head"})

<a class="mw-jump-link" href="#mw-head">Jump to navigation</a>

## Extracting data from the HTML tree

In [36]:
a = soup.find('a', class_='mw-jump-link')
a

<a class="mw-jump-link" href="#mw-head">Jump to navigation</a>

In [37]:
a.name

'a'

### Getting the attribute value

In [38]:
a['href']

'#mw-head'

In [39]:
a['class']  # returned as a list because the class attribute can hold several values

['mw-jump-link']

In [40]:
a.get('href')

'#mw-head'

In [41]:
a.get('class')

['mw-jump-link']

### Difference between the two ways of reading an attribute: `tag[...]` vs `tag.get(...)`

In [42]:
# Subscript access raises KeyError when the attribute is missing. The error is
# caught and printed so the demonstration does not abort a "Restart & Run All".
try:
    a['id']
except KeyError as err:
    print("KeyError:", err)

KeyError: 'id'

In [44]:
print(a.get('id'))

None


In [45]:
repr(a.get('id'))

'None'

In [46]:
a.attrs

{'class': ['mw-jump-link'], 'href': '#mw-head'}

## Extracting text 
### .string vs .text

In [47]:
a.string

'Jump to navigation'

In [48]:
a.text

'Jump to navigation'

#### They behave differently when an element contains more than one distinct string

In [50]:
# Pick the second <p> on the page (index 1); it mixes plain text with nested tags.
p = soup.find_all('p')[1]
p

<p><b>Music</b> is the <a href="/wiki/The_arts" title="The arts">art</a> of arranging <a href="/wiki/Sound" title="Sound">sounds</a> in time through the <a href="/wiki/Elements_of_music" title="Elements of music">elements</a> of melody, harmony, rhythm, and timbre.<sup class="reference" id="cite_ref-1"><a href="#cite_note-1">[1]</a></sup><sup class="reference" id="cite_ref-2"><a href="#cite_note-2">[2]</a></sup> It is one of the <a href="/wiki/Cultural_universal" title="Cultural universal">universal cultural</a> aspects of all human societies. General <a class="mw-redirect" href="/wiki/Definitions_of_music" title="Definitions of music">definitions of music</a> include common elements such as <a href="/wiki/Pitch_(music)" title="Pitch (music)">pitch</a> (which governs <a href="/wiki/Melody" title="Melody">melody</a> and <a href="/wiki/Harmony" title="Harmony">harmony</a>), <a href="/wiki/Rhythm" title="Rhythm">rhythm</a> (and its associated concepts <a href="/wiki/Tempo" title="Tempo">t

In [51]:
p.text

'Music is the art of arranging sounds in time through the elements of melody, harmony, rhythm, and timbre.[1][2] It is one of the universal cultural aspects of all human societies. General definitions of music include common elements such as pitch (which governs melody and harmony), rhythm (and its associated concepts tempo, meter, and articulation), dynamics (loudness and softness), and the sonic qualities of timbre and texture (which are sometimes termed the "color" of a musical sound). Different styles or types of music may emphasize, de-emphasize or omit some of these elements. Music is performed with a vast range of instruments and vocal techniques ranging from singing to rapping; there are solely instrumental pieces, solely vocal pieces (such as songs without instrumental accompaniment) and pieces that combine singing and instruments. The word derives from Greek μουσική (mousiké; "(art) of the Muses").[3]\n'

In [52]:
p.string

In [54]:
repr(p.string)

'None'

In [58]:
print(p.parent.text[:1000])

Form of art using sound and silence
For other uses, see Music (disambiguation).


This article needs additional citations for verification. Please help improve this article by adding citations to reliable sources. Unsourced material may be challenged and removed.Find sources: "Music" – news · newspapers · books · scholar · JSTOR (October 2021) (Learn how and when to remove this template message)
 Allegory of Music, by François Boucher, 1764
Performing arts
Acrobatics
Ballet
Circus skills
Clown
Dance
Gymnastics
Magic
Mime
Music
Opera
Professional wrestling
Puppetry
Speech
Stand-up comedy
Theatre
Ventriloquism
vte
Music is the art of arranging sounds in time through the elements of melody, harmony, rhythm, and timbre.[1][2] It is one of the universal cultural aspects of all human societies. General definitions of music include common elements such as pitch (which governs melody and harmony), rhythm (and its associated concepts tempo, meter, and articulation), dynamics (loudness and softn

In [62]:
print(soup.text[:5000])





Music - Wikipedia

































Music

From Wikipedia, the free encyclopedia



Jump to navigation
Jump to search
Form of art using sound and silence
For other uses, see Music (disambiguation).


This article needs additional citations for verification. Please help improve this article by adding citations to reliable sources. Unsourced material may be challenged and removed.Find sources: "Music" – news · newspapers · books · scholar · JSTOR (October 2021) (Learn how and when to remove this template message)
 Allegory of Music, by François Boucher, 1764
Performing arts
Acrobatics
Ballet
Circus skills
Clown
Dance
Gymnastics
Magic
Mime
Music
Opera
Professional wrestling
Puppetry
Speech
Stand-up comedy
Theatre
Ventriloquism
vte
Music is the art of arranging sounds in time through the elements of melody, harmony, rhythm, and timbre.[1][2] It is one of the universal cultural aspects of all human societies. General definitions of music include common elements such as pi

### .strings and .stripped_strings  (Iterator)

In [63]:
# .strings is a generator over every text fragment inside the tag;
# repr() makes the leading/trailing whitespace of each fragment visible.
for s in p.strings:
    print(repr(s))

'Music'
' is the '
'art'
' of arranging '
'sounds'
' in time through the '
'elements'
' of melody, harmony, rhythm, and timbre.'
'[1]'
'[2]'
' It is one of the '
'universal cultural'
' aspects of all human societies. General '
'definitions of music'
' include common elements such as '
'pitch'
' (which governs '
'melody'
' and '
'harmony'
'), '
'rhythm'
' (and its associated concepts '
'tempo'
', '
'meter'
', and '
'articulation'
'), '
'dynamics'
' (loudness and softness), and the sonic qualities of '
'timbre'
' and '
'texture'
' (which are sometimes termed the "color" of a musical sound). Different '
'styles or types'
' of music may emphasize, de-emphasize or omit some of these elements. Music is performed with a vast range of '
'instruments'
' and vocal techniques ranging from '
'singing'
' to '
'rapping'
'; there are solely '
'instrumental pieces'
', '
'solely vocal pieces'
' (such as songs without instrumental '
'accompaniment'
') and pieces that combine singing and instruments. The

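`.stripped_strings` yields the same strings with leading and trailing whitespace removed; strings that consist only of whitespace are skipped.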

In [64]:
# Same iteration as with .strings, but each fragment arrives already stripped.
for s in p.stripped_strings:
    print(repr(s))

'Music'
'is the'
'art'
'of arranging'
'sounds'
'in time through the'
'elements'
'of melody, harmony, rhythm, and timbre.'
'[1]'
'[2]'
'It is one of the'
'universal cultural'
'aspects of all human societies. General'
'definitions of music'
'include common elements such as'
'pitch'
'(which governs'
'melody'
'and'
'harmony'
'),'
'rhythm'
'(and its associated concepts'
'tempo'
','
'meter'
', and'
'articulation'
'),'
'dynamics'
'(loudness and softness), and the sonic qualities of'
'timbre'
'and'
'texture'
'(which are sometimes termed the "color" of a musical sound). Different'
'styles or types'
'of music may emphasize, de-emphasize or omit some of these elements. Music is performed with a vast range of'
'instruments'
'and vocal techniques ranging from'
'singing'
'to'
'rapping'
'; there are solely'
'instrumental pieces'
','
'solely vocal pieces'
'(such as songs without instrumental'
'accompaniment'
') and pieces that combine singing and instruments. The word derives from'
'Greek'
'μουσική'
'

## Practical Examples

### Links - Absolute path URL

In [66]:
link = links[26]
link

<a href="/wiki/Magic_(illusion)" title="Magic (illusion)">Magic</a>

In [67]:
link.string  # .string works here because the tag holds a single string and no nested tags

'Magic'

In [68]:
link['href']  # a site-relative path, not a complete (absolute) URL

'/wiki/Magic_(illusion)'

In [69]:
from urllib.parse import urljoin

In [70]:
base_site

'https://en.wikipedia.org/wiki/Music'

In [71]:
relative_url = link['href']
relative_url

'/wiki/Magic_(illusion)'

In [72]:
full_url = urljoin(base_site, relative_url)
full_url

'https://en.wikipedia.org/wiki/Magic_(illusion)'

### Processing multiple links at once

In [74]:
# The first anchor has no href, so .get() yields None for it;
# square-bracket access (l['href']) would raise KeyError instead.
[l.get('href') for l in links[:20]]

[None,
 '/wiki/Wikipedia:Protection_policy#semi',
 '#mw-head',
 '#searchInput',
 '/wiki/Music_(disambiguation)',
 '/wiki/File:Question_book-new.svg',
 '/wiki/Wikipedia:Verifiability',
 'https://en.wikipedia.org/w/index.php?title=Music&action=edit',
 '/wiki/Help:Referencing_for_beginners',
 '//www.google.com/search?as_eq=wikipedia&q=%22Music%22',
 '//www.google.com/search?tbm=nws&q=%22Music%22+-wikipedia',
 '//www.google.com/search?&q=%22Music%22&tbs=bkt:s&tbm=bks',
 '//www.google.com/search?tbs=bks:1&q=%22Music%22+-wikipedia',
 '//scholar.google.com/scholar?q=%22Music%22',
 'https://www.jstor.org/action/doBasicSearch?Query=%22Music%22&acc=on&wc=on',
 '/wiki/Help:Maintenance_template_removal',
 '/wiki/File:Fran%C3%A7ois_Boucher,_Allegory_of_Music,_1764,_NGA_32680.jpg',
 '/wiki/File:Fran%C3%A7ois_Boucher,_Allegory_of_Music,_1764,_NGA_32680.jpg',
 '/wiki/Fran%C3%A7ois_Boucher',
 '/wiki/Performing_arts']

In [75]:
# Keep only the anchors that actually carry an href attribute.
# None is a singleton, so it is compared by identity (is not) rather than by !=.
clean_links = [anchor for anchor in links if anchor.get('href') is not None]

In [79]:
# Pull the raw href value out of each remaining anchor.
relative_urls = [l.get('href') for l in clean_links]
relative_urls[:5]

['/wiki/Wikipedia:Protection_policy#semi',
 '#mw-head',
 '#searchInput',
 '/wiki/Music_(disambiguation)',
 '/wiki/File:Question_book-new.svg']

In [81]:
# urljoin resolves each href against base_site: site-relative paths gain the scheme
# and host, and '#fragment' links are attached to the current page URL.
full_urls = [urljoin(base_site, url) for url in relative_urls]
full_urls[:5]

['https://en.wikipedia.org/wiki/Wikipedia:Protection_policy#semi',
 'https://en.wikipedia.org/wiki/Music#mw-head',
 'https://en.wikipedia.org/wiki/Music#searchInput',
 'https://en.wikipedia.org/wiki/Music_(disambiguation)',
 'https://en.wikipedia.org/wiki/File:Question_book-new.svg']

In [82]:
def _is_wikipedia_host(url):
    """Return True when the URL's hostname is wikipedia.org or one of its subdomains."""
    host = urlparse(url).hostname or ""
    return host == "wikipedia.org" or host.endswith(".wikipedia.org")


# Test the parsed hostname instead of searching the whole URL string: a plain
# substring check would also accept external URLs that merely mention
# "wikipedia.org" in their path or query string.
internal_links = [url for url in full_urls if _is_wikipedia_host(url)]
internal_links[:5]

['https://en.wikipedia.org/wiki/Wikipedia:Protection_policy#semi',
 'https://en.wikipedia.org/wiki/Music#mw-head',
 'https://en.wikipedia.org/wiki/Music#searchInput',
 'https://en.wikipedia.org/wiki/Music_(disambiguation)',
 'https://en.wikipedia.org/wiki/File:Question_book-new.svg']

## Extracting data from Nested tags

In [83]:
# Select every <div> whose role attribute is "note".
div_notes = soup.find_all("div", {"role":"note"})
div_notes[:2]

[<div class="hatnote navigation-not-searchable" role="note">For other uses, see <a class="mw-disambig" href="/wiki/Music_(disambiguation)" title="Music (disambiguation)">Music (disambiguation)</a>.</div>,
 <div class="hatnote navigation-not-searchable" role="note">Main article: <a href="/wiki/Musical_composition" title="Musical composition">Musical composition</a></div>]

In [84]:
div_notes[0].find('a')

<a class="mw-disambig" href="/wiki/Music_(disambiguation)" title="Music (disambiguation)">Music (disambiguation)</a>

In [85]:
# find() yields just the first <a> inside each note (None if a note has no link).
div_links = [div.find('a') for div in div_notes]
div_links[:2]

[<a class="mw-disambig" href="/wiki/Music_(disambiguation)" title="Music (disambiguation)">Music (disambiguation)</a>,
 <a href="/wiki/Musical_composition" title="Musical composition">Musical composition</a>]

In [86]:
len(div_links)

23

The approach above is naive: as shown below, a single `div` can contain several links, and `find` returns only the first one.

In [90]:
div_notes[7]

<div class="hatnote navigation-not-searchable" role="note">See also: <a href="/wiki/Strophic_form" title="Strophic form">Strophic form</a>, <a href="/wiki/Binary_form" title="Binary form">Binary form</a>, <a href="/wiki/Ternary_form" title="Ternary form">Ternary form</a>, <a class="mw-redirect" href="/wiki/Rondo_form" title="Rondo form">Rondo form</a>, <a href="/wiki/Variation_(music)" title="Variation (music)">Variation (music)</a>, and <a class="mw-redirect" href="/wiki/Musical_development" title="Musical development">Musical development</a></div>

In [91]:
div_notes[7].find_all('a')

[<a href="/wiki/Strophic_form" title="Strophic form">Strophic form</a>,
 <a href="/wiki/Binary_form" title="Binary form">Binary form</a>,
 <a href="/wiki/Ternary_form" title="Ternary form">Ternary form</a>,
 <a class="mw-redirect" href="/wiki/Rondo_form" title="Rondo form">Rondo form</a>,
 <a href="/wiki/Variation_(music)" title="Variation (music)">Variation (music)</a>,
 <a class="mw-redirect" href="/wiki/Musical_development" title="Musical development">Musical development</a>]

In [92]:
# Flatten all anchors of all note <div>s into one list; find_all keeps every
# link of a note rather than only the first.
div_links = [anchor for note in div_notes for anchor in note.find_all('a')]
len(div_links)

30

In [93]:
# Resolve each note link against the base page. Anchors without an href are
# skipped: urljoin(base_site, None) would silently return base_site itself.
note_urls = [
    urljoin(base_site, anchor["href"])
    for anchor in div_links
    if anchor.get("href")
]

In [94]:
len(note_urls), note_urls[:4]

(30,
 ['https://en.wikipedia.org/wiki/Music_(disambiguation)',
  'https://en.wikipedia.org/wiki/Musical_composition',
  'https://en.wikipedia.org/wiki/Musical_notation',
  'https://en.wikipedia.org/wiki/Musical_improvisation'])

## Scraping Multiple Pages automatically

In [98]:
# Download every note URL and collect the text of its <p> tags.
# Invariant: par_text[k] belongs to note_urls[k]. A page that cannot be fetched
# contributes an empty list, so pairing the two lists with zip() later on never
# attaches text to the wrong URL.
par_text = []

for i, url in enumerate(note_urls, start=1):
    try:
        # The timeout keeps one unresponsive page from stalling the whole run.
        note_response = requests.get(url, timeout=10)
    except requests.RequestException as err:
        print("request failed ({0}): Skipping URL #{1}: {2}".format(err, i, url))
        par_text.append([])
        continue

    if note_response.status_code != 200:
        print("status code {0}: Skipping URL #{1}: {2}".format(note_response.status_code, i, url))
        par_text.append([])
        continue

    print("URL #{0}: {1} ".format(i, url))

    # The 'lxml' backend needs the lxml package installed alongside bs4.
    note_soup = BeautifulSoup(note_response.content, 'lxml')

    # One list of paragraph strings per page.
    par_text.append([par.text for par in note_soup.find_all("p")])

URL #1: https://en.wikipedia.org/wiki/Music_(disambiguation) 
URL #2: https://en.wikipedia.org/wiki/Musical_composition 
URL #3: https://en.wikipedia.org/wiki/Musical_notation 
URL #4: https://en.wikipedia.org/wiki/Musical_improvisation 
URL #5: https://en.wikipedia.org/wiki/Music_theory 
URL #6: https://en.wikipedia.org/wiki/History_of_music 
URL #7: https://en.wikipedia.org/wiki/Elements_of_music 
URL #8: https://en.wikipedia.org/wiki/Strophic_form 
URL #9: https://en.wikipedia.org/wiki/Binary_form 
URL #10: https://en.wikipedia.org/wiki/Ternary_form 
URL #11: https://en.wikipedia.org/wiki/Rondo_form 
URL #12: https://en.wikipedia.org/wiki/Variation_(music) 
URL #13: https://en.wikipedia.org/wiki/Musical_development 
URL #14: https://en.wikipedia.org/wiki/History_of_music 
URL #15: https://en.wikipedia.org/wiki/Music_of_Egypt 
URL #16: https://en.wikipedia.org/wiki/20th-century_music 
URL #17: https://en.wikipedia.org/wiki/Aesthetics_of_music 
URL #18: https://en.wikipedia.org/wiki/N

In [99]:
par_text[0]

['Music is an art form consisting of sound and silence, expressed through time.\n',
 'Music may also refer to:\n']

In [101]:
# Join the paragraphs of the first page into one string. It gets its own name
# because `page_text` is used further down for a list with one string per page;
# reusing one name for a str and then a list invites stale-state bugs.
first_page_text = "".join(par_text[0])
first_page_text

'Music is an art form consisting of sound and silence, expressed through time.\nMusic may also refer to:\n'

In [103]:
# Collapse each page's list of paragraphs into a single string (one entry per page).
page_text = ["".join(text) for text in par_text]
len(page_text)

30

In [107]:
print(page_text[0])

Music is an art form consisting of sound and silence, expressed through time.
Music may also refer to:



In [108]:
# zip() silently stops at the shorter input, so verify that both lists line up
# before pairing them; a mismatch would map URLs to the wrong page text.
assert len(note_urls) == len(page_text), "note_urls and page_text are misaligned"
# A URL that occurs more than once in note_urls ends up as a single dictionary key.
url_to_text = dict(zip(note_urls, page_text))

In [110]:
print(url_to_text['https://en.wikipedia.org/wiki/Music_theory'][:300])


Music theory is the study of the practices and possibilities of music. The Oxford Companion to Music describes three interrelated uses of the term "music theory". The first is the "rudiments", that are needed to understand music notation (key signatures, time signatures, and rhythmic notation); the
