BeautifulSoup documentation link: https://www.crummy.com/software/BeautifulSoup/bs4/doc/

In [1]:
# Sample HTML used by the cells below. It has no closing </body> or </html>
# tags; the parsed soup displayed further down shows them added.
html_doc = """<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

In [2]:
# A bare expression shows the string's repr, so line breaks appear as \n.
html_doc

'<html><head><title>The Dormouse\'s story</title></head>\n<body>\n<p class="title"><b>The Dormouse\'s story</b></p>\n\n<p class="story">Once upon a time there were three little sisters; and their names were\n<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,\n<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and\n<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;\nand they lived at the bottom of a well.</p>\n\n<p class="story">...</p>\n'

In [3]:
from bs4 import BeautifulSoup

In [4]:
# Parse the sample document using the 'html.parser' backend.
soup = BeautifulSoup(html_doc, 'html.parser')

In [5]:
# Display the parsed document; the output ends with the </body></html> tags
# that html_doc itself did not contain.
soup

<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body></html>

In [6]:
# prettify() renders the tree with each tag and string on its own indented line.
print(soup.prettify())

<html>
 <head>
  <title>
   The Dormouse's story
  </title>
 </head>
 <body>
  <p class="title">
   <b>
    The Dormouse's story
   </b>
  </p>
  <p class="story">
   Once upon a time there were three little sisters; and their names were
   <a class="sister" href="http://example.com/elsie" id="link1">
    Elsie
   </a>
   ,
   <a class="sister" href="http://example.com/lacie" id="link2">
    Lacie
   </a>
   and
   <a class="sister" href="http://example.com/tillie" id="link3">
    Tillie
   </a>
   ;
and they lived at the bottom of a well.
  </p>
  <p class="story">
   ...
  </p>
 </body>
</html>


In [7]:
# Attribute access by tag name returns the <title> tag.
soup.title

<title>The Dormouse's story</title>

In [8]:
# Printing the tag writes out its markup as text.
print(soup.title)

<title>The Dormouse's story</title>


In [9]:
# .name is the name of the tag itself ('title'), not the text inside it.
soup.title.name

'title'

In [10]:
# .string returns the text inside <title> (here "The Dormouse's story").
# The value is a bs4 NavigableString object, which displays like a normal string.
soup.title.string

"The Dormouse's story"

In [11]:
# .parent moves one level up the tree: <title> sits inside <head>.
soup.title.parent.name

'head'

In [12]:
# One level further up: <head> sits inside <html>.
soup.head.parent.name

'html'

In [13]:
# The parent of <html> is the BeautifulSoup object, whose name is '[document]'.
soup.html.parent.name

'[document]'

In [16]:
# '[document]' is the name of the BeautifulSoup object itself, not an attribute
# that can be accessed on it. The BeautifulSoup object is the root of the tree,
# so its own .parent is None.
print(soup.parent)

None

In [17]:
# Attribute access returns only the first <p> in the document.
soup.p

<p class="title"><b>The Dormouse's story</b></p>

In [18]:
# Index a tag like a dict to read an attribute: this is the class of the
# first paragraph, returned as a list.
soup.p['class']

['title']

In [20]:
# soup.a is the first anchor (<a>) tag.
# An <a> tag defines a hyperlink from one page to another: the text "Elsie"
# is what the page displays as the link, and clicking it opens
# http://example.com/elsie. This anchor's class is "sister".
soup.a

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

In [21]:
# find_all() collects every matching tag: here, all three anchor tags.
soup.find_all('a')

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [22]:
# The same call with 'p' collects all three paragraph tags.
soup.find_all('p')

[<p class="title"><b>The Dormouse's story</b></p>,
 <p class="story">Once upon a time there were three little sisters; and their names were
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
 and they lived at the bottom of a well.</p>,
 <p class="story">...</p>]

In [23]:
# find() with an attribute filter returns the tag whose id is "link3".
soup.find(id="link3")

<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>

In [25]:
# soup.find_all('a') gives back whole <a> tags, markup included.
# To print only each link target, walk the tags and read their href attribute.
for anchor in soup.find_all('a'):
    target = anchor.get('href')
    print(target)

http://example.com/elsie
http://example.com/lacie
http://example.com/tillie


In [26]:
# get_text() returns all the text in the document with the markup stripped out.

soup.get_text()

"The Dormouse's story\n\nThe Dormouse's story\nOnce upon a time there were three little sisters; and their names were\nElsie,\nLacie and\nTillie;\nand they lived at the bottom of a well.\n...\n"

In [27]:
# print() renders the \n characters as real line breaks.
print(soup.get_text())

The Dormouse's story

The Dormouse's story
Once upon a time there were three little sisters; and their names were
Elsie,
Lacie and
Tillie;
and they lived at the bottom of a well.
...



### Object types 
1. Tag
2. NavigableString
3. BeautifulSoup
4. Comment

## 1. Tag Objects

In [70]:
# Parse a document consisting of a single <b> tag and keep the result in `soup`.
soup = BeautifulSoup('<b class="boldest">Extremely bold</b>', 'html.parser')
tag = soup.b  # soup.b selects the <b> tag; "b" is the tag's name
type(tag)


bs4.element.Tag

In [29]:
# The Tag object displays as its own markup.
tag 

<b class="boldest">Extremely bold</b>

In [30]:
# The soup displays the same markup, because this document is just that one tag.
soup

<b class="boldest">Extremely bold</b>

In [32]:
# Assigning the soup itself (rather than soup.b) gives a BeautifulSoup object,
# not a Tag.
tag1= soup
type(tag1)

bs4.BeautifulSoup

In [33]:
# The tag's name, without brackets or attributes.
tag.name

'b'

In [34]:
# Reassigning .name renames the tag; the displayed markup changes to match.
tag.name = "blockquote"
tag

<blockquote class="boldest">Extremely bold</blockquote>

In [35]:
# Tags can carry attributes such as id; read a value with dict-style indexing.
tag = BeautifulSoup('<b id="boldest">bold</b>', 'html.parser').b
tag['id']

'boldest'

In [37]:
# .attrs exposes the tag's attribute names and values as a dictionary.

tag.attrs

{'id': 'boldest'}

In [38]:
# Attributes can be changed or added by assignment.
tag['id'] = 'verybold' # change the id value from 'boldest' to 'verybold'
tag['Second_attribute'] = 1 # add a second attribute
tag

<b Second_attribute="1" id="verybold">bold</b>

In [39]:
# del removes an attribute from the tag (the tag itself stays).
del tag['id']

In [40]:
tag # only Second_attribute is left

<b Second_attribute="1">bold</b>

In [41]:
# Remove the remaining attribute as well.
del tag['Second_attribute']

In [42]:
tag # both attributes are gone; only the tag and its text remain

<b>bold</b>

In [43]:
# The id attribute has been deleted, so dict-style access now raises KeyError.
tag['id']

KeyError: 'id'

In [44]:
tag.get('id') # .get() returns None for a missing attribute instead of raising, so nothing is displayed

#### Multi-valued attributes
- An attribute can hold several values at once.
- The most common multi-valued attribute is `class`.
- Below, one `<p>` is given the class `body` and another the classes `body strikeout`.
- Beautiful Soup returns a multi-valued attribute as a list, so `p['class']` gives every class name.

In [47]:
# A single class value still comes back as a one-element list.
css_soup = BeautifulSoup('<p class="body"></p>', 'html.parser')
css_soup.p['class']

['body']

In [48]:
# Two space-separated class names come back as a two-element list.
css_soup = BeautifulSoup('<p class="body strikeout"></p>', 'html.parser')
css_soup.p['class']

['body', 'strikeout']

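`class` is a multi-valued attribute; `id` is not. An `id` value that contains spaces is therefore kept as one string instead of being split into a list. Each cell below parses a new document, so it simply returns the `id` of the document it has just parsed.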

In [49]:
# id is not multi-valued, so a value containing a space stays one string.
id_soup = BeautifulSoup('<p id="my id"></p>', 'html.parser')
id_soup.p['id']

'my id'

In [50]:
# Same check with a longer value: it is still returned as a single string.
id_soup = BeautifulSoup('<p id="my id 2"></p>', 'html.parser')
id_soup.p['id']

'my id 2'

- The rel attribute specifies the relationship between the current document and the linked document.
![image.png](attachment:image.png)

In [51]:
# rel is returned as a list, like class.
rel_soup = BeautifulSoup('<p>Back to the <a rel="index">homepage</a></p>', 'html.parser')
rel_soup.a['rel']

['index']

In [52]:
# Assign a list to give rel two values: index and contents.
rel_soup.a['rel'] = ['index', 'contents']

# In the markup the two values are joined with a space: rel="index contents".
print(rel_soup.p)

<p>Back to the <a rel="index contents">homepage</a></p>


In [53]:
# Reading the attribute back returns the two values as a list.
rel_soup.a['rel']


['index', 'contents']

In [54]:
# Pass multi_valued_attributes=None as a keyword argument to the BeautifulSoup
# constructor to turn off list handling; class is then returned as one string.

no_list_soup = BeautifulSoup('<p class="body strikeout"></p>', 'html.parser', multi_valued_attributes=None)
no_list_soup.p['class']


'body strikeout'

In [55]:

# With the default settings each class name was returned as a separate list
# item. The keyword argument multi_valued_attributes=None disables that, so the
# whole class value comes back as a single string.
no_list_soup = BeautifulSoup('<p class="body strikeout 2"></p>', 'html.parser', multi_valued_attributes=None)
no_list_soup.p['class']

'body strikeout 2'

To always get an attribute's value as a list, use `get_attribute_list`
- It returns a list whether the attribute is single-valued or multi-valued

In [56]:
id_soup.p.get_attribute_list('id') # single-valued id: returned as a one-element list

['my id 2']

In [58]:
# no_list_soup was parsed with multi_valued_attributes=None, so the class
# value is one unsplit string inside the list.
no_list_soup.p.get_attribute_list('class')

['body strikeout 2']

In [59]:
# css_soup was parsed with default settings from class="body strikeout",
# so both class names are returned as separate items.
css_soup.p.get_attribute_list('class')

['body', 'strikeout']

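When a document is parsed as XML, `class` is not treated as multi-valued and comes back as one string. Passing `multi_valued_attributes=class_is_multi` makes the parser split `class` into a list of values again.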

In [61]:
# With the 'xml' parser, class is not split: it comes back as one string.
xml_soup = BeautifulSoup('<p class="body strikeout"></p>', 'xml')
xml_soup.p['class'] 

'body strikeout'

In [60]:
# Mapping handed to the parser to say which attributes are multi-valued.
# NOTE(review): '*' presumably means "on every tag" — confirm against the bs4 docs.
class_is_multi = {'*': 'class'}
xml_soup = BeautifulSoup(
    '<p class="body strikeout"></p>',
    'xml',
    multi_valued_attributes=class_is_multi,
)
xml_soup.p['class']

['body', 'strikeout']

## 2. Navigable String Objects

- A string is the text inside a tag
- Beautiful Soup uses the NavigableString class to hold this text
- Besides the features of a Python string, NavigableString has extra features such as tree traversal

In [62]:
# .string fetches the text contained in the tag.
soup = BeautifulSoup('<b class="boldest">Extremely bold</b>', 'html.parser')
tag = soup.b
tag.string

'Extremely bold'

In [63]:
# The text inside the tag is a NavigableString object, not a plain str.
type(tag.string)

bs4.element.NavigableString

In [66]:
# Passing tag.string (a NavigableString) to str() converts it to an ordinary
# Python string.
unicode_string = str(tag.string)
unicode_string

'Extremely bold'

In [67]:
type(unicode_string) # now a plain str rather than a NavigableString

str

In [68]:
# To change the text, call replace_with() on the string and pass the new value.
tag.string.replace_with("No longer bold")
tag

<b class="boldest">No longer bold</b>

In [69]:
# The replacement text is again a NavigableString.
type(tag.string)

bs4.element.NavigableString

## 3. BeautifulSoup Object
- The BeautifulSoup object represents the parsed document as a whole.
- It can also be treated as tag object
- It can also be used to combine two documents


In [71]:
# Two separately parsed XML documents: the main one and a footer fragment.
doc = BeautifulSoup("<document><content/>INSERT FOOTER HERE</document>", "xml")  # doc 1
footer = BeautifulSoup("<footer>Here's the footer</footer>", "xml")  # doc 2

# Find the placeholder string and swap the footer document in for it.
# `string=` replaces the deprecated `text=` keyword of find().
# replace_with() returns the element it removed, so this cell displays the
# placeholder text that was found.
doc.find(string="INSERT FOOTER HERE").replace_with(footer)

'INSERT FOOTER HERE'

In [73]:
# The placeholder text is gone; the <footer> element from the second document
# now sits in its place.
print(doc)

<?xml version="1.0" encoding="utf-8"?>
<document><content/><footer>Here's the footer</footer></document>


In [74]:
# A BeautifulSoup object has no tag name of its own; .name is '[document]'.
soup.name

'[document]'

## 4. Comment object

In [75]:
# The only content of <b> is an HTML comment, so .string returns it as a
# Comment object.
markup = "<b><!--Hey, buddy. Want to buy a used parser?--></b>"
soup = BeautifulSoup(markup, 'html.parser')
comment = soup.b.string
type(comment)

bs4.element.Comment

In [76]:
comment # displays the comment text without the <!-- --> delimiters

'Hey, buddy. Want to buy a used parser?'

In [77]:
# Fetch the <b> tag and prettify() it: the comment is shown with its
# <!-- --> delimiters on its own indented line.
print(soup.b.prettify())

<b>
 <!--Hey, buddy. Want to buy a used parser?-->
</b>
